Created
January 28, 2019 07:41
-
-
Save imjosh/666ae47c700307210d1a873f4e61aff6 to your computer and use it in GitHub Desktop.
Messing around with hex and base36 character encodings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Messing around with character encodings */
/* notes:
http://www.i18nguy.com/unicode/supplementary-test.html
https://stackoverflow.com/questions/6063148/java-unicode-where-to-find-example-n-byte-unicode-characters
https://stackoverflow.com/a/37954501/2034089
https://unicode-table.com/
*/
// Adapted from https://stackoverflow.com/a/21648161/2034089
/**
 * Encode every UTF-16 code unit of this string as one fixed-width,
 * zero-padded, lowercase hexadecimal group.
 *
 * @param {number} [bytes=4] Number of hex digits emitted per code unit
 *   (2, 3 or 4). Despite the name this is a digit count, not a byte count.
 * @returns {string} The concatenated groups (`this.length * bytes` characters).
 * @throws {Error} If `bytes` is not an integer in 2..4, or if a code unit
 *   does not fit in `bytes` hex digits.
 */
String.prototype.encodeHex = function (bytes = 4) {
  if (!Number.isInteger(bytes) || bytes < 2 || bytes > 4) {
    throw new Error('Invalid bytes parameter');
  }
  // Largest value that fits in `bytes` hex digits: 255, 4095 or 65535.
  const maxCode = Math.pow(16, bytes) - 1;
  const pad = '0'.repeat(bytes);
  let result = '';
  for (let i = 0; i < this.length; i++) {
    const code = this.charCodeAt(i);
    if (code > maxCode) {
      // Name the alias that matches the requested width so the message
      // points at the right encoder.
      throw new Error(`encode_${bytes}byte_UTF8_Hex does not support charCode > ${maxCode}`);
    }
    result += (pad + code.toString(16)).slice(-bytes);
  }
  return result;
};
/**
 * Decode a string of fixed-width hexadecimal groups back into characters,
 * one UTF-16 code unit per group.
 *
 * @param {number} [bytes=4] Number of hex digits per group (1 to 4).
 * @returns {string} The decoded text. A trailing group shorter than `bytes`
 *   is still decoded on its own.
 * @throws {Error} If `bytes` is not an integer in 1..4, or if the string
 *   contains anything other than hex digits.
 */
String.prototype.decodeHex = function (bytes = 4) {
  if (!Number.isInteger(bytes) || bytes < 1 || bytes > 4) {
    throw new Error('Invalid bytes parameter');
  }
  const text = String(this);
  // Reject bad input up front: parseInt would turn it into NaN (or a partial
  // value) and String.fromCharCode would then emit a wrong character silently.
  if (/[^0-9a-f]/i.test(text)) {
    throw new Error('decodeHex does not support characters other than hex digits 0-9 and a-f');
  }
  const groups = text.match(new RegExp(`.{1,${bytes}}`, 'g')) || [];
  return groups
    .map((group) => String.fromCharCode(parseInt(group, 16)))
    .join('');
};
/**
 * Encode this string using base 36.
 *
 * With `bytes` of 2, 3 or 4, every UTF-16 code unit becomes one fixed-width,
 * zero-padded base-36 group. With `bytes` of 0 ("az09" mode) the whole string
 * is read as a single base-36 number and returned as its decimal digits;
 * leading '0' characters are not preserved by that conversion.
 *
 * @param {number} [bytes=4] Base-36 digits per code unit (2, 3 or 4), or 0
 *   for az09 mode. Despite the name this is a digit count, not a byte count.
 * @returns {string} The encoded text.
 * @throws {Error} If `bytes` is not 0, 2, 3 or 4; if a code unit does not fit
 *   in `bytes` digits; or, in az09 mode, if the string is empty, contains
 *   characters outside a-z/0-9, or is too large to convert exactly.
 */
String.prototype.encode36 = function (bytes = 4) {
  if (!Number.isInteger(bytes) || (bytes !== 0 && bytes < 2) || bytes > 4) {
    throw new Error('Invalid bytes parameter');
  }
  if (bytes === 0) {
    const text = String(this);
    if (/[^a-z0-9]/.test(text)) {
      throw new Error('encode_az09_Base36 does not support characters other than lowercase a-z and digits 0-9');
    }
    // parseInt('') is NaN, which used to leak out as the string "NaN".
    if (text === '') {
      throw new Error('encode_az09_Base36 does not support empty strings');
    }
    const value = parseInt(text, 36);
    // Past 2^53 - 1 the Number is only approximate, so the digits returned
    // would no longer decode back to the input.
    if (!Number.isSafeInteger(value)) {
      throw new Error('encode_az09_Base36 does not support values above Number.MAX_SAFE_INTEGER');
    }
    return value.toString();
  }
  // Largest code unit that fits: 1295 (36^2 - 1), 46655 (36^3 - 1), or the
  // UTF-16 ceiling 65535 for four digits.
  const maxCode = Math.min(Math.pow(36, bytes) - 1, 65535);
  const pad = '0'.repeat(bytes);
  let result = '';
  for (let i = 0; i < this.length; i++) {
    const code = this.charCodeAt(i);
    if (code > maxCode) {
      throw new Error(`encode_${bytes}byte_UTF8_Base36 does not support charCode > ${maxCode}`);
    }
    result += (pad + code.toString(36)).slice(-bytes);
  }
  return result;
};
/**
 * Decode base-36 text produced by the matching encoder mode.
 *
 * With `bytes` of 2, 3 or 4, the string is split into groups of that many
 * base-36 digits and each group becomes one UTF-16 code unit. With `bytes`
 * of 0 ("az09" mode) the string is read as a decimal number and returned in
 * base-36 notation.
 *
 * @param {number} [bytes=4] Base-36 digits per group (2, 3 or 4), or 0 for
 *   az09 mode.
 * @returns {string} The decoded text. A trailing group shorter than `bytes`
 *   is still decoded on its own.
 * @throws {Error} If `bytes` is not 0, 2, 3 or 4, or if the string contains
 *   characters that are not valid digits for the selected mode.
 */
String.prototype.decode36 = function (bytes = 4) {
  if (!Number.isInteger(bytes) || (bytes !== 0 && bytes < 2) || bytes > 4) {
    throw new Error('Invalid bytes parameter');
  }
  const text = String(this);
  if (bytes === 0) {
    // Digits only: without this, "0x10" would be read as hexadecimal and
    // anything non-numeric would come back as the string "NaN".
    if (!/^[0-9]+$/.test(text)) {
      throw new Error('decode_az09_Base36 does not support characters other than digits 0-9');
    }
    const value = parseInt(text, 10);
    if (!Number.isSafeInteger(value)) {
      throw new Error('decode_az09_Base36 does not support values above Number.MAX_SAFE_INTEGER');
    }
    return value.toString(36);
  }
  // parseInt would map an invalid group to NaN, which String.fromCharCode
  // silently turns into a NUL character.
  if (/[^0-9a-z]/i.test(text)) {
    throw new Error('decode36 does not support characters other than 0-9 and a-z');
  }
  const groups = text.match(new RegExp(`.{1,${bytes}}`, 'g')) || [];
  return groups.reduce(
    (decoded, group) => decoded + String.fromCharCode(parseInt(group, 36)),
    ''
  );
};
// Named aliases for the generic codecs. Each one fixes the width argument
// ("bytes") of encodeHex / decodeHex / encode36 / decode36.

/** Whole-string base-36 → decimal conversion (encode36 with width 0). */
String.prototype.encode_az09_Base36 = function () {
  return this.encode36(0);
};

/**
 * Decode one hex digit per character (decodeHex with width 1).
 * NOTE(review): there is no matching encode_az09_Hex among these aliases.
 */
String.prototype.decode_az09_Hex = function () {
  return this.decodeHex(1);
};

/** Hex encoding, two digits per code unit. */
String.prototype.encode_2byte_UTF8_Hex = function () {
  return this.encodeHex(2);
};

/** Hex decoding, two digits per code unit. */
String.prototype.decode_2byte_UTF8_Hex = function () {
  return this.decodeHex(2);
};

/** Hex encoding, three digits per code unit. */
String.prototype.encode_3byte_UTF8_Hex = function () {
  return this.encodeHex(3);
};

/** Hex decoding, three digits per code unit. */
String.prototype.decode_3byte_UTF8_Hex = function () {
  return this.decodeHex(3);
};

/** Hex encoding, four digits per code unit. */
String.prototype.encode_4byte_UTF8_Hex = function () {
  return this.encodeHex(4);
};

/** Hex decoding, four digits per code unit. */
String.prototype.decode_4byte_UTF8_Hex = function () {
  return this.decodeHex(4);
};

/** Decimal → base-36 conversion of the whole string (decode36 with width 0). */
String.prototype.decode_az09_Base36 = function () {
  return this.decode36(0);
};

/** Base-36 encoding, two digits per code unit. */
String.prototype.encode_2byte_UTF8_Base36 = function () {
  return this.encode36(2);
};

/** Base-36 decoding, two digits per code unit. */
String.prototype.decode_2byte_UTF8_Base36 = function () {
  return this.decode36(2);
};

/** Base-36 encoding, three digits per code unit. */
String.prototype.encode_3byte_UTF8_Base36 = function () {
  return this.encode36(3);
};

/** Base-36 decoding, three digits per code unit. */
String.prototype.decode_3byte_UTF8_Base36 = function () {
  return this.decode36(3);
};

/** Base-36 encoding, four digits per code unit. */
String.prototype.encode_4byte_UTF8_Base36 = function () {
  return this.encode36(4);
};

/** Base-36 decoding, four digits per code unit. */
String.prototype.decode_4byte_UTF8_Base36 = function () {
  return this.decode36(4);
};
// fixme
/**
 * Sample inputs for the round-trip self-test. Each key is looked up by the
 * `tests` expectation table; the names describe the widest character the
 * sample contains.
 */
var testStrings = {
  az09Str: 'az0',
  azAZ09Str: 'aZ0',
  utf8_one_byteStr: 'aZ!',
  utf8_two_byteStr: 'aZ¶',
  // 4095 is the largest char code the three-digit hex encoder accepts, so
  // this sample passes; String.fromCharCode(4096) or 'aZ‱' would make the
  // three-digit hex encoder throw.
  utf8_three_byteStr: 'aZ' + String.fromCharCode(4095),
  utf8_four_byteStr: 'aZ𠴕',
};
/**
 * Expectation table for the self-test.
 *
 * Shape: { testStringKey: { encodingName: expectation } } where
 * `testStringKey` is a key of `testStrings`, `encodingName` is the suffix
 * shared by an `encode_*` / `decode_*` method pair, and `expectation` is
 * 'match' (the round trip must reproduce the input) or 'throw' (encoding or
 * decoding must raise).
 */
var tests = {
  az09Str: {
    'az09_Base36': 'match',
    '2byte_UTF8_Hex': 'match',
    '2byte_UTF8_Base36': 'match',
    '3byte_UTF8_Hex': 'match',
    '3byte_UTF8_Base36': 'match',
    '4byte_UTF8_Hex': 'match',
    '4byte_UTF8_Base36': 'match',
  },
  azAZ09Str: {
    'az09_Base36': 'throw',
    '2byte_UTF8_Hex': 'match',
    '2byte_UTF8_Base36': 'match',
    '3byte_UTF8_Hex': 'match',
    '3byte_UTF8_Base36': 'match',
    '4byte_UTF8_Hex': 'match',
    '4byte_UTF8_Base36': 'match',
  },
  utf8_one_byteStr: {
    'az09_Base36': 'throw',
    '2byte_UTF8_Hex': 'match',
    '2byte_UTF8_Base36': 'match',
    '3byte_UTF8_Hex': 'match',
    '3byte_UTF8_Base36': 'match',
    '4byte_UTF8_Hex': 'match',
    '4byte_UTF8_Base36': 'match',
  },
  utf8_two_byteStr: {
    'az09_Base36': 'throw',
    '2byte_UTF8_Hex': 'match',
    '2byte_UTF8_Base36': 'match',
    '3byte_UTF8_Hex': 'match',
    '3byte_UTF8_Base36': 'match',
    '4byte_UTF8_Hex': 'match',
    '4byte_UTF8_Base36': 'match',
  },
  utf8_three_byteStr: {
    'az09_Base36': 'throw',
    '2byte_UTF8_Hex': 'throw',
    '2byte_UTF8_Base36': 'throw',
    '3byte_UTF8_Hex': 'match',
    '3byte_UTF8_Base36': 'match',
    '4byte_UTF8_Hex': 'match',
    '4byte_UTF8_Base36': 'match',
  },
  utf8_four_byteStr: {
    'az09_Base36': 'throw',
    '2byte_UTF8_Hex': 'throw',
    '2byte_UTF8_Base36': 'throw',
    '3byte_UTF8_Hex': 'throw',
    '3byte_UTF8_Base36': 'throw',
    '4byte_UTF8_Hex': 'match',
    '4byte_UTF8_Base36': 'match',
  },
};
// Run every (sample string, encoding) pair listed in `tests`, record one row
// per pair in `testResults`, and print the rows as a table.
var testResults = [];
Object.keys(tests).forEach((testStringKey) => {
  const expectations = tests[testStringKey];
  Object.keys(expectations).forEach((fnName) => {
    const should = expectations[fnName];
    const testResult = { test: `Test ${testStringKey} on ${fnName}` };
    try {
      const result = testEncoding(fnName, testStrings[testStringKey]);
      testResult.match = result.matches;
      testResult.chars = result.length;
      if (should === 'match') {
        testResult.passFail = result.matches ? 'PASS' : 'FAIL! - should match';
      } else if (should === 'throw') {
        // The round trip completed, but an exception was expected.
        testResult.passFail = 'FAIL! - should throw';
      } else {
        // A misspelled expectation would otherwise leave the column blank.
        testResult.passFail = `FAIL! - unknown expectation ${should}`;
      }
    } catch (error) {
      // On failure the "match" column carries the error text instead.
      testResult.match = `${error.message}`;
      testResult.passFail = should === 'throw' ? 'PASS' : `FAIL! - should ${should}`;
    }
    testResults.push(testResult);
  });
});
console.table(testResults);
/**
 * Round-trip a string through one encode/decode method pair.
 *
 * @param {string} encoding Suffix of the method pair; `encode_<encoding>` is
 *   called on `myStr` and `decode_<encoding>` on the encoded result.
 * @param {string} myStr Text to round-trip.
 * @returns {{length: number, matches: boolean}} `length` is the encoded
 *   string's length; `matches` tells whether decoding gave back `myStr`.
 * @throws {TypeError} If either method of the pair does not exist.
 * @throws {Error} Whatever the encode or decode method itself throws.
 */
function testEncoding(encoding, myStr) {
  const encodeName = `encode_${encoding}`;
  const decodeName = `decode_${encoding}`;
  if (typeof myStr[encodeName] !== 'function') {
    throw new TypeError(`No such encoder: ${encodeName}`);
  }
  const encoded = myStr[encodeName]();
  if (typeof encoded[decodeName] !== 'function') {
    throw new TypeError(`No such decoder: ${decodeName}`);
  }
  const decoded = encoded[decodeName]();
  return { length: encoded.length, matches: myStr === decoded };
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment