Created
April 3, 2011 01:59
-
-
Save paulbaumgart/900101 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Module handles used by this script: SYSTEM supplies the command-line
// arguments, HTTP fetches the page, OS is used to exit with a status code.
// NOTE(review): these look like Narwhal-style CommonJS modules - confirm the runtime.
var SYSTEM = require('system');
var HTTP = require('http-client');
var OS = require('os');

// The first argument after the script name is the page to scrape.
var url = SYSTEM.args[1];

// Without a URL there is nothing to do: show how to invoke the script and
// exit with a non-zero status.
if (!url) {
    print('usage:');
    print('js ' + SYSTEM.args[0] + ' <fileformat.info URL>');
    OS.exit(1);
}
// Body of the page at `url`, decoded as UTF-8 text.
var fileContents = HTTP.read(url).decodeToString('utf-8');

// Matches one centered table cell that links to a character page and whose
// link text is "U+<hex digits>". Capture group 1 holds only the hex digits.
// The "+" must be escaped: an unescaped `U+` means "one or more U" and left
// the literal plus sign inside the capture group.
var charRegex = /\s*<td align="center"><a href="\/info\/unicode\/char\/.+?">U\+([0-9A-Fa-f]+)<\/a><\/td>\s*/g;

// Result of the most recent charRegex.exec() call.
var matches = null;
// Code point handled by the previous loop iteration (null before the first).
var lastCharCode = null;
// First code point of the range currently being accumulated, or null when
// no bracketed class is open.
var classStartCharCode = null;
// Output pieces: bracketed classes and quoted surrogate-pair literals.
var classes = [];
/**
 * Render a Unicode code point as JavaScript-style `\uXXXX` escape text.
 *
 * Code points up to 0xFFFF become a single escape, zero-padded to four
 * upper-case hex digits. Code points above 0xFFFF become two escapes: the
 * UTF-16 high surrogate followed by the low surrogate.
 *
 * @param {number} charCode Integer code point in the range 0..0x10FFFF.
 * @returns {string} Text such as `\u0041` or `\uD83D\uDE00` (with literal backslashes).
 * @throws {RangeError} If charCode is not an integer within 0..0x10FFFF.
 */
var convertToUnicodeHex = function(charCode) {
    var isCodePoint = typeof charCode === 'number' &&
                      charCode >= 0 &&
                      charCode <= 0x10FFFF &&
                      Math.floor(charCode) === charCode;
    // Reject bad input up front; formatting it anyway would yield escapes
    // such as "\uNAN" or an out-of-range surrogate.
    if (!isCodePoint) {
        throw new RangeError('Invalid Unicode code point: ' + charCode);
    }

    // Format one 16-bit code unit as a backslash-u escape with four hex digits.
    var toEscape = function(codeUnit) {
        var hex = codeUnit.toString(16).toUpperCase();
        while (hex.length < 4) {
            hex = '0' + hex;
        }
        return '\\u' + hex;
    };

    if (charCode <= 0xFFFF) {
        return toEscape(charCode);
    }

    // UTF-16 surrogate pair: the top 10 bits of the 20-bit offset select the
    // high surrogate, the bottom 10 bits select the low surrogate.
    var offset = charCode - 0x10000;
    var highSurrogate = 0xD800 + (offset >> 10);
    var lowSurrogate = 0xDC00 + (offset & 0x3FF);
    return toEscape(highSurrogate) + toEscape(lowSurrogate);
};
/**
 * Build the text for one entry of the generated output: either a single
 * character or an inclusive `start-end` range, using the escapes produced by
 * convertToUnicodeHex.
 *
 * A single code point above 0xFFFF is wrapped in double quotes; the main
 * loop pushes those as standalone entries rather than inside a `[...]` class.
 *
 * @param {number} startCharCode First code point of the run.
 * @param {number} endCharCode Last code point of the run (equal to
 *     startCharCode for a single character).
 * @returns {string} Escaped character, quoted surrogate pair, or escaped range.
 */
var charClassFromCharCodes = function(startCharCode, endCharCode) {
    if (startCharCode !== endCharCode) {
        return convertToUnicodeHex(startCharCode) + '-' + convertToUnicodeHex(endCharCode);
    }

    // U+FFFF still fits in one UTF-16 unit, so the boundary is inclusive:
    // only code points strictly above 0xFFFF need the quoted form.
    if (startCharCode <= 0xFFFF) {
        return convertToUnicodeHex(startCharCode);
    }
    return '"' + convertToUnicodeHex(startCharCode) + '"';
};
// Walk every code point found in the page, in document order, folding runs of
// consecutive code points up to 0xFFFF into ranges inside a `[...]` class and
// emitting each code point above 0xFFFF as its own quoted entry.
while ((matches = charRegex.exec(fileContents)) !== null) {
    var charCode = parseInt(matches[1], 16);

    // A capture that is not valid hex parses to NaN; skip it so it cannot
    // start a class or end up formatted into the output.
    if (isNaN(charCode)) {
        continue;
    }

    if (charCode > 0xFFFF) {
        // Needs a UTF-16 surrogate pair, which cannot live inside a class:
        // flush and close any open class first, then emit it standalone.
        if (classStartCharCode !== null) {
            classes[classes.length - 1] += charClassFromCharCodes(classStartCharCode, lastCharCode) + ']';
            classStartCharCode = null;
        }
        classes.push(charClassFromCharCodes(charCode, charCode));
    } else if (classStartCharCode === null) {
        // No class is open: start one with this code point as the run start.
        classStartCharCode = charCode;
        classes.push('[');
    } else if (charCode !== lastCharCode + 1) {
        // The run of consecutive code points is broken: write out the
        // finished run and begin a new one in the same class.
        classes[classes.length - 1] += charClassFromCharCodes(classStartCharCode, lastCharCode);
        classStartCharCode = charCode;
    }

    lastCharCode = charCode;
}

// Flush the final run. A class is only ever open while the most recent code
// point was <= 0xFFFF, so checking classStartCharCode alone is sufficient.
if (classStartCharCode !== null) {
    classes[classes.length - 1] += charClassFromCharCodes(classStartCharCode, lastCharCode) + ']';
}

// Print all entries joined by " / ".
// NOTE(review): presumably alternatives for a PEG-style grammar rule - confirm.
print(classes.join(' / '));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment