-
-
Save AgtLucas/960caf617ae46eb4184dbc57a3fe15af to your computer and use it in GitHub Desktop.
Converting English number sentences ("one hundred forty two point three") to numeric digits ("142.3")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Usage examples for convert(numstr, separator).
// The trailing comment on each line is the expected return value.

// whole numbers, with or without unit words
convert("one hundred five"); // "105"
convert("six hundred and fifty three"); // "653"
convert("zero zero one two three"); // "123"
convert("twelve o three"); // "1203"
convert("thirteen zero nine"); // "1309"
convert("fifteen sixteen"); // "1516"
convert("fourteen ninety two"); // "1492"
convert("nineteen ten"); // "1910"
convert("twenty twenty"); // "2020"
convert("twenty twenty one"); // "2021"
convert("twenty twenty two"); // "2022"
convert("four five two three eight"); // "45238"
convert("sixteen thousand three eighty four"); // "16384"
convert("seven billion six hundred eighty-one million"); // "7681000000"
convert("twenty three trillion and nine"); // "23000000000009"
convert("four billion two hundred nine thousand"); // "4000209000"
convert("nine hundred ninety nine quadrillion nine ninety nine trillion nine hundred and ninety nine billion nine ninety-nine million nine hundred ninety-nine thousand nine ninety nine"); // "999999999999999999"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five"); // "123456789876543212345"

// decimals ("point" starts the fractional part)
convert("forty two point zero"); // "42.0"
convert("three point one four one five nine two six"); // "3.1415926"
convert("point"); // "0.0"
convert("four point zero o o o zero"); // "4.00000"

// second argument: separator inserted between unit-place groups
convert("sixty five thousand five thirty six",","); // "65,536"
convert("four billion two hundred nine thousand",","); // "4,000,209,000"
convert("forty two",","); // "42"
convert("twenty one twenty three",","); // "2,123"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five",","); // "123456,789,876,543,212,345" (not a mistake: quadrillion is the highest supported "place", so the leading group absorbs the extra digits)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"use strict"; | |
// Lookup tables mapping number words to their digit strings.

// single digits ("o" is accepted as a spoken alias for zero)
var digits = {
	o: "0", zero: "0",
	one: "1", two: "2", three: "3",
	four: "4", five: "5", six: "6",
	seven: "7", eight: "8", nine: "9",
};

// ten through nineteen
var tens = {
	ten: "10", eleven: "11", twelve: "12", thirteen: "13", fourteen: "14",
	fifteen: "15", sixteen: "16", seventeen: "17", eighteen: "18", nineteen: "19",
};

// multiples of ten from twenty to ninety
var doubles = {
	twenty: "20", thirty: "30", forty: "40", fifty: "50",
	sixty: "60", seventy: "70", eighty: "80", ninety: "90",
};

// unit-place names, ordered from lowest to highest place
var units = [ "hundred", "thousand", "million", "billion", "trillion", "quadrillion" ];
/**
 * Convert an English number phrase into a string of numeric digits.
 *
 * The phrase is parsed by parse() into a linked list of unit-place nodes
 * (each `{ unit, value, and }`); the integer groups are joined with
 * `separator` and an optional decimal part is appended after a ".".
 *
 * @param {string} numstr - phrase such as "one hundred five" or
 *   "forty two point zero"
 * @param {string} [separator=""] - text placed between the unit-place groups
 *   of the integer part (e.g. "," gives "65,536")
 * @returns {string} the digit string, e.g. "105", "65,536" or "0.0"
 * @throws {Error} whatever parse() throws for invalid phrases
 */
function convert(numstr,separator = "") {
	var groups = [];
	var fraction = "";

	for (let node = parse(numstr); node; node = node.and) {
		if (node.unit == "decimal") {
			// an empty decimal node (e.g. "four point") still renders as ".0"
			fraction += "." + (node.value || "0");
		}
		else {
			// placeholder nodes carry no value; they stand for an empty group
			groups.push(node.value || "000");
		}
	}

	// normalize leading zeros BEFORE joining, so that an all-zero leading
	// group is dropped together with the separator that would follow it
	// (previously "zero thousand five" with "," produced ",005")
	var firstSignificantIdx = groups.findIndex(group => /[^0]/.test(group));
	var integerPart = "";
	if (firstSignificantIdx != -1) {
		groups = groups.slice(firstSignificantIdx);
		groups[0] = groups[0].replace(/^0+/,"");
		integerPart = groups.join(separator);
	}

	return (integerPart || "0") + fraction;
}
/**
 * Parse an English number phrase into a linked list ("AST") of unit-place
 * nodes.
 *
 * Step 1 tokenizes the words into digit / ten / double / triple / point
 * tokens, merging words that belong together ("twenty" + "one" -> "21").
 * Step 2 walks the tokens and builds a chain of nodes linked through `and`,
 * from the highest unit-place down to "hundred", optionally followed by a
 * "decimal" node. Each node is `{ unit, value?, and? }`; a node without a
 * `value` is a placeholder for an empty group.
 *
 * @param {string} numstr - the phrase to parse; characters other than
 *   letters, digits, whitespace and "-" are stripped, and case is ignored
 * @returns {object} the first (highest unit-place) node of the chain
 * @throws {Error} "Invalid! ..." for an unrecognized word, a second
 *   "point"/"dot", a unit word after the decimal point, or a token that
 *   cannot be placed in the chain
 */
function parse(numstr) {
	// own-property test for the lookup tables: the `in` operator also matches
	// keys inherited from Object.prototype, so a word like "constructor"
	// used to be tokenized as a digit instead of being rejected
	var isWordIn = (table,word) => Object.prototype.hasOwnProperty.call(table,word);

	var words = numstr.trim().replace(/[^\-0-9a-z\s]+/ig,"").toLowerCase().split(/[\s\-]+/).filter(Boolean);

	// (STEP 1) tokenize the string
	var tokens = [];
	var inDecimal = false;
	for (let word of words) {
		let curToken = tokens[tokens.length - 1];
		if (word == "point" || word == "dot") {
			if (curToken && !curToken.complete) {
				if (!curToken.unit) {
					curToken.unit = "hundred";
				}
				curToken.complete = true;
			}
			if (!inDecimal) {
				inDecimal = true;
				tokens.push({ type: "point", value: ".", complete: true, });
			}
			else {
				throw new Error("Invalid! " + word);
			}
		}
		else if (word == "o" || word == "zero") {
			if (curToken && !curToken.complete) {
				tokens.push({ type: "digit", value: "0", complete: true, });
				curToken.complete = true;
			}
			else {
				tokens.push({ type: "digit", value: "0", complete: true, });
			}
		}
		else if (isWordIn(digits,word)) {
			if (curToken && !curToken.complete) {
				// replace a trailing zero (from a double or hundred)?
				if (curToken.value.endsWith("0")) {
					curToken.value = curToken.value.slice(0,-1) + digits[word];
					curToken.complete = true;
				}
				else {
					tokens.push({ type: "digit", value: digits[word], complete: true, });
					curToken.complete = true;
				}
			}
			else {
				tokens.push({ type: "digit", value: digits[word], complete: true, });
			}
		}
		else if (isWordIn(tens,word)) {
			if (curToken && !curToken.complete) {
				// replace two trailing zeros (from a hundred)?
				if (curToken.value.endsWith("00")) {
					curToken.value = curToken.value.slice(0,1) + tens[word];
					curToken.complete = true;
				}
				else {
					tokens.push({ type: "ten", value: tens[word], complete: true, });
					curToken.complete = true;
				}
			}
			// promote a single digit to a complete triple?
			else if (curToken && !curToken.unit && curToken.type == "digit") {
				curToken.type = "triple";
				curToken.value = curToken.value.slice(0,1) + tens[word];
			}
			else {
				tokens.push({ type: "ten", value: tens[word], complete: true, });
			}
		}
		else if (isWordIn(doubles,word)) {
			if (curToken && !curToken.complete) {
				// replace two trailing zeros (from a triple)?
				if (curToken.value.endsWith("00")) {
					curToken.value = curToken.value.slice(0,1) + doubles[word];
					// NOTE: leave complete:false since a digit can complete a double
				}
				else {
					tokens.push({ type: "double", value: doubles[word], complete: false, });
					curToken.complete = true;
				}
			}
			// promote a single digit to an incomplete triple?
			else if (curToken && !curToken.unit && curToken.type == "digit") {
				curToken.type = "triple";
				curToken.value = curToken.value.slice(0,1) + doubles[word];
				curToken.complete = false;
			}
			else {
				tokens.push({ type: "double", value: doubles[word], complete: false, });
			}
		}
		else if (!inDecimal) {
			if (word == "hundred") {
				if (curToken && !curToken.complete) {
					curToken.complete = true;
					tokens.push({ type: "triple", value: "100", complete: false, });
				}
				// promote a single digit to an incomplete triple?
				else if (curToken && !curToken.unit && curToken.type == "digit") {
					curToken.type = "triple";
					curToken.value = curToken.value.slice(0,1) + "00";
					curToken.complete = false;
				}
				else {
					tokens.push({ type: "triple", value: "100", complete: false, });
				}
			}
			// thousand, million, etc
			else if (units.includes(word)) {
				if (curToken) {
					curToken.unit = word;
					curToken.complete = true;
				}
				else {
					// a bare unit word at the start counts as "one <unit>"
					tokens.push({ type: "digit", unit: word, value: "1", complete: true, });
				}
			}
			// harmless conjunction word?
			else if (word == "and") {
				continue;
			}
			// unrecognized/invalid word
			else {
				throw new Error("Invalid! " + word);
			}
		}
		// word not allowed while tokenizing decimal values
		else {
			throw new Error("Invalid! " + word);
		}
	}

	// (STEP 2) parse the token list into an AST
	var ast = {};
	var curNode = ast;
	for (let tokenIdx = 0; tokenIdx < tokens.length; tokenIdx++) {
		let token = tokens[tokenIdx];
		let nextToken = tokens[tokenIdx + 1];

		// token indicates an assigned unit-place?
		if (token.unit) {
			// current node has no assigned unit-place?
			if (!curNode.unit) {
				curNode.unit = token.unit;
				// only the very first group may be shorter than 3 digits
				curNode.value = (
					curNode == ast ?
						token.value :
						token.value.padStart(3,"0")
				);
				let unit = nextUnit(token.unit);
				if (unit) {
					// create next placeholder node
					curNode = curNode.and = { unit, };
				}
			}
			// token unit same as current node?
			else if (token.unit == curNode.unit) {
				// current node is a placeholder that has not yet
				// been assigned a value from token?
				if (!curNode.value) {
					curNode.value = (
						curNode == ast ?
							token.value :
							token.value.padStart(3,"0")
					);
					let unit = nextUnit(token.unit);
					if (unit) {
						// create next placeholder node
						curNode = curNode.and = { unit, };
					}
				}
				else {
					throw new Error("Invalid! " + token.unit);
				}
			}
			// current node is different (higher?) unit place
			// than token?
			else {
				// attempt to generate missing unit node(s)
				let [ tree, leaf,] =
					generateMissingUnitNodes(curNode.unit,token.unit);
				if (tree) {
					curNode.and = tree.and;
					curNode = leaf;
					// the leaf is never the first node of the chain here, so
					// its group must be zero-padded to 3 digits (otherwise
					// "one billion five thousand" came out as 10005000)
					curNode.value = token.value.padStart(3,"0");
				}
				else {
					throw new Error("Invalid! " + token.unit);
				}
			}
		}
		// decimal point?
		else if (token.type == "point") {
			// current node has no unit-place assigned yet?
			if (!curNode.unit) {
				curNode.unit = "hundred";
				curNode = curNode.and = { unit: "decimal", value: "", };
			}
			else if (curNode.unit == "hundred") {
				curNode = curNode.and = { unit: "decimal", value: "", };
			}
			else {
				// attempt to generate missing unit-place node(s)
				let [ tree, leaf,] =
					generateMissingUnitNodes(curNode.unit,"hundred");
				if (tree) {
					curNode.and = tree.and;
					curNode = leaf;
					curNode = curNode.and = { unit: "decimal", value: "", };
				}
				else {
					throw new Error("Invalid! " + token.type);
				}
			}
		}
		// separate digit?
		else if (token.type == "digit") {
			// append digit to the decimal node?
			if (curNode.unit == "decimal") {
				// look-ahead to collect all consecutive digits, if any
				let digitTokens = collectConsecutiveDigits(tokens,tokenIdx);
				tokenIdx += (digitTokens.length - 1);

				// add digit token(s) to current node
				for (let digit of digitTokens) {
					curNode.value = (curNode.value || "") + digit.value;
				}
			}
			// multiple adjacent (non-decimal) digits?
			else if (
				nextToken &&
				nextToken.type == "digit"
			) {
				// current node is "empty", so we can implicitly
				// create arbitrary unit-place segment(s) from multiple
				// digits?
				if (!curNode.unit) {
					// look-ahead to collect all consecutive digits
					let digitTokens = collectConsecutiveDigits(tokens,tokenIdx);
					tokenIdx += (digitTokens.length - 1);

					// skip any leading zeros (since we're at the
					// start of the number)
					let firstNonZeroDigitIdx = digitTokens.findIndex(digit => digit.value != "0");
					if (firstNonZeroDigitIdx > 0) {
						digitTokens = digitTokens.slice(firstNonZeroDigitIdx);
					}

					// any digits remain to be added to the AST?
					if (digitTokens.length > 0) {
						// determine how many unit-place groups are needed
						let numGroups = Math.ceil(
							Math.min(digitTokens.length,units.length * 3) / 3
						);
						// determine number of digits in first group
						// (when there are more digits than the supported
						// places can hold, the first group takes the excess)
						let groupSize = (
							digitTokens.length > (units.length * 3) ?
								digitTokens.length - (units.length * 3) + 3 :
								digitTokens.length % 3 || 3
						);

						// create the necessary unit-place nodes in the AST
						let [ tree, leaf ] = generateMissingUnitNodes(
							units[
								Math.min(units.length - 1,numGroups - 1)
							],
							"hundred"
						);
						if (tree) {
							curNode.unit = tree.unit;
							curNode.value = "";
							if (tree.and) {
								curNode.and = tree.and;
							}

							// fill in the unit-place groups to the AST
							do {
								// collect a group of digits into current node
								let digitGroup = digitTokens.slice(0,groupSize);
								digitTokens = digitTokens.slice(groupSize);
								curNode.value = digitGroup.reduce((val,digit) => val + digit.value,"");

								// more digits to add as a unit-place group?
								if (curNode.and && digitTokens.length > 0) {
									curNode = curNode.and;
									// from here forward, all digit groups are
									// fixed size of 3
									groupSize = 3;
								}
							}
							// keep going while digits remain to be grouped
							while (digitTokens.length > 0);
						}
					}
					else {
						// NOTE: should never get here
						throw new Error("Invalid! " + token.value);
					}
				}
				else {
					// look-ahead to collect up to 3 consecutive digits
					let digitTokens =
						collectConsecutiveDigits(tokens,tokenIdx,/*limit=*/3);
					tokenIdx += (digitTokens.length - 1);

					// combine digits into a single value
					let val = digitTokens.reduce((val,digit) => val + digit.value,"");

					// assign combined-digits to "hundred" unit-place node
					curNode = assignHundredUnitPlaceNode(
						curNode,
						// zero-pad the value
						val.padStart(3,"0")
					);
				}
			}
			else {
				// assign single digit to "hundred" unit-place node
				curNode = assignHundredUnitPlaceNode(
					curNode,
					// zero-pad the value
					token.value.padStart(3,"0")
				);
			}
		}
		// stand-alone ten or double token?
		else if (token.type == "ten" || token.type == "double") {
			// append numbers to the decimal node?
			if (curNode.unit == "decimal") {
				curNode.value += token.value;
			}
			// literal/year form:
			//  * "seventeen nineteen"
			//  * "seventeen thirty"
			//  * "twenty fourteen"
			//  * "twenty fifty"
			else if (
				nextToken &&
				(nextToken.type == "ten" || nextToken.type == "double")
			) {
				if (!curNode.unit) {
					curNode.unit = "thousand";
					curNode.value = token.value.slice(0,1);
					curNode = curNode.and = {
						unit: "hundred",
						value: token.value.slice(1) + nextToken.value,
					};
					tokenIdx += 1;	// lookahead: 1 spot
				}
				else {
					throw new Error("Invalid! " + token.value);
				}
			}
			// ten/double followed by:
			//  * any 3 digits
			//  * '0' plus another digit
			else if (
				!curNode.unit &&
				nextToken &&
				nextToken.type == "digit" &&
				!nextToken.unit
			) {
				let tokenN2 = tokens[tokenIdx + 2];
				let tokenN3 = tokens[tokenIdx + 3];

				// any 3 digits
				if (
					tokenN2 &&
					tokenN2.type == "digit" &&
					tokenN3 &&
					tokenN3.type == "digit"
				) {
					curNode.unit = "thousand";
					curNode.value = token.value;
					curNode = curNode.and = {
						unit: "hundred",
						value: nextToken.value + tokenN2.value + tokenN3.value,
					};
					tokenIdx += 3;	// lookahead: 3 spots
				}
				// '0' plus another digit
				else if (
					nextToken.value == "0" &&
					tokenN2 &&
					tokenN2.type == "digit"
				) {
					curNode.unit = "thousand";
					curNode.value = token.value.slice(0,1);
					curNode = curNode.and = {
						unit: "hundred",
						value: token.value.slice(1) + nextToken.value + tokenN2.value,
					};
					tokenIdx += 2;	// lookahead: 2 spots
				}
				else {
					throw new Error("Invalid! " + token.value);
				}
			}
			// assumed "thousand" unit:
			//  * "thirteen nine forty two"
			//  * "thirty nine two o six"
			else if (
				!curNode.unit &&
				nextToken &&
				nextToken.type == "triple" &&
				!nextToken.unit
			) {
				curNode.unit = "thousand";
				curNode.value = token.value;
				curNode = curNode.and = {
					unit: "hundred",
					value: nextToken.value.padStart(3,"0"),
				};
				tokenIdx += 1;	// lookahead: 1 spot
			}
			else {
				// assign ten/double value to "hundred" unit-place node
				curNode = assignHundredUnitPlaceNode(
					curNode,
					// zero-pad the value
					token.value.padStart(3,"0")
				);
			}
		}
		else if (token.type == "triple") {
			if (curNode.unit == "decimal") {
				curNode.value += token.value;
			}
			else {
				// assign triple value to "hundred" unit-place node
				curNode = assignHundredUnitPlaceNode(
					curNode,
					// zero-pad the value
					token.value.padStart(3,"0")
				);
			}
		}
		else {
			// NOTE: should never get here
			throw new Error("Invalid! " + token.type);
		}
	}

	// append missing AST nodes (if any)
	if (![ "hundred", "decimal" ].includes(curNode.unit)) {
		let [ tree ] = generateMissingUnitNodes(curNode.unit,"hundred");
		if (tree) {
			curNode.and = tree.and;
		}
		else {
			throw new Error("Invalid! " + curNode.value);
		}
	}

	return ast;
}
/**
 * Store `val` in the "hundred" unit-place node reachable from `curNode`.
 *
 * An entirely empty node becomes the "hundred" node itself. A node at another
 * unit-place gets the missing lower-place nodes linked below it (via
 * generateMissingUnitNodes), and the resulting "hundred" leaf receives `val`.
 *
 * @param {object} curNode - node currently being filled
 * @param {string} val - value to store
 * @returns {object} the node that received the value
 * @throws {Error} "Invalid! <val>" when no path to a "hundred" node exists or
 *   that node already holds a value
 */
function assignHundredUnitPlaceNode(curNode,val) {
	var target = curNode;

	if (target.unit != "hundred") {
		// a blank node can simply become the "hundred" node
		if (!target.unit && !target.value) {
			target.unit = "hundred";
			target.value = val;
			return target;
		}

		// otherwise link the missing unit-place nodes down to "hundred"
		let chain = generateMissingUnitNodes(target.unit,"hundred");
		if (!chain[0]) {
			throw new Error("Invalid! " + val);
		}
		target.and = chain[0].and;
		target = chain[1];
	}

	// the "hundred" node may only be assigned once
	if (target.value) {
		throw new Error("Invalid! " + val);
	}
	target.value = val;
	return target;
}
/**
 * Gather the token at `tokenIdx` plus the unit-less "digit" tokens that
 * immediately follow it.
 *
 * The starting token is always included without being inspected; collection
 * stops at the first following token that is not a digit, carries a unit, or
 * would exceed `limit`.
 *
 * @param {object[]} tokens - full token list
 * @param {number} tokenIdx - index of the first token of the run
 * @param {number} [limit=Number.MAX_SAFE_INTEGER] - maximum tokens to return
 * @returns {object[]} the collected tokens, in order
 */
function collectConsecutiveDigits(tokens,tokenIdx,limit = Number.MAX_SAFE_INTEGER) {
	var run = [ tokens[tokenIdx] ];
	var idx = tokenIdx + 1;

	while (idx < tokens.length && run.length < limit) {
		let candidate = tokens[idx];
		if (candidate.type != "digit" || candidate.unit) {
			break;
		}
		run.push(candidate);
		idx++;
	}

	return run;
}
/**
 * Build a chain of unit-place nodes from `curUnit` down to `targetUnit`.
 *
 * Nodes are linked through `and`, each step moving to the unit returned by
 * nextUnit().
 *
 * @param {string} curUnit - unit of the first node in the chain
 * @param {string} targetUnit - unit the chain must end on
 * @returns {Array} `[ head, tail ]` (first and last node; the same object
 *   when both units are equal), or `[]` when `targetUnit` cannot be reached
 */
function generateMissingUnitNodes(curUnit,targetUnit) {
	var head = { unit: curUnit, };
	var tail = head;
	var place = curUnit;

	while (place && place != targetUnit) {
		place = nextUnit(place);
		if (!place) {
			// ran out of unit-places before reaching the target
			return [];
		}
		tail.and = { unit: place, };
		tail = tail.and;
	}

	return place ? [ head, tail ] : [];
}
/**
 * Look up the unit-place directly below `unit` in the `units` list.
 *
 * @param {string} unit - a unit-place name
 * @returns {string|undefined} the entry preceding `unit` in `units`, or
 *   undefined when `unit` is the first entry or is not in the list
 */
function nextUnit(unit) {
	var position = units.indexOf(unit);
	return position > 0 ? units[position - 1] : undefined;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment