Created
November 30, 2021 22:22
-
-
Save nopeless/f3dc2e6888475b051e484f72dd6c846d to your computer and use it in GitHub Desktop.
i speed run arithmetic operation tokenization in 90 minutes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* eslint-disable consistent-return */ | |
class TokenType { | |
constructor(type, regex) { | |
this.type = type; | |
this.regex = regex; | |
} | |
match(s) { | |
const isMatch = this.regex.exec(s); | |
console.log(isMatch); | |
if (isMatch) { | |
return { | |
name: `Token`, | |
type: this, | |
content: isMatch[0], | |
length: isMatch[0].length, | |
}; | |
} | |
return; | |
} | |
} | |
// Apparently there isn't a way to make it anchor start and end by default (didn't know that) | |
const INT = new TokenType(`int`, /^\d+/); | |
const MUL = new TokenType(`mul`, /^\*/); | |
const ADD = new TokenType(`add`, /^\+/); | |
const OPEN_PAREN = new TokenType(`(`, /^\(/); | |
const CLOSE_PAREN = new TokenType(`)`, /^\)/); | |
const TOKEN_LIST = [INT, MUL, ADD, OPEN_PAREN, CLOSE_PAREN]; | |
/** | |
* Splits by whitespace | |
*/ | |
function splitter(s) { | |
return s.split(/\s/g); | |
} | |
// I forgot what this was called, but ill just call it context for now | |
class Context { | |
constructor(name, count) { | |
this.name = name; | |
this.count = count; | |
} | |
} | |
class Grammar { | |
/** | |
* Executor should return self | |
*/ | |
constructor(executor) { | |
this.executor = executor; | |
} | |
/** | |
* Some other implementations write this as "start" and "end" | |
*/ | |
exec(tokens, entry) { | |
return this.executor(tokens, entry); | |
} | |
} | |
class IExpression { | |
value() { | |
throw new Error(`Not implemented`); | |
} | |
length() { | |
throw new Error(`Not implemented`); | |
} | |
} | |
class IntegerExpression extends IExpression { | |
constructor(val) { | |
super(); | |
this.val = val; | |
} | |
value() { | |
return this.val; | |
} | |
length() { | |
return 1; | |
} | |
} | |
class AddExpression extends IExpression { | |
constructor(left, right) { | |
super(); | |
this.left = left; | |
this.right = right; | |
} | |
value() { | |
return this.left.value() + this.right.value(); | |
} | |
length() { | |
return 1 + this.left.length() + this.right.length(); | |
} | |
} | |
class MultiplyExpression extends IExpression { | |
constructor(left, right) { | |
super(); | |
this.left = left; | |
this.right = right; | |
} | |
value() { | |
return this.left.value() * this.right.value(); | |
} | |
length() { | |
return 1 + this.left.length() + this.right.length(); | |
} | |
} | |
class ParenthesizedExpression extends IExpression { | |
constructor(expression) { | |
super(); | |
this.expression = expression; | |
} | |
value() { | |
return this.expression.value(); | |
} | |
length() { | |
return 2 + this.expression.length(); | |
} | |
} | |
class TermExpression extends IExpression { | |
constructor(expression) { | |
super(); | |
this.expression = expression; | |
} | |
value() { | |
return this.expression.value(); | |
} | |
length() { | |
return this.expression.length(); | |
} | |
} | |
class ExpressionExpression extends IExpression { | |
constructor(expression) { | |
super(); | |
this.expression = expression; | |
} | |
value() { | |
return this.expression.value(); | |
} | |
length() { | |
return this.expression.length(); | |
} | |
} | |
// Keep it simple just keep track of a global | |
// entry point is Exp | |
const Exp = new Grammar(function(tokens, entry) { | |
const res = (() => { | |
const term = Grammar.Term.exec(tokens); | |
if (!term) throw new Error(`cannot parse`); | |
tokens = tokens.slice(term.length()); | |
if (term instanceof TermExpression) { | |
if (tokens.length) { | |
// attempt to parse + | |
if (tokens[0].type.type === `add`) { | |
tokens = tokens.slice(1); | |
const right = Grammar.Term.exec(tokens); | |
if (!right) throw new Error(`cannot parse`); | |
tokens = tokens.slice(right.length()); | |
return new AddExpression(term, right); | |
} | |
// attempt to parse * | |
if (tokens[0].type.type === `mul`) { | |
tokens = tokens.slice(1); | |
const right = Grammar.Term.exec(tokens); | |
if (!right) throw new Error(`cannot parse`); | |
tokens = tokens.slice(term.length()); | |
return new MultiplyExpression(term, right); | |
} | |
throw new Error(`cannot parse anything other than add`); | |
} | |
} | |
return new ExpressionExpression(term); | |
})(); | |
if (entry && tokens.length) { | |
throw new Error(`EXPECTED EOF, got ${tokens[0].content}`); | |
} | |
return res; | |
}); | |
const Term = new Grammar(function(tokens, entry) { | |
const res = (() => { | |
if (tokens[0] && tokens[0].type.type === `(`) { | |
tokens = tokens.slice(1); | |
const exp = Grammar.Exp.exec(tokens); | |
tokens = tokens.slice(exp.length()); | |
if (tokens[0].type.type !== `)`) { | |
throw new Error(`expected closing paren`); | |
} | |
tokens = tokens.slice(1); | |
return new ParenthesizedExpression(exp); | |
} | |
if (tokens[0] && tokens[0].type.type === `int`) { | |
const num = Number(tokens[0].content); | |
tokens = tokens.slice(1); | |
if (tokens.length && tokens[0].type.type === `mul`) { | |
tokens = tokens.slice(1); | |
const right = Grammar.Term.exec(tokens); | |
if (!right) { | |
throw new Error(`cannot parse right side`); | |
} | |
return new TermExpression(new MultiplyExpression(new IntegerExpression(num), right)); | |
} | |
return new TermExpression(new IntegerExpression(num)); | |
} | |
})(); | |
if (entry && tokens.length) { | |
throw new Error(`EXPECTED EOF, got ${tokens[0].content}`); | |
} | |
return res; | |
}); | |
Grammar.Exp = Exp; | |
Grammar.Term = Term; | |
function tokenize(s) { | |
s = splitter(s); | |
const res = []; | |
for (const t of s) { | |
let pointer = 0; | |
for (;;) { | |
let hasMatch = false; | |
const target = t.substring(pointer); | |
if (!target) break; | |
for (const token of TOKEN_LIST) { | |
const match = token.match(target); | |
if (match) { | |
pointer += match.length; | |
res.push(match); | |
hasMatch = true; | |
break; | |
} | |
} | |
if (!hasMatch) { | |
throw new Error(`Cannot parse '${target}'`); | |
} | |
} | |
if (pointer < t.length) { | |
throw new Error(`Expected EOF, got '${t.substring(pointer)}'`); | |
} | |
} | |
return res; | |
} | |
const tokenized = tokenize(`1 + 2`); | |
console.log(`tokenized`, tokenized); | |
const res = Grammar.Exp.exec(tokenized, true); | |
console.log(res); | |
console.log(res.value()); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment