Skip to content

Instantly share code, notes, and snippets.

@jabney
Last active November 26, 2024 03:08
Show Gist options
  • Save jabney/c4660433966869401ac1313d73605683 to your computer and use it in GitHub Desktop.
Save jabney/c4660433966869401ac1313d73605683 to your computer and use it in GitHub Desktop.
tokenizer: a single token lookahead tokenizer
/**
* A single token lookahead tokenizer.
*
* Author: James Abney
* License: MIT
*/
/** A lexed token: the name of the spec that produced it and the exact text that matched. */
export type Token<T extends string = string> = { readonly type: T; readonly value: string }

/**
 * A token rule as a `[name, match]` pair.
 *
 * Rules whose name is `null` or the empty string are matched and then thrown
 * away (useful for whitespace and comments). A rule only applies when `match`
 * matches a non-empty string starting exactly at the tokenizer's cursor.
 */
export type TokenSpec<T extends string = string> = readonly [name: T | null, match: RegExp]

/**
 * A tokenizer with a single token of lookahead.
 *
 * The first token is read eagerly in the constructor, so constructing a
 * tokenizer over a source that starts with unrecognized text throws.
 */
export class Tokenizer<T extends string = string> {
  private readonly tokens: readonly TokenSpec<T>[]
  private readonly _source: string
  private _cursor = 0
  private _lookahead: Token<T> | null = null

  /**
   * @param source The text to tokenize.
   * @param tokens Token rules, tried in order; the first rule that matches at
   * the cursor wins. The array is copied, so later changes to it are ignored.
   * @throws {SyntaxError} If no rule matches at the start of `source`.
   */
  constructor(source: string, tokens: readonly TokenSpec<T>[]) {
    this._source = source
    this.tokens = tokens.slice()
    this._lookahead = this._consume()
  }

  /** The full source text being tokenized. */
  get source(): string {
    return this._source
  }

  /** Offset into `source` just past the current lookahead token. */
  get cursor(): number {
    return this._cursor
  }

  /** True once the cursor has reached the end of `source`. */
  get eof(): boolean {
    return this._cursor >= this._source.length
  }

  /** The next token to be consumed, or `null` when the input is exhausted. */
  get lookahead(): Token<T> | null {
    return this._lookahead
  }

  /**
   * Consumes a token and returns the previous lookahead. If a token
   * type is specified, it must match the lookahead token's type.
   *
   * Returns `null` when the input is exhausted, whether or not a type
   * was given.
   *
   * @throws {SyntaxError} If `type` is given and the lookahead has a different
   * type (the lookahead is left in place), or if the text following the
   * lookahead matches no rule.
   */
  consume(): Token<T> | null
  consume(type: T): Token<T> | null
  consume(type?: T): Token<T> | null {
    const token = this._lookahead
    // Validate before advancing so a failed expectation does not lose the token.
    if (type != null && token != null && token.type !== type) {
      throw new SyntaxError(`expected token "${type}", received "${token.type}"`)
    }
    this._lookahead = this._consume()
    return token
  }

  /**
   * Like `consume()`, but asserts that the lookahead token is not null.
   *
   * @throws {SyntaxError} If the input is exhausted, or if the text following
   * the lookahead matches no rule.
   */
  consumeN(): Token<T> {
    const token = this._lookahead
    if (token == null) {
      throw new SyntaxError(`token is null`)
    }
    this._lookahead = this._consume()
    return token
  }

  /**
   * Reads the next named token at the cursor, discarding any skipped
   * (unnamed) tokens in front of it. Returns `null` at end of input.
   *
   * @throws {SyntaxError} If no rule matches at the cursor.
   */
  private _consume(): Token<T> | null {
    // Iterative rather than recursive: a long run of skipped tokens must not
    // grow the call stack.
    while (!this.eof) {
      const hit = this._matchAtCursor()
      if (hit == null) {
        throw new SyntaxError(`tokenizer unexpected token "${this._source[this._cursor]}"`)
      }
      this._cursor += hit.text.length
      const { name, text } = hit
      if (name != null && name.length > 0) {
        return { type: name, value: text }
      }
    }
    return null
  }

  /**
   * Finds the first rule that matches a non-empty string starting exactly at
   * the cursor. Does not move the cursor.
   */
  private _matchAtCursor(): { readonly name: T | null; readonly text: string } | null {
    const rest = this._source.slice(this._cursor)
    for (const [name, matcher] of this.tokens) {
      // Global/sticky regexes remember `lastIndex` between calls; every call
      // here runs against a fresh slice, so always start from 0.
      matcher.lastIndex = 0
      const result = matcher.exec(rest)
      // An unanchored regex may match further along the input; that is not a
      // token at the cursor.
      if (result == null || result.index !== 0) continue
      const text = result[0]
      // A zero-length match would never advance the cursor.
      if (text == null || text.length === 0) continue
      return { name, text }
    }
    return null
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment