arg-parser: A stateless, minimally opinionated argument parser
arg-parser.ts
/**
 * A minimally opinionated argument parser with a stateless implementation.
 *
 * Author: James Abney
 * License: MIT
 */
import { Tokenizer, TokenSpec } from './tokenizer'
/** A bare positional argument. */
export interface IArg {
  type: 'arg'
  value: string
}

/** A long option (--name) and the arguments that follow it. */
export interface ILong {
  type: 'long'
  name: string
  args: string[]
}

/** A short option (-n, or a cluster like -abc) and the arguments that follow it. */
export interface IShort {
  type: 'short'
  name: string
  args: string[]
}

/** A key=value assignment. */
export interface IVar {
  type: 'var'
  var: [string, string]
}

/** The stop marker (--) and every argument after it. */
export interface IStop {
  type: 'stop'
  args: string[]
}

export type ArgType = IArg | ILong | IShort | IVar | IStop

/** The union of 'type' discriminants: 'arg' | 'long' | 'short' | 'var' | 'stop'. */
type TokenType = ArgType['type']
const tokens: TokenSpec<TokenType>[] = [
  [null, /^\s+/], // whitespace (discarded)
  ['long', /^--[A-Za-z][\w-]+/], // --name
  ['short', /^-[A-Za-z]+/], // -n or a cluster like -abc
  ['var', /^[A-Za-z]\w*=[^\s=]*/], // key=value
  ['stop', /^--(?=\s|$)/], // bare --
  ['arg', /^\S+/], // anything else
]
/**
 * Parses a command-line style string. With a callback, each parsed item is
 * passed to it eagerly; without one, a lazy iterator is returned.
 */
export function parseArgs(source: string): IterableIterator<ArgType>
export function parseArgs(source: string, cb: (item: ArgType) => void): void
export function parseArgs(source: string, cb?: (item: ArgType) => void): IterableIterator<ArgType> | void {
  if (typeof cb === 'function') {
    return parseArgsCallback(source, cb)
  }
  return parseArgsGenerator(source)
}
function parseArgsCallback(source: string, cb: (item: ArgType) => void): void {
  // Delegate to the generator so the parsing logic lives in one place.
  for (const item of parseArgsGenerator(source)) {
    cb(item)
  }
}
function* parseArgsGenerator(source: string): IterableIterator<ArgType> {
  const t = new Tokenizer<TokenType>(source, tokens)
  while (t.lookahead != null) {
    const token = t.consumeN()
    switch (token.type) {
      case 'arg':
        yield { type: 'arg', value: token.value }
        break
      case 'long':
        // Strip the leading "--" and gather any trailing arguments.
        yield { type: 'long', name: token.value.slice(2), args: [...consumeArgs(t)] }
        break
      case 'short':
        // Strip the leading "-" and gather any trailing arguments.
        yield { type: 'short', name: token.value.slice(1), args: [...consumeArgs(t)] }
        break
      case 'var':
        // The var regex forbids "=" in the value, so the split yields two parts.
        yield { type: 'var', var: token.value.split('=') as [string, string] }
        break
      case 'stop':
        yield { type: 'stop', args: [...consumeArgs(t)] }
        break
      default:
        throw new SyntaxError(`unrecognized token type "${token.type}"`)
    }
  }
}
/** Consumes consecutive 'arg' tokens, e.g. the values that follow an option. */
function* consumeArgs(t: Tokenizer<TokenType>): IterableIterator<string> {
  while (t.lookahead?.type === 'arg') {
    yield t.consumeN().value
  }
}
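
A quick usage sketch (not part of the gist): the input string and log statements below are illustrative, and assume both files sit side by side so that './arg-parser' resolves.

import { parseArgs } from './arg-parser'

// Lazy form: iterate parsed items as they are produced.
for (const item of parseArgs('build src -v --out dir1 dir2 NODE_ENV=production -- rest')) {
  switch (item.type) {
    case 'arg':
      console.log('arg:', item.value) // "build", then "src"
      break
    case 'short':
      console.log('short:', item.name, item.args) // "v", []
      break
    case 'long':
      console.log('long:', item.name, item.args) // "out", ["dir1", "dir2"]
      break
    case 'var':
      console.log('var:', item.var) // ["NODE_ENV", "production"]
      break
    case 'stop':
      console.log('stop:', item.args) // ["rest"]
      break
  }
}

// Eager form: the callback is invoked once per parsed item.
parseArgs('--out dir', item => console.log(item))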
tokenizer.ts
/**
 * A single token lookahead tokenizer.
 *
 * Author: James Abney
 * License: MIT
 */
export type Token<T extends string = string> = { readonly type: T; readonly value: string }

// Tokens with a null (or empty) name are matched but discarded, e.g. whitespace.
export type TokenSpec<T extends string = string> = readonly [name: T | null, match: RegExp]
export class Tokenizer<T extends string = string> {
  private readonly tokens: readonly TokenSpec<T>[]
  private readonly _source: string
  private _cursor = 0
  private _lookahead: Token<T> | null = null

  constructor(source: string, tokens: TokenSpec<T>[]) {
    this._source = source
    this.tokens = tokens.slice() // defensive copy
    this._lookahead = this._consume() // prime the lookahead
  }
  get source(): string {
    return this._source
  }

  get cursor(): number {
    return this._cursor
  }

  get eof(): boolean {
    return this._cursor >= this._source.length
  }

  get lookahead(): Token<T> | null {
    return this._lookahead
  }
  /**
   * Consumes the lookahead token and advances. If a token type is specified,
   * it must match the consumed token's type; at end of input, returns null.
   *
   * @throws {SyntaxError} if the consumed token's type does not match `type`.
   */
  consume(): Token<T> | null
  consume(type: T): Token<T> | null
  consume(type?: T): Token<T> | null {
    const token = this._lookahead
    this._lookahead = this._consume()
    if (type == null) {
      return token
    }
    if (token == null) {
      return null
    }
    if (token.type === type) {
      return token
    }
    throw new SyntaxError(`expected token "${type}", received "${token.type}"`)
  }
  /**
   * Consumes the lookahead token like consume(), but throws instead of
   * returning null at end of input.
   *
   * @throws {SyntaxError}
   */
  consumeN(): Token<T> {
    const token = this._lookahead
    this._lookahead = this._consume()
    if (token == null) {
      throw new SyntaxError(`token is null`)
    }
    return token
  }
  private _consume(): Token<T> | null {
    if (this._cursor > this._source.length) {
      throw new SyntaxError(`tokenizer cursor overrun`)
    }
    if (this.eof) return null
    // The remaining input is the same for every spec, so slice it once.
    const str = this._source.slice(this._cursor)
    for (const [name, matcher] of this.tokens) {
      const match = matcher.exec(str)?.[0]
      const skip = name == null || name.length === 0
      if (match != null) {
        this._cursor += match.length
        if (skip) {
          // Discard nameless tokens (e.g. whitespace) and keep scanning.
          return this._consume()
        }
        return { type: name as T, value: match }
      }
    }
    throw new SyntaxError(`tokenizer unexpected token "${this._source[this._cursor]}"`)
  }
}
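
The tokenizer also stands on its own; here is a minimal sketch with a made-up token spec (the 'num'/'word' names and regexes are assumptions for illustration, not part of the gist).

import { Tokenizer, TokenSpec } from './tokenizer'

type Kind = 'num' | 'word'

// Every regex is anchored with ^ so matching starts at the cursor.
const spec: TokenSpec<Kind>[] = [
  [null, /^\s+/], // whitespace: matched but discarded
  ['num', /^\d+/],
  ['word', /^[A-Za-z]+/],
]

const t = new Tokenizer<Kind>('abc 123', spec)
while (t.lookahead != null) {
  const { type, value } = t.consumeN()
  console.log(type, value) // "word abc", then "num 123"
}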