// @ts-check const CURLY_OPEN = '{'.charCodeAt(0); const CURLY_CLOSE = '}'.charCodeAt(0); const SQUARE_OPEN = '['.charCodeAt(0); const SQUARE_CLOSE = ']'.charCodeAt(0); const DOUBLE_QUOTE = '"'.charCodeAt(0); const CHARACTER_COLON = ':'.charCodeAt(0); const CHARACTER_COMMA = ','.charCodeAt(0); const CHARACTER_BACKSLASH = '\\'.charCodeAt(0); const CHARACTER_LOWER_E = 'e'.charCodeAt(0); const CHARACTER_LOWER_F = 'f'.charCodeAt(0); const CHARACTER_LOWER_N = 'n'.charCodeAt(0); const CHARACTER_LOWER_T = 't'.charCodeAt(0); const CHARACTER_UPPER_E = 'E'.charCodeAt(0); const CHARACTER_0 = '0'.charCodeAt(0); const CHARACTER_9 = '9'.charCodeAt(0); const CHARACTER_DECIMAL = '.'.charCodeAt(0); const CHARACTER_PLUS = '+'.charCodeAt(0); const CHARACTER_MINUS = '-'.charCodeAt(0); const WHITESPACE_SPACE = 0x20; const WHITESPACE_HORIZONTAL_TAB = 0x09; const WHITESPACE_NL = 0x0a; const WHITESPACE_CR = 0x0d; const WRITE_REJECTED = tagLiteral(0, 'writeResult'); const WRITE_ACCEPTED = tagLiteral(1, 'writeResult'); const WRITE_COMPLETE = tagLiteral(2, 'writeResult'); /** @typedef {typeof WRITE_REJECTED | typeof WRITE_ACCEPTED | typeof WRITE_COMPLETE} WriteResult */ /** * @template T * @template {string} Tag * @param {T} val * @param {Tag} tagName * @returns {T & { __tag: Tag }} */ function tagLiteral(val, tagName) { return /** @type {T & { __tag: Tag }} */ (val); } const STATE_INITIAL = tagLiteral(0, 'state'); const STATE_CONSUMING = tagLiteral(1, 'state'); const STATE_COMPLETE = tagLiteral(2, 'state'); /** @typedef {typeof STATE_INITIAL | typeof STATE_CONSUMING | typeof STATE_COMPLETE} ParserState */ /** * @implements {TypedDecoder} */ class ValueDecoder { /** * @type {TypedDecoder | null} */ #typedDecoder = null; /** @type {ParserState} */ #state = STATE_INITIAL; #bytesWritten = 0; write(byte) { this.#bytesWritten++; switch (this.#state) { case STATE_INITIAL: { switch (byte) { case CURLY_OPEN: { this.#typedDecoder = new ObjectDecoder(); this.#state = STATE_CONSUMING; return WRITE_ACCEPTED; } case SQUARE_OPEN: { this.#typedDecoder = new ArrayDecoder(); this.#state = STATE_CONSUMING; return WRITE_ACCEPTED; } case DOUBLE_QUOTE: { this.#typedDecoder = new StringDecoder(); this.#state = STATE_CONSUMING; return WRITE_ACCEPTED; } case CHARACTER_LOWER_F: { this.#typedDecoder = new FalseDecoder(); this.#state = STATE_CONSUMING; return WRITE_ACCEPTED; } case CHARACTER_LOWER_N: { this.#typedDecoder = new NullDecoder(); this.#state = STATE_CONSUMING; return WRITE_ACCEPTED; } case CHARACTER_LOWER_T: { this.#typedDecoder = new TrueDecoder(); this.#state = STATE_CONSUMING; return WRITE_ACCEPTED; } } if ( byte === CHARACTER_MINUS || byte === CHARACTER_PLUS || (byte >= CHARACTER_0 && byte <= CHARACTER_9) ) { this.#typedDecoder = new NumberDecoder(); this.#typedDecoder.write(byte); this.#state = STATE_CONSUMING; return WRITE_ACCEPTED; } if (isWhitespace(byte)) { return WRITE_ACCEPTED; } throw new Error(`Unexpected character: ${String.fromCharCode(byte)}`); } case STATE_CONSUMING: { invariant(this.#typedDecoder, 'Missing a typed decoder'); const decoderResult = this.#typedDecoder.write(byte); if (decoderResult === WRITE_COMPLETE) { this.#state = STATE_COMPLETE; } else if (decoderResult === WRITE_REJECTED) { // The NumberDecoder rejected the write suggesting the potential end of the number. // Let's mark decoding as completed and allow a parent decoder to do the back-tracking. this.#state = STATE_COMPLETE; } return decoderResult; } case STATE_COMPLETE: { if (!isWhitespace(byte)) { throw new Error('Expected whitespace'); } return WRITE_ACCEPTED; } } throw new Error('Unexpected character'); } /** * * @returns {unknown} */ finalize() { invariant(this.#typedDecoder, 'Attempting to finalize without a typed decoder'); return this.#typedDecoder.finalize(); } } /** * @implements {TypedDecoder<false>} */ class FalseDecoder { static #expect = ['a'.charCodeAt(0), 'l'.charCodeAt(0), 's'.charCodeAt(0), 'e'.charCodeAt(0)]; #nextIndex = 0; write(byte) { if (this.#nextIndex >= FalseDecoder.#expect.length) { if (isWhitespace(byte)) { return WRITE_ACCEPTED; } throw new Error(`Unexpected character, got ${byte}`); } if (this.#nextIndex === 0 && isWhitespace(byte)) { return WRITE_ACCEPTED; } const expectedByte = FalseDecoder.#expect[this.#nextIndex]; this.#nextIndex++; if (byte !== expectedByte) { throw new Error(`Expecting character ${JSON.stringify(String.fromCharCode(expectedByte))}`); } if (this.#nextIndex === FalseDecoder.#expect.length) { return WRITE_COMPLETE; } return WRITE_ACCEPTED; } /** * * @returns {false} */ finalize() { return false; } } /** * @implements {TypedDecoder<null>} */ class NullDecoder { static #expect = ['u'.charCodeAt(0), 'l'.charCodeAt(0), 'l'.charCodeAt(0)]; #nextIndex = 0; write(byte) { if (this.#nextIndex >= NullDecoder.#expect.length) { if (isWhitespace(byte)) { return WRITE_ACCEPTED; } throw new Error(`Unexpected character, got ${byte}`); } if (this.#nextIndex === 0 && isWhitespace(byte)) { return WRITE_ACCEPTED; } const expectedByte = NullDecoder.#expect[this.#nextIndex]; this.#nextIndex++; if (byte !== expectedByte) { throw new Error(`Expecting character ${JSON.stringify(String.fromCharCode(expectedByte))}`); } if (this.#nextIndex === NullDecoder.#expect.length) { return WRITE_COMPLETE; } return WRITE_ACCEPTED; } /** * * @returns {null} */ finalize() { return null; } } /** * @implements {TypedDecoder<true>} */ class TrueDecoder { static #expect = ['r'.charCodeAt(0), 'u'.charCodeAt(0), 'e'.charCodeAt(0)]; #nextIndex = 0; write(byte) { if (this.#nextIndex >= TrueDecoder.#expect.length) { if (isWhitespace(byte)) { return WRITE_ACCEPTED; } throw new Error(`Unexpected character, got ${byte}`); } if (this.#nextIndex === 0 && isWhitespace(byte)) { return WRITE_ACCEPTED; } const expectedByte = TrueDecoder.#expect[this.#nextIndex]; this.#nextIndex++; if (byte !== expectedByte) { throw new Error(`Expecting character ${JSON.stringify(String.fromCharCode(expectedByte))}`); } if (this.#nextIndex === TrueDecoder.#expect.length) { return WRITE_COMPLETE; } return WRITE_ACCEPTED; } /** * * @returns {true} */ finalize() { return true; } } /** * @implements {TypedDecoder<string>} */ class StringDecoder { /** * @readonly * @type {Array<number>} */ #bytes = []; #escaped = false; /** * @type {ParserState} */ #state = STATE_CONSUMING; write(byte) { invariant( this.#state === STATE_CONSUMING, 'Must not write bytes to a StringDecoder after it has finalized' ); // End of the string if (!this.#escaped && byte === DOUBLE_QUOTE) { this.#state = STATE_COMPLETE; this.#escaped = false; // Probably unnecessary return WRITE_COMPLETE; } if (!this.#escaped && byte === CHARACTER_BACKSLASH) { this.#escaped = true; } else { this.#escaped = false; } this.#bytes.push(byte); return WRITE_ACCEPTED; } finalize() { invariant( this.#state === STATE_COMPLETE, "Attempting to finalize a string that isn't in the complete state" ); const decoder = new TextDecoder('utf-8'); const buf = Uint8Array.from(this.#bytes); const str = decoder.decode(buf); const parsed = JSON.parse(`"${str}"`); invariant(typeof parsed === 'string', 'Finalizing a string resulted in a non-string type'); return parsed; } } const ARRAY_STATE_BEFORE_VALUE = tagLiteral(0, 'arrayState'); const ARRAY_STATE_VALUE = tagLiteral(1, 'arrayState'); const ARRAY_STATE_AFTER_VALUE = tagLiteral(2, 'arrayState'); /** @typedef {typeof ARRAY_STATE_BEFORE_VALUE | typeof ARRAY_STATE_VALUE | typeof ARRAY_STATE_AFTER_VALUE} ArrayParserState */ /** * @implements {TypedDecoder<Array<unknown>>} */ class ArrayDecoder { /** * @readonly * @type {Array<unknown>} */ #arr = []; /** * @type {ArrayParserState} */ #arrayState = ARRAY_STATE_BEFORE_VALUE; /** * @type {ParserState} */ #state = STATE_CONSUMING; /** * @type {ValueDecoder | null} */ #valueDecoder = new ValueDecoder(); write(byte) { invariant( this.#state === STATE_CONSUMING, 'Invalid to call write() when not in consuming state' ); switch (this.#arrayState) { case ARRAY_STATE_BEFORE_VALUE: { if (isWhitespace(byte)) { return WRITE_ACCEPTED; } if (byte === SQUARE_CLOSE) { this.#state = STATE_COMPLETE; return WRITE_COMPLETE; } this.#arrayState = ARRAY_STATE_VALUE; this.#valueDecoder = new ValueDecoder(); this.#valueDecoder.write(byte); return WRITE_ACCEPTED; } case ARRAY_STATE_VALUE: { invariant(this.#valueDecoder, 'Expected to have a value decoder while in value state'); const ret = this.#valueDecoder.write(byte); if (ret === WRITE_COMPLETE) { this.#arr.push(this.#valueDecoder.finalize()); this.#valueDecoder = null; this.#arrayState = ARRAY_STATE_AFTER_VALUE; return WRITE_ACCEPTED; } else if (ret === WRITE_ACCEPTED) { return WRITE_ACCEPTED; } this.#arr.push(this.#valueDecoder.finalize()); this.#valueDecoder = null; this.#arrayState = ARRAY_STATE_AFTER_VALUE; // Intentionally allow ourselves to fall through here because the // ValueDecoder didn't accept the value. } case ARRAY_STATE_AFTER_VALUE: { if (isWhitespace(byte)) { return WRITE_ACCEPTED; } switch (byte) { case SQUARE_CLOSE: { // End of the object this.#state = STATE_COMPLETE; return WRITE_COMPLETE; } case CHARACTER_COMMA: { this.#arrayState = ARRAY_STATE_BEFORE_VALUE; return WRITE_ACCEPTED; } } } } throw new Error('Unexpected character'); } finalize() { invariant( this.#state === STATE_COMPLETE, "Attempting to finalize a array that isn't in the complete state" ); return this.#arr; } } const NUMBER_STATE_SIGN = tagLiteral(0, 'numberState'); const NUMBER_STATE_INT = tagLiteral(1, 'numberState'); const NUMBER_STATE_FRACTION = tagLiteral(2, 'numberState'); const NUMBER_STATE_EXPONENT = tagLiteral(3, 'numberState'); /** @typedef {typeof NUMBER_STATE_SIGN | typeof NUMBER_STATE_INT | typeof NUMBER_STATE_FRACTION | typeof NUMBER_STATE_EXPONENT} NumberParserState */ /** * @implements {TypedDecoder<number>} */ class NumberDecoder { /** * @readonly * @type {Array<number>} */ #bytes = []; /** * @type {ParserState} */ #state = STATE_CONSUMING; /** * @type {NumberParserState} */ #numberState = NUMBER_STATE_SIGN; write(byte) { invariant( this.#state === STATE_CONSUMING, 'Invalid to call write() when not in consuming state' ); switch (this.#numberState) { case NUMBER_STATE_SIGN: { // Sign or no sign, we're going to be in the next state after this character. this.#numberState = NUMBER_STATE_INT; if (byte === CHARACTER_MINUS || byte === CHARACTER_PLUS) { this.#bytes.push(byte); return WRITE_ACCEPTED; } // Intentionally fall-through since we didn't see a sign byte. } case NUMBER_STATE_INT: { if (byte >= CHARACTER_0 && byte <= CHARACTER_9) { this.#bytes.push(byte); return WRITE_ACCEPTED; } if (byte === CHARACTER_DECIMAL) { this.#bytes.push(byte); this.#numberState = NUMBER_STATE_FRACTION; return WRITE_ACCEPTED; } break; } case NUMBER_STATE_FRACTION: { if (byte >= CHARACTER_0 && byte <= CHARACTER_9) { this.#bytes.push(byte); return WRITE_ACCEPTED; } if (byte === CHARACTER_LOWER_E || byte === CHARACTER_UPPER_E) { this.#bytes.push(byte); this.#numberState = NUMBER_STATE_EXPONENT; return WRITE_ACCEPTED; } break; } case NUMBER_STATE_EXPONENT: { if (byte >= CHARACTER_0 && byte <= CHARACTER_9) { this.#bytes.push(byte); return WRITE_ACCEPTED; } break; } } this.#state = STATE_COMPLETE; return WRITE_REJECTED; } finalize() { // invariant( // this.#state === STATE_COMPLETE, // 'Invalid to call finalize() when not in complete state' // ); // Since numbers have no terminal token, we allow them to be finalized any time. const decoder = new TextDecoder('utf-8'); const buf = Uint8Array.from(this.#bytes); const str = decoder.decode(buf); const parsed = JSON.parse(str); invariant( typeof parsed === 'number' && Number.isFinite(parsed) && !Number.isNaN(parsed), 'Invalid number' ); return parsed; } } const OBJECT_STATE_BEFORE_KEY = tagLiteral(0, 'objectState'); const OBJECT_STATE_KEY = tagLiteral(1, 'objectState'); const OBJECT_STATE_AFTER_KEY = tagLiteral(2, 'objectState'); const OBJECT_STATE_VALUE = tagLiteral(3, 'objectState'); const OBJECT_STATE_AFTER_VALUE = tagLiteral(4, 'objectState'); /** @typedef {typeof OBJECT_STATE_BEFORE_KEY | typeof OBJECT_STATE_KEY | typeof OBJECT_STATE_AFTER_KEY | typeof OBJECT_STATE_VALUE | typeof OBJECT_STATE_AFTER_VALUE } ObjectParserState */ /** * @implements {TypedDecoder<Record<string, unknown>>} */ class ObjectDecoder { /** * @readonly * @type {Record<string, unknown>} */ #obj = {}; /** * @type {ObjectParserState} */ #objectState = OBJECT_STATE_BEFORE_KEY; /** * @type {ParserState} */ #state = STATE_CONSUMING; /** * @type {StringDecoder | null} */ #keyDecoder = null; /** * @type {string | null} */ #key = null; /** * @type {ValueDecoder | null} */ #valueDecoder = new ValueDecoder(); write(byte) { invariant( this.#state === STATE_CONSUMING, 'Invalid to call write() when not in consuming state' ); switch (this.#objectState) { case OBJECT_STATE_BEFORE_KEY: { if (isWhitespace(byte)) { return WRITE_ACCEPTED; } if (byte === CURLY_CLOSE) { this.#state = STATE_COMPLETE; return WRITE_COMPLETE; } if (byte !== DOUBLE_QUOTE) { throw new Error('Expecting double quote'); } this.#objectState = OBJECT_STATE_KEY; this.#keyDecoder = new StringDecoder(); return WRITE_ACCEPTED; } case OBJECT_STATE_KEY: { invariant(this.#keyDecoder, 'Expected to have a string decoder while in key parsing mode'); const keyState = this.#keyDecoder.write(byte); if (keyState === WRITE_COMPLETE) { this.#key = this.#keyDecoder.finalize(); this.#keyDecoder = null; this.#objectState = OBJECT_STATE_AFTER_KEY; } return WRITE_ACCEPTED; } case OBJECT_STATE_AFTER_KEY: { if (isWhitespace(byte)) { return WRITE_ACCEPTED; } if (byte !== CHARACTER_COLON) { throw new Error("Expected ':'"); } this.#objectState = OBJECT_STATE_VALUE; this.#valueDecoder = new ValueDecoder(); return WRITE_ACCEPTED; } case OBJECT_STATE_VALUE: { invariant( this.#valueDecoder, 'Expected to have a value decoder while in value parsing mode' ); const ret = this.#valueDecoder.write(byte); if (ret === WRITE_COMPLETE) { invariant(this.#key != null, 'Expected to have a stored key'); this.#obj[this.#key] = this.#valueDecoder.finalize(); this.#valueDecoder = null; this.#objectState = OBJECT_STATE_AFTER_VALUE; return WRITE_ACCEPTED; } else if (ret === WRITE_ACCEPTED) { return WRITE_ACCEPTED; } // The value decoder didn't accept the character. That we means we need // to finalize it and fall through. invariant(this.#key != null, 'Expected to have a stored key'); this.#obj[this.#key] = this.#valueDecoder.finalize(); this.#valueDecoder = null; this.#objectState = OBJECT_STATE_AFTER_VALUE; // Intentionally fall through } case OBJECT_STATE_AFTER_VALUE: { if (isWhitespace(byte)) { return WRITE_ACCEPTED; } switch (byte) { case CURLY_CLOSE: { // End of the object this.#state = STATE_COMPLETE; return WRITE_COMPLETE; } case CHARACTER_COMMA: { this.#objectState = OBJECT_STATE_BEFORE_KEY; return WRITE_ACCEPTED; } } } } throw new Error('Unexpected character'); } finalize() { invariant( this.#state === STATE_COMPLETE, "Attempting to finalize a object that isn't in the complete state" ); return this.#obj; } } function isWhitespace(byte) { switch (byte) { case WHITESPACE_CR: case WHITESPACE_HORIZONTAL_TAB: case WHITESPACE_NL: case WHITESPACE_SPACE: return true; } return false; } /** * Check an assertion for truthiness in a way that TypeScript's flow control will * be able to reason about. * * @param {unknown} test * @param {string} message * @returns {asserts test} */ function invariant(test, message) { if (!test) { throw new Error(`Invariant violation: ${message}`); } } /** * @template [DecodedValue=unknown] * @typedef TypedDecoder * @property {(byte: number) => WriteResult} write * @property {() => DecodedValue} finalize */ /* -------------------- TESTS -------------------- */ const assert = require('assert'); const { createReadStream, readFileSync } = require('fs'); /** * * @param {Uint8Array} buf * @param {unknown} wantedValue */ function test(buf, wantedValue) { const valueDecoder = new ValueDecoder(); let lastResult = 0; for (let i = 0; i < buf.byteLength; i++) { try { lastResult = valueDecoder.write(buf[i]); } catch (err) { console.error('Parsing failed', { offset: i, prelude: new TextDecoder().decode(buf.slice(0, i)), decoder: valueDecoder, }); throw err; } } let finalValue; try { finalValue = valueDecoder.finalize(); } catch (err) { console.error('Finalize failed', { decoder: valueDecoder, }); throw err; } assert.deepEqual(finalValue, wantedValue); } const textEncoder = new TextEncoder(); const numbers = [1, -1, 1.0, -1.0, 1.0e10, -1.0e10]; for (const number of numbers) { test(textEncoder.encode(JSON.stringify(number)), number); } test(textEncoder.encode(`${JSON.stringify(true)}`), true); test(textEncoder.encode(`${JSON.stringify(false)}`), false); test(textEncoder.encode(`${JSON.stringify(null)}`), null); const strings = [ 'hello world', 'Hello with some \\ escape characters and fake terminals ".', '', ' \t\r\n ', ]; for (const string of strings) { test(textEncoder.encode(JSON.stringify(string)), string); } const objects = [{ 'hello world': false }, { nested: { object: true } }]; for (const object of objects) { test(textEncoder.encode(JSON.stringify(object)), object); test(textEncoder.encode(JSON.stringify(object, null, 2)), object); test(textEncoder.encode(JSON.stringify(object, null, '\t')), object); } const arrays = [[{ 'hello world': false }, { nested: { object: true } }], 12, 1e8, -9.000001]; for (const arr of arrays) { test(textEncoder.encode(JSON.stringify(arr)), arr); test(textEncoder.encode(JSON.stringify(arr, null, 2)), arr); test(textEncoder.encode(JSON.stringify(arr, null, '\t')), arr); } const maxIters = 10000; const start = Date.now(); const canadaJsonBytes = readFileSync('./Canada.json'); invariant(Buffer.isBuffer(canadaJsonBytes), 'must be a buffer'); const canadaJson = JSON.parse(canadaJsonBytes.toString()); console.log('Initial parse', Date.now() - start); const iterStart = Date.now(); for (let iter = 0; iter < maxIters; iter++) { const d = new ValueDecoder(); for (let i = 0; i < canadaJsonBytes.byteLength; i++) { try { d.write(canadaJsonBytes[i]); } catch (err) { console.error('Parsing failed', { offset: i, prelude: new TextDecoder().decode(canadaJsonBytes.slice(0, i)), decoder: d, }); throw err; } } d.finalize(); console.log('Average cycle time after %d iters', iter, (Date.now() - iterStart) / (iter + 1)); }