Last active
June 24, 2025 16:32
-
-
Save James-E-A/b3457c568c9c61d671bb1df46997ef9d to your computer and use it in GitHub Desktop.
JavaScript CSV parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sentinel tokens passed from the meta tokenizer to the core iterator.
// They are Symbols so they can never compare equal to field text, which is
// always a string.
/** Marks the end of one field within a record (description: U+001F). */
const END_OF_FIELD = Symbol('\u001f');
/** Marks the end of one record (description: U+001E). */
const END_OF_RECORD = Symbol('\u001e');
/**
 * Public iterator: parses CSV text and yields one plain object per data record.
 *
 * The first record is used as the header row. Each later record becomes an
 * object mapping header names to field values. When a record has a different
 * number of fields than the header, fields are still mapped by position and
 * any field without a header name is keyed `_N` (1-based column number).
 *
 * @param {string | Blob | Response | Iterable<string> | AsyncIterable<string>} text_stream
 *   CSV source. A string is parsed as a single chunk. A Blob or Response is
 *   decoded using the charset named in its MIME type (UTF-8 when absent).
 *   Anything else is passed through as an (async) iterable of string chunks.
 * @yields {Object<string, string>} One object per data record.
 * @throws {RangeError} If the declared charset is not a supported encoding label.
 * @throws {TypeError} If the bytes are not valid in the chosen encoding (`fatal: true`).
 */
export async function* csv_iterate(text_stream) {
  // NOTE(review): `ignoreBOM: true` keeps a leading byte order mark in the
  // decoded text, so it would end up inside the first header name. Confirm
  // this is intended before changing it.
  const decoderOptions = { fatal: true, ignoreBOM: true };

  // Pull the charset parameter out of a MIME type string. Most MIME types
  // carry no charset at all, so the match result must be null-checked. The
  // value stops at `;` so following parameters are not swallowed.
  const charsetOf = (mimeType) =>
    mimeType?.match(/;\s*charset=("?)(?<value>[^\s;"]+)\1/i)?.groups.value ?? 'utf-8';

  let chunks = text_stream;
  if (typeof text_stream === 'string') {
    chunks = [text_stream];
  } else if (text_stream instanceof Blob) {
    chunks = text_stream
      .stream()
      .pipeThrough(new TextDecoderStream(charsetOf(text_stream.type), decoderOptions));
  } else if (text_stream instanceof Response) {
    chunks = text_stream.body.pipeThrough(
      new TextDecoderStream(charsetOf(text_stream.headers.get('content-type')), decoderOptions),
    );
  }

  const rows = csv_iterate_1(chunks);
  const { value: keys, done } = await rows.next();
  if (done) {
    // No header row at all: nothing to yield.
    return;
  }

  for await (const row of rows) {
    if (row.length === keys.length) {
      yield Object.fromEntries(keys.map((key, i) => [key, row[i]]));
    } else {
      // Ragged record: keep every value, inventing names for extra columns.
      yield Object.fromEntries(row.map((value, i) => [keys[i] ?? `_${i + 1}`, value]));
    }
  }
}
/**
 * Core iterator: assembles the token stream from `csv_iterate_2` into records.
 *
 * String tokens are appended to the field being built. `END_OF_FIELD` closes
 * the current field, and `END_OF_RECORD` closes the field and the record.
 *
 * @param {Iterable<string> | AsyncIterable<string>} text_stream
 *   Chunks of CSV text, forwarded unchanged to `csv_iterate_2`.
 * @yields {string[]} One array of field values per record.
 */
async function* csv_iterate_1(text_stream) {
  let field = '';
  let row = [];
  for await (const token of csv_iterate_2(text_stream)) {
    if (token === END_OF_FIELD) {
      row.push(field);
      field = '';
    } else if (token === END_OF_RECORD) {
      row.push(field);
      field = '';
      yield row;
      row = [];
    } else {
      field += token;
    }
  }
  // End of input: flush a final record that had no terminating line break.
  // If nothing has accumulated since the last END_OF_RECORD (empty input, or
  // a file that ends with a newline) there is no record to flush; emitting
  // one would add a spurious `[""]` row. Known trade-off: a final,
  // unterminated line consisting only of `""` looks the same from here and
  // is skipped as well.
  if (row.length > 0 || field !== '') {
    row.push(field);
    yield row;
  }
}
/**
 * Meta tokenizer: interprets quoting on the raw tokens from `csv_iterate_3`.
 *
 * Outside quotes, `,` becomes `END_OF_FIELD` and `\n` / `\r\n` become
 * `END_OF_RECORD`. Inside quotes, every token is passed through as literal
 * text, and a doubled quote (`""`) is emitted as a single `"`. The quote
 * characters that open and close a quoted section are not emitted.
 *
 * @param {Iterable<string> | AsyncIterable<string>} text_stream
 *   Chunks of CSV text, forwarded unchanged to `csv_iterate_3`.
 * @yields {string | symbol} Field text, or one of the two sentinel symbols.
 */
async function* csv_iterate_2(text_stream) {
  const UNQUOTED = 0;
  const QUOTED = 1;
  // Just saw `"` while QUOTED: either the closing quote or the first half of
  // an escaped `""`. The next token decides which.
  const QUOTE_SEEN = 2;

  let state = UNQUOTED;
  for await (const s of csv_iterate_3(text_stream)) {
    if (state === QUOTED) {
      if (s === '"') {
        state = QUOTE_SEEN;
      } else {
        yield s;
      }
      continue;
    }

    if (state === QUOTE_SEEN) {
      if (s === '"') {
        // Escaped quote: emit one literal `"` and stay inside the quotes.
        yield s;
        state = QUOTED;
        continue;
      }
      // The previous `"` closed the quoted section. Handle the current token
      // as ordinary unquoted input below.
      state = UNQUOTED;
    }

    switch (s) {
      case ',':
        yield END_OF_FIELD;
        break;
      case '\n':
      case '\r\n':
        yield END_OF_RECORD;
        break;
      case '"':
        state = QUOTED;
        break;
      default:
        yield s;
    }
  }
}
/**
 * Raw tokenizer: splits chunks of CSV text into `,`, `"`, line-break tokens
 * (`\n` or `\r\n`) and runs of any other text.
 *
 * A `\r` that is the last character of a chunk is held back and prepended to
 * the next chunk, so a CRLF split across a chunk boundary is still emitted
 * as one `\r\n` token. A `\r` that turns out not to be followed by `\n` is
 * emitted as ordinary text.
 *
 * @param {Iterable<string> | AsyncIterable<string>} text_stream Chunks of CSV text.
 * @yields {string} Raw tokens. Concatenated in order they reproduce the input.
 */
async function* csv_iterate_3(text_stream) {
  // Built once. `matchAll` works on an internal clone, so reuse is safe.
  const TOKEN = /(?:,|\r?\n|"|.+?(?=,|\r?\n|"|$))/gs;
  // Either '' or a single '\r' carried over from the end of the previous chunk.
  let pendingCr = '';
  for await (const chunk of text_stream) {
    let text = pendingCr + chunk;
    pendingCr = '';
    if (text.endsWith('\r')) {
      // It is not yet known whether this is the first half of a CRLF.
      pendingCr = '\r';
      text = text.slice(0, -1);
    }
    for (const [token] of text.matchAll(TOKEN)) {
      yield token;
    }
  }
  if (pendingCr) {
    // The input ended on a bare CR. It was not a split CRLF, but it is still data.
    yield pendingCr;
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.