Created
November 23, 2025 20:35
-
-
Save stanleyugwu/571db6d46a82a9bcb1d5b34525b3dc92 to your computer and use it in GitHub Desktop.
Mini Tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Marker that stands in for a space. A learned token may carry it only as its
// first character, so no token ever spans two words.
const WORD_BOUNDARY = "_";

// Upper bound on the number of merge rounds. Despite the name, this does not
// cap the vocabulary size: the single-character entries are added on top.
const maxTokens = 50;

/**
 * Finds the adjacent token pair that occurs most often.
 *
 * Pairs whose right-hand token starts with the word-boundary marker are
 * ignored, which keeps merges from gluing the end of one word to the start of
 * the next. Occurrences are counted at every index, so overlapping
 * occurrences (e.g. "a","a","a") are each counted.
 *
 * @param {string[]} tokens - Current token sequence.
 * @returns {{left: string, right: string, count: number} | null} The most
 *   frequent pair (first-seen wins on ties), or null if no pair is eligible.
 */
function findMostFrequentPair(tokens) {
  const counts = new Map();
  for (let i = 0; i < tokens.length - 1; i++) {
    const left = tokens[i];
    const right = tokens[i + 1];
    if (right.startsWith(WORD_BOUNDARY)) continue;
    // A JSON-encoded tuple is an unambiguous key whatever the tokens contain.
    const key = JSON.stringify([left, right]);
    let entry = counts.get(key);
    if (entry === undefined) {
      entry = { left, right, count: 0 };
      counts.set(key, entry);
    }
    entry.count += 1;
  }

  // Map iterates in insertion order, so a strict ">" keeps the first-seen
  // pair among equally frequent ones.
  let best = null;
  for (const entry of counts.values()) {
    if (best === null || entry.count > best.count) best = entry;
  }
  return best;
}

/**
 * Replaces every non-overlapping, left-to-right occurrence of the adjacent
 * pair (left, right) with the single concatenated token.
 *
 * @param {string[]} tokens - Current token sequence (not modified).
 * @param {string} left - First token of the pair.
 * @param {string} right - Second token of the pair.
 * @returns {string[]} A new token sequence with the pair merged.
 */
function mergePair(tokens, left, right) {
  const merged = left + right;
  const out = [];
  let i = 0;
  while (i < tokens.length) {
    const isPair =
      i + 1 < tokens.length && tokens[i] === left && tokens[i + 1] === right;
    if (isPair) {
      out.push(merged);
      i += 2; // consume both halves of the pair
    } else {
      out.push(tokens[i]);
      i += 1;
    }
  }
  return out;
}

/**
 * Learns byte-pair-encoding style merge rules from `text`.
 *
 * The text is split into Unicode code points, each distinct character gets an
 * id starting at 1, and then the most frequent adjacent pair is merged
 * repeatedly. Training stops after `maxMerges` rounds, or earlier when no
 * eligible pair occurs at least twice.
 *
 * @param {string} text - Training text, with spaces already replaced by "_".
 * @param {number} maxMerges - Maximum number of merge rounds to perform.
 * @returns {{tokens: string[], rules: string[], vocab: Object<string, number>}}
 *   `tokens` is the text after all merges, `rules` lists the merges in the
 *   order learned as "left,right" strings, and `vocab` maps token to id.
 */
function trainTokenizer(text, maxMerges) {
  // Array.from walks the string by code point, so characters outside the BMP
  // (emoji, etc.) stay intact instead of being split into surrogate halves.
  let tokens = Array.from(text);
  const vocab = {};
  const rules = [];
  let nextId = 1;

  // Seed the vocabulary with the individual characters, in first-seen order.
  for (const ch of tokens) {
    if (vocab[ch] === undefined) vocab[ch] = nextId++;
  }

  for (let round = 0; round < maxMerges; round++) {
    const best = findMostFrequentPair(tokens);
    // A pair seen only once is not worth a vocabulary slot.
    if (best === null || best.count < 2) break;

    vocab[best.left + best.right] = nextId++;
    rules.push(`${best.left},${best.right}`);
    tokens = mergePair(tokens, best.left, best.right);
  }

  return { tokens, rules, vocab };
}

const corpus = "the cat full overran the ran dog running do the needful".replace(
  / /g,
  WORD_BOUNDARY
);
const { rules, vocab } = trainTokenizer(corpus, maxTokens);

console.log(corpus);
console.log(rules);
console.log(vocab);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment