Skip to content

Instantly share code, notes, and snippets.

@stanleyugwu
Created November 23, 2025 20:35
Show Gist options
  • Select an option

  • Save stanleyugwu/571db6d46a82a9bcb1d5b34525b3dc92 to your computer and use it in GitHub Desktop.

Select an option

Save stanleyugwu/571db6d46a82a9bcb1d5b34525b3dc92 to your computer and use it in GitHub Desktop.
Mini Tokenizer
// Mini tokenizer: learns BPE-style merge rules from a small corpus by
// repeatedly fusing the most frequent pair of adjacent tokens.

// Every space in the input is rewritten to this character, so a token that
// starts with it is the first token of a word.
const WORD_BOUNDARY = "_";

/**
 * Finds the adjacent token pair that occurs most often.
 *
 * Pairs whose right-hand token starts a new word are ignored so that a merge
 * can never glue the end of one word onto the start of the next. Ties are
 * resolved in favour of the pair that was seen first.
 *
 * @param {string[]} tokens - Current token sequence.
 * @returns {{left: string, right: string, count: number} | null}
 *   The winning pair, or null when no pair is eligible.
 */
function findMostFrequentPair(tokens) {
  const counts = new Map();
  for (let i = 0; i < tokens.length - 1; i++) {
    const left = tokens[i];
    const right = tokens[i + 1];
    // Checking only for the bare boundary character is not enough: once a
    // token such as "_the" exists it also marks the start of a word.
    if (right.startsWith(WORD_BOUNDARY)) continue;
    // JSON encoding keeps the key unambiguous whatever text the tokens hold.
    const key = JSON.stringify([left, right]);
    const entry = counts.get(key);
    if (entry) {
      entry.count++;
    } else {
      counts.set(key, { left, right, count: 1 });
    }
  }

  let best = null;
  // Map iteration follows insertion order, and the strict ">" keeps the
  // earliest pair when several share the highest count.
  for (const entry of counts.values()) {
    if (best === null || entry.count > best.count) best = entry;
  }
  return best;
}

/**
 * Returns a copy of `tokens` in which every left-to-right, non-overlapping
 * occurrence of `left` followed by `right` is replaced by `merged`.
 *
 * @param {string[]} tokens
 * @param {string} left
 * @param {string} right
 * @param {string} merged
 * @returns {string[]}
 */
function mergePair(tokens, left, right, merged) {
  const out = [];
  for (let i = 0; i < tokens.length; i++) {
    if (tokens[i] === left && tokens[i + 1] === right) {
      out.push(merged);
      i++; // the right-hand token has been consumed as well
    } else {
      out.push(tokens[i]);
    }
  }
  return out;
}

/**
 * Learns merge rules and a vocabulary from `text`.
 *
 * Spaces are replaced by "_", the text is split into characters, each distinct
 * character gets an id (starting at 1, in order of first appearance), and then
 * the most frequent adjacent pair is merged into a new token, at most
 * `maxMerges` times. Training stops early once no eligible pair occurs at
 * least twice.
 *
 * @param {string} text - Raw training text; words are separated by spaces.
 * @param {number} [maxMerges=50] - Upper bound on the number of merge rounds.
 * @returns {{corpus: string, tokens: string[], rules: string[], vocab: Object<string, number>}}
 *   `corpus` is the text with spaces replaced, `tokens` the final token
 *   sequence, `rules` the merges in the order learned (formatted
 *   "left,right"), and `vocab` maps each token to its id.
 */
function trainTokenizer(text, maxMerges = 50) {
  const corpus = text.replace(/ /g, WORD_BOUNDARY);
  // Spreading iterates by code point, so characters outside the BMP (emoji,
  // for instance) stay whole instead of being cut into surrogate halves.
  let tokens = [...corpus];
  const ids = new Map();
  const rules = [];
  let nextId = 1;

  for (const ch of tokens) {
    if (!ids.has(ch)) ids.set(ch, nextId++);
  }

  for (let round = 0; round < maxMerges; round++) {
    const best = findMostFrequentPair(tokens);
    if (best === null || best.count < 2) break; // no meaningful merge left
    const { left, right } = best;
    const merged = left + right;
    // Defensive: never hand a second id to a string that already has one.
    if (!ids.has(merged)) ids.set(merged, nextId++);
    rules.push(`${left},${right}`);
    tokens = mergePair(tokens, left, right, merged);
  }

  return { corpus, tokens, rules, vocab: Object.fromEntries(ids) };
}

// Despite its name, this caps the number of merge rounds, not the vocabulary size.
const maxTokens = 50;
const { corpus, tokens: corpArr, rules, vocab } = trainTokenizer(
  "the cat full overran the ran dog running do the needful",
  maxTokens,
);
console.log(corpus);
console.log(rules);
console.log(vocab);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment