@thewh1teagle
Created October 21, 2025 02:25
Process text with Phonikud
"""
uv pip install transformers tqdm
"""
from transformers import AutoTokenizer, AutoModel
from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast
from tqdm import tqdm
in_path = 'input.txt'
out_path = 'output.txt'
NIKUD_HASER = "\u05af"
model = AutoModel.from_pretrained("thewh1teagle/phonikud", trust_remote_code=True)
tokenizer: BertTokenizerFast = AutoTokenizer.from_pretrained("thewh1teagle/phonikud")
model.to("cuda")
model.eval()
batch_size = 100
def batch_vocalize(texts: list[str]) -> list[str]:
return model.predict(texts, tokenizer, mark_matres_lectionis=NIKUD_HASER)
# Count total lines for tqdm
with open(in_path, "r") as f:
total = sum(1 for _ in f)
with open(in_path, "r") as f, open(out_path, "w") as f2, tqdm(total=total) as pbar:
batch = []
for line in f:
line = line.strip()
if not line:
continue
batch.append(line)
if len(batch) == batch_size:
preds = batch_vocalize(batch)
f2.write("\n".join(preds) + "\n")
pbar.update(len(batch))
batch.clear()
if batch:
preds = batch_vocalize(batch)
f2.write("\n".join(preds) + "\n")
pbar.update(len(batch))
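

# Optional sanity check (a minimal sketch): vocalize one sample line to confirm
# the model loads and predict() behaves as expected before processing a large
# file. The sample sentence is illustrative only; replace it with your own text.
# Writing each batch to disk as it finishes means partial output survives an
# interruption, at the cost of one write per batch.
sample = ["שלום עולם"]
print(batch_vocalize(sample)[0])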