thewh1teagle · October 21, 2025 02:25
diff --git a/main.py b/main.py
 """
 uv pip install transformers tqdm
 """
 from transformers import AutoTokenizer, AutoModel
 from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast
 from tqdm import tqdm

 # Text file read line by line, and the file the model output is written to.
 in_path = "input.txt"
 out_path = "output.txt"

 # Number of non-empty lines collected before each model.predict() call.
 batch_size = 100

 # Code point U+05AF; passed to model.predict() as mark_matres_lectionis.
 NIKUD_HASER = "\u05af"

 # Model and tokenizer are fetched from the same Hugging Face repository.
 _MODEL_REPO = "thewh1teagle/phonikud"

 # trust_remote_code=True lets the repository's own model code be executed.
 model = AutoModel.from_pretrained(_MODEL_REPO, trust_remote_code=True)
 tokenizer: BertTokenizerFast = AutoTokenizer.from_pretrained(_MODEL_REPO)

 # Move the model to the CUDA device and switch it to evaluation mode.
 model.to("cuda")
 model.eval()

 def batch_vocalize(texts: list[str]) -> list[str]:
    """Run the module-level model over one batch of input texts.

    Args:
        texts: The lines to process in a single ``model.predict`` call.

    Returns:
        Whatever ``model.predict`` returns; the calling script treats it as
        one output string per input text and joins the items with newlines.
    """
    predictions = model.predict(
        texts,
        tokenizer,
        mark_matres_lectionis=NIKUD_HASER,
    )
    return predictions

 def _flush_batch(batch: list[str], out_file, pbar) -> None:
    """Vocalize *batch*, append the results to *out_file*, then empty it.

    Writes one prediction per line, advances *pbar* by the number of lines
    in the batch, and clears *batch* in place so the caller can reuse it.
    """
    preds = batch_vocalize(batch)
    out_file.write("\n".join(preds) + "\n")
    pbar.update(len(batch))
    batch.clear()


 # Count every line (blank ones included) so tqdm knows the total up front.
 # The encoding is explicit because the platform default is not always UTF-8.
 with open(in_path, "r", encoding="utf-8") as f:
    total = sum(1 for _ in f)

 with open(in_path, "r", encoding="utf-8") as f, \
        open(out_path, "w", encoding="utf-8") as f2, \
        tqdm(total=total) as pbar:
    batch = []
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines are skipped (nothing is written for them), but they
            # were counted in `total`, so the bar still has to advance.
            pbar.update(1)
            continue
        batch.append(line)
        if len(batch) >= batch_size:
            _flush_batch(batch, f2, pbar)
    # Flush whatever is left when the line count is not a multiple of
    # batch_size.
    if batch:
        _flush_batch(batch, f2, pbar)
	"""
	uv pip install transformers tqdm
	"""
	from transformers import AutoTokenizer, AutoModel
	from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast
	from tqdm import tqdm

	# Text file read line by line, and the file the model output is written to.
	in_path = "input.txt"
	out_path = "output.txt"

	# Number of non-empty lines collected before each model.predict() call.
	batch_size = 100

	# Code point U+05AF; passed to model.predict() as mark_matres_lectionis.
	NIKUD_HASER = "\u05af"

	# Model and tokenizer are fetched from the same Hugging Face repository.
	_MODEL_REPO = "thewh1teagle/phonikud"

	# trust_remote_code=True lets the repository's own model code be executed.
	model = AutoModel.from_pretrained(_MODEL_REPO, trust_remote_code=True)
	tokenizer: BertTokenizerFast = AutoTokenizer.from_pretrained(_MODEL_REPO)

	# Move the model to the CUDA device and switch it to evaluation mode.
	model.to("cuda")
	model.eval()

	def batch_vocalize(texts: list[str]) -> list[str]:
	    """Run the module-level model over one batch of input texts.

	    Args:
	        texts: The lines to process in a single ``model.predict`` call.

	    Returns:
	        Whatever ``model.predict`` returns; the calling script treats it as
	        one output string per input text and joins the items with newlines.
	    """
	    return model.predict(texts, tokenizer, mark_matres_lectionis=NIKUD_HASER)

	# Count every line (blank ones included) so tqdm knows the total up front.
	# The encoding is explicit because the platform default is not always UTF-8.
	with open(in_path, "r", encoding="utf-8") as f:
	    total = sum(1 for _ in f)

	with open(in_path, "r", encoding="utf-8") as f, \
	        open(out_path, "w", encoding="utf-8") as f2, \
	        tqdm(total=total) as pbar:
	    batch = []
	    for line in f:
	        line = line.strip()
	        if not line:
	            # Blank lines are skipped (nothing is written for them), but
	            # they were counted in `total`, so the bar still has to advance.
	            pbar.update(1)
	            continue
	        batch.append(line)
	        if len(batch) >= batch_size:
	            preds = batch_vocalize(batch)
	            f2.write("\n".join(preds) + "\n")
	            pbar.update(len(batch))
	            batch.clear()
	    # Flush whatever is left when the line count is not a multiple of
	    # batch_size.
	    if batch:
	        preds = batch_vocalize(batch)
	        f2.write("\n".join(preds) + "\n")
	        pbar.update(len(batch))
No results found