Skip to content

Instantly share code, notes, and snippets.

@thewh1teagle
Created October 19, 2025 00:54
Show Gist options
  • Select an option

  • Save thewh1teagle/fff3c5461192b52ffa14a943a52646fe to your computer and use it in GitHub Desktop.

Select an option

Save thewh1teagle/fff3c5461192b52ffa14a943a52646fe to your computer and use it in GitHub Desktop.
phonemes and hebrew
import pandas as pd
import re
# Characters that may appear inside a phonemized Hebrew word (includes the
# stress mark) and inside a written Hebrew word (includes final-form letters).
HEBREW_PHONEMES = r'ˈaeioubvdhzχtjklmnsfpʃwʔɡʁʒ'
HEBREW_LETTERS = r"אבגדהוזחטיכךלמםנןסעפףצץקרשת"
HEBREW_WORD_PATTERN = rf'[{HEBREW_LETTERS}]+'
HEBREW_PHONEME_WORD_PATTERN = rf'[{HEBREW_PHONEMES}]+'

# Compiled once so the per-row loop does not go through the pattern cache.
_HEBREW_WORD_RE = re.compile(HEBREW_WORD_PATTERN)
_PHONEME_WORD_RE = re.compile(HEBREW_PHONEME_WORD_PATTERN)


def load_metadata(path: str) -> pd.DataFrame:
    """Read a headerless ``id|text`` file into a frame with string columns.

    Every field is read as ``str`` and NA detection is disabled, so an empty
    or numeric-looking text field stays a string instead of becoming a float
    NaN or a number that ``re.findall`` would reject with ``TypeError``.
    """
    return pd.read_csv(
        path,
        header=None,
        index_col=False,
        sep='|',
        names=['id', 'text'],
        dtype=str,
        keep_default_na=False,
    )


def align_rows(df_phonemes: pd.DataFrame, df_hebrew: pd.DataFrame) -> pd.DataFrame:
    """Return the row pairs whose Hebrew and phoneme word counts are equal.

    Rows are paired by position, not by ``id``; if one frame is longer the
    surplus rows are ignored by ``zip``.
    NOTE(review): this assumes both files list the same ids in the same
    order -- confirm against how the two metadata files are produced.

    The result has two columns, ``text`` (Hebrew) and ``phonemes``.
    """
    pairs = zip(
        df_phonemes.itertuples(index=False),
        df_hebrew.itertuples(index=False),
    )
    kept = [
        (row_he.text, row_ph.text)
        for row_ph, row_he in pairs
        if len(_HEBREW_WORD_RE.findall(row_he.text))
        == len(_PHONEME_WORD_RE.findall(row_ph.text))
    ]
    return pd.DataFrame(kept, columns=['text', 'phonemes'])


def main(
    phonemes_path: str = 'metadata.csv',
    hebrew_path: str = 'metadata_text.csv',
    output_path: str = 'result.csv',
) -> pd.DataFrame:
    """Filter the paired metadata files and write the kept rows as TSV.

    The output has no header and no index: one ``text<TAB>phonemes`` line per
    kept row. The written frame is also returned.
    """
    df_result = align_rows(load_metadata(phonemes_path), load_metadata(hebrew_path))
    df_result.to_csv(output_path, sep='\t', index=False, header=False)
    # The rows kept are the ones whose word counts match, so report them as such.
    print(f"Saved {len(df_result)} matched rows to {output_path}")
    return df_result


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment