thewh1teagle · October 19, 2025 00:54
diff --git a/gistfile1.py b/gistfile1.py
 import pandas as pd
 import re


 # Characters that count as part of a "word" on each side. Anything that is
 # not in a class (spaces, punctuation, ...) acts as a word boundary.
 HEBREW_PHONEMES = r'ˈaeioubvdhzχtjklmnsfpʃwʔɡʁʒ'
 HEBREW_LETTERS = r"אבגדהוזחטיכךלמםנןסעפףצץקרשת"

 HEBREW_WORD_PATTERN = rf'[{HEBREW_LETTERS}]+'
 HEBREW_PHONEME_WORD_PATTERN = rf'[{HEBREW_PHONEMES}]+'

 # Compiled once so the per-row loop reuses the same pattern objects.
 _hebrew_word_re = re.compile(HEBREW_WORD_PATTERN)
 _phoneme_word_re = re.compile(HEBREW_PHONEME_WORD_PATTERN)

 # Both inputs are header-less, pipe-separated files read as (id, text).
 df_phonemes = pd.read_csv('metadata.csv', header=None, index_col=False, sep='|', names=['id', 'text'])
 df_hebrew = pd.read_csv('metadata_text.csv', header=None, index_col=False, sep='|', names=['id', 'text'])


 rows = []

 # NOTE: rows are paired by position, not by id, and zip() stops at the end of
 # the shorter file.
 for row_ph, row_he in zip(df_phonemes.itertuples(index=False), df_hebrew.itertuples(index=False)):
     heb_count = len(_hebrew_word_re.findall(row_he.text))
     pho_count = len(_phoneme_word_re.findall(row_ph.text))

     # Keep only the rows whose Hebrew and phoneme word counts agree.
     if heb_count == pho_count:
         rows.append({
             'id': row_ph.id,
             'text': row_he.text,
             'phonemes': row_ph.text,
         })

 # Only the text and phonemes columns are written; the id is left out.
 df_result = pd.DataFrame(rows, columns=['text', 'phonemes'])
 df_result.to_csv('result.csv', sep='\t', index=False, header=False)

 # The saved rows are the ones whose counts match, so report them as such.
 print(f"Saved {len(df_result)} matching rows to result.csv")
	import pandas as pd
	import re


	# Characters that count as part of a "word" on each side. Anything that is
	# not in a class (spaces, punctuation, ...) acts as a word boundary.
	HEBREW_PHONEMES = r'ˈaeioubvdhzχtjklmnsfpʃwʔɡʁʒ'
	HEBREW_LETTERS = r"אבגדהוזחטיכךלמםנןסעפףצץקרשת"

	HEBREW_WORD_PATTERN = rf'[{HEBREW_LETTERS}]+'
	HEBREW_PHONEME_WORD_PATTERN = rf'[{HEBREW_PHONEMES}]+'

	# Compiled once so the per-row loop reuses the same pattern objects.
	_hebrew_word_re = re.compile(HEBREW_WORD_PATTERN)
	_phoneme_word_re = re.compile(HEBREW_PHONEME_WORD_PATTERN)

	# Both inputs are header-less files read as (id, text), split on a literal
	# pipe. The separator is a raw string because '\|' in a normal literal is an
	# invalid escape sequence. A multi-character separator is treated by pandas
	# as a regular expression, which only the python engine supports, so the
	# engine is named explicitly instead of relying on the fallback warning.
	df_phonemes = pd.read_csv('metadata.csv', header=None, index_col=False, sep=r'\|', engine='python', names=['id', 'text'])
	df_hebrew = pd.read_csv('metadata_text.csv', header=None, index_col=False, sep=r'\|', engine='python', names=['id', 'text'])


	rows = []

	# NOTE: rows are paired by position, not by id, and zip() stops at the end of
	# the shorter file.
	for row_ph, row_he in zip(df_phonemes.itertuples(index=False), df_hebrew.itertuples(index=False)):
	    heb_count = len(_hebrew_word_re.findall(row_he.text))
	    pho_count = len(_phoneme_word_re.findall(row_ph.text))

	    # Keep only the rows whose Hebrew and phoneme word counts agree.
	    if heb_count == pho_count:
	        rows.append({
	            'id': row_ph.id,
	            'text': row_he.text,
	            'phonemes': row_ph.text,
	        })

	# Only the text and phonemes columns are written; the id is left out.
	df_result = pd.DataFrame(rows, columns=['text', 'phonemes'])
	df_result.to_csv('result.csv', sep='\t', index=False, header=False)

	# The saved rows are the ones whose counts match, so report them as such.
	print(f"Saved {len(df_result)} matching rows to result.csv")
No results found