Skip to content

Instantly share code, notes, and snippets.

@devig
Last active May 16, 2025 12:41
Show Gist options
  • Save devig/1fb23e57f69fa91480d0843b770f30c3 to your computer and use it in GitHub Desktop.
Save devig/1fb23e57f69fa91480d0843b770f30c3 to your computer and use it in GitHub Desktop.
Фильтрация фраз по списку минус-слов
import re
import sys
from pathlib import Path
def load_words(words_file):
    """Load the minus-word list from a text file, one entry per line.

    Every line is cleaned up before being stored:

    * surrounding whitespace is removed;
    * leading ``-``, ``"`` and ``!`` marks (and whitespace between them)
      are removed;
    * trailing ``"`` marks are removed, so a quoted entry such as
      ``"word"`` is stored as ``word``;
    * the result is lower-cased.

    Lines that are empty after cleanup are skipped.

    Args:
        words_file: Path to a UTF-8 text file (with or without a BOM).

    Returns:
        A set of lower-cased minus-words.
    """
    words_set = set()
    # 'utf-8-sig' drops a leading byte-order mark if present; with plain
    # 'utf-8' the BOM stays glued to the first entry and hides its prefix
    # marks from the regex below, so that entry would never match.
    with open(words_file, encoding='utf-8-sig') as f:
        for line in f:
            word = line.strip()
            # Remove leading -, ", ! marks and any whitespace mixed in.
            word = re.sub(r'^[-"!\s]+', '', word)
            # Remove the closing quote of a quoted entry; phrase tokens
            # never contain a quote, so a leftover one could never match.
            word = re.sub(r'["\s]+$', '', word)
            if word:
                words_set.add(word.lower())
    return words_set
def tokenize(text):
    """Split *text* into pieces on runs of separator characters.

    Separators are ``@``, ``#``, ``-``, ``,``, ``"``, ``;`` and any
    whitespace. Consecutive separators count as one. The returned list may
    contain empty strings when the text starts or ends with a separator
    (or is empty).
    """
    separators = re.compile(r'[@#\-,\";\s]+')
    pieces = separators.split(text)
    return pieces
def process_phrases(words_lowercase_set, phrases_file):
    """Split a phrase file into "dirty" and "clean" files by minus-words.

    Each line of ``phrases_file`` is tokenized with ``tokenize`` and its
    lower-cased tokens are compared with ``words_lowercase_set``. Lines
    that share at least one token with the set are written to
    ``dirty_<name>``, all other lines to ``clean_<name>``, where ``<name>``
    is the file name of ``phrases_file`` without its directory part. Both
    output paths are relative, so the files are created in the current
    working directory. Lines are copied as read, including their line
    endings.

    A short report (matched words and per-file line counts) is printed to
    standard output.

    Args:
        words_lowercase_set: Set of lower-cased minus-words.
        phrases_file: Path to a UTF-8 text file (with or without a BOM)
            holding one phrase per line.
    """
    original_name = Path(phrases_file).name
    dirty_file = f'dirty_{original_name}'
    clean_file = f'clean_{original_name}'
    dirty_count = 0
    clean_count = 0
    matched_words = set()
    # The input is read as 'utf-8-sig' so a leading byte-order mark is
    # dropped; with plain 'utf-8' it would stick to the first token of the
    # first phrase, which then could not match any minus-word.
    with open(phrases_file, encoding='utf-8-sig') as pf, \
            open(dirty_file, 'w', encoding='utf-8') as df, \
            open(clean_file, 'w', encoding='utf-8') as cf:
        for line in pf:
            # Empty tokens appear when the line starts or ends with a
            # separator; they are dropped before the comparison.
            tokens_lower = {t.lower() for t in tokenize(line.strip()) if t}
            # Intersection with the minus-word set.
            matched = tokens_lower & words_lowercase_set
            if matched:
                matched_words.update(matched)
                df.write(line)
                dirty_count += 1
            else:
                cf.write(line)
                clean_count += 1
    # Print the report.
    print(f"\n🧼 Найденные слова: {', '.join(sorted(matched_words)) or '—'}")
    print(f"📝 Строк записано в dirty файл: {dirty_count}")
    print(f"✅ Строк записано в clean файл: {clean_count}")
    print(f"\nРезультат сохранён в {dirty_file} и {clean_file}")
if __name__ == '__main__':
    # Exactly two positional arguments are expected: the minus-word file
    # followed by the phrases file. Anything else prints usage and exits
    # with status 1.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print('Использование: python filter.py слова.txt фразы.txt')
        sys.exit(1)
    words_file, phrases_file = cli_args
    words = load_words(words_file)
    process_phrases(words, phrases_file)
@devig
Copy link
Author

devig commented May 16, 2025

python3 filter.py minus-kondei.txt kondei.txt
python filter.py minus-kondei.txt kondei.txt

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment