Created
October 8, 2017 08:38
-
-
Save 0/de44ef0f44b30b5b0bde747e3fd30c72 to your computer and use it in GitHub Desktop.
Find misspelled words in text that contains many non-words.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
A wrapper around difflib.get_close_matches to make it easy to find misspelled
words in text that contains many non-words.

For regular text, where every letter sequence must be a dictionary word,
checking spelling is straightforward. Technical text, such as documentation or
code comments, may contain many valid non-words (e.g. function names) and
deliberate nonsense example strings. As a consequence, there tend to be many
false positives. To reduce the amount of noise, we ignore any unrecognized
words that aren't close to legitimate words, since those are unlikely to be
misspellings.

Results ('<word> => <suggestion>', one per line) go to stdout; progress and
diagnostics go to stderr so the results can be piped cleanly.
"""
from difflib import get_close_matches
from math import ceil
from re import split
from sys import argv, exit, stderr

# Similarity threshold passed to get_close_matches. A higher cutoff leads to
# faster matches and fewer results.
CUTOFF = 0.8

# How many words to process between two progress lines.
PROGRESS_STEP = 100


def _parse_args(args):
    """Split command-line arguments into dictionary paths and text paths.

    '-d <path>' adds a dictionary, '--' ends option parsing, and every other
    argument is a text path. The argument following '-d' is always taken as
    a path, even if it starts with '-'.

    Returns a (dicts, texts) tuple of lists.
    Raises ValueError for an unknown option or a trailing '-d' with no path.
    """
    dicts = []
    texts = []
    dict_next = False
    done_options = False
    for arg in args:
        if dict_next:
            dicts.append(arg)
            dict_next = False
        elif not done_options and arg.startswith('-'):
            if arg == '--':
                done_options = True
            elif arg == '-d':
                dict_next = True
            else:
                raise ValueError(f'Unrecognized option: {arg}')
        else:
            texts.append(arg)
    if dict_next:
        # A dangling '-d' would otherwise be dropped without any notice.
        raise ValueError('Option -d requires an argument')
    return dicts, texts


def _read_dict_words(paths):
    """Return the set of lowercased words found in the dictionary files.

    Each word is expected on its own line; blank lines are skipped.
    """
    words = set()
    for path in paths:
        with open(path) as f:
            for line in f:
                word = line.rstrip().lower()
                if word:
                    words.add(word)
    return words


def _read_text_words(paths):
    """Return the set of lowercased words found in the text files.

    Every run of ASCII letters is a word.
    """
    words = set()
    for path in paths:
        with open(path) as f:
            for line in f:
                # split() yields '' between adjacent separators and at the
                # ends of the line; those are not words, so drop them.
                words.update(
                    word for word in split('[^A-Za-z]', line.lower()) if word
                )
    return words


def main(args=None):
    """Run the spell check and return the process exit status.

    args is the list of command-line arguments without the program name; it
    defaults to argv[1:]. Returns 0 on success and 1 on a usage error.
    """
    if args is None:
        args = argv[1:]

    ## Parse arguments.
    try:
        dicts, texts = _parse_args(args)
    except ValueError as err:
        print(err, file=stderr)
        return 1
    if not dicts or not texts:
        print(f'usage: {argv[0]} -d <dict> [-d <dict> ...] [--] <text> [<text> ...]',
              file=stderr)
        print(file=stderr)
        print('Check the words in the text files against the words in the dict files.',
              file=stderr)
        return 1
    print('dict:', ', '.join(dicts), file=stderr)
    print('text:', ', '.join(texts), file=stderr)

    ## Extract words.
    dict_words = _read_dict_words(dicts)
    # Sorted so that the output order is the same on every run; iterating the
    # set directly would follow its arbitrary internal order.
    text_words = sorted(_read_text_words(texts))
    total = len(text_words)

    ## Scan words.
    for i, text_word in enumerate(text_words):
        if i % PROGRESS_STEP == 0:
            print(f'{i}/{total} ({ceil(100*i/total)}%)', file=stderr)
        # Ignore words we recognize.
        if text_word in dict_words:
            continue
        match = get_close_matches(text_word, dict_words, n=1, cutoff=CUTOFF)
        # Ignore words that aren't close to anything.
        if not match:
            continue
        print(f'{text_word} => {match[0]}')
    print(f'{total}/{total} (100%)', file=stderr)
    return 0


if __name__ == '__main__':
    exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment