0 · October 8, 2017 08:38
diff --git a/misspell.py b/misspell.py
 #!/usr/bin/env python3

 """
 A wrapper around difflib.get_close_matches to make it easy to find misspelled
 words in text that contains many non-words.

 For regular text, where every letter sequence must be a dictionary word,
 checking spelling is straightforward. Technical text, such as documentation or
 code comments, may contain many valid non-words (e.g. function names) and
 deliberate nonsense example strings. As a consequence, there tend to be many
 false positives. To reduce the amount of noise, we ignore any unrecognized
 words that aren't close to legitimate words, since those are unlikely to be
 misspellings.
 """

 from difflib import get_close_matches
 from math import ceil
 from re import split
 from sys import argv, exit, stderr


 ## Parse arguments.
 # Accepted forms: `-d <dict>` (repeatable), `--` to stop option parsing, and
 # anything else is taken as a text file path. The results are collected in
 # `dicts` and `texts`.
 dicts = []
 texts = []
 dict_next = False     # True when the previous argument was `-d`.
 done_options = False  # True once `--` has been seen.
 for arg in argv[1:]:
    if dict_next:
        # Whatever follows `-d` is a dict path, even if it starts with a dash.
        dicts.append(arg)
        dict_next = False
    elif not done_options and arg.startswith('-'):
        if arg == '--':
            done_options = True
        elif arg == '-d':
            dict_next = True
        else:
            # Diagnostics go to stderr so that stdout carries only results.
            print('Unrecognized option:', arg, file=stderr)
            exit(1)
    else:
        texts.append(arg)

 if dict_next:
    # A trailing `-d` has no path left to consume; report it rather than
    # silently dropping the option.
    print('Missing argument for option: -d', file=stderr)
    exit(1)

 if not dicts or not texts:
    print(f'usage: {argv[0]} -d <dict> [-d <dict> ...] [--] <text> [<text> ...]',
          file=stderr)
    print(file=stderr)
    print('Check the words in the text files against the words in the dict files.',
          file=stderr)
    exit(1)

 print('dict:', ', '.join(dicts), file=stderr)
 print('text:', ', '.join(texts), file=stderr)


 ## Extract words.
 # Both sets hold lowercased words, so the later membership test is
 # case-insensitive.
 dict_words = set()
 for path in dicts:
    with open(path) as f:
        # Each word is on its own line. Iterate the file lazily and skip blank
        # lines so the empty string never becomes a dictionary "word".
        dict_words.update(w for w in (x.rstrip().lower() for x in f) if w)

 text_words = set()
 for path in texts:
    with open(path) as f:
        for line in f:
            # Every run of ASCII letters is a word. Splitting on whole runs of
            # non-letters (the `+`) still yields '' at the edges of a line,
            # so those empty pieces are filtered out as well.
            text_words.update(
                w for w in split(r'[^A-Za-z]+', line.lower()) if w)


 ## Scan words.
 # Suggestions (`<word> => <closest dict word>`) go to stdout; progress goes to
 # stderr. The words are visited in sorted order so the report is reproducible:
 # iterating the set directly depends on string hashing, which by default
 # differs from one interpreter run to the next.
 total = len(text_words)
 for i, text_word in enumerate(sorted(text_words)):
    if i % 100 == 0:
        print(f'{i}/{total} ({ceil(100*i/total)}%)', file=stderr)

    # Ignore words we recognize.
    if text_word in dict_words:
        continue

    # A higher cutoff leads to faster matches and fewer results.
    match = get_close_matches(text_word, dict_words, n=1, cutoff=0.8)

    # Ignore words that aren't close to anything; they are more likely
    # identifiers or deliberate nonsense than misspellings.
    if not match:
        continue

    print(f'{text_word} => {match[0]}')

 print(f'{total}/{total} (100%)', file=stderr)
	#!/usr/bin/env python3

	"""
	A wrapper around difflib.get_close_matches to make it easy to find misspelled
	words in text that contains many non-words.

	For regular text, where every letter sequence must be a dictionary word,
	checking spelling is straightforward. Technical text, such as documentation or
	code comments, may contain many valid non-words (e.g. function names) and
	deliberate nonsense example strings. As a consequence, there tend to be many
	false positives. To reduce the amount of noise, we ignore any unrecognized
	words that aren't close to legitimate words, since those are unlikely to be
	misspellings.
	"""

	from difflib import get_close_matches
	from math import ceil
	from re import split
	from sys import argv, exit, stderr


	## Parse arguments.
	# Accepted forms: `-d <dict>` (repeatable), `--` to stop option parsing, and
	# anything else is taken as a text file path. The results are collected in
	# `dicts` and `texts`.
	dicts = []
	texts = []
	dict_next = False     # True when the previous argument was `-d`.
	done_options = False  # True once `--` has been seen.
	for arg in argv[1:]:
	    if dict_next:
	        # Whatever follows `-d` is a dict path, even if it starts with a dash.
	        dicts.append(arg)
	        dict_next = False
	    elif not done_options and arg.startswith('-'):
	        if arg == '--':
	            done_options = True
	        elif arg == '-d':
	            dict_next = True
	        else:
	            # Diagnostics go to stderr so that stdout carries only results.
	            print('Unrecognized option:', arg, file=stderr)
	            exit(1)
	    else:
	        texts.append(arg)

	if dict_next:
	    # A trailing `-d` has no path left to consume; report it rather than
	    # silently dropping the option.
	    print('Missing argument for option: -d', file=stderr)
	    exit(1)

	if not dicts or not texts:
	    print(f'usage: {argv[0]} -d <dict> [-d <dict> ...] [--] <text> [<text> ...]',
	          file=stderr)
	    print(file=stderr)
	    print('Check the words in the text files against the words in the dict files.',
	          file=stderr)
	    exit(1)

	print('dict:', ', '.join(dicts), file=stderr)
	print('text:', ', '.join(texts), file=stderr)


	## Extract words.
	# Both sets hold lowercased words, so the later membership test is
	# case-insensitive.
	dict_words = set()
	for path in dicts:
	    with open(path) as f:
	        # Each word is on its own line. Iterate the file lazily and skip blank
	        # lines so the empty string never becomes a dictionary "word".
	        dict_words.update(w for w in (x.rstrip().lower() for x in f) if w)

	text_words = set()
	for path in texts:
	    with open(path) as f:
	        for line in f:
	            # Every run of ASCII letters is a word. Splitting on whole runs of
	            # non-letters (the `+`) still yields '' at the edges of a line,
	            # so those empty pieces are filtered out as well.
	            text_words.update(
	                w for w in split(r'[^A-Za-z]+', line.lower()) if w)


	## Scan words.
	# Suggestions (`<word> => <closest dict word>`) go to stdout; progress goes to
	# stderr. The words are visited in sorted order so the report is reproducible:
	# iterating the set directly depends on string hashing, which by default
	# differs from one interpreter run to the next.
	total = len(text_words)
	for i, text_word in enumerate(sorted(text_words)):
	    if i % 100 == 0:
	        print(f'{i}/{total} ({ceil(100*i/total)}%)', file=stderr)

	    # Ignore words we recognize.
	    if text_word in dict_words:
	        continue

	    # A higher cutoff leads to faster matches and fewer results.
	    match = get_close_matches(text_word, dict_words, n=1, cutoff=0.8)

	    # Ignore words that aren't close to anything; they are more likely
	    # identifiers or deliberate nonsense than misspellings.
	    if not match:
	        continue

	    print(f'{text_word} => {match[0]}')

	print(f'{total}/{total} (100%)', file=stderr)