Created
May 25, 2015 05:55
-
-
Save atomictom/f008d4ad1b4464d47b48 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import os | |
import sys | |
class Node(object): | |
def __init__(self): | |
self.is_word = False | |
self.children = {} | |
class Trie(object): | |
def __init__(self, words): | |
self.root = Node() | |
for word in words: | |
self.add_word(word) | |
def add_word(self, word): | |
cur_node = self.root | |
for c in word: | |
if not c in cur_node.children: | |
new_node = Node() | |
cur_node.children[c] = new_node | |
cur_node = cur_node.children[c] | |
cur_node.is_word = True | |
def is_word(self, word): | |
cur_node = self.root | |
for c in word: | |
if c not in cur_node.children: | |
return False | |
cur_node = cur_node.children[c] | |
return cur_node.is_word | |
# parse_words :: Trie -> String -> [(String, String)] | |
def parse_words(dictionary, sentence): | |
cur_node = dictionary.root | |
cur_word = "" | |
if not sentence: | |
return [("", "")] | |
possible_words = [] | |
for i, c in enumerate(sentence): | |
if c not in cur_node.children: | |
break | |
cur_word += c | |
cur_node = cur_node.children[c] | |
if cur_node.is_word: | |
word = cur_word | |
remaining = sentence[i+1:] | |
possible_words.append((word, remaining)) | |
return possible_words | |
# parse_ngram :: Trie -> Int -> String -> [([String], String)] | |
def parse_ngram(dictionary, n, sentence): | |
if n <= 0: | |
return [([], sentence)] | |
ngrams = [] | |
for word, remaining in parse_words(dictionary, sentence): | |
possible_ngrams = parse_ngram(dictionary, n-1, remaining) | |
new_word = ([word] if word else []) | |
ngrams.extend([(new_word + words, ngram_remaining) for words, ngram_remaining in possible_ngrams]) | |
return ngrams | |
def beam_parse(sentence, dictionary, ngram_freqs, beam_width=5): | |
heuristic = lambda words: score_words(ngram_freqs, words) | |
top_candidates = [([], sentence)] | |
while any(remaining for _, remaining in top_candidates): | |
next_candidates = [] | |
for words, remaining in top_candidates: | |
parses = parse_ngram(dictionary, 5, remaining) | |
next_candidates.extend((words + new_words, new_remaining) for new_words, new_remaining in parses) | |
ranked = [(heuristic(words), words, remaining) for words, remaining in next_candidates] | |
top_ranked = sorted(ranked, reverse=True)[:beam_width] | |
top_candidates = [(words, remaining) for _, words, remaining in top_ranked] | |
print "\n".join(map(str, top_candidates)) | |
return [' '.join(words) for words, remaining in top_candidates] | |
def score_words(ngram_freqs, words): | |
score = 0 | |
for i in range(len(words)): | |
for j in range(i, len(words)): | |
key = tuple(words[i:j+1]) | |
if 5 >= len(key) > 1 and key in ngram_freqs: | |
modifier = reduce(lambda x, y: x * y, [get_word_len_freqs().get(len(word), 0.001) for word in key]) | |
score += ngram_freqs[key] * modifier | |
return score | |
def get_words(): | |
with open("/usr/share/dict/words") as fin: | |
words = [word.strip().lower() for word in fin.readlines()] | |
return words | |
def get_ngrams_freqs_upto(n): | |
ngram_freqs = {} | |
for i in range(1, n+1): | |
with open(str(i) + "gram-freq.txt") as fin: | |
for line in fin.readlines(): | |
fields = line.split() | |
freq = fields[0] | |
words = fields[1:] | |
ngram_freqs[tuple(words)] = int(freq) | |
return ngram_freqs | |
def get_word_len_freqs(): | |
return { | |
1: .027, | |
2: .679, | |
3: 4.730, | |
4: 7.151, | |
5: 10.804, | |
6: 13.674, | |
7: 14.751, | |
8: 13.616, | |
9: 11.356, | |
10: 8.679, | |
11: 5.913, | |
12: 3.792, | |
13: 2.329, | |
14: 1.232, | |
15: 0.685, | |
16: 0.290, | |
17: 0.162, | |
18: 0.066, | |
19: 0.041 | |
} | |
def main(): | |
sys.setrecursionlimit(100000) | |
sentence = sys.argv[1].lower() | |
words = get_words() | |
ngram_freqs = get_ngrams_freqs_upto(5) | |
dictionary = Trie(words) | |
parses = beam_parse(sentence, dictionary, ngram_freqs) | |
for p in parses: | |
print p | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment