Created
December 12, 2021 15:36
-
-
Save makmac213/43a05c3c5b88c9ef55e467705908a410 to your computer and use it in GitHub Desktop.
Auto-suggest next word
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from nltk import ( | |
FreqDist, | |
ngrams, | |
word_tokenize | |
) | |
def tokenize_phrases(phrases):
    """Tokenize each phrase into a list of lowercase word tokens.

    Every run of characters other than ASCII letters, spaces and newlines is
    removed from a phrase before it is handed to ``word_tokenize``.

    Args:
        phrases: Iterable of phrase strings.

    Returns:
        A list with one entry per input phrase; each entry is the list of
        lowercased tokens produced for that phrase.
    """
    # Compiled once so the pattern lookup is not repeated per phrase.
    non_letters = re.compile(r'[^a-zA-Z \n]+')
    ret = []
    for phrase in phrases:
        # Tokenize the *cleaned* text. Previously the cleaned string was
        # computed and then discarded, so punctuation and digits still
        # reached the tokenizer.
        words = non_letters.sub('', phrase)
        ret.append([word.lower() for word in word_tokenize(words)])
    return ret
def get_bigram_dict(phrases):
    """Build a lookup from a word to the words observed right after it.

    Args:
        phrases: Iterable of token sequences, one sequence per phrase.

    Returns:
        A dict keyed by the first word of each distinct bigram; each value
        is the list of second words paired with that key, in the order the
        frequency distribution yields the bigrams.
    """
    counts = FreqDist()
    for tokens in phrases:
        for pair in ngrams(tokens, 2):
            counts[pair] += 1

    followers = {}
    # Iterating the distribution yields its keys, i.e. each distinct bigram;
    # the counts themselves are not used when building the lookup.
    for first, second in list(counts):
        followers.setdefault(first, []).append(second)
    return followers
def suggest_next_word(curr_word, bigram_dict):
    """Return the distinct words recorded as following ``curr_word``.

    Args:
        curr_word: The word to look up.
        bigram_dict: Mapping from a word to a list of words that follow it.

    Returns:
        A list of the unique follower words, in the order they first appear
        in the mapping's list. An empty list is returned when ``curr_word``
        is not a key, so the result is always a list.
    """
    try:
        followers = bigram_dict[curr_word]
    except KeyError:
        # Same type as the hit path (was an empty string before).
        return []
    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the result deterministic; a set gives an arbitrary order for strings.
    return list(dict.fromkeys(followers))
# Sample phrases used to build the demo bigram lookup.
phrases = [
    "thank you very much",
    "goodbye and thank you",
    "best wishes",
    "good to see you",
    "nice to see you",
    "looking forward to work with you",
]

# Tokenize the samples, then derive the word -> followers mapping from them.
tokens = tokenize_phrases(phrases)
bigram_dict = get_bigram_dict(tokens)

# Query a few words. The return values are neither stored nor printed here,
# so they are only visible when run in an interactive session.
for _word in ("you", "to", "looking"):
    suggest_next_word(_word, bigram_dict)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment