import nltk
from nltk.stem.lancaster import LancasterStemmer

# word stemmer
stemmer = LancasterStemmer()
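# note: nltk.word_tokenize (used below) relies on the 'punkt' tokenizer models;
# if they are not installed yet, run nltk.download('punkt') once.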
# 3 classes of training data
training_data = []
training_data.append({"class":"greeting", "sentence":"como vai você?"})
training_data.append({"class":"greeting", "sentence":"como cê ta?"})
training_data.append({"class":"greeting", "sentence":"como vai?"})
training_data.append({"class":"greeting", "sentence":"tudo bem?"})
training_data.append({"class":"greeting", "sentence":"e ai"})
training_data.append({"class":"greeting", "sentence":"Tudo joia?"})
training_data.append({"class":"greeting", "sentence":"tudo beleza?"})
training_data.append({"class":"greeting", "sentence":"tudo blz?"})

training_data.append({"class":"goodbye", "sentence":"flw"})
training_data.append({"class":"goodbye", "sentence":"até mais"})
training_data.append({"class":"goodbye", "sentence":"thcau"})
training_data.append({"class":"goodbye", "sentence":"flw vlw"})
training_data.append({"class":"goodbye", "sentence":"adeus"})
training_data.append({"class":"goodbye", "sentence":"te mais"})

training_data.append({"class":"sandwich", "sentence":"make me a sandwich"})
training_data.append({"class":"sandwich", "sentence":"can you make a sandwich?"})
training_data.append({"class":"sandwich", "sentence":"having a sandwich today?"})
training_data.append({"class":"sandwich", "sentence":"what's for lunch?"})

#print ("%s sentences of training data" % len(training_data))
corpus_words = {}
class_words = {}

classes = list(set([a['class'] for a in training_data]))
for c in classes:
    class_words[c] = []

# loop through each sentence in our training data
for data in training_data:
    # tokenize each sentence into words
    for word in nltk.word_tokenize(data['sentence']):
        # ignore punctuation and possessive tokens
        if word not in ["?", "'s"]:
            # stem and lowercase each word
            stemmed_word = stemmer.stem(word.lower())
            # have we not seen this word already?
            if stemmed_word not in corpus_words:
                corpus_words[stemmed_word] = 1
            else:
                corpus_words[stemmed_word] += 1

            # add the word to our words-in-class list
            class_words[data['class']].extend([stemmed_word])

# print ("Corpus words and counts: %s \n" % corpus_words)
# print ("Class words: %s" % class_words)

print(corpus_words)
# calculate a score for a given class
def calculate_class_score(sentence, class_name, show_details=True):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # weight each matched word by the inverse of its corpus frequency,
            # so rarer (more distinctive) words contribute more to the score
            score += (1 / corpus_words[stemmer.stem(word.lower())])

            if show_details:
                print ("   match: %s (%s)" % (stemmer.stem(word.lower()), 1 / corpus_words[stemmer.stem(word.lower())]))
    return score
def classify(sentence):
    high_class = None
    high_score = 0
    # loop through our classes
    for c in class_words.keys():
        # calculate score of sentence for each class
        score = calculate_class_score(sentence, c)
        # keep track of highest score
        if score > high_score:
            high_class = c
            high_score = score

    return high_class, high_score
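# illustrative usage: the two sentences below are arbitrary examples (not part
# of the training data); classify() returns the best-matching class and its score
print(classify("flw, até mais"))
print(classify("tudo bem com você?"))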