TF-IDF
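A small end-to-end TF-IDF demo: it builds an IDF table from the Wikipedia articles of a set of animated films, then prints each analyzed page's most frequent words alongside its highest-scoring TF-IDF words.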
# Install dependencies first:
#   pip install spacy wikipedia
#   python -m spacy download en_core_web_sm
# Run:
#   python tfidf.py
from collections import Counter
import math
import wikipedia
import spacy

# spaCy 3 removed the bare 'en' shortcut link; load the small English
# pipeline by its full name instead.
nlp = spacy.load('en_core_web_sm')
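
# The corpus: Wikipedia articles about animated films, each treated as
# one document when computing document frequencies.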
pages = [
    "The Tigger Movie 2000", "Dinosaur 2000 Movie", "The Emperor's New Groove 2000",
    "Recess School's Out", "Atlantis: The Lost Empire", "Monsters, Inc",
    "Return to Never Land", "Lilo & Stitch", "Treasure Planet 2002",
    "The Jungle Book 2 2003", "Piglet's Big Movie 2003", "Finding Nemo 2003",
    "Brother Bear 2003", "Teacher's Pet 2004", "Home on the Range 2004",
    "The Incredibles 2004", "Pooh's Heffalump Movie 2005", "Valiant 2005",
    "Chicken Little 2005", "The Wild 2006", "Cars 2006",
    "The Nightmare Before Christmas", "Meet the Robinsons 2007", "Ratatouille 2007",
    "WALL-E 2008", "Roadside Romeo 2008", "Bolt 2008", "Up 2009",
    "A Christmas Carol 2009", "The Princess and the Frog 2009", "Toy Story",
    "Toy Story 2", "Toy Story 3", "Tangled", "Mars Needs Moms", "Cars 2",
    "Winnie the Pooh 2011", "Arjun: The Warrior Prince", "Brave 2012",
    "Frankenweenie", "Wreck-It Ralph", "Monsters University", "Planes film",
    "Frozen 2013", "Planes: Fire & Rescue", "Big Hero 6", "Inside Out",
    "The Good Dinosaur", "Zootopia", "Finding Dory", "Moana 2016", "Cars 3",
    "Coco 2017 film", "Incredibles 2",
    "Shrek", "Shrek 2", "Shrek 3", "Antz", "A bugs life", "Bee movie",
    "Madagascar 2005 film", "Madagascar 2", "Kung fu panda", "Kung fu panda 2",
    "Kung fu panda 3",
]
def valid_token(tk):
    # Keep only alphabetic tokens that are not stop words.
    return tk.is_alpha and not tk.is_stop


def get_lemma(tk):
    # spaCy 2 lemmatized every pronoun to the placeholder '-PRON-';
    # use the lowercased surface form for pronouns instead.
    if tk.pos_ == 'PRON' or tk.lemma_ == '-PRON-':
        return tk.text.lower()
    return tk.lemma_.lower()
def read_wikipedia_page(page_name):
    # Fetch the plain-text content of a Wikipedia article by title.
    page = wikipedia.page(page_name)
    return page.content


def tokenize_page(page_name):
    # Turn an article into a list of lowercased lemmas, dropping
    # stop words and non-alphabetic tokens.
    text = read_wikipedia_page(page_name)
    return [
        get_lemma(t)
        for t in nlp(text)
        if valid_token(t)
    ]
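
# First pass over the corpus: build the vocabulary and, for each word,
# count how many pages it appears in (its document frequency).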
vocabulary = set()
idf_counter = Counter()
for page in pages:
    print("Processing page {}...".format(page))
    # Use a set so each word counts at most once per page.
    page_words = set(tokenize_page(page))
    vocabulary = vocabulary | page_words
    idf_counter.update(page_words)
print("All pages processed")
idf = {
    word: math.log(len(pages) / df, 2)
    for word, df in idf_counter.items()
}
print("vocabulary size: {}".format(len(vocabulary)))
def analyze_page(target_page):
    target_words = tokenize_page(target_page)
    tf_counts = Counter(target_words)
    # idf.get guards against words never seen in the corpus pass (possible
    # when the analyzed page was not one of the indexed pages); unseen
    # words fall back to the maximum idf, log2(N / 1).
    tfidf = {
        word: (1 + math.log(_tf, 2)) * idf.get(word, math.log(len(pages), 2))
        for word, _tf in tf_counts.items()
    }
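    # Worked example with made-up numbers: a word with tf = 8 on the target
    # page and df = 4 in a 64-page corpus scores
    # (1 + log2(8)) * log2(64 / 4) = (1 + 3) * 4 = 16.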
    num_words = 20
    most_frequent = [
        w for (w, _) in tf_counts.most_common(num_words)
    ]
    sorted_tfidf = [
        w for (w, _) in sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)
    ]
    print(target_page)
    print("Most frequent: {}".format(most_frequent))
    print("Highest TF-IDF: {}".format(sorted_tfidf[:num_words]))


analyze_page("Moana")
analyze_page("The Incredibles 2004")
analyze_page("Monsters, Inc")