Created
April 12, 2019 18:59
Bob's Burgers Word2Vec — trains word vectors on the "Burger of the Day" wiki page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 as bs | |
import urllib.request | |
import re | |
import os | |
import nltk | |
from gensim.models import Word2Vec | |
import logging | |
import pprint | |
# Pretty-printer for the similarity results emitted at the bottom of the script.
pp = pprint.PrettyPrinter(indent=4)
# INFO-level logging so gensim's training progress is visible on the console.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)
# Fetch the tokenizer models and stop-word corpus used by process_data().
nltk.download('punkt')
nltk.download('stopwords')
# Wiki page listing every "Burger of the Day" pun; scraped by gather_data().
burger_url = "https://bobs-burgers.fandom.com/wiki/Burger_of_the_Day"
def gather_data():
    """Fetch the Burger of the Day wiki page and return the text of its list items.

    Returns:
        str: concatenated text of every ``<li>`` element on the page
        (the burger names live in list items there).
    """
    # Context manager guarantees the HTTP connection is closed; the original
    # left the response object open.
    with urllib.request.urlopen(burger_url) as response:
        article = response.read()
    parsed_article = bs.BeautifulSoup(article, 'lxml')
    items = parsed_article.find_all('li')
    # str.join instead of repeated += — linear instead of quadratic.
    return "".join(item.text for item in items)
def process_data(data):
    """Normalize scraped text into tokenized, stop-word-free sentences.

    Args:
        data (str): raw text from gather_data().

    Returns:
        list[list[str]]: one token list per sentence, lower-cased, letters
        only, with English stop words removed.
    """
    from nltk.corpus import stopwords

    processed_data = data.lower()
    # Keep letters only, then collapse runs of whitespace to single spaces.
    # NOTE(review): stripping punctuation before sent_tokenize means the
    # whole text usually becomes one "sentence" — preserved as-is.
    processed_data = re.sub('[^a-zA-Z]', ' ', processed_data)
    processed_data = re.sub(r'\s+', ' ', processed_data)

    all_sentences = nltk.sent_tokenize(processed_data)
    all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

    # Build the stop-word set ONCE: the original called stopwords.words()
    # (which re-reads its corpus file) per sentence and membership-tested
    # against a list, making the filter O(sentences * words * stopwords).
    stop_words = set(stopwords.words('english'))
    return [[w for w in sent if w not in stop_words] for sent in all_words]
def generate_model(data):
    """Train a Word2Vec model on tokenized sentences and persist its vectors.

    Args:
        data (list[list[str]]): tokenized sentences from process_data().

    Returns:
        Word2Vec: the trained model; its word vectors are also saved to
        ``vectors/burgers`` next to this file.
    """
    # Passing the corpus to the constructor already runs a full training
    # pass; the original then called model.train() again, re-training with
    # an already-decayed learning rate — a pattern gensim's docs warn
    # against. The redundant second pass is removed.
    model = Word2Vec(data, size=150,
                     window=10,
                     min_count=2,
                     workers=10)
    out_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           "vectors")
    # save() raises if the target directory does not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    model.wv.save(os.path.join(out_dir, "burgers"))
    return model
def find_word(word2vec, word):
    """Return the words most similar to *word* in the trained model.

    Args:
        word2vec: a trained gensim Word2Vec model.
        word (str): query token.

    Returns:
        list of (word, similarity) pairs from ``wv.most_similar``.
    """
    neighbors = word2vec.wv.most_similar(positive=word)
    return neighbors
if __name__ == '__main__':
    # Pipeline: scrape -> clean/tokenize -> train -> query.
    raw_text = gather_data()
    sentences = process_data(raw_text)
    burger_model = generate_model(sentences)
    # Show the model's nearest neighbors for a sample ingredient.
    pp.pprint(find_word(burger_model, 'garlic'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment