Skip to content

Instantly share code, notes, and snippets.

@bryanjos
Created April 12, 2019 18:59
Bob's Burgers 2 Vec
import bs4 as bs
import urllib.request
import re
import os
import nltk
from gensim.models import Word2Vec
import logging
import pprint
# Pretty-printer used at the bottom of the script to display similarity results.
pp = pprint.PrettyPrinter(indent=4)
# gensim reports Word2Vec training progress through the logging module;
# INFO level makes that progress visible on stderr.
logging.basicConfig(
format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
# One-time downloads of the NLTK tokenizer models and stop-word lists
# used by process_data(); no-ops if already present locally.
nltk.download('punkt')
nltk.download('stopwords')
# Fandom wiki page listing every "Burger of the Day" pun; scraped by gather_data().
burger_url = "https://bobs-burgers.fandom.com/wiki/Burger_of_the_Day"
def gather_data():
    """Download the Burger of the Day wiki page and return its list-item text.

    Returns:
        str: The concatenated text of every ``<li>`` element on the page
        (the burger names live in list items on this page).
    """
    # Context manager closes the HTTP connection even on error;
    # the original leaked the urlopen handle.
    with urllib.request.urlopen(burger_url) as response:
        article = response.read()
    parsed_article = bs.BeautifulSoup(article, 'lxml')
    list_items = parsed_article.find_all('li')
    # "".join avoids the quadratic += string concatenation of the original loop.
    return "".join(item.text for item in list_items)
def process_data(data):
    """Lower-case, strip non-letters, tokenize, and drop English stop words.

    Args:
        data (str): Raw article text from gather_data().

    Returns:
        list[list[str]]: One list of word tokens per detected sentence.
    """
    from nltk.corpus import stopwords

    processed_data = data.lower()
    # Keep letters only; every other character becomes a space.
    processed_data = re.sub('[^a-zA-Z]', ' ', processed_data)
    # Collapse runs of whitespace into a single space.
    processed_data = re.sub(r'\s+', ' ', processed_data)

    # NOTE(review): punctuation was removed above, so sent_tokenize will
    # usually see the whole text as one "sentence" — confirm this is intended.
    all_sentences = nltk.sent_tokenize(processed_data)
    all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

    # Build the stop-word set once; the original re-read the stop-word
    # corpus from disk for every sentence, and list membership is O(n).
    stop_words = set(stopwords.words('english'))
    return [[w for w in words if w not in stop_words] for words in all_words]
def generate_model(data):
    """Train a Word2Vec model on tokenized sentences and save its word vectors.

    Args:
        data (list[list[str]]): Tokenized sentences from process_data().

    Returns:
        Word2Vec: The trained gensim model.
    """
    model = Word2Vec(data, size=150,
                     window=10,
                     min_count=2,
                     workers=10)
    # NOTE(review): passing `data` to the constructor already trains the
    # model, so this runs 10 additional epochs over the same corpus.
    # Kept to preserve behavior — confirm the double pass is intentional.
    model.train(data, total_examples=len(data), epochs=10)
    abspath = os.path.dirname(os.path.abspath(__file__))
    # Create the output directory first: wv.save raises FileNotFoundError
    # when ./vectors does not exist (the original crashed here on a fresh run).
    vector_dir = os.path.join(abspath, "vectors")
    os.makedirs(vector_dir, exist_ok=True)
    model.wv.save(os.path.join(vector_dir, "burgers"))
    return model
def find_word(word2vec, word):
    """Return the vocabulary entries most similar to *word* in the given model."""
    vectors = word2vec.wv
    return vectors.most_similar(positive=word)
if __name__ == '__main__':
    # Pipeline: scrape the wiki page, clean and tokenize it, train the
    # embeddings, then sanity-check with a similarity query.
    raw_text = gather_data()
    tokenized = process_data(raw_text)
    burger_model = generate_model(tokenized)
    pp.pprint(find_word(burger_model, 'garlic'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment