Extracting popular topics from subreddits
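The pieces below form a small pipeline: a tokenizer that lemmatizes comments from a Reddit dump and expands them into n-grams, a splitter that shards the stream into per-day, per-subreddit files, a per-file counter, an aggregator that sums the daily counts, and two notebooks that compare a subreddit's counts against an AskReddit (or r/all) baseline and plot heatmaps. As a rough, hypothetical sketch of the input: only the field names below match what the scripts read, the values are made up.

import json

# Hypothetical example of one comment record from the dump; the scripts below
# read only the 'body', 'subreddit' and 'created_utc' fields and pass the rest through.
sample_line = json.dumps({
    'body': 'What is the most underrated sci-fi show of the decade?',
    'subreddit': 'television',
    'created_utc': 1538352000,
})
print(sample_line)
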
# Tokenize Reddit comments read from stdin (one JSON object per line),
# lemmatize them and attach unigram/bigram/trigram lists, writing the
# enriched records back to stdout.
from datetime import datetime
import json
from functools import lru_cache
from multiprocessing import Pool
import sys

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import WordNetLemmatizer, pos_tag

if __name__ == '__main__':
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
lemmatize = lru_cache(maxsize=1000000)(lemma.lemmatize)

# Map the first two characters of a Penn Treebank tag to a WordNet POS.
morphy_tag = {'NN': 'n', 'JJ': 'a',
              'VB': 'v', 'RB': 'r'}


def to_lemmas(text):
    """Yield lowercased, lemmatized tokens, skipping stop words and single characters."""
    for sentence in sent_tokenize(text):
        for token, tag in pos_tag(word_tokenize(sentence)):
            token = token.lower()
            if token in stop_words:
                continue
            if len(token) < 2:
                continue
            yield lemmatize(token, pos=morphy_tag.get(tag[:2], 'n'))


def to_ngrams(text):
    """Yield unigrams, bigrams and trigrams built from the lemmatized tokens."""
    tokens = [*to_lemmas(text)]
    for token in tokens:
        yield token
    if len(tokens) >= 2:
        for bigram in ngrams(tokens, 2):
            yield ' '.join(bigram)
    if len(tokens) >= 3:
        for trigram in ngrams(tokens, 3):
            yield ' '.join(trigram)


def handle_line(line):
    """Parse a JSON comment, attach its n-grams and serialize it back to JSON."""
    data = json.loads(line)
    data['ngrams'] = [*to_ngrams(data['body'])]
    return json.dumps(data)


if __name__ == '__main__':
    with Pool(processes=12) as pool:
        for handled in pool.imap_unordered(handle_line, sys.stdin):
            print(handled)
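
A minimal sketch of what the unigram/bigram/trigram expansion in `to_ngrams` produces; the token list here is made up and already lemmatized, so no NLTK data downloads are needed:

from nltk.util import ngrams

# Hypothetical, already-lemmatized tokens standing in for the output of to_lemmas.
tokens = ['year', 'old', 'cat']

expanded = list(tokens)
if len(tokens) >= 2:
    expanded += [' '.join(bigram) for bigram in ngrams(tokens, 2)]
if len(tokens) >= 3:
    expanded += [' '.join(trigram) for trigram in ngrams(tokens, 3)]

print(expanded)
# ['year', 'old', 'cat', 'year old', 'old cat', 'year old cat']
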
# Aggregate the per-day counted files of a single subreddit (argv[1]) into
# one JSON array of [ngram, total] pairs printed to stdout.
import json
import re
from glob import glob
from collections import defaultdict
from datetime import datetime
import sys


def read_file(path):
    with open(path) as f:
        return [json.loads(line) for line in f]


subreddit = sys.argv[1]
counter = defaultdict(int)
paths = list(glob(f'counted/*_{subreddit}'))
for n, path in enumerate(paths):
    # Only use daily files whose names match the `YYYY-MM-DD_<subreddit>` pattern.
    if len(path) == len(f'counted/2018-10-10_{subreddit}'):
        for ngram, count in read_file(path):
            counter[ngram] += count
    sys.stderr.write(f'\r{n} of {len(paths)}')
sys.stderr.write('\n')

print(json.dumps(list(counter.items())))
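
The aggregation above assumes per-day files of `["ngram", count]` JSON lines (produced by the counting script below) and prints a single JSON array of totals. A tiny sketch with made-up data:

import json
from collections import defaultdict

# Hypothetical contents of two daily counted files for the same subreddit.
day_files = [
    ['["cat", 3]', '["cute cat", 1]'],
    ['["cat", 2]', '["dog", 4]'],
]

counter = defaultdict(int)
for lines in day_files:
    for line in lines:
        ngram, count = json.loads(line)
        counter[ngram] += count

print(json.dumps(list(counter.items())))
# [["cat", 5], ["cute cat", 1], ["dog", 4]]
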
# Count n-gram occurrences in every file of the input directory (argv[1]),
# writing ["ngram", count] JSON lines to the output directory (argv[2]).
from glob import glob
import json
from multiprocessing import Pool
from collections import Counter, defaultdict
import os
import sys


def count(paths):
    input_path, output_path = paths
    counter = defaultdict(int)
    with open(input_path) as f:
        for line in f:
            ngram = json.loads(line)
            counter[ngram] += 1
    with open(output_path, 'w') as f:
        for ngram, count in counter.items():
            f.write(json.dumps([ngram, count]) + '\n')


if __name__ == '__main__':
    paths = [(input_path, os.path.join(sys.argv[2], os.path.split(input_path)[-1]))
             for input_path in glob(os.path.join(sys.argv[1], '*'))]
    amount = len(paths)
    with Pool(processes=12) as pool:
        for n, _ in enumerate(pool.imap_unordered(count, paths)):
            if n % 100 == 0:
                sys.stdout.write(f'\r{n} of {amount}')
    print('done')
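
The counting step above expects one JSON-encoded n-gram per input line (the format the splitter further down writes) and emits `["ngram", count]` lines; a sketch on made-up lines:

import json
from collections import Counter

# Hypothetical lines of a per-day, per-subreddit file: one JSON string per n-gram occurrence.
lines = ['"cat"', '"cat"', '"cute cat"']

counter = Counter(json.loads(line) for line in lines)
for ngram, count in counter.items():
    print(json.dumps([ngram, count]))
# ["cat", 2]
# ["cute cat", 1]
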
#!/usr/bin/env python
# coding: utf-8

# Very ugly, it didn't want to upload `.ipynb`
# Compares per-day n-gram counts of a subreddit against the aggregated
# AskReddit baseline and plots weekly and weekday heatmaps.

# In[2]:
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')
get_ipython().run_line_magic('matplotlib', 'inline')

# In[3]:
import json
import re
from glob import glob
from datetime import datetime

import pandas as pd
from pandas.api.types import CategoricalDtype
import seaborn as sns
from matplotlib import pyplot as plt

# In[4]:
plt.rc('figure', facecolor='w')

# In[76]:
# Normalize counts so that subreddits of different sizes are comparable.
norm = lambda x: (x - x.mean()) / (x.max() - x.min())


def diff_n(n, base, path, ngram_size=0, min_amount=0, exclude=None):
    """Return the top-n n-grams of a counted per-day file, ranked by how much
    more frequent they are than in the base (AskReddit) counts."""
    df = (pd.read_json(path, lines=True)
          .rename(columns={0: 'ngram', 1: 'amount'}))
    if exclude is not None:
        df = df[~df.ngram.isin(exclude)]
    if ngram_size > 0:
        df = df[df.ngram.str.count(' ') >= ngram_size]
    if min_amount:
        df = df[df.amount > min_amount]
    df['amount_norm'] = norm(df.amount)
    df = (df.merge(base, how='left', on='ngram', suffixes=('_left', '_right'))
          .fillna(0))
    df['diff'] = df.amount_norm_left - df.amount_norm_right
    return (df.sort_values('diff', ascending=False)
            .head(n))


def diff_n_by_day(n, base, subreddit, start, end, ngram_size=0, min_amount=0, exclude=None):
    """Run diff_n for every day between start and end and concatenate the results."""
    by_day_df = None
    for ts in pd.date_range(start, end).tolist():
        current_df = diff_n(n, base, f"counted/{ts.strftime('%Y-%m-%d')}_{subreddit}",
                            ngram_size, min_amount, exclude)
        current_df['date'] = ts
        if by_day_df is None:
            by_day_df = current_df
        else:
            by_day_df = by_day_df.append(current_df)
    return by_day_df

# In[87]:
def weekly_heatmap(by_day_df, title):
    """Heatmap of the ten most over-represented n-grams, aggregated by week."""
    top_topics = (by_day_df.groupby('ngram')
                  .sum()['diff']
                  .reset_index()
                  .sort_values('diff', ascending=False)
                  .head(10)
                  .ngram)
    only_top_df = (by_day_df[['date', 'ngram', 'diff']]
                   [by_day_df.ngram.isin(top_topics)]
                   .groupby([pd.Grouper(key='date', freq='W-MON'), 'ngram'])
                   .mean()
                   .reset_index()
                   .sort_values('date'))
    pivot = (only_top_df.pivot(index='ngram', columns='date', values='diff')
             .fillna(-1))
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(pivot, xticklabels=only_top_df.date.dt.week.unique(), ax=ax)
    plt.title(title)
    plt.xlabel('Week')
    plt.tight_layout()

# In[88]:
def weekday_heatmap(by_day_df, title):
    """Heatmap of the ten most over-represented n-grams, aggregated by day of week."""
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekdays_type = CategoricalDtype(categories=weekdays, ordered=True)
    by_day_df['weekday'] = by_day_df.date.dt.weekday_name.astype(weekdays_type)
    top_topics = (by_day_df.groupby('ngram')
                  .sum()['diff']
                  .reset_index()
                  .sort_values('diff', ascending=False)
                  .head(10)
                  .ngram)
    pivot = (by_day_df[by_day_df.ngram.isin(top_topics)]
             .groupby(['weekday', 'ngram'])
             .mean()['diff']
             .reset_index()
             .pivot(index='ngram', columns='weekday', values='diff')
             .fillna(-1))
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(pivot, ax=ax)
    plt.title(title)
    plt.xlabel('Day of week')
    plt.tight_layout()

# # Base words from AskReddit

# In[8]:
whole_askreddit_df = pd.read_json('aggregated/askreddit_whole.json', orient='values')

# In[9]:
whole_askreddit_df = whole_askreddit_df.rename(columns={0: 'ngram', 1: 'amount'})

# In[10]:
whole_askreddit_df['amount_norm'] = norm(whole_askreddit_df.amount)

# In[11]:
whole_askreddit_df = whole_askreddit_df[whole_askreddit_df.amount > 99]

# # Compare with others

# In[84]:
r_programming_by_day = diff_n_by_day(
    50, whole_askreddit_df, 'programming', '2018-08-01', '2018-10-31',
    exclude=['gt', 'use', 'write'],
)

# In[85]:
weekly_heatmap(r_programming_by_day, 'r/programming')

# In[89]:
weekday_heatmap(r_programming_by_day, 'r/programming by weekday')

# In[96]:
r_sports_by_day = diff_n_by_day(
    50, whole_askreddit_df, 'sports', '2018-08-01', '2018-10-31',
    exclude=['r/sports'],
)

# In[102]:
weekly_heatmap(r_sports_by_day, 'r/sports')

# In[98]:
weekday_heatmap(r_sports_by_day, 'r/sports by weekday')

# In[204]:
r_television_by_day = diff_n_by_day(
    50, whole_askreddit_df, 'television', '2018-08-01', '2018-10-31',
    exclude=['r/television'],
)

# In[208]:
weekly_heatmap(r_television_by_day, 'r/television')

# In[90]:
weekday_heatmap(r_television_by_day, 'r/television by weekday')
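
A minimal worked example of the scoring `diff_n` uses above, on two made-up count tables: amounts are normalized with `(x - x.mean()) / (x.max() - x.min())`, merged against the baseline, and ranked by the difference, so n-grams over-represented in the subreddit relative to AskReddit come out on top.

import pandas as pd

norm = lambda x: (x - x.mean()) / (x.max() - x.min())

# Made-up counts for a subreddit and for the AskReddit baseline.
sub = pd.DataFrame({'ngram': ['linux', 'kernel', 'people'], 'amount': [90, 40, 50]})
base = pd.DataFrame({'ngram': ['people', 'linux'], 'amount': [1000, 20]})

sub['amount_norm'] = norm(sub.amount)
base['amount_norm'] = norm(base.amount)

merged = sub.merge(base, how='left', on='ngram', suffixes=('_left', '_right')).fillna(0)
merged['diff'] = merged.amount_norm_left - merged.amount_norm_right

# 'linux' and 'kernel' rank above 'people', which is common everywhere.
print(merged.sort_values('diff', ascending=False)[['ngram', 'diff']])
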
# Split the n-gram stream from stdin into per-day, per-subreddit files under
# the directory given as argv[1], keeping at most `limit` files open at once.
from datetime import datetime
import json
from functools import lru_cache
from collections import defaultdict
import os
import sys

limit = 1000
fhs = {}
opened = []


def get_fh(day, subreddit):
    path = os.path.join(sys.argv[1], f'{day}_{subreddit}')
    if path not in fhs:
        if len(opened) > limit:
            # Close the oldest handle before opening a new one.
            to_close = opened.pop(0)
            fhs[to_close].close()
            del fhs[to_close]
        fhs[path] = open(path, 'a')
        opened.append(path)
    return fhs[path]


def write(day, subreddit, ngram):
    get_fh(day, subreddit).write(json.dumps(ngram) + '\n')


for line in sys.stdin:
    data = json.loads(line)
    day = datetime.fromtimestamp(data['created_utc']).date().isoformat()
    for ngram in data['ngrams']:
        write(day, data['subreddit'], ngram)
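
One thing worth noting about the splitter above: `datetime.fromtimestamp` interprets `created_utc` in the machine's local timezone, so the day a comment lands in can shift depending on where the script runs. A small sketch of the file naming (timestamp, subreddit and output directory are made up):

import os
from datetime import datetime

created_utc = 1538352000   # hypothetical comment timestamp
subreddit = 'programming'  # hypothetical subreddit
output_dir = 'by_day'      # hypothetical output directory (sys.argv[1] in the script above)

day = datetime.fromtimestamp(created_utc).date().isoformat()
print(os.path.join(output_dir, f'{day}_{subreddit}'))
# e.g. by_day/2018-10-01_programming (the exact date depends on the local timezone)
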
#!/usr/bin/env python
# coding: utf-8

# Very ugly, it didn't want to upload `.ipynb`
# Fetches top posts and comments with praw, tokenizes them and compares
# each subreddit's token counts against r/all.

# In[1]:
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')

# In[2]:
import json

options = json.load(open('credentials.json'))
subreddits = ['all', 'tifu', 'linux', 'trees', 'worldnews', 'news',
              'personalfinance', 'europe', 'gaming',
              'docker', 'food', 'python', 'drunk']

# In[3]:
from collections import Counter
from datetime import datetime
from functools import wraps
from itertools import takewhile
from multiprocessing import Pool
import re

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk import WordNetLemmatizer, pos_tag
import praw
from praw.models import Comment

reddit = praw.Reddit(**options)
sid = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

# In[4]:
def eager(fn):
    """Make a generator function return a list instead."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        return list(fn(*args, **kwargs))
    return wrapper

# In[6]:
def get_comments(comments, level=0):
    """Recursively yield comments up to three levels of replies deep."""
    if level > 3:
        return
    for comment in comments:
        if isinstance(comment, Comment):
            yield comment
            yield from get_comments(comment.replies, level + 1)


def fetch_texts(subreddit):
    """Yield (id, subreddit, post_id, kind, text, created, score) tuples for the
    top posts of the year, their selftexts and their comments."""
    [*posts] = reddit.subreddit(subreddit).top(time_filter='year', limit=1000)
    for n, post in enumerate(posts):
        yield (f'{post.id}_title', subreddit, post.id, 'title',
               post.title, post.created, post.score)
        yield (f'{post.id}_selftext', subreddit, post.id, 'selftext',
               post.selftext, post.created, post.score)
        for comment in get_comments(post.comments):
            yield (f'{post.id}_comment_{comment.id}', subreddit, post.id,
                   'comment', comment.body, comment.created, comment.score)
        print(f"{subreddit}: {n + 1} of {len(posts)}")


def get_subreddit_df(subreddit):
    """Fetch the texts of a subreddit, cached as a pickled DataFrame."""
    try:
        texts_df = pd.read_pickle(f'r_{subreddit}_texts_df.pkl')
    except FileNotFoundError:
        texts_df = pd.DataFrame(
            list(fetch_texts(subreddit)),
            columns=['id', 'subreddit', 'post_id', 'kind', 'text', 'created', 'score'])
        texts_df.to_pickle(f'r_{subreddit}_texts_df.pkl')
    return texts_df

# In[7]:
subreddits_dfs = {subreddit: get_subreddit_df(subreddit)
                  for subreddit in subreddits}

# In[10]:
subreddits_dfs['all'].head()

# In[12]:
# From https://stackoverflow.com/questions/50992974/nltk-wordnetlemmatizer-not-lemmatizing-as-expected
def penn2morphy(penntag):
    """Convert Penn Treebank tags to WordNet POS tags."""
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except KeyError:
        return 'n'


clean_lemma_re = re.compile(r'\W')


def to_lemmas(text):
    """Yield lowercased, lemmatized tokens with stop words and non-word characters removed."""
    for sentence in sent_tokenize(text):
        for token, tag in pos_tag(word_tokenize(sentence)):
            token = token.lower()
            if token in stop_words:
                continue
            token = clean_lemma_re.sub('', token)
            if not token:
                continue
            yield lemma.lemmatize(token, pos=penn2morphy(tag))


@eager
def to_tokens(text):
    """Return unigrams, bigrams and trigrams built from the lemmatized tokens."""
    tokens = [*to_lemmas(text)]
    for token in tokens:
        yield token
    if len(tokens) >= 2:
        for bigram in ngrams(tokens, 2):
            yield ' '.join(bigram)
    if len(tokens) >= 3:
        for trigram in ngrams(tokens, 3):
            yield ' '.join(trigram)


pool = Pool(4)


def to_counted_tokens(series):
    counter = Counter(token for text_tokens in pool.map(to_tokens, series.tolist())
                      for token in text_tokens)
    return counter.items()


def to_tokens_df(series):
    return pd.DataFrame([*to_counted_tokens(series)],
                        columns=['token', 'amount'])


def get_tokens_df(subreddit):
    """Count the tokens of a subreddit, cached as a pickled DataFrame."""
    try:
        tokens_df = pd.read_pickle(f'r_{subreddit}_tokens_2_df.pkl')
    except FileNotFoundError:
        tokens_df = to_tokens_df(subreddits_dfs[subreddit].text)
        tokens_df.to_pickle(f'r_{subreddit}_tokens_2_df.pkl')
    return tokens_df

# In[13]:
tokens_dfs = {subreddit: get_tokens_df(subreddit)
              for subreddit in subreddits}

# In[14]:
df = tokens_dfs['all']
df.head()

# In[15]:
for df in tokens_dfs.values():
    df['amount_norm'] = (df.amount - df.amount.mean()) / (df.amount.max() - df.amount.min())

# In[16]:
df.head()

# In[18]:
def diff_tokens(left_df, right_df):
    """Rank tokens by how much more frequent they are in left_df than in right_df."""
    tokens_df = (left_df.merge(right_df, on='token', how='outer', suffixes=('_left', '_right'))
                 .fillna(-1))
    tokens_df['amount_diff'] = tokens_df['amount_left'] - tokens_df['amount_right']
    tokens_df['amount_norm_diff'] = tokens_df['amount_norm_left'] - tokens_df['amount_norm_right']
    return (tokens_df[['token', 'amount_diff', 'amount_norm_diff']]
            .sort_values('amount_norm_diff', ascending=False))

# In[20]:
diff_tokens(tokens_dfs['linux'], tokens_dfs['all']).head()

# In[21]:
diff_tokens(tokens_dfs['drunk'], tokens_dfs['all']).head()

# In[22]:
diff_tokens(tokens_dfs['personalfinance'], tokens_dfs['all']).head()