Created
November 27, 2018 23:37
-
-
Save nvbn/ece1528ff5af2ecac6d2ee39234287ea to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from matplotlib.ticker import FuncFormatter | |
from nltk.sentiment.vader import SentimentIntensityAnalyzer | |
import praw | |
# Reddit API credentials; fill these in before running the script.
options = dict(client_id='',
               client_secret='',
               user_agent='')
reddit = praw.Reddit(**options)
sid = SentimentIntensityAnalyzer()

# URL whose Reddit submissions are analysed; fill in before running.
url = ''

# Materialise the search listing once: `posts` is iterated again further
# down the script, so it must not be a one-shot iterator.
posts = list(reddit.subreddit('all').search(f"url:{url}", limit=1000))

# One row per submission. `created` is a naive UTC timestamp built with
# pd.to_datetime(unit='s'), which yields the same value as
# `datetime.utcfromtimestamp` without relying on that call (deprecated
# since Python 3.12).
posts_df = pd.DataFrame(
    [(post.id, post.subreddit.display_name, post.title, post.score,
      pd.to_datetime(post.created_utc, unit='s'), post.url,
      post.num_comments, post.upvote_ratio)
     for post in posts],
    columns=['id', 'subreddit', 'title', 'score', 'created',
             'url', 'num_comments', 'upvote_ratio'])

# Upvote ratio: mean per subreddit, with the axis rendered as percentages.
(
    posts_df[['subreddit', 'upvote_ratio']]
    .groupby('subreddit')
    .mean()['upvote_ratio']
    .reset_index()
    .plot(kind='barh', x='subreddit', y='upvote_ratio',
          title='Upvote ratio', legend=False)
    .xaxis
    .set_major_formatter(FuncFormatter(lambda x, _: f'{x * 100:.1f}%'))
)
plt.tight_layout()

# Number of comments: total per subreddit.
(
    posts_df[['subreddit', 'num_comments']]
    .groupby('subreddit')
    .sum()['num_comments']
    .reset_index()
    .plot(kind='barh', x='subreddit', y='num_comments',
          title='Number of comments', legend=False)
)
plt.tight_layout()

# Score: total per subreddit.
(
    posts_df[['subreddit', 'score']]
    .groupby('subreddit')
    .sum()['score']
    .reset_index()
    .plot(kind='barh', x='subreddit', y='score', title='Score', legend=False)
)
plt.tight_layout()

# Title sentiments: expand each polarity_scores() result into columns, attach
# them to posts_df, then plot the per-subreddit sums.
posts_sentiments = posts_df.title.apply(sid.polarity_scores).apply(pd.Series)
posts_df = posts_df.assign(title_neg=posts_sentiments.neg,
                           title_neu=posts_sentiments.neu,
                           title_pos=posts_sentiments.pos,
                           title_compound=posts_sentiments['compound'])
(
    posts_df[['subreddit', 'title_neg', 'title_neu', 'title_pos', 'title_compound']]
    .groupby('subreddit')
    .sum()[['title_neg', 'title_neu', 'title_pos', 'title_compound']]
    .reset_index()
    .rename(columns={'title_neg': 'Negative',
                     'title_pos': 'Positive',
                     'title_neu': 'Neutral',
                     'title_compound': 'Compound'})
    .plot(kind='barh', x='subreddit', title='Title sentiments', legend=True)
)
plt.tight_layout()
def _mean_normalized(value, sample):
    """Return (value - mean(sample)) / (max(sample) - min(sample)), or 0.0.

    0.0 is returned when `sample` is empty or all its values are equal,
    because the divisor would be undefined or zero in those cases.
    """
    if sample.empty:
        return 0.0
    spread = sample.max() - sample.min()
    if spread == 0:
        return 0.0
    return (value - sample.mean()) / spread


def normalize(post):
    """Mean-normalize a post's score and comment count within its subreddit.

    The reference sample is the `new` listing (limit 1000) of the subreddit
    the post belongs to; it is fetched through the module-level `reddit`
    client on every call.

    Args:
        post: submission object exposing `subreddit.display_name`, `score`
            and `num_comments`.

    Returns:
        A `(norm_score, norm_num_comments)` tuple. Each value is
        `(post value - sample mean) / (sample max - sample min)`, or 0.0
        when the sample is empty or has no spread. Without that guard a
        zero spread produced `inf`, which the caller's `fillna(0)` does not
        replace.
    """
    listing = reddit.subreddit(post.subreddit.display_name).new(limit=1000)
    # `entry` (not `post`) so the comprehension does not shadow the argument.
    subreddit_posts_df = pd.DataFrame(
        [(entry.id, entry.score, entry.num_comments) for entry in listing],
        columns=['id', 'score', 'num_comments'])
    norm_score = _mean_normalized(post.score, subreddit_posts_df.score)
    norm_num_comments = _mean_normalized(post.num_comments,
                                         subreddit_posts_df.num_comments)
    return norm_score, norm_num_comments
# Normalize every post against its own subreddit; missing values become 0.
normalized_vals = pd.DataFrame(
    list(map(normalize, posts)),
    columns=['norm_score', 'norm_num_comments'],
).fillna(0)
posts_df[['norm_score', 'norm_num_comments']] = normalized_vals

# Normalized popularity: per-subreddit sums of both normalized metrics.
(
    posts_df[['subreddit', 'norm_score', 'norm_num_comments']]
    .groupby('subreddit')
    .sum()[['norm_score', 'norm_num_comments']]
    .reset_index()
    .rename(columns={'norm_score': 'Normalized score',
                     'norm_num_comments': 'Normalized number of comments'})
    .plot(kind='barh', x='subreddit', title='Normalized popularity')
)
plt.tight_layout()

# Same normalized metrics next to the summed title compound sentiment.
(
    posts_df[['subreddit', 'norm_score', 'norm_num_comments', 'title_compound']]
    .groupby('subreddit')
    .sum()[['norm_score', 'norm_num_comments', 'title_compound']]
    .reset_index()
    .plot(kind='barh', x='subreddit', title='Normalized', legend=True)
)
plt.tight_layout()
def handle_post_comments(post):
    """Summarise the sentiment of a post's comments as a one-row DataFrame.

    Every comment returned by `post.comments.list()` that has a `body` is
    scored with the module-level `sid` analyzer and placed into one of five
    buckets by its compound score:

        neg_neg: compound < -0.6
        neg_neu: -0.6 <= compound < -0.2
        neu_neu: -0.2 <= compound < 0.2
        pos_neu: 0.2 <= compound < 0.6
        pos_pos: compound >= 0.6

    Args:
        post: submission object exposing `id`, `num_comments` and
            `comments.list()`.

    Returns:
        None when the post has no comments (or none with a `body`).
        Otherwise a single-row DataFrame with, for every bucket,
        `root_comments_<bucket>_norm_score` (mean of the mean-normalized
        comment scores), `root_comments_<bucket>_amount` (comment count)
        and `root_comments_<bucket>_percent` (share of all scored
        comments), plus `root_comments_post_id`, `root_comments_key`
        (always 0, kept for compatibility with the previous output) and
        `key` (the post id, used by the caller to merge). Empty buckets are
        reported as zeros so the column set is the same for every post.
    """
    if not post.num_comments:
        return None

    # Entries without a `body` are skipped (presumably PRAW `MoreComments`
    # placeholders -- confirm against the PRAW docs).
    comments_df = pd.DataFrame(
        [(comment.id, comment.body, comment.score)
         for comment in post.comments.list()
         if hasattr(comment, 'body')],
        columns=['id', 'body', 'score'])
    if comments_df.empty:
        return None

    # Mean-normalize the scores; with no spread (e.g. a single comment) the
    # divisor is zero, so fall back to 0.0 instead of producing NaN.
    score_spread = comments_df.score.max() - comments_df.score.min()
    if score_spread:
        norm_score = (comments_df.score - comments_df.score.mean()) / score_spread
    else:
        norm_score = pd.Series(0.0, index=comments_df.index)

    compound = comments_df.body.apply(
        lambda body: sid.polarity_scores(body)['compound'])

    # (name, inclusive lower bound, exclusive upper bound)
    bucket_bounds = (
        ('neg_neg', float('-inf'), -0.6),
        ('neg_neu', -0.6, -0.2),
        ('neu_neu', -0.2, 0.2),
        ('pos_neu', 0.2, 0.6),
        ('pos_pos', 0.6, float('inf')),
    )
    total = len(comments_df)
    summary = {'root_comments_post_id': post.id, 'root_comments_key': 0}
    for name, low, high in bucket_bounds:
        in_bucket = (compound >= low) & (compound < high)
        amount = int(in_bucket.sum())
        summary[f'root_comments_{name}_norm_score'] = (
            norm_score[in_bucket].mean() if amount else 0.0)
        summary[f'root_comments_{name}_amount'] = amount
        summary[f'root_comments_{name}_percent'] = amount / total
    summary['key'] = post.id
    return pd.DataFrame([summary])
# Bucket columns the two plots below select from the merged frame.
percent_columns = ['root_comments_neg_neg_percent',
                   'root_comments_neg_neu_percent',
                   'root_comments_neu_neu_percent',
                   'root_comments_pos_neu_percent',
                   'root_comments_pos_pos_percent']
norm_score_columns = ['root_comments_neg_neg_norm_score',
                      'root_comments_neg_neu_norm_score',
                      'root_comments_neu_neu_norm_score',
                      'root_comments_pos_neu_norm_score',
                      'root_comments_pos_pos_norm_score']

# handle_post_comments returns None for posts without comments. pd.concat
# raises when it receives nothing but None, so drop those entries here and
# fall back to an empty frame that can still be merged on `key`.
comment_frames = [frame
                  for frame in (handle_post_comments(post) for post in posts)
                  if frame is not None]
posts_comments_df = (
    pd.concat(comment_frames) if comment_frames
    else pd.DataFrame(columns=['key'])
).fillna(0)

# Left merge keeps every post; posts without comment data get zeros.
posts_with_comments_df = (
    posts_df
    .assign(key=lambda x: x.id)
    .merge(posts_comments_df, on='key', how='left')
    .fillna(0)
)
# A bucket that never occurred has no column at all; add it as zeros so the
# column selections below cannot raise KeyError.
for column in percent_columns + norm_score_columns:
    if column not in posts_with_comments_df.columns:
        posts_with_comments_df[column] = 0.0

# Percent of comments per sentiment bucket, averaged per subreddit.
# Legend labels are the bare bucket names ('neg neg', ...): the prefix and
# suffix are stripped by their exact lengths, so no padding spaces remain.
(
    posts_with_comments_df[['subreddit'] + percent_columns]
    .groupby('subreddit')
    .mean()[percent_columns]
    .reset_index()
    .rename(columns={
        column: column[len('root_comments_'):-len('_percent')].replace('_', ' ')
        for column in percent_columns})
    .plot(kind='bar', x='subreddit', legend=True,
          title='Percent of comments by sentiments buckets')
    .yaxis
    .set_major_formatter(FuncFormatter(lambda y, _: f'{y * 100:.1f}%'))
)
plt.tight_layout()

# Mean normalized comment score per sentiment bucket, averaged per subreddit.
(
    posts_with_comments_df[['subreddit'] + norm_score_columns]
    .groupby('subreddit')
    .mean()[norm_score_columns]
    .reset_index()
    .rename(columns={
        column: column[len('root_comments_'):-len('_norm_score')].replace('_', ' ')
        for column in norm_score_columns})
    .plot(kind='bar', x='subreddit', legend=True,
          title='Mean normalized score of comments by sentiments buckets')
)
plt.tight_layout()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment