nvbn · November 27, 2018 23:37
diff --git a/community_opinion.pm b/community_opinion.pm
 from datetime import datetime
 import pandas as pd
 import matplotlib.pyplot as plt
 from matplotlib.ticker import FuncFormatter
 from nltk.sentiment.vader import SentimentIntensityAnalyzer
 import praw

 # Reddit API credentials (left blank in this listing).
 options = {
    'client_id': '',
    'client_secret': '',
    'user_agent': '',
 }

 reddit = praw.Reddit(**options)
 sid = SentimentIntensityAnalyzer()

 # Link whose Reddit submissions are analysed (left blank in this listing).
 url = ''

 # Materialise the search listing: it is iterated several times below.
 posts = list(reddit.subreddit('all').search(f"url:{url}", limit=1000))

 # One row per submission returned by the search.
 posts_df = pd.DataFrame(
    [
        (submission.id, submission.subreddit.display_name, submission.title,
         submission.score, datetime.utcfromtimestamp(submission.created_utc),
         submission.url, submission.num_comments, submission.upvote_ratio)
        for submission in posts
    ],
    columns=['id', 'subreddit', 'title', 'score', 'created',
             'url', 'num_comments', 'upvote_ratio'],
 )

 # Mean upvote ratio per subreddit; the value axis is rendered as percentages.
 (
    posts_df
    .groupby('subreddit')['upvote_ratio']
    .mean()
    .reset_index()
    .plot(kind='barh', x='subreddit', y='upvote_ratio',
          title='Upvote ratio', legend=False)
    .xaxis
    .set_major_formatter(
        FuncFormatter(lambda value, _: '{:.1f}%'.format(value * 100)))
 )

 plt.tight_layout()

 # Number of comments (summed per subreddit)
 (
    posts_df
    .groupby('subreddit')['num_comments']
    .sum()
    .reset_index()
    .plot(kind='barh', x='subreddit', y='num_comments',
          title='Number of comments', legend=False)
 )

 plt.tight_layout()

 # Score (summed per subreddit)
 (
    posts_df
    .groupby('subreddit')['score']
    .sum()
    .reset_index()
    .plot(kind='barh', x='subreddit', y='score', title='Score', legend=False)
 )

 plt.tight_layout()

 # Title sentiments
 # Score every title with `sid` and expand each result into columns; the
 # neg / neu / pos / compound columns are copied onto posts_df as title_*.
 posts_sentiments = posts_df.title.apply(sid.polarity_scores).apply(pd.Series)
 posts_df = posts_df.assign(**{
    f'title_{kind}': posts_sentiments[kind]
    for kind in ('neg', 'neu', 'pos', 'compound')
 })

 # Per-subreddit sums of the four title sentiment columns.
 (
    posts_df
    .groupby('subreddit')[['title_neg', 'title_neu', 'title_pos',
                           'title_compound']]
    .sum()
    .reset_index()
    .rename(columns={'title_neg': 'Negative',
                     'title_pos': 'Positive',
                     'title_neu': 'Neutral',
                     'title_compound': 'Compound'})
    .plot(kind='barh', x='subreddit', title='Title sentiments', legend=True)
 )

 plt.tight_layout()


 # Per-subreddit baseline statistics, keyed by subreddit display name, so the
 # listing is downloaded once per subreddit instead of once per post.
 _subreddit_stats_cache = {}


 def _subreddit_stats(name):
    """Return {column: (mean, max - min)} for the subreddit's `new` listing."""
    if name not in _subreddit_stats_cache:
        recent = pd.DataFrame(
            [(item.score, item.num_comments)
             for item in reddit.subreddit(name).new(limit=1000)],
            columns=['score', 'num_comments'])
        _subreddit_stats_cache[name] = {
            column: (recent[column].mean(),
                     recent[column].max() - recent[column].min())
            for column in ('score', 'num_comments')}
    return _subreddit_stats_cache[name]


 def _mean_normalize(value, mean, spread):
    """Return (value - mean) / spread, or 0.0 when the spread is unusable."""
    # An empty listing gives NaN statistics; identical values give spread 0.
    # Dividing in either case would yield NaN/inf, so report "no deviation".
    if pd.isna(mean) or pd.isna(spread) or spread == 0:
        return 0.0
    return (value - mean) / spread


 def normalize(post):
    """Mean-normalise a post's score and comment count within its subreddit.

    The baseline is the subreddit's `new` listing (at most 1000 submissions).

    :param post: submission exposing `score`, `num_comments` and
        `subreddit.display_name`.
    :return: tuple `(norm_score, norm_num_comments)`; each element is 0.0
        when the baseline is empty or has no spread.
    """
    stats = _subreddit_stats(post.subreddit.display_name)
    return (_mean_normalize(post.score, *stats['score']),
            _mean_normalize(post.num_comments, *stats['num_comments']))

 # Normalised (score, number of comments) pair per found post; NaN becomes 0.
 normalized_vals = pd.DataFrame(
    [normalize(submission) for submission in posts],
    columns=['norm_score', 'norm_num_comments'],
 ).fillna(0)

 posts_df[['norm_score', 'norm_num_comments']] = normalized_vals

 # Per-subreddit sums of the normalised popularity figures.
 (
    posts_df
    .groupby('subreddit')[['norm_score', 'norm_num_comments']]
    .sum()
    .reset_index()
    .rename(columns={'norm_score': 'Normalized score',
                     'norm_num_comments': 'Normalized number of comments'})
    .plot(kind='barh', x='subreddit', title='Normalized popularity')
 )

 plt.tight_layout()


 # Same figures next to the summed compound title sentiment.
 (
    posts_df
    .groupby('subreddit')[['norm_score', 'norm_num_comments',
                           'title_compound']]
    .sum()
    .reset_index()
    .plot(kind='barh', x='subreddit', title='Normalized', legend=True)
 )

 plt.tight_layout()


 # Every sentiment bucket, so each summary row has the same columns.
 _SENTIMENT_BUCKETS = ('neg_neg', 'neg_neu', 'neu_neu', 'pos_neu', 'pos_pos')


 def _sentiment_bucket(compound):
    """Map a compound sentiment score to its bucket name."""
    if compound >= 0.6:
        return 'pos_pos'
    if compound >= 0.2:
        return 'pos_neu'
    if compound >= -0.2:
        return 'neu_neu'
    if compound >= -0.6:
        return 'neg_neu'
    return 'neg_neg'


 def handle_post_comments(post):
    """Summarise a post's comments per sentiment bucket.

    Each comment's body is scored with `sid`, and the compound score places
    it in one of five buckets. For every bucket the result holds the mean
    normalised comment score, the number of comments and their share.

    :param post: submission exposing `id`, `num_comments` and `comments`.
    :return: one-row DataFrame with `root_comments_<bucket>_<metric>`
        columns plus `root_comments_post_id`, `root_comments_key` and
        `key` (the post id), or None when there is nothing to summarise.
    """
    if not post.num_comments:
        return None

    # NOTE(review): names say "root" comments, but `comments.list()` looks
    # like it returns the flattened tree - confirm against the PRAW docs.
    # Entries without a `body` attribute are skipped.
    comment_rows = [(comment.id, comment.body, comment.score)
                    for comment in post.comments.list()
                    if hasattr(comment, 'body')]
    if not comment_rows:
        # num_comments was non-zero, but no comment with a body was loaded.
        return None

    root_comments_df = pd.DataFrame(comment_rows,
                                    columns=['id', 'body', 'score'])

    score_spread = root_comments_df.score.max() - root_comments_df.score.min()
    if score_spread:
        root_comments_df['norm_score'] = (
            (root_comments_df.score - root_comments_df.score.mean())
            / score_spread)
    else:
        # All scores equal (e.g. a single comment): avoid 0 / 0.
        root_comments_df['norm_score'] = 0.0

    root_comments_df['bucket'] = root_comments_df.body \
        .apply(lambda body: sid.polarity_scores(body)['compound']) \
        .apply(_sentiment_bucket)

    # reindex adds the buckets that did not occur, with zeros.
    buckets = root_comments_df \
        .groupby('bucket') \
        .agg({'norm_score': 'mean', 'id': 'count'}) \
        .rename(columns={'id': 'amount'}) \
        .reindex(list(_SENTIMENT_BUCKETS)) \
        .fillna(0)
    buckets['percent'] = buckets.amount / buckets.amount.sum()

    # `root_comments_key` is kept only for compatibility with the columns
    # the previous merge-based implementation produced.
    summary = {'root_comments_post_id': post.id, 'root_comments_key': 0}
    for bucket, stats in buckets.iterrows():
        for metric in ('norm_score', 'amount', 'percent'):
            summary[f'root_comments_{bucket}_{metric}'] = stats[metric]
    summary['key'] = post.id

    return pd.DataFrame([summary])

 # Per-post comment summaries; handle_post_comments returns None for posts
 # it cannot summarise, and those are dropped here.
 _comment_frames = [frame
                    for frame in map(handle_post_comments, posts)
                    if frame is not None]

 # pd.concat raises ValueError when it is given nothing to concatenate, so
 # fall back to an empty frame that still carries the merge key.
 posts_comments_df = (
    pd.concat(_comment_frames) if _comment_frames
    else pd.DataFrame(columns=['key'])
 ).fillna(0)

 # Left merge keeps every post; posts without a summary get zeros.
 posts_with_comments_df = posts_df \
    .assign(key=lambda x: x.id) \
    .merge(posts_comments_df, on='key', how='left') \
    .fillna(0)


 percent_columns = ['root_comments_neg_neg_percent',
                    'root_comments_neg_neu_percent', 'root_comments_neu_neu_percent',
                    'root_comments_pos_neu_percent', 'root_comments_pos_pos_percent']

 # reindex (instead of [[...]]) so that a bucket which never occurred in any
 # post is plotted as 0 rather than raising KeyError. The legend label is the
 # bucket name with the prefix/suffix *and* their underscores removed.
 posts_with_comments_df \
    .reindex(columns=['subreddit'] + percent_columns, fill_value=0) \
    .groupby('subreddit') \
    .mean()[percent_columns] \
    .reset_index() \
    .rename(columns={
        column: column[len('root_comments_'):-len('_percent')].replace('_', ' ')
        for column in percent_columns}) \
    .plot(kind='bar', x='subreddit', legend=True,
          title='Percent of comments by sentiments buckets') \
    .yaxis \
    .set_major_formatter(FuncFormatter(lambda y, _: f'{y * 100:.1f}%'))

 plt.tight_layout()


 norm_score_columns = ['root_comments_neg_neg_norm_score',
                       'root_comments_neg_neu_norm_score',
                       'root_comments_neu_neu_norm_score',
                       'root_comments_pos_neu_norm_score',
                       'root_comments_pos_pos_norm_score']

 # reindex (instead of [[...]]) so that a bucket which never occurred in any
 # post is plotted as 0 rather than raising KeyError. The legend label is the
 # bucket name with the prefix/suffix *and* their underscores removed.
 posts_with_comments_df \
    .reindex(columns=['subreddit'] + norm_score_columns, fill_value=0) \
    .groupby('subreddit') \
    .mean()[norm_score_columns] \
    .reset_index() \
    .rename(columns={
        column: column[len('root_comments_'):-len('_norm_score')].replace('_', ' ')
        for column in norm_score_columns}) \
    .plot(kind='bar', x='subreddit', legend=True,
          title='Mean normalized score of comments by sentiments buckets')

 plt.tight_layout()
	from datetime import datetime
	import pandas as pd
	import matplotlib.pyplot as plt
	from matplotlib.ticker import FuncFormatter
	from nltk.sentiment.vader import SentimentIntensityAnalyzer
	import praw

	# Reddit API credentials (left blank in this listing).
	options = {
	    'client_id': '',
	    'client_secret': '',
	    'user_agent': '',
	}

	reddit = praw.Reddit(**options)
	sid = SentimentIntensityAnalyzer()

	# Link whose Reddit submissions are analysed (left blank in this listing).
	url = ''

	# Materialise the search listing: it is iterated several times below.
	posts = list(reddit.subreddit('all').search(f"url:{url}", limit=1000))

	# One row per submission returned by the search.
	posts_df = pd.DataFrame(
	    [
	        (submission.id, submission.subreddit.display_name, submission.title,
	         submission.score, datetime.utcfromtimestamp(submission.created_utc),
	         submission.url, submission.num_comments, submission.upvote_ratio)
	        for submission in posts
	    ],
	    columns=['id', 'subreddit', 'title', 'score', 'created',
	             'url', 'num_comments', 'upvote_ratio'],
	)

	# Mean upvote ratio per subreddit; the value axis is rendered as percentages.
	(
	    posts_df
	    .groupby('subreddit')['upvote_ratio']
	    .mean()
	    .reset_index()
	    .plot(kind='barh', x='subreddit', y='upvote_ratio',
	          title='Upvote ratio', legend=False)
	    .xaxis
	    .set_major_formatter(
	        FuncFormatter(lambda value, _: '{:.1f}%'.format(value * 100)))
	)

	plt.tight_layout()

	# Number of comments (summed per subreddit)
	(
	    posts_df
	    .groupby('subreddit')['num_comments']
	    .sum()
	    .reset_index()
	    .plot(kind='barh', x='subreddit', y='num_comments',
	          title='Number of comments', legend=False)
	)

	plt.tight_layout()

	# Score (summed per subreddit)
	(
	    posts_df
	    .groupby('subreddit')['score']
	    .sum()
	    .reset_index()
	    .plot(kind='barh', x='subreddit', y='score', title='Score', legend=False)
	)

	plt.tight_layout()

	# Title sentiments
	# Score every title with `sid` and expand each result into columns; the
	# neg / neu / pos / compound columns are copied onto posts_df as title_*.
	posts_sentiments = posts_df.title.apply(sid.polarity_scores).apply(pd.Series)
	posts_df = posts_df.assign(**{
	    f'title_{kind}': posts_sentiments[kind]
	    for kind in ('neg', 'neu', 'pos', 'compound')
	})

	# Per-subreddit sums of the four title sentiment columns.
	(
	    posts_df
	    .groupby('subreddit')[['title_neg', 'title_neu', 'title_pos',
	                           'title_compound']]
	    .sum()
	    .reset_index()
	    .rename(columns={'title_neg': 'Negative',
	                     'title_pos': 'Positive',
	                     'title_neu': 'Neutral',
	                     'title_compound': 'Compound'})
	    .plot(kind='barh', x='subreddit', title='Title sentiments', legend=True)
	)

	plt.tight_layout()


	# Per-subreddit baseline statistics, keyed by subreddit display name, so the
	# listing is downloaded once per subreddit instead of once per post.
	_subreddit_stats_cache = {}


	def _subreddit_stats(name):
	    """Return {column: (mean, max - min)} for the subreddit's `new` listing."""
	    if name not in _subreddit_stats_cache:
	        recent = pd.DataFrame(
	            [(item.score, item.num_comments)
	             for item in reddit.subreddit(name).new(limit=1000)],
	            columns=['score', 'num_comments'])
	        _subreddit_stats_cache[name] = {
	            column: (recent[column].mean(),
	                     recent[column].max() - recent[column].min())
	            for column in ('score', 'num_comments')}
	    return _subreddit_stats_cache[name]


	def _mean_normalize(value, mean, spread):
	    """Return (value - mean) / spread, or 0.0 when the spread is unusable."""
	    # An empty listing gives NaN statistics; identical values give spread 0.
	    # Dividing in either case would yield NaN/inf, so report "no deviation".
	    if pd.isna(mean) or pd.isna(spread) or spread == 0:
	        return 0.0
	    return (value - mean) / spread


	def normalize(post):
	    """Mean-normalise a post's score and comment count within its subreddit.

	    The baseline is the subreddit's `new` listing (at most 1000 submissions).

	    :param post: submission exposing `score`, `num_comments` and
	        `subreddit.display_name`.
	    :return: tuple `(norm_score, norm_num_comments)`; each element is 0.0
	        when the baseline is empty or has no spread.
	    """
	    stats = _subreddit_stats(post.subreddit.display_name)
	    return (_mean_normalize(post.score, *stats['score']),
	            _mean_normalize(post.num_comments, *stats['num_comments']))

	# Normalised (score, number of comments) pair per found post; NaN becomes 0.
	normalized_vals = pd.DataFrame(
	    [normalize(submission) for submission in posts],
	    columns=['norm_score', 'norm_num_comments'],
	).fillna(0)

	posts_df[['norm_score', 'norm_num_comments']] = normalized_vals

	# Per-subreddit sums of the normalised popularity figures.
	(
	    posts_df
	    .groupby('subreddit')[['norm_score', 'norm_num_comments']]
	    .sum()
	    .reset_index()
	    .rename(columns={'norm_score': 'Normalized score',
	                     'norm_num_comments': 'Normalized number of comments'})
	    .plot(kind='barh', x='subreddit', title='Normalized popularity')
	)

	plt.tight_layout()


	# Same figures next to the summed compound title sentiment.
	(
	    posts_df
	    .groupby('subreddit')[['norm_score', 'norm_num_comments',
	                           'title_compound']]
	    .sum()
	    .reset_index()
	    .plot(kind='barh', x='subreddit', title='Normalized', legend=True)
	)

	plt.tight_layout()


	# Every sentiment bucket, so each summary row has the same columns.
	_SENTIMENT_BUCKETS = ('neg_neg', 'neg_neu', 'neu_neu', 'pos_neu', 'pos_pos')


	def _sentiment_bucket(compound):
	    """Map a compound sentiment score to its bucket name."""
	    if compound >= 0.6:
	        return 'pos_pos'
	    if compound >= 0.2:
	        return 'pos_neu'
	    if compound >= -0.2:
	        return 'neu_neu'
	    if compound >= -0.6:
	        return 'neg_neu'
	    return 'neg_neg'


	def handle_post_comments(post):
	    """Summarise a post's comments per sentiment bucket.

	    Each comment's body is scored with `sid`, and the compound score places
	    it in one of five buckets. For every bucket the result holds the mean
	    normalised comment score, the number of comments and their share.

	    :param post: submission exposing `id`, `num_comments` and `comments`.
	    :return: one-row DataFrame with `root_comments_<bucket>_<metric>`
	        columns plus `root_comments_post_id`, `root_comments_key` and
	        `key` (the post id), or None when there is nothing to summarise.
	    """
	    if not post.num_comments:
	        return None

	    # NOTE(review): names say "root" comments, but `comments.list()` looks
	    # like it returns the flattened tree - confirm against the PRAW docs.
	    # Entries without a `body` attribute are skipped.
	    comment_rows = [(comment.id, comment.body, comment.score)
	                    for comment in post.comments.list()
	                    if hasattr(comment, 'body')]
	    if not comment_rows:
	        # num_comments was non-zero, but no comment with a body was loaded.
	        return None

	    root_comments_df = pd.DataFrame(comment_rows,
	                                    columns=['id', 'body', 'score'])

	    score_spread = root_comments_df.score.max() - root_comments_df.score.min()
	    if score_spread:
	        root_comments_df['norm_score'] = (
	            (root_comments_df.score - root_comments_df.score.mean())
	            / score_spread)
	    else:
	        # All scores equal (e.g. a single comment): avoid 0 / 0.
	        root_comments_df['norm_score'] = 0.0

	    root_comments_df['bucket'] = root_comments_df.body \
	        .apply(lambda body: sid.polarity_scores(body)['compound']) \
	        .apply(_sentiment_bucket)

	    # reindex adds the buckets that did not occur, with zeros.
	    buckets = root_comments_df \
	        .groupby('bucket') \
	        .agg({'norm_score': 'mean', 'id': 'count'}) \
	        .rename(columns={'id': 'amount'}) \
	        .reindex(list(_SENTIMENT_BUCKETS)) \
	        .fillna(0)
	    buckets['percent'] = buckets.amount / buckets.amount.sum()

	    # `root_comments_key` is kept only for compatibility with the columns
	    # the previous merge-based implementation produced.
	    summary = {'root_comments_post_id': post.id, 'root_comments_key': 0}
	    for bucket, stats in buckets.iterrows():
	        for metric in ('norm_score', 'amount', 'percent'):
	            summary[f'root_comments_{bucket}_{metric}'] = stats[metric]
	    summary['key'] = post.id

	    return pd.DataFrame([summary])

	# Per-post comment summaries; handle_post_comments returns None for posts
	# it cannot summarise, and those are dropped here.
	_comment_frames = [frame
	                   for frame in map(handle_post_comments, posts)
	                   if frame is not None]

	# pd.concat raises ValueError when it is given nothing to concatenate, so
	# fall back to an empty frame that still carries the merge key.
	posts_comments_df = (
	    pd.concat(_comment_frames) if _comment_frames
	    else pd.DataFrame(columns=['key'])
	).fillna(0)

	# Left merge keeps every post; posts without a summary get zeros.
	posts_with_comments_df = posts_df \
	    .assign(key=lambda x: x.id) \
	    .merge(posts_comments_df, on='key', how='left') \
	    .fillna(0)


	percent_columns = ['root_comments_neg_neg_percent',
	                   'root_comments_neg_neu_percent', 'root_comments_neu_neu_percent',
	                   'root_comments_pos_neu_percent', 'root_comments_pos_pos_percent']

	# reindex (instead of [[...]]) so that a bucket which never occurred in any
	# post is plotted as 0 rather than raising KeyError. The legend label is the
	# bucket name with the prefix/suffix *and* their underscores removed.
	posts_with_comments_df \
	    .reindex(columns=['subreddit'] + percent_columns, fill_value=0) \
	    .groupby('subreddit') \
	    .mean()[percent_columns] \
	    .reset_index() \
	    .rename(columns={
	        column: column[len('root_comments_'):-len('_percent')].replace('_', ' ')
	        for column in percent_columns}) \
	    .plot(kind='bar', x='subreddit', legend=True,
	          title='Percent of comments by sentiments buckets') \
	    .yaxis \
	    .set_major_formatter(FuncFormatter(lambda y, _: f'{y * 100:.1f}%'))

	plt.tight_layout()


	norm_score_columns = ['root_comments_neg_neg_norm_score',
	                      'root_comments_neg_neu_norm_score',
	                      'root_comments_neu_neu_norm_score',
	                      'root_comments_pos_neu_norm_score',
	                      'root_comments_pos_pos_norm_score']

	# reindex (instead of [[...]]) so that a bucket which never occurred in any
	# post is plotted as 0 rather than raising KeyError. The legend label is the
	# bucket name with the prefix/suffix *and* their underscores removed.
	posts_with_comments_df \
	    .reindex(columns=['subreddit'] + norm_score_columns, fill_value=0) \
	    .groupby('subreddit') \
	    .mean()[norm_score_columns] \
	    .reset_index() \
	    .rename(columns={
	        column: column[len('root_comments_'):-len('_norm_score')].replace('_', ' ')
	        for column in norm_score_columns}) \
	    .plot(kind='bar', x='subreddit', legend=True,
	          title='Mean normalized score of comments by sentiments buckets')

	plt.tight_layout()