-
-
Save de8ug/cdae0e5106f86fed235ac3a74b71654f to your computer and use it in GitHub Desktop.
A data-scraping and analysis project about the Chinese fans of AKB48 members
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import codecs, datetime, math, re | |
import pandas as pd | |
import numpy as np | |
from numpy import nan as NA | |
import matplotlib.pyplot as plt | |
from matplotlib.font_manager import FontProperties | |
# Column names of the cleaned CSV, in file order: user id, user name, sex,
# birthday, first oshi, second oshi, posts, replies, registration time,
# last login time, points, currency.
COLNAMES = [
    u'用户ID',
    u'用户名',
    u'性别',
    u'生日',
    u'一推',
    u'二推',
    u'发帖',
    u'回复',
    u'注册时间',
    u'最后登陆时间',
    u'积分',
    u'通货',
]

# How many top-ranked members the reports and charts show.
NUM_SHOWN = 20

# Font used by matplotlib so that Chinese labels can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']

# NOTE(review): the raw string keeps the doubled backslashes literally and the
# path is Windows-specific -- confirm this is intended on the target machine.
font = FontProperties(fname=r"c:\\windows\\fonts\\simsun.ttc", size=14)
def cnstr2date(datetimestr):
    """Parse a Chinese-formatted date such as u'1990 年 1 月 2 日'.

    The text must start with a 4-digit year, a 1-2 digit month and a 1-2 digit
    day, each followed by its 年 / 月 / 日 marker; whitespace around the markers
    is optional.

    Returns a ``datetime.date`` on success, or ``NA`` when the text does not
    match or the numbers do not form a real calendar date (e.g. month 13), so
    that one bad row does not abort a whole load.
    """
    m = re.match(u"(\\d{4})\\s*年\\s*(\\d{1,2})\\s*月\\s*(\\d{1,2})\\s*日", datetimestr)
    if not m:
        return NA
    year, month, day = [int(part) for part in m.groups()]
    try:
        return datetime.date(year, month, day)
    except ValueError:
        # Syntactically fine but impossible date: treat it as missing.
        return NA
def str2datetime(datetimestr):
    """Convert a 'Y-M-D h:m' string into a ``datetime.datetime``.

    The date and the time are separated by a single space; only the first
    three '-' separated date fields and the first two ':' separated time
    fields are used (any seconds field is ignored).
    """
    pieces = datetimestr.split(' ')
    ymd = pieces[0].split('-')
    hm = pieces[1].split(':')
    year, month, day = int(ymd[0]), int(ymd[1]), int(ymd[2])
    hour, minute = int(hm[0]), int(hm[1])
    return datetime.datetime(year, month, day, hour, minute)
def str2dig(digstr):
    """Return ``int(digstr)``, or 0 when ``digstr`` is empty or otherwise falsy."""
    return int(digstr) if digstr else 0
def csvSorter(
        src='oshimen_raw_data.csv',
        tar='oshimen_data.csv',
        spam='oshimen_spam_data',
        ghost='oshimen_ghost_data',
        zombie='oshimen_zombie_data',
):
    """Split the raw user CSV into kept rows and suspected-spam rows.

    Every line of ``src`` after the first one (presumably a header row) is
    scored; rows scoring below the threshold of 2 are written to ``spam``,
    all other rows to ``tar``. Blank lines are skipped.

    Score components, using the comma-separated columns by position:
      * +1 when last login (column 9) is more than 7 days after registration
        (column 8);
      * 0.5 * ln(points + 1) from column 10;
      * 0.1 * ln((currency + 1) / 20) from column 11;
      * the score is reset to 0 when the account has posts (column 6) but no
        replies (column 7), which is treated as an advertising account.

    ``ghost`` and ``zombie`` are accepted for backward compatibility but are
    not used.

    All three files are closed even if a row fails to parse.
    """
    threshold = 2
    with codecs.open(src, 'r', 'utf-8') as srcfile, \
            codecs.open(tar, 'wb+', 'utf-8') as tarfile, \
            codecs.open(spam, 'wb+', 'utf-8') as spamfile:
        for line in srcfile.readlines()[1:]:
            line = line.strip()
            if not line:
                # A blank line has no columns to score.
                continue
            data = line.split(',')
            score = 0
            # Score for account lifetime (registration -> last login).
            regtime = str2datetime(data[8])
            lasttime = str2datetime(data[9])
            if (lasttime - regtime).days > 7:
                score += 1
            # Score for points (total points = posts + featured posts x 5
            # + support power x 50 + online time in hours).
            point = str2dig(data[10])
            score += math.log(point + 1) * 0.5
            # Score for currency.
            akcoin = str2dig(data[11])
            score += math.log((akcoin + 1.0) / 20) * 0.1
            # Posts without a single reply are treated as an advertising account.
            comment = str2dig(data[7])
            post = str2dig(data[6])
            if post > 0 and comment == 0:
                score = 0
            out = spamfile if score < threshold else tarfile
            out.write(','.join(data))
            out.write('\n')
def loadcsv(csvfilename):
    """Load the cleaned user CSV and convert every column to a usable type.

    Each non-blank line is split on ',' and converted in place:
      * column 0 (user id), 6 (posts), 7 (replies), 10 (points) and
        11 (currency) become ints via ``str2dig`` (empty -> 0);
      * column 2 (sex) becomes 1 for u'男', 0 for u'女', otherwise ``NA``;
      * column 3 (birthday) becomes a date via ``cnstr2date``;
      * columns 4 and 5 (first / second oshi) lose their spaces and become
        ``NA`` when empty;
      * columns 8 and 9 (registration / last login) become datetimes via
        ``str2datetime``.

    Returns a list of row lists. The file is closed even if a row fails to
    convert.
    """
    sex_codes = {u"男": 1, u'女': 0}
    data = []
    with codecs.open(csvfilename, 'r', 'utf-8') as f:
        for line in f.readlines():
            line = line.strip()
            if not line:
                # Nothing to parse on a blank line.
                continue
            sample = line.split(',')
            # User id as a number.
            sample[0] = str2dig(sample[0])
            # Sex as a number, NA when unknown.
            sample[2] = sex_codes.get(sample[2], NA)
            # Birthday as a date.
            sample[3] = cnstr2date(sample[3])
            # Strip spaces from the oshi names; empty names become NA.
            for col in (4, 5):
                name = sample[col].replace(' ', '')
                sample[col] = name if name else NA
            # Registration and last login as datetimes.
            sample[8] = str2datetime(sample[8])
            sample[9] = str2datetime(sample[9])
            # Posts, replies, points and currency as numbers.
            for col in (6, 7, 10, 11):
                sample[col] = str2dig(sample[col])
            data.append(sample)
    return data
def sex(data):
    """Draw the male/female pie chart and return the male-to-female ratio.

    ``data`` is a DataFrame whose u'性别' column holds 1 for male and 0 for
    female (missing values are ignored by ``value_counts``).

    The counts are put into the fixed order [male, female] before plotting so
    that the u'男' / u'女' labels and colours always match their slices,
    whichever sex is more frequent; a sex that does not occur counts as 0.

    Saves the chart to 'userinfo_sex.png' and returns count(male) / count(female).
    """
    counts = data[u'性别'].value_counts().reindex([1, 0]).fillna(0)
    plt.pie(counts, labels=[u'男', u'女'], explode=[0.05, 0.05], colors=['blue', 'pink'])
    plt.legend()
    plt.savefig('userinfo_sex.png')
    return counts.loc[1] * 1.0 / counts.loc[0]
def sex_oshi(data, base_rate = 1.0):
    """Report how the fans of the most popular members split by sex.

    For users with a known sex, first-oshi and second-oshi picks are counted
    separately for female (sex code 0) and male (sex code 1) users. Members
    are ranked by total first-oshi count and the top ``NUM_SHOWN`` are kept
    (fewer if there are fewer members).

    For each kept member a sex score is computed from
    ``rate = (male_first + 0.5 * male_second) / (female_first + 0.5 * female_second)``
    divided by ``base_rate``: a value >= 1 is reported as ``value - 1`` and a
    value < 1 as ``1 - 1 / value``, so 0 means "same ratio as the base rate",
    positive means more male-skewed and negative more female-skewed.

    Side effects: writes the table to 'sex_oshi.txt' and saves a mirrored bar
    chart to 'sex_oshi.png'.

    ``base_rate`` is the reference male/female ratio (default 1.0).
    """
    first_oshi = data[[u'性别', u'一推']].dropna()
    second_oshi = data[[u'性别', u'二推']].dropna()
    first_groups = first_oshi.groupby(u'性别')
    second_groups = second_oshi.groupby(u'性别')
    # Sex code 0 feeds the u'女' columns, sex code 1 the u'男' columns.
    first_female = first_groups.get_group(0)[u'一推'].value_counts()
    first_male = first_groups.get_group(1)[u'一推'].value_counts()
    first_total = first_oshi[u'一推'].value_counts()
    second_female = second_groups.get_group(0)[u'二推'].value_counts()
    second_male = second_groups.get_group(1)[u'二推'].value_counts()
    d = pd.DataFrame({
        u'女 一推': first_female,
        u'男 一推': first_male,
        u'女 二推': second_female,
        u'男 二推': second_male,
        u'总 一推': first_total,
    }).fillna(value = 0)
    # DataFrame.sort was removed from newer pandas; prefer sort_values when present.
    if hasattr(d, 'sort_values'):
        d = d.sort_values(u'总 一推', ascending = False)
    else:
        d = d.sort(u'总 一推', ascending = False)
    d = d.head(NUM_SHOWN)
    # The table may hold fewer than NUM_SHOWN rows; size everything from it.
    shown = len(d)

    male_weight = d[u'男 一推'].values + d[u'男 二推'].values * 0.5
    female_weight = d[u'女 一推'].values + d[u'女 二推'].values * 0.5
    sexscore = []
    for male, female in zip(male_weight, female_weight):
        score = (male / female) / base_rate
        if score < 1:
            score = - 1 / score + 1
        else:
            score -= 1
        sexscore.append(score)
    d.insert(4, u'性别得分', sexscore)

    with codecs.open('sex_oshi.txt', 'wb+', 'utf-8') as report:
        report.write(d.to_string())

    bar_width = 0.8
    ind = -np.arange(shown) * 2.0 - bar_width * 0.5
    fig = plt.figure(figsize=(4,10))
    fig.subplots_adjust(left=0.27, top=0.98, right=0.96, bottom = 0.05)
    # Female bars grow to the right of 0, male bars to the left.
    plt.barh(ind + 0.5 * bar_width, d[u'女 一推'], bar_width, color = 'red', label= u'女 一推')
    plt.barh(ind + 0.5 * bar_width, d[u'男 一推'], bar_width, color = 'blue', label= u'男 一推', left = - d[u'男 一推'])
    plt.barh(ind - 0.5 * bar_width, d[u'女 二推'], bar_width, color = 'pink', label= u'女 二推')
    plt.barh(ind - 0.5 * bar_width, d[u'男 二推'], bar_width, color = 'cyan', label= u'男 二推', left = - d[u'男 二推'])
    ax = plt.gca()
    ax.axis([-400, 200, ind[-1] - bar_width, ind[0] + 2 * bar_width])
    plt.xlabel(u'数量', fontproperties = font,)
    # Both directions are labelled with positive counts.
    plt.xticks(np.linspace(-400, 200, 7), ('400','300','200','100','0','100','200'))
    plt.yticks(ind + 0.5 * bar_width, d.axes[0], fontproperties=font)
    plt.legend(loc='lower left')
    plt.savefig('sex_oshi.png')
def age(data):
    """Placeholder for an age analysis; does nothing and returns None."""
    return None
def ageGrouping(birthday, tillday):
    """Map a birthday to an age bucket index in the range 0..4.

    The age is the plain difference of calendar years between ``tillday`` and
    ``birthday`` (month and day are ignored). The bucket is
    ``(age + 1) // 4 - 4`` clamped to [0, 4], which gives:
    0 for age <= 18, 1 for 19-22, 2 for 23-26, 3 for 27-30 and 4 for >= 31.

    Floor division is used explicitly so the bucket stays an integer
    regardless of the interpreter's division semantics.
    """
    years = tillday.year - birthday.year
    bucket = (years + 1) // 4 - 4
    return max(0, min(4, bucket))
def _oshi_counts_by_age(frame, oshi_col, today, n_buckets):
    """Return one ``value_counts`` Series of ``oshi_col`` per age bucket.

    The result always has ``n_buckets`` entries, indexed by bucket number; a
    bucket without users yields an empty Series. Buckets at or above
    ``n_buckets - 1`` are merged into the last entry.
    """
    buckets = np.array(
        [min(int(ageGrouping(birth, today)), n_buckets - 1) for birth in frame[u'生日']]
    )
    picks = frame[oshi_col]
    return [picks[buckets == k].value_counts() for k in range(n_buckets)]


def age_oshi(data):
    """Report how the fans of the most popular members split by age.

    Users with a known birthday are bucketed with ``ageGrouping`` relative to
    today's date, and first-oshi / second-oshi picks are counted per bucket.
    Members are ranked by total first-oshi count and the top ``NUM_SHOWN`` are
    kept.

    ``ageGrouping`` yields buckets 0-4 but only four columns are reported, so
    bucket 4 is folded into the open-ended u'26~' column instead of being
    dropped. Counts are looked up by bucket number, so a bucket with no users
    produces a column of zeros rather than shifting the other columns.

    NOTE(review): the column labels are kept as they were; the actual bucket
    boundaries are those of ``ageGrouping`` (<=18, 19-22, 23-26, >=27).

    Side effects: writes the table to 'age_oshi.txt' and saves a stacked bar
    chart to 'age_oshi.png'.
    """
    first_labels = [u'一推 ~18', u'一推 19~22', u'一推 22~26', u'一推 26~',]
    second_labels = [u'二推 ~18', u'二推 19~22', u'二推 22~26', u'二推 26~',]
    first_colors = ['#0000FF','#00CCCC','#00FF00','#FFA500',]
    second_colors = ['#5555FF','#55CCCC','#55FF55','#FFFF00',]

    first_oshi = data[[u'生日', u'一推']].dropna()
    second_oshi = data[[u'生日', u'二推']].dropna()
    today = datetime.date.today()
    first_d = _oshi_counts_by_age(first_oshi, u'一推', today, len(first_labels))
    second_d = _oshi_counts_by_age(second_oshi, u'二推', today, len(second_labels))

    columns = {u'总 一推': first_oshi[u'一推'].value_counts()}
    columns.update(zip(first_labels, first_d))
    columns.update(zip(second_labels, second_d))
    d = pd.DataFrame(columns).fillna(value = 0)
    # DataFrame.sort was removed from newer pandas; prefer sort_values when present.
    if hasattr(d, 'sort_values'):
        d = d.sort_values(u'总 一推', ascending = False)
    else:
        d = d.sort(u'总 一推', ascending = False)
    d = d.head(NUM_SHOWN)

    with codecs.open('age_oshi.txt', 'wb+', 'utf-8') as report:
        report.write(d.to_string())

    fig = plt.figure(figsize=(4,10))
    fig.subplots_adjust(left=0.27, top=0.98, right=0.96, bottom = 0.05)
    bar_width = 0.8
    ind = -np.arange(len(d)) * 2.0 - bar_width * 0.5
    # Running left edges of the stacked segments, one entry per member.
    first_left = np.zeros(len(d))
    second_left = np.zeros(len(d))
    for i in range(len(first_labels)):
        plt.barh(ind + bar_width, d[first_labels[i]], bar_width, label=first_labels[i], left = first_left, color = first_colors[i])
        plt.barh(ind, d[second_labels[i]], bar_width, label=second_labels[i], left = second_left, color = second_colors[i])
        first_left = first_left + d[first_labels[i]].values
        second_left = second_left + d[second_labels[i]].values
    ax = plt.gca()
    # Use the last row actually present; there may be fewer than NUM_SHOWN.
    ax.axis([0, 500, ind[-1] - bar_width, ind[0] + 2.5 * bar_width])
    plt.xlabel(u'数量', fontproperties=font,)
    plt.yticks(ind + bar_width, d.axes[0], fontproperties = font)
    plt.legend(loc = 'lower right')
    plt.savefig('age_oshi.png')
def point(data):
    """Placeholder for a points analysis; does nothing and returns None."""
    return None
if __name__ == '__main__':
    # Optional pre-processing step, currently disabled:
    # csvSorter('oshimen_raw_data.csv','oshimen_data.csv','oshimen_trash_data.csv')
    records = loadcsv('oshimen_data.csv')
    data = pd.DataFrame(records, columns=COLNAMES)

    # Show the distribution of post counts interactively.
    post_counts = data[u'发帖'].value_counts()
    post_counts.plot()
    plt.show()

    # The overall male/female ratio is the baseline for the per-member report.
    base_sexrate = sex(data)
    sex_oshi(data, base_sexrate)
    age_oshi(data)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import urllib2,re,csv,codecs,time | |
from lxml.html import soupparser | |
# Profile page URL; %d is replaced by the numeric user id.
URLPATTERN = 'http://club.akb48.com.cn/home.php?mod=space&uid=%d&do=profile'

# Capture group shared by the free-text profile fields: word characters,
# characters in the range U+2E80..U+9FFF, and spaces.
_CJK_TEXT = u'([\\w\u2E80-\u9FFF ]+)'
# Capture group shared by the timestamp fields: digits, '-', ':' and spaces.
_TIMESTAMP = u'([\\-\\d: ]+)'

# User name, taken from the text right before the "(UID:" marker.
RE_USER = re.compile(u'([A-Za-z0-9\u2E80-\u9FFF_ ]+)<span class="xw0">\\(UID:')

# Free-text profile fields rendered as <li><em>label</em>value</li>.
RE_SEX = re.compile(u'<li><em>性别</em>' + _CJK_TEXT + u'</li>')
RE_BIRTHDAY = re.compile(u'<li><em>生日</em>' + _CJK_TEXT + u'</li>')
RE_OSHI_1 = re.compile(u'<li><em>第一推し成员</em>' + _CJK_TEXT + u'</li>')
RE_OSHI_2 = re.compile(u'<li><em>第二推し成员</em>' + _CJK_TEXT + u'</li>')

# Reply and thread counters.
RE_COMMENTS = re.compile(u'回帖数 (\\d+)')
RE_POSTS = re.compile(u'主题数 (\\d+)')

# Registration and last-visit timestamps.
RE_REGTIME = re.compile(u'<li><em>注册时间</em>' + _TIMESTAMP + u'</li>')
RE_LASTTIME = re.compile(u'<li><em>最后访问</em>' + _TIMESTAMP + u'</li>')

# Numeric points and AK-coin balances (the latter is followed by a space).
RE_POINT = re.compile(u'<li><em>积分</em>(\\d+)</li>')
RE_AKCOIN = re.compile(u'<li><em>AK币</em>(\\d+) </li>')
def grabHTML(url):
    """Download ``url`` and return the raw response body, or None on failure.

    The request uses a 10 second timeout. None is returned when the status
    code is not a 2xx success code or when any exception is raised; the
    exception is printed rather than propagated so that a long crawl keeps
    going. The response is always closed once it has been opened.
    """
    try:
        response = urllib2.urlopen(url, timeout=10)
        try:
            code = response.getcode()
            # Only 2xx counts as success (300 is a redirection status).
            if not 200 <= code < 300:
                return None
            return response.read()
        finally:
            response.close()
    except Exception as e:
        # Deliberately broad: any failure just means "no page for this url".
        print(e)
        return None
def ana(html):
    """Extract the profile fields from a raw profile page.

    ``html`` is the undecoded page body. It is decoded as UTF-8 with invalid
    byte sequences replaced, so a single malformed page cannot raise and
    abort the caller.

    Returns an 11-tuple of strings in this order: user name, sex, birthday,
    first oshi, second oshi, post count, comment count, registration time,
    last visit time, points, AK coins. A field whose pattern does not match
    is returned as an empty string.
    """
    html = html.decode('utf-8', 'replace')
    # One pattern per output field, in output order.
    patterns = (
        RE_USER,
        RE_SEX,
        RE_BIRTHDAY,
        RE_OSHI_1,
        RE_OSHI_2,
        RE_POSTS,
        RE_COMMENTS,
        RE_REGTIME,
        RE_LASTTIME,
        RE_POINT,
        RE_AKCOIN,
    )
    fields = []
    for pattern in patterns:
        match = pattern.search(html)
        fields.append(match.group(1) if match else '')
    return tuple(fields)
def Collect(saveto = 'oshimendata.csv', max_uid = 540000, max_failures = 100):
    """Crawl user profiles by ascending user id and save them as CSV.

    A header line is written to ``saveto`` first. For every user id from 1 up
    to (but not including) ``max_uid`` the profile page is fetched with
    ``grabHTML`` and parsed with ``ana``; a row is written when at least one
    field was extracted. A page that cannot be fetched or yields no field
    counts as a failure, and the crawl stops once more than ``max_failures``
    failures occur in a row. Progress is printed after every user id.

    Does nothing and returns None when ``saveto`` is empty.

    Returns the field tuple of the last page that was parsed, or None when no
    page was ever parsed. The output file is closed even if the crawl raises.
    """
    if not saveto:
        return
    counter = 0
    success = 0
    failcounter = 0
    # Defined up front so the final return is valid even if nothing is parsed.
    data = None
    with codecs.open(saveto, 'wb+', 'utf-8') as f:
        f.write(u'userid,username,sex,birthday,firstOshimen,secondOshimen,post,comment,reg,last,point,akcoin\n')
        for x in xrange(1, max_uid):
            counter += 1
            html = grabHTML(URLPATTERN % x)
            found = False
            if html:
                data = ana(html)
                found = bool(''.join(data))
            if found:
                success += 1
                # Only consecutive failures count towards the stop condition.
                failcounter = 0
                f.write(u'%d,' % x)
                f.write(u','.join(data))
                f.write(u'\n')
            else:
                failcounter += 1
            if failcounter > max_failures:
                break
            print(u'抓取数据 用户ID:%d, (总抓取:%d 成功抓取:%d)' % (x, counter, success))
    return data
if __name__ == '__main__':
    # Run the crawl when this file is executed as a script.
    Collect(saveto='oshimendata.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment