Skip to content

Instantly share code, notes, and snippets.

@CuGBabyBeaR
Last active September 6, 2018 09:38
Show Gist options
  • Save CuGBabyBeaR/5020281af85216939031 to your computer and use it in GitHub Desktop.
Save CuGBabyBeaR/5020281af85216939031 to your computer and use it in GitHub Desktop.
A data-scraping and analysis project about the Chinese fans of AKB48 members
# -*- coding: utf-8 -*-
import codecs, datetime, math, re
import pandas as pd
import numpy as np
from numpy import nan as NA
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Column names applied to the rows returned by loadcsv (see the __main__ block).
# In order: user ID, username, sex, birthday, first oshi, second oshi, posts,
# replies, registration time, last login time, points, currency.
COLNAMES = [u'用户ID',u'用户名',u'性别',u'生日',u'一推',u'二推',u'发帖',u'回复',u'注册时间',u'最后登陆时间',u'积分',u'通货']
# Number of top rows kept for the sex_oshi chart; also used for y-axis limits.
NUM_SHOWN = 20
# Select SimHei as the sans-serif font (presumably so CJK labels render; confirm
# the font is installed on the target machine).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Font passed explicitly to axis labels and tick labels. The path is
# Windows-specific. NOTE(review): the raw string keeps the doubled backslashes
# literally; verify this resolves on the target system.
font = FontProperties(fname=r"c:\\windows\\fonts\\simsun.ttc", size=14)
def cnstr2date(datetimestr):
    """Parse a Chinese-formatted date such as u"1990 年 5 月 12 日".

    Args:
        datetimestr: Text that must start with "<yyyy> 年 <m> 月 <d> 日",
            with a single space on each side of the numbers.

    Returns:
        A ``datetime.date`` on success. ``NA`` is returned when the text does
        not match the pattern or names an impossible calendar date (for
        example month 13 or year 0000).
    """
    # A plain u"" literal with escaped backslashes replaces the Python-2-only
    # ur"" prefix; the regular expression itself is unchanged.
    m = re.match(u"(\\d{4}) 年 (\\d{1,2}) 月 (\\d{1,2}) 日", datetimestr)
    if not m:
        return NA
    year, month, day = (int(part) for part in m.groups())
    try:
        return datetime.date(year, month, day)
    except ValueError:
        # The digits matched but do not form a real date; treat as missing
        # instead of aborting the whole load.
        return NA
def str2datetime(datetimestr):
    """Parse "YYYY-M-D H:M[:S]" text into a ``datetime.datetime``.

    Args:
        datetimestr: A date part and a time part separated by whitespace.
            Any seconds component is ignored.

    Returns:
        A ``datetime.datetime`` with minute precision (seconds are 0).

    Raises:
        IndexError: If the date or time part is missing or incomplete.
        ValueError: If a component is not an integer or is out of range.
    """
    # split() with no argument tolerates leading, trailing and repeated
    # whitespace; split(' ') produced empty fields for those and crashed.
    fields = datetimestr.split()
    ymd = fields[0].split('-')
    hm = fields[1].split(':')
    return datetime.datetime(
        int(ymd[0]), int(ymd[1]), int(ymd[2]), int(hm[0]), int(hm[1]))
def str2dig(digstr):
    """Convert a digit string to ``int``; any falsy input ('' or None) gives 0."""
    return int(digstr) if digstr else 0
def csvSorter(
        src = 'oshimen_raw_data.csv',
        tar = 'oshimen_data.csv',
        spam = 'oshimen_spam_data',
        ghost = 'oshimen_ghost_data',
        zombie = 'oshimen_zombie_data',
        threshold = 2,
        ):
    """Split the raw crawl CSV into accepted rows and suspected-spam rows.

    The first line of ``src`` is treated as a header and skipped. Every other
    non-blank line is scored and written unchanged (stripped, followed by a
    newline) to ``tar`` when its score reaches ``threshold``, otherwise to
    ``spam``. Neither output file gets a header line.

    Score components, by comma-separated field index:
        * +1 when last login (field 9) is more than 7 days after
          registration (field 8);
        * + 0.5 * ln(points + 1), with points from field 10;
        * + 0.1 * ln((currency + 1) / 20), with currency from field 11;
        * reset to 0 when the account has posts (field 6) but no replies
          (field 7).

    Args:
        src: Path of the raw UTF-8 CSV to read.
        tar: Path that receives the accepted rows.
        spam: Path that receives the rejected rows.
        ghost: Accepted for compatibility; not used by this function.
        zombie: Accepted for compatibility; not used by this function.
        threshold: Minimum score a row needs to be accepted.
    """
    # The with-statement closes all three files even if a row fails to parse.
    with codecs.open(src, 'r', 'utf-8') as srcfile, \
            codecs.open(tar, 'wb+', 'utf-8') as tarfile, \
            codecs.open(spam, 'wb+', 'utf-8') as spamfile:
        for line in srcfile.readlines()[1:]:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line has no fields to score.
                continue
            data = line.split(',')
            score = 0

            # Registration-time score: the account was used over a week.
            regtime = str2datetime(data[8])
            lasttime = str2datetime(data[9])
            if (lasttime - regtime).days > 7:
                score += 1

            # Points score (total points = posts + featured posts x5
            # + support power x50 + online time in hours).
            point = str2dig(data[10])
            score += math.log(point + 1) * 0.5

            # Currency score; negative for balances below 19.
            akcoin = str2dig(data[11])
            score += math.log((akcoin + 1.0) / 20) * 0.1

            # Accounts that only start threads and never reply are treated
            # as advertising accounts.
            comment = str2dig(data[7])
            post = str2dig(data[6])
            if post > 0 and comment == 0:
                score = 0

            out = spamfile if score < threshold else tarfile
            out.write(','.join(data))
            out.write('\n')
def loadcsv(csvfilename):
    """Load a header-less, comma-separated UTF-8 file into typed rows.

    Each non-blank line is split on ',' and converted in place by index:
        0  user ID        -> int (0 when empty)
        2  sex            -> 1 for u"男", 0 for u"女", NA otherwise
        3  birthday       -> result of cnstr2date
        4, 5  oshi names  -> spaces removed; NA when empty
        8, 9  timestamps  -> result of str2datetime
        6, 7, 10, 11      -> int (0 when empty)
    Field 1 (username) is left as text.

    Args:
        csvfilename: Path of the file to read.

    Returns:
        A list with one list of converted fields per input line.
    """
    sex_codes = {u"男": 1, u'女': 0}
    data = []
    # The original never closed the file; the with-statement does.
    with codecs.open(csvfilename, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines instead of failing on a missing field.
                continue
            sample = line.split(',')
            # User ID to number.
            sample[0] = str2dig(sample[0])
            # Sex to number; anything unrecognised is missing.
            sample[2] = sex_codes.get(sample[2], NA)
            # Birthday to date.
            sample[3] = cnstr2date(sample[3])
            # Strip spaces from the oshi names; empty names are missing.
            for col in (4, 5):
                sample[col] = sample[col].replace(' ', '') or NA
            # Registration and last login to datetime.
            sample[8] = str2datetime(sample[8])
            sample[9] = str2datetime(sample[9])
            # Posts, replies, points and currency to numbers.
            for col in (6, 7, 10, 11):
                sample[col] = str2dig(sample[col])
            data.append(sample)
    return data
def sex(data):
    """Draw a pie chart of user sex and return the male-to-female ratio.

    The chart is drawn on the current matplotlib figure and saved as
    'userinfo_sex.png'.

    Args:
        data: DataFrame with a u'性别' column coded 1 (male) / 0 (female);
            missing values are ignored by ``value_counts``.

    Returns:
        Number of males divided by number of females. NOTE(review): with no
        female rows the division is by zero; the result then depends on
        numpy's float semantics.
    """
    # value_counts() orders by frequency, while the labels and colours below
    # are fixed male-first. Reindexing to [1, 0] pins that order so the
    # wedges are labelled correctly even when women outnumber men.
    counts = data[u'性别'].value_counts().reindex([1, 0]).fillna(0)
    plt.pie(counts, labels = [u'男',u'女'], explode = [0.05,0.05], colors = ['blue','pink'])
    plt.legend()
    plt.savefig('userinfo_sex.png')
    return counts.loc[1] * 1.0 / counts.loc[0]
def sex_oshi(data, base_rate = 1.0):
    """Tabulate and chart first/second oshi counts split by sex.

    Writes the table to 'sex_oshi.txt' and a mirrored horizontal bar chart
    (male bars extend left, female bars right) to 'sex_oshi.png'. Only the
    NUM_SHOWN members with the most first-oshi votes are kept.

    A "sex score" column (u'性别得分') is added per member:
    ``rate = (male_first + 0.5 * male_second) / (female_first + 0.5 * female_second)``
    is divided by ``base_rate``; a result ``s >= 1`` is stored as ``s - 1``
    and ``s < 1`` as ``1 - 1/s``, so 0 means "same mix as the baseline".

    Args:
        data: DataFrame with u'性别' (1 male / 0 female), u'一推' and u'二推'.
        base_rate: Overall male-to-female ratio used as the baseline.
    """
    first_oshi = data[[u'性别', u'一推']].dropna()
    second_oshi = data[[u'性别', u'二推']].dropna()
    first_groups = first_oshi.groupby(u'性别')
    second_groups = second_oshi.groupby(u'性别')

    # Group key 0 is female and 1 is male (see the coding in loadcsv).
    d = pd.DataFrame({
        u'女 一推' : first_groups.get_group(0)[u'一推'].value_counts(),
        u'男 一推' : first_groups.get_group(1)[u'一推'].value_counts(),
        u'女 二推' : second_groups.get_group(0)[u'二推'].value_counts(),
        u'男 二推' : second_groups.get_group(1)[u'二推'].value_counts(),
        u'总 一推' : first_oshi[u'一推'].value_counts(),
        }).fillna(value = 0)
    # DataFrame.sort was removed from pandas; prefer sort_values when present.
    sort_by = getattr(d, 'sort_values', None) or d.sort
    d = sort_by(u'总 一推', ascending = False).head(NUM_SHOWN)

    # Vectorised score; works for any number of rows, not just NUM_SHOWN.
    male = d[u'男 一推'] + d[u'男 二推'] * 0.5
    female = d[u'女 一推'] + d[u'女 二推'] * 0.5
    relative = (male / female) / base_rate
    sexscore = np.where(relative < 1, -1 / relative + 1, relative - 1)
    d.insert(4, u'性别得分', sexscore)

    with codecs.open('sex_oshi.txt','wb+','utf-8') as f:
        f.write(d.to_string())

    bar_width = 0.8
    # One slot of height 2.0 per member, laid out top to bottom.
    ind = -np.arange(len(d)) * 2.0 - bar_width * 0.5
    fig = plt.figure(figsize=(4,10))
    fig.subplots_adjust(left=0.27, top=0.98, right=0.96, bottom = 0.05)
    # Male bars start at -count so they grow leftwards from zero.
    plt.barh(ind + 0.5 * bar_width, d[u'女 一推'], bar_width, color = 'red', label= u'女 一推')
    plt.barh(ind + 0.5 * bar_width, d[u'男 一推'], bar_width, color = 'blue', label= u'男 一推', left = - d[u'男 一推'])
    plt.barh(ind - 0.5 * bar_width, d[u'女 二推'], bar_width, color = 'pink', label= u'女 二推')
    plt.barh(ind - 0.5 * bar_width, d[u'男 二推'], bar_width, color = 'cyan', label= u'男 二推', left = - d[u'男 二推'])
    ax = plt.gca()
    # ind[-1] is the last drawn row even when fewer than NUM_SHOWN exist.
    ax.axis([-400,200,ind[-1]-bar_width, ind[0] + 2 * bar_width])
    plt.xlabel(u'数量', fontproperties = font,)
    # Tick labels show absolute counts on both sides of zero.
    plt.xticks(np.linspace(-400, 200, 7), ('400','300','200','100','0','100','200'))
    plt.yticks(ind + 0.5 * bar_width, d.axes[0], fontproperties=font)
    plt.legend(loc='lower left')
    plt.savefig('sex_oshi.png')
def age(data):
    """Placeholder for an age analysis; does nothing and returns None."""
    return None
def ageGrouping(birthday, tillday):
    """Map a birthday to an age-bucket index in the range 0..4.

    Age is the difference in calendar years only (month and day are
    ignored). Buckets: 0 for age <= 18, 1 for 19-22, 2 for 23-26,
    3 for 27-30, 4 for 31 and above.

    Args:
        birthday: Object with a ``year`` attribute (e.g. ``datetime.date``).
        tillday: Reference date with a ``year`` attribute.

    Returns:
        The integer bucket index.
    """
    years = tillday.year - birthday.year
    # Floor division keeps the bucket an integer regardless of the Python
    # version or "from __future__ import division".
    group = (years + 1) // 4 - 4
    # Clamp everything outside the table into the first/last bucket.
    return max(0, min(4, group))
def age_oshi(data):
    """Tabulate and chart first/second oshi counts split by age bucket.

    Writes the table to 'age_oshi.txt' and a stacked horizontal bar chart to
    'age_oshi.png'. Only the NUM_SHOWN members with the most first-oshi
    votes are kept. Ages are bucketed with ageGrouping relative to today.

    NOTE(review): ageGrouping yields five buckets (0..4) but only buckets
    0..3 are tabulated, so users in bucket 4 count towards u'总 一推' only.
    The u'26~' column is therefore bucket 3 alone. Confirm whether bucket 4
    should be merged in.

    Args:
        data: DataFrame with u'生日' (dates or missing), u'一推' and u'二推'.
    """
    first_labels = [u'一推 ~18', u'一推 19~22', u'一推 22~26', u'一推 26~',]
    second_labels = [u'二推 ~18', u'二推 19~22', u'二推 22~26', u'二推 26~',]
    first_colors = ['#0000FF','#00CCCC','#00FF00','#FFA500',]
    second_colors = ['#5555FF','#55CCCC','#55FF55','#FFFF00',]

    first_oshi = data[[u'生日', u'一推']].dropna()
    second_oshi = data[[u'生日', u'二推']].dropna()
    today = datetime.date.today()
    first_oshi.insert(2, u'年龄分段', [ageGrouping(b, today) for b in first_oshi[u'生日']])
    second_oshi.insert(2, u'年龄分段', [ageGrouping(b, today) for b in second_oshi[u'生日']])

    # Look groups up by bucket key. Indexing the groups by position shifted
    # every column (or raised IndexError) whenever a bucket had no users.
    first_by_group = dict(
        (key, grp[u'一推'].value_counts())
        for key, grp in first_oshi.groupby(u'年龄分段'))
    second_by_group = dict(
        (key, grp[u'二推'].value_counts())
        for key, grp in second_oshi.groupby(u'年龄分段'))
    empty = pd.Series(dtype = 'float64')
    columns = {u'总 一推': first_oshi[u'一推'].value_counts()}
    for group, (first_label, second_label) in enumerate(zip(first_labels, second_labels)):
        columns[first_label] = first_by_group.get(group, empty)
        columns[second_label] = second_by_group.get(group, empty)

    d = pd.DataFrame(columns).fillna(value = 0)
    # DataFrame.sort was removed from pandas; prefer sort_values when present.
    sort_by = getattr(d, 'sort_values', None) or d.sort
    d = sort_by(u'总 一推', ascending = False).head(NUM_SHOWN)

    with codecs.open('age_oshi.txt','wb+','utf-8') as f:
        f.write(d.to_string())

    fig = plt.figure(figsize=(4,10))
    fig.subplots_adjust(left=0.27, top=0.98, right=0.96, bottom = 0.05)
    bar_width = 0.8
    ind = -np.arange(len(d)) * 2.0 - bar_width * 0.5
    # Running left edges for the two stacked bars of every member.
    first_left = np.zeros(len(d))
    second_left = np.zeros(len(d))
    for first_label, second_label, first_color, second_color in zip(
            first_labels, second_labels, first_colors, second_colors):
        plt.barh(ind + bar_width, d[first_label], bar_width, label=first_label, left = first_left, color = first_color)
        plt.barh(ind, d[second_label], bar_width, label=second_label, left = second_left, color = second_color)
        first_left = first_left + d[first_label].values
        second_left = second_left + d[second_label].values
    ax = plt.gca()
    # ind[-1] is the last drawn row even when fewer than NUM_SHOWN exist.
    ax.axis([0,500,ind[-1] - bar_width, ind[0] + 2.5 * bar_width])
    plt.xlabel(u'数量', fontproperties=font,)
    plt.yticks(ind + bar_width, d.axes[0], fontproperties = font)
    plt.legend(loc = 'lower right')
    plt.savefig('age_oshi.png')
def point(data):
    """Placeholder for a points analysis; does nothing and returns None."""
    return None
if __name__ == '__main__':
    # Script entry point: load the already-filtered CSV, show the distribution
    # of post counts, then produce the sex and sex/age-by-oshi charts.
    # The filtering step below is left disabled.
    #csvSorter('oshimen_raw_data.csv','oshimen_data.csv','oshimen_trash_data.csv')
    data = loadcsv('oshimen_data.csv')
    data = pd.DataFrame(data,columns = COLNAMES)
    # Line plot of how many users have each post count.
    data[u'发帖'].value_counts().plot()
    plt.show()
    # The overall male/female ratio is the baseline for the per-member score.
    base_sexrate = sex(data)
    sex_oshi(data,base_sexrate)
    age_oshi(data)
# -*- coding: utf-8 -*-
import urllib2,re,csv,codecs,time
from lxml.html import soupparser
# Profile page URL template; %d is replaced with the numeric user ID.
URLPATTERN = 'http://club.akb48.com.cn/home.php?mod=space&uid=%d&do=profile'
# Username: the text immediately before the '(UID:' span.
RE_USER = re.compile(ur'([A-Za-z0-9\u2E80-\u9FFF_ ]+)<span class="xw0">\(UID:')
# Profile list items: sex, birthday, first oshi member, second oshi member.
RE_SEX = re.compile(ur'<li><em>性别</em>([\w\u2E80-\u9FFF ]+)</li>')
RE_BIRTHDAY = re.compile(ur'<li><em>生日</em>([\w\u2E80-\u9FFF ]+)</li>')
RE_OSHI_1 = re.compile(ur'<li><em>第一推し成员</em>([\w\u2E80-\u9FFF ]+)</li>')
RE_OSHI_2 = re.compile(ur'<li><em>第二推し成员</em>([\w\u2E80-\u9FFF ]+)</li>')
# Activity counters: reply count and thread count.
RE_COMMENTS = re.compile(ur'回帖数 (\d+)')
RE_POSTS = re.compile(ur'主题数 (\d+)')
# Timestamps: registration time and last visit (digits, '-', ':' and spaces).
RE_REGTIME = re.compile(ur'<li><em>注册时间</em>([\-\d: ]+)</li>')
RE_LASTTIME = re.compile(ur'<li><em>最后访问</em>([\-\d: ]+)</li>')
# Balances: points and AK coins (the latter is followed by a space in the page).
RE_POINT = re.compile(ur'<li><em>积分</em>(\d+)</li>')
RE_AKCOIN = re.compile(ur'<li><em>AK币</em>(\d+) </li>')
# print RE_USER.pattern, RE_OSHI.pattern
def grabHTML(url):
    """Fetch a URL and return its raw body, or None on any failure.

    Args:
        url: Address to request (10 second timeout).

    Returns:
        The response body as a byte string, or ``None`` when the status is
        not 2xx or any exception is raised while fetching.
    """
    try:
        response = urllib2.urlopen(url,timeout = 10)
        try:
            code = response.getcode()
            # Only 2xx counts as success (300 itself is a redirect status).
            if not 200 <= code < 300:
                return None
            return response.read()
        finally:
            # Release the connection on every path, including early return.
            response.close()
    except Exception as e:
        # Deliberately broad: one bad page must not stop a long crawl.
        print(e)
        return None
def ana(html):
    """Extract the profile fields from a fetched profile page.

    Args:
        html: Raw page bytes, expected to be UTF-8.

    Returns:
        An 11-tuple of text fields in this order: user, sex, birthday,
        first oshi, second oshi, posts, comments, registration time,
        last visit, points, AK coins. A field whose pattern does not match
        is an empty string.
    """
    # Replace undecodable bytes instead of raising, so a single malformed
    # page cannot abort the whole crawl.
    html = html.decode('utf-8', 'replace')
    # The order here defines the order of the returned tuple.
    patterns = (
        RE_USER, RE_SEX, RE_BIRTHDAY, RE_OSHI_1, RE_OSHI_2,
        RE_POSTS, RE_COMMENTS, RE_REGTIME, RE_LASTTIME,
        RE_POINT, RE_AKCOIN,
    )
    fields = []
    for pattern in patterns:
        match = pattern.search(html)
        fields.append(match.group(1) if match else u'')
    return tuple(fields)
def Collect(saveto = 'oshimendata.csv', max_uid = 540000, max_failures = 100):
    """Crawl user profiles by ascending user ID and save them as CSV.

    Writes a header line, then one line per profile that yielded at least
    one non-empty field. The crawl stops early once more than
    ``max_failures`` consecutive IDs fail to fetch or parse to nothing.

    Args:
        saveto: Output path; a falsy value makes the function return at once.
        max_uid: Exclusive upper bound of the user IDs to try (starts at 1).
        max_failures: Number of consecutive failures tolerated.

    Returns:
        The field tuple of the last page that was parsed, or ``None`` when
        no page could be fetched (or ``saveto`` was falsy).
    """
    if not saveto:
        return
    counter = 0
    success = 0
    failcounter = 0
    # Pre-set so the final return cannot hit an unbound name.
    data = None
    # The with-statement closes the output even if the crawl is interrupted.
    with codecs.open(saveto,'wb+','utf-8') as f:
        f.write(u'userid,username,sex,birthday,firstOshimen,secondOshimen,post,comment,reg,last,point,akcoin\n')
        for x in xrange(1, max_uid):
            counter += 1
            html = grabHTML(URLPATTERN % x)
            found = False
            if html:
                data = ana(html)
                # A page where no pattern matched is treated as a failure.
                found = bool(''.join(data))
            if found:
                success += 1
                failcounter = 0
                f.write(u'%d,'%x)
                f.write(u','.join(data))
                f.write(u'\n')
            else:
                failcounter += 1
            if failcounter > max_failures:
                break
            print(u'抓取数据 用户ID:%d, (总抓取:%d 成功抓取:%d)' % (x, counter, success))
    return data
if __name__ == '__main__':
    # Run the crawl when executed as a script, saving to oshimendata.csv.
    Collect('oshimendata.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment