CuGBabyBeaR · September 6, 2018 09:38
diff --git a/analyze.py b/analyze.py
 # -*- coding: utf-8 -*-

 import codecs, datetime, math, re
 import pandas as pd
 import numpy as np
 from numpy import nan as NA
 import matplotlib.pyplot as plt
 from matplotlib.font_manager import FontProperties

 # Column names applied to the rows returned by loadcsv(), in file order:
 # user ID, user name, sex, birthday, first oshi, second oshi, posts,
 # replies, registration time, last login time, points, currency.
 COLNAMES = [u'用户ID',u'用户名',u'性别',u'生日',u'一推',u'二推',u'发帖',u'回复',u'注册时间',u'最后登陆时间',u'积分',u'通货']
 # Number of top rows kept by sex_oshi(); also used for its chart axis limits.
 NUM_SHOWN = 20

 # Select the SimHei font for sans-serif text (presumably so that the Chinese
 # labels render in the figures).
 plt.rcParams['font.sans-serif'] = ['SimHei']
 # NOTE(review): hard-coded Windows font path; because this is a raw string
 # the doubled backslashes are kept literally - confirm it resolves on the
 # target machine.
 font = FontProperties(fname=r"c:\\windows\\fonts\\simsun.ttc", size=14) 


 def cnstr2date(datetimestr):
    """Parse a Chinese-formatted date such as u"1990 年 5 月 12 日".

    Returns a ``datetime.date`` when ``datetimestr`` starts with a
    "<year> 年 <month> 月 <day> 日" pattern naming a real calendar day,
    otherwise ``NA`` (NaN) so that pandas treats the value as missing.
    """
    # A u"" literal with escaped backslashes denotes exactly the same pattern
    # as the former ur"" literal, but is also valid syntax on Python 3.
    m = re.match(u"(\\d{4}) 年 (\\d{1,2}) 月 (\\d{1,2}) 日", datetimestr)
    if not m:
        return NA
    year, month, day = (int(part) for part in m.groups())
    try:
        return datetime.date(year, month, day)
    except ValueError:
        # Well-formed but impossible date (e.g. month 13): treat as missing
        # instead of aborting the whole load.
        return NA

 def str2datetime(datetimestr):
    """Convert a "YYYY-MM-DD HH:MM" string into a ``datetime.datetime``.

    Only year, month, day, hour and minute are used; anything following the
    minute field (for example seconds) is ignored.
    """
    pieces = datetimestr.split(' ')
    ymd = pieces[0].split('-')
    hm = pieces[1].split(':')
    numbers = [int(ymd[i]) for i in range(3)] + [int(hm[i]) for i in range(2)]
    return datetime.datetime(*numbers)

 def str2dig(digstr):
    """Return ``int(digstr)``, or 0 when ``digstr`` is empty (falsy)."""
    return int(digstr) if digstr else 0

 def csvSorter(
    src = 'oshimen_raw_data.csv',
    tar = 'oshimen_data.csv',
    spam = 'oshimen_spam_data',
    ghost =  'oshimen_ghost_data',
    zombie =  'oshimen_zombie_data',
    ):
    """Split the raw scrape into kept accounts and suspected spam accounts.

    Each data row of ``src`` (its first line is skipped as a header) gets an
    activity score.  Rows scoring below the threshold are written to
    ``spam``; all other rows are written to ``tar``.  ``ghost`` and
    ``zombie`` are accepted but not used by this implementation.
    """
    threshold = 2

    # Context managers close all three files even when a malformed row raises.
    with codecs.open(src,'r','utf-8') as srcfile, \
            codecs.open(tar,'wb+','utf-8') as tarfile, \
            codecs.open(spam,'wb+','utf-8') as spamfile:
        for line in srcfile.readlines()[1:]:
            line = line.strip()
            if not line:
                # A blank line carries no record; skip it instead of failing.
                continue
            data = line.split(',')
            score = 0

            # Account lifetime: more than a week between registration and
            # last login earns one point.
            regtime = str2datetime(data[8])
            lasttime = str2datetime(data[9])
            if (lasttime - regtime).days > 7:
                score += 1

            # Points (total points = posts + featured posts x5
            # + support power x50 + hours online).
            score += math.log(str2dig(data[10]) + 1) * 0.5

            # Currency.
            score += math.log((str2dig(data[11]) + 1.0) / 20) * 0.1

            # Posts but no replies at all: treat as an advertising account.
            if str2dig(data[6]) > 0 and str2dig(data[7]) == 0:
                score = 0

            target = spamfile if score < threshold else tarfile
            target.write(','.join(data))
            target.write('\n')

 def loadcsv(csvfilename):
    """Load the cleaned CSV into a list of rows with typed fields.

    Each row is a list laid out as: user ID (int), user name, sex (1 for
    u"男", 0 for u'女', NA otherwise), birthday (date or NA), first oshi and
    second oshi (NA when empty), posts (int), replies (int), registration
    time and last login time (datetimes), points (int), currency (int).
    """
    sex_codes = {u"男": 1, u'女': 0}
    data = []
    # Read inside a context manager so the file handle is always released.
    with codecs.open(csvfilename,'r','utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line carries no record; skip it instead of failing.
            continue
        sample = line.split(',')
        # User ID as a number.
        sample[0] = str2dig(sample[0])
        # Sex as a numeric code.
        sample[2] = sex_codes.get(sample[2], NA)
        # Birthday as a date.
        sample[3] = cnstr2date(sample[3])
        # Oshi names: remove embedded spaces; an empty name becomes NA.
        for col in (4, 5):
            sample[col] = sample[col].replace(' ','') or NA
        # Registration and last login as datetimes.
        for col in (8, 9):
            sample[col] = str2datetime(sample[col])
        # Posts, replies, points and currency as numbers.
        for col in (6, 7, 10, 11):
            sample[col] = str2dig(sample[col])

        data.append(sample)
    return data

 def sex(data):
    """Draw the gender pie chart and return the male-to-female ratio.

    ``data`` is the user table; its u'性别' column holds 1 for male, 0 for
    female and NaN for unknown (NaN rows are not counted).  The chart is
    saved to 'userinfo_sex.png'.
    """
    counts = data[u'性别'].value_counts()
    # value_counts() orders by frequency, so pin the order to (male, female);
    # otherwise labels and colours are swapped whenever women outnumber men.
    counts = counts.reindex([1, 0]).fillna(0)

    plt.pie(counts, labels = [u'男',u'女'] , explode = [0.05,0.05], colors = ['blue','pink'])
    plt.legend()
    plt.savefig('userinfo_sex.png')
    return counts[1] * 1.0 / counts[0]

 def sex_oshi(data, base_rate = 1.0):
    """Tabulate and chart first/second oshi counts split by gender.

    Users with unknown gender or without the relevant oshi are ignored.  The
    table is sorted by the overall first-oshi count and cut to the top
    ``NUM_SHOWN`` members.  A gender score is added per member: the member's
    male/female ratio (second oshi weighted 0.5) divided by ``base_rate``,
    mapped so that 0 means "same as the base rate", positive values lean
    male and negative values lean female.  The table is written to
    'sex_oshi.txt' and a mirrored horizontal bar chart to 'sex_oshi.png'.
    """
    shared_drop = [u'用户ID',u'用户名',u'生日',u'发帖',u'回复',u'注册时间',u'最后登陆时间',u'积分',u'通货']
    first_oshi = data.drop(shared_drop + [u'二推'], axis = 1).dropna()
    second_oshi = data.drop(shared_drop + [u'一推'], axis = 1).dropna()
    first_groups = first_oshi.groupby(u'性别')
    second_groups = second_oshi.groupby(u'性别')
    # Gender codes: 0 = female, 1 = male.
    first_female = first_groups.get_group(0)[u'一推'].value_counts()
    first_male = first_groups.get_group(1)[u'一推'].value_counts()
    second_female = second_groups.get_group(0)[u'二推'].value_counts()
    second_male = second_groups.get_group(1)[u'二推'].value_counts()
    total_first = first_oshi[u'一推'].value_counts()

    d = pd.DataFrame({
        u'女 一推' : first_female,
        u'男 一推' : first_male,
        u'女 二推' : second_female,
        u'男 二推' : second_male,
        u'总 一推' : total_first,
        }).fillna(value = 0)
    # DataFrame.sort was removed in pandas 0.20; use sort_values when the
    # installed pandas has it and fall back to sort on older releases.
    sort_by = getattr(d, 'sort_values', None) or d.sort
    d = sort_by(u'总 一推' , ascending = False).head(NUM_SHOWN)

    # Vectorised score, so it works for any number of rows (the table can be
    # shorter than NUM_SHOWN).
    male = d[u'男 一推'] + d[u'男 二推'] * 0.5
    female = d[u'女 一推'] + d[u'女 二推'] * 0.5
    score = (male / female) / base_rate
    sexscore = np.where(score < 1, -1 / score + 1, score - 1)
    d.insert(4,u'性别得分',sexscore)
    with codecs.open('sex_oshi.txt','wb+','utf-8') as f:
        f.write(d.to_string())

    bar_width = 0.8
    # One slot per row actually present, laid out top to bottom.
    ind = -np.arange(len(d)) * 2.0 - bar_width * 0.5

    fig = plt.figure(figsize=(4,10))
    fig.subplots_adjust(left=0.27, top=0.98, right=0.96, bottom = 0.05)

    # Female bars grow to the right of zero, male bars to the left.
    plt.barh(ind + 0.5 * bar_width, d[u'女 一推'], bar_width, color = 'red', label= u'女 一推')
    plt.barh(ind + 0.5 * bar_width, d[u'男 一推'], bar_width, color = 'blue', label= u'男 一推', left = - d[u'男 一推'])
    plt.barh(ind - 0.5 * bar_width, d[u'女 二推'], bar_width, color = 'pink', label= u'女 二推')
    plt.barh(ind - 0.5 * bar_width, d[u'男 二推'], bar_width, color = 'cyan', label= u'男 二推', left = - d[u'男 二推'])

    ax = plt.gca()
    ax.axis([-400,200,ind[-1]-bar_width, ind[0] + 2 * bar_width])
    plt.xlabel(u'数量', fontproperties = font,)
    # Tick labels show magnitudes on both sides of zero.
    plt.xticks(np.linspace(-400, 200, 7), ('400','300','200','100','0','100','200'))
    plt.yticks(ind + 0.5 * bar_width, d.axes[0], fontproperties=font)
    plt.legend(loc='lower left')

    plt.savefig('sex_oshi.png')

 def age(data):
    """Placeholder for an age analysis; does nothing and returns None."""
    return None

 def ageGrouping(birthday, tillday):
    """Map a birthday to an integer age-bucket index in the range 0..4.

    Age is the difference in calendar years between ``tillday`` and
    ``birthday`` (month and day are ignored).  Buckets by that age:
    0: 18 and under, 1: 19-22, 2: 23-26, 3: 27-30, 4: 31 and over.
    """
    age = tillday.year - birthday.year
    # Floor division keeps the bucket an integer on Python 3 (and under
    # ``from __future__ import division``), where ``/`` would yield floats.
    bucket = (age + 1) // 4 - 4
    return max(0, min(4, bucket))

 def age_oshi(data):
    """Tabulate and chart first/second oshi counts split by age bucket.

    Users missing a birthday or the relevant oshi are ignored.  For every
    member the table holds the number of fans per age bucket (see
    ``ageGrouping``) for first and second oshi plus the overall first-oshi
    total; it is sorted by that total and cut to the top ``NUM_SHOWN`` rows.
    The table is written to 'age_oshi.txt' and a stacked horizontal bar
    chart to 'age_oshi.png'.
    """
    shared_drop = [u'用户ID',u'用户名',u'性别',u'发帖',u'回复',u'注册时间',u'最后登陆时间',u'积分',u'通货']
    first_oshi = data.drop(shared_drop + [u'二推'], axis = 1).dropna()
    second_oshi = data.drop(shared_drop + [u'一推'], axis = 1).dropna()

    # Age buckets are computed relative to the day the script runs.
    today = datetime.date.today()
    first_oshi.insert(2,u'年龄分段',[ageGrouping(birth,today) for birth in first_oshi[u'生日']])
    second_oshi.insert(2,u'年龄分段',[ageGrouping(birth,today) for birth in second_oshi[u'生日']])

    first_labels = [u'一推 ~18', u'一推 19~22', u'一推 22~26', u'一推 26~',]
    second_labels = [u'二推 ~18', u'二推 19~22', u'二推 22~26', u'二推 26~',]

    # Select each bucket by its key rather than by groupby iteration order,
    # so an empty bucket yields an all-zero column instead of shifting the
    # labels (or raising IndexError).
    # NOTE(review): bucket 4 (the oldest users) has no column of its own,
    # exactly as before; it only contributes to the total - confirm intended.
    columns = {}
    for bucket in range(4):
        in_bucket = first_oshi[u'年龄分段'] == bucket
        columns[first_labels[bucket]] = first_oshi[u'一推'][in_bucket].value_counts()
    for bucket in range(4):
        in_bucket = second_oshi[u'年龄分段'] == bucket
        columns[second_labels[bucket]] = second_oshi[u'二推'][in_bucket].value_counts()
    columns[u'总 一推'] = first_oshi[u'一推'].value_counts()

    d = pd.DataFrame(columns).fillna(value = 0)
    # DataFrame.sort was removed in pandas 0.20; use sort_values when the
    # installed pandas has it and fall back to sort on older releases.
    sort_by = getattr(d, 'sort_values', None) or d.sort
    d = sort_by(u'总 一推' , ascending = False).head(NUM_SHOWN)

    with codecs.open('age_oshi.txt','wb+','utf-8') as f:
        f.write(d.to_string())

    fig = plt.figure(figsize=(4,10))
    fig.subplots_adjust(left=0.27, top=0.98, right=0.96, bottom = 0.05)

    bar_width = 0.8
    ind = -np.arange(len(d)) * 2.0 - bar_width * 0.5

    first_colors = ['#0000FF','#00CCCC','#00FF00','#FFA500',]
    second_colors = ['#5555FF','#55CCCC','#55FF55','#FFFF00',]

    # Running left edges of the stacked bars, one entry per row.
    first_left = np.zeros(len(d))
    second_left = np.zeros(len(d))
    for i in range(4):
        plt.barh(ind + bar_width, d[first_labels[i]], bar_width, label=first_labels[i], left = first_left, color = first_colors[i])
        plt.barh(ind, d[second_labels[i]], bar_width, label=second_labels[i], left = second_left, color = second_colors[i])
        first_left = first_left + d[first_labels[i]].values
        second_left = second_left + d[second_labels[i]].values

    ax = plt.gca()
    # Use the last slot actually present; the table can have fewer rows than
    # NUM_SHOWN.
    ax.axis([0,500,ind[-1] - bar_width, ind[0] + 2.5 * bar_width])

    plt.xlabel(u'数量', fontproperties=font,)
    plt.yticks(ind + bar_width, d.axes[0], fontproperties = font)
    plt.legend(loc = 'lower right')

    plt.savefig('age_oshi.png')


 def point(data):
    """Placeholder for a points analysis; does nothing and returns None."""
    return None



 # Script entry point: load the cleaned data and produce the charts.
 if __name__ == '__main__':
    # One-off cleaning step (raw scrape -> cleaned data + spam), disabled.
    #csvSorter('oshimen_raw_data.csv','oshimen_data.csv','oshimen_trash_data.csv')

    data = loadcsv('oshimen_data.csv')
    data = pd.DataFrame(data,columns = COLNAMES)

    # Plot how many users have each post count and display it interactively.
    data[u'发帖'].value_counts().plot()
    plt.show()
    # The overall ratio returned by sex() is the baseline passed to sex_oshi().
    base_sexrate = sex(data)
    sex_oshi(data,base_sexrate)
    age_oshi(data)
    



diff --git a/grabData.py b/grabData.py
 # -*- coding: utf-8 -*-
 import urllib2,re,csv,codecs,time
 from lxml.html import soupparser 

 # URL template of a user's profile page; %d is filled with the numeric user ID.
 URLPATTERN = 'http://club.akb48.com.cn/home.php?mod=space&uid=%d&do=profile'
 # Patterns that each capture one profile field from the page HTML (see ana()).
 # NOTE(review): the ur'' prefix is Python 2-only syntax.
 # User name: the text immediately before the '(UID:' span.
 RE_USER = re.compile(ur'([A-Za-z0-9\u2E80-\u9FFF_ ]+)<span class="xw0">\(UID:')
 # Sex and birthday list items.
 RE_SEX = re.compile(ur'<li><em>性别</em>([\w\u2E80-\u9FFF ]+)</li>')
 RE_BIRTHDAY = re.compile(ur'<li><em>生日</em>([\w\u2E80-\u9FFF ]+)</li>')
 # First and second oshi (favourite member) list items.
 RE_OSHI_1 = re.compile(ur'<li><em>第一推し成员</em>([\w\u2E80-\u9FFF ]+)</li>')
 RE_OSHI_2 = re.compile(ur'<li><em>第二推し成员</em>([\w\u2E80-\u9FFF ]+)</li>')
 # Reply count and thread count.
 RE_COMMENTS = re.compile(ur'回帖数 (\d+)')
 RE_POSTS = re.compile(ur'主题数 (\d+)')
 # Registration time and last visit time.
 RE_REGTIME = re.compile(ur'<li><em>注册时间</em>([\-\d: ]+)</li>')
 RE_LASTTIME = re.compile(ur'<li><em>最后访问</em>([\-\d: ]+)</li>')
 # Points and AK coin balance.
 RE_POINT = re.compile(ur'<li><em>积分</em>(\d+)</li>')
 RE_AKCOIN =  re.compile(ur'<li><em>AK币</em>(\d+) </li>')
 #　print RE_USER.pattern, RE_OSHI.pattern

 def grabHTML(url):
    """Fetch ``url`` and return the response body, or None on any failure.

    A 10 second timeout is used.  A non-2xx status code or any exception
    raised while fetching yields None; the exception is printed.
    """
    try:
        response = urllib2.urlopen(url,timeout = 10)
        try:
            code = response.getcode()
            # Only 2xx counts as success (the old upper bound let 300 through).
            if not 200 <= code < 300 :
                return None
            return response.read()
        finally:
            # Release the connection on every path.
            response.close()
    except Exception as e:
        # Best-effort crawl: report the problem and let the caller move on.
        print(e)
        return None

 def ana(html):
    """Extract the profile fields from the raw bytes of a profile page.

    Returns an 11-tuple of strings in the order
    (user, sex, birthday, oshi_1, oshi_2, post, comment, reg, last, point,
    akcoin).  A field whose pattern is not found in the page is ''.
    """
    # 'replace' keeps one malformed byte from aborting a long crawl with
    # UnicodeDecodeError; the affected character just will not match.
    html = html.decode('utf-8', 'replace')
    # Patterns listed in the order of the returned tuple.
    patterns = (
        RE_USER,
        RE_SEX,
        RE_BIRTHDAY,
        RE_OSHI_1,
        RE_OSHI_2,
        RE_POSTS,
        RE_COMMENTS,
        RE_REGTIME,
        RE_LASTTIME,
        RE_POINT,
        RE_AKCOIN,
    )
    fields = []
    for pattern in patterns:
        match = pattern.search(html)
        fields.append(match.group(1) if match else '')
    return tuple(fields)

 def Collect(saveto = 'oshimendata.csv'):
    """Crawl user profiles by ascending user ID and write them to a CSV file.

    A header row is written first, then one row per profile from which at
    least one field could be extracted.  The crawl stops after more than 100
    consecutive failures (page not fetched, or nothing extracted).

    Returns None immediately when ``saveto`` is empty.  Otherwise returns the
    result of the last ``ana()`` call, or None when no page was ever fetched.
    """
    if not saveto:
        return

    counter = 0
    success = 0
    failcounter = 0
    # Defined up front so the final return cannot hit an unbound name when
    # every request fails.
    data = None

    # The context manager closes the file even if the crawl raises.
    with codecs.open(saveto,'wb+','utf-8') as f:
        f.write(u'userid,username,sex,birthday,firstOshimen,secondOshimen,post,comment,reg,last,point,akcoin\n')

        for x in xrange(1,540000): #540000
            counter += 1
            html = grabHTML(URLPATTERN % x)
            parsed = False
            if html:
                data = ana(html)
                parsed = bool(''.join(data))
            if parsed:
                success += 1
                failcounter = 0
                f.write(u'%d,'%x)
                f.write(u','.join(data))
                f.write(u'\n')
            else:
                failcounter += 1
            if failcounter > 100:
                break
            # Progress line: user ID, total requests, successful rows.
            print(u'抓取数据 用户ID：%d, (总抓取：%d 成功抓取：%d)' % (x, counter, success))

    return data

 # Script entry point: run the crawl and save the rows to 'oshimendata.csv'.
 if __name__ == '__main__':
    Collect('oshimendata.csv')
	# -- coding: utf-8 --

	import codecs, datetime, math, re
	import pandas as pd
	import numpy as np
	from numpy import nan as NA
	import matplotlib.pyplot as plt
	from matplotlib.font_manager import FontProperties

	# Column names applied to the rows returned by loadcsv(), in file order:
	# user ID, user name, sex, birthday, first oshi, second oshi, posts,
	# replies, registration time, last login time, points, currency.
	COLNAMES = [u'用户ID',u'用户名',u'性别',u'生日',u'一推',u'二推',u'发帖',u'回复',u'注册时间',u'最后登陆时间',u'积分',u'通货']
	# Number of top rows kept by sex_oshi(); also used for its chart axis limits.
	NUM_SHOWN = 20

	# Select the SimHei font for sans-serif text (presumably so that the Chinese
	# labels render in the figures).
	plt.rcParams['font.sans-serif'] = ['SimHei']
	# NOTE(review): hard-coded Windows font path; because this is a raw string
	# the doubled backslashes are kept literally - confirm it resolves on the
	# target machine.
	font = FontProperties(fname=r"c:\\windows\\fonts\\simsun.ttc", size=14)


	def cnstr2date(datetimestr):
	    """Parse a Chinese-formatted date such as u"1990 年 5 月 12 日".

	    Returns a ``datetime.date`` when ``datetimestr`` starts with a
	    "<year> 年 <month> 月 <day> 日" pattern naming a real calendar day,
	    otherwise ``NA`` (NaN) so that pandas treats the value as missing.
	    """
	    # A u"" literal with escaped backslashes denotes exactly the same
	    # pattern as the former ur"" literal, but is also valid on Python 3.
	    m = re.match(u"(\\d{4}) 年 (\\d{1,2}) 月 (\\d{1,2}) 日", datetimestr)
	    if not m:
	        return NA
	    year, month, day = (int(part) for part in m.groups())
	    try:
	        return datetime.date(year, month, day)
	    except ValueError:
	        # Well-formed but impossible date (e.g. month 13): treat as
	        # missing instead of aborting the whole load.
	        return NA

	def str2datetime(datetimestr):
	    """Convert a "YYYY-MM-DD HH:MM" string into a ``datetime.datetime``.

	    Only year, month, day, hour and minute are used; anything following
	    the minute field (for example seconds) is ignored.
	    """
	    # Function body re-indented: the flattened original was not valid Python.
	    pieces = datetimestr.split(' ')
	    ymd = pieces[0].split('-')
	    hm = pieces[1].split(':')
	    numbers = [int(ymd[i]) for i in range(3)] + [int(hm[i]) for i in range(2)]
	    return datetime.datetime(*numbers)

	def str2dig(digstr):
	    """Return ``int(digstr)``, or 0 when ``digstr`` is empty (falsy)."""
	    # Function body re-indented: the flattened original was not valid Python.
	    return int(digstr) if digstr else 0

	def csvSorter(
	    src = 'oshimen_raw_data.csv',
	    tar = 'oshimen_data.csv',
	    spam = 'oshimen_spam_data',
	    ghost = 'oshimen_ghost_data',
	    zombie = 'oshimen_zombie_data',
	    ):
	    """Split the raw scrape into kept accounts and suspected spam accounts.

	    Each data row of ``src`` (its first line is skipped as a header) gets
	    an activity score.  Rows scoring below the threshold are written to
	    ``spam``; all other rows are written to ``tar``.  ``ghost`` and
	    ``zombie`` are accepted but not used by this implementation.
	    """
	    threshold = 2

	    # Context managers close all three files even when a malformed row
	    # raises.
	    with codecs.open(src,'r','utf-8') as srcfile, \
	            codecs.open(tar,'wb+','utf-8') as tarfile, \
	            codecs.open(spam,'wb+','utf-8') as spamfile:
	        for line in srcfile.readlines()[1:]:
	            line = line.strip()
	            if not line:
	                # A blank line carries no record; skip it instead of failing.
	                continue
	            data = line.split(',')
	            score = 0

	            # Account lifetime: more than a week between registration and
	            # last login earns one point.
	            regtime = str2datetime(data[8])
	            lasttime = str2datetime(data[9])
	            if (lasttime - regtime).days > 7:
	                score += 1

	            # Points (total points = posts + featured posts x5
	            # + support power x50 + hours online).
	            score += math.log(str2dig(data[10]) + 1) * 0.5

	            # Currency.
	            score += math.log((str2dig(data[11]) + 1.0) / 20) * 0.1

	            # Posts but no replies at all: treat as an advertising account.
	            if str2dig(data[6]) > 0 and str2dig(data[7]) == 0:
	                score = 0

	            target = spamfile if score < threshold else tarfile
	            target.write(','.join(data))
	            target.write('\n')

	def loadcsv(csvfilename):
	    """Load the cleaned CSV into a list of rows with typed fields.

	    Each row is a list laid out as: user ID (int), user name, sex (1 for
	    u"男", 0 for u'女', NA otherwise), birthday (date or NA), first oshi and
	    second oshi (NA when empty), posts (int), replies (int), registration
	    time and last login time (datetimes), points (int), currency (int).
	    """
	    sex_codes = {u"男": 1, u'女': 0}
	    data = []
	    # Read inside a context manager so the file handle is always released.
	    with codecs.open(csvfilename,'r','utf-8') as f:
	        lines = f.readlines()
	    for line in lines:
	        line = line.strip()
	        if not line:
	            # A blank line carries no record; skip it instead of failing.
	            continue
	        sample = line.split(',')
	        # User ID as a number.
	        sample[0] = str2dig(sample[0])
	        # Sex as a numeric code.
	        sample[2] = sex_codes.get(sample[2], NA)
	        # Birthday as a date.
	        sample[3] = cnstr2date(sample[3])
	        # Oshi names: remove embedded spaces; an empty name becomes NA.
	        for col in (4, 5):
	            sample[col] = sample[col].replace(' ','') or NA
	        # Registration and last login as datetimes.
	        for col in (8, 9):
	            sample[col] = str2datetime(sample[col])
	        # Posts, replies, points and currency as numbers.
	        for col in (6, 7, 10, 11):
	            sample[col] = str2dig(sample[col])

	        data.append(sample)
	    return data

	def sex(data):
	    """Draw the gender pie chart and return the male-to-female ratio.

	    ``data`` is the user table; its u'性别' column holds 1 for male, 0 for
	    female and NaN for unknown (NaN rows are not counted).  The chart is
	    saved to 'userinfo_sex.png'.
	    """
	    counts = data[u'性别'].value_counts()
	    # value_counts() orders by frequency, so pin the order to (male,
	    # female); otherwise labels and colours are swapped whenever women
	    # outnumber men.
	    counts = counts.reindex([1, 0]).fillna(0)

	    plt.pie(counts, labels = [u'男',u'女'] , explode = [0.05,0.05], colors = ['blue','pink'])
	    plt.legend()
	    plt.savefig('userinfo_sex.png')
	    return counts[1] * 1.0 / counts[0]

	def sex_oshi(data, base_rate = 1.0):
	    """Tabulate and chart first/second oshi counts split by gender.

	    Users with unknown gender or without the relevant oshi are ignored.
	    The table is sorted by the overall first-oshi count and cut to the top
	    ``NUM_SHOWN`` members.  A gender score is added per member: the
	    member's male/female ratio (second oshi weighted 0.5) divided by
	    ``base_rate``, mapped so that 0 means "same as the base rate",
	    positive values lean male and negative values lean female.  The table
	    is written to 'sex_oshi.txt' and a mirrored horizontal bar chart to
	    'sex_oshi.png'.
	    """
	    shared_drop = [u'用户ID',u'用户名',u'生日',u'发帖',u'回复',u'注册时间',u'最后登陆时间',u'积分',u'通货']
	    first_oshi = data.drop(shared_drop + [u'二推'], axis = 1).dropna()
	    second_oshi = data.drop(shared_drop + [u'一推'], axis = 1).dropna()
	    first_groups = first_oshi.groupby(u'性别')
	    second_groups = second_oshi.groupby(u'性别')
	    # Gender codes: 0 = female, 1 = male.
	    first_female = first_groups.get_group(0)[u'一推'].value_counts()
	    first_male = first_groups.get_group(1)[u'一推'].value_counts()
	    second_female = second_groups.get_group(0)[u'二推'].value_counts()
	    second_male = second_groups.get_group(1)[u'二推'].value_counts()
	    total_first = first_oshi[u'一推'].value_counts()

	    d = pd.DataFrame({
	        u'女一推' : first_female,
	        u'男一推' : first_male,
	        u'女二推' : second_female,
	        u'男二推' : second_male,
	        u'总一推' : total_first,
	        }).fillna(value = 0)
	    # DataFrame.sort was removed in pandas 0.20; use sort_values when the
	    # installed pandas has it and fall back to sort on older releases.
	    sort_by = getattr(d, 'sort_values', None) or d.sort
	    d = sort_by(u'总一推' , ascending = False).head(NUM_SHOWN)

	    # Vectorised score, so it works for any number of rows (the table can
	    # be shorter than NUM_SHOWN).
	    male = d[u'男一推'] + d[u'男二推'] * 0.5
	    female = d[u'女一推'] + d[u'女二推'] * 0.5
	    score = (male / female) / base_rate
	    sexscore = np.where(score < 1, -1 / score + 1, score - 1)
	    d.insert(4,u'性别得分',sexscore)
	    with codecs.open('sex_oshi.txt','wb+','utf-8') as f:
	        f.write(d.to_string())

	    bar_width = 0.8
	    # One slot per row actually present, laid out top to bottom.
	    ind = -np.arange(len(d)) * 2.0 - bar_width * 0.5

	    fig = plt.figure(figsize=(4,10))
	    fig.subplots_adjust(left=0.27, top=0.98, right=0.96, bottom = 0.05)

	    # Female bars grow to the right of zero, male bars to the left.
	    plt.barh(ind + 0.5 * bar_width, d[u'女一推'], bar_width, color = 'red', label= u'女一推')
	    plt.barh(ind + 0.5 * bar_width, d[u'男一推'], bar_width, color = 'blue', label= u'男一推', left = - d[u'男一推'])
	    plt.barh(ind - 0.5 * bar_width, d[u'女二推'], bar_width, color = 'pink', label= u'女二推')
	    plt.barh(ind - 0.5 * bar_width, d[u'男二推'], bar_width, color = 'cyan', label= u'男二推', left = - d[u'男二推'])

	    ax = plt.gca()
	    ax.axis([-400,200,ind[-1]-bar_width, ind[0] + 2 * bar_width])
	    plt.xlabel(u'数量', fontproperties = font,)
	    # Tick labels show magnitudes on both sides of zero.
	    plt.xticks(np.linspace(-400, 200, 7), ('400','300','200','100','0','100','200'))
	    plt.yticks(ind + 0.5 * bar_width, d.axes[0], fontproperties=font)
	    plt.legend(loc='lower left')

	    plt.savefig('sex_oshi.png')

	def age(data):
	    """Placeholder for an age analysis; does nothing and returns None."""
	    # Body re-indented: the flattened original was not valid Python.
	    return None

	def ageGrouping(birthday, tillday):
	    """Map a birthday to an integer age-bucket index in the range 0..4.

	    Age is the difference in calendar years between ``tillday`` and
	    ``birthday`` (month and day are ignored).  Buckets by that age:
	    0: 18 and under, 1: 19-22, 2: 23-26, 3: 27-30, 4: 31 and over.
	    """
	    age = tillday.year - birthday.year
	    # Floor division keeps the bucket an integer on Python 3 (and under
	    # ``from __future__ import division``), where ``/`` would yield floats.
	    bucket = (age + 1) // 4 - 4
	    return max(0, min(4, bucket))

	def age_oshi(data):
	    """Tabulate and chart first/second oshi counts split by age bucket.

	    Users missing a birthday or the relevant oshi are ignored.  For every
	    member the table holds the number of fans per age bucket (see
	    ``ageGrouping``) for first and second oshi plus the overall first-oshi
	    total; it is sorted by that total and cut to the top ``NUM_SHOWN``
	    rows.  The table is written to 'age_oshi.txt' and a stacked horizontal
	    bar chart to 'age_oshi.png'.
	    """
	    shared_drop = [u'用户ID',u'用户名',u'性别',u'发帖',u'回复',u'注册时间',u'最后登陆时间',u'积分',u'通货']
	    first_oshi = data.drop(shared_drop + [u'二推'], axis = 1).dropna()
	    second_oshi = data.drop(shared_drop + [u'一推'], axis = 1).dropna()

	    # Age buckets are computed relative to the day the script runs.
	    today = datetime.date.today()
	    first_oshi.insert(2,u'年龄分段',[ageGrouping(birth,today) for birth in first_oshi[u'生日']])
	    second_oshi.insert(2,u'年龄分段',[ageGrouping(birth,today) for birth in second_oshi[u'生日']])

	    first_labels = [u'一推 ~18', u'一推 19~22', u'一推 22~26', u'一推 26~',]
	    second_labels = [u'二推 ~18', u'二推 19~22', u'二推 22~26', u'二推 26~',]

	    # Select each bucket by its key rather than by groupby iteration
	    # order, so an empty bucket yields an all-zero column instead of
	    # shifting the labels (or raising IndexError).
	    # NOTE(review): bucket 4 (the oldest users) has no column of its own,
	    # exactly as before; it only contributes to the total - confirm
	    # intended.
	    columns = {}
	    for bucket in range(4):
	        in_bucket = first_oshi[u'年龄分段'] == bucket
	        columns[first_labels[bucket]] = first_oshi[u'一推'][in_bucket].value_counts()
	    for bucket in range(4):
	        in_bucket = second_oshi[u'年龄分段'] == bucket
	        columns[second_labels[bucket]] = second_oshi[u'二推'][in_bucket].value_counts()
	    columns[u'总一推'] = first_oshi[u'一推'].value_counts()

	    d = pd.DataFrame(columns).fillna(value = 0)
	    # DataFrame.sort was removed in pandas 0.20; use sort_values when the
	    # installed pandas has it and fall back to sort on older releases.
	    sort_by = getattr(d, 'sort_values', None) or d.sort
	    d = sort_by(u'总一推' , ascending = False).head(NUM_SHOWN)

	    with codecs.open('age_oshi.txt','wb+','utf-8') as f:
	        f.write(d.to_string())

	    fig = plt.figure(figsize=(4,10))
	    fig.subplots_adjust(left=0.27, top=0.98, right=0.96, bottom = 0.05)

	    bar_width = 0.8
	    ind = -np.arange(len(d)) * 2.0 - bar_width * 0.5

	    first_colors = ['#0000FF','#00CCCC','#00FF00','#FFA500',]
	    second_colors = ['#5555FF','#55CCCC','#55FF55','#FFFF00',]

	    # Running left edges of the stacked bars, one entry per row.
	    first_left = np.zeros(len(d))
	    second_left = np.zeros(len(d))
	    for i in range(4):
	        plt.barh(ind + bar_width, d[first_labels[i]], bar_width, label=first_labels[i], left = first_left, color = first_colors[i])
	        plt.barh(ind, d[second_labels[i]], bar_width, label=second_labels[i], left = second_left, color = second_colors[i])
	        first_left = first_left + d[first_labels[i]].values
	        second_left = second_left + d[second_labels[i]].values

	    ax = plt.gca()
	    # Use the last slot actually present; the table can have fewer rows
	    # than NUM_SHOWN.
	    ax.axis([0,500,ind[-1] - bar_width, ind[0] + 2.5 * bar_width])

	    plt.xlabel(u'数量', fontproperties=font,)
	    plt.yticks(ind + bar_width, d.axes[0], fontproperties = font)
	    plt.legend(loc = 'lower right')

	    plt.savefig('age_oshi.png')


	def point(data):
	    """Placeholder for a points analysis; does nothing and returns None."""
	    # Body re-indented: the flattened original was not valid Python.
	    return None



	# Script entry point: load the cleaned data and produce the charts.
	# (Body re-indented: the flattened original was not valid Python.)
	if __name__ == '__main__':
	    # One-off cleaning step (raw scrape -> cleaned data + spam), disabled.
	    #csvSorter('oshimen_raw_data.csv','oshimen_data.csv','oshimen_trash_data.csv')

	    data = loadcsv('oshimen_data.csv')
	    data = pd.DataFrame(data,columns = COLNAMES)

	    # Plot how many users have each post count and display it interactively.
	    data[u'发帖'].value_counts().plot()
	    plt.show()
	    # The overall ratio returned by sex() is the baseline passed to sex_oshi().
	    base_sexrate = sex(data)
	    sex_oshi(data,base_sexrate)
	    age_oshi(data)
	# -- coding: utf-8 --
	import urllib2,re,csv,codecs,time
	from lxml.html import soupparser

	# URL template of a user's profile page; %d is filled with the numeric user ID.
	URLPATTERN = 'http://club.akb48.com.cn/home.php?mod=space&uid=%d&do=profile'
	# Patterns that each capture one profile field from the page HTML (see ana()).
	# NOTE(review): the ur'' prefix is Python 2-only syntax.
	# User name: the text immediately before the '(UID:' span.
	RE_USER = re.compile(ur'([A-Za-z0-9\u2E80-\u9FFF_ ]+)<span class="xw0">\(UID:')
	# Sex and birthday list items.
	RE_SEX = re.compile(ur'<li><em>性别</em>([\w\u2E80-\u9FFF ]+)</li>')
	RE_BIRTHDAY = re.compile(ur'<li><em>生日</em>([\w\u2E80-\u9FFF ]+)</li>')
	# First and second oshi (favourite member) list items.
	RE_OSHI_1 = re.compile(ur'<li><em>第一推し成员</em>([\w\u2E80-\u9FFF ]+)</li>')
	RE_OSHI_2 = re.compile(ur'<li><em>第二推し成员</em>([\w\u2E80-\u9FFF ]+)</li>')
	# Reply count and thread count.
	RE_COMMENTS = re.compile(ur'回帖数 (\d+)')
	RE_POSTS = re.compile(ur'主题数 (\d+)')
	# Registration time and last visit time.
	RE_REGTIME = re.compile(ur'<li><em>注册时间</em>([\-\d: ]+)</li>')
	RE_LASTTIME = re.compile(ur'<li><em>最后访问</em>([\-\d: ]+)</li>')
	# Points and AK coin balance.
	RE_POINT = re.compile(ur'<li><em>积分</em>(\d+)</li>')
	RE_AKCOIN = re.compile(ur'<li><em>AK币</em>(\d+) </li>')
	#　print RE_USER.pattern, RE_OSHI.pattern

	def grabHTML(url):
	    """Fetch ``url`` and return the response body, or None on any failure.

	    A 10 second timeout is used.  A non-2xx status code or any exception
	    raised while fetching yields None; the exception is printed.
	    """
	    try:
	        response = urllib2.urlopen(url,timeout = 10)
	        try:
	            code = response.getcode()
	            # Only 2xx counts as success (the old upper bound let 300
	            # through).
	            if not 200 <= code < 300 :
	                return None
	            return response.read()
	        finally:
	            # Release the connection on every path.
	            response.close()
	    except Exception as e:
	        # Best-effort crawl: report the problem and let the caller move on.
	        print(e)
	        return None

	def ana(html):
	    """Extract the profile fields from the raw bytes of a profile page.

	    Returns an 11-tuple of strings in the order
	    (user, sex, birthday, oshi_1, oshi_2, post, comment, reg, last, point,
	    akcoin).  A field whose pattern is not found in the page is ''.
	    """
	    # 'replace' keeps one malformed byte from aborting a long crawl with
	    # UnicodeDecodeError; the affected character just will not match.
	    html = html.decode('utf-8', 'replace')
	    # Patterns listed in the order of the returned tuple.
	    patterns = (
	        RE_USER,
	        RE_SEX,
	        RE_BIRTHDAY,
	        RE_OSHI_1,
	        RE_OSHI_2,
	        RE_POSTS,
	        RE_COMMENTS,
	        RE_REGTIME,
	        RE_LASTTIME,
	        RE_POINT,
	        RE_AKCOIN,
	    )
	    fields = []
	    for pattern in patterns:
	        match = pattern.search(html)
	        fields.append(match.group(1) if match else '')
	    return tuple(fields)

	def Collect(saveto = 'oshimendata.csv'):
	    """Crawl user profiles by ascending user ID and write them to a CSV file.

	    A header row is written first, then one row per profile from which at
	    least one field could be extracted.  The crawl stops after more than
	    100 consecutive failures (page not fetched, or nothing extracted).

	    Returns None immediately when ``saveto`` is empty.  Otherwise returns
	    the result of the last ``ana()`` call, or None when no page was ever
	    fetched.
	    """
	    if not saveto:
	        return

	    counter = 0
	    success = 0
	    failcounter = 0
	    # Defined up front so the final return cannot hit an unbound name when
	    # every request fails.
	    data = None

	    # The context manager closes the file even if the crawl raises.
	    with codecs.open(saveto,'wb+','utf-8') as f:
	        f.write(u'userid,username,sex,birthday,firstOshimen,secondOshimen,post,comment,reg,last,point,akcoin\n')

	        for x in xrange(1,540000): #540000
	            counter += 1
	            html = grabHTML(URLPATTERN % x)
	            parsed = False
	            if html:
	                data = ana(html)
	                parsed = bool(''.join(data))
	            if parsed:
	                success += 1
	                failcounter = 0
	                f.write(u'%d,'%x)
	                f.write(u','.join(data))
	                f.write(u'\n')
	            else:
	                failcounter += 1
	            if failcounter > 100:
	                break
	            # Progress line: user ID, total requests, successful rows.
	            print(u'抓取数据用户ID：%d, (总抓取：%d 成功抓取：%d)' % (x, counter, success))

	    return data

	# Script entry point: run the crawl and save the rows to 'oshimendata.csv'.
	# (Body re-indented: the flattened original was not valid Python.)
	if __name__ == '__main__':
	    Collect('oshimendata.csv')