Created
August 16, 2018 03:20
-
-
Save wgzhao/31b539004356662d88723b26843d5ccc to your computer and use it in GitHub Desktop.
从北京新发地(xinfadi.com.cn) 获取每个类目中有典型代表的商品(SKU)的2018年历史价格,然后验证是否处于上涨趋势
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Fetch 2018 price history for representative SKUs from Beijing Xinfadi.

For each category on xinfadi.com.cn, download the 2018 historical prices of a
few representative products (SKUs) so the data can be checked for an upward
price trend.
"""
import re
from io import BytesIO
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Mapping from category index (as used in the site's URLs) to category name.
cate_map = {1: '蔬菜', 2: '水果', 3: '肉禽蛋', 4: '水产', 5: '粮油'}

# Representative SKUs sampled from each category, keyed by category index.
sku_list = {
    1: ['白菜', '大蒜', '葱', '尖椒', '丝瓜', '茄子', '黄瓜'],
    2: ['苹果', '西瓜', '雪花梨', '水蜜桃', '哈密瓜'],
    3: ['鸡蛋', '鸭蛋', '松花蛋', '五花肉', '肉鸡', '肥牛', '羊肉'],
    4: ['草鱼', '胖头鱼', '鲫鱼', '黄鳝', '多宝鱼', '基围虾'],
    5: ['东北大米', '河北小米', '小米面', '金龙鱼调和油'],
}

# Date range applied to every request (first page included).
query_string = 'begintime=2018-01-01&endtime=2018-08-15'
# URL template: first placeholder is the category index, second the page number.
url_temp = 'http://xinfadi.com.cn/marketanalysis/{}/list/{}.shtml?'

# Column names assigned to the combined result, in table order.
RESULT_COLUMNS = ['品名', '最低价', '平均价', '最高价', '规格', '单位', '发布日期']
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


def build_url(category, page, sku):
    """Return the listing URL for one page of one SKU.

    The URL is rebuilt from scratch on every call so that parameters from a
    previous SKU or page can never leak into the next request.

    Args:
        category: Category index used in the URL path (a key of ``sku_list``).
        page: 1-based page number.
        sku: Product name; it is percent-encoded before being appended.

    Returns:
        The full URL including the date range and the ``prodname`` filter.
    """
    return (
        url_temp.format(category, page)
        + query_string
        + "&prodname={}".format(quote(sku))
    )


def parse_page_count(href):
    """Extract the page number from a pagination link such as ``.../12.shtml?...``.

    Args:
        href: The ``href`` of the "last page" link; may be relative or
            absolute, or empty/``None`` when the link has no target.

    Returns:
        The page number found immediately before ``.shtml``, or 1 when the
        link is missing or does not have that form.
    """
    if not href:
        return 1
    found = re.search(r'(\d+)\.shtml', href)
    return int(found.group(1)) if found else 1


def fetch_page(url):
    """Download ``url`` and return the raw response body as bytes.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Return bytes rather than text so the HTML parsers detect the page
    # encoding themselves instead of relying on the HTTP header guess.
    return response.content


def parse_price_table(html):
    """Parse the ``hq_table`` price table out of a downloaded page.

    Args:
        html: Raw page bytes as returned by ``fetch_page``.

    Returns:
        A DataFrame of the table rows with the first row skipped and the
        column labelled 7 dropped, leaving the seven data columns.
    """
    table = pd.read_html(BytesIO(html), attrs={'class': 'hq_table'}, skiprows=1)[0]
    return table.drop(columns=[7])


def find_last_page(html):
    """Return the number of result pages advertised by a listing page.

    Looks for the link titled ``尾页`` inside the ``div.manu`` pager and
    returns 1 when either element is absent.
    """
    root = bs(html, 'lxml')
    pager = root.find('div', attrs={'class': 'manu'})
    if pager is None:
        return 1
    tail_link = pager.find('a', attrs={'title': '尾页'})
    if tail_link is None:
        return 1
    return parse_page_count(tail_link.get('href'))


def fetch_sku_frames(category, sku):
    """Download every result page for one SKU.

    The first page is fetched once and used both for its price table and to
    discover how many pages exist.

    Returns:
        A list with one DataFrame per page, in page order.
    """
    first_page = fetch_page(build_url(category, 1, sku))
    frames = [parse_price_table(first_page)]
    for page in range(2, find_last_page(first_page) + 1):
        frames.append(parse_price_table(fetch_page(build_url(category, page, sku))))
    return frames


def combine_frames(frames):
    """Concatenate per-page tables into one DataFrame with named columns.

    Args:
        frames: Iterable of DataFrames that each have seven columns.

    Returns:
        A single DataFrame with a fresh 0..n-1 index and ``RESULT_COLUMNS``
        as column names; an empty frame with those columns if ``frames`` is
        empty.
    """
    frames = list(frames)
    if not frames:
        return pd.DataFrame(columns=RESULT_COLUMNS)
    # pd.concat replaces the removed DataFrame.append and avoids re-copying
    # the accumulated data on every page.
    combined = pd.concat(frames, ignore_index=True)
    combined.columns = RESULT_COLUMNS
    return combined


def main():
    """Scrape all sampled SKUs and write the result to CSV and Excel files."""
    frames = []
    for category, skus in sku_list.items():
        for sku in skus:
            print(sku)
            frames.extend(fetch_sku_frames(category, sku))
    result = combine_frames(frames)
    result.to_csv('./xinfadi_sku_2018.csv', index=False)
    result.to_excel('./xinfadi_sku_2018.xlsx', index=False)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment