A small script for finding Beijing rental posts on Douban (北京豆瓣租房小脚本)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Douban rental-post crawler (豆瓣租房爬虫)

Install:
    pip install beautifulsoup4 requests

Usage:
    python3 douban_zufang.py
"""
import asyncio
import copy
from datetime import datetime, timedelta
from functools import partial

from bs4 import BeautifulSoup
import requests

# Some posts have already been read; their urls are blacklisted manually,
# one url per line in url_blacklist.txt.
post_url_black_list = []
try:
    with open('url_blacklist.txt', 'r') as f:
        for line in f:
            url = line.strip()
            if url:
                post_url_black_list.append(url)
except IOError:
    pass

expected_groups = [
    (26926, u'北京租房豆瓣'),
    (279962, u'北京租房(非中介)'),
    (262626, u'北京无中介租房(寻天使投资)'),
    (35417, u'北京租房'),
    (56297, u'北京个人租房 (真房源|无中介)'),
    (257523, u'北京租房房东联盟(中介勿扰) '),
]
expected_query_strs = (u'六道口', u'15号线', u'文成杰座', u'富润家园', )
group_search_url = 'http://www.douban.com/group/search'
default_params = {
    'cat': 1013,  # don't ask me what this is for, no idea...
    'sort': 'time',
}
default_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/61.0.3163.100 Safari/537.36',
}

def gen_search_params(group_id, q):
    params = copy.deepcopy(default_params)
    params.update(dict(
        group=group_id,
        q=q
    ))
    return params

def parse_html_tr(tr):
    """Parse one search-result row; return (url, title), or None for stale posts."""
    time_tag = tr.find('td', {'class': 'td-time'})
    time_str = time_tag['title']
    last_update_time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
    # skip posts that were last updated more than 7 days ago
    if datetime.now() - last_update_time > timedelta(days=7):
        return None
    a_tag = tr.td.a
    url = a_tag.get('href', '')
    title = a_tag.text
    return (url, title)

async def search_group(group, q, all_posts):
    event_loop = asyncio.get_event_loop()
    posts = []
    group_id, group_alias = group
    params = gen_search_params(group_id, q)
    # requests is blocking, so run it in the loop's default thread pool
    response = await event_loop.run_in_executor(
        None,
        partial(requests.get, url=group_search_url,
                params=params, headers=default_headers)
    )
    if response.status_code == 200:
        html_doc = response.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        results = soup.find_all('tr', {'class': 'pl'})
        for result in results:
            rv = parse_html_tr(result)
            if rv is not None:
                url, title = rv
                if url in post_url_black_list:
                    continue
                if q in title:
                    posts.append((url, title))
    else:
        print('the crawler ran into an unknown problem...')
    all_posts.extend(posts)
    print(u'found {count} posts containing 『{q}』 in 『{group_alias}』'
          .format(group_alias=group_alias, count=len(posts), q=q))

async def search_groups():
    """Find posts that match our criteria:

    1. the title contains one of the query strings
    2. the post was updated within the last 7 days
    """
    event_loop = asyncio.get_event_loop()
    all_posts = []
    tasks = []
    for group in expected_groups:
        for q in expected_query_strs:
            tasks.append(event_loop.create_task(search_group(group, q, all_posts)))
    done, pending = await asyncio.wait(tasks)
    assert not pending
    return all_posts

def uniq_and_sort(posts):
    """Deduplicate posts and sort them by weight.

    A url found by more (group, query) pairs gets a higher weight.
    """
    url_title_map = dict(posts)
    post_weight_map = {}  # {url: weight}
    title_set = set()
    for post in posts:
        url, title = post
        # filter out posts mentioning unwanted keywords
        # (secondary bedroom, or locations too far away)
        if any([u'次卧' in title,
                u'马泉营' in title,
                u'孙河' in title,
                u'顺义' in title,
                u'石门' in title,
                u'法信' in title]):
            continue
        if title in title_set:
            continue
        else:
            title_set.add(title)
        if url in post_weight_map:
            post_weight_map[url] += 1
        else:
            post_weight_map[url] = 1
    sorted_result = sorted(
        post_weight_map.items(),
        key=lambda each: each[1],
        reverse=True
    )
    for url, weight in sorted_result:
        title = url_title_map[url]
        yield (url, title)

async def main():
    posts = await search_groups()
    for url, title in uniq_and_sort(posts):
        print('- [ ] ', title, '\n', ' ', url)


if __name__ == '__main__':
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())
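The blacklist file, if present, is read one URL per line. A hypothetical url_blacklist.txt (the topic ids below are made up for illustration) could look like this:

https://www.douban.com/group/topic/123456789/
https://www.douban.com/group/topic/234567890/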
Below is a synchronous variant of the same script: no asyncio, and it also runs under Python 2.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Douban rental-post crawler (豆瓣租房爬虫)

Install:
    pip install beautifulsoup4 requests

Usage:
    python douban_zufang.py
"""
from __future__ import print_function
import copy
from datetime import datetime, timedelta

from bs4 import BeautifulSoup
import requests

# Some posts have already been read; blacklist their urls manually.
post_url_black_list = []

expected_groups = [
    (26926, u'北京租房豆瓣'),
    (279962, u'北京租房(非中介)'),
    (262626, u'北京无中介租房(寻天使投资)'),
    (35417, u'北京租房'),
    (56297, u'北京个人租房 (真房源|无中介)'),
    (257523, u'北京租房房东联盟(中介勿扰) '),
]
expected_query_strs = (u'六道口', u'15号线', u'文成杰座', u'富润家园', )
group_search_url = 'http://www.douban.com/group/search'
default_params = {
    'cat': 1013,  # don't ask me what this is for, no idea...
    'sort': 'time',
}
default_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/61.0.3163.100 Safari/537.36',
}

def gen_search_params(group_id, q):
    params = copy.deepcopy(default_params)
    params.update(dict(
        group=group_id,
        q=q
    ))
    return params

def parse_html_tr(tr):
    """Parse one search-result row; return (url, title), or None for stale posts."""
    time_tag = tr.find('td', {'class': 'td-time'})
    time_str = time_tag['title']
    last_update_time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
    # skip posts that were last updated more than 7 days ago
    if datetime.now() - last_update_time > timedelta(days=7):
        return None
    a_tag = tr.td.a
    url = a_tag.get('href', '')
    title = a_tag.text
    return (url, title)

def search_group(group, q):
    posts = []
    group_id, group_alias = group
    params = gen_search_params(group_id, q)
    response = requests.get(url=group_search_url, params=params,
                            headers=default_headers)
    if response.status_code == 200:
        html_doc = response.content
        soup = BeautifulSoup(html_doc, 'html.parser')
        results = soup.find_all('tr', {'class': 'pl'})
        for result in results:
            rv = parse_html_tr(result)
            if rv is not None:
                url, title = rv
                if url in post_url_black_list:
                    continue
                if q in title:
                    posts.append((url, title))
    else:
        print('the crawler ran into an unknown problem...')
    return posts

def search_groups():
    """Find posts that match our criteria:

    1. the title contains one of the query strings
    2. the post was updated within the last 7 days
    """
    all_posts = []
    for group in expected_groups:
        for q in expected_query_strs:
            posts = search_group(group, q)
            all_posts.extend(posts)
            print(u'found {count} posts containing 『{q}』 in 『{group_alias}』'
                  .format(group_alias=group[1], count=len(posts), q=q))
    return all_posts

def uniq_and_sort(posts):
    """Deduplicate posts and sort them by weight.

    A url found by more (group, query) pairs gets a higher weight.
    """
    url_title_map = dict(posts)
    post_weight_map = {}  # {url: weight}
    title_set = set()
    for post in posts:
        url, title = post
        # filter out posts offering only a secondary bedroom
        if u'次卧' in title:
            continue
        if title in title_set:
            continue
        else:
            title_set.add(title)
        if url in post_weight_map:
            post_weight_map[url] += 1
        else:
            post_weight_map[url] = 1
    sorted_result = sorted(
        post_weight_map.items(),
        key=lambda each: each[1],
        reverse=True
    )
    for url, weight in sorted_result:
        title = url_title_map[url]
        yield (url, title)

if __name__ == '__main__':
    posts = search_groups()
    # a few blank lines to separate the progress log from the result list
    print()
    print()
    print()
    for url, title in uniq_and_sort(posts):
        print('- [ ] ', title, '\n', ' ', url)
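Both variants print the surviving posts as a Markdown task list, one checkbox per post. With <post title> and <post url> standing in for real values, each result is printed as:

- [ ]  <post title>
   <post url>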