# -*- coding:utf-8 -*-
import urllib.request
import requests
import threading
import os, re

def get_page(url, timeout=20):
    # Fetch the page and return its raw HTML; no further processing.
    try:
        request = urllib.request.Request(url)
        request.add_header('Referer', cl_url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        page = urllib.request.urlopen(request, timeout=timeout).read().decode('gbk')
        return page
    except Exception:
        print('>>> Failed to download page...%s' % url)
def get_item(url):
    # Parse a listing page and return every thread on it as [address, title].
    # Some threads (notably on the Daguerre board) carry colour markup that
    # has to be stripped before matching.
    try:
        page = get_page(url)
        page = re.sub('[\n\r\t]|<font color=.+?>|</font>', '', page)
        item_pattern = re.compile('(?<=<h3><a href="htm_data).+?(?=</a></h3>)')
        items = re.findall(item_pattern, page)
        res = [re.split('" target="_blank" id="">', item) for item in items]
        return res
    except Exception:
        print('>>> Failed to extract thread info...')
        return []
def get_range(url, page_start, page_end):
    # Batch-download thread info for a range of listing pages, one page at a
    # time; returns the same [address, title] pairs as get_item.
    # Multi-threading is not implemented here; see the threaded sketch below.
    items = []
    for page_num in range(page_start, page_end + 1):
        try:
            print('>>> Downloading page %d...' % page_num)
            items = items + get_item(url + '&page=%d' % page_num)
            print('    Page %d downloaded successfully' % page_num)
        except Exception:
            print('    Page %d download failed' % page_num)
        finally:
            print('>>> -------------------------------')
    return items
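
# A minimal threaded sketch of the optimisation hinted at in get_range above.
# get_range_threaded is not part of the original script and nothing below
# calls it; it simply reuses the already-imported threading module in the same
# style as download_pics further down. Treat it as an illustration only.
def get_range_threaded(url, page_start, page_end):
    results = {}
    def fetch(page_num):
        # get_item may return an empty result on failure, so guard with `or []`
        results[page_num] = get_item(url + '&page=%d' % page_num) or []
    threads = [threading.Thread(target=fetch, args=(n,))
               for n in range(page_start, page_end + 1)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    items = []
    for n in range(page_start, page_end + 1):
        items = items + results.get(n, [])
    return items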
def search_item(key_word_list, items):
    # Filter the [address, title] pairs down to those whose title contains
    # at least one of the given keywords.
    print('>>> Target contains %d items, searching by keywords: %s ...' % (len(items), ' | '.join(key_word_list)))
    search_result = []
    for item in items:
        for key_word in key_word_list:
            if key_word in item[1]:
                search_result.append(item)
                break
    print('>>> Found %d matching threads' % len(search_result))
    return search_result
def get_torrent_hash(page):
    # Extract the rmdown.com torrent hash embedded in a thread page.
    hash_pattern = re.compile('(?<=hash=).+?(?=&z">)')
    torrent_hash = re.findall(hash_pattern, page)[0]
    return torrent_hash

def get_pic_urls(page):
    # Collect image URLs from both markup variants used in thread pages.
    pic_pattern1 = re.compile(r"(?<=<input type='image' src=').+?(?='\s)")
    pic_pattern2 = re.compile(r"(?<=img src=').+?(?='\s)")
    pic_urls = re.findall(pic_pattern1, page) + re.findall(pic_pattern2, page)
    # pic_pattern = re.compile(r"(?<=[<input type='image' src='|img src=']).+?(?='\s)")
    # pic_urls = re.findall(pic_pattern, page)
    return pic_urls
def download_torrent(torrent_hash, torrent_name='', torrent_path=''):
    # Fetch the rmdown.com landing page for the hash, extract the 'reff'
    # token from it, then POST to download.php to retrieve the .torrent file.
    try:
        print('>>> Downloading torrent...')
        download_url = 'http://www.rmdown.com/link.php?hash=' + torrent_hash
        torrent_server = 'http://www.rmdown.com/download.php'
        header = {'User-Agent': 'Mozilla/5.0', 'Referer': download_url}
        request = urllib.request.Request(download_url, headers=header)
        page = urllib.request.urlopen(request).read().decode('utf-8')
        reff_pattern = re.compile('(?<=NAME="reff" value=").+?(?="><BR>)')
        torrent_reff = re.findall(reff_pattern, page)[0]
        payload = {'ref': (None, torrent_hash),
                   'reff': (None, torrent_reff),
                   'submit': (None, 'download')}
        response = requests.post(torrent_server, files=payload, timeout=5)
        if len(torrent_name) == 0:
            torrent_name = torrent_hash
        else:
            # strip characters that are illegal in Windows file names
            torrent_name = re.sub(r'[>/:*\|?\\<]', ' - ', torrent_name)
        if len(torrent_path) != 0:
            if not os.path.exists(torrent_path):
                os.makedirs(torrent_path)
            file_name = os.path.join(torrent_path, torrent_name + '.torrent')
        else:
            file_name = torrent_name + '.torrent'
        with open(file_name, "wb") as code:
            code.write(response.content)
    except Exception as e:
        print('>>> Torrent download failed for hash %s...' % torrent_hash)
        print(e)
    finally:
        print('>>> -------------------------------')
def download_pic(pic_url, pic_name='', pic_path=''):
    # Download a single image, skipping files that already exist locally.
    try:
        if len(pic_name) == 0:
            pic_name = re.split('/', pic_url)[-1]
        if len(pic_path) != 0:
            # exist_ok avoids a race when several download threads try to
            # create the same directory at once
            os.makedirs(pic_path, exist_ok=True)
            file_name = os.path.join(pic_path, pic_name)
        else:
            file_name = pic_name
        if os.path.isfile(file_name):
            print('    File already exists, skipping')
            return
        r = requests.get(pic_url, timeout=20)
        with open(file_name, "wb") as code:
            code.write(r.content)
        print('    Downloaded %s' % pic_url)
    except Exception as e:
        print(e)
        print('    Failed to download %s' % pic_url)
def download_pics(pic_urls, pic_path):
    # Download all images of a thread concurrently, one thread per image.
    print('>>> %d pictures to download...' % len(pic_urls))
    task_threads = []
    for pic_url in pic_urls:
        t = threading.Thread(target=download_pic, args=(pic_url, '', pic_path))
        task_threads.append(t)
    for task in task_threads:
        task.start()
    for task in task_threads:
        task.join()
def download_pics_from_range(url, page_start, page_end, key_word_list, save_path):
    # Scan the listing pages, then download the pictures of every matching thread.
    items = get_range(url, page_start, page_end)
    matched_items = search_item(key_word_list, items)
    for i in matched_items:
        print('>>> Downloading thread %s' % i[1])
        page = get_page(cl_url + 'htm_data' + i[0])
        pic_urls = get_pic_urls(page)
        print(save_path + '\\' + re.sub(r'[>/:*\|?\\<]', '-', i[1]))
        download_pics(pic_urls, save_path + '\\' + re.sub(r'[>/:*\|?\\<]', '-', i[1]))

def download_all_from_range(url, page_start, page_end, key_word_list, save_path):
    # Same as download_pics_from_range, but also grabs each thread's torrent.
    items = get_range(url, page_start, page_end)
    matched_items = search_item(key_word_list, items)
    for i in matched_items:
        print('>>> Downloading thread %s' % i[1])
        page = get_page(cl_url + 'htm_data' + i[0])
        pic_urls = get_pic_urls(page)
        torrent_hash = get_torrent_hash(page)
        download_pics(pic_urls, save_path + '\\' + re.sub(r'[>/:*\|?\\<]', '-', i[1]))
        download_torrent(torrent_hash, i[1], save_path + '\\' + re.sub(r'[>/:*\|?\\<]', '-', i[1]))
if __name__ == '__main__':
    cl_url = 'http://你懂得/'                               # domain changes periodically
    Asia_non_mosaic = cl_url + 'thread0806.php?fid=2'       # Asia, uncensored
    Asia_mosaic = cl_url + 'thread0806.php?fid=15'          # Asia, censored
    Original_Western = cl_url + 'thread0806.php?fid=4'      # Western originals
    Original_Animation = cl_url + 'thread0806.php?fid=5'    # Animation originals
    Flag_of_Daguerre = cl_url + 'thread0806.php?fid=16'     # Flag of Daguerre
    New_Era_for_All = cl_url + 'thread0806.php?fid=8'       # New Era for All
    Tech_Talk = cl_url + 'thread0806.php?fid=7'             # Tech discussion board
    address_dic = {1: Asia_non_mosaic,
                   2: Asia_mosaic,
                   3: Original_Western,
                   4: Original_Animation,
                   5: Flag_of_Daguerre,
                   6: New_Era_for_All,
                   7: Tech_Talk}
    welcome_info = '''>>> You, pillar of the nation, please exercise restraint
    1. Asia, uncensored       Asia_non_mosaic
    2. Asia, censored         Asia_mosaic
    3. Western originals      Original_Western
    4. Animation originals    Original_Animation
    5. Flag of Daguerre       Flag_of_Daguerre
    6. New Era for All        New_Era_for_All
    7. Tech discussion        Tech_Talk
    '''
    print(welcome_info)
    save_path = 'D:\\FTPRoot'
    key_words = ['夏洛特', '时代周刊']    # search keywords (thread titles are in Chinese)
    m = search_item(key_words, get_range(Tech_Talk, 1, 20))
    for s in m:
        print(s)
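    # Example usage (not in the original run, commented out as a sketch):
    # download both the pictures and the torrent of every matching thread on
    # the same board, reusing key_words and save_path from above.
    # download_all_from_range(Tech_Talk, 1, 20, key_words, save_path)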