# -*- coding:utf-8 -*-
import urllib.request
import threading
import os, re
import requests
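
# Overview: this script scrapes the listing pages of a GBK-encoded forum,
# filters thread titles by keyword, then downloads each matching thread's
# inline pictures (via requests, one worker thread per image) and its
# .torrent file through rmdown.com. The forum base URL, cl_url, is assigned
# in the __main__ block below and read as a module-level global by get_page.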


def get_page(url, timeout=20):
    # Fetch a page and return its text; no further processing.
    try:
        request = urllib.request.Request(url)
        request.add_header('Referer', cl_url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        page = urllib.request.urlopen(request, timeout=timeout).read().decode('gbk')
        return page
    except Exception:
        print('>>> Page download failed...%s' % url)


def get_item(url):
    # Parse one listing page and return every thread on it as [path, title].
    # Some threads (especially in the Daguerre board) carry color markup
    # that has to be stripped first.
    try:
        page = get_page(url)
        page = re.sub('[\n\r\t]|<font color=.+?>|</font>', '', page)
        item_pattern = re.compile('(?<=<h3><a href="htm_data).+?(?=</a></h3>)')
        items = re.findall(item_pattern, page)
        res = [re.split('" target="_blank" id="">', item) for item in items]
        return res
    except Exception:
        print('>>> Failed to fetch thread info...')
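
# For reference, each entry returned by get_item looks roughly like
# ['/7/201801/123456.html', 'thread title']: the first element is the tail
# of the thread URL after "htm_data", the second the cleaned title. (The
# exact path shape is an assumption; it depends on the forum's URL scheme.)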


def get_range(url, page_start, page_end):
    # Fetch thread info for a span of listing pages, sequentially.
    # Multithreading is not implemented here; see the pooled sketch below.
    # Returns the same structure as get_item.
    items = []
    for page_num in range(page_start, page_end + 1):
        try:
            print('>>> Downloading listing page %d...' % page_num)
            items = items + get_item(url + '&page=%d' % page_num)
            print('    page %d downloaded' % page_num)
        except Exception:
            print('    page %d failed' % page_num)
        finally:
            print('>>> -------------------------------')
    return items
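

# A minimal sketch of the pooled variant that the comment above asks for
# (not part of the original gist): concurrent.futures maps get_item over
# the page numbers with a bounded number of worker threads, and pool.map
# preserves page order in the results.
def get_range_threaded(url, page_start, page_end, max_workers=4):
    from concurrent.futures import ThreadPoolExecutor
    pages = range(page_start, page_end + 1)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # get_item returns None on failure, hence the "or []" guard
        results = list(pool.map(lambda n: get_item(url + '&page=%d' % n) or [], pages))
    return [item for page_items in results for item in page_items]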


def search_item(key_word_list, items):
    # Keep only the threads whose title contains at least one keyword.
    print('>>> Searching %d entries for keywords: %s...' % (len(items), ' | '.join(key_word_list)))
    search_result = []
    for item in items:
        for key_word in key_word_list:
            if key_word in item[1]:
                search_result.append(item)
                break
    print('>>> %d matching threads found' % len(search_result))
    return search_result


def get_torrent_hash(page):
    # Extract the rmdown.com hash embedded in a thread page.
    hash_pattern = re.compile('(?<=hash=).+?(?=&z">)')
    torrent_hash = re.findall(hash_pattern, page)[0]
    return torrent_hash


def get_pic_urls(page):
    # Collect every picture URL on a thread page; images appear either as
    # <input type='image' ...> elements or as plain <img> tags.
    pic_pattern1 = re.compile(r"(?<=<input type='image' src=').+?(?='\s)")
    pic_pattern2 = re.compile(r"(?<=img src=').+?(?='\s)")
    pic_urls = re.findall(pic_pattern1, page) + re.findall(pic_pattern2, page)
    return pic_urls
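
# Each thread page thus yields two things: the inline picture URLs
# (get_pic_urls) and a single rmdown.com hash (get_torrent_hash);
# download_all_from_range below combines both for every matching thread.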


def download_torrent(torrent_hash, torrent_name='', torrent_path=''):
    # Fetch the .torrent file for the given hash from rmdown.com.
    download_url = 'http://www.rmdown.com/link.php?hash=' + torrent_hash
    file_name = ''
    try:
        print('>>> Downloading torrent...')
        torrent_server = 'http://www.rmdown.com/download.php'
        header = {'User-Agent': 'Mozilla/5.0', 'Referer': download_url}
        request = urllib.request.Request(download_url, headers=header)
        page = urllib.request.urlopen(request).read().decode('utf-8')
        # The download form carries a one-time "reff" token that has to be
        # posted back together with the hash as multipart form data.
        reff_pattern = re.compile('(?<=NAME="reff" value=").+?(?="><BR>)')
        torrent_reff = re.findall(reff_pattern, page)[0]
        payload = {'ref': (None, torrent_hash),
                   'reff': (None, torrent_reff),
                   'submit': (None, 'download')}
        response = requests.post(torrent_server, files=payload, timeout=5)
        if len(torrent_name) == 0:
            torrent_name = torrent_hash
        else:
            # Replace characters that are illegal in Windows file names.
            torrent_name = re.sub(r'[><:*|?/\\]', ' - ', torrent_name)
        if len(torrent_path) != 0:
            if not os.path.exists(torrent_path):
                os.makedirs(torrent_path)
            file_name = os.path.join(torrent_path, torrent_name + '.torrent')
        else:
            file_name = torrent_name + '.torrent'
        with open(file_name, 'wb') as code:
            code.write(response.content)
    except Exception as e:
        print('>>> Failed to download %s from %s...' % (file_name, download_url))
        print(e)
    finally:
        print('>>> -------------------------------')


def download_pic(pic_url, pic_name='', pic_path=''):
    # Download a single picture; existing files are skipped.
    try:
        if len(pic_name) == 0:
            pic_name = re.split('/', pic_url)[-1]
        if len(pic_path) != 0:
            # exist_ok guards against a race when several threads create
            # the same directory at once
            os.makedirs(pic_path, exist_ok=True)
            file_name = os.path.join(pic_path, pic_name)
        else:
            file_name = pic_name
        if os.path.isfile(file_name):
            print('    file already exists, skipping')
            return
        r = requests.get(pic_url, timeout=20)
        with open(file_name, 'wb') as code:
            code.write(r.content)
        print('    downloaded %s' % pic_url)
    except Exception as e:
        print(e)
        print('    failed to download %s' % pic_url)


def download_pics(pic_urls, pic_path):
    # Download all pictures of a thread, one worker thread per image.
    print('>>> %d pictures to download...' % len(pic_urls))
    task_threads = []
    for pic_url in pic_urls:
        t = threading.Thread(target=download_pic, args=(pic_url, '', pic_path))
        task_threads.append(t)
    for task in task_threads:
        task.start()
    for task in task_threads:
        task.join()
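
# Note: this spawns one thread per image with no upper bound; for threads
# with hundreds of pictures, the 20 s timeout in download_pic is what keeps
# a batch from hanging. get_range_threaded above shows the pooled pattern
# if bounding the number of simultaneous connections matters.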


def download_pics_from_range(url, page_start, page_end, key_word_list, save_path):
    # Search listing pages page_start..page_end for matching threads and
    # download their pictures into one subdirectory per thread.
    items = get_range(url, page_start, page_end)
    matched_items = search_item(key_word_list, items)
    for i in matched_items:
        print('>>> Downloading thread %s' % i[1])
        page = get_page(cl_url + 'htm_data' + i[0])
        pic_urls = get_pic_urls(page)
        thread_dir = os.path.join(save_path, re.sub(r'[><:*|?/\\]', '-', i[1]))
        print(thread_dir)
        download_pics(pic_urls, thread_dir)


def download_all_from_range(url, page_start, page_end, key_word_list, save_path):
    # Like download_pics_from_range, but also fetches each thread's torrent.
    items = get_range(url, page_start, page_end)
    matched_items = search_item(key_word_list, items)
    for i in matched_items:
        print('>>> Downloading thread %s' % i[1])
        page = get_page(cl_url + 'htm_data' + i[0])
        pic_urls = get_pic_urls(page)
        torrent_hash = get_torrent_hash(page)
        thread_dir = os.path.join(save_path, re.sub(r'[><:*|?/\\]', '-', i[1]))
        download_pics(pic_urls, thread_dir)
        download_torrent(torrent_hash, i[1], thread_dir)


if __name__ == '__main__':
    cl_url = 'http://你懂得/'  # the forum address rotates periodically
    Asia_non_mosaic = cl_url + 'thread0806.php?fid=2'      # Asia, uncensored
    Asia_mosaic = cl_url + 'thread0806.php?fid=15'         # Asia, censored
    Original_Western = cl_url + 'thread0806.php?fid=4'     # Western originals
    Original_Animation = cl_url + 'thread0806.php?fid=5'   # animation originals
    Flag_of_Daguerre = cl_url + 'thread0806.php?fid=16'    # Flag of Daguerre
    New_Era_for_All = cl_url + 'thread0806.php?fid=8'      # New Era for All
    Tech_Talk = cl_url + 'thread0806.php?fid=7'            # tech discussion
    address_dic = {1: Asia_non_mosaic,
                   2: Asia_mosaic,
                   3: Original_Western,
                   4: Original_Animation,
                   5: Flag_of_Daguerre,
                   6: New_Era_for_All,
                   7: Tech_Talk}
    welcome_info = '''>>> You, pillar of the nation, please exercise restraint
    1. Asia_non_mosaic
    2. Asia_mosaic
    3. Original_Western
    4. Original_Animation
    5. Flag_of_Daguerre
    6. New_Era_for_All
    7. Tech_Talk
    '''
    print(welcome_info)
    save_path = 'D:\\FTPRoot'
    key_words = ['夏洛特', '时代周刊']  # keywords matched against thread titles
    m = search_item(key_words, get_range(Tech_Talk, 1, 20))
    for s in m:
        print(s)
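
    # Example (commented out): instead of only printing the matches, fetch
    # pictures and torrents for every matching thread. The keyword list and
    # page range are the illustrative values from above.
    # download_all_from_range(Tech_Talk, 1, 20, key_words, save_path)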