Created
August 6, 2020 10:02
-
-
Save zyr17/711f6d787e1c8226ab2ae11f77c7b5e9 to your computer and use it in GitHub Desktop.
kancollekai wiki crawl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import requests | |
import urllib | |
# Names of images already handled during this run (used as a set: name -> 1).
gotname = {}


def downimg(img):
    """Download the image at URL *img* into ``images/`` unless already done.

    The local file name is ``<plugin>-<page>-<src>``, taken from the
    ``plugin``, ``page`` and ``src`` query parameters of *img*.  A name that
    is already recorded in ``gotname`` is skipped without any network access.

    Args:
        img: Full image URL containing ``plugin=``, ``page=`` and ``src=``.

    Raises:
        ValueError: If one of the three query parameters is missing.
    """
    parts = []
    for key in ('plugin', 'page', 'src'):
        # A value runs up to the next '&' or '"' -- or to the end of the URL,
        # which is the normal case for the last parameter.
        found = re.search(key + r'=([^&"]*)', img)
        if found is None:
            raise ValueError('missing ' + key + '= parameter in ' + img)
        parts.append(found.group(1))
    name = '-'.join(parts)

    if name in gotname:
        return
    print(name + ' not exist, start downloading')
    # Recorded before the request so a failing URL is not retried for every
    # page that references it.
    gotname[name] = 1

    # Fetch first and only then create the file, so a failed download does
    # not leave an empty or bogus image behind.
    response = requests.get(img)
    if not response.ok:
        print(name + ' download failed, status ' + str(response.status_code))
        return
    os.makedirs('images', exist_ok=True)
    with open('images/' + name, 'wb') as f:
        f.write(response.content)
def replace_html(s):
    """Return *s* with the basic HTML character entities decoded.

    Handles ``&quot;``, ``&lt;``, ``&gt;``, ``&nbsp;`` (decoded to a plain
    space) and ``&amp;``.  Other entities are left untouched.

    Args:
        s: Text taken from HTML source, e.g. an attribute value.

    Returns:
        The text with those five entities replaced by their characters.
    """
    s = s.replace('&quot;', '"')
    s = s.replace('&lt;', '<')
    s = s.replace('&gt;', '>')
    s = s.replace('&nbsp;', ' ')
    # '&amp;' goes last so an escaped entity such as '&amp;lt;' decodes to
    # the literal text '&lt;' rather than being decoded twice into '<'.
    s = s.replace('&amp;', '&')
    return s
def onepage(page):
    """Download every wiki image referenced by the HTML text *page*.

    Looks for ``<img>`` tags whose ``src`` starts with ``/kancollekai/?`` and
    carries ``plugin=``, ``page=`` and ``src=`` parameters (in that order).
    Each query string is entity-decoded, percent-decoded, has its ``/``
    characters turned into ``-`` and is then passed to ``downimg`` as a full
    ``https://wikiwiki.jp/kancollekai/?...`` URL.

    Args:
        page: HTML source of one saved wiki page.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote

    # One pass with a capture group yields the query strings directly.
    queries = re.findall(
        r'<img[^>]*?src="/kancollekai/\?'
        r'([^"]*?plugin=[^"]*?page=[^"]*?src=[^"]*)"',
        page,
    )
    for query in queries:
        decoded = unquote(replace_html(query))
        downimg('https://wikiwiki.jp/kancollekai/?' + decoded.replace('/', '-'))
# Driver: collect every file below ./wikiwiki.jp/kancollekai (breadth-first)
# and download the images each saved page refers to.
folders = [os.getcwd() + '/wikiwiki.jp/kancollekai']
files = []
# ``folders`` doubles as the work queue: sub-directories are appended while
# scanning, and the index walks on until no unvisited folder is left (no
# fixed upper bound on the number of directories).
index = 0
while index < len(folders):
    folder = folders[index]
    index += 1
    for entry in os.listdir(folder):
        path = folder + '/' + entry
        if os.path.isdir(path):
            folders.append(path)
        else:
            files.append(path)

for path in files:
    print('---' + path + '---')
    text = ''
    try:
        # NOTE(review): uses the platform default encoding, as before --
        # confirm whether the saved pages should be read as UTF-8.
        with open(path) as f:
            text = f.read()
    except UnicodeDecodeError:
        # Files that are not decodable text are scanned as empty text.
        print('file decode error')
    onepage(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment