Skip to content

Instantly share code, notes, and snippets.

@zyr17
Created August 6, 2020 10:02
Show Gist options
  • Save zyr17/711f6d787e1c8226ab2ae11f77c7b5e9 to your computer and use it in GitHub Desktop.
Save zyr17/711f6d787e1c8226ab2ae11f77c7b5e9 to your computer and use it in GitHub Desktop.
kancollekai wiki crawl
import os
import re
import urllib
import urllib.parse

import requests
gotname = {}
def downimg(img):
name = re.search('plugin=([^&"]*)[&"]', img).group(1) + '-'
name += re.search('page=([^&"]*)[&"]', img).group(1) + '-'
name += re.search('src=([^&"]*)[&"]', img).group(1)
if (name in gotname):
return
print(name + ' not exist, start downloading')
gotname[name] = 1
name = 'images/' + name
with open(name, 'wb') as f:
f.write(requests.get(img).content)
def replace_html(s):
    """Decode the HTML character entities that occur in wiki attribute values.

    Handles ``&quot;``, ``&lt;``, ``&gt;``, ``&nbsp;`` (decoded to a plain
    space) and ``&amp;``. Any other entity is left untouched.

    Args:
        s: Text that may contain HTML entities.

    Returns:
        *s* with the entities above replaced by their characters.
    """
    replacements = (
        ('&quot;', '"'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&nbsp;', ' '),
        # '&amp;' must be decoded last: decoding it earlier would turn an
        # escaped entity such as '&amp;lt;' into '&lt;' and then into '<'.
        ('&amp;', '&'),
    )
    for entity, char in replacements:
        s = s.replace(entity, char)
    return s
# Matches an <img> tag whose src points at the wiki's query endpoint and whose
# query string contains plugin=, page= and src= in that order. The capture
# group is the query string without the leading '?'.
_WIKI_IMG_RE = re.compile(
    r'<img[^>]*?src="/kancollekai/\?([^"]*?plugin=[^"]*?page=[^"]*?src=[^"]*)"'
)


def onepage(page):
    """Download every wiki-hosted image referenced by the HTML text *page*.

    For each matching ``<img>`` tag the query string is entity-decoded with
    ``replace_html``, percent-decoded, has every ``/`` replaced by ``-``, is
    prefixed with the wiki base URL and passed to ``downimg``.

    Args:
        page: HTML source of one mirrored wiki page.
    """
    for query in _WIKI_IMG_RE.findall(page):
        decoded = urllib.parse.unquote(replace_html(query))
        # NOTE(review): '/' is replaced presumably so that the file name
        # downimg derives from the URL stays flat; confirm the wiki still
        # serves the image with the modified query.
        downimg('https://wikiwiki.jp/kancollekai/?' + decoded.replace('/', '-'))
def _abort_walk(error):
    """Re-raise a directory-listing error that os.walk would otherwise skip."""
    raise error


# Collect every directory and file below ./wikiwiki.jp/kancollekai (relative to
# the current working directory). There is no upper bound on the number of
# directories visited.
folders = []
files = []
for folder, _subdirs, names in os.walk(
    os.path.join(os.getcwd(), 'wikiwiki.jp', 'kancollekai'),
    onerror=_abort_walk,
):
    folders.append(folder)
    files.extend(os.path.join(folder, name) for name in names)

# Scan each collected file for wiki images and download them.
for file in files:
    print('---' + file + '---')
    try:
        # NOTE(review): assumes the mirrored pages are UTF-8 encoded; confirm.
        # Files that are not valid UTF-8 are reported and skipped.
        with open(file, encoding='utf-8') as f:
            text = f.read()
    except UnicodeDecodeError:
        print('file decode error')
        continue
    onepage(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment