Created
October 10, 2019 11:55
-
-
Save airbob/0fcd80e3617da96188405f71d0d26f97 to your computer and use it in GitHub Desktop.
A Python script that crawls all project images from jasoncharleshill.com, which I use as my Mac wallpapers.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup as Soup
# Matches the scheme+host portion of an absolute https URL, e.g. "https://example.com".
URL_REG = re.compile(r'(https://[^/\\]+)', re.I)
# Matches <img ... src="..."> / <img ... src='...'>, capturing the quote char and the URL.
# NOTE: the original pattern used [^\1]*? — inside a character class \1 is NOT a
# backreference (it is the literal byte \x01), so the class meant "any char except
# \x01". The intended non-greedy "everything up to the closing quote" is (.*?)\1.
IMG_REG = re.compile(r'<img[^>]*?src=([\'"])(.*?)\1', re.I)
## query url and filter image resources and download them to dir folder | |
## query url and filter image resources and download them to dir folder
def download(dir, url):
    """Fetch *url*, extract every <img> src on the page and download the
    images into folder *dir* (created if missing).

    Image paths starting with '/' are resolved against the page's host;
    other relative paths are resolved against *url* itself.  A failure to
    retrieve one image is reported and the crawl continues.
    """
    m = URL_REG.match(url)
    if not m:
        print('[Error]Invalid URL: ', url)
        return
    host = m.group(1)

    # Create the target folder once (the original checked/created it twice).
    if not os.path.isdir(dir):
        os.mkdir(dir)

    # Retrieve the page and collect candidate image URLs.
    html = urllib.request.urlopen(url).read().decode('utf-8', errors='replace')
    imgs = [match[1] for match in IMG_REG.findall(html)]

    def resolve(path):
        # Absolute URLs pass through; root-relative paths join the host;
        # everything else joins the page URL.
        if path.startswith('https://'):
            return path
        if path.startswith('/'):
            return host + path
        return url + '/' + path

    # De-duplicate resolved URLs before downloading.
    imgs = list(set(resolve(p) for p in imgs))
    print('[Info]Find %d images.' % len(imgs))

    # Download each image, naming the file after the last URL segment.
    for idx, img in enumerate(imgs):
        name = img.split('/')[-1]
        path = os.path.join(dir, name)
        try:
            print('[Info]Download(%d): %s' % (idx + 1, img))
            urllib.request.urlretrieve(img, path)
        except OSError as exc:  # network/file errors only — keep crawling
            print("[Error]Can't download(%d): %s (%s)" % (idx + 1, img, exc))
## get all projects from this domain
def getAllProjects(url):
    """Return absolute URLs of all project pages linked from *url*.

    A "project" link is any root-relative href longer than '/' that is not
    one of the static pages (/about, /film, /contact).
    """
    s = urllib.request.urlopen(url).read()
    html = Soup(s, 'html.parser')
    # Guard against anchors with no href attribute (the original raised KeyError).
    links = [a['href'] for a in html.find_all('a') if a.has_attr('href')]

    # Keep unique root-relative links, skipping the known non-project pages.
    skip = ('/about', '/film', '/contact')
    projects = {link for link in links
                if link.startswith('/') and len(link) > 1 and link not in skip}

    # BUG FIX: the original joined against the undefined name `questHTTP`
    # (a NameError at runtime); join against the domain URL we were given.
    return [url + project for project in projects]
# Where downloaded images are stored, and the site to crawl.
savePath = 'image'
domainUrl = "https://www.jasoncharleshill.com"

# Guard the crawl so importing this module does not trigger network I/O.
if __name__ == '__main__':
    for projectUrl in getAllProjects(domainUrl):
        download(savePath, projectUrl)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment