Created
July 21, 2021 04:40
-
-
Save makefile/797abe7f2ca46405a63b0c27299e4ec2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# requirements: requests | |
# conda/pip install requests | |
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests==1.2.3  # this version works on Python 2.6 but has SSL problems
import os | |
from contextlib import closing | |
import threading | |
import time | |
import sys | |
import requests | |
from requests.adapters import HTTPAdapter | |
# Older Python versions do not support SNI for SSL connections, or fail with
# "hostname doesn't match"; see
# https://docs.python-requests.org/zh_CN/latest/community/faq.html
# Raise explicitly rather than assert, so the check survives `python -O`.
if sys.version_info < (2, 7, 9):
    raise RuntimeError('Python >= 2.7.9 is required (SNI support for SSL)')

# One session shared by every worker; both schemes get an adapter configured
# with max_retries=2.
s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=2))
s.mount('https://', HTTPAdapter(max_retries=2))

# Command line: <id_url_file> <out_dir> [thread_num]
if len(sys.argv) < 3:
    sys.exit('usage: %s <id_url_file> <out_dir> [thread_num]' % sys.argv[0])
id_url_file = sys.argv[1]
out_dir = sys.argv[2]  # e.g. './images'
thread_num = 20
if len(sys.argv) > 3:
    thread_num = int(sys.argv[3])

# connect + read timeout seconds
timeout = 5
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}

# makedirs (not mkdir) so a nested output path such as 'a/b/images' works.
# exist_ok is not available on Python 2, hence the explicit isdir check.
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
def download(img_url, img_name):
    """Download ``img_url`` and store it as ``<out_dir>/<img_name>.jpg``.

    The function prints a tab-separated diagnostic line and returns without
    saving when:
      * the HTTP status code is outside 200-299 (``returnCode...``),
      * the response has no ``content-length`` header or it is 0 (``size0``),
      * writing the file fails (``savefail``).

    Nothing is returned. Errors raised while opening the connection are not
    caught here and propagate to the caller.
    """
    img_path = os.path.join(out_dir, img_name + ".jpg")
    # Skip images that were already fetched. The check must use the same
    # path the file is written to (including the ".jpg" suffix); otherwise
    # every run downloads everything again.
    if os.path.isfile(img_path):
        return
    with closing(s.get(img_url, stream=True, headers=headers, timeout=timeout)) as r:
        rc = r.status_code
        if rc < 200 or rc > 299:
            print('returnCode%s\t%s\t%s' % (rc, img_name, img_url))
            return
        content_length = int(r.headers.get('content-length', '0'))
        if content_length == 0:
            print('size0\t%s' % img_url)
            return
        try:
            with open(img_path, 'wb') as f:
                for data in r.iter_content(1024):
                    f.write(data)
        except Exception:
            print('savefail\t%s' % img_url)
            # Drop the truncated file so a later run does not mistake it for
            # a finished download and skip it.
            try:
                os.remove(img_path)
            except OSError:
                # Best effort: the file may never have been created.
                pass
def get_imgurl_generate(path=None):
    """Yield two-field records parsed from a tab-separated text file.

    Each usable line has exactly two non-empty, tab-separated fields; it is
    yielded as a two-element list ``[field0, field1]``. The consumer in this
    file unpacks the pair as ``(img_name, img_url)``.

    Lines that are blank, do not split into exactly two fields, or contain an
    empty field are reported on stdout and skipped. A progress line is
    printed every 500 input lines.

    Args:
        path: file to read. Defaults to the module-level ``id_url_file``.
    """
    if path is None:
        path = id_url_file
    with open(path, 'r') as f:
        for index, line in enumerate(f, 1):
            if index % 500 == 0:
                print('execute %s line at %s' % (index, time.time()))
            # Strip first: a raw line always carries at least its newline,
            # so an emptiness test on the unstripped text can never fire.
            line = line.strip()
            if not line:
                print('line %s is empty "\t"' % index)
                continue
            imgs = line.split('\t')
            if len(imgs) != 2:
                print('line %s splite error' % index)
                continue
            if not imgs[0] or not imgs[1]:
                print('line %s img is empty' % index)
                continue
            # No try/except around the yield: a bare except here would
            # swallow GeneratorExit and break generator.close().
            yield imgs
# Guards the shared record generator: only one thread may advance it at a time.
lock = threading.Lock()


def loop(imgs):
    """Worker body: pull records from ``imgs`` and download each one.

    ``imgs`` is an iterator of ``(img_name, img_url)`` pairs shared by all
    worker threads; every ``next()`` call on it is made while holding
    ``lock``. The worker stops when the iterator is exhausted. Any exception
    raised by ``download`` is printed as an ``exceptfail`` line and the
    worker moves on. The worker sleeps one second after each record.
    """
    me = threading.current_thread()
    print('thread %s is running...' % me.name)
    while True:
        try:
            with lock:
                record = next(imgs)
        except StopIteration:
            break
        img_name, img_url = record
        try:
            download(img_url, img_name)
        except Exception as err:
            print('exceptfail\t%s\t%s' % (img_url, err))
        time.sleep(1)  # seconds
    print('thread %s is end...' % me.name)
# A single generator instance is shared by every worker thread, so each
# input record is handed to exactly one of them.
img_gen = get_imgurl_generate()
workers = [
    threading.Thread(target=loop, name='LoopThread %s' % idx, args=(img_gen,))
    for idx in range(thread_num)
]
for worker in workers:
    worker.start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment