Created
July 21, 2021 04:40
Revisions
-
makefile created this gist
Jul 21, 2021. There are no files selected for viewing
# NOTE: the next comment lines are page text carried over from the gist web
# view; they are not part of the script itself.
# This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters. Learn more
# about bidirectional Unicode characters
# Original file line number Diff line number Diff line change @@ -0,0 +1,101 @@

# -*- coding: utf-8 -*-
"""Download images listed in a tab-separated ``<name>\\t<url>`` file.

Usage: <script> <id_url_file> <out_dir> [thread_num]

Every input line is fetched by one of ``thread_num`` worker threads and saved
as ``<out_dir>/<name>.jpg``. Problems are reported on stdout as
tab-separated records (``returnCode``, ``size0``, ``savefail``,
``exceptfail``).
"""
# requirements: requests
# conda/pip install requests
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests==1.2.3
#   (version for python2.6, but has problem in SSL)
import os
from contextlib import closing
import threading
import time
import sys

import requests
from requests.adapters import HTTPAdapter

# lower python version does not support SNI used for SSL connection
# or throws exception: "hostname doesn't match", refer to
# https://docs.python-requests.org/zh_CN/latest/community/faq.html
assert sys.version_info >= (2, 7, 9), 'Python >= 2.7.9 is required (SNI support)'

# Fail with a readable message instead of a bare IndexError on missing args.
if len(sys.argv) < 3:
    sys.exit('usage: %s <id_url_file> <out_dir> [thread_num]' % sys.argv[0])

# One shared session so connections are pooled; each scheme retries twice.
s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=2))
s.mount('https://', HTTPAdapter(max_retries=2))

id_url_file = sys.argv[1]
out_dir = sys.argv[2]  # './images'
thread_num = 20
if len(sys.argv) > 3:
    thread_num = int(sys.argv[3])

# connect + read timeout seconds
timeout = 5
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}

# makedirs (not mkdir) so a nested output path such as 'data/images' works.
if not os.path.exists(out_dir):
    os.makedirs(out_dir)


def _remove_partial(path):
    """Delete *path* if it exists, ignoring filesystem errors."""
    try:
        os.remove(path)
    except OSError:
        # Nothing was created (e.g. open() itself failed) or it is already gone.
        pass


def download(img_url, img_name):
    """Fetch *img_url* and save it as ``<out_dir>/<img_name>.jpg``.

    Returns None in every case. The download is skipped when the target file
    already exists. A non-2xx status, a missing/zero ``content-length`` header
    or a failure while saving is reported on stdout and nothing is kept on
    disk. Errors raised while opening the request propagate to the caller.
    """
    img_path = os.path.join(out_dir, img_name + ".jpg")
    # Check the path that is actually written (with the ".jpg" suffix) so that
    # rerunning the script really skips files downloaded earlier.
    if os.path.isfile(img_path):
        return

    with closing(s.get(img_url, stream=True, headers=headers, timeout=timeout)) as r:
        rc = r.status_code
        if rc < 200 or rc > 299:
            print('returnCode%s\t%s\t%s' % (rc, img_name, img_url))
            return

        content_length = int(r.headers.get('content-length', '0'))
        if content_length == 0:
            print('size0\t%s' % img_url)
            return

        try:
            with open(img_path, 'wb') as f:
                for data in r.iter_content(1024):
                    f.write(data)
        except Exception:
            print('savefail\t%s' % img_url)
            # A truncated file would be mistaken for a finished download on
            # the next run, so do not leave it behind.
            _remove_partial(img_path)


def get_imgurl_generate():
    """Yield ``[img_name, img_url]`` for each valid line of ``id_url_file``.

    A valid line holds exactly two non-empty tab-separated fields. Invalid
    lines are reported on stdout with their 1-based line number and skipped.
    A progress message is printed every 500 lines.
    """
    with open(id_url_file, 'r') as f:
        for index, line in enumerate(f, 1):
            if index % 500 == 0:
                print('execute %s line at %s' % (index, time.time()))

            # Lines read from a file still carry their newline, so emptiness
            # has to be tested on the stripped text.
            if not line.strip():
                print('line %s is empty "\t"' % index)
                continue

            # Strip only the line terminator before splitting: stripping tabs
            # here would hide an empty first/last field from the check below.
            imgs = [field.strip() for field in line.rstrip('\r\n').split('\t')]
            if len(imgs) != 2:
                print('line %s splite error' % index)
                continue
            if not imgs[0] or not imgs[1]:
                print('line %s img is empty' % index)
                continue

            yield imgs


# Serialises access to the shared generator; a generator must not be advanced
# by two threads at the same time.
lock = threading.Lock()


def loop(imgs):
    """Worker body: pull (name, url) pairs from *imgs* and download them.

    *imgs* is the generator shared by all workers. The worker stops when the
    generator is exhausted. A failed download is reported on stdout and does
    not stop the worker.
    """
    print('thread %s is running...' % threading.current_thread().name)
    while True:
        try:
            with lock:
                # Input columns are <name>\t<url>, in that order.
                img_name, img_url = next(imgs)
        except StopIteration:
            break

        try:
            download(img_url, img_name)
        except Exception as e:
            print('exceptfail\t%s\t%s' % (img_url, e))
        # NOTE(review): the original layout was lost; this pause is kept as a
        # per-item throttle. Confirm it was not meant to run only after errors.
        time.sleep(1)  # seconds
    print('thread %s is end...' % threading.current_thread().name)


img_gen = get_imgurl_generate()
for i in range(0, thread_num):
    t = threading.Thread(target=loop, name='LoopThread %s' % i, args=(img_gen,))
    t.start()