makefile · July 21, 2021 04:40 · Jul 21, 2021
diff --git a/batch_download_img.py b/batch_download_img.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+# requirements: requests
+# conda/pip install requests
+# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests==1.2.3 # version for Python 2.6, but it has problems with SSL
+import os
+from contextlib import closing
+import threading
+import time
+import sys
+import requests
+from requests.adapters import HTTPAdapter
+
+# Older Python versions do not support SNI for SSL connections, or fail with
+# the exception "hostname doesn't match"; see
+# https://docs.python-requests.org/zh_CN/latest/community/faq.html
+# An explicit check is used because ``assert`` is stripped under ``python -O``.
+if sys.version_info < (2, 7, 9):
+    sys.exit('Python >= 2.7.9 is required')
+
+# Command line: <id_url_file> <out_dir> [thread_num]
+if len(sys.argv) < 3:
+    sys.exit('usage: %s <id_url_file> <out_dir> [thread_num]' % sys.argv[0])
+
+id_url_file = sys.argv[1]
+out_dir = sys.argv[2] # './images'
+thread_num = 20
+if len(sys.argv) > 3:
+    thread_num = int(sys.argv[3])
+# connect + read timeout seconds
+timeout = 5
+
+# One session is shared by all worker threads. The adapter's connection pool
+# defaults to 10 connections, so size it to the number of workers; otherwise
+# surplus connections are discarded instead of being reused.
+s = requests.Session()
+s.mount('http://', HTTPAdapter(max_retries=2, pool_maxsize=thread_num))
+s.mount('https://', HTTPAdapter(max_retries=2, pool_maxsize=thread_num))
+
+headers = {
+    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
+}
+
+# makedirs also creates missing parent directories of out_dir.
+if not os.path.isdir(out_dir):
+    os.makedirs(out_dir)
+
+def download(img_url, img_name):
+    """Fetch ``img_url`` and store it as ``<out_dir>/<img_name>.jpg``.
+
+    Nothing is downloaded when the target file already exists. A non-2xx
+    status, a missing/zero ``content-length`` and a failed write are reported
+    on stdout as tab-separated records (``returnCode...``, ``size0``,
+    ``savefail``) rather than raised. Errors raised by the request itself
+    propagate to the caller.
+    """
+    # This must be the same path that is written below, otherwise finished
+    # downloads are never recognised and every run fetches everything again.
+    img_path = os.path.join(out_dir, img_name + ".jpg")
+    if os.path.isfile(img_path):
+        return
+    with closing(s.get(img_url, stream=True, headers=headers, timeout=timeout)) as r:
+        rc = r.status_code
+        if not 200 <= rc <= 299:
+            print('returnCode%s\t%s\t%s' % (rc, img_name, img_url))
+            return
+        content_length = int(r.headers.get('content-length', '0'))
+        if content_length == 0:
+            print('size0\t%s' % img_url)
+            return
+        # Stream into a temporary file and rename it once complete, so an
+        # interrupted transfer never leaves a truncated file under the final
+        # name (which the existence check above would then skip).
+        tmp_path = img_path + '.part'
+        try:
+            with open(tmp_path, 'wb') as f:
+                for data in r.iter_content(1024):
+                    f.write(data)
+            os.rename(tmp_path, img_path)
+        except Exception:
+            print('savefail\t%s' % img_url)
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+
+def get_imgurl_generate():
+    """Yield ``[img_name, img_url]`` lists read from ``id_url_file``.
+
+    Every line is expected to hold exactly two non-empty, tab-separated
+    fields (name first, URL second). Lines that do not match are reported on
+    stdout and skipped. A progress line is printed every 500 input lines.
+    """
+    with open(id_url_file, 'r') as f:
+        for index, line in enumerate(f, 1):
+            if index % 500 == 0:
+                print('execute %s line at %s' % (index, time.time()))
+            # Remove only the line terminator here: stripping tabs as well
+            # would hide an empty first or last field from the checks below.
+            line = line.rstrip('\r\n')
+            if not line.strip():
+                print('line %s is empty "\t"' % index)
+                continue
+            imgs = [field.strip() for field in line.split('\t')]
+            if len(imgs) != 2:
+                print('line %s splite error' % index)
+                continue
+            if not imgs[0] or not imgs[1]:
+                print('line %s img is empty' % index)
+                continue
+            # Deliberately no try/except around the yield: a bare ``except``
+            # there would also swallow GeneratorExit when the generator is
+            # closed, and str.split() cannot fail here anyway.
+            yield imgs
+
+
+# Serializes access to the generator that the worker threads share.
+lock = threading.Lock()
+def loop(imgs):
+    """Worker body: pull entries from ``imgs`` and download each one.
+
+    ``imgs`` is an iterator of ``(img_name, img_url)`` pairs. The worker
+    returns once the iterator is exhausted. A failing download is reported
+    on stdout and followed by a one-second pause before the next entry.
+    """
+    worker_name = threading.current_thread().name
+    print('thread %s is running...' % worker_name)
+
+    while True:
+        # Only one thread at a time may advance the shared iterator.
+        with lock:
+            entry = next(imgs, None)
+        if entry is None:
+            break
+        # Each entry holds the name first and the URL second.
+        img_name, img_url = entry
+        try:
+            download(img_url, img_name)
+        except Exception as err:
+            print('exceptfail\t%s\t%s' % (img_url, err))
+            time.sleep(1) # seconds
+    print('thread %s is end...' % worker_name)
+
+# A single generator instance is handed to every worker thread.
+img_gen = get_imgurl_generate()
+
+# Create and immediately start thread_num workers running loop().
+for worker_no in range(thread_num):
+    worker = threading.Thread(
+        target=loop,
+        name='LoopThread %s' % worker_no,
+        args=(img_gen,),
+    )
+    worker.start()