Skip to content

Instantly share code, notes, and snippets.

@makefile
Created July 21, 2021 04:40

Revisions

  1. makefile created this gist Jul 21, 2021.
    101 changes: 101 additions & 0 deletions batch_download_img.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,101 @@
    # -*- coding: utf-8 -*-
    # requirements: requests
    # conda/pip install requests
    # pip install -i https://pypi.tuna.tsinghua.edu.cn/simple requests==1.2.3 # version for Python 2.6, but it has SSL problems
    import os
    from contextlib import closing
    import threading
    import time
    import sys
    import requests
    from requests.adapters import HTTPAdapter

    # Older Python versions do not support SNI for SSL connections and may fail
    # with "hostname doesn't match"; see
    # https://docs.python-requests.org/zh_CN/latest/community/faq.html
    # An explicit check is used instead of `assert` so it still runs under -O.
    if sys.version_info < (2, 7, 9):
        sys.exit('python >= 2.7.9 is required (SNI support for SSL connections)')

    # Fail with a usage message instead of a bare IndexError on missing arguments.
    if len(sys.argv) < 3:
        sys.exit('usage: %s <id_url_file> <out_dir> [thread_num]' % sys.argv[0])

    # One session shared by all requests; both schemes get an adapter configured
    # with max_retries=2.
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=2))
    s.mount('https://', HTTPAdapter(max_retries=2))

    # Input file: one "<name>\t<url>" pair per line.
    id_url_file = sys.argv[1]
    # Directory the downloaded images are written to, e.g. './images'.
    out_dir = sys.argv[2]
    # Number of worker threads; the optional third argument overrides the default.
    thread_num = 20
    if len(sys.argv) > 3:
        thread_num = int(sys.argv[3])
    # connect + read timeout seconds
    timeout = 5

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    }

    # makedirs also creates missing parent directories; if out_dir exists but is
    # not a directory this raises OSError right away instead of failing later
    # on the first write.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    def download(img_url, img_name):
        """Download ``img_url`` and save it as ``<out_dir>/<img_name>.jpg``.

        The download is skipped when the target file already exists. Responses
        with a non-2xx status code, or with a missing/zero ``content-length``
        header, are reported on stdout and nothing is written.

        Args:
            img_url: URL of the image to fetch.
            img_name: File name (without extension) to store the image under.

        Network errors raised by the session are not caught here; they
        propagate to the caller.
        """
        # Build the target path once so the "already downloaded" check looks at
        # the very same file that is written below (including the .jpg suffix).
        img_path = os.path.join(out_dir, img_name + ".jpg")
        if os.path.isfile(img_path):
            return
        with closing(s.get(img_url, stream=True, headers=headers, timeout=timeout)) as r:
            rc = r.status_code
            if not 200 <= rc <= 299:
                print('returnCode%s\t%s\t%s' % (rc, img_name, img_url))
                return
            content_length = int(r.headers.get('content-length', '0'))
            if content_length == 0:
                print('size0\t%s' % img_url)
                return
            try:
                with open(img_path, 'wb') as f:
                    # Stream the body in 1 KiB chunks.
                    for data in r.iter_content(1024):
                        f.write(data)
            except Exception:
                print('savefail\t%s' % img_url)
                # Best-effort cleanup: a truncated file must not be mistaken
                # for a finished download on the next run.
                try:
                    if os.path.isfile(img_path):
                        os.remove(img_path)
                except OSError:
                    pass

    def get_imgurl_generate(path=None):
        """Yield the two tab-separated fields of every valid line of the input file.

        Each line is stripped of surrounding whitespace and split on tabs.
        Blank lines, and lines that do not split into exactly two non-empty
        fields, are reported on stdout and skipped. A progress message is
        printed every 500 lines.

        Args:
            path: File to read. Defaults to the module-level ``id_url_file``.

        Yields:
            A two-element list ``[field0, field1]`` for each valid line.
        """
        if path is None:
            path = id_url_file
        with open(path, 'r') as f:
            for index, line in enumerate(f, 1):
                if index % 500 == 0:
                    print('execute %s line at %s' % (index, time.time()))
                # Strip first: lines read from a file still carry their newline,
                # so an emptiness test on the raw line could never succeed.
                line = line.strip()
                if not line:
                    print('line %s is empty "\t"' % index)
                    continue
                imgs = line.split('\t')
                if len(imgs) != 2:
                    print('line %s splite error' % index)
                    continue
                # Defensive guard kept from the original validation.
                if not imgs[0] or not imgs[1]:
                    print('line %s img is empty' % index)
                    continue
                # No try/except around the yield: a catch-all there would
                # swallow GeneratorExit and break generator.close().
                yield imgs


    # Serialises access to the shared generator across worker threads.
    lock = threading.Lock()


    def loop(imgs):
        """Worker body: pull (name, url) pairs from ``imgs`` and download each.

        The shared iterator is advanced only while holding ``lock``. The worker
        stops once the iterator is exhausted. Any exception raised by a download
        is reported on stdout and the worker moves on; it sleeps one second
        after every item.

        Args:
            imgs: Iterator shared between workers yielding two-element
                ``[img_name, img_url]`` sequences.
        """
        print('thread %s is running...' % threading.current_thread().name)

        while True:
            # Take the next item under the lock; None signals exhaustion.
            with lock:
                entry = next(imgs, None)
            if entry is None:
                break
            img_name, img_url = entry
            try:
                download(img_url, img_name)
            except Exception as err:
                print('exceptfail\t%s\t%s' % (img_url, err))
            time.sleep(1)  # seconds
        print('thread %s is end...' % threading.current_thread().name)

    # A single generator instance is shared by every worker thread.
    img_gen = get_imgurl_generate()

    # Create and start thread_num workers, named 'LoopThread 0', 'LoopThread 1', ...
    for worker_id in range(thread_num):
        worker = threading.Thread(
            name='LoopThread %s' % worker_id,
            target=loop,
            args=(img_gen,),
        )
        worker.start()