Template for making multithreaded requests calls.
from threading import Thread
from Queue import Queue

import requests


def worker():
    while True:
        url = input_q.get()  # grab a task (a url to scrape) from the input_q
        res = requests.get(url)
        output_q.put((url, res.content))  # save just the content or the entire response object to process later
        input_q.task_done()  # tells the input_q that the task we grabbed above (the url) has been processed


input_q = Queue()
output_q = Queue()

# Set the number of workers. You might want to do some timing experiments with this number. Just limit the number of
# urls you put in the input_q below to 1000 and time this script (or dump it all in a cell and time the cell)
# to see what the best number is. I usually find somewhere between 50-100 is good, but you can go higher.
num_of_worker_threads = 50

for i in xrange(num_of_worker_threads):
    t = Thread(target=worker)
    # Note that these daemon threads will continue to exist until the Python interpreter closes.
    # If you are running this in a notebook, you have to restart the kernel to get rid of them
    # (the second version below avoids this).
    t.daemon = True
    t.start()

# Load your list of urls here
for url in list_of_urls:
    input_q.put(url)

input_q.join()  # block the program until all tasks are done

results = [output_q.get(False) for _ in xrange(len(list_of_urls))]
# results is now a list of tuples of (url, res.content) and you can save them however you want.
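The comments above suggest timing the script to find the best worker count. Here is a minimal sketch of that experiment, assuming the worker threads above have already been started and list_of_urls is already loaded; the 1000-url cap and the print format are illustrative, not part of the gist.

import time

start = time.time()

for url in list_of_urls[:1000]:  # cap the input while experimenting
    input_q.put(url)
input_q.join()

elapsed = time.time() - start
print('Scraped %d urls with %d workers in %.1f seconds'
      % (min(len(list_of_urls), 1000), num_of_worker_threads, elapsed))

Re-run this with different values of num_of_worker_threads and compare the elapsed times to pick a good setting.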
"""This version is better suited for Jupyter notebooks and long running scripts | |
because it does not use daemon threads and kills the threads when it is done. | |
""" | |
from threading import Thread | |
from Queue import Queue | |
import requests | |
def worker(): | |
while True: | |
# Variables to help with retries | |
retry_count = 0 | |
status_code = 0 | |
url = input_q.get() # grab a task (a url to scrape) from the input_q | |
if url is None: # Did we get an end condition? | |
break | |
while count <= 5 and status_code != 200: # Retry 5 times or until we get a good response | |
count += 1 | |
res = requests.get(url) | |
status_code = res.status_code | |
output_q.put((url, res.content)) # save just the content or the entire response object to process later | |
input_q.task_done() # Tells the input_q that the task we grabbed above (the url) has been processed | |
input_q = Queue() | |
output_q = Queue() | |
# Set number of workers. You might want to do some time experience with this number. Just limit the number of | |
# urls you put in the input_q below to 1000 and time this script (or dump it all in a cell and time the cell) | |
# to see what the best number is. I usually find somewhere between 50-100 is good, but you can go higher. | |
num_of_worker_threads = 50 | |
# Load you list of urls here | |
for url in list_of_urls: | |
input_q.put(url) | |
for i in xrange(num_of_worker_threads): | |
t = Thread(target=worker) | |
t.start() # No daemon=True this time | |
input_q.join() # block the program until all tasks are done | |
# When the program gets here, all the tasks have been processed, so let's shutdown the threads | |
for _ in xrange(num_of_threads): | |
input_q.put(None) # Add an end condition for each thread we created above | |
# Combine the results | |
results = [output_q.get(False) for _ in xrange(len(list_of_urls))] | |
# Results is now a list of tuples of (url, res.content) and you can save them however you want. |
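As a concrete example of "save them however you want", here is a minimal sketch that writes each page to its own file and keeps a small CSV index mapping filenames back to urls. The scraped_pages directory and the file-naming scheme are assumptions for illustration, not part of the template.

import os
import csv

output_dir = 'scraped_pages'  # assumed output location, not part of the template
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

with open(os.path.join(output_dir, 'index.csv'), 'wb') as index_file:
    writer = csv.writer(index_file)
    writer.writerow(['filename', 'url'])
    for i, (url, content) in enumerate(results):
        filename = '%05d.html' % i  # illustrative naming scheme
        with open(os.path.join(output_dir, filename), 'wb') as f:
            f.write(content)
        writer.writerow([filename, url])

The 'wb' modes follow the gist's Python 2 style; on Python 3 you would open the CSV with 'w' and newline='' instead.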