Check the HTTP status codes for a list of URLs in a file. Undoubtedly there are fancier ways to do this with wget or some other common Linux utilities, but I’m not good enough at bash scripting to find a way to do that with minimal memory usage (if the list of URLs is millions long) and with nice formatting, so I just wrote a script.
example_urls.txt:

www.energy.gov
gobbledegook
apple.com
www.gpo.gov
httpstat.us/404
get_urls.py:

import aiohttp
import argparse
import asyncio
import csv
import re
import sys
import time

# Edit this to adjust how many HTTP requests you can make in parallel
CONCURRENT_REQUESTS = 20

PROTOCOL_PATTERN = re.compile(r'^\w+://')
DOMAIN_PATTERN = re.compile(r'^[^:]+://([^/:]+)')


async def get_status(session, url):
    '''Return the status code for a URL'''
    if PROTOCOL_PATTERN.match(url) is None:
        url = 'http://' + url
    domain = DOMAIN_PATTERN.match(url).group(1)
    rate_lock = next((x for x in options.rate if x.matches(domain)), None)
    if rate_lock:
        await rate_lock.wait()
    async with session.head(url, allow_redirects=True) as response:
        return response.status


async def worker(file, output, options):
    '''
    Reads URLs as lines from a file, requests their status, and outputs the
    results in CSV format. Spawn multiple of these to work on more than one
    URL in parallel.
    '''
    start_time = time.time()
    writer = csv.writer(output)
    ssl_mode = False if options.ignore_ssl_errors else None
    connector = aiohttp.TCPConnector(ssl=ssl_mode)
    async with aiohttp.ClientSession(connector=connector) as session:
        while True:
            line = file.readline()
            if line == '':
                break
            url = line.strip()
            try:
                status = await get_status(session, url)
                writer.writerow([url, status])
            except Exception as error:
                writer.writerow([url, f'ERROR: {error}'])
            # Flush periodically so results stream out while the run is going
            if time.time() - start_time > 2:
                output.flush()
                start_time = time.time()


class RateLimit:
    'A lock that can only be acquired at a certain rate.'

    def __init__(self, raw):
        domain = None
        rate_text = raw
        if ':' in raw:
            domain, rate_text = raw.split(':', 1)
        try:
            rate = float(rate_text)
        except ValueError:
            raise ValueError(f'Invalid rate format: "{raw}"')
        self.domain = domain
        self.rate = rate
        self.interval = 1 / rate if rate > 0 else 0
        self.last_use = 0

    def matches(self, domain):
        'Determine if this rate limit should be used for the given domain.'
        if not self.domain:
            return True
        return domain == self.domain or domain.endswith(f'.{self.domain}')

    async def wait(self):
        'Wait for the next available usage according to the rate limit.'
        remaining = 1
        while remaining > 0:
            remaining = self.last_use + self.interval - time.time()
            if remaining > 0:
                await asyncio.sleep(remaining)
        self.last_use = time.time()


parser = argparse.ArgumentParser(description='Check the statuses of a list of URLs.')
parser.add_argument('path', help='path to a file that is a newline-delimited list of URLs')
parser.add_argument('--ignore-ssl-errors', action='store_true', help='ignore errors in SSL handshakes')
parser.add_argument('--rate', action='append', type=RateLimit, default=[], help='Maximum number of requests per second to make. Repeat with `--rate "example.com:2"` to set specific rate limits per domain.')
options = parser.parse_args()

# Sort rate limits by longest (i.e. most specific) domain first
options.rate.sort(key=lambda x: len(x.domain or ''), reverse=True)

# Start event loop, open the URL list, and spawn workers to read from it.
loop = asyncio.get_event_loop()
with open(options.path) as urls_file:
    workers = [worker(urls_file, sys.stdout, options)
               for i in range(CONCURRENT_REQUESTS)]
    loop.run_until_complete(asyncio.gather(*workers))
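The minimal memory use comes from all the workers sharing one file object and pulling lines off it lazily, so only the handful of in-flight URLs are ever held in memory no matter how long the list is. Here is that pattern in isolation, a stripped-down sketch (not part of the script above) with a sleep standing in for the real HTTP request:

import asyncio

async def drain(worker_id, file):
    # Each worker pulls the next unread line; the whole file is never buffered.
    while (line := file.readline()) != '':
        await asyncio.sleep(0.01)  # stand-in for the real HTTP request
        print(worker_id, line.strip())

async def main(path):
    with open(path) as f:
        # Three workers cooperatively drain the same file object.
        await asyncio.gather(*(drain(i, f) for i in range(3)))

asyncio.run(main('example_urls.txt'))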
python get_urls.py example_urls.txt > statuses.csv
# Result in statuses.csv is like: | |
# gobbledegook,"ERROR: Cannot connect to host gobbledegook:80 ssl:None [nodename nor servname provided, or not known]" | |
# httpstat.us/404,404 | |
# apple.com,200 | |
# www.energy.gov,200 | |
# www.gpo.gov,200 | |
# Or use --ignore-ssl-errors to retrieve status for URLs with SSL handshake | |
# problems, e.g. invalid or expired SSL certificates. | |
python get_urls.py example_urls.txt --ignore-ssl-errors > statuses.csv | |
# To apply a rate limit of 2 requests/second: | |
python get_urls.py example_urls.txt --rate 2 | |
# To apply a rate limit of 1 request/second for energy.gov, 2 requests/second | |
# for epa.gov, and 10/second for everything else: | |
python get_urls.py example_urls.txt --rate 10 --rate 'energy.gov:1' --rate 'epa.gov:2'
Added rate limiting with --rate 5 for 5 requests/second. You can also repeat it to apply different rate limits for different domains:

# Limit to 10 requests/s, but only 1 request/s for energy.gov and 2 requests/s for epa.gov
--rate 10 --rate 'energy.gov:1' --rate 'epa.gov:2'

Note that domains include subdomains, so in the above example, requests to arpa-e.energy.gov would be rate-limited together with requests to energy.gov.
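For example, with the RateLimit class from the script above, matching works like this (a quick illustration, not part of the gist):

limit = RateLimit('energy.gov:1')
limit.matches('energy.gov')          # True
limit.matches('arpa-e.energy.gov')   # True: subdomains share the limit
limit.matches('epa.gov')             # False
RateLimit('10').matches('epa.gov')   # True: no domain means it matches everything

Because the limits are sorted with the longest (most specific) domain first, the energy.gov limit wins over a bare --rate 10 catch-all for any energy.gov URL.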
Input files sometimes have some obvious formatting problems we can fix before checking, e.g. doubled-up protocols (the scheme repeated at the start of a URL) and URLs wrapped in quotes. A sketch of that cleanup follows.
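A minimal sketch of that kind of cleanup, assuming problems like http://http://example.com and URLs wrapped in quote marks; normalize_url is a hypothetical helper, not part of the script above:

import re

# Hypothetical cleanup helper, not part of the gist's script. Adjust the
# pattern to whatever your input actually contains.
DOUBLED_PROTOCOL = re.compile(r'^(\w+://)(?:\w+://)+')

def normalize_url(raw):
    url = raw.strip().strip('\'"')            # quotes: drop surrounding quote marks
    return DOUBLED_PROTOCOL.sub(r'\1', url)   # doubled protocols: keep only the first

print(normalize_url('"http://http://example.com"'))  # http://example.com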