Use the Wayback Machine API to search old websites.
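The script drives the Wayback Machine's CDX search API through the archivecdx package. For context, here is a minimal sketch of the same kind of query made directly with the standard library; the endpoint and the space-separated field layout are the CDX API defaults, and example.com is a placeholder:

from urllib.parse import urlencode
from urllib.request import urlopen

# 'example.com' is a placeholder; substitute the site you want to search.
params = urlencode({'url': 'example.com', 'matchType': 'exact', 'limit': 5})
with urlopen(f'https://web.archive.org/cdx/search/cdx?{params}') as response:
    for line in response.read().decode().splitlines():
        # Default CDX fields, space-separated:
        # urlkey timestamp original mimetype statuscode digest length
        urlkey, timestamp, original, *rest = line.split(' ')
        print(timestamp, original)

The full script below wraps the same API via archivecdx and adds date handling, duplicate-aware file naming, and downloader-ready output.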
#!/usr/bin/env python3
# -*- coding: utf8 -*-
from argparse import ArgumentTypeError, ArgumentParser
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from re import search

# Third-party wrapper around the Wayback Machine CDX API.
from archivecdx import Listing as ArchiveListing


def main():
    # Parse an ISO 8601 date string; assume the local timezone when the
    # string does not carry one.
    def input_dt(s):
        try:
            dt = datetime.fromisoformat(s)
        except ValueError:
            raise ArgumentTypeError(f'invalid date: {s}')
        if dt.tzinfo:
            return dt
        else:
            return dt.astimezone()

    parser = ArgumentParser(
        description='Resurrect websites from the Wayback Machine.')
    parser.add_argument('url', help='URL parameter to search')
    parser.add_argument('-matchType', default='exact',
                        choices=['exact', 'prefix', 'host', 'domain'],
                        help='select scope for search')
    parser.add_argument('-from', dest='tfrom', type=input_dt,
                        help='start search from this date')
    parser.add_argument('-to', dest='tto', type=input_dt,
                        help='end search at this date')
    parser.add_argument('-filter', action='append',
                        help='apply a search filter; repeat for AND')
    parser.add_argument('-collapse', action='append',
                        help='apply a collapse filter; repeat for AND')
    args = parser.parse_args()

    # Render a datetime as the 14-digit UTC timestamp used in Wayback URLs.
    def wayback_time(dt):
        return dt.astimezone(timezone.utc).strftime('%Y%m%d%H%M%S')
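    # For illustration (hypothetical value):
    #   datetime(2022, 1, 3, 21, 7, tzinfo=timezone.utc) -> '20220103210700'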
    download_listing(ArchiveListing(
        args.url,
        matchType=args.matchType,
        filter=args.filter if args.filter else [],
        collapse=args.collapse if args.collapse else [],
        _from=wayback_time(args.tfrom) if args.tfrom else None,
        to=wayback_time(args.tto) if args.tto else None))


def download_listing(listing):
    # Convert a CDX urlkey (SURT format: reversed, comma-separated host,
    # then ')' and the path) into a local filesystem path.
    def urlkey_path(urlkey):
        parts = Path(urlkey).parts

        def domain(part):
            m = search(r'(.+?)(?::(\d+))?\)$', part)
            if not m:
                raise ValueError(f'invalid domain: {part}')
            # Un-reverse the comma-separated host back into dotted form.
            domainl = m.group(1).split(',')
            domainl.reverse()
            domain = '.'.join(domainl)
            port = m.group(2)
            return f'{domain}:{port}' if port else domain
        return Path(*((domain(parts[0]),) + parts[1:]))
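    # Illustrative mapping (hypothetical urlkey):
    #   'org,example)/about/index.html' -> 'example.org/about/index.html'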

    # Replace characters that are not allowed in file names on some
    # platforms (':' is illegal on Windows and appears in ports).
    def strip_illegals(path):
        def strip(s):
            return s.replace(':', '_')
        return Path(*(strip(p) for p in path.parts))
    # Build the output path for one capture. When several captures share a
    # urlkey, make each file name unique with the capture's timestamp and
    # content digest, preserving any file extension.
    def row_path(row, isdupe):
        if isdupe[row.urlkey]:
            parts = urlkey_path(row.urlkey).parts
            fsplit = parts[-1].split('.')
            if len(fsplit) > 1:
                fname = (f'{fsplit[0]}_{row.timestamp}_{row.digest}'
                         f".{'.'.join(fsplit[1:])}")
            else:
                fname = f'{fsplit[0]}_{row.timestamp}_{row.digest}'
            return strip_illegals(Path(*(parts[:-1] + (fname,))))
        else:
            return strip_illegals(urlkey_path(row.urlkey))
    urlkey_dupes = Counter(row.urlkey for row in listing)
    urlkey_isdupe = {urlkey: urlkey_dupes[urlkey] > 1
                     for urlkey in urlkey_dupes}
    for row in listing:
        out_path = row_path(row, urlkey_isdupe)
        # The 'id_' flag requests the raw capture, without the Wayback
        # Machine's injected toolbar and rewritten links.
        print(f'https://web.archive.org/web/{row.timestamp}id_/{row.original}')
        print(f'\tout={out_path}')


if __name__ == '__main__':
    main()
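The two-line records printed by the script (a capture URL followed by a tab-indented out= option) follow aria2c's input-file syntax, so the listing can be piped straight into the downloader. A hypothetical invocation, assuming the script is saved as wayback.py:

python3 wayback.py http://example.com/ -matchType prefix | aria2c --input-file=-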