A quick-and-dirty Python script I used to pull down all my old images from the web.archive.org cache of my Ghost blog after losing them during a server migration.
import re
import requests
from bs4 import BeautifulSoup
import os

SITE = "https://tandeciarz.com"
# this will end up being the archive URL
AURL = SITE
# timestamp for date before the snapshot I want to scrape
ts = "20221010"
# not really needed but doing this for self-reference later.
# API service to verify whether a snapshot exists and grab the snapshot URL
avail = f"https://archive.org/wayback/available?url={SITE}&timestamp={ts}"
# pretend I'm not a script
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
snapshots = requests.get(url=avail, headers=headers).json()
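# For reference, the availability API responds with JSON shaped roughly like this
# (values are illustrative, using the snapshot this script ends up targeting):
#   {"archived_snapshots": {"closest": {"available": true, "status": "200",
#     "url": "http://web.archive.org/web/20221015071024/https://tandeciarz.com/",
#     "timestamp": "20221015071024"}}}
# which is what the lookup below indexes into.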
# all pages hosted start with this url
root = "http://web.archive.org"
# used to split prefix since it will include the timestamp
PREFIX = "/web/"
if(snapshots['archived_snapshots']['closest']):
    snap = snapshots['archived_snapshots']['closest']
    AURL = snap['url']
    PREFIX = AURL.replace(root, "").replace(SITE, "").replace("//", "/")
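    # assuming the snapshot URL above, PREFIX ends up as something like "/web/20221015071024/"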
# I was going to use this to maintain a unique set of all the links on my site (pointing to a page on my site) for processing later,
# but I ended up just using this as my target starting point
links = {AURL}
# The URLs of the images I'm trying to scrape
images = set()
# Whether the URL (blog post or page) was already retrieved via the script (so we don't go into a scrape loop)
retrieved = {}

# function to extract the HTML document from a given url
def getHTMLdocument(url):
    if(url in retrieved):
        return retrieved[url]
    # request the HTML document of the given url and cache it
    response = requests.get(url=url, headers=headers)
    retrieved[url] = response.content
    # return the raw HTML content of the page
    return response.content

# ends up looking like https://web.archive.org/web/20221015071024/https://tandeciarz.com/
TRUNC = PREFIX + SITE

def processContent(soup):
    # find all <img> tags and store the src url
    for img in soup.find_all('img'):
        src = img.get('src')
        if(src is None):
            continue
        if "/https://tandeciarz.com/content/images/" in src:
            if src.startswith("/"):
                src = root + src
            if src not in images:
                images.add(src)
    # find all of the tags that have a style attribute with a background-image set (this is how Ghost renders out blog post images)
    # this was arguably the hardest part. GitHub Copilot effectively wrote this block of code for me :)
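    # illustrative example of the kind of attribute this matches (the path and filename here are made up):
    #   style="background-image: url(/web/20221015071024/https://tandeciarz.com/content/images/2021/05/cover.jpg)"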
    for bgimg in soup.find_all(style=re.compile("background-image")):
        src = bgimg.get('style')
        if(src is None):
            continue
        src = src.replace("background-image: url(", "").replace(");", "")
        # sometimes the URL would be the full URL, sometimes it would be the relative URL (not really sure why)
        if src.startswith("/"):
            src = root + src
        if src.endswith(")"):
            src = src[:-1]
        # honestly, I forgot this was a set
        if src not in images:
            images.add(src)
    # go through all the links on the page and track and scrape any links that go to my site
    for link in soup.find_all('a'):
        href = link.get('href')
        # don't follow anchors (no href)
        if(href is None):
            continue
        if "/https://tandeciarz.com/" in href:
            if href.startswith("/"):
                href = root + href
            # uncomment the continue below to only pull the first page (to test)
            if href.startswith(AURL) and href not in retrieved:
                # continue
                print("Retrieving: " + href)
                processContent(BeautifulSoup(getHTMLdocument(href), 'html5lib'))
print("Retrieving: " + AURL) | |
processContent(BeautifulSoup(getHTMLdocument(AURL), 'html5lib')) | |
print("Images:") | |
uniqueImages = {} | |
# using a dict this time | |
for i in images: | |
dirPath = i.split("/https://tandeciarz.com/")[1] | |
if dirPath not in uniqueImages: | |
uniqueImages[dirPath] = i | |
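# at this point uniqueImages maps local paths to snapshot URLs, e.g. (hypothetical filename):
#   "content/images/2021/05/cover.jpg" -> "http://web.archive.org/web/20221015071024/https://tandeciarz.com/content/images/2021/05/cover.jpg"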
# get the URL and strip out everything before the /content so I can build out the folder structure locally and save the image
for i in uniqueImages:
    if(os.path.exists(i)):
        print("Skipping: " + i)
        continue
    dirpath = i.split("/")
    dirpath = "/".join(dirpath[:-1])
    if not os.path.exists(dirpath):
        os.makedirs(dirpath, exist_ok=True)
    imgfile = i.split("/")[-1]
    print("Downloading: " + imgfile)
    # download the image and save it to the same folder structure that's on my server, so I can easily copy them over when I'm done
    with open(i, 'wb') as handle:
        response = requests.get(uniqueImages[i], stream=True)
        if not response.ok:
            print(response)
        for block in response.iter_content(1024):
            if not block:
                break
            handle.write(block)
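
One thing the script does not do is back off when archive.org throttles a long crawl, which can happen with rapid repeated requests. A small retry wrapper around requests.get is an easy fix; the sketch below is not part of the original gist, and the function name, attempt count, and backoff values are arbitrary.

import time
import requests

def get_with_retry(url, headers, attempts=3, backoff=5):
    # try the request a few times, sleeping a little longer after each failure
    response = None
    for attempt in range(attempts):
        response = requests.get(url=url, headers=headers, timeout=30)
        if response.ok:
            return response
        time.sleep(backoff * (attempt + 1))
    # give back the last (failed) response so the caller can inspect it
    return response

Dropping this into getHTMLdocument would be a one-line change; the image download loop would also need stream=True passed through.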