Created
January 26, 2019 03:26
-
-
Save dmuth/d9a8ec66ce1c262addeb9924be23b8bf to your computer and use it in GitHub Desktop.
A shell script to back up an entire website with wget
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Download a copy of a website, with (hopefully) working links,
# and tar it up to a single file in the current directory.
#
# Usage: backup-website.sh domain_name
#

# Errors are fatal; treat unset variables as errors too.
set -eu

if [ -z "${1:-}" ]
then
	echo "! "
	echo "! Syntax: $0 domain_name"
	echo "! "
	echo "! domain_name - The domain_name of the website to backup. (e.g. www.anthrocon.org)"
	echo "! "
	echo "! Note that we use HTTPS for connecting to the website. This is highly recommended!"
	echo "! "
	echo "! "
	exit 1
fi

START_DIR=$PWD
DOMAIN=$1
# Sanitize the domain so it is safe to embed in a filename
# (parameter expansion instead of an unquoted sed expression,
# which the shell could glob-expand against files in $PWD).
DOMAIN_TMP=${DOMAIN//[^A-Za-z0-9]/_}
TMP=$(mktemp -d "/tmp/backup-${DOMAIN_TMP}-XXXXX")
BACKUP="${DOMAIN}-website-backup-$(date +%Y%m%d-%H%M%S).tgz"
URL="https://${DOMAIN}/"

# Remove the temp directory (which can hold an entire site mirror)
# on every exit path, success or failure. ${TMP:?} aborts rather
# than running "rm -rf /" if TMP were ever empty.
cleanup() {
	cd "$START_DIR"
	rm -rf -- "${TMP:?}"
}
trap cleanup EXIT

echo "# "
echo "# Changing to temp directory: ${TMP}"
echo "# "
pushd "$TMP" > /dev/null

echo "# "
echo "# Backing up ${URL}..."
echo "# "

#
# From https://www.guyrutenberg.com/2014/05/02/make-offline-mirror-of-a-site-using-wget/
#
# --mirror – Makes (among other things) the download recursive.
# --convert-links – convert all the links (also to stuff like CSS stylesheets) to relative, so it will be suitable for offline viewing.
# --adjust-extension – Adds suitable extensions to filenames (html or css) depending on their content-type.
# --page-requisites – Download things like CSS style-sheets and images required to properly display the page offline.
# --no-parent – When recursing do not ascend to the parent directory. It useful for restricting the download to only a portion of the site.
# --retry-on-http-error - Treat certain codes as transient and retry after a delay. Source: http://tomszilagyi.github.io/2017/02/Wget-retry-on-http-error
# --span-hosts Enable spanning across hosts when doing recursive retrieving.
# --domains Set domains to be followed.
#
# wget exits non-zero for recoverable conditions (404s, auth failures),
# so suspend "errors are fatal" around it and inspect the code ourselves.
set +e
wget --mirror --convert-links --adjust-extension --page-requisites --no-parent \
	--retry-on-http-error=429,503,504 --span-hosts \
	--domains="${DOMAIN},squarespace.com" "${URL}"
EXIT_CODE=$?
set -e
#EXIT_CODE=255 # Debugging

if test "${EXIT_CODE}" -eq 6
then
	echo "# "
	echo "# wget exited with a code of 6, which means there was an authentication failure somewhere."
	echo "# That's not a fatal error, so we're continuing."
	echo "# "
elif test "${EXIT_CODE}" -eq 8
then
	echo "# "
	echo "# wget exited with a code of 8, but that's okay, "
	echo "# it probably means that there was a 404 somewhere in there."
	echo "# "
elif test "${EXIT_CODE}" -ne 0
then
	echo "! "
	echo "! wget exited with non-zero code: ${EXIT_CODE}"
	echo "! "
	exit "${EXIT_CODE}"
fi

#
# Make a single directory which will be tarred and move everything into it.
# We're ignoring the error from the mv because it will complain that you can't
# move a directory into itself, and quite frankly if an mv fails, you got bigger problems.
#
mkdir "${DOMAIN}-backup"
mv * "${DOMAIN}-backup" || true

# czvf instead of old-style "cfvz": keeps the f flag adjacent to its
# archive argument, which is the portable/unambiguous form.
tar czvf "${START_DIR}/${BACKUP}" *

popd > /dev/null

echo "# Done!"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment