Last active
February 22, 2023 07:47
-
-
Save dirkjanfaber/96ebfb6af2f7433d401f20be0b4c1b68 to your computer and use it in GitHub Desktop.
Download "The Story of Us" (Wait But Why) as an EPUB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Build an e-reader friendly EPUB of Tim Urban's "The Story of Us" blog
# series (waitbutwhy.com).
#
# Usage: bash <this-script> [OUTPUT.epub]
#   OUTPUT.epub  where to write the book (default: /tmp/thestoryofus.epub)
#
# Requires: curl, hxnormalize (html-xml-utils), xsltproc, sed, pandoc,
#           unzip, zip, find and ImageMagick's convert.

set -euo pipefail

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fail early with a clear message instead of half-way through the build.
for tool in curl hxnormalize xsltproc sed pandoc unzip zip find convert; do
  command -v "${tool}" >/dev/null || die "required program not found: ${tool}"
done

output=${1:-/tmp/thestoryofus.epub}

# All intermediate files live in a private work directory that is removed on
# every exit path, including failures.
dir=$(mktemp -d) || die "mktemp failed"
cleanup() { rm -rf -- "${dir}"; }
trap cleanup EXIT

# Book metadata. It is passed with -M because pandoc's --epub-metadata option
# expects a Dublin Core XML file, not YAML; -M also takes precedence over the
# <title> found in the downloaded HTML pages.
metadata=(
  -M "title=The Story of Us"
  -M "author=Tim Urban"
  -M "rights=Creative Commons Non-Commercial Share Alike 3.0"
  -M "lang=en-US"
)

# The blog posts, in reading order.
chapters=(
  https://waitbutwhy.com/2019/08/story-intro.html
  https://waitbutwhy.com/2019/08/fire-light.html
  https://waitbutwhy.com/2019/08/giants.html
  https://waitbutwhy.com/2019/09/stories.html
  https://waitbutwhy.com/2019/09/enlightenment-kids.html
  https://waitbutwhy.com/2019/09/mute-button.html
  https://waitbutwhy.com/2019/09/american-brain.html
  https://waitbutwhy.com/2019/09/thinking-ladder.html
  https://waitbutwhy.com/2019/10/idea-labs-echo-chambers.html
)

# XSLT identity transform that drops the parts of each page that do not
# belong in a book (sidebar, comments, social widgets, scripts, styles, ...).
# The delimiter is quoted so the shell never expands anything inside.
cat <<'__EOT__' > "${dir}/filter.xslt"
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                version="1.0">
  <xsl:output method="html" omit-xml-declaration="yes" indent="yes"/>
  <xsl:strip-space elements="*" />
  <xsl:preserve-space elements="html body div" />
  <xsl:template match="@* | node()">
    <xsl:copy>
      <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
  </xsl:template>
  <xsl:template match="div[@id='sidebar']"/>
  <xsl:template match="div[@id='disqus_thread']"/>
  <xsl:template match="div[@id='social-ads']"/>
  <xsl:template match="div[@class='related-posts']"/>
  <xsl:template match="div[@class='entry-author']"/>
  <xsl:template match="div[@class='mobile-menu']"/>
  <xsl:template match="div[@class='social']"/>
  <xsl:template match="div[@class='social_counter']"/>
  <xsl:template match="div[@class='logo-section']"/>
  <xsl:template match="div[@class='left']"/>
  <xsl:template match="div[@class='entry-nav main-color-bg']"/>
  <xsl:template match="footer[@id='footer']"/>
  <xsl:template match="script"/>
  <xsl:template match="style"/>
  <xsl:template match="link"/>
  <xsl:template match="nav"/>
</xsl:stylesheet>
__EOT__

# Download every post and clean it up.
declare -a input=()
for link in "${chapters[@]}"; do
  page=${link##*/}

  # --fail: abort on an HTTP error rather than feeding an error page to pandoc.
  curl --fail --location --output "${dir}/${page}" "${link}"

  # Normalize the HTML, strip the non-relevant parts, and turn the
  # "<strong>Chapter ...</strong>" markers into <h1> headings so they end up
  # as chapters in the final EPUB.
  hxnormalize "${dir}/${page}" \
    | xsltproc --html "${dir}/filter.xslt" - \
    | sed -e 's%<strong>.*\(Chapter[^<]*\)</strong>%<h1>\1</h1>%' \
      > "${dir}/f-${page}"

  input+=("${dir}/f-${page}")
done

# Convert the cleaned pages into a single EPUB.
pandoc -f html \
  -t epub3 \
  "${metadata[@]}" \
  -o "${dir}/thestoryofus.epub" \
  "${input[@]}"

# That EPUB is far too large because of roughly 100MB of images. An EPUB is
# just a zip file, so unpack it, shrink the images, and pack it again.
mkdir -- "${dir}/epub"
unzip "${dir}/thestoryofus.epub" -d "${dir}/epub"

media="${dir}/epub/EPUB/media"
if [[ -d "${media}" ]]; then
  # Limit every image to at most 640x480 ('>' only ever shrinks).
  # '-exec ... \;' is deliberate: a file ImageMagick cannot handle is reported
  # but does not abort the whole build.
  find "${media}" -type f \
    -exec convert {} -verbose -resize '640x480>' {} \;
  # Grayscale saves some more space.
  find "${media}" -type f \
    -exec convert {} -verbose -colorspace Gray -separate -average {} \;
fi

# Re-pack. The EPUB container format requires 'mimetype' to be the first
# entry and stored uncompressed without extra fields, so it is added on its
# own before everything else. Packing into the work directory first means an
# existing output file is only replaced once the new book is complete.
(
  cd "${dir}/epub" || die "cannot cd to ${dir}/epub"
  zip -X0 "${dir}/packed.epub" mimetype
  zip -Xr9D "${dir}/packed.epub" . -x mimetype ./mimetype
)
mv -f -- "${dir}/packed.epub" "${output}"

printf 'Wrote %s\n' "${output}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Script to convert Tim Urban's blog posts on "The Story of Us" into an e-reader-friendly ebook. It is not perfect, but it does a reasonable job.
Do note that you need some programs installed in order to get this working: