Last active
July 12, 2016 06:22
-
-
Save philgooch/2feb33abf13d2f93ecf9184c9daa22c9 to your computer and use it in GitHub Desktop.
Pulls the Chilcot PDFs from http://www.iraqinquiry.org.uk/the-report/ and converts them to HTML for further processing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script pulls the Chilcot PDFs from the publisher site and converts them
# to HTML, ready for import into your favourite CMS or NLP pipeline for
# further processing.
#
# Dependencies (installed with Homebrew): tidy-html5, GNU parallel and xpdf.
for formula in tidy-html5 parallel xpdf; do
  brew install "$formula"
done
# Alternatively
# brew install poppler
# or build from
# http://www.foolabs.com/xpdf/download.html
# and add the path to the executable in ~/.bash_profile, e.g.:
# export XPDF=/usr/local/xpdfbin-mac-3.04/bin64
# Get the PDFs.
# Pipeline: fetch the report index page, pull out every href that ends in
# "pdf", turn it into an absolute URL, drop duplicates, then download the
# files concurrently with wget.
#   - curl -f makes an HTTP error fail the fetch instead of feeding an error
#     page to grep; -sS hides the progress meter but keeps error messages;
#     -L follows redirects.
#   - [^"]+ keeps each match inside a single attribute value; a greedy .+
#     could run across several links that share one line.
#   - The site prefix is only added to hrefs that are not already absolute.
curl -fsSL http://www.iraqinquiry.org.uk/the-report/ \
  | grep -Eo 'href="[^"]+pdf"' \
  | awk -F'"' '{ url = $2; if (url !~ /^https?:\/\//) url = "http://www.iraqinquiry.org.uk" url; print url }' \
  | sort -u \
  | parallel --gnu "wget {}"
# Convert to HTML (will create a separate directory for each PDF).
# Each PDF name is passed to GNU parallel NUL-delimited, so the name is never
# re-parsed by a shell; {} is the PDF and {.} is the same name without its
# extension, which pdftohtml receives as the output name.
for f in *.pdf; do
  [ -e "$f" ] || continue  # an unmatched glob stays literal; skip it
  printf '%s\0' "$f"
done | parallel -0 pdftohtml {} {.}
# Convert to well-formed XML/XHTML using Tidy.
# The loop only emits file names (NUL-delimited); a single parallel process
# after the loop consumes the whole list, so the tidy runs actually execute
# concurrently. The names are produced by a shell loop rather than passed as
# arguments so that a very large number of page files cannot overflow the
# command-line length limit.
# NOTE: '**' only recurses when the shell has globstar enabled; otherwise it
# behaves like '*', i.e. it matches the HTML files one directory level down.
for f in **/*.html; do
  [ -e "$f" ] || continue  # an unmatched glob stays literal; skip it
  printf '%s\0' "$f"
done | parallel -0 tidy -n -asxml -m {}
# In an update, we'll process this XML with XSLT to add all the crossreferencing, links etc | |
# For now, if you want to have the TOC pages hyperlinked to the relevant file, | |
# you can replace the TOC text and page numbers using the relevant page offsets | |
# but this is pretty tedious - a better way will follow soon! | |
# Examples: | |
cd the-report-of-the-iraq-inquiry_introduction | |
# offset: 0 | |
perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4).".html\">".$2."<\/a>".$3."<a href=\"page".($4).".html\">".$4."<\/a>".$5/ge' page1.html | |
cd ../the-report-of-the-iraq-inquiry_executive-summary | |
# offset: 4 | |
perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4+4).".html\">".$2."<\/a>".$3."<a href=\"page".($4+4).".html\">".$4."<\/a>".$5/ge' page5.html | |
cd ../the-report-of-the-iraq-inquiry_section-11 | |
# offset: -20 | |
perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4-20).".html\">".$2."<\/a>".$3."<a href=\"page".($4-20).".html\">".$4."<\/a>".$5/ge' page1.html | |
perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4-20).".html\">".$2."<\/a>".$3."<a href=\"page".($4-20).".html\">".$4."<\/a>".$5/ge' page2.html | |
# etc, etc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment