philgooch · July 12, 2016 06:22
diff --git a/chilcot.sh b/chilcot.sh
 # This script will pull the Chilcot PDFs from the publisher site and convert them to html
 # ready for import into your favourite CMS or NLP pipeline for further processing

 # Install the Homebrew formulae this script relies on, one at a time.
 for formula in tidy-html5 parallel xpdf; do
   brew install "$formula"
 done
 # Alternatively
 # brew install poppler

 # or build from
 # http://www.foolabs.com/xpdf/download.html
 # and add the path to the executable in ~/.bash_profile, e.g.:
 # export XPDF=/usr/local/xpdfbin-mac-3.04/bin64

 # Get the PDFs.
 # Every href ending in .pdf is scraped from the report index page, the site
 # root is prepended (this presumes the hrefs are site-relative - confirm if
 # the page layout changes) and the files are downloaded in parallel.
 #   curl -f   fail on HTTP errors instead of piping an error page to grep
 #   curl -sS  no progress meter, but still print errors
 #   curl -L   follow redirects
 # [^"]+ keeps each match inside one attribute value, so several links on the
 # same line of HTML are extracted separately rather than merged into a single
 # greedy match of which awk would only keep the first href.
 curl -fsSL http://www.iraqinquiry.org.uk/the-report/ \
   | grep -Eo 'href="[^"]+\.pdf"' \
   | awk -F'"' '{ print "http://www.iraqinquiry.org.uk" $2 }' \
   | parallel --gnu "wget {}"

 # Convert to HTML (will create a separate directory for each PDF).
 # File names are handed to parallel NUL-delimited and substituted by parallel
 # itself, so names containing spaces or shell metacharacters are passed
 # intact instead of being re-parsed by `eval`.
 #   {}   the PDF file name
 #   {.}  the same name with its extension removed (the output directory)
 # The -e guard skips the literal '*.pdf' the shell leaves behind when no PDF
 # is present in the current directory.
 for f in *.pdf; do
   [ -e "$f" ] || continue
   printf '%s\0' "$f"
 done | parallel -0 pdftohtml {} {.}

 # Convert to well-formed XML/XHTML using Tidy, rewriting each file in place (-m).
 # The pipe to parallel sits after `done`, so a single parallel instance
 # receives every file and can run the tidy jobs concurrently; piping inside
 # the loop body started a new one-job parallel per file.
 # The pattern is written as */*.html (one directory level, which is what
 # **/*.html expands to in bash unless globstar is enabled).
 # The -e guard skips the literal pattern when nothing matches.
 for f in */*.html; do
   [ -e "$f" ] || continue
   printf '%s\0' "$f"
 done | parallel -0 tidy -n -asxml -m {}

 # In a future update, we'll process this XML with XSLT to add all the cross-referencing, links, etc.

 # For now, if you want to have the TOC pages hyperlinked to the relevant file,
 # you can replace the TOC text and page numbers using the relevant page offsets
 # but this is pretty tedious - a better way will follow soon!

 # Examples:

 cd the-report-of-the-iraq-inquiry_introduction
 # offset: 0
 perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4).".html\">".$2."<\/a>".$3."<a href=\"page".($4).".html\">".$4."<\/a>".$5/ge' page1.html

 cd ../the-report-of-the-iraq-inquiry_executive-summary
 # offset: 4
 perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4+4).".html\">".$2."<\/a>".$3."<a href=\"page".($4+4).".html\">".$4."<\/a>".$5/ge' page5.html

 cd ../the-report-of-the-iraq-inquiry_section-11
 # offset: -20
 perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4-20).".html\">".$2."<\/a>".$3."<a href=\"page".($4-20).".html\">".$4."<\/a>".$5/ge' page1.html
 perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4-20).".html\">".$2."<\/a>".$3."<a href=\"page".($4-20).".html\">".$4."<\/a>".$5/ge' page2.html

 # etc, etc
	# This script will pull the Chilcot PDFs from the publisher site and convert them to html
	# ready for import into your favourite CMS or NLP pipeline for further processing

	# Install each required Homebrew formula in turn.
	for pkg in tidy-html5 parallel xpdf
	do
		brew install "$pkg"
	done
	# Alternatively
	# brew install poppler

	# or build from
	# http://www.foolabs.com/xpdf/download.html
	# and add the path to the executable in ~/.bash_profile, e.g.:
	# export XPDF=/usr/local/xpdfbin-mac-3.04/bin64

	# Get the PDFs.
	# The stages are joined with real pipes: an escaped \| is passed to curl as
	# a literal argument and no pipeline is built at all.
	# Every href ending in .pdf is scraped from the report index page, the site
	# root is prepended (this presumes the hrefs are site-relative - confirm if
	# the page layout changes) and the files are downloaded in parallel.
	#   curl -f   fail on HTTP errors instead of piping an error page to grep
	#   curl -sS  no progress meter, but still print errors
	#   curl -L   follow redirects
	# [^"]+ keeps each match inside one attribute value, so several links on the
	# same line of HTML are extracted separately rather than merged into one
	# greedy match.
	curl -fsSL http://www.iraqinquiry.org.uk/the-report/ \
		| grep -Eo 'href="[^"]+\.pdf"' \
		| awk -F'"' '{ print "http://www.iraqinquiry.org.uk" $2 }' \
		| parallel --gnu "wget {}"

	# Convert to HTML (will create a separate directory for each PDF).
	# The loop globs *.pdf (a bare `.pdf` is a literal word, not a pattern) and
	# feeds parallel through a real pipe.
	# File names are sent NUL-delimited and substituted by parallel itself, so
	# names with spaces or shell metacharacters are not re-parsed by `eval`.
	#   {}   the PDF file name
	#   {.}  the same name with its extension removed (the output directory)
	# The -e guard skips the literal '*.pdf' left behind when nothing matches.
	for f in *.pdf; do
		[ -e "$f" ] || continue
		printf '%s\0' "$f"
	done | parallel -0 pdftohtml {} {.}

	# Convert to well-formed XML/XHTML using Tidy, rewriting each file in place (-m).
	# The loop globs */*.html, i.e. the HTML files one directory level down
	# (`*/.html` would only match files literally named ".html").
	# A single parallel instance after `done` receives every file through a
	# real pipe, so the tidy jobs can run concurrently.
	# The -e guard skips the literal pattern when nothing matches.
	for f in */*.html; do
		[ -e "$f" ] || continue
		printf '%s\0' "$f"
	done | parallel -0 tidy -n -asxml -m {}

	# In a future update, we'll process this XML with XSLT to add all the cross-referencing, links, etc.

	# For now, if you want to have the TOC pages hyperlinked to the relevant file,
	# you can replace the TOC text and page numbers using the relevant page offsets
	# but this is pretty tedious - a better way will follow soon!

	# Examples:

	cd the-report-of-the-iraq-inquiry_introduction
	# offset: 0
	perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4).".html\">".$2."<\/a>".$3."<a href=\"page".($4).".html\">".$4."<\/a>".$5/ge' page1.html

	cd ../the-report-of-the-iraq-inquiry_executive-summary
	# offset: 4
	perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4+4).".html\">".$2."<\/a>".$3."<a href=\"page".($4+4).".html\">".$4."<\/a>".$5/ge' page5.html

	cd ../the-report-of-the-iraq-inquiry_section-11
	# offset: -20
	perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4-20).".html\">".$2."<\/a>".$3."<a href=\"page".($4-20).".html\">".$4."<\/a>".$5/ge' page1.html
	perl -pi -e 's/(<div.+?><span.+?>)([^.]+)([. ]+)(\d+)(<\/span><\/div>)$/$1."<a href=\"page".($4-20).".html\">".$2."<\/a>".$3."<a href=\"page".($4-20).".html\">".$4."<\/a>".$5/ge' page2.html

	# etc, etc