Last active
March 1, 2024 12:50
-
-
Save bshillingford/6259986edca707ca58dd to your computer and use it in GitHub Desktop.
arxiv2kindle: recompiles an arxiv paper for kindle-sized screens, and sends it to your wifi-enabled kindle
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import lxml.html as html\n", | |
"import re\n", | |
"import urllib\n", | |
"import os, sys, subprocess, os.path\n", | |
"import glob\n", | |
"import IPython.display\n", | |
"import getpass\n", | |
"import tempfile" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Change the following:\n", | |
"The query can be an arXiv URL or any string containing an arXiv ID.\n", | |
"\n", | |
"It will prompt you for the Gmail account's password; note that the account's security settings must permit password-based SMTP logins (Gmail's \"Allow less secure apps\" option, or an app password when 2-Step Verification is enabled) before the Gmail SMTP server can be used with TLS." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"query = \"http://arxiv.org/abs/1511.08228\"\n", | |
"kindle_email = '[email protected]'\n", | |
"your_gmail = '[email protected]'\n", | |
"gmailpass = getpass.getpass()\n", | |
"\n", | |
"# paper settings (decrease width/height to increase font)\n", | |
"landscape = True\n", | |
"width = \"6in\"\n", | |
"height = \"4in\"\n", | |
"margin = \"0.2in\"\n", | |
"# settings for latex geometry package:\n", | |
"if landscape:\n", | |
" geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)\n", | |
"else:\n", | |
" geom_settings = dict(paperwidth=height, paperheight=width, margin=margin)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"----------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"arxiv_id = re.match(r'(http://.*?/)?(?P<id>\\d{4}\\.\\d{4,5}(v\\d{1,2})?)', query).group('id')\n", | |
"arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id\n", | |
"arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id\n", | |
"arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]\n", | |
"arxiv_title = re.sub(r'\\s+', ' ', re.sub(r'^\\[[^]]+\\]\\s*', '', arxiv_pgtitle), re.DOTALL)\n", | |
"arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title, re.DOTALL)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"IPython.display.HTML('''\n", | |
"<h2><a href=\"{abs}\">[{id}] {title}</a><br />\n", | |
"[<a href=\"{pdf}\">pdf</a>]</h2>\n", | |
"'''.format(id=arxiv_id, abs=arxiv_abs, pdf=arxiv_pdf, title=arxiv_title))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"---------------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"d = tempfile.mkdtemp(prefix='arxiv2kindle_')\n", | |
"\n", | |
"url = 'http://arxiv.org/e-print/' + arxiv_id\n", | |
"!wget -O {os.path.join(d, 'src.tar.gz')} --user-agent=\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0\" {url}\n", | |
"\n", | |
"os.chdir(d)\n", | |
"!tar xvf src.tar.gz" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"texfiles = glob.glob(os.path.join(d, '*.tex'))\n", | |
"for texfile in texfiles:\n", | |
" with open(texfile, 'r') as f:\n", | |
" src = f.readlines()\n", | |
" if 'documentclass' in src[0]:\n", | |
" print('correct file: ' + texfile)\n", | |
" break" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# filter comments/newlines for easier debugging:\n", | |
"src = [line for line in src if line[0] != '%' and len(line.strip()) > 0]\n", | |
"\n", | |
"# strip font size, column stuff, and paper size stuff in documentclass line:\n", | |
"src[0] = re.sub(r'\\b\\d+pt\\b', '', src[0])\n", | |
"src[0] = re.sub(r'\\b\\w+column\\b', '', src[0])\n", | |
"src[0] = re.sub(r'\\b\\w+paper\\b', '', src[0])\n", | |
"src[0] = re.sub(r'(?<=\\[),', '', src[0]) # remove extraneous starting commas\n", | |
"src[0] = re.sub(r',(?=[\\],])', '', src[0]) # remove extraneous middle/ending commas\n", | |
"\n", | |
"# find begin{document}:\n", | |
"begindocs = [i for i, line in enumerate(src) if line.startswith(r'\\begin{document}')]\n", | |
"assert(len(begindocs) == 1)\n", | |
"src.insert(begindocs[0], '\\\\usepackage['+','.join(k+'='+v for k,v in geom_settings.items())+']{geometry}\\n')\n", | |
"src.insert(begindocs[0], '\\\\usepackage{times}\\n')\n", | |
"src.insert(begindocs[0], '\\\\pagestyle{empty}\\n')\n", | |
"if landscape:\n", | |
" src.insert(begindocs[0], '\\\\usepackage{pdflscape}\\n')\n", | |
"\n", | |
"# shrink figures to be at most the size of the page:\n", | |
"for i in range(len(src)):\n", | |
" line = src[i]\n", | |
" m = re.search(r'\\\\includegraphics\\[width=([.\\d]+)\\\\(line|text)width\\]', line)\n", | |
" if m:\n", | |
" mul = m.group(1)\n", | |
" src[i] = re.sub(r'\\\\includegraphics\\[width=([.\\d]+)\\\\(line|text)width\\]',\n", | |
" r'\\\\includegraphics[width={mul}\\\\textwidth,height={mul}\\\\textheight,keepaspectratio]'.format(mul=mul),\n", | |
" line)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"os.rename(texfile, texfile+'.bak')\n", | |
"with open(texfile, 'w') as f:\n", | |
" f.writelines(src)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"texout = !pdflatex {texfile} && pdflatex {texfile} && pdflatex {texfile}\n", | |
"texout[-8:]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"pdffilename = texfile[:-4] + '.pdf'\n", | |
"if sys.platform == 'darwin':\n", | |
" os.system('open ' + pdffilename)\n", | |
"else:\n", | |
" os.system('xdg-open ' + pdffilename)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"-------" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from email.mime.application import MIMEApplication\n", | |
"from email.mime.multipart import MIMEMultipart\n", | |
"msg = MIMEMultipart()\n", | |
"pdf_part = MIMEApplication(open(texfile[:-4]+'.pdf', 'rb').read(), _subtype='pdf')\n", | |
"pdf_part.add_header('Content-Disposition', 'attachment', filename=arxiv_id+\"_\"+arxiv_title_scrubbed+\".pdf\")\n", | |
"msg.attach(pdf_part)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import smtplib\n", | |
"import getpass\n", | |
"server = smtplib.SMTP('smtp.gmail.com:587') \n", | |
"server.starttls() \n", | |
"server.login(your_gmail, gmailpass)\n", | |
"server.sendmail(your_gmail, kindle_email, msg.as_string())\n", | |
"server.close()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"------------" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
@sergeyf Good catch, thanks!
Great! (Though, does not work on all papers, for understandable reasons.)
In any case, while I'm in love with Jupyter Notebook, for such tool it seems more natural to have a standalone script (pip-installable?), or something for Calibre.
Gives me following error:
! LaTeX Error: File "eso-pic.sty\" not found.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this great example!
To make the commented out part work you need quadruple backslash: