-
-
Save ishanfdo18098/0b40bd099638fc9b65b82df076d53dd2 to your computer and use it in GitHub Desktop.
Download Perusall readings as PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Template for data.py: replace both values with the output printed by the
# browser-console script. `title` names the output folder and PDF; `urls`
# holds one image URL per line.
title = "The title of the article"
urls = """
<image URLs scraped from the page>
"""
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# dependencies: requests (Python package); imagemagick and img2pdf (command-line tools)
# multithreaded by Ishan
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

import requests

from data import title, urls
# Maximum number of worker threads given to each ThreadPoolExecutor in this script.
NUM_OF_THREADS = 10
def downloadImage(folder, i, u):
    """Download one image chunk and save it as ``<folder>/<i>.png``.

    Args:
        folder: Existing directory that receives the file.
        i: Chunk index; zero-padded to at least two digits in the file name.
        u: Image URL; surrounding whitespace is stripped before the request.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(u.strip(), timeout=60)
    # Fail before touching the disk so an error page is never saved as a PNG.
    response.raise_for_status()
    # The context manager closes the file even if the write fails.
    with open('{}/{:0>2}.png'.format(folder, i), 'wb') as out:
        out.write(response.content)
# Output directory for the downloaded chunks and the stitched pages; spaces in
# the title are replaced with dashes.
folder = title.replace(' ', '-')
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(folder, exist_ok=True)

# `i` ends up as the number of chunks submitted; the stitching step reads it.
i = 0
downloads = []
with ThreadPoolExecutor(max_workers=NUM_OF_THREADS) as exe:
    for u in urls.splitlines():
        # Skip blank and whitespace-only lines so they never consume an index.
        if not u.strip():
            continue
        print('Downloading chunk', i, 'of', title)
        downloads.append(exe.submit(downloadImage, folder, i, u))
        i += 1

# Exceptions raised in a worker stay hidden inside its future until result()
# is called; re-raise them here instead of continuing with missing chunks.
for download in downloads:
    download.result()

# Page numbers of the stitched images start at 1.
pgno = 1
def convertPage(f, folder, pgno):
    """Stack image chunks vertically into ``<folder>/page_<pgno>.png``.

    Runs ImageMagick's ``convert -append`` on the given chunk files.

    Args:
        f: Chunk image paths, either as one whitespace-separated string or
            as an iterable of path strings.
        folder: Directory that receives the stitched page image.
        pgno: Page number used in the output file name.

    Raises:
        subprocess.CalledProcessError: If ``convert`` exits with a non-zero
            status.
        FileNotFoundError: If the ``convert`` executable is not on PATH.
    """
    chunk_files = f.split() if isinstance(f, str) else list(f)
    # An argument list (no shell) keeps paths from being re-interpreted by
    # the shell; check=True turns a failed conversion into an exception.
    subprocess.run(
        ['convert', '-append', *chunk_files,
         '%s/page_%s.png' % (folder, pgno)],
        check=True,
    )
# Stitch every group of up to six consecutive chunks into one page image.
# Relies on `i` (number of chunks), `folder` and `pgno` set earlier in the script.
conversions = []
with ThreadPoolExecutor(max_workers=NUM_OF_THREADS) as exe:
    for j in range(0, i, 6):
        f = ' '.join(
            '{}/{:0>2}.png'.format(folder, k) for k in range(j, min(i, j + 6))
        )
        print('Converting page', pgno)
        conversions.append(exe.submit(convertPage, f, folder, pgno))
        pgno += 1

# Exceptions raised in a worker stay hidden inside its future until result()
# is called; re-raise them before building the PDF.
for conversion in conversions:
    conversion.result()

pages = ['{}/page_{}.png'.format(folder, k) for k in range(1, pgno)]
if not pages:
    # img2pdf has nothing to work on when no chunk was downloaded.
    raise SystemExit('No pages to convert; check the urls in data.py')

print('Converting to pdf')
# Passing an argument list keeps a title containing spaces or shell
# metacharacters as a single output file name.
subprocess.run(['img2pdf', *pages, '-o', title + '.pdf'], check=True)
print('Done')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Click on a reading in the Perusall web interface,
 * and run this script in the developer console.
 * Copy-and-paste the console.info output to data.py.
 *
 * Every 2 seconds the script scrolls the last "img.chunk" element into view
 * (presumably so the page loads further chunks). Once the number of chunks
 * has stopped growing for several polls in a row, it prints the image URLs
 * and the last segment of the page path (used as the title), then stops.
 */
var len = 0;   // highest number of chunk images seen so far
var times = 0; // consecutive polls without any new chunk
var i = setInterval(() => {
  var img = document.querySelectorAll("img.chunk");
  if (img.length === 0) {
    // Nothing to scroll to; give up after a few idle polls instead of
    // throwing a TypeError on every tick forever.
    if (times > 3) {
      console.warn('No img.chunk elements found; open a reading first.');
      clearInterval(i);
    } else {
      times++;
    }
    return;
  }
  img[img.length - 1].scrollIntoView();
  if (len < img.length) {
    len = img.length;
    // New chunks appeared, so restart the idle count: only an uninterrupted
    // run of idle polls means the reading is fully loaded.
    times = 0;
  } else if (times > 3) {
    var urls = Array.from(img, (e) => e.src);
    var spl = location.pathname.split('/');
    console.info('urls = """\n'+urls.join('\n')+'\n"""\n\ntitle="'+spl[spl.length-1]+'"\n');
    clearInterval(i);
  } else {
    times++;
  }
}, 2000);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment