Skip to content

Instantly share code, notes, and snippets.

@jwoglom
Created October 13, 2020 02:35
Show Gist options
  • Save jwoglom/361a1051bfb8168ae69acafcc568005b to your computer and use it in GitHub Desktop.
Save jwoglom/361a1051bfb8168ae69acafcc568005b to your computer and use it in GitHub Desktop.
Download Perusall readings as PDF
# Template for data.py: paste the console.info output of the browser script here
# before running the download script below.
title = "The title of the article"
urls="""
<image URLs scraped from the page>
"""
# dependencies: imagemagick (convert), img2pdf
# Downloads every image chunk listed in data.py, stacks them 6-at-a-time into
# page images with ImageMagick, then bundles the pages into a single PDF.
from data import title, urls

import os
import subprocess

import requests

# Directory named after the title (spaces replaced so it is shell-friendly).
folder = title.replace(' ', '-')
if not os.path.exists(folder):
    os.mkdir(folder)

# Step 1: download each chunk (one URL per line in `urls`; blank lines skipped).
i = 0
for u in urls.splitlines():
    if u:
        print('Downloading chunk', i, 'of', title)
        resp = requests.get(u.strip())
        resp.raise_for_status()  # fail fast instead of saving an HTML error page as .png
        # `with` guarantees the file handle is closed (the original leaked it)
        with open('{}/{:0>2}.png'.format(folder, i), 'wb') as f:
            f.write(resp.content)
        i += 1

# Step 2: vertically append chunks in groups of 6 into page images.
pgno = 1
for j in range(0, i, 6):
    chunk_files = ['{}/{:0>2}.png'.format(folder, k) for k in range(j, min(i, j + 6))]
    print('Converting page', pgno)
    # List-form subprocess.run avoids the shell entirely, so paths containing
    # spaces or metacharacters cannot break the command (os.system did not quote).
    subprocess.run(['convert', '-append', *chunk_files,
                    '{}/page_{}.png'.format(folder, pgno)], check=True)
    pgno += 1

# Step 3: combine all page images into <title>.pdf.
print('Converting to pdf')
page_files = ['{}/page_{}.png'.format(folder, k) for k in range(1, pgno)]
subprocess.run(['img2pdf', *page_files, '-o', '{}.pdf'.format(title)], check=True)
print('Done')
/*
 * Click on a reading in the Perusall web interface,
 * and run this script in the developer console.
 * Copy-and-paste the console.info output to data.py.
 *
 * Repeatedly scrolls the last chunk image into view (triggering lazy-loading)
 * and dumps all chunk URLs once the count stops growing.
 */
var len = 0;    // chunk count observed on the previous tick
var times = 0;  // consecutive ticks with no new chunks
var i = setInterval(() => {
  var img = document.querySelectorAll("img.chunk");
  if (img.length === 0) return;          // nothing rendered yet; don't crash on img[-1]
  img[img.length - 1].scrollIntoView();  // nudge the page to lazy-load more chunks
  if (len < img.length) {
    len = img.length;
    times = 0;  // growth resumed; restart the settle counter so a stall mid-load doesn't fire early
  } else if (times > 3) {
    // Stable for >3 ticks (~8s): assume fully loaded, emit data.py contents.
    var urls = [];
    img.forEach((e) => urls.push(e.src));
    var spl = location.pathname.split('/');
    console.info('urls = """\n'+urls.join('\n')+'\n"""\n\ntitle="'+spl[spl.length-1]+'"\n');
    clearInterval(i);
  } else {
    times++;
  }
}, 2000);
@aarontbk
Copy link

aarontbk commented Apr 24, 2025

Amazing script. I had to modify the read URL and the script itself a bit because it was skipping some pages. I used ChatGPT, of course :)

`/*
 * Click on a reading in the Perusall web interface,
 * and run this script in the developer console.
 * Copy-and-paste the console.info output to data.py.
 */
var len = 0; 
var times = 0;
var i = setInterval(() => { 
  // Scroll the last chunk into view to trigger lazy-loading of the rest.
  var img = document.querySelectorAll("img.chunk"); img[img.length-1].scrollIntoView(); 
  if (len < img.length) {
    len = img.length;
  } else if (times > 3) {
    // Chunk count unchanged for >3 consecutive ticks: dump data.py contents.
    var urls = [];
    img.forEach((e) => urls.push(e.src));
    var spl = location.pathname.split('/');
    console.info('urls = """\n'+urls.join('\n')+'\n"""\n\ntitle="'+spl[spl.length-1]+'"\n');
    clearInterval(i);
  } else {
      times++;
  }
}, 2000);`
`# dependencies: requests, img2pdf, pillow
from data import title, urls
import os
import requests
import img2pdf

folder = title.replace(' ', '-')
if not os.path.exists(folder):
    os.mkdir(folder)

# Step 1: Download all images
i = 0
for u in urls.splitlines():
    if u:
        print('Downloading chunk', i, 'of', title)
        with open(f'{folder}/{i:02}.png', 'wb') as f:
            f.write(requests.get(u.strip()).content)
        i += 1

# Step 2: Stack every 6 images vertically and save as page
pgno = 1
from PIL import Image

for j in range(0, i, 6):
    images = []
    for k in range(j, min(i, j + 6)):
        img = Image.open(f'{folder}/{k:02}.png')
        images.append(img)

    # Stack vertically
    widths, heights = zip(*(img.size for img in images))
    max_width = max(widths)
    total_height = sum(heights)

    combined = Image.new('RGB', (max_width, total_height))
    y_offset = 0
    for img in images:
        combined.paste(img, (0, y_offset))
        y_offset += img.height

    combined.save(f'{folder}/page_{pgno}.png')
    pgno += 1

# Step 3: Convert all pages to PDF
print('Converting to PDF...')
page_files = [f'{folder}/page_{k}.png' for k in range(1, pgno)]
with open(f'{title}.pdf', 'wb') as f:
    f.write(img2pdf.convert(page_files))

print('Done.')
`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment