Skip to content

Instantly share code, notes, and snippets.

@Dragost
Last active July 2, 2025 20:27
Show Gist options
  • Save Dragost/4b58cc53de8ebca75a8fbef87c43c0ba to your computer and use it in GitHub Desktop.
Save Dragost/4b58cc53de8ebca75a8fbef87c43c0ba to your computer and use it in GitHub Desktop.
PDF Manual Downloader Script <manualpdf.es>
import os
import re
import shutil
from functools import partial
from multiprocessing import Pool
from urllib.parse import urlparse
import jinja2
import progressbar
import requests
from InquirerPy import prompt
from playwright.sync_api import sync_playwright
from pypdf import PdfMerger
# Working directory where the intermediate per-page HTML and PDF files are stored.
TEMP_FOLDER = 'temp'
# Jinja2 template for a single manual page: three stylesheet slots (site CSS,
# viewer base CSS, per-page CSS) plus the page markup, wrapped for printing.
PRINT_TEMPLATE = """<html><head><meta charset="UTF-8"><style>{{custom_css}}</style><style>{{base_css}}</style><style>{{page_css}}</style></head><body><a name="{{page}}"></a><div class="viewer-page"><div class="page-{{page}} pf w0 h0">{{content}}</div></div></body></html>"""
def create_folder_if_not_exists(folder: str) -> None:
    """Create *folder* (including parents) if it does not already exist.

    Uses ``exist_ok=True`` instead of a check-then-create pair, which
    avoids the race where the folder appears between the check and the
    ``makedirs`` call.
    """
    os.makedirs(folder, exist_ok=True)
def sanitize(input_string: str) -> str:
    """Make *input_string* safe for use inside a file name.

    Slashes are turned into underscores first, then every remaining
    character that is not alphanumeric or an underscore is dropped.

    Args:
        input_string (str): The string to be sanitized.
    Returns:
        str: The sanitized string.
    """
    with_underscores = input_string.replace('/', '_')
    return re.sub(r'[^\w_]', '', with_underscores)
def get_domain(url: str) -> str:
    """Return the scheme + host part of *url* (no path, query or fragment)."""
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"
def get_manual_url() -> str:
    """Ask the user for the manual URL; strip any fragment and query string."""
    answer = prompt([{
        'type': 'input',
        'name': 'url',
        'message': 'Enter Manual PDF url:',
    }])
    raw_url = answer.get('url')
    # Drop '#fragment' first, then '?query', keeping only the clean URL.
    return raw_url.split('#')[0].split('?')[0]
def get_data(url: str) -> dict:
    """Process url and return a dictionary with the manual's metadata.

    The page content is built client-side, so it is rendered with a
    headless browser before scraping the file id, title, page count and
    the viewer stylesheet.

    Returns:
        dict with keys ``file_id``, ``title``, ``total_pages``,
        ``custom_css``.
    Raises:
        ValueError: if the expected markers are not found in the page
            (clearer than the bare ``AttributeError`` on ``None.group``).
    """
    # NOTE: the original fetched the URL with requests first, but that
    # result was immediately overwritten by the rendered content — removed.
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url)
            page.wait_for_load_state("networkidle")
            html = page.content()
            file_id_match = re.search(r'viewer/([\d/]+)/1/bg1', html)
            pages = re.search(r'<title>(.*)\(.*?(\d+).*?\)</title>', html)
            if file_id_match is None or pages is None:
                raise ValueError('Could not extract manual data from page')
            file_id = file_id_match.group(1)
            title = pages.group(1).strip()
            total_pages = int(pages.group(2))
            css_url = page.locator(
                "link[rel='stylesheet'][href*='_nuxt/entry']").get_attribute("href")
        finally:
            # The original leaked the browser process; always close it.
            browser.close()
    if css_url and "http" not in css_url:
        # Relative stylesheet href: prefix with "scheme://host" from the URL.
        css_url = url.split("/")[0] + "//" + url.split("/")[2] + css_url
    custom_css = requests.get(css_url).text
    return dict(file_id=file_id,
                title=title,
                total_pages=total_pages,
                custom_css=custom_css)
def replace_urls_to_absolute(url_viewer: str, content: str) -> str:
    """Rewrite relative asset references in *content* as absolute URLs.

    Both ``src="..."`` attributes and CSS ``src:url(...)`` values are
    prefixed with the viewer base URL.
    """
    for marker in ('src="', 'src:url('):
        content = content.replace(marker, marker + url_viewer)
    return content
def get_html_page(domain: str, file_id: str, page: int) -> str:
    """Download one manual page's markup and absolutize its asset links."""
    url_viewer = f"{domain}/viewer/{file_id}/{page}/"
    # The viewer serves each page as a standalone ".page" file.
    raw = requests.get(f"{url_viewer}page-{page}.page").text
    return replace_urls_to_absolute(url_viewer=url_viewer, content=raw)
def generate_page(domain: str, file_id: str, page: int, content: str,
                  path: str, landscape: bool, custom_css: str):
    """Render one manual page to HTML via jinja2, then convert it to PDF."""
    url_viewer = f"{domain}/viewer/{file_id}/{page}/"

    def fetch_css(css_url: str) -> str:
        # Download a stylesheet and point its relative urls at the viewer.
        css = requests.get(css_url).text
        return replace_urls_to_absolute(url_viewer=url_viewer, content=css)

    base_css = fetch_css("https://www.manualpdf.es/css/base.css")
    page_css = fetch_css(
        f"https://www.manualpdf.es/viewer/{file_id}/{page}/page.css")
    html = jinja2.Template(PRINT_TEMPLATE).render(file_id=file_id,
                                                  page=page,
                                                  content=content,
                                                  custom_css=custom_css,
                                                  base_css=base_css,
                                                  page_css=page_css)
    # Persist the rendered HTML next to the PDFs, zero-padded for sorting.
    file_name = f'{sanitize(file_id)}_{page:04}.html'
    with open(path + '/' + file_name, 'w', encoding='utf-8') as f:
        f.write(html)
    generate_pdf(path, file_name, landscape)
def generate_pdf(path: str, file_name: str, landscape: bool = False):
    """Print a local HTML file to an A4 PDF using headless Chromium."""
    html_abs_path = os.path.abspath(path + '/' + file_name)
    pdf_name = file_name.split('.')[0] + '.pdf'
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_context().new_page()
        # Load the just-written HTML from disk via a file:// URL.
        page.goto(f"file://{html_abs_path}")
        page.pdf(path=f'{path}/{pdf_name}', format="A4", landscape=landscape)
        browser.close()
def join_pdf_pages(path: str, file_id: str, title: str, out_path: str):
    """Join all pdf pages in a single pdf file and return its path.

    Uses ``PdfWriter`` instead of ``PdfMerger``: ``PdfMerger`` was
    deprecated and removed in pypdf 5.0 (it raises ``DeprecationError``
    there), while ``PdfWriter.append`` exists from pypdf 3.x onwards, so
    this works with both the pinned and current pypdf releases.
    """
    # Zero-padded page numbers in the names make a plain sort page-ordered.
    pdfs = sorted(path + '/' + f for f in os.listdir(path)
                  if f.endswith('.pdf'))
    from pypdf import PdfWriter  # local import keeps the module header unchanged
    writer = PdfWriter()
    for pdf in pdfs:
        writer.append(pdf)
    safe_title = re.sub(r'[^\w\s]', '', title).replace(' ', '_')
    out_file_path = out_path + '/' + f'{sanitize(file_id)}_{safe_title}.pdf'
    writer.write(out_file_path)
    writer.close()
    return out_file_path
def delete_temp_folder():
    """Ask for confirmation and, if granted, remove the temp folder."""
    answer = prompt([{
        'type': 'confirm',
        'name': 'ok',
        'message': '¿All ok? Delete temp folder?',
        'default': True
    }])
    if answer.get('ok'):
        shutil.rmtree(TEMP_FOLDER)
def process_page(domain: str, file_id: str, page: int, wpath: str,
                 landscape: bool, custom_css: str):
    """Download and process a single page; returns the page number."""
    page_html = get_html_page(domain, file_id, page)
    generate_page(domain, file_id, page, page_html, wpath, landscape,
                  custom_css)
    return page
if __name__ == '__main__':
    # Create temp folder if not exists
    wpath = os.path.abspath(TEMP_FOLDER)
    create_folder_if_not_exists(wpath)
    # Enter url
    url = get_manual_url()
    # Get data from url
    try:
        domain = get_domain(url)
        print("Loading manual data...")
        pdf_data = get_data(url)
        file_id = pdf_data['file_id']
    except Exception as e:
        print(e)
        print('Error: pdf data not found')
        exit()
    # Ask continue downloading file
    print(f'{pdf_data["title"]} with {pdf_data["total_pages"]} pages')
    continue_question = [{
        'type': 'confirm',
        'name': 'continue',
        # Fixed: was an f-string with no placeholders.
        'message': 'Continue downloading file?',
        'default': True
    }]
    continue_answer = prompt(continue_question)
    if not continue_answer.get('continue'):
        exit()
    # Create a per-manual subfolder inside the temp folder
    wpath = wpath + f'/{sanitize(file_id)}'
    create_folder_if_not_exists(wpath)
    # Files in temp folder, used to skip already-downloaded pages (resume)
    generated_files = [f for f in os.listdir(wpath) if f.endswith('.pdf')]
    # Ask for landscape
    landscape_question = [{
        'type': 'confirm',
        'name': 'landscape',
        'message': '¿Landscape?',
        'default': False
    }]
    landscape_answer = prompt(landscape_question)
    landscape = landscape_answer.get('landscape')
    # Ask for multiprocessing
    multiprocessing_question = [{
        'type': 'confirm',
        'name': 'multiprocessing',
        'message': '¿Multiprocessing?',
        'default': True
    }]
    multiprocessing_answer = prompt(multiprocessing_question)
    if multiprocessing_answer.get('multiprocessing'):
        # Use multiprocessing to download and process pages in parallel
        total_pages = pdf_data['total_pages']
        pages_to_process = [
            page for page in range(1, total_pages + 1)
            if f'{sanitize(file_id)}_{page:04}.pdf' not in generated_files
        ]
        with progressbar.ProgressBar(max_value=len(pages_to_process)) as bar:
            bar.update(0)
            with Pool() as pool:
                # imap_unordered feeds each page number as the positional
                # `page` argument of process_page; the rest come via partial.
                for i, _ in enumerate(
                        pool.imap_unordered(
                            partial(process_page,
                                    domain,
                                    file_id,
                                    wpath=wpath,
                                    landscape=landscape,
                                    custom_css=pdf_data["custom_css"]),
                            pages_to_process), 1):
                    bar.update(i)
    else:
        with progressbar.ProgressBar(max_value=pdf_data['total_pages']) as bar:
            bar.update(0)
            for page in range(1, pdf_data['total_pages'] + 1):
                # If pdf page already exists, skip it
                if f'{sanitize(file_id)}_{page:04}.pdf' in generated_files:
                    bar.update(page - 1)
                    continue
                # Generate html page and convert it to PDF
                generate_page(domain,
                              file_id,
                              page,
                              get_html_page(domain, file_id, page),
                              wpath,
                              landscape,
                              custom_css=pdf_data["custom_css"])
                bar.update(page - 1)
    # Join all pdf pages in a single pdf file
    out_path = os.path.abspath('output')
    create_folder_if_not_exists(out_path)
    out_file = join_pdf_pages(wpath, file_id, pdf_data['title'], out_path)
    # Open the merged PDF. Fixed: os.system(f'open {out_file}') broke on
    # paths containing spaces and interpolated the path into a shell string;
    # an argument list with shell=False is safe. (`open` is macOS-specific,
    # as in the original.)
    import subprocess
    subprocess.run(['open', out_file], check=False)
    # Delete temp folder if ok
    delete_temp_folder()

PDF Manual Downloader Script

This script is designed to download PDF manuals from wdhmedia websites. It should run on any operating system that supports Python, although it has not been tested on all of them.

This is not the most correct or fastest way to download PDFs, but it works.

Tested with python 3.10 and Poetry.

Websites

Install

poetry install
poetry run playwright install

Usage

❯ poetry run python main.py
? Enter Manual PDF url: https://www.manualpdf.es/ikea/renodlad/manual
Manual Ikea RENODLAD with 28 pages
? Continue downloading file? Yes
100% (28 of 28) |############################################| Elapsed Time: 0:00:00 Time:  0:00:00
? ¿All ok? Delete temp folder? Yes

Output

The downloaded PDF manuals will be saved in the `output` folder.

License

pdf_manual_downloader is licensed under the GNU General Public License version 3.0.

[tool.poetry]
name = "pdf_manual_downloader"
version = "1.1.0"
description = "Descargador de PDFs de los chicos del maíz"
authors = ["Alberto <[email protected]>"]
license = "GPL"
package-mode = false
[tool.poetry.dependencies]
python = "^3.10"
inquirerpy = "^0.3.4"
progressbar2 = "^4.2.0"
pypdf = "^3.5.0"
jinja2 = "^3.1.5"
requests = "^2.32.3"
playwright = "^1.49.1"
lxml = "^6.0.0"
@ukleon
Copy link

ukleon commented Mar 22, 2025

having this error:

Continue downloading file? Yes ? ¿Landscape? No ? ¿Multiprocessing? Yes 100% (83 of 83) |#################################################| Elapsed Time: 0:00:22 Time: 0:00:22 Traceback (most recent call last): File "c:\Users\ukleo\Downloads\code\main.py", line 300, in out_file = join_pdf_pages(wpath, file_id, pdf_data['title'], out_path) File "c:\Users\ukleo\Downloads\code\main.py", line 165, in join_pdf_pages merger = PdfMerger() File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_merger.py", line 42, in init deprecation_with_replacement("PdfMerger", "PdfWriter", "5.0.0") ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_utils.py", line 392, in deprecation_with_replacement deprecation( ~~~~~~~~~~~^ f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead." ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_utils.py", line 379, in deprecation raise DeprecationError(msg) pypdf.errors.DeprecationError: PdfMerger is deprecated and was removed in pypdf 5.0.0. Use PdfWriter instead.

Fixed using:

pip uninstall pypdf
pip install pypdf==4.3.1

@frozenspider
Copy link

JFYI, default 30s timeout of page.wait_for_load_state is not enough if the manual is large (such as 202-page https://www.manua.ls/honda/nc750x-2025/manual) and/or your internet connection isn't great, so if you're seeing something like

Loading manual data...
Timeout 30000ms exceeded.
Error: pdf data not found

It helps to specify a timeout manually.
Inside get_data function, change page.wait_for_load_state("networkidle") for, say, page.wait_for_load_state("networkidle", timeout = 240000) (value in ms)

@Goochy12
Copy link

Goochy12 commented Apr 24, 2025

Have the manual websites changed the way the process URLs? None of the URLs in the comments above lead anywhere, and even when refreshing the page of a manual I am looking at - they all go to "Page Not Found".

Might explain why I'm getting this error when trying to use the script:

poetry run python main.py
? Enter Manual PDF url: https://www.usermanuals.au/kia/optima-2018/manual
Loading manual data...
'NoneType' object has no attribute 'group'
Error: pdf data not found

@Dragost
Copy link
Author

Dragost commented Apr 24, 2025

Have the manual websites changed the way the process URLs? None of the URLs in the comments above lead anywhere, and even when refreshing the page of a manual I am looking at - they all go to "Page Not Found".

Might explain why I'm getting this error when trying to use the script:

poetry run python main.py
? Enter Manual PDF url: https://www.usermanuals.au/kia/optima-2018/manual
Loading manual data...
'NoneType' object has no attribute 'group'
Error: pdf data not found

This is actually happening because the website is returning a 500 error when accessing the URL. This could be due to two reasons: either something on the site is broken, or they have intentionally blocked direct access to the link. We can wait a couple of days to see if they fix it, or I can modify the script to bypass this limitation.

@Dragost
Copy link
Author

Dragost commented Apr 25, 2025

Have the manual websites changed the way the process URLs? None of the URLs in the comments above lead anywhere, and even when refreshing the page of a manual I am looking at - they all go to "Page Not Found".
Might explain why I'm getting this error when trying to use the script:

poetry run python main.py
? Enter Manual PDF url: https://www.usermanuals.au/kia/optima-2018/manual
Loading manual data...
'NoneType' object has no attribute 'group'
Error: pdf data not found

This is actually happening because the website is returning a 500 error when accessing the URL. This could be due to two reasons: either something on the site is broken, or they have intentionally blocked direct access to the link. We can wait a couple of days to see if they fix it, or I can modify the script to bypass this limitation.

@Goochy12 Today it is working properly again.

@luuimora
Copy link

luuimora commented May 4, 2025

It worked very well! Thank you very much. I used it on Fedora Linux 42, installed as written here and used multiproccesing.

@Im2RR
Copy link

Im2RR commented May 26, 2025

whenever I try to install poetry, I always get error like this :

PS C:\Users\act_i> poetry install
Installing dependencies from lock file

Package operations: 1 install, 0 updates, 0 removals

• Installing inquirerpy (0.3.4): Failed

KeyError

"There is no item named 'InquirerPy-0.3.4.dist-info/WHEEL' in the archive"

at anaconda3\Lib\zipfile.py:1476 in getinfo
1472│ def getinfo(self, name):
1473│ """Return the instance of ZipInfo given 'name'."""
1474│ info = self.NameToInfo.get(name)
1475│ if info is None:
→ 1476│ raise KeyError(
1477│ 'There is no item named %r in the archive' % name)
1478│
1479│ return info
1480│

I already try uninstalling & re-installing many times, I also tried using pip3 install InquirerPy besides poetry command, but still doesn't work. Btw, playwright already successfully installed

@lanoph
Copy link

lanoph commented Jun 16, 2025

Hello.
I get the following error:

Error: pdf data not found
PS C:\Manual> poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/kyocera/taskalfa-358ci/manual
Loading manual data...
Locator.get_attribute: Timeout 30000ms exceeded.
Call log:

  • waiting for locator("link[rel='stylesheet'][href*='_nuxt/manual']")

Error: pdf data not found

how can I fix this?

@UlrickDwarf
Copy link

I got the same problem as lanoph... any suggestion? Many Thanks...

@Dragost
Copy link
Author

Dragost commented Jul 2, 2025

@lanoph @UlrickDwarf The code has been updated — please reinstall and try again.

@UlrickDwarf
Copy link

Impressive!!! Many thanks for the prompt resolution...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment