Skip to content

Instantly share code, notes, and snippets.

@Dragost
Last active April 25, 2025 08:08
Show Gist options
  • Save Dragost/4b58cc53de8ebca75a8fbef87c43c0ba to your computer and use it in GitHub Desktop.
Save Dragost/4b58cc53de8ebca75a8fbef87c43c0ba to your computer and use it in GitHub Desktop.
PDF Manual Downloader Script <manualpdf.es>
import os
import re
import shutil
from functools import partial
from multiprocessing import Pool
from urllib.parse import urlparse
import jinja2
import progressbar
import requests
from InquirerPy import prompt
from playwright.sync_api import sync_playwright
from pypdf import PdfMerger
# Folder (resolved against the current working directory by the script) that
# holds the per-page HTML/PDF files; it is removed by delete_temp_folder().
TEMP_FOLDER = 'temp'
# Jinja2 template for a single printable page. generate_page() renders it with
# `custom_css`, `base_css`, `page_css`, `page` and `content`.
PRINT_TEMPLATE = """<html><head><meta charset="UTF-8"><style>{{custom_css}}</style><style>{{base_css}}</style><style>{{page_css}}</style></head><body><a name="{{page}}"></a><div class="viewer-page"><div class="page-{{page}} pf w0 h0">{{content}}</div></div></body></html>"""
def create_folder_if_not_exists(folder: str) -> None:
    """Create ``folder`` (including missing parents) unless it already exists.

    Args:
        folder (str): Path of the directory to create.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of a separate
    # os.path.exists() test (two processes creating the folder at once).
    os.makedirs(folder, exist_ok=True)
def sanitize(input_string: str) -> str:
    """
    Turn an identifier into a string that is safe to use in file names.

    Every '/' becomes '_' first; afterwards every character that is not a
    word character (letter, digit or underscore) is dropped.

    Args:
        input_string (str): The string to be sanitized.

    Returns:
        str: The sanitized string.
    """
    with_underscores = input_string.replace('/', '_')
    return re.sub(r'[^\w_]', '', with_underscores)
def get_domain(url: str) -> str:
    """Return the ``scheme://netloc`` part of ``url`` (no path, query or fragment)."""
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"
def get_manual_url() -> str:
    """Ask the user for the manual URL and return it without fragment or query.

    Returns:
        str: The entered URL, cut at the first '#' and then at the first '?'.
    """
    answers = prompt([{
        'type': 'input',
        'name': 'url',
        'message': 'Enter Manual PDF url:',
    }])
    raw_url = answers.get('url')
    without_fragment = raw_url.split('#')[0]
    return without_fragment.split('?')[0]
def get_data(url: str) -> dict:
    """Collect the metadata needed to download a manual.

    The page HTML is fetched with ``requests`` and scanned for the viewer file
    id and for the ``<title>`` tag; the page is then opened in headless
    Chromium to locate the manual stylesheet, which is downloaded as text.

    Args:
        url (str): URL of the manual page.

    Returns:
        dict: ``file_id`` (digits and slashes found between ``viewer/`` and
        ``/1/bg1``), ``title`` (title text before the parenthesised suffix),
        ``total_pages`` (first run of digits inside that suffix, as int) and
        ``custom_css`` (text of the manual stylesheet).

    Raises:
        ValueError: If the file id, the title/page count or the stylesheet
            link cannot be found.
    """
    from urllib.parse import urljoin

    # A timeout keeps an unresponsive server from hanging the script forever.
    html = requests.get(url, timeout=30).text

    file_id_match = re.search(r'viewer/([\d/]+)/1/bg1', html)
    if file_id_match is None:
        raise ValueError('viewer file id not found in page HTML')
    title_match = re.search(r'<title>(.*)\(.*?(\d+).*?\)</title>', html)
    if title_match is None:
        raise ValueError('title and page count not found in page <title>')

    file_id = file_id_match.group(1)
    title = title_match.group(1).strip()
    total_pages = int(title_match.group(2))

    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url)
            page.wait_for_load_state("networkidle")
            css_url = page.locator(
                "link[rel='stylesheet'][href*='_nuxt/manual']"
            ).get_attribute("href")
        finally:
            # Close the browser even when navigation or the lookup fails.
            browser.close()

    if not css_url:
        raise ValueError('manual stylesheet link not found')
    # The href may be relative to the page; urljoin leaves absolute URLs as is.
    custom_css = requests.get(urljoin(url, css_url), timeout=30).text

    return dict(file_id=file_id,
                title=title,
                total_pages=total_pages,
                custom_css=custom_css)
def replace_urls_to_absolute(url_viewer: str, content: str) -> str:
    """Prefix relative ``src="..."`` and ``src:url(...)`` targets with ``url_viewer``.

    Targets that are already absolute (they start with a URL scheme such as
    ``https:`` or ``data:``, or with ``//``) are left untouched; prefixing
    them would produce a broken URL.

    Args:
        url_viewer (str): Base URL (ending in '/') to prepend.
        content (str): HTML or CSS text to rewrite.

    Returns:
        str: ``content`` with relative targets made absolute.
    """
    pattern = r'(src="|src:url\()(?![a-zA-Z][a-zA-Z0-9+.\-]*:|//)'
    # A function replacement inserts url_viewer literally, so backslashes or
    # group references inside it are never interpreted by re.sub.
    return re.sub(pattern, lambda m: m.group(1) + url_viewer, content)
def get_html_page(domain: str, file_id: str, page: int) -> str:
    """Download the HTML fragment of one manual page.

    Args:
        domain (str): Base ``scheme://host`` of the manual site.
        file_id (str): Viewer file id of the manual.
        page (int): 1-based page number.

    Returns:
        str: The page content with relative links rewritten to absolute ones.

    Raises:
        requests.exceptions.RequestException: If the download still fails
            after the last attempt.
    """
    import time

    url_viewer = f"{domain}/viewer/{file_id}/{page}/"
    page_url = f"{url_viewer}page-{page}.page"

    # The server occasionally drops connections; retry a few times with a
    # short, growing pause instead of aborting the whole download.
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            content = requests.get(page_url, timeout=30).text
            break
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            if attempt == max_attempts:
                raise
            time.sleep(attempt)

    # replace relative links to absolute links
    return replace_urls_to_absolute(url_viewer=url_viewer, content=content)
def generate_page(domain: str, file_id: str, page: int, content: str,
                  path: str, landscape: bool, custom_css: str):
    """Render one manual page to HTML on disk and convert it to PDF.

    Downloads the base and per-page stylesheets, rewrites their relative
    links, renders PRINT_TEMPLATE, writes the result as UTF-8 into ``path``
    and finally calls generate_pdf() on the written file.

    NOTE(review): both stylesheets are requested from www.manualpdf.es
    regardless of ``domain`` -- confirm this is intended.
    """
    url_viewer = f"{domain}/viewer/{file_id}/{page}/"

    def fetch_css(css_url: str) -> str:
        # Download a stylesheet and make its relative links absolute.
        return replace_urls_to_absolute(url_viewer=url_viewer,
                                        content=requests.get(css_url).text)

    base_css = fetch_css("https://www.manualpdf.es/css/base.css")
    page_css = fetch_css(
        f"https://www.manualpdf.es/viewer/{file_id}/{page}/page.css")

    rendered = jinja2.Template(PRINT_TEMPLATE).render(
        file_id=file_id,
        page=page,
        content=content,
        custom_css=custom_css,
        base_css=base_css,
        page_css=page_css,
    )

    # Persist the HTML, then turn it into a PDF next to it
    file_name = f'{sanitize(file_id)}_{page:04}.html'
    with open(f'{path}/{file_name}', 'w', encoding='utf-8') as html_file:
        html_file.write(rendered)
    generate_pdf(path, file_name, landscape)
def generate_pdf(path: str, file_name: str, landscape: bool = False):
    """Print a local HTML file to an A4 PDF using headless Chromium.

    Args:
        path (str): Folder containing the HTML file; the PDF is written there.
        file_name (str): Name of the HTML file inside ``path``.
        landscape (bool): Print in landscape orientation when True.
    """
    from pathlib import Path

    apath = os.path.abspath(path + '/' + file_name)
    out_name = file_name.split('.')[0] + '.pdf'
    # as_uri() builds a valid file URI on every platform (drive letters on
    # Windows, percent-encoding of spaces and other special characters).
    file_url = Path(apath).as_uri()

    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            context = browser.new_context()
            page = context.new_page()
            # Load local HTML
            page.goto(file_url)
            # Generate PDF file
            page.pdf(path=f'{path}/{out_name}',
                     format="A4",
                     landscape=landscape)
        finally:
            # Release the browser even when loading or printing fails.
            browser.close()
def join_pdf_pages(path: str, file_id: str, title: str, out_path: str):
    """Merge every ``.pdf`` file found in ``path`` into a single document.

    Files are appended in sorted name order. The output is written to
    ``out_path`` and named after the sanitized ``file_id`` and the title
    (punctuation removed, spaces replaced by underscores).

    Returns:
        str: Path of the merged PDF file.
    """
    page_files = sorted(path + '/' + name for name in os.listdir(path)
                        if name.endswith('.pdf'))

    merger = PdfMerger()
    for page_file in page_files:
        merger.append(page_file)

    safe_title = re.sub(r'[^\w\s]', '', title).replace(' ', '_')
    out_file_path = f'{out_path}/{sanitize(file_id)}_{safe_title}.pdf'
    merger.write(out_file_path)
    merger.close()
    return out_file_path
def delete_temp_folder():
    """Ask for confirmation and, if given, remove TEMP_FOLDER recursively."""
    answer = prompt([{
        'type': 'confirm',
        'name': 'ok',
        'message': '¿All ok? Delete temp folder?',
        'default': True
    }])
    if not answer.get('ok'):
        return
    shutil.rmtree(TEMP_FOLDER)
def process_page(domain: str, file_id: str, page: int, wpath: str,
                 landscape: bool, custom_css: str):
    """Fetch one page and generate its HTML/PDF files in ``wpath``.

    Returns:
        int: The page number that was processed.
    """
    html_content = get_html_page(domain, file_id, page)
    generate_page(domain=domain,
                  file_id=file_id,
                  page=page,
                  content=html_content,
                  path=wpath,
                  landscape=landscape,
                  custom_css=custom_css)
    return page
if __name__ == '__main__':
    # Only the script entry point needs these modules.
    import subprocess
    import sys

    # Create temp folder if not exists
    wpath = os.path.abspath(TEMP_FOLDER)
    create_folder_if_not_exists(wpath)

    # Enter url
    url = get_manual_url()

    # Get data from url
    try:
        domain = get_domain(url)
        print("Loading manual data...")
        pdf_data = get_data(url)
        file_id = pdf_data['file_id']
    except Exception as e:
        print(e)
        print('Error: pdf data not found')
        # Non-zero status so shells and scripts can detect the failure.
        sys.exit(1)

    total_pages = pdf_data['total_pages']

    # Ask continue downloading file
    print(f'{pdf_data["title"]} with {total_pages} pages')
    continue_question = [{
        'type': 'confirm',
        'name': 'continue',
        'message': 'Continue downloading file?',
        'default': True
    }]
    if not prompt(continue_question).get('continue'):
        sys.exit(0)

    # Create file_id folder
    file_prefix = sanitize(file_id)
    wpath = wpath + f'/{file_prefix}'
    create_folder_if_not_exists(wpath)

    # Pages whose PDF already exists in the temp folder are skipped; a set
    # makes each membership test O(1).
    generated_files = {f for f in os.listdir(wpath) if f.endswith('.pdf')}
    pages_to_process = [
        page for page in range(1, total_pages + 1)
        if f'{file_prefix}_{page:04}.pdf' not in generated_files
    ]

    # Ask for landscape
    landscape_question = [{
        'type': 'confirm',
        'name': 'landscape',
        'message': '¿Landscape?',
        'default': False
    }]
    landscape = prompt(landscape_question).get('landscape')

    # Ask for multiprocessing
    multiprocessing_question = [{
        'type': 'confirm',
        'name': 'multiprocessing',
        'message': '¿Multiprocessing?',
        'default': True
    }]
    use_multiprocessing = prompt(multiprocessing_question).get(
        'multiprocessing')

    if use_multiprocessing:
        # Download and process the pending pages in parallel
        worker = partial(process_page,
                         domain,
                         file_id,
                         wpath=wpath,
                         landscape=landscape,
                         custom_css=pdf_data["custom_css"])
        with progressbar.ProgressBar(max_value=len(pages_to_process)) as bar:
            bar.update(0)
            with Pool() as pool:
                finished = enumerate(
                    pool.imap_unordered(worker, pages_to_process), 1)
                for done, _ in finished:
                    bar.update(done)
    else:
        pending = set(pages_to_process)
        with progressbar.ProgressBar(max_value=total_pages) as bar:
            bar.update(0)
            for page in range(1, total_pages + 1):
                # If pdf page already exists, skip it
                if page in pending:
                    process_page(domain,
                                 file_id,
                                 page,
                                 wpath=wpath,
                                 landscape=landscape,
                                 custom_css=pdf_data["custom_css"])
                # Report the number of pages completed so far (not page - 1),
                # so the bar reaches the total after the last page.
                bar.update(page)

    # Join all pdf pages in a single pdf file
    out_path = os.path.abspath('output')
    create_folder_if_not_exists(out_path)
    out_file = join_pdf_pages(wpath, file_id, pdf_data['title'], out_path)

    # Open pdf file with the platform's default viewer. An argument list
    # (no shell) keeps paths containing spaces or shell metacharacters intact.
    try:
        if sys.platform.startswith('win'):
            os.startfile(out_file)
        elif sys.platform == 'darwin':
            subprocess.run(['open', out_file], check=False)
        else:
            subprocess.run(['xdg-open', out_file], check=False)
    except OSError:
        # No viewer/launcher available: tell the user where the file is.
        print(f'PDF saved to {out_file}')

    # Delete temp folder if ok
    delete_temp_folder()

PDF Manual Downloader Script

This script is designed to download PDF manuals from wdhmedia websites. It can be run on any operating system that supports Python, maybe.

This is not the most correct or fastest way to download PDFs, but it works.

Tested with python 3.10 and Poetry.

Websites

Install

poetry install
poetry run playwright install

Usage

❯ poetry run python main.py
? Enter Manual PDF url: https://www.manualpdf.es/ikea/renodlad/manual
Manual Ikea RENODLAD with 28 pages
? Continue downloading file? Yes
100% (28 of 28) |############################################| Elapsed Time: 0:00:00 Time:  0:00:00
? ¿All ok? Delete temp folder? Yes

Output

The downloaded PDF manuals will be saved in the `output` folder.

License

pdf_manual_downloader is licensed under the GNU General Public License version 3.0.

[tool.poetry]
name = "pdf_manual_downloader"
version = "1.1.0"
description = "Descargador de PDFs de los chicos del maíz"
authors = ["Alberto <[email protected]>"]
license = "GPL"
[tool.poetry.dependencies]
python = "^3.10"
lxml = "^4.9.2"
inquirerpy = "^0.3.4"
progressbar2 = "^4.2.0"
pypdf = "^3.5.0"
jinja2 = "^3.1.5"
requests = "^2.32.3"
playwright = "^1.49.1"
@richaardvark
Copy link

Hello! 🙂 I apologize if this is not the appropriate place for this message but I am running everything as instructed and it runs just fine but will not download my manual file even though it says the site hosting that manual file is supported. ☹

poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/pioneer/dmh-w2770nex/manual
Error: pdf data not found

Do you have any suggestions on how to download this manual using the script by chance? Thank you for making this awesome tool!!

@sylven
Copy link

sylven commented Oct 21, 2023

The regular expression to find the file_id no longer works.
You need to replace from main.py, line 36:
file_id = re.search(r'file_id:(\d+),', html).group(1)
with:
file_id = re.search(r'viewer/(\d+)', html).group(1)

Also make sure you have the latest version of the pyhtml2pdf library installed. You can use the following command to upgrade it:
poetry run pip install --upgrade pyhtml2pdf

@tapio80
Copy link

tapio80 commented Jun 22, 2024

I'm trying to use this script to download manual but I get this:

PS C:\Program Files\pdf_manual_downloader_script> poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/ground-zero/gzcr-755mon/manual
User manual Ground Zero GZCR 755MON with 42 pages
? Continue downloading file? Yes

Traceback (most recent call last):
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 154, in <module>
    generate_page(file_id, page, get_html_page(file_id, page), wpath)
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 67, in generate_page
    generate_pdf(path, file_name)
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 75, in generate_pdf
    converter.convert(f'file:///{apath}',
  File "C:\Users\xxx\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-1Gmo21HH-py3.12\Lib\site-packages\pyhtml2pdf\converter.py", line 36, in convert
    result = __get_pdf_from_html(
             ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xxx\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-1Gmo21HH-py3.12\Lib\site-packages\pyhtml2pdf\converter.py", line 74, in __get_pdf_from_html
    driver = webdriver.Chrome(
             ^^^^^^^^^^^^^^^^^
TypeError: WebDriver.__init__() got multiple values for argument 'options'
PS C:\Program Files\pdf_manual_downloader_script>

Any idea what I should do?

@Dragost
Copy link
Author

Dragost commented Jun 23, 2024

I'm trying to use this script to download manual but I get this:

PS C:\Program Files\pdf_manual_downloader_script> poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/ground-zero/gzcr-755mon/manual
User manual Ground Zero GZCR 755MON with 42 pages
? Continue downloading file? Yes

Traceback (most recent call last):
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 154, in <module>
    generate_page(file_id, page, get_html_page(file_id, page), wpath)
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 67, in generate_page
    generate_pdf(path, file_name)
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 75, in generate_pdf
    converter.convert(f'file:///{apath}',
  File "C:\Users\xxx\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-1Gmo21HH-py3.12\Lib\site-packages\pyhtml2pdf\converter.py", line 36, in convert
    result = __get_pdf_from_html(
             ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\xxx\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-1Gmo21HH-py3.12\Lib\site-packages\pyhtml2pdf\converter.py", line 74, in __get_pdf_from_html
    driver = webdriver.Chrome(
             ^^^^^^^^^^^^^^^^^
TypeError: WebDriver.__init__() got multiple values for argument 'options'
PS C:\Program Files\pdf_manual_downloader_script>

Any idea what I should do?

As @sylven pointed out, the regex had become obsolete, and it's necessary to upgrade the version of the pyhtml2pdf library. With that, it should work for you. I'll update the Gist.

@tapio80
Copy link

tapio80 commented Jun 23, 2024

I did correct that line 36 from main.py before but I downloaded corrected files again.

I used poetry run pip install --upgrade pyhtml2pdf command again and now I get this:

PS C:\Program Files\pdf_manual_downloader_script> poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/ground-zero/gzcr-755mon/manual
User manual Ground Zero GZCR 755MON with 42 pages
? Continue downloading file? Yes
  0% (0 of 42) |                                                                                                                                       | Elapsed Time: 0:00:00 ETA:  --:--:--
Traceback (most recent call last):
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 154, in <module>
    generate_page(file_id, page, get_html_page(file_id, page), wpath)
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 66, in generate_page
    f.write(html)
  File "C:\Program Files\Python\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\ue01c' in position 9364: character maps to <undefined>
PS C:\Program Files\pdf_manual_downloader_script>

@Dragost
Copy link
Author

Dragost commented Jun 23, 2024

I did correct that line 36 from main.py before but I downloaded corrected files again.

I used poetry run pip install --upgrade pyhtml2pdf command again and now I get this:

PS C:\Program Files\pdf_manual_downloader_script> poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/ground-zero/gzcr-755mon/manual
User manual Ground Zero GZCR 755MON with 42 pages
? Continue downloading file? Yes
  0% (0 of 42) |                                                                                                                                       | Elapsed Time: 0:00:00 ETA:  --:--:--
Traceback (most recent call last):
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 154, in <module>
    generate_page(file_id, page, get_html_page(file_id, page), wpath)
  File "C:\Program Files\pdf_manual_downloader_script\main.py", line 66, in generate_page
    f.write(html)
  File "C:\Program Files\Python\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\ue01c' in position 9364: character maps to <undefined>
PS C:\Program Files\pdf_manual_downloader_script>

I have updated the code to force the codec not to use the default Windows one. Additionally, I have added the option to generate the PDF in landscape format. Try again.

@tapio80
Copy link

tapio80 commented Jun 23, 2024

Now it worked. Though the first 3 pages were not correctly in landscape and did not include whole page. But it's ok in this case because I don't need that german part of the manual.

@Dragost
Copy link
Author

Dragost commented Jun 24, 2024

Hello! 🙂 I apologize if this is not the appropriate place for this message but I am running everything as instructed and it runs just fine but will not download my manual file even though it says the site hosting that manual file is supported. ☹

poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/pioneer/dmh-w2770nex/manual
Error: pdf data not found

Do you have any suggestions on how to download this manual using the script by chance? Thank you for making this awesome tool!!

Sorry for the delayed response. I have fixed an issue with complex identifiers like the one in your manual. It should now work with those manuals. I have also added multiprocessing to improve download speeds. Best regards.

@iiAlphaWolf
Copy link

iiAlphaWolf commented Jul 23, 2024

Hello, I tried to follow every hint I could find but I keep getting errors on line 233. Please can you help me?

As far as I know I did exactly what @sylven said as well.

############################

ralph@ubuntu:~/Desktop/manuals$ poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/toyota/paseo-1992/manual
User manual Toyota Paseo (1992) with 1061 pages
? Continue downloading file? Yes
? ¿Landscape? Yes
? ¿Multiprocessing? No
0% (1 of 1061) | | Elapsed Time: 0:00:07 ETA: 2:14:57
Traceback (most recent call last):
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 789, in urlopen
response = self._make_request(
^^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request
response = conn.getresponse()
^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 464, in getresponse
httplib_response = super().getresponse()
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/http/client.py", line 1428, in getresponse
response.begin()
File "/usr/lib/python3.12/http/client.py", line 331, in begin
version, status, reason = self._read_status()
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/http/client.py", line 300, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/adapters.py", line 667, in send
resp = conn.urlopen(
^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 843, in urlopen
retries = retries.increment(
^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/util/retry.py", line 474, in increment
raise reraise(type(error), error, _stacktrace)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/util/util.py", line 38, in reraise
raise value.with_traceback(tb)
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 789, in urlopen
response = self._make_request(
^^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request
response = conn.getresponse()
^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 464, in getresponse
httplib_response = super().getresponse()
^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/http/client.py", line 1428, in getresponse
response.begin()
File "/usr/lib/python3.12/http/client.py", line 331, in begin
version, status, reason = self._read_status()
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/http/client.py", line 300, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
urllib3.exceptions.ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/home/ralph/Desktop/manuals/main.py", line 223, in
generate_page(file_id, page, get_html_page(file_id, page),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ralph/Desktop/manuals/main.py", line 70, in get_html_page
content = requests.get(f"{url_page}page-{p}.page").text
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/api.py", line 73, in get
return request("get", url, params=params, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/sessions.py", line 589, in request
resp = self.send(prep, **send_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/sessions.py", line 703, in send
r = adapter.send(request, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/adapters.py", line 682, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

@iiAlphaWolf
Copy link

Hello, I tried to follow every hint i could find but i keep getting errors onl ine 233. Please can you help me?

As far as i know i did exactly what @sylven said aswlel.

############################

ralph@ubuntu:~/Desktop/manuals$ poetry run python main.py ? Enter Manual PDF url: https://www.manua.ls/toyota/paseo-1992/manual User manual Toyota Paseo (1992) with 1061 pages ? Continue downloading file? Yes ? ¿Landscape? Yes ? ¿Multiprocessing? No 0% (1 of 1061) | | Elapsed Time: 0:00:07 ETA: 2:14:57 Traceback (most recent call last): File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 789, in urlopen response = self._make_request( ^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request response = conn.getresponse() ^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 464, in getresponse httplib_response = super().getresponse() ^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/http/client.py", line 1428, in getresponse response.begin() File "/usr/lib/python3.12/http/client.py", line 331, in begin version, status, reason = self._read_status() ^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/http/client.py", line 300, in _read_status raise RemoteDisconnected("Remote end closed connection without" http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/adapters.py", line 667, in send resp = conn.urlopen( ^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 843, in urlopen retries = retries.increment( ^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/util/retry.py", line 474, in increment raise reraise(type(error), error, _stacktrace) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/util/util.py", line 38, in reraise raise value.with_traceback(tb) File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 789, in urlopen response = self._make_request( ^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request response = conn.getresponse() ^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 464, in getresponse httplib_response = super().getresponse() ^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/http/client.py", line 1428, in getresponse response.begin() File "/usr/lib/python3.12/http/client.py", line 331, in begin version, status, reason = self._read_status() ^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/http/client.py", line 300, in _read_status raise RemoteDisconnected("Remote end closed connection without" urllib3.exceptions.ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/home/ralph/Desktop/manuals/main.py", line 223, in generate_page(file_id, page, get_html_page(file_id, page), ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/Desktop/manuals/main.py", line 70, in get_html_page content = requests.get(f"{url_page}page-{p}.page").text ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/api.py", line 73, in get return request("get", url, params=params, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/api.py", line 59, in request return session.request(method=method, url=url, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/sessions.py", line 589, in request resp = self.send(prep, **send_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/sessions.py", line 703, in send r = adapter.send(request, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/adapters.py", line 682, in send raise ConnectionError(err, request=request) requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

I got it working by changing the command to python3 instead of python:

poetry run python3 main.py

@w6a8w8
Copy link

w6a8w8 commented Nov 25, 2024

Unfortunately the script does not work for me:

C:\Users\XXXX\Downloads\pythonpdf>poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/beko/wre-7612-xww/manual?p=5
User manual Beko WRE 7612 XWW with 44 pages
? Continue downloading file? Yes
? ¿Landscape? No
? ¿Multiprocessing? No
  0% (0 of 44) |                                                                                                       | Elapsed Time: 0:00:00 ETA:  --:--:--
DevTools listening on ws://127.0.0.1:2441/devtools/browser/1a7c9a61-633a-40b7-9285-9fb3459af693

Traceback (most recent call last):
  File "C:\Users\XXXX\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-B6OelTYV-py3.13\Lib\site-packages\pyhtml2pdf\converter.py", line 84, in __get_pdf_from_html
    WebDriverWait(driver, timeout).until(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\XXXX\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-B6OelTYV-py3.13\Lib\site-packages\selenium\webdriver\support\wait.py", line 105, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\XXXX\Downloads\pythonpdf\main.py", line 223, in <module>
    generate_page(file_id, page, get_html_page(file_id, page),
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                  wpath, landscape)
                  ^^^^^^^^^^^^^^^^^
  File "C:\Users\XXXX\Downloads\pythonpdf\main.py", line 88, in generate_page
    generate_pdf(path, file_name, landscape)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XXXX\Downloads\pythonpdf\main.py", line 96, in generate_pdf
    converter.convert(f'file:///{apath}',
    ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
                      path + '/' + out_name,
                      ^^^^^^^^^^^^^^^^^^^^^^
                      print_options=print_opt)
                      ^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\XXXX\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-B6OelTYV-py3.13\Lib\site-packages\pyhtml2pdf\converter.py", line 38, in convert
    result = __get_pdf_from_html(
        source, timeout, install_driver, print_options)
  File "C:\Users\XXXX\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-B6OelTYV-py3.13\Lib\site-packages\pyhtml2pdf\converter.py", line 95, in __get_pdf_from_html
    result = __send_devtools(
        driver, "Page.printToPDF", calculated_print_options)
  File "C:\Users\XXXX\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-B6OelTYV-py3.13\Lib\site-packages\pyhtml2pdf\converter.py", line 50, in __send_devtools
    url = driver.command_executor._url + resource
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'ChromiumRemoteConnection' object has no attribute '_url'

@xflyboy
Copy link

xflyboy commented Dec 4, 2024

Hi. Thanks for the script.
I got this: Please assist:

C:\Users\Max\AppData\Roaming\Python\Scripts>poetry.exe run python main.py
Traceback (most recent call last):
  File "C:\Users\Max\AppData\Roaming\Python\Scripts\main.py", line 4, in <module>
    import jinja2
ModuleNotFoundError: No module named 'jinja2'

Installed JINJA2 and still.

`C:\Users\Max\AppData\Roaming\Python\Scripts>pip install Jinja2
Collecting Jinja2
Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting MarkupSafe>=2.0 (from Jinja2)
Downloading MarkupSafe-3.0.2-cp311-cp311-win32.whl.metadata (4.1 kB)
Downloading jinja2-3.1.4-py3-none-any.whl (133 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.3/133.3 kB 1.3 MB/s eta 0:00:00
Downloading MarkupSafe-3.0.2-cp311-cp311-win32.whl (15 kB)
Installing collected packages: MarkupSafe, Jinja2
Successfully installed Jinja2-3.1.4 MarkupSafe-3.0.2

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip

C:\Users\Max\AppData\Roaming\Python\Scripts>poetry.exe run python main.py
Traceback (most recent call last):
File "C:\Users\Max\AppData\Roaming\Python\Scripts\main.py", line 4, in
import jinja2
ModuleNotFoundError: No module named 'jinja2'
`

@Fabienjulio
Copy link

Hello, I tried to install it and I get this error at the end:

Writing lock file

Installing the current project: pdf_manual_downloader (1.0.0)
Error: The current project could not be installed: No file/folder found for package pdf-manual-downloader
If you do not want to install the current project use --no-root.
If you want to use Poetry only for dependency management but not for packaging, you can disable package mode by setting package-mode = false in your pyproject.toml file.
If you did intend to install the current project, you may need to set `packages` in your pyproject.toml file.

I then forced it to run anyway, but I get these errors during download:

? Enter Manual PDF url: https://www.manua.ls/toshiba/satellite-r35/manual
User manual Toshiba Satellite R35 with 73 pages
? Continue downloading file? Yes
? ¿Landscape? Yes
? ¿Multiprocessing? No
  0% (0 of 73) |                                                                                                                                     | Elapsed Time: 0:00:00 ETA:  --:--:--
Traceback (most recent call last):
  File "/home/fabien/pdf/main.py", line 223, in <module>
    generate_page(file_id, page, get_html_page(file_id, page),
  File "/home/fabien/pdf/main.py", line 88, in generate_page
    generate_pdf(path, file_name, landscape)
  File "/home/fabien/pdf/main.py", line 96, in generate_pdf
    converter.convert(f'file:///{apath}',
  File "/home/fabien/.cache/pypoetry/virtualenvs/pdf-manual-downloader-SC3ek3-u-py3.12/lib/python3.12/site-packages/pyhtml2pdf/converter.py", line 38, in convert
    result = __get_pdf_from_html(
             ^^^^^^^^^^^^^^^^^^^^
  File "/home/fabien/.cache/pypoetry/virtualenvs/pdf-manual-downloader-SC3ek3-u-py3.12/lib/python3.12/site-packages/pyhtml2pdf/converter.py", line 77, in __get_pdf_from_html
    driver = webdriver.Chrome(service=service, options=webdriver_options)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fabien/.cache/pypoetry/virtualenvs/pdf-manual-downloader-SC3ek3-u-py3.12/lib/python3.12/site-packages/selenium/webdriver/chrome/webdriver.py", line 45, in __init__
    super().__init__(
  File "/home/fabien/.cache/pypoetry/virtualenvs/pdf-manual-downloader-SC3ek3-u-py3.12/lib/python3.12/site-packages/selenium/webdriver/chromium/webdriver.py", line 55, in __init__
    self.service.start()
  File "/home/fabien/.cache/pypoetry/virtualenvs/pdf-manual-downloader-SC3ek3-u-py3.12/lib/python3.12/site-packages/selenium/webdriver/common/service.py", line 108, in start
    self.assert_process_still_running()
  File "/home/fabien/.cache/pypoetry/virtualenvs/pdf-manual-downloader-SC3ek3-u-py3.12/lib/python3.12/site-packages/selenium/webdriver/common/service.py", line 121, in assert_process_still_running
    raise WebDriverException(f"Service {self._path} unexpectedly exited. Status code was: {return_code}")
selenium.common.exceptions.WebDriverException: Message: Service /home/fabien/.wdm/drivers/chromedriver/linux64/114.0.5735.90/chromedriver unexpectedly exited. Status code was: 127

That was on Ubuntu under WSL, but the same situation occurs during dependency installation if I try PowerShell instead, and I get these errors when I force-run it:

? Enter Manual PDF url: https://www.manua.ls/toshiba/satellite-r35/manual
User manual Toshiba Satellite R35 with 73 pages
? Continue downloading file? Yes
? ¿Landscape? Yes
? ¿Multiprocessing? No
  0% (0 of 73) |                                                                                                                                     | Elapsed Time: 0:00:00 ETA:  --:--:--
DevTools listening on ws://127.0.0.1:52591/devtools/browser/a92dcddb-916d-4a81-a3c9-4628e3852a3f

Traceback (most recent call last):
  File "C:\Users\Fabien J. RAJERISON\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-RhmTb8SP-py3.12\Lib\site-packages\pyhtml2pdf\converter.py", line 84, in __get_pdf_from_html
    WebDriverWait(driver, timeout).until(
  File "C:\Users\Fabien J. RAJERISON\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-RhmTb8SP-py3.12\Lib\site-packages\selenium\webdriver\support\wait.py", line 105, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Projects\pdf\main.py", line 223, in <module>
    generate_page(file_id, page, get_html_page(file_id, page),
  File "D:\Projects\pdf\main.py", line 88, in generate_page
    generate_pdf(path, file_name, landscape)
  File "D:\Projects\pdf\main.py", line 96, in generate_pdf
    converter.convert(f'file:///{apath}',
  File "C:\Users\Fabien J. RAJERISON\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-RhmTb8SP-py3.12\Lib\site-packages\pyhtml2pdf\converter.py", line 38, in convert
    result = __get_pdf_from_html(
             ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Fabien J. RAJERISON\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-RhmTb8SP-py3.12\Lib\site-packages\pyhtml2pdf\converter.py", line 95, in __get_pdf_from_html
    result = __send_devtools(
             ^^^^^^^^^^^^^^^^
  File "C:\Users\Fabien J. RAJERISON\AppData\Local\pypoetry\Cache\virtualenvs\pdf-manual-downloader-RhmTb8SP-py3.12\Lib\site-packages\pyhtml2pdf\converter.py", line 50, in __send_devtools
    url = driver.command_executor._url + resource
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'ChromiumRemoteConnection' object has no attribute '_url'

@Acide-Burn
Copy link

Hello !
First, thanks for your script and your time !
Same error as @Fabienjulio to download a PDF (test on my local machine (Debian 11) in venv + test on docker (Debian and Ubuntu latest))

Multiprocess OFF:

Traceback (most recent call last):
  File "//main.py", line 223, in <module>
    generate_page(file_id, page, get_html_page(file_id, page),
  File "//main.py", line 88, in generate_page
    generate_pdf(path, file_name, landscape)
  File "//main.py", line 96, in generate_pdf
    converter.convert(f'file:///{apath}',
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/pyhtml2pdf/converter.py", line 38, in convert
    result = __get_pdf_from_html(
             ^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/pyhtml2pdf/converter.py", line 77, in __get_pdf_from_html
    driver = webdriver.Chrome(service=service, options=webdriver_options)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/selenium/webdriver/chrome/webdriver.py", line 45, in __init__
    super().__init__(
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/selenium/webdriver/chromium/webdriver.py", line 55, in __init__
    self.service.start()
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/selenium/webdriver/common/service.py", line 108, in start
    self.assert_process_still_running()
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/selenium/webdriver/common/service.py", line 121, in assert_process_still_running
    raise WebDriverException(f"Service {self._path} unexpectedly exited. Status code was: {return_code}")
selenium.common.exceptions.WebDriverException: Message: Service /root/.wdm/drivers/chromedriver/linux64/114.0.5735.90/chromedriver unexpectedly exited. Status code was: 127

Multiprocess ON:

Traceback (most recent call last):
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/file_manager.py", line 65, in __extract_zip
    archive.extractall(to_directory)
  File "/usr/lib/python3.11/zipfile.py", line 1690, in extractall
    self._extract_member(zipinfo, path, pwd)
  File "/usr/lib/python3.11/zipfile.py", line 1745, in _extract_member
    shutil.copyfileobj(source, target)
  File "/usr/lib/python3.11/shutil.py", line 197, in copyfileobj
    buf = fsrc_read(length)
          ^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/zipfile.py", line 954, in read
    data = self._read1(n)
           ^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/zipfile.py", line 1022, in _read1
    data += self._read2(n - len(data))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/zipfile.py", line 1057, in _read2
    raise EOFError
EOFError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "//main.py", line 134, in process_page
    generate_page(file_id, page, content, wpath, landscape)
  File "//main.py", line 88, in generate_page
    generate_pdf(path, file_name, landscape)
  File "//main.py", line 96, in generate_pdf
    converter.convert(f'file:///{apath}',
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/pyhtml2pdf/converter.py", line 38, in convert
    result = __get_pdf_from_html(
             ^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/pyhtml2pdf/converter.py", line 76, in __get_pdf_from_html
    service = Service(ChromeDriverManager().install())
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/chrome.py", line 40, in install
    driver_path = self._get_driver_binary_path(self.driver)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/manager.py", line 41, in _get_driver_binary_path
    binary_path = self._cache_manager.save_file_to_cache(driver, file)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/driver_cache.py", line 54, in save_file_to_cache
    files = self.unpack_archive(archive, path)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/driver_cache.py", line 49, in unpack_archive
    return self._file_manager.unpack_archive(archive, path)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/file_manager.py", line 57, in unpack_archive
    return self.__extract_zip(archive_file, target_dir)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/file_manager.py", line 67, in __extract_zip
    if e.args[0] not in [26, 13] and e.args[1] not in [
       ~~~~~~^^^
IndexError: tuple index out of range
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "//main.py", line 203, in <module>
    for i, _ in enumerate(
  File "/usr/lib/python3.11/multiprocessing/pool.py", line 873, in next
    raise value
IndexError: tuple index out of range

Do you have an idea how to fix it?
Thanks !

@asntcrz
Copy link

asntcrz commented Jan 22, 2025

Hello ! First, thanks for your script and your time ! Same error as @Fabienjulio to download a PDF (test on my local machine (Debian 11) in venv + test on docker (Debian and Ubuntu latest))

Multiprocess OFF:

Traceback (most recent call last):
  File "//main.py", line 223, in <module>
    generate_page(file_id, page, get_html_page(file_id, page),
  File "//main.py", line 88, in generate_page
    generate_pdf(path, file_name, landscape)
  File "//main.py", line 96, in generate_pdf
    converter.convert(f'file:///{apath}',
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/pyhtml2pdf/converter.py", line 38, in convert
    result = __get_pdf_from_html(
             ^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/pyhtml2pdf/converter.py", line 77, in __get_pdf_from_html
    driver = webdriver.Chrome(service=service, options=webdriver_options)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/selenium/webdriver/chrome/webdriver.py", line 45, in __init__
    super().__init__(
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/selenium/webdriver/chromium/webdriver.py", line 55, in __init__
    self.service.start()
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/selenium/webdriver/common/service.py", line 108, in start
    self.assert_process_still_running()
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/selenium/webdriver/common/service.py", line 121, in assert_process_still_running
    raise WebDriverException(f"Service {self._path} unexpectedly exited. Status code was: {return_code}")
selenium.common.exceptions.WebDriverException: Message: Service /root/.wdm/drivers/chromedriver/linux64/114.0.5735.90/chromedriver unexpectedly exited. Status code was: 127

Multiprocess ON:

Traceback (most recent call last):
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/file_manager.py", line 65, in __extract_zip
    archive.extractall(to_directory)
  File "/usr/lib/python3.11/zipfile.py", line 1690, in extractall
    self._extract_member(zipinfo, path, pwd)
  File "/usr/lib/python3.11/zipfile.py", line 1745, in _extract_member
    shutil.copyfileobj(source, target)
  File "/usr/lib/python3.11/shutil.py", line 197, in copyfileobj
    buf = fsrc_read(length)
          ^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/zipfile.py", line 954, in read
    data = self._read1(n)
           ^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/zipfile.py", line 1022, in _read1
    data += self._read2(n - len(data))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/zipfile.py", line 1057, in _read2
    raise EOFError
EOFError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "//main.py", line 134, in process_page
    generate_page(file_id, page, content, wpath, landscape)
  File "//main.py", line 88, in generate_page
    generate_pdf(path, file_name, landscape)
  File "//main.py", line 96, in generate_pdf
    converter.convert(f'file:///{apath}',
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/pyhtml2pdf/converter.py", line 38, in convert
    result = __get_pdf_from_html(
             ^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/pyhtml2pdf/converter.py", line 76, in __get_pdf_from_html
    service = Service(ChromeDriverManager().install())
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/chrome.py", line 40, in install
    driver_path = self._get_driver_binary_path(self.driver)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/manager.py", line 41, in _get_driver_binary_path
    binary_path = self._cache_manager.save_file_to_cache(driver, file)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/driver_cache.py", line 54, in save_file_to_cache
    files = self.unpack_archive(archive, path)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/driver_cache.py", line 49, in unpack_archive
    return self._file_manager.unpack_archive(archive, path)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/file_manager.py", line 57, in unpack_archive
    return self.__extract_zip(archive_file, target_dir)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.cache/pypoetry/virtualenvs/pdf-manual-downloader-il7asoJj-py3.11/lib/python3.11/site-packages/webdriver_manager/core/file_manager.py", line 67, in __extract_zip
    if e.args[0] not in [26, 13] and e.args[1] not in [
       ~~~~~~^^^
IndexError: tuple index out of range
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "//main.py", line 203, in <module>
    for i, _ in enumerate(
  File "/usr/lib/python3.11/multiprocessing/pool.py", line 873, in next
    raise value
IndexError: tuple index out of range

Do you have a idea to fix it ? Thanks !

Same error :'(

@Dragost
Copy link
Author

Dragost commented Jan 23, 2025

@Fabienjulio @asntcrz @Acide-Burn

It seems there was an issue with the "pyhtml2pdf" library when using the webdrivers. I have switched to a different library. It now uses "playwright."

Please make sure to reinstall the script and then install playwright properly:

  • poetry install
  • poetry run playwright install

I hope it works for you now!

@ReyesJayMilson
Copy link

PS C:\Users\Milson\Downloads\4b58cc53de8ebca75a8fbef87c43c0ba-cb521474d48ae4c19ce4db2122275585c990f436\4b58cc53de8ebca75a8fbef87c43c0ba-cb521474d48ae4c19ce4db2122275585c990f436> poetry run python main.py
? Enter Manual PDF url: https://www.manua.ls/bajaj/pulsar-180-cc-dts-i-2014/manual
Loading manual data...
Timeout 30000ms exceeded.
Error: pdf data not found

@ukleon
Copy link

ukleon commented Mar 22, 2025

having this error:

Continue downloading file? Yes
? ¿Landscape? No
? ¿Multiprocessing? Yes
100% (83 of 83) |#################################################| Elapsed Time: 0:00:22 Time: 0:00:22
Traceback (most recent call last):
File "c:\Users\ukleo\Downloads\code\main.py", line 300, in
out_file = join_pdf_pages(wpath, file_id, pdf_data['title'], out_path)
File "c:\Users\ukleo\Downloads\code\main.py", line 165, in join_pdf_pages
merger = PdfMerger()
File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_merger.py", line 42, in init
deprecation_with_replacement("PdfMerger", "PdfWriter", "5.0.0")
~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_utils.py", line 392, in deprecation_with_replacement
deprecation(
~~~~~~~~~~~^
f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead."
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_utils.py", line 379, in deprecation
raise DeprecationError(msg)
pypdf.errors.DeprecationError: PdfMerger is deprecated and was removed in pypdf 5.0.0. Use PdfWriter instead.

@ukleon
Copy link

ukleon commented Mar 22, 2025

having this error:

Continue downloading file? Yes ? ¿Landscape? No ? ¿Multiprocessing? Yes 100% (83 of 83) |#################################################| Elapsed Time: 0:00:22 Time: 0:00:22 Traceback (most recent call last): File "c:\Users\ukleo\Downloads\code\main.py", line 300, in out_file = join_pdf_pages(wpath, file_id, pdf_data['title'], out_path) File "c:\Users\ukleo\Downloads\code\main.py", line 165, in join_pdf_pages merger = PdfMerger() File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_merger.py", line 42, in init deprecation_with_replacement("PdfMerger", "PdfWriter", "5.0.0") ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_utils.py", line 392, in deprecation_with_replacement deprecation( ~~~~~~~~~~~^ f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead." ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ File "C:\Users\ukleo\AppData\Roaming\Python\Python313\site-packages\pypdf_utils.py", line 379, in deprecation raise DeprecationError(msg) pypdf.errors.DeprecationError: PdfMerger is deprecated and was removed in pypdf 5.0.0. Use PdfWriter instead.

Fixed using:

pip uninstall pypdf
pip install pypdf==4.3.1

@frozenspider
Copy link

JFYI, the default 30 s timeout of page.wait_for_load_state is not enough if the manual is large (such as the 202-page https://www.manua.ls/honda/nc750x-2025/manual) and/or your internet connection isn't great, so if you're seeing something like

Loading manual data...
Timeout 30000ms exceeded.
Error: pdf data not found

It helps to specify a timeout manually.
Inside the get_data function, replace page.wait_for_load_state("networkidle") with, say, page.wait_for_load_state("networkidle", timeout=240000) (the value is in milliseconds).

@Goochy12
Copy link

Goochy12 commented Apr 24, 2025

Have the manual websites changed the way they process URLs? None of the URLs in the comments above lead anywhere, and even when refreshing the page of a manual I am looking at - they all go to "Page Not Found".

Might explain why I'm getting this error when trying to use the script:

poetry run python main.py
? Enter Manual PDF url: https://www.usermanuals.au/kia/optima-2018/manual
Loading manual data...
'NoneType' object has no attribute 'group'
Error: pdf data not found

@Dragost
Copy link
Author

Dragost commented Apr 24, 2025

Have the manual websites changed the way the process URLs? None of the URLs in the comments above lead anywhere, and even when refreshing the page of a manual I am looking at - they all go to "Page Not Found".

Might explain why I'm getting this error when trying to use the script:

poetry run python main.py
? Enter Manual PDF url: https://www.usermanuals.au/kia/optima-2018/manual
Loading manual data...
'NoneType' object has no attribute 'group'
Error: pdf data not found

This is actually happening because the website is returning a 500 error when accessing the URL. This could be due to two reasons: either something on the site is broken, or they have intentionally blocked direct access to the link. We can wait a couple of days to see if they fix it, or I can modify the script to bypass this limitation.

@Dragost
Copy link
Author

Dragost commented Apr 25, 2025

Have the manual websites changed the way the process URLs? None of the URLs in the comments above lead anywhere, and even when refreshing the page of a manual I am looking at - they all go to "Page Not Found".
Might explain why I'm getting this error when trying to use the script:

poetry run python main.py
? Enter Manual PDF url: https://www.usermanuals.au/kia/optima-2018/manual
Loading manual data...
'NoneType' object has no attribute 'group'
Error: pdf data not found

This is actually happening because the website is returning a 500 error when accessing the URL. This could be due to two reasons: either something on the site is broken, or they have intentionally blocked direct access to the link. We can wait a couple of days to see if they fix it, or I can modify the script to bypass this limitation.

@Goochy12 Today it is working properly again.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment