Created
May 9, 2025 20:22
-
-
Save me-suzy/80099e8eb4a349f0cae4efa320e509d0 to your computer and use it in GitHub Desktop.
Adună și adaugă toate fișierele HTML din folder într-un singur PDF (scoate diacriticele)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
import unicodedata

from bs4 import BeautifulSoup
from PyPDF2 import PdfMerger
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
# Translation table that reduces text to plain printable ASCII.
#
# Every key is a single character and every value is the ASCII replacement
# (possibly empty, possibly several characters).  Keys are written as escape
# sequences so the table stays unambiguous even in editors that hide
# zero-width or bidirectional characters, and each key appears exactly once.
CHAR_MAP = {
    # Romanian letters, in both the comma-below and the legacy cedilla forms.
    '\u0103': 'a', '\u0102': 'A',  # a with breve
    '\u00e2': 'a', '\u00c2': 'A',  # a with circumflex
    '\u00e3': 'a',                 # a with tilde
    '\u00ee': 'i', '\u00ce': 'I',  # i with circumflex
    '\u0219': 's', '\u0218': 'S',  # s with comma below
    '\u015f': 's', '\u015e': 'S',  # s with cedilla
    '\u021b': 't', '\u021a': 'T',  # t with comma below
    '\u0163': 't', '\u0162': 'T',  # t with cedilla

    # Typographic quotes and apostrophes.
    '\u2018': "'", '\u2019': "'",  # single curly quotes
    '\u201c': '"', '\u201d': '"',  # double curly quotes
    '\u201a': ',', '\u201e': '"',  # low-9 quotes
    '\u00b4': "'",                 # acute accent used as an apostrophe
    '\u2039': '<', '\u203a': '>',  # single angle quotes
    '\u00ab': '<<', '\u00bb': '>>',  # double angle quotes

    # Dashes, bullets and other punctuation.
    '\u2013': '-', '\u2014': '--',  # en dash, em dash
    '\u2026': '...',                # horizontal ellipsis
    '\u2022': '*',                  # bullet
    '\u00d7': 'x',                  # multiplication sign
    '\u00ad': '-',                  # soft hyphen

    # Special spaces and separators become a plain space.
    '\u00a0': ' ', '\u202f': ' ', '\u205f': ' ',
    '\u2028': ' ', '\u2029': ' ',

    # Byte-order mark is dropped.
    '\ufeff': '',

    # NOTE(review): these two code points sit in the Canadian Aboriginal
    # Syllabics block; they are dropped as in the original table -- confirm
    # that this is intentional.
    '\u1440': '', '\u1449': '',

    # Invisible formatting characters are dropped.
    **{chr(cp): '' for cp in range(0x200B, 0x2010)},  # U+200B..U+200F
    **{chr(cp): '' for cp in range(0x202A, 0x202F)},  # U+202A..U+202E
    **{chr(cp): '' for cp in range(0x2060, 0x2065)},  # U+2060..U+2064
    **{chr(cp): '' for cp in range(0x2066, 0x2070)},  # U+2066..U+206F

    # Combining marks are dropped so decomposed letters keep only their base.
    **{chr(cp): '' for cp in range(0x0300, 0x0350)},  # U+0300..U+034F
}
def safe_char(c):
    """Return a printable-ASCII replacement for the single character *c*.

    Resolution order:

    1. An explicit entry in ``CHAR_MAP`` wins.
    2. Printable ASCII (codes 32-126) is returned unchanged.
    3. Any other whitespace (tab, carriage return, exotic Unicode spaces)
       becomes a plain space, so that indentation in the source text does
       not show up as ``?`` marks.
    4. An accented letter whose canonical decomposition starts with an
       ASCII letter is reduced to that letter (for example e-acute -> e).
    5. Everything else becomes ``'?'``.

    Args:
        c: A string of length one.

    Returns:
        A string of zero or more printable ASCII characters.
    """
    if c in CHAR_MAP:
        return CHAR_MAP[c]

    if 32 <= ord(c) <= 126:
        return c

    if c.isspace():
        return ' '

    # Canonical decomposition splits an accented letter into its base letter
    # followed by combining marks.  Only a plain ASCII *letter* is accepted
    # as the base, so symbols such as the not-equal sign are not silently
    # turned into a different symbol.
    base = unicodedata.normalize('NFD', c)[0]
    if 'a' <= base <= 'z' or 'A' <= base <= 'Z':
        return base

    return '?'
def safe_text(text):
    """Return *text* with each character replaced by its ``safe_char`` form."""
    converted = [safe_char(ch) for ch in text]
    return ''.join(converted)
# Page layout used for every generated PDF (units are PDF points).
_PAGE_MARGIN = 50
_LINE_HEIGHT = 15
_TITLE_GAP = 30
_FONT_NAME = "Helvetica"
_TITLE_FONT_SIZE = 14
_BODY_FONT_SIZE = 10

# Encodings tried in order when reading an HTML file.  The Central European
# code pages come before latin-1 because latin-1 accepts every byte sequence
# and would otherwise hide them, mis-decoding Romanian letters.
_HTML_ENCODINGS = ('utf-8', 'cp1250', 'iso-8859-2', 'cp1252', 'latin-1')


def _find_html_files(html_folder_path):
    """Return the sorted paths of the HTML-like files directly in the folder."""
    all_files = os.listdir(html_folder_path)
    print(f"Found {len(all_files)} total files in directory")

    html_files = []
    # os.listdir gives no ordering guarantee; sorting keeps the page order of
    # the merged PDF stable between runs.
    for name in sorted(all_files):
        name_lower = name.lower()
        looks_like_html = (
            name_lower.endswith(('.html', '.htm'))
            or 'vizualizare' in name_lower
            or 'html-articol=' in name_lower
        )
        path = os.path.join(html_folder_path, name)
        # Skip directories that merely happen to match the name filter.
        if looks_like_html and os.path.isfile(path):
            html_files.append(path)

    print(f"Found {len(html_files)} HTML files to convert")
    return html_files


def _read_html(html_file):
    """Return the decoded contents of *html_file*, or None if nothing decodes it."""
    for encoding in _HTML_ENCODINGS:
        try:
            with open(html_file, 'r', encoding=encoding) as f:
                html_content = f.read()
        except UnicodeDecodeError:
            continue
        print(f" Successfully read file with {encoding} encoding")
        return html_content
    return None


def _draw_line(c, line, y, page_height):
    """Draw one body line at height *y* and return the height for the next line."""
    c.drawString(_PAGE_MARGIN, y, line)
    y -= _LINE_HEIGHT
    if y < _PAGE_MARGIN:
        # Start a new page and re-apply the body font before continuing.
        c.showPage()
        c.setFont(_FONT_NAME, _BODY_FONT_SIZE)
        y = page_height - _PAGE_MARGIN
    return y


def _write_text_pdf(pdf_file, title, text):
    """Write *title* followed by the word-wrapped *text* to a new A4 PDF."""
    c = canvas.Canvas(pdf_file, pagesize=A4)
    width, height = A4
    max_width = width - 2 * _PAGE_MARGIN

    y = height - _PAGE_MARGIN
    c.setFont(_FONT_NAME, _TITLE_FONT_SIZE)
    c.drawString(_PAGE_MARGIN, y, title)
    y -= _TITLE_GAP

    c.setFont(_FONT_NAME, _BODY_FONT_SIZE)
    for line in text.split('\n'):
        if not line.strip():
            continue
        words = safe_text(line).split()
        if not words:
            continue

        # Greedy word wrap: keep appending words while the line still fits.
        line_buffer = ""
        for word in words:
            candidate = f"{line_buffer} {word}" if line_buffer else word
            if c.stringWidth(candidate, _FONT_NAME, _BODY_FONT_SIZE) < max_width:
                line_buffer = candidate
                continue
            # A single over-long word leaves the buffer empty; do not waste
            # a line drawing nothing in that case.
            if line_buffer:
                y = _draw_line(c, line_buffer, y, height)
            line_buffer = word

        if line_buffer:
            y = _draw_line(c, line_buffer, y, height)

    c.save()


def _merge_pdfs(pdf_files, final_pdf_path):
    """Merge the readable, non-empty PDFs into *final_pdf_path*.

    Returns the number of PDFs merged; nothing is written when it is zero.
    """
    merger = PdfMerger()
    successful_merges = 0
    try:
        for pdf_file in pdf_files:
            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
                continue
            try:
                merger.append(pdf_file)
                successful_merges += 1
            except Exception as e:
                # One unreadable PDF must not abort the whole merge.
                print(f"Error appending {os.path.basename(pdf_file)}: {e}")
        if successful_merges:
            merger.write(final_pdf_path)
    finally:
        # Always release the merger, including on the failure paths.
        merger.close()
    return successful_merges


def convert_html_to_pdf(html_folder_path):
    """Convert every HTML file in a folder to a PDF and merge the results.

    Each HTML file is reduced to its text, rendered as a plain ASCII PDF
    under ``<folder>/pdf_files`` and finally all PDFs are merged into
    ``<folder>/final.pdf``.  Files are processed in sorted name order.
    Progress and errors are reported with ``print``.

    Args:
        html_folder_path: Path to the folder containing HTML files.

    Returns:
        True when the merged PDF was written, False on any fatal problem
        (missing folder, no HTML files, nothing converted, nothing merged).
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        html_files = _find_html_files(html_folder_path)
    except OSError as e:
        print(f"Error listing files: {e}")
        return False

    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    pdf_files = []
    for number, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{number:04d}_{base_name}.pdf")
        print(f"Converting {base_name} to PDF... ({number}/{len(html_files)})")
        try:
            html_content = _read_html(html_file)
            if html_content is None:
                print(f" Skipping file due to encoding issues: {base_name}")
                continue

            soup = BeautifulSoup(html_content, 'html.parser')
            # Script and style bodies are not readable content; remove them
            # so they cannot leak into the extracted text.
            for tag in soup(['script', 'style']):
                tag.decompose()
            text = soup.get_text()

            shown_name = base_name[:40] + "..." if len(base_name) > 40 else base_name
            _write_text_pdf(pdf_file, safe_text(shown_name), text)

            pdf_files.append(pdf_file)
            print(f" Successfully converted {base_name}")
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f" Error converting {base_name}: {e}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    print(f"Merging {len(pdf_files)} PDFs into a single file...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    successful_merges = _merge_pdfs(pdf_files, final_pdf_path)
    if successful_merges == 0:
        print("Error: No PDFs could be merged.")
        return False

    print(f"Successfully merged {successful_merges} PDFs")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")
    return True
if __name__ == "__main__":
    # The folder comes from the first command-line argument; without one the
    # user is prompted for it.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        folder_path = input("Enter the path to the folder containing HTML files: ")
        # A typed or pasted path often carries stray whitespace or quotes.
        folder_path = folder_path.strip().strip('"\'')

    # Propagate the outcome to the calling shell instead of always exiting 0.
    sys.exit(0 if convert_html_to_pdf(folder_path) else 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment