Created
May 9, 2025 20:25
-
-
Save me-suzy/74a1ffc852646bde960732e9a1f14e82 to your computer and use it in GitHub Desktop.
Adună și adaugă toate fișierele HTML din folder într-un singur PDF (Claude.ai)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from PyPDF2 import PdfMerger | |
from bs4 import BeautifulSoup | |
from reportlab.pdfgen import canvas | |
from reportlab.lib.pagesizes import A4 | |
from reportlab.pdfbase import pdfmetrics | |
from reportlab.pdfbase.ttfonts import TTFont | |
# Character normalisations applied AFTER HTML character references have been
# decoded by replace_html_entities().
#
# Decoding of the references themselves (named such as "&acirc;" or numeric
# such as "&#259;") is delegated to the standard library, so this table only
# needs the substitutions that are not plain entity decoding. Entries that
# mapped a character to itself were no-ops and have been dropped.
HTML_ENTITIES = {
    '\u015f': '\u0219',  # s with cedilla -> s with comma below
    '\u0163': '\u021b',  # t with cedilla -> t with comma below
    '\u201d': '"',       # right curly double quote -> straight quote
    '\u201c': '"',       # left curly double quote -> straight quote
    # NOTE(review): assumed to be the no-break space produced by "&nbsp;";
    # the reviewed copy showed only a blank character here - confirm.
    '\u00a0': ' ',
}


def replace_html_entities(text):
    """Decode HTML character references in *text* and normalise some characters.

    Args:
        text: The string to clean up.

    Returns:
        A new string in which every HTML character reference has been
        replaced by the character it denotes, and every key of
        ``HTML_ENTITIES`` has been replaced by its mapped value.
    """
    # Function-scope import so this block does not depend on the file header.
    from html import unescape

    # Decode first: the table below is keyed on decoded characters.
    text = unescape(text)
    for source, target in HTML_ENTITIES.items():
        text = text.replace(source, target)
    return text
def find_and_register_unicode_font():
    """Register the first usable TrueType font from a fixed candidate list.

    The candidates are tried in order; the first one that exists on disk and
    registers without error wins.

    Returns:
        The name under which the font was registered (the file name without
        its extension), or ``"Helvetica"`` when no candidate could be
        registered.
    """
    # Candidate font files, in order of preference.
    font_paths = [
        # Windows fonts
        "C:/Windows/Fonts/arial.ttf",
        "C:/Windows/Fonts/cour.ttf",  # Courier New
        "C:/Windows/Fonts/times.ttf",  # Times New Roman
        "C:/Windows/Fonts/segoeui.ttf",  # Segoe UI
        "C:/Windows/Fonts/DejaVuSans.ttf",
        # Linux fonts
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/TTF/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
        # macOS fonts
        "/Library/Fonts/Arial Unicode.ttf",
        "/System/Library/Fonts/LucidaGrande.ttc",
    ]

    for font_path in font_paths:
        if not os.path.exists(font_path):
            continue
        font_name = os.path.splitext(os.path.basename(font_path))[0]
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_path))
        except Exception as exc:
            # Best effort: a broken or unsupported font file must not abort
            # the search, but Ctrl-C / SystemExit are no longer swallowed and
            # the reason for the failure is reported.
            print(f"Failed to register font: {font_path} ({exc})")
            continue
        print(f"Registered font: {font_name} from {font_path}")
        return font_name

    print("No suitable Unicode font found. Using built-in Helvetica (diacritics may not show correctly).")
    return "Helvetica"
# Page layout shared by the PDF rendering helpers below (values in points).
_PAGE_MARGIN = 50
_TITLE_FONT_SIZE = 14
_TITLE_GAP = 30
_BODY_FONT_SIZE = 10
_BODY_LINE_STEP = 15


def _looks_like_html(file_name):
    """Return True when *file_name* matches one of the HTML name filters."""
    lowered = file_name.lower()
    return (lowered.endswith(('.html', '.htm'))
            or 'vizualizare' in lowered
            or 'html-articol=' in lowered)


def _draw_body_line(pdf, line, y, font_name, page_height):
    """Draw one body line at height *y* and return the height for the next line."""
    try:
        pdf.drawString(_PAGE_MARGIN, y, line)
    except Exception:
        # Best effort: one unrenderable line must not lose the whole document.
        print(f" Error writing line, skipping")
        return y
    y -= _BODY_LINE_STEP
    if y < _PAGE_MARGIN:
        # Out of room: start a new page. showPage() is followed by an
        # explicit setFont so the body font is in effect on the new page.
        pdf.showPage()
        pdf.setFont(font_name, _BODY_FONT_SIZE)
        y = page_height - _PAGE_MARGIN
    return y


def _render_text_pdf(pdf_file, title, text, font_name):
    """Write *text* to *pdf_file* as word-wrapped A4 pages headed by *title*."""
    page_width, page_height = A4
    max_text_width = page_width - 2 * _PAGE_MARGIN

    pdf = canvas.Canvas(pdf_file, pagesize=A4)
    y = page_height - _PAGE_MARGIN

    # Title line, with a fixed fallback if the real title cannot be drawn.
    pdf.setFont(font_name, _TITLE_FONT_SIZE)
    try:
        pdf.drawString(_PAGE_MARGIN, y, title)
    except Exception:
        print(f" Error writing title, using fallback")
        pdf.setFont("Helvetica", _TITLE_FONT_SIZE)
        pdf.drawString(_PAGE_MARGIN, y, "Document Title")
    y -= _TITLE_GAP

    pdf.setFont(font_name, _BODY_FONT_SIZE)
    for source_line in text.split('\n'):
        words = source_line.split()
        if not words:
            # Blank (or whitespace-only) source lines are not rendered.
            continue

        line_buffer = ""
        for word in words:
            candidate = f"{line_buffer} {word}" if line_buffer else word
            try:
                fits = pdf.stringWidth(candidate, font_name, _BODY_FONT_SIZE) < max_text_width
            except Exception:
                print(f" Error processing word '{word}', skipping")
                continue
            if fits:
                line_buffer = candidate
                continue
            # Flush what has been collected so far. The guard avoids emitting
            # an empty line when a single word is wider than the page.
            if line_buffer:
                y = _draw_body_line(pdf, line_buffer, y, font_name, page_height)
            line_buffer = word

        if line_buffer:
            y = _draw_body_line(pdf, line_buffer, y, font_name, page_height)

    pdf.save()


def _merge_pdfs(pdf_files, final_pdf_path):
    """Merge the non-empty files in *pdf_files* into *final_pdf_path*.

    Returns the number of files merged; nothing is written when it is zero.
    """
    merger = PdfMerger()
    merged_count = 0
    try:
        for pdf_file in pdf_files:
            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
                continue
            try:
                merger.append(pdf_file)
            except Exception as e:
                print(f"Error appending {os.path.basename(pdf_file)}: {e}")
            else:
                merged_count += 1
        if merged_count:
            merger.write(final_pdf_path)
    finally:
        # Always release the merger, including on the "nothing merged" path.
        merger.close()
    return merged_count


def convert_html_to_pdf(html_folder_path):
    """Convert every HTML file in a folder to PDF and merge the results.

    Each matching file is rendered as plain, word-wrapped text into
    ``<folder>/pdf_files/NNNN_<name>.pdf``; the individual PDFs are then
    merged, in file-name order, into ``<folder>/final.pdf``.

    Args:
        html_folder_path: Path to the folder containing HTML files.

    Returns:
        True when the merged PDF was written, False when the folder does not
        exist, cannot be listed, contains no HTML files, or nothing could be
        converted or merged.
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

    # Validate the input before doing any work with side effects.
    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        # Sorted so the page order of the merged PDF is reproducible;
        # os.listdir() returns entries in arbitrary order.
        all_files = sorted(os.listdir(html_folder_path))
    except OSError as e:
        print(f"Error listing files: {e}")
        return False
    print(f"Found {len(all_files)} total files in directory")

    html_files = []
    for file_name in all_files:
        candidate = os.path.join(html_folder_path, file_name)
        # isfile() keeps sub-directories with matching names out of the list.
        if _looks_like_html(file_name) and os.path.isfile(candidate):
            html_files.append(candidate)
    print(f"Found {len(html_files)} HTML files to convert")

    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    font_name = find_and_register_unicode_font()

    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    pdf_files = []
    total = len(html_files)
    for position, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{position:04d}_{base_name}.pdf")
        print(f"Converting {base_name} to PDF... ({position}/{total})")
        try:
            # Hand the raw bytes to BeautifulSoup so it can honour the
            # charset the document declares. A fixed list of codecs cannot
            # do this: 'latin-1' decodes any byte sequence, so every codec
            # listed after it was unreachable.
            with open(html_file, 'rb') as f:
                raw_html = f.read()
            soup = BeautifulSoup(raw_html, 'html.parser')
            print(f" Successfully read file with {soup.original_encoding} encoding")

            # Script and stylesheet source is not document text.
            for tag in soup(['script', 'style']):
                tag.decompose()

            text = replace_html_entities(soup.get_text())
            title = base_name[:40] + "..." if len(base_name) > 40 else base_name
            title = replace_html_entities(title)

            _render_text_pdf(pdf_file, title, text, font_name)
        except Exception as e:
            # Per-file boundary: report and carry on with the next file.
            print(f" Error converting {base_name}: {e}")
            continue
        pdf_files.append(pdf_file)
        print(f" Successfully converted {base_name}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    print(f"Merging {len(pdf_files)} PDFs into a single file...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    successful_merges = _merge_pdfs(pdf_files, final_pdf_path)
    if successful_merges == 0:
        print("Error: No PDFs could be merged.")
        return False

    print(f"Successfully merged {successful_merges} PDFs")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")
    return True
if __name__ == "__main__":
    # Folder comes from the first command-line argument, or is asked for
    # interactively when no argument was given.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        folder_path = input("Enter the path to the folder containing HTML files: ")
        # A typed or pasted path may carry stray whitespace or surrounding
        # quotes; neither is part of the path.
        folder_path = folder_path.strip().strip('"\'')
    # Report failure to the calling shell instead of always exiting with 0.
    sys.exit(0 if convert_html_to_pdf(folder_path) else 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment