Created
May 9, 2025 20:27
-
-
Save me-suzy/ead0755349809a1e1002ca71d5780650 to your computer and use it in GitHub Desktop.
Adună și adaugă toate fișierele HTML din folder într-un singur PDF (GROK 3 Complex)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import html
import os
import re
import sys

from bs4 import BeautifulSoup
from PyPDF2 import PdfMerger
from reportlab.lib.pagesizes import A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
# Character-level normalizations applied after entity decoding.
# Keys are written as explicit escapes so that look-alike or invisible
# characters (cedilla vs. comma-below, no-break space) stay unambiguous.
HTML_ENTITIES = {
    '\u0163': '\u021b',  # t with cedilla -> t with comma below
    '\u0162': '\u021a',  # T with cedilla -> T with comma below
    '\u201d': '"',       # right curly quote -> straight double quote
    '\u201c': '"',       # left curly quote -> straight double quote
    '\u00a0': ' ',       # no-break space -> ordinary space
}

# A complete character reference: named, decimal, or hexadecimal.
# The terminating ';' is required so plain text such as "a=1&copy=2"
# is left untouched.
_ENTITY_PATTERN = re.compile(
    r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);'
)


def replace_html_entities(text):
    """Decode HTML character references and normalize a few characters.

    Every complete reference (``&name;``, ``&#123;`` or ``&#x7b;``) is
    decoded with :func:`html.unescape`; references it does not recognise
    are left unchanged. Afterwards each key of ``HTML_ENTITIES`` is
    replaced by its value.

    Args:
        text: The string to clean up.

    Returns:
        The decoded and normalized string.
    """
    decoded = _ENTITY_PATTERN.sub(
        lambda match: html.unescape(match.group(0)), text
    )
    for source, replacement in HTML_ENTITIES.items():
        decoded = decoded.replace(source, replacement)
    return decoded
def find_and_register_unicode_font(font_paths=None):
    """Register the first usable TrueType font and return its name.

    Candidates are tried in order. A candidate that is not an existing
    file is skipped; one that fails to register is reported and skipped.

    Args:
        font_paths: Optional iterable of TrueType font file paths to try.
            When ``None``, a built-in list of common Windows, Linux and
            macOS locations is used.

    Returns:
        The name under which the font was registered (the file name
        without its extension), or ``"Helvetica"`` when no candidate
        could be registered.
    """
    if font_paths is None:
        font_paths = [
            "C:/Windows/Fonts/arial.ttf",
            "C:/Windows/Fonts/times.ttf",
            "C:/Windows/Fonts/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/TTF/DejaVuSans.ttf",
            "/Library/Fonts/Arial Unicode.ttf",
        ]

    for font_path in font_paths:
        if not os.path.isfile(font_path):
            continue
        font_name = os.path.splitext(os.path.basename(font_path))[0]
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_path))
        except Exception as exc:
            # Best-effort: a broken font file must not abort the search,
            # but Ctrl-C / SystemExit must still propagate (no bare except).
            print(f"Failed to register font: {font_path} ({exc})")
            continue
        print(f"Registered font: {font_name} from {font_path}")
        return font_name

    print("No suitable Unicode font found. Using built-in Helvetica (diacritics may not show correctly).")
    return "Helvetica"
# Encodings tried in order when reading an HTML file. latin-1 accepts any
# byte sequence, so it has to come last: placed earlier it would shadow
# every encoding listed after it.
_FALLBACK_ENCODINGS = ('utf-8-sig', 'cp1250', 'iso-8859-2', 'latin-1')


def _is_html_candidate(file_name):
    """Return True when *file_name* matches the HTML name filter."""
    lowered = file_name.lower()
    return (
        lowered.endswith(('.html', '.htm'))
        or 'vizualizare' in lowered
        or 'html-articol=' in lowered
    )


def _read_html_file(html_file):
    """Return ``(content, encoding)`` for the first encoding that decodes.

    Returns ``(None, None)`` when every encoding fails.
    """
    for encoding in _FALLBACK_ENCODINGS:
        try:
            with open(html_file, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeDecodeError:
            continue
    return None, None


def _wrap_line(line, fits):
    """Split *line* into pieces for which ``fits(piece)`` is true.

    Characters with a code point below 32 are removed from each word.
    A single word that does not fit is kept on a line of its own rather
    than being preceded by an empty line.
    """
    pieces = []
    current = ""
    for word in line.split():
        word = ''.join(char for char in word if ord(char) >= 32)
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if not current or fits(candidate):
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def _render_text_pdf(pdf_file, title, text, font_name):
    """Write *title* and the word-wrapped *text* to *pdf_file* on A4 pages."""
    page_width, page_height = A4
    margin = 50
    line_height = 15
    body_size = 10
    max_width = page_width - 2 * margin

    pdf = canvas.Canvas(pdf_file, pagesize=A4)
    y = page_height - margin

    pdf.setFont(font_name, 14)
    pdf.drawString(margin, y, title)
    y -= 30

    pdf.setFont(font_name, body_size)

    def fits(candidate):
        return pdf.stringWidth(candidate, font_name, body_size) < max_width

    for line in text.split('\n'):
        for piece in _wrap_line(line, fits):
            pdf.drawString(margin, y, piece)
            y -= line_height
            if y < margin:
                pdf.showPage()
                # The font is set again after every page break.
                pdf.setFont(font_name, body_size)
                y = page_height - margin
    pdf.save()


def _merge_pdfs(pdf_files, final_pdf_path):
    """Merge *pdf_files* into *final_pdf_path*; return how many were merged.

    Nothing is written when no file could be appended. The merger is
    closed on every path, including errors.
    """
    merger = PdfMerger()
    merged = 0
    try:
        for pdf_file in pdf_files:
            if os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0:
                try:
                    merger.append(pdf_file)
                    merged += 1
                except Exception as e:
                    print(f"Error appending {os.path.basename(pdf_file)}: {e}")
        if merged:
            merger.write(final_pdf_path)
    finally:
        merger.close()
    return merged


def convert_html_to_pdf(html_folder_path):
    """Convert every HTML file in a folder to PDF and merge them.

    Each matching file is rendered as plain text into its own PDF inside
    ``<folder>/pdf_files``; those PDFs are then merged into
    ``<folder>/final.pdf``. Files are processed in sorted name order so
    the page order of the result is reproducible.

    Args:
        html_folder_path: Directory containing the HTML files.

    Returns:
        True when ``final.pdf`` was written, False when the directory is
        missing, contains no HTML files, or nothing could be converted
        or merged.
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

    # Validate the input before doing any other work.
    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        all_files = sorted(os.listdir(html_folder_path))
    except OSError as e:
        print(f"Error listing files: {e}")
        return False
    print(f"Found {len(all_files)} total files in directory")

    html_files = [
        path
        for path in (os.path.join(html_folder_path, name)
                     for name in all_files if _is_html_candidate(name))
        if os.path.isfile(path)  # skip directories that match the filter
    ]
    print(f"Found {len(html_files)} HTML files to convert")
    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    # Find and register a font that supports Romanian diacritics
    font_name = find_and_register_unicode_font()

    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    pdf_files = []
    for index, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{index:04d}_{base_name}.pdf")
        print(f"Converting {base_name} to PDF... ({index}/{len(html_files)})")
        try:
            html_content, encoding = _read_html_file(html_file)
            if html_content is None:
                print(f" Skipping file due to encoding issues: {base_name}")
                continue
            print(f" Successfully read file with {encoding} encoding")

            soup = BeautifulSoup(html_content, 'html.parser')
            text = replace_html_entities(soup.get_text())

            title = base_name[:40] + "..." if len(base_name) > 40 else base_name
            title = replace_html_entities(title)

            _render_text_pdf(pdf_file, title, text, font_name)
            pdf_files.append(pdf_file)
            print(f" Successfully converted {base_name}")
        except Exception as e:
            # Per-file boundary: report the failure and carry on with the rest.
            print(f" Error converting {base_name}: {e}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    print(f"Merging {len(pdf_files)} PDFs into final.pdf...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    successful_merges = _merge_pdfs(pdf_files, final_pdf_path)
    if successful_merges == 0:
        print("Error: No PDFs could be merged.")
        return False

    print(f"Successfully merged {successful_merges} PDFs into final.pdf")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")
    return True
def main():
    """Run the converter from the command line.

    The folder is taken from the first command-line argument, or asked
    for interactively when no argument is given.

    Returns:
        0 when the conversion succeeded, 1 otherwise, for use as the
        process exit status.
    """
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        folder_path = input("Enter the path to the folder containing HTML files: ")
        # Pasted paths often carry stray whitespace or surrounding quotes.
        folder_path = folder_path.strip().strip('"\'')
    return 0 if convert_html_to_pdf(folder_path) else 1


if __name__ == "__main__":
    # Propagate failure to the caller instead of always exiting with 0.
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment