Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save me-suzy/74a1ffc852646bde960732e9a1f14e82 to your computer and use it in GitHub Desktop.
Save me-suzy/74a1ffc852646bde960732e9a1f14e82 to your computer and use it in GitHub Desktop.
Adună și adaugă toate fișierele HTML din folder într-un singur PDF (Claude.ai)
import os
import sys
from PyPDF2 import PdfMerger
from bs4 import BeautifulSoup
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
# Substitution table consumed by replace_html_entities(): every key is a
# literal substring to look for, every value is the text that replaces it.
#
# Keys are written as actual entity references (named and decimal numeric
# forms) so that undecoded markup is matched; a table keyed by the decoded
# characters themselves would only ever replace a character with itself.
# A few literal characters are also listed where the replacement differs from
# the key: cedilla forms of lowercase s/t become the comma-below letters,
# typographic double quotes become a plain '"', and a no-break space becomes
# a regular space.
#
# NOTE(review): uppercase S/T with cedilla are decoded to the cedilla letters,
# not normalised to comma-below, mirroring the values this table had before.
#
# '&amp;' must remain the LAST entry: replacements are applied sequentially in
# insertion order, and decoding '&amp;' earlier would turn text such as
# '&amp;nbsp;' into '&nbsp;' and then decode it a second time.
HTML_ENTITIES = {
    # a with breve / circumflex / tilde
    '&abreve;': '\u0103',  # ă
    '&#259;': '\u0103',    # ă
    '&acirc;': '\u00e2',   # â
    '&#226;': '\u00e2',    # â
    '&atilde;': '\u00e3',  # ã
    '&#227;': '\u00e3',    # ã
    # i with circumflex
    '&icirc;': '\u00ee',   # î
    '&#238;': '\u00ee',    # î
    '&Icirc;': '\u00ce',   # Î
    '&#206;': '\u00ce',    # Î
    # s with comma below / cedilla
    '&#537;': '\u0219',    # ș (comma below)
    '&#536;': '\u0218',    # Ș (comma below)
    '&Scedil;': '\u015e',  # Ş (cedilla, kept as-is)
    '&#350;': '\u015e',    # Ş (cedilla, kept as-is)
    '&scedil;': '\u0219',  # ş entity -> ș
    '&#351;': '\u0219',    # ş entity -> ș
    # t with comma below / cedilla
    '&#539;': '\u021b',    # ț (comma below)
    '&tcedil;': '\u021b',  # ţ entity -> ț
    '&#355;': '\u021b',    # ţ entity -> ț
    '&Tcedil;': '\u0162',  # Ţ (cedilla, kept as-is)
    '&#354;': '\u0162',    # Ţ (cedilla, kept as-is)
    # punctuation and spacing entities
    '&rdquo;': '"',
    '&ldquo;': '"',
    '&nbsp;': ' ',
    # literal characters that are normalised
    '\u015f': '\u0219',    # ş (cedilla) -> ș (comma below)
    '\u0163': '\u021b',    # ţ (cedilla) -> ț (comma below)
    '\u201d': '"',         # right double quotation mark
    '\u201c': '"',         # left double quotation mark
    '\u00a0': ' ',         # no-break space
    # keep last (see note above)
    '&amp;': '&',
}
def replace_html_entities(text):
    """Return *text* with each HTML_ENTITIES key swapped for its value.

    The substitutions are plain substring replacements (no regex) applied one
    after another, in the iteration order of the HTML_ENTITIES table.
    """
    converted = text
    for entity in HTML_ENTITIES:
        converted = converted.replace(entity, HTML_ENTITIES[entity])
    return converted
def find_and_register_unicode_font(font_paths=None):
    """Register the first usable TrueType font and return its name.

    Args:
        font_paths: Optional iterable of font file paths to try, in order.
            When omitted, a built-in list of common Windows, Linux and macOS
            font locations is used.

    Returns:
        The name the font was registered under (the font file's name without
        its extension), or "Helvetica" when none of the candidates exists
        or can be registered.
    """
    if font_paths is None:
        font_paths = [
            # Windows fonts
            "C:/Windows/Fonts/arial.ttf",
            "C:/Windows/Fonts/cour.ttf",  # Courier New
            "C:/Windows/Fonts/times.ttf",  # Times New Roman
            "C:/Windows/Fonts/segoeui.ttf",  # Segoe UI
            "C:/Windows/Fonts/DejaVuSans.ttf",
            # Linux fonts
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/TTF/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
            # macOS fonts
            "/Library/Fonts/Arial Unicode.ttf",
            "/System/Library/Fonts/LucidaGrande.ttc",
        ]

    for font_path in font_paths:
        if not os.path.isfile(font_path):
            continue
        font_name = os.path.splitext(os.path.basename(font_path))[0]
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_path))
        except Exception as exc:
            # A candidate that cannot be loaded is not fatal: report why and
            # move on to the next one. `Exception` (rather than a bare
            # except) lets Ctrl-C / SystemExit propagate.
            print(f"Failed to register font: {font_path} ({exc})")
            continue
        print(f"Registered font: {font_name} from {font_path}")
        return font_name

    print("No suitable Unicode font found. Using built-in Helvetica (diacritics may not show correctly).")
    return "Helvetica"
# Page layout used for every generated PDF (values in PDF points).
_MARGIN = 50
_LINE_HEIGHT = 15
_TITLE_GAP = 30
_TITLE_SIZE = 14
_BODY_SIZE = 10

# Encodings tried, in order, when reading an HTML file. The single-byte
# fallbacks come AFTER utf-8 and the final one (ISO-8859-2, alias "latin2")
# accepts every byte value, so reading never fails on encoding grounds.
# latin-1 must not appear before them: it also accepts every byte value and
# would make all later entries unreachable.
_HTML_ENCODINGS = ('utf-8-sig', 'cp1250', 'iso-8859-2')


def _looks_like_html(file_name):
    """Tell whether *file_name* matches the HTML naming patterns handled here."""
    lowered = file_name.lower()
    return (lowered.endswith(('.html', '.htm'))
            or 'vizualizare' in lowered
            or 'html-articol=' in lowered)


def _read_html_text(html_file):
    """Return ``(text, encoding)`` for *html_file* using the first encoding that decodes it."""
    for encoding in _HTML_ENCODINGS[:-1]:
        try:
            with open(html_file, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeDecodeError:
            continue
    last_resort = _HTML_ENCODINGS[-1]
    with open(html_file, 'r', encoding=last_resort) as f:
        return f.read(), last_resort


def _draw_line(c, line_text, y, font_name, page_height):
    """Draw *line_text* at height *y*; return the y position for the next line."""
    c.drawString(_MARGIN, y, line_text)
    y -= _LINE_HEIGHT
    if y < _MARGIN:
        # Out of vertical space: start a new page and select the body font again.
        c.showPage()
        c.setFont(font_name, _BODY_SIZE)
        y = page_height - _MARGIN
    return y


def _render_text_pdf(pdf_file, title, text, font_name):
    """Write *title* followed by the word-wrapped *text* to a new A4 PDF at *pdf_file*."""
    c = canvas.Canvas(pdf_file, pagesize=A4)
    width, height = A4
    max_width = width - 2 * _MARGIN
    y = height - _MARGIN

    c.setFont(font_name, _TITLE_SIZE)
    try:
        c.drawString(_MARGIN, y, title)
    except Exception:
        print(" Error writing title, using fallback")
        c.setFont("Helvetica", _TITLE_SIZE)
        c.drawString(_MARGIN, y, "Document Title")
    y -= _TITLE_GAP

    c.setFont(font_name, _BODY_SIZE)
    for line in text.split('\n'):
        line_buffer = ""
        # str.split() yields nothing for blank lines, so they are skipped.
        for word in line.split():
            candidate = f"{line_buffer} {word}" if line_buffer else word
            try:
                # The first word of a line is always accepted, even when it is
                # wider than the page: flushing an empty buffer would only
                # emit a blank line.
                if not line_buffer or c.stringWidth(candidate, font_name, _BODY_SIZE) < max_width:
                    line_buffer = candidate
                else:
                    y = _draw_line(c, line_buffer, y, font_name, height)
                    line_buffer = word
            except Exception:
                # Best effort: a word that cannot be measured/drawn is dropped.
                print(f" Error processing word '{word}', skipping")
        if line_buffer:
            try:
                y = _draw_line(c, line_buffer, y, font_name, height)
            except Exception:
                print(" Error writing line, skipping")
    c.save()


def _merge_pdfs(pdf_files, final_pdf_path):
    """Merge the non-empty *pdf_files* into *final_pdf_path*; return how many were merged."""
    merger = PdfMerger()
    merged = 0
    try:
        for pdf_file in pdf_files:
            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
                continue
            try:
                merger.append(pdf_file)
                merged += 1
            except Exception as e:
                print(f"Error appending {os.path.basename(pdf_file)}: {e}")
        if merged:
            merger.write(final_pdf_path)
    finally:
        # Always release the merger, including on the nothing-merged path.
        merger.close()
    return merged


def convert_html_to_pdf(html_folder_path):
    """Convert every HTML file in a folder to a PDF and merge them into one.

    Each matching file's text is extracted and written to
    ``<folder>/pdf_files/NNNN_<name>.pdf``; the individual PDFs are then
    merged into ``<folder>/final.pdf``. Files are processed in sorted name
    order so the page order of the merged document is reproducible.

    Args:
        html_folder_path: Path to the folder containing HTML files.

    Returns:
        True when the merged PDF was written. False when the folder does not
        exist or cannot be listed, when it contains no matching files, when no
        file could be converted, or when merging/writing the result failed.
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

    # Validate the input before doing any work (font registration included).
    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        all_files = sorted(os.listdir(html_folder_path))
    except OSError as e:
        print(f"Error listing files: {e}")
        return False
    print(f"Found {len(all_files)} total files in directory")

    # Only regular files count: a sub-directory whose name happens to match
    # the patterns cannot be opened as an HTML document.
    html_files = []
    for name in all_files:
        path = os.path.join(html_folder_path, name)
        if _looks_like_html(name) and os.path.isfile(path):
            html_files.append(path)
    print(f"Found {len(html_files)} HTML files to convert")

    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    # Find and register a font that supports Romanian diacritics
    font_name = find_and_register_unicode_font()

    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    pdf_files = []
    total = len(html_files)
    for index, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{index:04d}_{base_name}.pdf")
        print(f"Converting {base_name} to PDF... ({index}/{total})")
        try:
            html_content, encoding = _read_html_text(html_file)
            print(f" Successfully read file with {encoding} encoding")

            soup = BeautifulSoup(html_content, 'html.parser')
            # Entity replacement is applied exactly once to the extracted
            # text; a second pass could decode text such as '&amp;amp;' twice.
            text = replace_html_entities(soup.get_text())

            title = base_name[:40] + "..." if len(base_name) > 40 else base_name
            title = replace_html_entities(title)

            _render_text_pdf(pdf_file, title, text, font_name)
        except Exception as e:
            # One bad file must not abort the whole batch.
            print(f" Error converting {base_name}: {e}")
            continue
        pdf_files.append(pdf_file)
        print(f" Successfully converted {base_name}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    print(f"Merging {len(pdf_files)} PDFs into a single file...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    try:
        successful_merges = _merge_pdfs(pdf_files, final_pdf_path)
    except Exception as e:
        print(f"Error writing final PDF: {e}")
        return False
    if successful_merges == 0:
        print("Error: No PDFs could be merged.")
        return False

    print(f"Successfully merged {successful_merges} PDFs")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")
    return True
if __name__ == "__main__":
    # The folder comes from the first command-line argument; without one the
    # user is prompted for it.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        # Surrounding whitespace from a typed or pasted path is dropped.
        folder_path = input("Enter the path to the folder containing HTML files: ").strip()
    # Report the outcome through the exit status (0 = merged PDF written,
    # 1 = conversion failed) instead of discarding the result.
    sys.exit(0 if convert_html_to_pdf(folder_path) else 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment