Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save me-suzy/ead0755349809a1e1002ca71d5780650 to your computer and use it in GitHub Desktop.
Save me-suzy/ead0755349809a1e1002ca71d5780650 to your computer and use it in GitHub Desktop.
Adună și adaugă toate fișierele HTML din folder într-un singur PDF (GROK 3 Complex)
import os
import sys
from PyPDF2 import PdfMerger
from bs4 import BeautifulSoup
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
# Replacement table used by replace_html_entities(): every occurrence of a key
# in the text is substituted with its value, entry by entry, in the order the
# entries are listed here.
#
# NOTE(review): in this copy most keys are identical to their values
# (e.g. 'ă' -> 'ă', '&' -> '&'), which makes those entries no-ops. Presumably
# the keys were entity strings that were decoded when the file was copied from
# a web page -- confirm against the original script.
# The entries that do change text as written are the cedilla letters
# 'ţ'/'Ţ' (mapped to the comma-below 'ț'/'Ț') and the curly double quotes
# (mapped to a straight '"'). The ' ' key may be a non-breaking space -- verify.
HTML_ENTITIES = {
'ă': 'ă', 'â': 'â', 'ã': 'ã', 'î': 'î', 'Î': 'Î',
'ș': 'ș', 'Ș': 'Ș', 'ţ': 'ț', 'Ţ': 'Ț', 'ț': 'ț',
'”': '"', '“': '"', ' ': ' ', '&': '&'
}
def replace_html_entities(text):
    """Apply every substitution listed in ``HTML_ENTITIES`` to *text*.

    The replacements are applied one after another in the dictionary's
    insertion order, so the output of an earlier substitution can be
    rewritten by a later one.

    Args:
        text: The string to clean up. Empty or ``None`` input is returned
            unchanged instead of raising.

    Returns:
        The text with all table substitutions applied.
    """
    # Nothing to substitute; also avoids AttributeError on None.
    if not text:
        return text
    for entity, char in HTML_ENTITIES.items():
        text = text.replace(entity, char)
    return text
# Candidate TrueType fonts probed in order (Windows, Linux, macOS locations).
_DEFAULT_FONT_PATHS = (
    "C:/Windows/Fonts/arial.ttf",
    "C:/Windows/Fonts/times.ttf",
    "C:/Windows/Fonts/DejaVuSans.ttf",
    "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    "/usr/share/fonts/TTF/DejaVuSans.ttf",
    "/Library/Fonts/Arial Unicode.ttf",
)


def find_and_register_unicode_font(font_paths=None):
    """Register the first usable TrueType font and return its name.

    Each candidate path that exists on disk is registered with
    ``pdfmetrics.registerFont``; the first one that registers without error
    wins. The registered name is the file name without its extension.

    Args:
        font_paths: Optional iterable of font file paths to probe, in order.
            When omitted, a built-in list of common Windows, Linux and macOS
            locations is used.

    Returns:
        The registered font name, or ``"Helvetica"`` when no candidate could
        be registered.
    """
    candidates = _DEFAULT_FONT_PATHS if font_paths is None else font_paths
    for font_path in candidates:
        if not os.path.exists(font_path):
            continue
        font_name = os.path.splitext(os.path.basename(font_path))[0]
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_path))
        except Exception as exc:
            # Deliberately best-effort: a broken font file must not abort the
            # search. Unlike a bare ``except:``, this lets KeyboardInterrupt
            # and SystemExit propagate, and the reason is reported.
            print(f"Failed to register font: {font_path} ({exc})")
            continue
        print(f"Registered font: {font_name} from {font_path}")
        return font_name
    print("No suitable Unicode font found. Using built-in Helvetica (diacritics may not show correctly).")
    return "Helvetica"
# Encodings tried when reading an HTML file. latin-1 accepts every byte
# sequence, so it has to come last; otherwise the Central-European codecs
# would never be reached and their diacritics would be mis-decoded.
_HTML_ENCODINGS = ('utf-8', 'cp1250', 'iso-8859-2', 'latin-1')

# Page layout used for every generated PDF (values in points).
_PAGE_MARGIN = 50
_TITLE_FONT_SIZE = 14
_TITLE_GAP = 30
_BODY_FONT_SIZE = 10
_BODY_LINE_HEIGHT = 15


def _select_html_files(folder, names):
    """Return the sorted paths of regular files in *names* that look like HTML pages."""
    selected = []
    # Sorting makes the page order of the merged PDF deterministic;
    # os.listdir() returns entries in arbitrary order.
    for name in sorted(names):
        path = os.path.join(folder, name)
        # Skip sub-directories whose names merely match the filter.
        if not os.path.isfile(path):
            continue
        lowered = name.lower()
        if (lowered.endswith(('.html', '.htm'))
                or 'vizualizare' in lowered
                or 'html-articol=' in lowered):
            selected.append(path)
    return selected


def _read_html(path):
    """Return ``(content, encoding)`` for the first encoding that decodes *path*, else ``(None, None)``."""
    for encoding in _HTML_ENCODINGS:
        try:
            with open(path, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeDecodeError:
            continue
    return None, None


def _wrap_line(line, fits):
    """Yield the pieces of *line*, split on whitespace, such that each piece satisfies ``fits``."""
    current = ""
    for word in line.split():
        # Drop control characters, which cannot be drawn.
        word = ''.join(ch for ch in word if ord(ch) >= 32)
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if fits(candidate):
            current = candidate
            continue
        # Only flush a non-empty line: an over-long first word must not
        # produce an empty row before it.
        if current:
            yield current
        current = word
    if current:
        yield current


def _render_text_pdf(pdf_file, title, text, font_name):
    """Write *title* followed by the word-wrapped *text* to *pdf_file* on A4 pages."""
    c = canvas.Canvas(pdf_file, pagesize=A4)
    width, height = A4
    max_width = width - 2 * _PAGE_MARGIN
    y = height - _PAGE_MARGIN

    c.setFont(font_name, _TITLE_FONT_SIZE)
    c.drawString(_PAGE_MARGIN, y, title)
    y -= _TITLE_GAP

    c.setFont(font_name, _BODY_FONT_SIZE)

    def fits(candidate):
        return c.stringWidth(candidate, font_name, _BODY_FONT_SIZE) < max_width

    for source_line in text.split('\n'):
        for wrapped in _wrap_line(source_line, fits):
            c.drawString(_PAGE_MARGIN, y, wrapped)
            y -= _BODY_LINE_HEIGHT
            if y < _PAGE_MARGIN:
                # Start a new page; the font has to be selected again
                # after showPage().
                c.showPage()
                c.setFont(font_name, _BODY_FONT_SIZE)
                y = height - _PAGE_MARGIN
    c.save()


def _merge_pdfs(pdf_files, final_pdf_path):
    """Append every non-empty PDF to *final_pdf_path* and return how many were merged."""
    merger = PdfMerger()
    merged = 0
    try:
        for pdf_file in pdf_files:
            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
                continue
            try:
                merger.append(pdf_file)
            except Exception as e:
                # Best-effort: one unreadable PDF must not abort the merge.
                print(f"Error appending {os.path.basename(pdf_file)}: {e}")
            else:
                merged += 1
        if merged:
            merger.write(final_pdf_path)
    finally:
        # Always release the merger's file handles, including on the
        # nothing-merged and write-failure paths.
        merger.close()
    return merged


def convert_html_to_pdf(html_folder_path):
    """Convert the HTML files of a folder to PDFs and merge them into ``final.pdf``.

    Files are selected when their name ends in ``.html``/``.htm`` or contains
    ``vizualizare`` or ``html-articol=`` (case-insensitive). Each file's text
    is extracted with BeautifulSoup, rendered as plain wrapped text into
    ``<folder>/pdf_files/NNNN_<name>.pdf``, and all resulting PDFs are merged,
    in sorted file-name order, into ``<folder>/final.pdf``.

    Args:
        html_folder_path: Path of the folder containing the HTML files.

    Returns:
        ``True`` when ``final.pdf`` was written; ``False`` when the folder
        does not exist, cannot be listed, contains no HTML files, or nothing
        could be converted or merged.
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

    # Validate the input before doing any work such as font registration.
    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        all_files = os.listdir(html_folder_path)
    except OSError as e:
        print(f"Error listing files: {e}")
        return False
    print(f"Found {len(all_files)} total files in directory")

    html_files = _select_html_files(html_folder_path, all_files)
    print(f"Found {len(html_files)} HTML files to convert")
    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    # Find and register a font that supports Romanian diacritics
    font_name = find_and_register_unicode_font()

    # Individual PDFs are kept next to the sources in a sub-folder.
    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    pdf_files = []
    total = len(html_files)
    for index, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{index:04d}_{base_name}.pdf")
        print(f"Converting {base_name} to PDF... ({index}/{total})")
        try:
            html_content, encoding = _read_html(html_file)
            if html_content is None:
                print(f" Skipping file due to encoding issues: {base_name}")
                continue
            print(f" Successfully read file with {encoding} encoding")

            # Parse HTML and replace entities
            soup = BeautifulSoup(html_content, 'html.parser')
            text = replace_html_entities(soup.get_text())

            title = base_name[:40] + "..." if len(base_name) > 40 else base_name
            title = replace_html_entities(title)

            _render_text_pdf(pdf_file, title, text, font_name)
            pdf_files.append(pdf_file)
            print(f" Successfully converted {base_name}")
        except Exception as e:
            # Batch boundary: report the failure and carry on with the
            # remaining files.
            print(f" Error converting {base_name}: {e}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    print(f"Merging {len(pdf_files)} PDFs into final.pdf...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    successful_merges = _merge_pdfs(pdf_files, final_pdf_path)
    if successful_merges == 0:
        print("Error: No PDFs could be merged.")
        return False

    print(f"Successfully merged {successful_merges} PDFs into final.pdf")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")
    return True
if __name__ == "__main__":
    # The folder comes from the first command-line argument; without one the
    # user is prompted for it.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        # Trim stray whitespace and the surrounding double quotes that a
        # pasted path may carry.
        folder_path = input("Enter the path to the folder containing HTML files: ").strip().strip('"')
    # Report failure to the calling shell instead of always exiting with 0.
    sys.exit(0 if convert_html_to_pdf(folder_path) else 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment