Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save me-suzy/80099e8eb4a349f0cae4efa320e509d0 to your computer and use it in GitHub Desktop.
Save me-suzy/80099e8eb4a349f0cae4efa320e509d0 to your computer and use it in GitHub Desktop.
Aduna si adauga toate fisierele html din folder intr-un singur pdf (scoate diacriticele)
import os
import sys
import unicodedata

from bs4 import BeautifulSoup
from PyPDF2 import PdfMerger
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
# Lookup table from a single Unicode character to its ASCII replacement.
# An empty-string value means the character is dropped. Keys are written as
# escapes (glyph in the trailing comment) so look-alike characters such as
# comma-below vs. cedilla forms cannot be confused, and every code point is
# listed exactly once.
CHAR_MAP = {
    # Romanian letters
    '\u0103': 'a',  # ă
    '\u0102': 'A',  # Ă
    '\u00e2': 'a',  # â
    '\u00c2': 'A',  # Â
    '\u00e3': 'a',  # ã
    '\u00ee': 'i',  # î
    '\u00ce': 'I',  # Î
    '\u0219': 's',  # ș (comma below)
    '\u0218': 'S',  # Ș (comma below)
    '\u015f': 's',  # ş (cedilla)
    '\u015e': 'S',  # Ş (cedilla)
    '\u021b': 't',  # ț (comma below)
    '\u021a': 'T',  # Ț (comma below)
    '\u0163': 't',  # ţ (cedilla)
    '\u0162': 'T',  # Ţ (cedilla)
    # ASCII characters mapped to themselves (no-ops, kept so the table still
    # contains every key it had before)
    ' ': ' ',
    '&': '&',
    '"': '"',
    # Typographic punctuation
    '\u2018': "'",    # ‘
    '\u2019': "'",    # ’
    '\u201a': ',',    # ‚
    '\u201c': '"',    # “
    '\u201d': '"',    # ”
    '\u201e': '"',    # „
    '\u00b4': "'",    # ´
    '\u2013': '-',    # –
    '\u2014': '--',   # —
    '\u2026': '...',  # …
    '\u2022': '*',    # •
    '\u2039': '<',    # ‹
    '\u203a': '>',    # ›
    '\u00ab': '<<',   # «
    '\u00bb': '>>',   # »
    '\u00d7': 'x',    # ×
    '\u00ad': '-',    # soft hyphen
    # Special spaces and separators become a plain space
    '\u00a0': ' ',  # no-break space
    '\u2028': ' ',  # line separator
    '\u2029': ' ',  # paragraph separator
    '\u202f': ' ',  # narrow no-break space
    '\u205f': ' ',  # medium mathematical space
    # Characters that are removed
    '\ufeff': '',  # byte-order mark / zero-width no-break space
    '\u1440': '',
    '\u1449': '',
    **{chr(cp): '' for cp in range(0x200B, 0x2010)},  # U+200B..U+200F
    **{chr(cp): '' for cp in range(0x202A, 0x202F)},  # U+202A..U+202E
    **{chr(cp): '' for cp in range(0x2060, 0x2065)},  # U+2060..U+2064
    **{chr(cp): '' for cp in range(0x2066, 0x2070)},  # U+2066..U+206F
    **{chr(cp): '' for cp in range(0x0300, 0x0350)},  # U+0300..U+034F
}
def safe_char(c):
    """Return a printable-ASCII replacement for the single character *c*.

    Resolution order:
      1. an explicit entry in CHAR_MAP (may be several characters, or ''
         to drop the character);
      2. printable ASCII (code points 32-126) is returned unchanged;
      3. any other whitespace (tab, newline, Unicode spaces) becomes a
         plain space, so words stay separated instead of being glued
         together with '?';
      4. a letter with diacritics that is not in CHAR_MAP is reduced to the
         ASCII part of its canonical decomposition (e.g. 'é' -> 'e');
      5. anything else becomes '?'.

    Args:
        c: a string of length 1.

    Returns:
        The ASCII replacement string (possibly empty or longer than one
        character).
    """
    mapped = CHAR_MAP.get(c)
    if mapped is not None:
        return mapped
    if 32 <= ord(c) <= 126:
        return c
    if c.isspace():
        return ' '
    # NFD splits an accented letter into base letter + combining marks;
    # keeping only the printable-ASCII part strips the diacritics.
    ascii_part = ''.join(
        part for part in unicodedata.normalize('NFD', c)
        if 32 <= ord(part) <= 126
    )
    return ascii_part or '?'
def safe_text(text):
    """Return *text* with every character passed through safe_char()."""
    replacements = map(safe_char, text)
    return ''.join(replacements)
def _find_html_files(html_folder_path):
    """Return the sorted paths of the HTML-like regular files in the folder."""
    all_files = os.listdir(html_folder_path)
    print(f"Found {len(all_files)} total files in directory")
    html_files = []
    # os.listdir() order is arbitrary; sort so the numbering of the
    # individual PDFs and the page order of the merged PDF are reproducible.
    for name in sorted(all_files):
        lowered = name.lower()
        looks_like_html = (
            lowered.endswith(('.html', '.htm'))
            or 'vizualizare' in lowered
            or 'html-articol=' in lowered
        )
        path = os.path.join(html_folder_path, name)
        # Skip directories whose name happens to match the filter.
        if looks_like_html and os.path.isfile(path):
            html_files.append(path)
    print(f"Found {len(html_files)} HTML files to convert")
    return html_files


def _extract_html_text(html_file):
    """Return (visible text, encoding name) for one HTML file."""
    with open(html_file, 'rb') as f:
        raw = f.read()
    try:
        # UTF-8 is tried first and strictly.
        markup = raw.decode('utf-8')
    except UnicodeDecodeError:
        # Hand the raw bytes to BeautifulSoup so it can detect the encoding
        # itself (e.g. from a charset declaration in the document) instead
        # of blindly decoding as latin-1, which never fails and therefore
        # made every later fallback encoding unreachable.
        markup = raw
    soup = BeautifulSoup(markup, 'html.parser')
    encoding = 'utf-8' if isinstance(markup, str) else soup.original_encoding
    return soup.get_text(), encoding


def _write_text_pdf(pdf_file, base_name, text):
    """Render *text* as a word-wrapped A4 PDF titled after *base_name*."""
    margin = 50
    leading = 15
    body_font, body_size = "Helvetica", 10

    c = canvas.Canvas(pdf_file, pagesize=A4)
    width, height = A4
    max_width = width - 2 * margin
    y = height - margin  # Start from top

    # Title line: the (possibly truncated) source file name.
    c.setFont("Helvetica", 14)
    title = safe_text(base_name[:40] + "..." if len(base_name) > 40 else base_name)
    c.drawString(margin, y, title)
    y -= 30
    c.setFont(body_font, body_size)

    def draw_line(content, y):
        # Break the page *before* drawing, so a document that ends exactly
        # at the bottom margin does not get a trailing blank page.
        if y < margin:
            c.showPage()
            c.setFont(body_font, body_size)  # font must be set again on a new page
            y = height - margin
        c.drawString(margin, y, content)
        return y - leading

    for line in text.split('\n'):
        # Split on whitespace first, then sanitise each word: tabs and
        # other whitespace act as separators rather than being sanitised
        # into visible characters. Words that sanitise to nothing are
        # dropped.
        words = [w for w in map(safe_text, line.split()) if w]
        line_buffer = ""
        for word in words:
            candidate = f"{line_buffer} {word}" if line_buffer else word
            if c.stringWidth(candidate, body_font, body_size) < max_width:
                line_buffer = candidate
                continue
            # The word does not fit: flush what we have (if anything, so an
            # over-long first word does not emit an empty line) and start a
            # new output line with the word.
            if line_buffer:
                y = draw_line(line_buffer, y)
            line_buffer = word
        if line_buffer:
            y = draw_line(line_buffer, y)

    c.save()


def _merge_pdfs(pdf_files, final_pdf_path):
    """Merge the given PDFs into *final_pdf_path*; return how many were merged."""
    merger = PdfMerger()
    merged = 0
    try:
        for pdf_file in pdf_files:
            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
                continue
            try:
                merger.append(pdf_file)
                merged += 1
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the merge.
                print(f"Error appending {os.path.basename(pdf_file)}: {e}")
        if merged:
            merger.write(final_pdf_path)
    finally:
        # Always release the merger's resources, including on failure.
        merger.close()
    return merged


def convert_html_to_pdf(html_folder_path):
    """Convert every HTML file in a folder to a PDF and merge the results.

    Each matching file is rendered as plain, ASCII-only text into
    ``<folder>/pdf_files/NNNN_<name>.pdf``; all of them are then merged, in
    file-name order, into ``<folder>/final.pdf``. Progress and errors are
    reported with print().

    Args:
        html_folder_path: Path to the folder containing HTML files.

    Returns:
        True when the merged PDF was written; False when the folder does not
        exist, cannot be listed, contains no HTML files, or nothing could be
        converted or merged.
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")
    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        html_files = _find_html_files(html_folder_path)
    except OSError as e:
        print(f"Error listing files: {e}")
        return False
    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    # Create a folder for the individual PDF files
    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    pdf_files = []
    for i, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{i:04d}_{base_name}.pdf")
        print(f"Converting {base_name} to PDF... ({i}/{len(html_files)})")
        try:
            text, encoding = _extract_html_text(html_file)
            print(f" Successfully read file with {encoding} encoding")
            _write_text_pdf(pdf_file, base_name, text)
        except Exception as e:
            # Deliberately broad: a single bad file is reported and skipped
            # so the remaining files are still converted.
            print(f" Error converting {base_name}: {e}")
            continue
        pdf_files.append(pdf_file)
        print(f" Successfully converted {base_name}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    print(f"Merging {len(pdf_files)} PDFs into a single file...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    successful_merges = _merge_pdfs(pdf_files, final_pdf_path)
    if successful_merges == 0:
        print("Error: No PDFs could be merged.")
        return False

    print(f"Successfully merged {successful_merges} PDFs")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")
    return True
if __name__ == "__main__":
    # The folder comes from the first command-line argument; without one,
    # ask for it interactively.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        # Strip stray whitespace (e.g. from a pasted path).
        folder_path = input("Enter the path to the folder containing HTML files: ").strip()
    # Propagate success/failure to the calling shell instead of always
    # exiting with status 0.
    sys.exit(0 if convert_html_to_pdf(folder_path) else 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment