me-suzy · May 9, 2025 20:25
diff --git a/Aduna si adauga toate fisierele html din folder intr-un singur pdf (Claude.ai).py b/Aduna si adauga toate fisierele html din folder intr-un singur pdf (Claude.ai).py
 import os
 import sys
 from PyPDF2 import PdfMerger
 from bs4 import BeautifulSoup
 from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import A4
 from reportlab.pdfbase import pdfmetrics
 from reportlab.pdfbase.ttfonts import TTFont

 # Dictionary of common HTML entities and their proper Unicode characters.
 # replace_html_entities() applies the entries in insertion order, so '&amp;'
 # must stay last: decoding it earlier would turn the text '&amp;nbsp;' into a
 # space instead of the literal '&nbsp;'.
 HTML_ENTITIES = {
    # a with breve / circumflex / tilde
    '&#259;': 'ă', '&#226;': 'â', '&atilde;': 'ã', '&acirc;': 'â',
    '&#x103;': 'ă', '&#xE2;': 'â',
    # i with circumflex
    '&icirc;': 'î', '&#206;': 'Î',
    '&#238;': 'î', '&#xEE;': 'î', '&#xCE;': 'Î', '&Icirc;': 'Î',
    # s: the comma-below entities and the legacy cedilla entities (&#350;,
    # &#351;) are all normalized to the comma-below letters.  The two forms
    # look identical on screen, so the code points are spelled out:
    # \u0218 = S with comma below, \u0219 = s with comma below.
    '&#537;': '\u0219', '&#536;': '\u0218', '&#350;': '\u0218',
    '&#x219;': '\u0219', '&#351;': '\u0219',
    # t: same normalization (\u021A = T with comma below, \u021B = t with
    # comma below); &#354; / &#355; are the legacy cedilla entities.
    '&#539;': '\u021B', '&#538;': '\u021A', '&#355;': '\u021B',
    '&#354;': '\u021A', '&#x21B;': '\u021B',
    # Typographic quotes become plain quotes, non-breaking space a plain space
    '&rdquo;': '"', '&ldquo;': '"', '&nbsp;': ' ', '&amp;': '&'
 }

 def replace_html_entities(text):
    """Return *text* with every entity listed in HTML_ENTITIES decoded.

    The table is walked in its insertion order and each entity is
    substituted everywhere it occurs before the next one is considered.
    """
    decoded = text
    for entity in HTML_ENTITIES:
        decoded = decoded.replace(entity, HTML_ENTITIES[entity])
    return decoded

 def find_and_register_unicode_font(font_paths=None):
    """Register the first usable TrueType font and return its name.

    Args:
        font_paths: Optional iterable of font file paths to try, in order.
            When omitted, a built-in list of common Windows, Linux and
            macOS font locations is used.

    Returns:
        The name the font was registered under (the file name up to its
        first dot), or "Helvetica" when no candidate exists on disk or
        none of them could be registered.
    """
    if font_paths is None:
        # Common paths to fonts that are expected to cover Romanian characters
        font_paths = (
            # Windows fonts
            "C:/Windows/Fonts/arial.ttf",
            "C:/Windows/Fonts/cour.ttf",  # Courier New
            "C:/Windows/Fonts/times.ttf",  # Times New Roman
            "C:/Windows/Fonts/segoeui.ttf",  # Segoe UI
            "C:/Windows/Fonts/DejaVuSans.ttf",
            # Linux fonts
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/TTF/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
            # macOS fonts
            "/Library/Fonts/Arial Unicode.ttf",
            "/System/Library/Fonts/LucidaGrande.ttc",
        )

    for font_path in font_paths:
        if not os.path.exists(font_path):
            continue
        font_name = os.path.basename(font_path).split('.')[0]
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_path))
        except Exception as e:
            # Best effort: report why this file was rejected and try the next
            # candidate.  Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit through.
            print(f"Failed to register font: {font_path} ({e})")
            continue
        print(f"Registered font: {font_name} from {font_path}")
        return font_name

    print("No suitable Unicode font found. Using built-in Helvetica (diacritics may not show correctly).")
    return "Helvetica"

 # Page layout used by the rendering helpers (all values in PDF points).
 _PAGE_MARGIN = 50
 _TITLE_SIZE = 14
 _TITLE_GAP = 30
 _BODY_SIZE = 10
 _LINE_HEIGHT = 15

 # Encodings tried in order when reading an HTML file.  The strict decoders
 # come first; latin-1 accepts every byte sequence, so anything listed after
 # it could never be reached and it must stay last.
 _HTML_ENCODINGS = ('utf-8-sig', 'cp1250', 'latin-1')


 def _select_html_files(folder, names):
    """Return sorted full paths of the regular files in *names* that look like HTML."""
    selected = []
    # Sort so numbering and page order in the merged PDF are reproducible;
    # os.listdir() makes no ordering promise.
    for name in sorted(names):
        lowered = name.lower()
        looks_like_html = (lowered.endswith(('.html', '.htm'))
                           or 'vizualizare' in lowered
                           or 'html-articol=' in lowered)
        path = os.path.join(folder, name)
        # The substring rules can also match directory names; skip those.
        if looks_like_html and os.path.isfile(path):
            selected.append(path)
    return selected


 def _read_html(path):
    """Return (text, encoding) using the first encoding in _HTML_ENCODINGS that decodes *path*."""
    for encoding in _HTML_ENCODINGS:
        try:
            with open(path, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeDecodeError:
            continue
    # Not reachable while latin-1 is in the list; kept so callers can rely
    # on the (None, None) contract if the list is ever changed.
    return None, None


 def _wrap_words(c, text, font_name, max_width):
    """Greedily pack the words of *text* into lines narrower than *max_width*."""
    wrapped = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        try:
            fits = c.stringWidth(candidate, font_name, _BODY_SIZE) < max_width
        except Exception as e:
            # Best effort: a word that cannot be measured is dropped rather
            # than aborting the whole document.
            print(f"  Error processing word '{word}', skipping: {e}")
            continue
        if fits:
            current = candidate
        else:
            # When the very first word is already too wide there is nothing
            # buffered yet; do not emit an empty line for it.
            if current:
                wrapped.append(current)
            current = word
    if current:
        wrapped.append(current)
    return wrapped


 def _render_text_pdf(pdf_file, title, text, font_name):
    """Write *title* followed by the non-blank lines of *text* to a paginated A4 PDF."""
    c = canvas.Canvas(pdf_file, pagesize=A4)
    width, height = A4
    y = height - _PAGE_MARGIN  # Start from top

    c.setFont(font_name, _TITLE_SIZE)
    try:
        c.drawString(_PAGE_MARGIN, y, title)
    except Exception as e:
        print(f"  Error writing title, using fallback: {e}")
        c.setFont("Helvetica", _TITLE_SIZE)
        c.drawString(_PAGE_MARGIN, y, "Document Title")
    y -= _TITLE_GAP

    c.setFont(font_name, _BODY_SIZE)
    usable_width = width - 2 * _PAGE_MARGIN
    for line in text.split('\n'):
        if not line.strip():
            continue
        for piece in _wrap_words(c, line, font_name, usable_width):
            try:
                c.drawString(_PAGE_MARGIN, y, piece)
            except Exception as e:
                print(f"  Error writing line, skipping: {e}")
                continue
            y -= _LINE_HEIGHT
            if y < _PAGE_MARGIN:
                # Start a new page and select the body font again, as the
                # font choice does not carry over after showPage().
                c.showPage()
                c.setFont(font_name, _BODY_SIZE)
                y = height - _PAGE_MARGIN
    c.save()


 def _merge_pdfs(pdf_files, final_pdf_path):
    """Merge the existing, non-empty *pdf_files* into *final_pdf_path*; return how many were merged."""
    merger = PdfMerger()
    merged = 0
    try:
        for pdf_file in pdf_files:
            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
                continue
            try:
                merger.append(pdf_file)
            except Exception as e:
                print(f"Error appending {os.path.basename(pdf_file)}: {e}")
            else:
                merged += 1
        # Nothing is written when no input could be appended.
        if merged:
            merger.write(final_pdf_path)
    finally:
        # Close on every path, including failures, so the merger is not leaked.
        merger.close()
    return merged


 def convert_html_to_pdf(html_folder_path):
    """
    Convert all HTML files in a folder to individual PDFs and combine them.

    Files are selected when their name ends in .html/.htm or contains
    'vizualizare' or 'html-articol=' (case-insensitive).  Each one is
    rendered as plain text into '<folder>/pdf_files/NNNN_<name>.pdf', in
    sorted file-name order, and the results are merged into
    '<folder>/final.pdf'.  Progress and errors are reported with print().

    Args:
        html_folder_path: Path to the folder containing HTML files

    Returns:
        True when final.pdf was written; False when the folder is missing or
        unreadable, contains no HTML files, or nothing could be converted or
        merged.  A file that fails to convert is reported and skipped.
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

    # Validate the input before doing any other work
    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        all_files = os.listdir(html_folder_path)
    except OSError as e:
        print(f"Error listing files: {e}")
        return False
    print(f"Found {len(all_files)} total files in directory")

    html_files = _select_html_files(html_folder_path, all_files)
    print(f"Found {len(html_files)} HTML files to convert")
    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    # Find and register a font that supports Romanian diacritics; done only
    # once there is something to render.
    font_name = find_and_register_unicode_font()

    # Create a folder for PDF files
    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    # Convert each HTML file to PDF
    pdf_files = []
    for i, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{i:04d}_{base_name}.pdf")

        print(f"Converting {base_name} to PDF... ({i}/{len(html_files)})")

        try:
            html_content, encoding = _read_html(html_file)
            if html_content is None:
                print(f"  Skipping file due to encoding issues: {base_name}")
                continue
            print(f"  Successfully read file with {encoding} encoding")

            # Extract the visible text and decode leftover entities once;
            # a second pass over each line could over-decode '&amp;...' text.
            soup = BeautifulSoup(html_content, 'html.parser')
            text = replace_html_entities(soup.get_text())

            title = base_name[:40] + "..." if len(base_name) > 40 else base_name
            _render_text_pdf(pdf_file, replace_html_entities(title), text, font_name)

            pdf_files.append(pdf_file)
            print(f"  Successfully converted {base_name}")
        except Exception as e:
            # Per-file boundary: one bad file must not stop the batch.
            print(f"  Error converting {base_name}: {e}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    # Merge all PDFs into a single file
    print(f"Merging {len(pdf_files)} PDFs into a single file...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    successful_merges = _merge_pdfs(pdf_files, final_pdf_path)

    if successful_merges == 0:
        print("Error: No PDFs could be merged.")
        return False

    print(f"Successfully merged {successful_merges} PDFs")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")

    return True

 if __name__ == "__main__":
    # Script entry point: the folder comes from the first command-line
    # argument, or from an interactive prompt when none is given.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        folder_path = input("Enter the path to the folder containing HTML files: ")

    # convert_html_to_pdf() signals failure by returning False; turn that
    # into a non-zero exit status so calling scripts can detect it.
    sys.exit(0 if convert_html_to_pdf(folder_path) else 1)
	import os
	import sys
	from PyPDF2 import PdfMerger
	from bs4 import BeautifulSoup
	from reportlab.pdfgen import canvas
	from reportlab.lib.pagesizes import A4
	from reportlab.pdfbase import pdfmetrics
	from reportlab.pdfbase.ttfonts import TTFont

	# Dictionary of common HTML entities and their proper Unicode characters.
	# The keys are the entity spellings as they appear in HTML source; a table
	# keyed by the already-decoded characters would never match an entity.
	# replace_html_entities() applies the entries in insertion order, so '&amp;'
	# must stay last: decoding it earlier would turn the text '&amp;nbsp;' into a
	# space instead of the literal '&nbsp;'.
	HTML_ENTITIES = {
	    # a with breve / circumflex / tilde
	    '&#259;': 'ă', '&#226;': 'â', '&atilde;': 'ã', '&acirc;': 'â',
	    '&#x103;': 'ă', '&#xE2;': 'â',
	    # i with circumflex
	    '&icirc;': 'î', '&#206;': 'Î',
	    '&#238;': 'î', '&#xEE;': 'î', '&#xCE;': 'Î', '&Icirc;': 'Î',
	    # s: the comma-below entities and the legacy cedilla entities (&#350;,
	    # &#351;) are all normalized to the comma-below letters.  The two forms
	    # look identical on screen, so the code points are spelled out:
	    # \u0218 = S with comma below, \u0219 = s with comma below.
	    '&#537;': '\u0219', '&#536;': '\u0218', '&#350;': '\u0218',
	    '&#x219;': '\u0219', '&#351;': '\u0219',
	    # t: same normalization (\u021A = T with comma below, \u021B = t with
	    # comma below); &#354; / &#355; are the legacy cedilla entities.
	    '&#539;': '\u021B', '&#538;': '\u021A', '&#355;': '\u021B',
	    '&#354;': '\u021A', '&#x21B;': '\u021B',
	    # Typographic quotes become plain quotes, non-breaking space a plain space
	    '&rdquo;': '"', '&ldquo;': '"', '&nbsp;': ' ', '&amp;': '&'
	}

	def replace_html_entities(text):
	    """Replace HTML entities with their corresponding Unicode characters.

	    Every key of HTML_ENTITIES found in *text* is substituted by its value,
	    walking the table in insertion order.

	    Args:
	        text: The string to decode.

	    Returns:
	        The string with all listed entities replaced.
	    """
	    for entity, char in HTML_ENTITIES.items():
	        text = text.replace(entity, char)
	    return text

	def find_and_register_unicode_font(font_paths=None):
	    """Register the first usable TrueType font and return its name.

	    Args:
	        font_paths: Optional iterable of font file paths to try, in order.
	            When omitted, a built-in list of common Windows, Linux and
	            macOS font locations is used.

	    Returns:
	        The name the font was registered under (the file name up to its
	        first dot), or "Helvetica" when no candidate exists on disk or
	        none of them could be registered.
	    """
	    if font_paths is None:
	        # Common paths to fonts that are expected to cover Romanian characters
	        font_paths = (
	            # Windows fonts
	            "C:/Windows/Fonts/arial.ttf",
	            "C:/Windows/Fonts/cour.ttf",  # Courier New
	            "C:/Windows/Fonts/times.ttf",  # Times New Roman
	            "C:/Windows/Fonts/segoeui.ttf",  # Segoe UI
	            "C:/Windows/Fonts/DejaVuSans.ttf",
	            # Linux fonts
	            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
	            "/usr/share/fonts/TTF/DejaVuSans.ttf",
	            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
	            # macOS fonts
	            "/Library/Fonts/Arial Unicode.ttf",
	            "/System/Library/Fonts/LucidaGrande.ttc",
	        )

	    for font_path in font_paths:
	        if not os.path.exists(font_path):
	            continue
	        font_name = os.path.basename(font_path).split('.')[0]
	        try:
	            pdfmetrics.registerFont(TTFont(font_name, font_path))
	        except Exception as e:
	            # Best effort: report why this file was rejected and try the next
	            # candidate.  Catching Exception (not a bare except) lets
	            # KeyboardInterrupt / SystemExit through.
	            print(f"Failed to register font: {font_path} ({e})")
	            continue
	        print(f"Registered font: {font_name} from {font_path}")
	        return font_name

	    print("No suitable Unicode font found. Using built-in Helvetica (diacritics may not show correctly).")
	    return "Helvetica"

	# Page layout used by the rendering helpers (all values in PDF points).
	_PAGE_MARGIN = 50
	_TITLE_SIZE = 14
	_TITLE_GAP = 30
	_BODY_SIZE = 10
	_LINE_HEIGHT = 15

	# Encodings tried in order when reading an HTML file.  The strict decoders
	# come first; latin-1 accepts every byte sequence, so anything listed after
	# it could never be reached and it must stay last.
	_HTML_ENCODINGS = ('utf-8-sig', 'cp1250', 'latin-1')


	def _select_html_files(folder, names):
	    """Return sorted full paths of the regular files in *names* that look like HTML."""
	    selected = []
	    # Sort so numbering and page order in the merged PDF are reproducible;
	    # os.listdir() makes no ordering promise.
	    for name in sorted(names):
	        lowered = name.lower()
	        looks_like_html = (lowered.endswith(('.html', '.htm'))
	                           or 'vizualizare' in lowered
	                           or 'html-articol=' in lowered)
	        path = os.path.join(folder, name)
	        # The substring rules can also match directory names; skip those.
	        if looks_like_html and os.path.isfile(path):
	            selected.append(path)
	    return selected


	def _read_html(path):
	    """Return (text, encoding) using the first encoding in _HTML_ENCODINGS that decodes *path*."""
	    for encoding in _HTML_ENCODINGS:
	        try:
	            with open(path, 'r', encoding=encoding) as f:
	                return f.read(), encoding
	        except UnicodeDecodeError:
	            continue
	    # Not reachable while latin-1 is in the list; kept so callers can rely
	    # on the (None, None) contract if the list is ever changed.
	    return None, None


	def _wrap_words(c, text, font_name, max_width):
	    """Greedily pack the words of *text* into lines narrower than *max_width*."""
	    wrapped = []
	    current = ""
	    for word in text.split():
	        candidate = f"{current} {word}" if current else word
	        try:
	            fits = c.stringWidth(candidate, font_name, _BODY_SIZE) < max_width
	        except Exception as e:
	            # Best effort: a word that cannot be measured is dropped rather
	            # than aborting the whole document.
	            print(f"  Error processing word '{word}', skipping: {e}")
	            continue
	        if fits:
	            current = candidate
	        else:
	            # When the very first word is already too wide there is nothing
	            # buffered yet; do not emit an empty line for it.
	            if current:
	                wrapped.append(current)
	            current = word
	    if current:
	        wrapped.append(current)
	    return wrapped


	def _render_text_pdf(pdf_file, title, text, font_name):
	    """Write *title* followed by the non-blank lines of *text* to a paginated A4 PDF."""
	    c = canvas.Canvas(pdf_file, pagesize=A4)
	    width, height = A4
	    y = height - _PAGE_MARGIN  # Start from top

	    c.setFont(font_name, _TITLE_SIZE)
	    try:
	        c.drawString(_PAGE_MARGIN, y, title)
	    except Exception as e:
	        print(f"  Error writing title, using fallback: {e}")
	        c.setFont("Helvetica", _TITLE_SIZE)
	        c.drawString(_PAGE_MARGIN, y, "Document Title")
	    y -= _TITLE_GAP

	    c.setFont(font_name, _BODY_SIZE)
	    usable_width = width - 2 * _PAGE_MARGIN
	    for line in text.split('\n'):
	        if not line.strip():
	            continue
	        for piece in _wrap_words(c, line, font_name, usable_width):
	            try:
	                c.drawString(_PAGE_MARGIN, y, piece)
	            except Exception as e:
	                print(f"  Error writing line, skipping: {e}")
	                continue
	            y -= _LINE_HEIGHT
	            if y < _PAGE_MARGIN:
	                # Start a new page and select the body font again, as the
	                # font choice does not carry over after showPage().
	                c.showPage()
	                c.setFont(font_name, _BODY_SIZE)
	                y = height - _PAGE_MARGIN
	    c.save()


	def _merge_pdfs(pdf_files, final_pdf_path):
	    """Merge the existing, non-empty *pdf_files* into *final_pdf_path*; return how many were merged."""
	    merger = PdfMerger()
	    merged = 0
	    try:
	        for pdf_file in pdf_files:
	            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
	                continue
	            try:
	                merger.append(pdf_file)
	            except Exception as e:
	                print(f"Error appending {os.path.basename(pdf_file)}: {e}")
	            else:
	                merged += 1
	        # Nothing is written when no input could be appended.
	        if merged:
	            merger.write(final_pdf_path)
	    finally:
	        # Close on every path, including failures, so the merger is not leaked.
	        merger.close()
	    return merged


	def convert_html_to_pdf(html_folder_path):
	    """
	    Convert all HTML files in a folder to individual PDFs and combine them.

	    Files are selected when their name ends in .html/.htm or contains
	    'vizualizare' or 'html-articol=' (case-insensitive).  Each one is
	    rendered as plain text into '<folder>/pdf_files/NNNN_<name>.pdf', in
	    sorted file-name order, and the results are merged into
	    '<folder>/final.pdf'.  Progress and errors are reported with print().

	    Args:
	        html_folder_path: Path to the folder containing HTML files

	    Returns:
	        True when final.pdf was written; False when the folder is missing or
	        unreadable, contains no HTML files, or nothing could be converted or
	        merged.  A file that fails to convert is reported and skipped.
	    """
	    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

	    # Validate the input before doing any other work
	    if not os.path.isdir(html_folder_path):
	        print(f"Error: Directory '{html_folder_path}' does not exist.")
	        return False

	    try:
	        all_files = os.listdir(html_folder_path)
	    except OSError as e:
	        print(f"Error listing files: {e}")
	        return False
	    print(f"Found {len(all_files)} total files in directory")

	    html_files = _select_html_files(html_folder_path, all_files)
	    print(f"Found {len(html_files)} HTML files to convert")
	    if not html_files:
	        print(f"Error: No HTML files found in '{html_folder_path}'.")
	        return False

	    # Find and register a font that supports Romanian diacritics; done only
	    # once there is something to render.
	    font_name = find_and_register_unicode_font()

	    # Create a folder for PDF files
	    pdf_folder = os.path.join(html_folder_path, "pdf_files")
	    os.makedirs(pdf_folder, exist_ok=True)

	    # Convert each HTML file to PDF
	    pdf_files = []
	    for i, html_file in enumerate(html_files, start=1):
	        base_name = os.path.basename(html_file)
	        pdf_file = os.path.join(pdf_folder, f"{i:04d}_{base_name}.pdf")

	        print(f"Converting {base_name} to PDF... ({i}/{len(html_files)})")

	        try:
	            html_content, encoding = _read_html(html_file)
	            if html_content is None:
	                print(f" Skipping file due to encoding issues: {base_name}")
	                continue
	            print(f" Successfully read file with {encoding} encoding")

	            # Extract the visible text and decode leftover entities once;
	            # a second pass over each line could over-decode '&amp;...' text.
	            soup = BeautifulSoup(html_content, 'html.parser')
	            text = replace_html_entities(soup.get_text())

	            title = base_name[:40] + "..." if len(base_name) > 40 else base_name
	            _render_text_pdf(pdf_file, replace_html_entities(title), text, font_name)

	            pdf_files.append(pdf_file)
	            print(f" Successfully converted {base_name}")
	        except Exception as e:
	            # Per-file boundary: one bad file must not stop the batch.
	            print(f" Error converting {base_name}: {e}")

	    if not pdf_files:
	        print("Error: Failed to convert any HTML files to PDF.")
	        return False

	    # Merge all PDFs into a single file
	    print(f"Merging {len(pdf_files)} PDFs into a single file...")
	    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
	    successful_merges = _merge_pdfs(pdf_files, final_pdf_path)

	    if successful_merges == 0:
	        print("Error: No PDFs could be merged.")
	        return False

	    print(f"Successfully merged {successful_merges} PDFs")
	    print(f"Final PDF created: {final_pdf_path}")
	    print(f"Individual PDFs available in: {pdf_folder}")

	    return True

	if __name__ == "__main__":
	    # Script entry point: the folder comes from the first command-line
	    # argument, or from an interactive prompt when none is given.
	    if len(sys.argv) > 1:
	        folder_path = sys.argv[1]
	    else:
	        folder_path = input("Enter the path to the folder containing HTML files: ")

	    # convert_html_to_pdf() signals failure by returning False; turn that
	    # into a non-zero exit status so calling scripts can detect it.
	    sys.exit(0 if convert_html_to_pdf(folder_path) else 1)