me-suzy · May 9, 2025 20:27
diff --git a/Aduna si adauga toate fisierele html din folder intr-un singur pdf (GROK 3 Complex).py b/Aduna si adauga toate fisierele html din folder intr-un singur pdf (GROK 3 Complex).py
 import os
 import sys
 from PyPDF2 import PdfMerger
 from bs4 import BeautifulSoup
 from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import A4
 from reportlab.pdfbase import pdfmetrics
 from reportlab.pdfbase.ttfonts import TTFont

 # Replacement table used by replace_html_entities(): every occurrence of a key
 # in the text is replaced by the corresponding value.
 # NOTE(review): in this copy most keys appear identical to their values (only
 # the cedilla forms ţ/Ţ and the curly quotes map to a different character), so
 # those entries are no-ops. The keys were presumably written as HTML entity
 # strings originally and look decoded here -- verify against the real file.
 HTML_ENTITIES = {
    'ă': 'ă', 'â': 'â', 'ã': 'ã', 'î': 'î', 'Î': 'Î',
    'ș': 'ș', 'Ș': 'Ș', 'ţ': 'ț', 'Ţ': 'Ț', 'ț': 'ț',
    '”': '"', '“': '"', ' ': ' ', '&': '&'
 }

 def replace_html_entities(text):
    """Return *text* with every key of HTML_ENTITIES swapped for its mapped character.

    The table entries are applied one after another, in the table's iteration
    order, each as a plain substring replacement over the whole string.
    """
    converted = text
    for entity in HTML_ENTITIES:
        converted = converted.replace(entity, HTML_ENTITIES[entity])
    return converted

 def find_and_register_unicode_font():
    """Register the first usable TrueType font from a fixed list of candidate paths.

    The candidates are tried in order. The first file that exists and that
    can be registered is registered under its file name without the
    extension, and that name is returned.

    Returns:
        The registered font name, or "Helvetica" when no candidate exists or
        every registration attempt failed.
    """
    font_paths = [
        "C:/Windows/Fonts/arial.ttf",
        "C:/Windows/Fonts/times.ttf",
        "C:/Windows/Fonts/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/TTF/DejaVuSans.ttf",
        "/Library/Fonts/Arial Unicode.ttf"
    ]

    for font_path in font_paths:
        if not os.path.exists(font_path):
            continue
        font_name = os.path.splitext(os.path.basename(font_path))[0]
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_path))
        except Exception as exc:
            # Best effort: a broken font file must not abort the search, but
            # unlike a bare "except:" this lets KeyboardInterrupt/SystemExit
            # through and reports why the font was rejected.
            print(f"Failed to register font: {font_path} ({exc})")
            continue
        print(f"Registered font: {font_name} from {font_path}")
        return font_name

    print("No suitable Unicode font found. Using built-in Helvetica (diacritics may not show correctly).")
    return "Helvetica"

 def convert_html_to_pdf(html_folder_path):
    """
    Convert all HTML files in a folder to individual PDFs and combine them into a single final.pdf

    Each matching file is reduced to plain text, rendered to
    ``<folder>/pdf_files/NNNN_<name>.pdf`` and the per-file PDFs are then
    merged, in sorted file-name order, into ``<folder>/final.pdf``.

    Args:
        html_folder_path: Directory that contains the HTML files.

    Returns:
        True when final.pdf was written; False when the directory is missing
        or unreadable, holds no HTML files, or nothing could be converted or
        merged.
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

    # Validate the input before registering fonts or creating output folders.
    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        # os.listdir() returns names in arbitrary order; sorting keeps the
        # NNNN_ numbering and the merge order stable between runs.
        all_files = sorted(os.listdir(html_folder_path))
    except OSError as e:
        print(f"Error listing files: {e}")
        return False
    print(f"Found {len(all_files)} total files in directory")

    html_files = [
        os.path.join(html_folder_path, name)
        for name in all_files
        # isfile() keeps sub-directories with a matching name out of the list.
        if _looks_like_html(name) and os.path.isfile(os.path.join(html_folder_path, name))
    ]
    print(f"Found {len(html_files)} HTML files to convert")

    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    # Find and register a font that supports Romanian diacritics
    font_name = find_and_register_unicode_font()

    # Create a folder for PDF files
    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    # Convert each HTML file to PDF; one failing file must not stop the batch.
    pdf_files = []
    for i, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{i:04d}_{base_name}.pdf")
        print(f"Converting {base_name} to PDF... ({i}/{len(html_files)})")

        try:
            html_content = _read_html(html_file)
            if html_content is None:
                print(f"  Skipping file due to encoding issues: {base_name}")
                continue

            # Parse HTML and replace entities
            soup = BeautifulSoup(html_content, 'html.parser')
            text = replace_html_entities(soup.get_text())

            _write_text_pdf(pdf_file, base_name, text, font_name)
            pdf_files.append(pdf_file)
            print(f"  Successfully converted {base_name}")
        except Exception as e:
            print(f"  Error converting {base_name}: {e}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    # Merge all PDFs into final.pdf
    print(f"Merging {len(pdf_files)} PDFs into final.pdf...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    successful_merges = _merge_pdfs(pdf_files, final_pdf_path)

    if successful_merges == 0:
        print("Error: No PDFs could be merged.")
        return False

    print(f"Successfully merged {successful_merges} PDFs into final.pdf")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")

    return True


 def _looks_like_html(name):
    """Return True when a file name matches the HTML naming rules used by the converter."""
    lowered = name.lower()
    return (lowered.endswith(('.html', '.htm'))
            or 'vizualizare' in lowered
            or 'html-articol=' in lowered)


 def _read_html(path):
    """Return the decoded text of *path*, or None when no candidate encoding fits."""
    # latin-1 accepts every byte sequence, so it has to be tried last;
    # otherwise the Central European code pages after it are never reached.
    for encoding in ('utf-8', 'cp1250', 'iso-8859-2', 'latin-1'):
        try:
            with open(path, 'r', encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError:
            continue
        print(f"  Successfully read file with {encoding} encoding")
        return content
    return None


 def _write_text_pdf(pdf_file, base_name, text, font_name):
    """Render *text* as word-wrapped lines into a new A4 PDF, titled with the file name."""
    c = canvas.Canvas(pdf_file, pagesize=A4)
    width, height = A4
    max_width = width - 100
    y = height - 50

    # Add title (file name, shortened to 40 characters plus "...")
    c.setFont(font_name, 14)
    title = base_name[:40] + "..." if len(base_name) > 40 else base_name
    title = replace_html_entities(title)
    c.drawString(50, y, title)
    y -= 30

    c.setFont(font_name, 10)

    def draw_line(line_text, y):
        """Draw one line and return the next y, starting a new page when needed."""
        c.drawString(50, y, line_text)
        y -= 15
        if y < 50:
            c.showPage()
            # showPage() starts a fresh page, so the body font is set again.
            c.setFont(font_name, 10)
            y = height - 50
        return y

    for line in text.split('\n'):
        current_line = ""
        for word in line.split():
            # Drop control characters before measuring and drawing.
            word = ''.join(ch for ch in word if ord(ch) >= 32)
            if not word:
                continue
            candidate = f"{current_line} {word}" if current_line else word
            if c.stringWidth(candidate, font_name, 10) < max_width:
                current_line = candidate
                continue
            # Flush only a non-empty line: an over-long first word must not
            # produce a blank line above it.
            if current_line:
                y = draw_line(current_line, y)
            current_line = word
        if current_line:
            y = draw_line(current_line, y)

    c.save()


 def _merge_pdfs(pdf_files, final_pdf_path):
    """Append every non-empty PDF to *final_pdf_path*; return how many were merged."""
    merger = PdfMerger()
    merged = 0
    try:
        for pdf_file in pdf_files:
            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
                continue
            try:
                merger.append(pdf_file)
                merged += 1
            except Exception as e:
                print(f"Error appending {os.path.basename(pdf_file)}: {e}")
        if merged:
            merger.write(final_pdf_path)
    finally:
        # Close the merger on every path, including "nothing merged" and
        # a failed write.
        merger.close()
    return merged

 if __name__ == "__main__":
    # The folder comes from the first command-line argument, or is asked for
    # interactively when no argument was given.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        folder_path = input("Enter the path to the folder containing HTML files: ")
        # Typed or pasted paths often carry stray whitespace or quotes.
        folder_path = folder_path.strip().strip('"')
    # Report failure to the calling shell instead of always exiting with 0.
    sys.exit(0 if convert_html_to_pdf(folder_path) else 1)
	import os
	import sys
	from PyPDF2 import PdfMerger
	from bs4 import BeautifulSoup
	from reportlab.pdfgen import canvas
	from reportlab.lib.pagesizes import A4
	from reportlab.pdfbase import pdfmetrics
	from reportlab.pdfbase.ttfonts import TTFont

	# Replacement table used by replace_html_entities(): every occurrence of a key
	# in the text is replaced by the corresponding value.
	# NOTE(review): in this copy most keys appear identical to their values (only
	# the cedilla forms ţ/Ţ and the curly quotes map to a different character), so
	# those entries are no-ops. The keys were presumably written as HTML entity
	# strings originally and look decoded here -- verify against the real file.
	HTML_ENTITIES = {
	'ă': 'ă', 'â': 'â', 'ã': 'ã', 'î': 'î', 'Î': 'Î',
	'ș': 'ș', 'Ș': 'Ș', 'ţ': 'ț', 'Ţ': 'Ț', 'ț': 'ț',
	'”': '"', '“': '"', ' ': ' ', '&': '&'
	}

	def replace_html_entities(text):
		"""Return *text* with every key of HTML_ENTITIES swapped for its mapped character.

		The table entries are applied one after another, in the table's
		iteration order, each as a plain substring replacement over the whole
		string.
		"""
		# The flattened copy had lost the nesting of the loop and the return;
		# the block structure is restored here.
		for entity, char in HTML_ENTITIES.items():
			text = text.replace(entity, char)
		return text

	def find_and_register_unicode_font():
		"""Register the first usable TrueType font from a fixed list of candidate paths.

		The candidates are tried in order. The first file that exists and that
		can be registered is registered under its file name without the
		extension, and that name is returned.

		Returns:
			The registered font name, or "Helvetica" when no candidate exists
			or every registration attempt failed.
		"""
		font_paths = [
			"C:/Windows/Fonts/arial.ttf",
			"C:/Windows/Fonts/times.ttf",
			"C:/Windows/Fonts/DejaVuSans.ttf",
			"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
			"/usr/share/fonts/TTF/DejaVuSans.ttf",
			"/Library/Fonts/Arial Unicode.ttf"
		]

		for font_path in font_paths:
			if not os.path.exists(font_path):
				continue
			font_name = os.path.splitext(os.path.basename(font_path))[0]
			try:
				pdfmetrics.registerFont(TTFont(font_name, font_path))
			except Exception as exc:
				# Best effort: a broken font file must not abort the search,
				# but unlike a bare "except:" this lets
				# KeyboardInterrupt/SystemExit through and reports why the
				# font was rejected.
				print(f"Failed to register font: {font_path} ({exc})")
				continue
			print(f"Registered font: {font_name} from {font_path}")
			return font_name

		print("No suitable Unicode font found. Using built-in Helvetica (diacritics may not show correctly).")
		return "Helvetica"

	def convert_html_to_pdf(html_folder_path):
		"""
		Convert all HTML files in a folder to individual PDFs and combine them into a single final.pdf

		Each matching file is reduced to plain text, rendered to
		``<folder>/pdf_files/NNNN_<name>.pdf`` and the per-file PDFs are then
		merged, in sorted file-name order, into ``<folder>/final.pdf``.

		Args:
			html_folder_path: Directory that contains the HTML files.

		Returns:
			True when final.pdf was written; False when the directory is
			missing or unreadable, holds no HTML files, or nothing could be
			converted or merged.
		"""
		print(f"Starting HTML to PDF conversion in: {html_folder_path}")

		# Validate the input before registering fonts or creating output folders.
		if not os.path.isdir(html_folder_path):
			print(f"Error: Directory '{html_folder_path}' does not exist.")
			return False

		try:
			# os.listdir() returns names in arbitrary order; sorting keeps the
			# NNNN_ numbering and the merge order stable between runs.
			all_files = sorted(os.listdir(html_folder_path))
		except OSError as e:
			print(f"Error listing files: {e}")
			return False
		print(f"Found {len(all_files)} total files in directory")

		html_files = [
			os.path.join(html_folder_path, name)
			for name in all_files
			# isfile() keeps sub-directories with a matching name out of the list.
			if _looks_like_html(name) and os.path.isfile(os.path.join(html_folder_path, name))
		]
		print(f"Found {len(html_files)} HTML files to convert")

		if not html_files:
			print(f"Error: No HTML files found in '{html_folder_path}'.")
			return False

		# Find and register a font that supports Romanian diacritics
		font_name = find_and_register_unicode_font()

		# Create a folder for PDF files
		pdf_folder = os.path.join(html_folder_path, "pdf_files")
		os.makedirs(pdf_folder, exist_ok=True)

		# Convert each HTML file to PDF; one failing file must not stop the batch.
		pdf_files = []
		for i, html_file in enumerate(html_files, start=1):
			base_name = os.path.basename(html_file)
			pdf_file = os.path.join(pdf_folder, f"{i:04d}_{base_name}.pdf")
			print(f"Converting {base_name} to PDF... ({i}/{len(html_files)})")

			try:
				html_content = _read_html(html_file)
				if html_content is None:
					print(f" Skipping file due to encoding issues: {base_name}")
					continue

				# Parse HTML and replace entities
				soup = BeautifulSoup(html_content, 'html.parser')
				text = replace_html_entities(soup.get_text())

				_write_text_pdf(pdf_file, base_name, text, font_name)
				pdf_files.append(pdf_file)
				print(f" Successfully converted {base_name}")
			except Exception as e:
				print(f" Error converting {base_name}: {e}")

		if not pdf_files:
			print("Error: Failed to convert any HTML files to PDF.")
			return False

		# Merge all PDFs into final.pdf
		print(f"Merging {len(pdf_files)} PDFs into final.pdf...")
		final_pdf_path = os.path.join(html_folder_path, "final.pdf")
		successful_merges = _merge_pdfs(pdf_files, final_pdf_path)

		if successful_merges == 0:
			print("Error: No PDFs could be merged.")
			return False

		print(f"Successfully merged {successful_merges} PDFs into final.pdf")
		print(f"Final PDF created: {final_pdf_path}")
		print(f"Individual PDFs available in: {pdf_folder}")

		return True


	def _looks_like_html(name):
		"""Return True when a file name matches the HTML naming rules used by the converter."""
		lowered = name.lower()
		return (lowered.endswith(('.html', '.htm'))
				or 'vizualizare' in lowered
				or 'html-articol=' in lowered)


	def _read_html(path):
		"""Return the decoded text of *path*, or None when no candidate encoding fits."""
		# latin-1 accepts every byte sequence, so it has to be tried last;
		# otherwise the Central European code pages after it are never reached.
		for encoding in ('utf-8', 'cp1250', 'iso-8859-2', 'latin-1'):
			try:
				with open(path, 'r', encoding=encoding) as f:
					content = f.read()
			except UnicodeDecodeError:
				continue
			print(f" Successfully read file with {encoding} encoding")
			return content
		return None


	def _write_text_pdf(pdf_file, base_name, text, font_name):
		"""Render *text* as word-wrapped lines into a new A4 PDF, titled with the file name."""
		c = canvas.Canvas(pdf_file, pagesize=A4)
		width, height = A4
		max_width = width - 100
		y = height - 50

		# Add title (file name, shortened to 40 characters plus "...")
		c.setFont(font_name, 14)
		title = base_name[:40] + "..." if len(base_name) > 40 else base_name
		title = replace_html_entities(title)
		c.drawString(50, y, title)
		y -= 30

		c.setFont(font_name, 10)

		def draw_line(line_text, y):
			"""Draw one line and return the next y, starting a new page when needed."""
			c.drawString(50, y, line_text)
			y -= 15
			if y < 50:
				c.showPage()
				# showPage() starts a fresh page, so the body font is set again.
				c.setFont(font_name, 10)
				y = height - 50
			return y

		for line in text.split('\n'):
			current_line = ""
			for word in line.split():
				# Drop control characters before measuring and drawing.
				word = ''.join(ch for ch in word if ord(ch) >= 32)
				if not word:
					continue
				candidate = f"{current_line} {word}" if current_line else word
				if c.stringWidth(candidate, font_name, 10) < max_width:
					current_line = candidate
					continue
				# Flush only a non-empty line: an over-long first word must
				# not produce a blank line above it.
				if current_line:
					y = draw_line(current_line, y)
				current_line = word
			if current_line:
				y = draw_line(current_line, y)

		c.save()


	def _merge_pdfs(pdf_files, final_pdf_path):
		"""Append every non-empty PDF to *final_pdf_path*; return how many were merged."""
		merger = PdfMerger()
		merged = 0
		try:
			for pdf_file in pdf_files:
				if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
					continue
				try:
					merger.append(pdf_file)
					merged += 1
				except Exception as e:
					print(f"Error appending {os.path.basename(pdf_file)}: {e}")
			if merged:
				merger.write(final_pdf_path)
		finally:
			# Close the merger on every path, including "nothing merged" and
			# a failed write.
			merger.close()
		return merged

	if __name__ == "__main__":
		# The folder comes from the first command-line argument, or is asked
		# for interactively when no argument was given.
		if len(sys.argv) > 1:
			folder_path = sys.argv[1]
		else:
			folder_path = input("Enter the path to the folder containing HTML files: ")
			# Typed or pasted paths often carry stray whitespace or quotes.
			folder_path = folder_path.strip().strip('"')
		# Report failure to the calling shell instead of always exiting with 0.
		sys.exit(0 if convert_html_to_pdf(folder_path) else 1)