me-suzy · May 9, 2025 20:22
diff --git a/Aduna si adauga toate fisierele html din folder intr-un singur pdf (scoate diacriticile).py b/Aduna si adauga toate fisierele html din folder intr-un singur pdf (scoate diacriticile).py
 import os
 import sys
 from PyPDF2 import PdfMerger
 from bs4 import BeautifulSoup
 from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import A4

 # Replacement table used to reduce text to printable ASCII.
 # safe_char() looks keys up one character at a time, so the multi-character
 # HTML-entity keys below can only match code that searches for those strings.
 CHAR_MAP = {
    # Romanian letters (comma-below and legacy cedilla spellings)
    'ă': 'a', 'Ă': 'A',
    'â': 'a', 'Â': 'A',
    'î': 'i', 'Î': 'I',
    'ș': 's', 'Ș': 'S',
    'ţ': 't', 'Ţ': 'T',
    'ş': 's', 'Ş': 'S',
    'ț': 't', 'Ț': 'T',

    # HTML entities (named, decimal and hexadecimal forms)
    '&#259;': 'a', '&#226;': 'a', '&atilde;': 'a', '&acirc;': 'a',
    '&#x103;': 'a', '&#xE2;': 'a',
    '&icirc;': 'i', '&#206;': 'I', '&#238;': 'i', '&#xEE;': 'i',
    '&#xCE;': 'I', '&Icirc;': 'I',
    '&#537;': 's', '&#536;': 'S', '&#350;': 'S', '&#x219;': 's',
    '&#351;': 's',
    '&#539;': 't', '&#355;': 't', '&#354;': 'T', '&#x21B;': 't',
    '&rdquo;': '"', '&ldquo;': '"', '&nbsp;': ' ',
    '&amp;': '&',

    # Typographic punctuation written literally
    '–': '-', '—': '--', '"': '"',
    '…': '...', '•': '*', '´': "'",
    '‚': ',', '„': '"',
    '‹': '<', '›': '>', '«': '<<', '»': '>>',

    # The same kind of punctuation spelled as escapes, plus no-break space,
    # soft hyphen, multiplication sign and the byte-order mark
    '\u2019': "'", '\u2018': "'", '\u201c': '"', '\u201d': '"',
    '\u2013': '-', '\u2014': '--', '\u2022': '*', '\u00a0': ' ',
    '\u00ad': '-', '\u00b4': "'", '\u00d7': 'x', '\ufeff': '',

    # Invisible formatting characters are dropped; separators become a space
    **{chr(cp): '' for cp in range(0x200B, 0x2010)},   # U+200B..U+200F
    '\u2028': ' ', '\u2029': ' ',
    **{chr(cp): '' for cp in range(0x202A, 0x202F)},   # U+202A..U+202E
    '\u202f': ' ', '\u205f': ' ',
    **{chr(cp): '' for cp in range(0x2060, 0x2065)},   # U+2060..U+2064
    **{chr(cp): '' for cp in range(0x2066, 0x2070)},   # U+2066..U+206F

    # Combining marks U+0300..U+034F are removed outright
    **{chr(cp): '' for cp in range(0x0300, 0x0350)},

    # Two further code points that are mapped to nothing
    '\u1440': '', '\u1449': '',
 }

 def safe_char(c):
    """Return a printable-ASCII stand-in for the single character ``c``.

    Lookup order:
      1. an explicit ``CHAR_MAP`` entry;
      2. printable ASCII (codes 32-126) is returned unchanged;
      3. any other whitespace (tab, newline, exotic spaces) becomes ``' '``
         so it still separates words instead of printing as ``'?'``;
      4. an accented letter missing from ``CHAR_MAP`` (e.g. ``'é'``) is
         reduced to its ASCII base letter via canonical decomposition;
      5. everything else becomes ``'?'``.

    Raises:
        TypeError: if ``c`` is not a single character and not a
            ``CHAR_MAP`` key (raised by ``ord``).
    """
    if c in CHAR_MAP:
        return CHAR_MAP[c]

    code = ord(c)
    if 32 <= code <= 126:
        return c

    if c.isspace():
        return ' '

    # Imported here, on the rare non-ASCII path, so this function does not
    # depend on a module-level import being present.
    import unicodedata

    # NFD puts the base character first, followed by its combining marks.
    # Only ASCII letters are accepted as a base so that symbols such as
    # U+2260 (which decomposes to '=' plus an overlay) are not altered.
    base = unicodedata.normalize('NFD', c)[0]
    if base != c and ord(base) < 128 and base.isalpha():
        return base

    return '?'

 def safe_text(text):
    """Run every character of ``text`` through ``safe_char`` and join the results."""
    return ''.join(map(safe_char, text))

 def _select_html_files(html_folder_path, names):
    """Return sorted full paths of the entries in ``names`` that look like HTML files.

    An entry qualifies when it is a regular file whose lower-cased name ends
    in ``.html``/``.htm`` or contains ``vizualizare`` or ``html-articol=``.
    """
    selected = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # numbering of the PDFs and the page order of the merge reproducible.
    for name in sorted(names):
        lowered = name.lower()
        matches = (lowered.endswith(('.html', '.htm'))
                   or 'vizualizare' in lowered
                   or 'html-articol=' in lowered)
        path = os.path.join(html_folder_path, name)
        # A sub-directory can match the name patterns but cannot be read.
        if matches and os.path.isfile(path):
            selected.append(path)
    return selected


 def _read_html(html_file):
    """Return ``(content, encoding)`` for ``html_file``, or ``(None, None)``."""
    # Strict UTF-8 first, then the Central European code pages. latin-1
    # decodes any byte sequence, so it has to be the last resort: listed
    # earlier it hides every encoding after it.
    for encoding in ('utf-8', 'cp1250', 'iso-8859-2', 'latin-1'):
        try:
            with open(html_file, 'r', encoding=encoding) as f:
                return f.read(), encoding
        except UnicodeDecodeError:
            continue
    return None, None


 def _wrap_words(pdf, words, max_width, font_name, font_size):
    """Greedily pack ``words`` into lines narrower than ``max_width`` points."""
    wrapped = []
    current = ""
    for word in words:
        candidate = f"{current} {word}" if current else word
        if pdf.stringWidth(candidate, font_name, font_size) < max_width:
            current = candidate
            continue
        # A single word wider than the page must not emit an empty line first.
        if current:
            wrapped.append(current)
        current = word
    if current:
        wrapped.append(current)
    return wrapped


 def _write_text_pdf(pdf_file, title, text):
    """Write ``title`` and the word-wrapped lines of ``text`` to ``pdf_file`` (A4)."""
    width, height = A4
    margin = 50
    top = height - margin
    pdf = canvas.Canvas(pdf_file, pagesize=A4)

    pdf.setFont("Helvetica", 14)
    pdf.drawString(margin, top, title)
    y = top - 30

    pdf.setFont("Helvetica", 10)
    for raw_line in text.split('\n'):
        # Split on whitespace before sanitising, so tabs and other whitespace
        # act as word separators instead of being rendered as '?'.
        words = [w for w in (safe_text(part) for part in raw_line.split()) if w]
        for out_line in _wrap_words(pdf, words, width - 2 * margin, "Helvetica", 10):
            pdf.drawString(margin, y, out_line)
            y -= 15
            if y < margin:
                # Start a new page and select the body font again.
                pdf.showPage()
                pdf.setFont("Helvetica", 10)
                y = top
    pdf.save()


 def convert_html_to_pdf(html_folder_path):
    """
    Convert every HTML file in a folder to a text-only PDF and merge the results.

    Each matching file is parsed with BeautifulSoup, its text is reduced to
    printable ASCII and written to ``<folder>/pdf_files/NNNN_<name>.pdf``.
    The PDFs that were produced are then merged into ``<folder>/final.pdf``.
    Progress and errors are reported with ``print``.

    Args:
        html_folder_path: Path to the folder containing HTML files

    Returns:
        True when ``final.pdf`` was written; False when the folder is missing
        or unreadable, contains no HTML files, or nothing could be
        converted or merged.
    """
    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

    if not os.path.isdir(html_folder_path):
        print(f"Error: Directory '{html_folder_path}' does not exist.")
        return False

    try:
        all_files = os.listdir(html_folder_path)
    except OSError as e:
        print(f"Error listing files: {e}")
        return False
    print(f"Found {len(all_files)} total files in directory")

    html_files = _select_html_files(html_folder_path, all_files)
    print(f"Found {len(html_files)} HTML files to convert")
    if not html_files:
        print(f"Error: No HTML files found in '{html_folder_path}'.")
        return False

    pdf_folder = os.path.join(html_folder_path, "pdf_files")
    os.makedirs(pdf_folder, exist_ok=True)

    pdf_files = []
    total = len(html_files)
    for index, html_file in enumerate(html_files, start=1):
        base_name = os.path.basename(html_file)
        pdf_file = os.path.join(pdf_folder, f"{index:04d}_{base_name}.pdf")
        print(f"Converting {base_name} to PDF... ({index}/{total})")

        # Best effort per file: one bad document must not abort the batch.
        try:
            html_content, encoding = _read_html(html_file)
            if html_content is None:
                print(f"  Skipping file due to encoding issues: {base_name}")
                continue
            print(f"  Successfully read file with {encoding} encoding")

            soup = BeautifulSoup(html_content, 'html.parser')
            # Script and stylesheet bodies are code, not readable text;
            # drop them so they cannot end up in the PDF.
            for tag in soup.find_all(['script', 'style']):
                tag.decompose()

            title = (base_name[:40] + "...") if len(base_name) > 40 else base_name
            _write_text_pdf(pdf_file, safe_text(title), soup.get_text())
            pdf_files.append(pdf_file)
            print(f"  Successfully converted {base_name}")
        except Exception as e:
            print(f"  Error converting {base_name}: {e}")

    if not pdf_files:
        print("Error: Failed to convert any HTML files to PDF.")
        return False

    print(f"Merging {len(pdf_files)} PDFs into a single file...")
    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
    merger = PdfMerger()
    successful_merges = 0
    # try/finally so the merger is closed on every exit path.
    try:
        for pdf_file in pdf_files:
            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
                continue
            try:
                merger.append(pdf_file)
                successful_merges += 1
            except Exception as e:
                print(f"Error appending {os.path.basename(pdf_file)}: {e}")

        if successful_merges == 0:
            print("Error: No PDFs could be merged.")
            return False

        merger.write(final_pdf_path)
    finally:
        merger.close()

    print(f"Successfully merged {successful_merges} PDFs")
    print(f"Final PDF created: {final_pdf_path}")
    print(f"Individual PDFs available in: {pdf_folder}")

    return True

 if __name__ == "__main__":
    # The first command-line argument names the folder; without one, prompt.
    folder_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else input("Enter the path to the folder containing HTML files: ")
    )
    convert_html_to_pdf(folder_path)
	import os
	import sys
	from PyPDF2 import PdfMerger
	from bs4 import BeautifulSoup
	from reportlab.pdfgen import canvas
	from reportlab.lib.pagesizes import A4

	# Replacement table used to reduce text to printable ASCII.
	CHAR_MAP = {
	    # Romanian letters (comma-below and legacy cedilla spellings)
	    'ă': 'a', 'Ă': 'A',
	    'â': 'a', 'Â': 'A',
	    'î': 'i', 'Î': 'I',
	    'ș': 's', 'Ș': 'S',
	    'ţ': 't', 'Ţ': 'T',
	    'ş': 's', 'Ş': 'S',
	    'ț': 't', 'Ț': 'T',

	    # Remaining characters from the entity section; the other entries
	    # there repeated the letters above with the same values
	    'ã': 'a', '”': '"', '“': '"', ' ': ' ',
	    '&': '&',

	    # Typographic punctuation written literally
	    '–': '-', '—': '--', '"': '"',
	    '…': '...', '•': '*', '´': "'",
	    '‚': ',', '„': '"',
	    '‹': '<', '›': '>', '«': '<<', '»': '>>',

	    # The same kind of punctuation spelled as escapes, plus no-break space,
	    # soft hyphen, multiplication sign and the byte-order mark
	    '\u2019': "'", '\u2018': "'", '\u201c': '"', '\u201d': '"',
	    '\u2013': '-', '\u2014': '--', '\u2022': '*', '\u00a0': ' ',
	    '\u00ad': '-', '\u00b4': "'", '\u00d7': 'x', '\ufeff': '',

	    # Invisible formatting characters are dropped; separators become a space
	    **{chr(cp): '' for cp in range(0x200B, 0x2010)},   # U+200B..U+200F
	    '\u2028': ' ', '\u2029': ' ',
	    **{chr(cp): '' for cp in range(0x202A, 0x202F)},   # U+202A..U+202E
	    '\u202f': ' ', '\u205f': ' ',
	    **{chr(cp): '' for cp in range(0x2060, 0x2065)},   # U+2060..U+2064
	    **{chr(cp): '' for cp in range(0x2066, 0x2070)},   # U+2066..U+206F

	    # Combining marks U+0300..U+034F are removed outright
	    **{chr(cp): '' for cp in range(0x0300, 0x0350)},

	    # Two further code points that are mapped to nothing
	    '\u1440': '', '\u1449': '',
	}

	def safe_char(c):
	    """Return a printable-ASCII stand-in for the single character ``c``.

	    Lookup order:
	      1. an explicit ``CHAR_MAP`` entry;
	      2. printable ASCII (codes 32-126) is returned unchanged;
	      3. any other whitespace (tab, newline, exotic spaces) becomes ``' '``
	         so it still separates words instead of printing as ``'?'``;
	      4. an accented letter missing from ``CHAR_MAP`` (e.g. ``'é'``) is
	         reduced to its ASCII base letter via canonical decomposition;
	      5. everything else becomes ``'?'``.

	    Raises:
	        TypeError: if ``c`` is not a single character and not a
	            ``CHAR_MAP`` key (raised by ``ord``).
	    """
	    if c in CHAR_MAP:
	        return CHAR_MAP[c]

	    code = ord(c)
	    if 32 <= code <= 126:
	        return c

	    if c.isspace():
	        return ' '

	    # Imported here, on the rare non-ASCII path, so this function does not
	    # depend on a module-level import being present.
	    import unicodedata

	    # NFD puts the base character first, followed by its combining marks.
	    # Only ASCII letters are accepted as a base so that symbols such as
	    # U+2260 (which decomposes to '=' plus an overlay) are not altered.
	    base = unicodedata.normalize('NFD', c)[0]
	    if base != c and ord(base) < 128 and base.isalpha():
	        return base

	    return '?'

	def safe_text(text):
	    """Run every character of ``text`` through ``safe_char`` and join the results."""
	    return ''.join(safe_char(c) for c in text)

	def _select_html_files(html_folder_path, names):
	    """Return sorted full paths of the entries in ``names`` that look like HTML files.

	    An entry qualifies when it is a regular file whose lower-cased name ends
	    in ``.html``/``.htm`` or contains ``vizualizare`` or ``html-articol=``.
	    """
	    selected = []
	    # os.listdir() returns entries in arbitrary order; sorting makes the
	    # numbering of the PDFs and the page order of the merge reproducible.
	    for name in sorted(names):
	        lowered = name.lower()
	        matches = (lowered.endswith(('.html', '.htm'))
	                   or 'vizualizare' in lowered
	                   or 'html-articol=' in lowered)
	        path = os.path.join(html_folder_path, name)
	        # A sub-directory can match the name patterns but cannot be read.
	        if matches and os.path.isfile(path):
	            selected.append(path)
	    return selected


	def _read_html(html_file):
	    """Return ``(content, encoding)`` for ``html_file``, or ``(None, None)``."""
	    # Strict UTF-8 first, then the Central European code pages. latin-1
	    # decodes any byte sequence, so it has to be the last resort: listed
	    # earlier it hides every encoding after it.
	    for encoding in ('utf-8', 'cp1250', 'iso-8859-2', 'latin-1'):
	        try:
	            with open(html_file, 'r', encoding=encoding) as f:
	                return f.read(), encoding
	        except UnicodeDecodeError:
	            continue
	    return None, None


	def _wrap_words(pdf, words, max_width, font_name, font_size):
	    """Greedily pack ``words`` into lines narrower than ``max_width`` points."""
	    wrapped = []
	    current = ""
	    for word in words:
	        candidate = f"{current} {word}" if current else word
	        if pdf.stringWidth(candidate, font_name, font_size) < max_width:
	            current = candidate
	            continue
	        # A single word wider than the page must not emit an empty line first.
	        if current:
	            wrapped.append(current)
	        current = word
	    if current:
	        wrapped.append(current)
	    return wrapped


	def _write_text_pdf(pdf_file, title, text):
	    """Write ``title`` and the word-wrapped lines of ``text`` to ``pdf_file`` (A4)."""
	    width, height = A4
	    margin = 50
	    top = height - margin
	    pdf = canvas.Canvas(pdf_file, pagesize=A4)

	    pdf.setFont("Helvetica", 14)
	    pdf.drawString(margin, top, title)
	    y = top - 30

	    pdf.setFont("Helvetica", 10)
	    for raw_line in text.split('\n'):
	        # Split on whitespace before sanitising, so tabs and other whitespace
	        # act as word separators instead of being rendered as '?'.
	        words = [w for w in (safe_text(part) for part in raw_line.split()) if w]
	        for out_line in _wrap_words(pdf, words, width - 2 * margin, "Helvetica", 10):
	            pdf.drawString(margin, y, out_line)
	            y -= 15
	            if y < margin:
	                # Start a new page and select the body font again.
	                pdf.showPage()
	                pdf.setFont("Helvetica", 10)
	                y = top
	    pdf.save()


	def convert_html_to_pdf(html_folder_path):
	    """
	    Convert every HTML file in a folder to a text-only PDF and merge the results.

	    Each matching file is parsed with BeautifulSoup, its text is reduced to
	    printable ASCII and written to ``<folder>/pdf_files/NNNN_<name>.pdf``.
	    The PDFs that were produced are then merged into ``<folder>/final.pdf``.
	    Progress and errors are reported with ``print``.

	    Args:
	        html_folder_path: Path to the folder containing HTML files

	    Returns:
	        True when ``final.pdf`` was written; False when the folder is missing
	        or unreadable, contains no HTML files, or nothing could be
	        converted or merged.
	    """
	    print(f"Starting HTML to PDF conversion in: {html_folder_path}")

	    if not os.path.isdir(html_folder_path):
	        print(f"Error: Directory '{html_folder_path}' does not exist.")
	        return False

	    try:
	        all_files = os.listdir(html_folder_path)
	    except OSError as e:
	        print(f"Error listing files: {e}")
	        return False
	    print(f"Found {len(all_files)} total files in directory")

	    html_files = _select_html_files(html_folder_path, all_files)
	    print(f"Found {len(html_files)} HTML files to convert")
	    if not html_files:
	        print(f"Error: No HTML files found in '{html_folder_path}'.")
	        return False

	    pdf_folder = os.path.join(html_folder_path, "pdf_files")
	    os.makedirs(pdf_folder, exist_ok=True)

	    pdf_files = []
	    total = len(html_files)
	    for index, html_file in enumerate(html_files, start=1):
	        base_name = os.path.basename(html_file)
	        pdf_file = os.path.join(pdf_folder, f"{index:04d}_{base_name}.pdf")
	        print(f"Converting {base_name} to PDF... ({index}/{total})")

	        # Best effort per file: one bad document must not abort the batch.
	        try:
	            html_content, encoding = _read_html(html_file)
	            if html_content is None:
	                print(f" Skipping file due to encoding issues: {base_name}")
	                continue
	            print(f" Successfully read file with {encoding} encoding")

	            soup = BeautifulSoup(html_content, 'html.parser')
	            # Script and stylesheet bodies are code, not readable text;
	            # drop them so they cannot end up in the PDF.
	            for tag in soup.find_all(['script', 'style']):
	                tag.decompose()

	            title = (base_name[:40] + "...") if len(base_name) > 40 else base_name
	            _write_text_pdf(pdf_file, safe_text(title), soup.get_text())
	            pdf_files.append(pdf_file)
	            print(f" Successfully converted {base_name}")
	        except Exception as e:
	            print(f" Error converting {base_name}: {e}")

	    if not pdf_files:
	        print("Error: Failed to convert any HTML files to PDF.")
	        return False

	    print(f"Merging {len(pdf_files)} PDFs into a single file...")
	    final_pdf_path = os.path.join(html_folder_path, "final.pdf")
	    merger = PdfMerger()
	    successful_merges = 0
	    # try/finally so the merger is closed on every exit path.
	    try:
	        for pdf_file in pdf_files:
	            if not (os.path.exists(pdf_file) and os.path.getsize(pdf_file) > 0):
	                continue
	            try:
	                merger.append(pdf_file)
	                successful_merges += 1
	            except Exception as e:
	                print(f"Error appending {os.path.basename(pdf_file)}: {e}")

	        if successful_merges == 0:
	            print("Error: No PDFs could be merged.")
	            return False

	        merger.write(final_pdf_path)
	    finally:
	        merger.close()

	    print(f"Successfully merged {successful_merges} PDFs")
	    print(f"Final PDF created: {final_pdf_path}")
	    print(f"Individual PDFs available in: {pdf_folder}")

	    return True

	if __name__ == "__main__":
	    # The first command-line argument names the folder; without one, prompt.
	    if len(sys.argv) > 1:
	        folder_path = sys.argv[1]
	    else:
	        folder_path = input("Enter the path to the folder containing HTML files: ")

	    convert_html_to_pdf(folder_path)