jkbjh · February 11, 2025 14:07
diff --git a/pdfstatistics.py b/pdfstatistics.py
 #!/usr/bin/env python3
 import os
 import tempfile
 import subprocess
 import fitz  # PyMuPDF for reading PDFs
 import re
 import statistics
 import argparse


 def download_pdfs(url: str, download_folder: str):
    """Download the PDF files reachable from ``url`` into ``download_folder`` using wget.

    The folder is created when it does not exist yet. wget is invoked with
    ``--recursive --no-parent --accept pdf`` and the folder is passed as
    ``--directory-prefix``.

    Args:
        url: Starting URL handed to wget.
        download_folder: Directory that receives the downloaded files.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status.
        FileNotFoundError: If the ``wget`` executable cannot be found.
    """
    os.makedirs(download_folder, exist_ok=True)

    # Build an argument list and run it without a shell: URLs routinely
    # contain shell metacharacters ("&", "?", ";") and the folder may contain
    # spaces, both of which would be mangled (or executed) by a shell string.
    wget_command = [
        "wget",
        "--recursive",
        "--no-parent",
        "--accept",
        "pdf",
        f"--directory-prefix={download_folder}",
        url,
    ]

    print("Downloading PDFs...")
    subprocess.run(wget_command, check=True)
    print("Download complete.")


 def extract_pdf_statistics(pdf_file: str):
    """Count the pages, words and characters of a PDF file.

    A word is a maximal run of regex word characters in the text extracted
    from each page; the character count is the total length of that text.

    Args:
        pdf_file: Path of the PDF to open.

    Returns:
        A ``(num_pages, num_words, num_characters)`` tuple. When the file
        cannot be opened or read, the error is printed and ``(0, 0, 0)`` is
        returned instead of raising.
    """
    try:
        doc = fitz.open(pdf_file)
        try:
            num_pages = doc.page_count
            num_words = 0
            num_characters = 0

            for page_num in range(num_pages):
                text = doc.load_page(page_num).get_text()
                num_words += len(re.findall(r"\w+", text))
                num_characters += len(text)
        finally:
            # Release the document even when a page fails to load or extract;
            # previously it stayed open on any error inside the loop.
            doc.close()

        return num_pages, num_words, num_characters

    except Exception as e:
        # Best effort: one unreadable file must not abort a whole batch.
        print(f"Error processing {pdf_file}: {e}")
        return 0, 0, 0


 def compute_statistics(data):
    """Summarise a collection of numbers.

    Args:
        data: Sized collection of numeric values; may be empty.

    Returns:
        A dict with the keys ``"quantiles"`` (decile cut points, ``n=10``),
        ``"median"``, ``"mean"`` and ``"stdev"`` (sample standard deviation,
        0 when fewer than two values are given). Empty input yields zero
        placeholders for every key.
    """
    if not data:
        # NOTE: the placeholder holds 10 zeros although non-empty input yields
        # 9 cut points; kept as-is so existing callers see the same shape.
        return {"quantiles": [0] * 10, "median": 0, "mean": 0, "stdev": 0}

    if len(data) > 1:
        quantiles = statistics.quantiles(data, n=10)
        stdev_val = statistics.stdev(data)
    else:
        # statistics.quantiles() raises StatisticsError for a single data
        # point before Python 3.13; mirror the 3.13+ result (the lone value
        # repeated for every cut point) so one PDF no longer crashes the run.
        quantiles = list(data) * 9
        stdev_val = 0

    return {
        "quantiles": quantiles,
        "median": statistics.median(data),
        "mean": statistics.mean(data),
        "stdev": stdev_val,
    }


 def process_single_pdf(file_path: str):
    """Print the page, word and character counts of one PDF.

    Args:
        file_path: Path of the PDF handed to ``extract_pdf_statistics``.
    """
    page_count, word_count, char_count = extract_pdf_statistics(file_path)

    # Assemble the report first, then emit it line by line.
    report_lines = (
        "\nSingle PDF Statistics:",
        f"Number of Pages: {page_count}",
        f"Number of Words: {word_count}",
        f"Number of Characters: {char_count}",
    )
    for line in report_lines:
        print(line)


 def process_pdfs_from_url(url: str):
    """Download the PDFs found under ``url`` and print aggregate statistics.

    Files are fetched into a temporary directory that is removed when the
    function finishes. Every file whose name ends in ``.pdf`` (case
    insensitive) is measured, and the summaries for pages, words and
    characters are printed in that order.

    Args:
        url: Starting URL passed to ``download_pdfs``.
    """
    with tempfile.TemporaryDirectory() as download_folder:
        download_pdfs(url, download_folder)

        # One (pages, words, characters) tuple per downloaded PDF.
        per_file = []
        for dirpath, _, names in os.walk(download_folder):
            for name in names:
                if not name.lower().endswith(".pdf"):
                    continue
                per_file.append(extract_pdf_statistics(os.path.join(dirpath, name)))

        # Column order of the tuples matches the heading order below.
        headings = (
            "Statistics for Pages:",
            "\nStatistics for Words:",
            "\nStatistics for Characters:",
        )
        for column, heading in enumerate(headings):
            print(heading)
            print(compute_statistics([entry[column] for entry in per_file]))


 def main():
    """Parse the command line and run the requested processing.

    Accepts ``-u/--url`` and ``-f/--file``; at least one must be given,
    otherwise argparse reports an error and exits. When both are present the
    URL is processed first, then the single file.
    """
    parser = argparse.ArgumentParser(
        description="Process PDFs from a URL or a single file and compute statistics."
    )
    option_specs = (
        ("-u", "--url", "URL to download and process PDFs from."),
        ("-f", "--file", "Path to a single PDF file to process."),
    )
    for short_flag, long_flag, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, help=help_text)

    args = parser.parse_args()

    # Guard clause: nothing to do without at least one of the two options.
    if not (args.url or args.file):
        parser.error("No action requested. Provide --url or --file or both.")

    if args.url:
        print(f"\nProcessing PDFs from URL: {args.url}")
        process_pdfs_from_url(args.url)

    if args.file:
        print(f"\nProcessing Single PDF: {args.file}")
        process_single_pdf(args.file)


 # Run the command-line interface only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	import os
	import tempfile
	import subprocess
	import fitz # PyMuPDF for reading PDFs
	import re
	import statistics
	import argparse


	def download_pdfs(url: str, download_folder: str):
	    """Download the PDF files reachable from ``url`` into ``download_folder`` using wget.

	    The folder is created when it does not exist yet. wget is invoked with
	    ``--recursive --no-parent --accept pdf`` and the folder is passed as
	    ``--directory-prefix``.

	    Args:
	        url: Starting URL handed to wget.
	        download_folder: Directory that receives the downloaded files.

	    Raises:
	        subprocess.CalledProcessError: If wget exits with a non-zero status.
	        FileNotFoundError: If the ``wget`` executable cannot be found.
	    """
	    os.makedirs(download_folder, exist_ok=True)

	    # Run wget from an argument list without a shell so that URLs holding
	    # "&", "?" or ";" and folders holding spaces reach wget unchanged.
	    wget_command = [
	        "wget",
	        "--recursive",
	        "--no-parent",
	        "--accept",
	        "pdf",
	        f"--directory-prefix={download_folder}",
	        url,
	    ]

	    print("Downloading PDFs...")
	    subprocess.run(wget_command, check=True)
	    print("Download complete.")


	def extract_pdf_statistics(pdf_file: str):
	    """Count the pages, words and characters of a PDF file.

	    A word is a maximal run of regex word characters in the text extracted
	    from each page; the character count is the total length of that text.

	    Args:
	        pdf_file: Path of the PDF to open.

	    Returns:
	        A ``(num_pages, num_words, num_characters)`` tuple. When the file
	        cannot be opened or read, the error is printed and ``(0, 0, 0)`` is
	        returned instead of raising.
	    """
	    try:
	        doc = fitz.open(pdf_file)
	        try:
	            num_pages = doc.page_count
	            num_words = 0
	            num_characters = 0

	            for page_num in range(num_pages):
	                page = doc.load_page(page_num)
	                text = page.get_text()
	                words = re.findall(r"\w+", text)
	                num_words += len(words)
	                num_characters += len(text)
	        finally:
	            # Close the document on every path, including page errors.
	            doc.close()

	        return num_pages, num_words, num_characters

	    except Exception as e:
	        # Best effort: one unreadable file must not abort a whole batch.
	        print(f"Error processing {pdf_file}: {e}")
	        return 0, 0, 0


	def compute_statistics(data):
	    """Summarise a collection of numbers.

	    Args:
	        data: Sized collection of numeric values; may be empty.

	    Returns:
	        A dict with the keys ``"quantiles"`` (decile cut points, ``n=10``),
	        ``"median"``, ``"mean"`` and ``"stdev"`` (sample standard deviation,
	        0 when fewer than two values are given). Empty input yields zero
	        placeholders for every key.
	    """
	    if not data:
	        # NOTE: 10 zeros here versus 9 cut points for non-empty input; the
	        # placeholder shape is kept unchanged for existing callers.
	        return {"quantiles": [0] * 10, "median": 0, "mean": 0, "stdev": 0}

	    if len(data) > 1:
	        quantiles = statistics.quantiles(data, n=10)
	        stdev_val = statistics.stdev(data)
	    else:
	        # A single data point makes statistics.quantiles() raise before
	        # Python 3.13; repeat the lone value per cut point as 3.13+ does.
	        quantiles = list(data) * 9
	        stdev_val = 0

	    median_val = statistics.median(data)
	    mean_val = statistics.mean(data)

	    return {
	        "quantiles": quantiles,
	        "median": median_val,
	        "mean": mean_val,
	        "stdev": stdev_val,
	    }


	def process_single_pdf(file_path: str):
	    """Print the page, word and character counts of one PDF.

	    Args:
	        file_path: Path of the PDF handed to ``extract_pdf_statistics``.
	    """
	    pages, words, characters = extract_pdf_statistics(file_path)
	    print("\nSingle PDF Statistics:")
	    print(f"Number of Pages: {pages}")
	    print(f"Number of Words: {words}")
	    print(f"Number of Characters: {characters}")


	def process_pdfs_from_url(url: str):
	    """Download the PDFs found under ``url`` and print aggregate statistics.

	    Files are fetched into a temporary directory that is removed when the
	    function finishes. Every file whose name ends in ``.pdf`` (case
	    insensitive) is measured, and the summaries for pages, words and
	    characters are printed in that order.

	    Args:
	        url: Starting URL passed to ``download_pdfs``.
	    """
	    with tempfile.TemporaryDirectory() as download_folder:
	        # Download all PDFs
	        download_pdfs(url, download_folder)

	        # Collect statistics from each PDF
	        pages_stats = []
	        words_stats = []
	        characters_stats = []

	        for root, _, files in os.walk(download_folder):
	            for file in files:
	                if file.lower().endswith(".pdf"):
	                    pdf_path = os.path.join(root, file)
	                    pages, words, characters = extract_pdf_statistics(pdf_path)
	                    pages_stats.append(pages)
	                    words_stats.append(words)
	                    characters_stats.append(characters)

	        # Compute and display aggregate statistics
	        print("Statistics for Pages:")
	        print(compute_statistics(pages_stats))

	        print("\nStatistics for Words:")
	        print(compute_statistics(words_stats))

	        print("\nStatistics for Characters:")
	        print(compute_statistics(characters_stats))


	def main():
	    """Parse the command line and run the requested processing.

	    Accepts ``-u/--url`` and ``-f/--file``; at least one must be given,
	    otherwise argparse reports an error and exits. When both are present the
	    URL is processed first, then the single file.
	    """
	    parser = argparse.ArgumentParser(
	        description="Process PDFs from a URL or a single file and compute statistics."
	    )
	    parser.add_argument("-u", "--url", help="URL to download and process PDFs from.")
	    parser.add_argument("-f", "--file", help="Path to a single PDF file to process.")

	    args = parser.parse_args()

	    # Nothing to do without at least one of the two options.
	    if not args.url and not args.file:
	        parser.error("No action requested. Provide --url or --file or both.")

	    if args.url:
	        print(f"\nProcessing PDFs from URL: {args.url}")
	        process_pdfs_from_url(args.url)

	    if args.file:
	        print(f"\nProcessing Single PDF: {args.file}")
	        process_single_pdf(args.file)


	# Run the command-line interface only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()