Skip to content

Instantly share code, notes, and snippets.

@jkbjh
Created February 11, 2025 14:07
Show Gist options
  • Save jkbjh/306b5de65f190556bd74e1827b12ebfe to your computer and use it in GitHub Desktop.
Save jkbjh/306b5de65f190556bd74e1827b12ebfe to your computer and use it in GitHub Desktop.
Download all PDFs from a page / check PDF file statistics -- pages, word count, characters
#!/usr/bin/env python3
import os
import tempfile
import subprocess
import fitz # PyMuPDF for reading PDFs
import re
import statistics
import argparse
def download_pdfs(url: str, download_folder: str):
    """Download all PDF files reachable from a URL using ``wget``.

    ``wget`` is run recursively below *url* without ascending to the parent
    directory, keeping only files accepted by the ``pdf`` suffix filter.

    Args:
        url: URL to start the recursive download from.
        download_folder: Directory that receives the downloaded tree; it is
            created if it does not exist yet.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with a non-zero
            status.
        FileNotFoundError: If the ``wget`` executable cannot be found.
    """
    os.makedirs(download_folder, exist_ok=True)
    # Pass an argument list rather than a shell string: characters such as
    # "&", ";", quotes or spaces in the URL or folder name are then handed to
    # wget verbatim instead of being interpreted by a shell.
    wget_command = [
        "wget",
        "--recursive",
        "--no-parent",
        "--accept",
        "pdf",
        f"--directory-prefix={download_folder}",
        "--",  # end of options, so a URL starting with "-" is not an option
        url,
    ]
    print("Downloading PDFs...")
    subprocess.run(wget_command, check=True)
    print("Download complete.")
def extract_pdf_statistics(pdf_file: str):
    """Extract page, word, and character counts from a PDF file.

    Words are counted as matches of the regex ``\\w+`` in the text extracted
    from each page; characters are the length of that extracted text.

    Args:
        pdf_file: Path of the PDF file to open.

    Returns:
        A tuple ``(num_pages, num_words, num_characters)``. If the file
        cannot be opened or processed, the error is printed and
        ``(0, 0, 0)`` is returned instead of raising.
    """
    try:
        # The context manager closes the document even when text extraction
        # fails part-way through, so the file handle is never leaked.
        with fitz.open(pdf_file) as doc:
            num_pages = doc.page_count
            num_words = 0
            num_characters = 0
            for page_num in range(num_pages):
                text = doc.load_page(page_num).get_text()
                num_words += len(re.findall(r"\w+", text))
                num_characters += len(text)
        return num_pages, num_words, num_characters
    except Exception as e:
        # Deliberately best-effort: one unreadable PDF must not abort a batch.
        print(f"Error processing {pdf_file}: {e}")
        return 0, 0, 0
def compute_statistics(data):
    """Compute decile cut points, median, mean, and sample standard deviation.

    Args:
        data: A sequence of numbers; may be empty.

    Returns:
        A dict with the keys ``"quantiles"`` (the 9 cut points that split the
        data into 10 groups), ``"median"``, ``"mean"`` and ``"stdev"``.
        For empty input every value is zero. For a single data point the
        quantile list repeats that value and ``"stdev"`` is 0.
    """
    num_groups = 10
    num_cut_points = num_groups - 1  # n groups are separated by n - 1 cuts

    if not data:
        return {
            "quantiles": [0] * num_cut_points,
            "median": 0,
            "mean": 0,
            "stdev": 0,
        }

    if len(data) > 1:
        quantiles = statistics.quantiles(data, n=num_groups)
        stdev_val = statistics.stdev(data)
    else:
        # statistics.quantiles() rejects a single data point before Python
        # 3.13; repeating the lone value matches what 3.13+ returns.
        quantiles = [data[0]] * num_cut_points
        stdev_val = 0

    return {
        "quantiles": quantiles,
        "median": statistics.median(data),
        "mean": statistics.mean(data),
        "stdev": stdev_val,
    }
def process_single_pdf(file_path: str):
    """Print the page, word, and character counts of one PDF file.

    Args:
        file_path: Path of the PDF file to analyse.
    """
    pages, words, characters = extract_pdf_statistics(file_path)
    report_lines = (
        "\nSingle PDF Statistics:",
        f"Number of Pages: {pages}",
        f"Number of Words: {words}",
        f"Number of Characters: {characters}",
    )
    for line in report_lines:
        print(line)
def process_pdfs_from_url(url: str):
    """Download the PDFs found under a URL and print aggregate statistics.

    The files are fetched into a temporary directory, each file ending in
    ".pdf" (case-insensitive) is analysed, and summary statistics for page,
    word, and character counts are printed.

    Args:
        url: URL handed to ``download_pdfs``.
    """
    with tempfile.TemporaryDirectory() as download_folder:
        download_pdfs(url, download_folder)

        # One (pages, words, characters) tuple per PDF, in os.walk order.
        per_file = [
            extract_pdf_statistics(os.path.join(root, file))
            for root, _, files in os.walk(download_folder)
            for file in files
            if file.lower().endswith(".pdf")
        ]
        pages_stats = [entry[0] for entry in per_file]
        words_stats = [entry[1] for entry in per_file]
        characters_stats = [entry[2] for entry in per_file]

        # Heading followed by the summary dict, one section per metric.
        sections = (
            ("Statistics for Pages:", pages_stats),
            ("\nStatistics for Words:", words_stats),
            ("\nStatistics for Characters:", characters_stats),
        )
        for heading, values in sections:
            print(heading)
            print(compute_statistics(values))
def main():
    """Parse command-line options and run the requested processing.

    ``--url`` downloads and analyses every PDF found under a URL, ``--file``
    analyses one local PDF. Both may be given; at least one is required,
    otherwise the parser reports an error and exits.
    """
    cli = argparse.ArgumentParser(
        description="Process PDFs from a URL or a single file and compute statistics."
    )
    cli.add_argument("-u", "--url", help="URL to download and process PDFs from.")
    cli.add_argument("-f", "--file", help="Path to a single PDF file to process.")
    options = cli.parse_args()

    # Guard clause: nothing to do without at least one of the two options.
    if not (options.url or options.file):
        cli.error("No action requested. Provide --url or --file or both.")

    if options.url:
        print(f"\nProcessing PDFs from URL: {options.url}")
        process_pdfs_from_url(options.url)

    if options.file:
        print(f"\nProcessing Single PDF: {options.file}")
        process_single_pdf(options.file)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment