Created
February 11, 2025 14:07
-
-
Save jkbjh/306b5de65f190556bd74e1827b12ebfe to your computer and use it in GitHub Desktop.
Download all PDFs from a page and check PDF file statistics: pages, word count, characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import tempfile | |
import subprocess | |
import fitz # PyMuPDF for reading PDFs | |
import re | |
import statistics | |
import argparse | |
def download_pdfs(url: str, download_folder: str):
    """Download all PDF files reachable from a URL into a folder using wget.

    The folder is created if it does not exist. wget is invoked recursively,
    restricted to the URL's own subtree (``--no-parent``) and to files
    accepted by the ``pdf`` suffix filter.

    Args:
        url: Page URL to crawl for PDFs.
        download_folder: Directory that wget uses as its directory prefix.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status.
        FileNotFoundError: If the ``wget`` executable cannot be found.
    """
    os.makedirs(download_folder, exist_ok=True)
    # Pass an argument list and no shell so that metacharacters or spaces in
    # the URL or folder name are never interpreted by a shell.
    wget_command = [
        "wget",
        "--recursive",
        "--no-parent",
        "--accept",
        "pdf",
        f"--directory-prefix={download_folder}",
        "--",  # end of options: a URL starting with "-" is not parsed as a flag
        url,
    ]
    print("Downloading PDFs...")
    subprocess.run(wget_command, check=True)
    print("Download complete.")
def extract_pdf_statistics(pdf_file: str):
    """Extract page, word and character counts from a PDF file.

    Words are counted as matches of the regex ``\\w+`` in each page's
    extracted text; characters are the length of that extracted text.

    Args:
        pdf_file: Path to the PDF file to open.

    Returns:
        A tuple ``(num_pages, num_words, num_characters)``. If anything goes
        wrong while opening or reading the file, the error is printed and
        ``(0, 0, 0)`` is returned (deliberate best-effort behaviour).
    """
    try:
        doc = fitz.open(pdf_file)
        try:
            num_pages = doc.page_count
            num_words = 0
            num_characters = 0
            for page_num in range(num_pages):
                text = doc.load_page(page_num).get_text()
                num_words += len(re.findall(r"\w+", text))
                num_characters += len(text)
        finally:
            # Close the document even when a page fails to load or extract,
            # so a bad PDF does not leak the open file handle.
            doc.close()
        return num_pages, num_words, num_characters
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        return 0, 0, 0
def compute_statistics(data):
    """Compute deciles, median, mean and sample standard deviation.

    Args:
        data: Iterable of numbers (it is materialised into a list once).

    Returns:
        A dict with keys ``"quantiles"`` (the 9 cut points that split the
        data into 10 equal-probability groups), ``"median"``, ``"mean"`` and
        ``"stdev"``. For empty input every value is zero. For a single data
        point the quantiles are that value repeated and ``stdev`` is 0.
    """
    n_groups = 10
    n_cuts = n_groups - 1  # n groups are separated by n - 1 cut points
    values = list(data)

    if not values:
        return {"quantiles": [0] * n_cuts, "median": 0, "mean": 0, "stdev": 0}

    if len(values) == 1:
        # statistics.quantiles() raises StatisticsError for a single data
        # point on Python < 3.13; newer versions return the point repeated,
        # so do the same here on every version.
        quantiles = values * n_cuts
    else:
        quantiles = statistics.quantiles(values, n=n_groups)

    return {
        "quantiles": quantiles,
        "median": statistics.median(values),
        "mean": statistics.mean(values),
        # Sample standard deviation needs at least two points.
        "stdev": statistics.stdev(values) if len(values) > 1 else 0,
    }
def process_single_pdf(file_path: str):
    """Print the page, word and character counts of one PDF file.

    The counts come from ``extract_pdf_statistics(file_path)``.
    """
    counts = extract_pdf_statistics(file_path)
    labels = ("Number of Pages", "Number of Words", "Number of Characters")

    print("\nSingle PDF Statistics:")
    for label, count in zip(labels, counts):
        print(f"{label}: {count}")
def process_pdfs_from_url(url: str):
    """Download every PDF found under a URL and print aggregate statistics.

    The PDFs are fetched into a temporary directory, each one is measured
    with ``extract_pdf_statistics``, and the per-file page, word and
    character counts are summarised with ``compute_statistics``.
    """
    with tempfile.TemporaryDirectory() as download_folder:
        # Fetch everything first; the walk below only sees finished files.
        download_pdfs(url, download_folder)

        page_counts = []
        word_counts = []
        char_counts = []

        for dirpath, _subdirs, filenames in os.walk(download_folder):
            for filename in filenames:
                if not filename.lower().endswith(".pdf"):
                    continue
                n_pages, n_words, n_chars = extract_pdf_statistics(
                    os.path.join(dirpath, filename)
                )
                page_counts.append(n_pages)
                word_counts.append(n_words)
                char_counts.append(n_chars)

        # Report one summary per measured quantity.
        print("Statistics for Pages:")
        print(compute_statistics(page_counts))
        print("\nStatistics for Words:")
        print(compute_statistics(word_counts))
        print("\nStatistics for Characters:")
        print(compute_statistics(char_counts))
def main():
    """Parse command-line options and run the requested processing.

    Accepts ``-u/--url`` and/or ``-f/--file``. With neither given, exits
    through ``parser.error``. When both are given, the URL is processed
    first and the single file second.
    """
    cli = argparse.ArgumentParser(
        description="Process PDFs from a URL or a single file and compute statistics."
    )
    cli.add_argument("-u", "--url", help="URL to download and process PDFs from.")
    cli.add_argument("-f", "--file", help="Path to a single PDF file to process.")
    options = cli.parse_args()

    # At least one of the two inputs is required.
    if not (options.url or options.file):
        cli.error("No action requested. Provide --url or --file or both.")

    if options.url:
        print(f"\nProcessing PDFs from URL: {options.url}")
        process_pdfs_from_url(options.url)

    if options.file:
        print(f"\nProcessing Single PDF: {options.file}")
        process_single_pdf(options.file)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment