PolicyEngine-US Source Document Reference Analysis
#!/usr/bin/env python3
"""
PolicyEngine-US Source Document Reference Analysis

This script analyzes the number of distinct source document references
in the PolicyEngine-US codebase, both with and without page number fragments.
"""

import os
import re
from pathlib import Path
from collections import defaultdict

import yaml
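
# PyYAML ("pip install pyyaml") is the only third-party dependency here; the
# other imports come from the standard library.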


def extract_urls_from_text(text):
    """Extract URLs from text content."""
    # Match URLs starting with http/https
    url_pattern = r'https?://[^\s<>"\']+'
    urls = re.findall(url_pattern, text)
    return urls
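# Note on the pattern above: it only stops at whitespace, quotes, and angle
# brackets, so trailing punctuation such as ")" or "," stays attached to a match
# and the same document can appear under slightly different strings; the counts
# reported below are therefore approximate.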


def extract_legal_citations(text):
    """Extract legal citations like USC, CFR, state codes."""
    citations = []
    # USC citations (e.g., "26 USC 32", "26 U.S.C. § 32")
    usc_patterns = [
        r'\d+\s+U\.?S\.?C\.?\s+§?\s*\d+',
        r'\d+\s+USC\s+\d+',
    ]
    # CFR citations (e.g., "26 CFR 1.32-1")
    cfr_pattern = r'\d+\s+C\.?F\.?R\.?\s+[\d\.\-]+'
    # State code patterns
    state_patterns = [
        r'\b[A-Z]{2,}\s+Rev\.?\s+Code\s+[\d\.\-]+',  # e.g., "WA Rev Code 123"
        r'\b[A-Z]{2,}\s+Stat\.?\s+[\d\.\-]+',  # e.g., "CA Stat 123"
    ]
    for pattern in usc_patterns + [cfr_pattern] + state_patterns:
        citations.extend(re.findall(pattern, text, re.IGNORECASE))
    return citations
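# For illustration, on a string like "See 26 U.S.C. § 32 and 26 CFR 1.32-2" the
# function above returns ["26 U.S.C. § 32", "26 CFR 1.32-2"].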


def remove_page_fragment(url):
    """Remove #page=X fragments from URLs."""
    # Remove everything after # for page fragments
    if '#page=' in url:
        return url.split('#page=')[0]
    return url
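# For illustration (URL chosen only as an example):
#   remove_page_fragment("https://www.irs.gov/pub/irs-pdf/p596.pdf#page=3")
#   -> "https://www.irs.gov/pub/irs-pdf/p596.pdf"
# URLs without a "#page=" fragment pass through unchanged.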


def main():
    # Set the base path to the PolicyEngine-US repository
    BASE_PATH = Path("/Users/maxghenis/PolicyEngine/policyengine-us")
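    # NOTE: this path is hard-coded to the author's local checkout; point it at
    # your own policyengine-us clone (or read it from an environment variable)
    # if you want to rerun the analysis.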

    # Verify the path exists
    if not BASE_PATH.exists():
        print(f"Error: Path {BASE_PATH} does not exist")
        return

    print(f"Analyzing PolicyEngine-US repository at: {BASE_PATH}")

    # Initialize counters
    all_references_with_pages = set()
    all_references_without_pages = set()
    url_references = set()
    legal_citations = set()
    pdf_references = set()
    gov_urls = set()

    # Statistics
    files_processed = 0
    files_with_references = 0

    # Process files in the parameters/ and variables/ directories
    target_dirs = ['policyengine_us/parameters', 'policyengine_us/variables']

    for target_dir in target_dirs:
        dir_path = BASE_PATH / target_dir
        if not dir_path.exists():
            print(f"Warning: {dir_path} does not exist")
            continue

        print(f"\nProcessing {target_dir}...")

        # Process all Python and YAML files
        for file_path in dir_path.rglob('*'):
            if file_path.is_file() and (file_path.suffix in ['.py', '.yaml', '.yml']):
                files_processed += 1
                file_has_references = False

                try:
                    content = file_path.read_text(encoding='utf-8', errors='ignore')

                    # Extract URLs
                    urls = extract_urls_from_text(content)
                    for url in urls:
                        file_has_references = True
                        all_references_with_pages.add(url)

                        # Track PDFs specifically
                        if '.pdf' in url.lower():
                            pdf_references.add(url)

                        # Track .gov URLs
                        if '.gov' in url:
                            gov_urls.add(url)

                        # Add version without page numbers
                        url_without_page = remove_page_fragment(url)
                        all_references_without_pages.add(url_without_page)
                        url_references.add(url_without_page)

                    # Extract legal citations
                    citations = extract_legal_citations(content)
                    for citation in citations:
                        file_has_references = True
                        all_references_with_pages.add(citation)
                        all_references_without_pages.add(citation)
                        legal_citations.add(citation)

                    # For YAML files, also check for 'reference', 'source', 'documentation' fields
                    if file_path.suffix in ['.yaml', '.yml']:
                        try:
                            data = yaml.safe_load(content)
                            if data and isinstance(data, dict):
                                # Recursively search for reference fields
                                def search_references(obj, path=""):
                                    # Update the flag in main()'s scope, not a new local
                                    nonlocal file_has_references
                                    if isinstance(obj, dict):
                                        for key, value in obj.items():
                                            if key in ['reference', 'references', 'source', 'documentation', 'href', 'url']:
                                                if isinstance(value, str):
                                                    if value.startswith('http'):
                                                        all_references_with_pages.add(value)
                                                        all_references_without_pages.add(remove_page_fragment(value))
                                                        file_has_references = True
                                                        if '.pdf' in value.lower():
                                                            pdf_references.add(value)
                                                        if '.gov' in value:
                                                            gov_urls.add(value)
                                                elif isinstance(value, list):
                                                    for item in value:
                                                        if isinstance(item, dict) and 'href' in item:
                                                            href = item['href']
                                                            all_references_with_pages.add(href)
                                                            all_references_without_pages.add(remove_page_fragment(href))
                                                            file_has_references = True
                                                            if '.pdf' in href.lower():
                                                                pdf_references.add(href)
                                                            if '.gov' in href:
                                                                gov_urls.add(href)
                                            else:
                                                search_references(value, f"{path}.{key}")
                                    elif isinstance(obj, list):
                                        for item in obj:
                                            search_references(item, path)

                                search_references(data)
                        except yaml.YAMLError:
                            pass  # Skip invalid YAML files

                    if file_has_references:
                        files_with_references += 1

                except Exception:
                    # Skip files that can't be read
                    pass
print(f"\nProcessed {files_processed} files")
print(f"Files with references: {files_with_references}")
# Analyze PDF references to understand page fragmentation
pdfs_with_pages = [ref for ref in pdf_references if '#page=' in ref]
pdfs_without_pages = [ref for ref in pdf_references if '#page=' not in ref]
unique_pdf_bases = set(remove_page_fragment(ref) for ref in pdf_references)
print("\nPDF Reference Analysis:")
print(f" Total PDF references: {len(pdf_references):,}")
print(f" PDFs with page fragments: {len(pdfs_with_pages):,}")
print(f" PDFs without page fragments: {len(pdfs_without_pages):,}")
print(f" Unique PDF documents (base URLs): {len(unique_pdf_bases):,}")
if len(unique_pdf_bases) > 0:
print(f" Average pages per PDF: {len(pdfs_with_pages) / len(unique_pdf_bases):.1f}")

    # Final summary
    print("\n" + "=" * 60)
    print("POLICYENGINE-US SOURCE REFERENCE SUMMARY")
    print("=" * 60)

    print("\n📊 TOTAL DISTINCT REFERENCES:")
    print(f"  WITH page fragments: {len(all_references_with_pages):,}")
    print(f"  WITHOUT page fragments: {len(all_references_without_pages):,}")
    if len(all_references_without_pages) > 0:
        print(f"  Reduction factor: {len(all_references_with_pages) / len(all_references_without_pages):.2f}x")

    print("\n📑 BREAKDOWN BY TYPE:")
    print(f"  Government URLs (.gov): {len(gov_urls):,}")
    print(f"  PDF documents: {len(pdf_references):,}")
    print(f"  Legal citations: {len(legal_citations):,}")
    # Compare on base URLs so .gov links that carry page fragments are excluded too
    gov_url_bases = {remove_page_fragment(u) for u in gov_urls}
    print(f"  Other URLs: {len(url_references - gov_url_bases):,}")

    print("\n📁 FILE STATISTICS:")
    print(f"  Files processed: {files_processed:,}")
    print(f"  Files with references: {files_with_references:,}")
    if files_processed > 0:
        print(f"  Coverage: {files_with_references / files_processed * 100:.1f}%")

    # Show some examples
    print("\n" + "=" * 60)
    print("EXAMPLE REFERENCES")
    print("=" * 60)

    print("\n🔗 Sample URLs with page fragments:")
    for ref in list(pdfs_with_pages)[:5]:
        print(f"  • {ref}")

    print("\n📜 Sample legal citations:")
    for ref in list(legal_citations)[:5]:
        print(f"  • {ref}")

    print("\n🏛️ Sample government URLs:")
    count = 0
    for ref in list(gov_urls):
        if '.pdf' not in ref:  # Show non-PDF gov URLs
            print(f"  • {ref}")
            count += 1
            if count >= 5:
                break


if __name__ == "__main__":
    main()
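
# To run locally (assumes PyYAML is installed and BASE_PATH above points at a
# policyengine-us checkout):
#   python3 analyze_references.py  # script filename is illustrative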