Skip to content

Instantly share code, notes, and snippets.

@bigsnarfdude
Created June 6, 2025 03:53
Show Gist options
  • Save bigsnarfdude/a4a5ce17e56ac77a13fee45e3501848a to your computer and use it in GitHub Desktop.
Save bigsnarfdude/a4a5ce17e56ac77a13fee45e3501848a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Fixed CSV Country Counter - Properly extracts clean country names
Handles complex entries like: "United States (Note: This is a corporation, not a university)"
"""
import csv
import re
from collections import Counter
def extract_clean_country(country_field):
    """
    Extract a clean country name from a free-form country field.

    Carriage returns and asterisks are removed, and only the text before
    the first "(" is kept, so trailing parenthetical notes are dropped.

    Examples:
        "United States (Likely a private educational company)" -> "United States"
        "Canada (School District - classification is approximate)" -> "Canada"
        "France *(Note: CNRS is a research organization)*" -> "France"

    Args:
        country_field: Raw text of the country column; may be empty or None.

    Returns:
        The cleaned country name, or "Unknown" / "Not Applicable" for those
        placeholder values. None is returned when the field is empty, holds
        nothing before its first "(", contains one of the known non-country
        phrases, or is not between 2 and 50 characters long.
    """
    if not country_field:
        return None

    # Remove carriage returns, asterisks and surrounding whitespace.
    country = country_field.strip().replace('\r', '').replace('*', '').strip()

    # "Country Name (any notes)" -> "Country Name".
    # partition() yields an empty name when the field *starts* with "(",
    # so a note-only field is rejected by the length check below instead of
    # being reported as a country.
    country = country.partition('(')[0].strip()

    country_lower = country.lower()

    # Normalise the placeholder values to one canonical spelling each.
    if country_lower.startswith('unknown'):
        return 'Unknown'
    if country_lower.startswith('not applicable') or country_lower == 'n/a':
        return 'Not Applicable'

    # Phrases that indicate the remaining text is a comment, not a country.
    skip_patterns = (
        'not a university', 'research institution', 'government agency',
        'corporation', 'company', 'organization', 'school district',
        'high school', 'elementary school', 'classification difficult',
        'requires further', 'insufficient information', 'needs further',
    )
    if any(pattern in country_lower for pattern in skip_patterns):
        return None

    # Must be a reasonable length for a country name.
    if not 2 <= len(country) <= 50:
        return None

    return country
def parse_csv_properly(filename):
    """
    Parse the CSV file with the csv module and collect clean country names.

    The first row is treated as a header and printed. For every following
    row with at least three columns, the third column is passed to
    extract_clean_country(); truthy results are collected in file order.
    Progress lines are printed for the first 10 accepted rows and for
    rejected, non-empty country fields found within the first 50 data rows.

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list of cleaned country names, one per accepted row. An empty list
        is returned (after printing a message) when the file is missing,
        empty, or any error occurs while reading it.
    """
    countries = []
    try:
        # newline='' is what the csv module expects: it then handles line
        # endings itself, including newlines embedded in quoted fields.
        with open(filename, 'r', encoding='utf-8', newline='') as file:
            csv_reader = csv.reader(file)

            # Skip header; an empty file has none, so report that plainly
            # instead of letting StopIteration reach the generic handler.
            header = next(csv_reader, None)
            if header is None:
                print(f"Error: File '{filename}' is empty.")
                return []
            print(f"Header: {header}")

            for line_count, row in enumerate(csv_reader, 1):
                if len(row) < 3:
                    continue

                country_field = row[2]  # Third column is country
                clean_country = extract_clean_country(country_field)

                if not clean_country:
                    # Show skipped entries found within the first 50 data rows
                    if line_count <= 50 and country_field:
                        print(f"✗ Skipped: {country_field[:50]}...")
                    continue

                countries.append(clean_country)

                # Show first 10 successful extractions
                if len(countries) <= 10:
                    original_name = row[0] if row[0] else "N/A"
                    print(f"✓ {len(countries):2d}. {original_name[:30]:<30} → {clean_country}")
                    if country_field != clean_country:
                        print(f" Original: {country_field[:60]}...")
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return []
    except Exception as e:
        # Deliberately broad: this is the script's I/O boundary and callers
        # rely on getting a list back rather than an exception.
        print(f"Error reading file: {e}")
        return []
    return countries
def main(filename="universities_with_countries_gemini_batch.csv",
         output_file="clean_country_counts.txt"):
    """
    Count countries in the CSV file, print a ranking and save a full report.

    Countries are ranked by descending count. The top 30 are always printed;
    ranks 31-60 are printed only when more than 30 distinct countries exist,
    followed by a note on how many remain beyond rank 60. The complete
    ranking is written to the output file.

    Args:
        filename: Path of the CSV file handed to parse_csv_properly().
        output_file: Path of the UTF-8 text report to write.

    Returns:
        None. Returns early, without writing a report, when no countries
        were extracted.
    """
    print("CSV Country Counter - Fixed Version")
    print("=" * 50)
    print("Extracting clean country names...\n")

    # Parse the CSV
    countries = parse_csv_properly(filename)
    if not countries:
        print("No countries found!")
        return

    # Count countries and sort by descending count
    country_counts = Counter(countries)
    sorted_countries = country_counts.most_common()

    # Display results; total_entries is non-zero here, so the percentage
    # divisions below are safe.
    total_entries = len(countries)
    print("\n" + "=" * 60)
    print("RESULTS:")
    print("=" * 60)
    print(f"Total valid entries: {total_entries:,}")
    print(f"Unique countries: {len(country_counts)}")

    print("\nTop 30 countries:")
    print("-" * 60)
    _print_ranked(sorted_countries[:30], 1, total_entries)

    # Only show the second section when it has something in it.
    if len(sorted_countries) > 30:
        print("\nCountries 31-60:")
        print("-" * 60)
        _print_ranked(sorted_countries[30:60], 31, total_entries)

    if len(sorted_countries) > 60:
        print(f"\n... and {len(sorted_countries) - 60} more countries")

    # Save results
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Clean Country Counts Report\n")
        f.write("=" * 40 + "\n\n")
        f.write(f"Total valid entries: {total_entries:,}\n")
        f.write(f"Unique countries: {len(country_counts)}\n\n")
        for i, (country, count) in enumerate(sorted_countries, 1):
            percentage = (count / total_entries) * 100
            f.write(f"{i:3d}. {country:<30} {count:6,} ({percentage:5.2f}%)\n")

    print(f"\nComplete results saved to: {output_file}")


def _print_ranked(ranked, start, total_entries):
    """Print (country, count) pairs as numbered lines, numbering from start."""
    for i, (country, count) in enumerate(ranked, start):
        percentage = (count / total_entries) * 100
        print(f"{i:2d}. {country:<25} {count:6,} ({percentage:5.2f}%)")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment