Created
June 6, 2025 03:53
-
-
Save bigsnarfdude/a4a5ce17e56ac77a13fee45e3501848a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Fixed CSV Country Counter - Properly extracts clean country names
Handles complex entries like: "United States (Note: This is a corporation, not a university)"
"""
import csv
import re
from collections import Counter
def extract_clean_country(country_field):
    """
    Extract a clean country name from a noisy country field.

    Surrounding whitespace, carriage returns and asterisks are removed, and
    everything from the first "(" onwards is discarded.

    Examples:
        "United States (Likely a private educational company)" -> "United States"
        "Canada (School District - classification is approximate)" -> "Canada"
        "France *(Note: CNRS is a research organization)*" -> "France"
        "(Note: classification unclear)" -> None

    Args:
        country_field: Raw text of the country column; may be empty or None.

    Returns:
        The cleaned country name, the normalized label "Unknown" or
        "Not Applicable", or None when the field is empty, contains a
        known non-country phrase, or is not 2-50 characters long.
    """
    if not country_field:
        return None

    # Remove \r and extra whitespace, then the asterisks used for emphasis.
    country = country_field.strip().replace('\r', '')
    country = country.replace('*', '').strip()

    # Keep only the text before any parenthetical note:
    # "Country Name (any notes)" -> "Country Name".
    # The `*` quantifier (rather than `+`) always matches, so a field that is
    # nothing but a note, e.g. "(Note: ...)", collapses to "" and is rejected
    # by the length check below instead of being returned as a country.
    country = re.match(r'^([^(]*)', country).group(1).strip()
    country_lower = country.lower()

    # Normalize the "no answer" variants to fixed labels.
    if country_lower.startswith('unknown'):
        return 'Unknown'
    if country_lower.startswith('not applicable') or country_lower == 'n/a':
        return 'Not Applicable'

    # Skip clearly non-country entries.
    skip_patterns = (
        'not a university', 'research institution', 'government agency',
        'corporation', 'company', 'organization', 'school district',
        'high school', 'elementary school', 'classification difficult',
        'requires further', 'insufficient information', 'needs further',
    )
    if any(pattern in country_lower for pattern in skip_patterns):
        return None

    # Must be reasonable length.
    if not 2 <= len(country) <= 50:
        return None

    return country
def parse_csv_properly(filename):
    """
    Parse a CSV file with the csv module and collect clean country names.

    The first row is treated as a header and printed. For every later row
    with at least three columns, the third column is passed to
    extract_clean_country(); truthy results are collected in file order.
    A short preview of accepted and skipped entries is printed on the way.

    Args:
        filename: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A list of cleaned country names, one per accepted row. An empty
        list is returned when the file is missing, empty, or cannot be
        read; the reason is printed rather than raised.
    """
    accepted_preview_limit = 10  # number of accepted rows echoed to the console
    skipped_preview_rows = 50    # skipped rows are echoed only within this many data rows

    countries = []
    try:
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded line breaks.
        with open(filename, 'r', encoding='utf-8', newline='') as file:
            csv_reader = csv.reader(file)

            # Skip header; an empty file has none, so report it explicitly
            # instead of letting StopIteration surface as a blank error.
            header = next(csv_reader, None)
            if header is None:
                print(f"Error: File '{filename}' is empty.")
                return []
            print(f"Header: {header}")

            for line_count, row in enumerate(csv_reader, 1):
                if len(row) < 3:
                    continue

                country_field = row[2]  # Third column is country
                clean_country = extract_clean_country(country_field)

                if not clean_country:
                    # Show skipped entries found within the first 50 data rows.
                    if line_count <= skipped_preview_rows and country_field:
                        print(f"✗ Skipped: {country_field[:50]}...")
                    continue

                countries.append(clean_country)

                # Show first 10 successful extractions.
                if len(countries) <= accepted_preview_limit:
                    original_name = row[0] if row[0] else "N/A"
                    print(f"✓ {len(countries):2d}. {original_name[:30]:<30} → {clean_country}")
                    if country_field != clean_country:
                        print(f" Original: {country_field[:60]}...")
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return []
    except Exception as e:
        # Deliberate best-effort boundary: any read/parse failure is reported
        # and yields an empty result instead of propagating to the caller.
        print(f"Error reading file: {e}")
        return []

    return countries
def main(filename="universities_with_countries_gemini_batch.csv",
         output_file="clean_country_counts.txt"):
    """
    Count countries in a CSV file, print a ranked summary and save a report.

    Countries are ranked by descending count; ties keep the order in which
    the countries first appear in the input. The console shows ranks 1-30,
    then ranks 31-60 when they exist, and the full ranking is written to
    the report file.

    Args:
        filename: CSV file passed to parse_csv_properly().
        output_file: Path of the text report to write (UTF-8, overwritten).

    Returns:
        None. Returns early, without writing a report, when no countries
        were extracted.
    """
    print("CSV Country Counter - Fixed Version")
    print("=" * 50)
    print("Extracting clean country names...\n")

    # Parse the CSV
    countries = parse_csv_properly(filename)
    if not countries:
        print("No countries found!")
        return

    # Count and sort by count. most_common() orders by descending count and
    # keeps first-seen order for ties.
    sorted_countries = Counter(countries).most_common()
    total_entries = len(countries)
    unique_countries = len(sorted_countries)

    # Display results
    print("\n" + "=" * 60)
    print("RESULTS:")
    print("=" * 60)
    print(f"Total valid entries: {total_entries:,}")
    print(f"Unique countries: {unique_countries}")

    print("\nTop 30 countries:")
    print("-" * 60)
    for i, (country, count) in enumerate(sorted_countries[:30], 1):
        percentage = (count / total_entries) * 100
        print(f"{i:2d}. {country:<25} {count:6,} ({percentage:5.2f}%)")

    # Only print the second section when there is something to put in it.
    if unique_countries > 30:
        print("\nCountries 31-60:")
        print("-" * 60)
        for i, (country, count) in enumerate(sorted_countries[30:60], 31):
            percentage = (count / total_entries) * 100
            print(f"{i:2d}. {country:<25} {count:6,} ({percentage:5.2f}%)")

    if unique_countries > 60:
        print(f"\n... and {unique_countries - 60} more countries")

    # Save results
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Clean Country Counts Report\n")
        f.write("=" * 40 + "\n\n")
        f.write(f"Total valid entries: {total_entries:,}\n")
        f.write(f"Unique countries: {unique_countries}\n\n")
        for i, (country, count) in enumerate(sorted_countries, 1):
            percentage = (count / total_entries) * 100
            f.write(f"{i:3d}. {country:<30} {count:6,} ({percentage:5.2f}%)\n")

    print(f"\nComplete results saved to: {output_file}")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment