bigsnarfdude · June 6, 2025 03:53
diff --git a/cleanedCounter.py b/cleanedCounter.py
 #!/usr/bin/env python3
 """
 Fixed CSV Country Counter - Properly extracts clean country names
 Handles complex entries like: "United States (Note: This is a corporation, not a university)"
 """

 import re
 import csv

 def extract_clean_country(country_field):
    """
    Reduce a free-form country cell to a bare country name.

    The cell is trimmed, carriage returns and asterisks are removed, and
    everything from the first "(" onwards is discarded.

    Examples:
    "United States (Likely a private educational company)" -> "United States"
    "Canada (School District - classification is approximate)" -> "Canada"
    "France *(Note: CNRS is a research organization)*" -> "France"

    Returns 'Unknown' or 'Not Applicable' for those placeholder values, None
    for empty input, for text containing a known non-country phrase, or for
    names shorter than 2 or longer than 50 characters; otherwise the name.
    """
    if not country_field:
        return None

    # Trim, then drop carriage returns and Markdown emphasis markers.
    text = country_field.strip().replace('\r', '').replace('*', '').strip()

    # Keep only what precedes the first "(". A value that starts with "("
    # has no such prefix and is left untouched.
    prefix = text.partition('(')[0]
    if prefix:
        text = prefix.strip()

    lowered = text.lower()

    # Placeholder values are normalised to one canonical spelling each.
    if lowered.startswith('unknown'):
        return 'Unknown'
    if lowered == 'n/a' or lowered.startswith('not applicable'):
        return 'Not Applicable'

    # Phrases that indicate the cell describes an institution, not a country.
    non_country_markers = (
        'not a university', 'research institution', 'government agency',
        'corporation', 'company', 'organization', 'school district',
        'high school', 'elementary school', 'classification difficult',
        'requires further', 'insufficient information', 'needs further',
    )
    if any(marker in lowered for marker in non_country_markers):
        return None

    # Only names of a plausible length are accepted.
    return text if 2 <= len(text) <= 50 else None

 def parse_csv_properly(filename):
    """
    Read a CSV file and collect the cleaned country name of every data row.

    The first row is treated as a header and echoed to stdout. For each later
    row with at least three columns, the third column is passed through
    extract_clean_country() and every truthy result is collected in file
    order. Progress is printed along the way: the first 10 accepted rows, and
    rejected non-empty fields that occur within the first 50 data rows.

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list of cleaned country names (duplicates kept). An empty list is
        returned if the file is missing, is empty, or any error occurs while
        it is being read.
    """
    countries = []

    try:
        # newline='' hands line-ending handling to the csv module, which is
        # required for quoted fields that contain line breaks.
        with open(filename, 'r', encoding='utf-8', newline='') as file:
            csv_reader = csv.reader(file)

            # An empty file has no header row; report it explicitly instead
            # of letting StopIteration surface as a blank error message.
            header = next(csv_reader, None)
            if header is None:
                print(f"Error: File '{filename}' is empty.")
                return []
            print(f"Header: {header}")

            for line_count, row in enumerate(csv_reader, 1):
                if len(row) < 3:
                    continue

                country_field = row[2]  # Third column is country
                clean_country = extract_clean_country(country_field)

                if clean_country:
                    countries.append(clean_country)

                    # Show first 10 successful extractions
                    if len(countries) <= 10:
                        original_name = row[0] or "N/A"
                        print(f"✓ {len(countries):2d}. {original_name[:30]:<30} → {clean_country}")
                        if country_field != clean_country:
                            print(f"     Original: {country_field[:60]}...")
                elif line_count <= 50 and country_field:
                    # Show rejected fields, but only within the first 50 data rows
                    print(f"✗ Skipped: {country_field[:50]}...")

    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return []
    except Exception as e:
        # Best-effort reader: any other failure discards partial results.
        print(f"Error reading file: {e}")
        return []

    return countries

 def main(
    filename="universities_with_countries_gemini_batch.csv",
    output_file="clean_country_counts.txt",
 ):
    """
    Count cleaned country names from a CSV file and report the totals.

    Prints a ranked summary to stdout (top 30, then ranks 31-60 when they
    exist) and writes the complete ranking to a text file. Returns early,
    writing nothing, when no countries could be extracted.

    Args:
        filename: CSV file handed to parse_csv_properly().
        output_file: Path of the UTF-8 text report to write.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    print("CSV Country Counter - Fixed Version")
    print("=" * 50)
    print("Extracting clean country names...\n")

    # Parse the CSV
    countries = parse_csv_properly(filename)

    if not countries:
        print("No countries found!")
        return

    total_entries = len(countries)

    # most_common() orders by count, highest first; equal counts keep the
    # order in which the countries were first encountered.
    sorted_countries = Counter(countries).most_common()

    def format_row(rank, country, count, rank_width, name_width):
        """Render one ranking line; the widths differ between console and file."""
        percentage = (count / total_entries) * 100
        return (
            f"{rank:{rank_width}d}. {country:<{name_width}} "
            f"{count:6,} ({percentage:5.2f}%)"
        )

    # Display results
    print("\n" + "=" * 60)
    print("RESULTS:")
    print("=" * 60)
    print(f"Total valid entries: {total_entries:,}")
    print(f"Unique countries: {len(sorted_countries)}")

    print("\nTop 30 countries:")
    print("-" * 60)
    for rank, (country, count) in enumerate(sorted_countries[:30], 1):
        print(format_row(rank, country, count, 2, 25))

    # Only print the second section when it would actually contain rows.
    if len(sorted_countries) > 30:
        print("\nCountries 31-60:")
        print("-" * 60)
        for rank, (country, count) in enumerate(sorted_countries[30:60], 31):
            print(format_row(rank, country, count, 2, 25))

    if len(sorted_countries) > 60:
        print(f"\n... and {len(sorted_countries) - 60} more countries")

    # Save results
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Clean Country Counts Report\n")
        f.write("=" * 40 + "\n\n")
        f.write(f"Total valid entries: {total_entries:,}\n")
        f.write(f"Unique countries: {len(sorted_countries)}\n\n")

        for rank, (country, count) in enumerate(sorted_countries, 1):
            f.write(format_row(rank, country, count, 3, 30) + "\n")

    print(f"\nComplete results saved to: {output_file}")

 # Run the report only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Fixed CSV Country Counter - Properly extracts clean country names
	Handles complex entries like: "United States (Note: This is a corporation, not a university)"
	"""

	import re
	import csv

	def extract_clean_country(country_field):
	    """
	    Reduce a free-form country cell to a bare country name.

	    The cell is trimmed, carriage returns and asterisks are removed, and
	    everything from the first "(" onwards is discarded.

	    Examples:
	    "United States (Likely a private educational company)" -> "United States"
	    "Canada (School District - classification is approximate)" -> "Canada"
	    "France *(Note: CNRS is a research organization)*" -> "France"

	    Returns 'Unknown' or 'Not Applicable' for those placeholder values, None
	    for empty input, for text containing a known non-country phrase, or for
	    names shorter than 2 or longer than 50 characters; otherwise the name.
	    """
	    if not country_field:
	        return None

	    # Trim, then drop carriage returns and Markdown emphasis markers.
	    text = country_field.strip().replace('\r', '').replace('*', '').strip()

	    # Keep only what precedes the first "(". A value that starts with "("
	    # has no such prefix and is left untouched.
	    prefix = text.partition('(')[0]
	    if prefix:
	        text = prefix.strip()

	    lowered = text.lower()

	    # Placeholder values are normalised to one canonical spelling each.
	    if lowered.startswith('unknown'):
	        return 'Unknown'
	    if lowered == 'n/a' or lowered.startswith('not applicable'):
	        return 'Not Applicable'

	    # Phrases that indicate the cell describes an institution, not a country.
	    non_country_markers = (
	        'not a university', 'research institution', 'government agency',
	        'corporation', 'company', 'organization', 'school district',
	        'high school', 'elementary school', 'classification difficult',
	        'requires further', 'insufficient information', 'needs further',
	    )
	    if any(marker in lowered for marker in non_country_markers):
	        return None

	    # Only names of a plausible length are accepted.
	    return text if 2 <= len(text) <= 50 else None

	def parse_csv_properly(filename):
	    """
	    Read a CSV file and collect the cleaned country name of every data row.

	    The first row is treated as a header and echoed to stdout. For each later
	    row with at least three columns, the third column is passed through
	    extract_clean_country() and every truthy result is collected in file
	    order. Progress is printed along the way: the first 10 accepted rows, and
	    rejected non-empty fields that occur within the first 50 data rows.

	    Args:
	        filename: Path of the UTF-8 encoded CSV file to read.

	    Returns:
	        A list of cleaned country names (duplicates kept). An empty list is
	        returned if the file is missing, is empty, or any error occurs while
	        it is being read.
	    """
	    countries = []

	    try:
	        # newline='' hands line-ending handling to the csv module, which is
	        # required for quoted fields that contain line breaks.
	        with open(filename, 'r', encoding='utf-8', newline='') as file:
	            csv_reader = csv.reader(file)

	            # An empty file has no header row; report it explicitly instead
	            # of letting StopIteration surface as a blank error message.
	            header = next(csv_reader, None)
	            if header is None:
	                print(f"Error: File '{filename}' is empty.")
	                return []
	            print(f"Header: {header}")

	            for line_count, row in enumerate(csv_reader, 1):
	                if len(row) < 3:
	                    continue

	                country_field = row[2]  # Third column is country
	                clean_country = extract_clean_country(country_field)

	                if clean_country:
	                    countries.append(clean_country)

	                    # Show first 10 successful extractions
	                    if len(countries) <= 10:
	                        original_name = row[0] or "N/A"
	                        print(f"✓ {len(countries):2d}. {original_name[:30]:<30} → {clean_country}")
	                        if country_field != clean_country:
	                            print(f"     Original: {country_field[:60]}...")
	                elif line_count <= 50 and country_field:
	                    # Show rejected fields, but only within the first 50 data rows
	                    print(f"✗ Skipped: {country_field[:50]}...")

	    except FileNotFoundError:
	        print(f"Error: File '{filename}' not found.")
	        return []
	    except Exception as e:
	        # Best-effort reader: any other failure discards partial results.
	        print(f"Error reading file: {e}")
	        return []

	    return countries

	def main(
	    filename="universities_with_countries_gemini_batch.csv",
	    output_file="clean_country_counts.txt",
	):
	    """
	    Count cleaned country names from a CSV file and report the totals.

	    Prints a ranked summary to stdout (top 30, then ranks 31-60 when they
	    exist) and writes the complete ranking to a text file. Returns early,
	    writing nothing, when no countries could be extracted.

	    Args:
	        filename: CSV file handed to parse_csv_properly().
	        output_file: Path of the UTF-8 text report to write.
	    """
	    # Local import keeps this block self-contained.
	    from collections import Counter

	    print("CSV Country Counter - Fixed Version")
	    print("=" * 50)
	    print("Extracting clean country names...\n")

	    # Parse the CSV
	    countries = parse_csv_properly(filename)

	    if not countries:
	        print("No countries found!")
	        return

	    total_entries = len(countries)

	    # most_common() orders by count, highest first; equal counts keep the
	    # order in which the countries were first encountered.
	    sorted_countries = Counter(countries).most_common()

	    def format_row(rank, country, count, rank_width, name_width):
	        """Render one ranking line; the widths differ between console and file."""
	        percentage = (count / total_entries) * 100
	        return (
	            f"{rank:{rank_width}d}. {country:<{name_width}} "
	            f"{count:6,} ({percentage:5.2f}%)"
	        )

	    # Display results
	    print("\n" + "=" * 60)
	    print("RESULTS:")
	    print("=" * 60)
	    print(f"Total valid entries: {total_entries:,}")
	    print(f"Unique countries: {len(sorted_countries)}")

	    print("\nTop 30 countries:")
	    print("-" * 60)
	    for rank, (country, count) in enumerate(sorted_countries[:30], 1):
	        print(format_row(rank, country, count, 2, 25))

	    # Only print the second section when it would actually contain rows.
	    if len(sorted_countries) > 30:
	        print("\nCountries 31-60:")
	        print("-" * 60)
	        for rank, (country, count) in enumerate(sorted_countries[30:60], 31):
	            print(format_row(rank, country, count, 2, 25))

	    if len(sorted_countries) > 60:
	        print(f"\n... and {len(sorted_countries) - 60} more countries")

	    # Save results
	    with open(output_file, 'w', encoding='utf-8') as f:
	        f.write("Clean Country Counts Report\n")
	        f.write("=" * 40 + "\n\n")
	        f.write(f"Total valid entries: {total_entries:,}\n")
	        f.write(f"Unique countries: {len(sorted_countries)}\n\n")

	        for rank, (country, count) in enumerate(sorted_countries, 1):
	            f.write(format_row(rank, country, count, 3, 30) + "\n")

	    print(f"\nComplete results saved to: {output_file}")

	# Run the report only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()