Last active
January 28, 2025 17:09
-
-
Save Parables/e5a1f8d440b56cb7d525cb4efe4fa172 to your computer and use it in GitHub Desktop.
Import allCountries.zip/txt from Geonames.org into PostgresDB using pg_bulkload
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Configuration | |
DOWNLOAD_URL="https://download.geonames.org/export/dump/allCountries.zip" | |
DEFAULT_DOWNLOAD_DIR="$HOME/.geonames" | |
DB_NAME="${GEONAMES_DB:-central_db}" | |
TABLE_NAME="${GEONAMES_TB:-geonames}" | |
# Error handling function | |
error_exit() { | |
echo "Error: $1" >&2 | |
exit 1 | |
} | |
# Determine download location | |
get_download_location() { | |
DOWNLOAD_DIR="${1:-$DEFAULT_DOWNLOAD_DIR}" | |
mkdir -p "$DOWNLOAD_DIR" | |
ZIP_FILE="$DOWNLOAD_DIR/allCountries.zip" | |
DATA_FILE="$DOWNLOAD_DIR/allCountries.txt" | |
MIN_DATA_FILE="$DOWNLOAD_DIR/allCountries.min.txt" | |
} | |
# Download file with prompt | |
download_file() { | |
if [[ -f "$ZIP_FILE" ]]; then | |
read -rp "File already exists. Redownload? (y/n): " redownload | |
[[ "$redownload" != "y" ]] && return 1 | |
fi | |
echo "Downloading allCountries.zip..." | |
wget -O "$ZIP_FILE" "$DOWNLOAD_URL" || error_exit "Download failed" | |
} | |
extract_file() { | |
if [[ -f "$DATA_FILE" ]]; then | |
read -rp "Data file already exists. Re-extract? (y/n): " reextract | |
[[ "$reextract" != "y" ]] && return 1 | |
fi | |
rm -f "$DATA_FILE" | |
unzip -q "$ZIP_FILE" -d "$(dirname "$ZIP_FILE")" || error_exit "Unzip failed" | |
} | |
# Create table structure | |
create_table() { | |
sudo -u postgres psql -d "$DB_NAME" <<EOF | |
DROP TABLE IF EXISTS geonames; | |
CREATE TABLE IF NOT EXISTS $TABLE_NAME ( | |
geonameid BIGINT PRIMARY KEY, | |
name TEXT, | |
country_code TEXT, | |
admin1_code TEXT, | |
feature_class TEXT, | |
feature_code TEXT | |
); | |
CREATE INDEX idx_geonameid ON geonames(geonameid); | |
CREATE INDEX idx_name ON geonames(name); | |
CREATE INDEX idx_country_code ON geonames(country_code); | |
CREATE INDEX idx_admin1_code ON geonames(admin1_code); | |
CREATE INDEX idx_feature_class ON geonames(feature_class); | |
CREATE INDEX idx_feature_code ON geonames(feature_code); | |
EOF | |
} | |
# Main script | |
main() { | |
get_download_location "$1" | |
download_file | |
extract_file | |
create_table | |
echo "Starting bulk load..." | |
# Cut out the 4th column from the allCountries.txt file | |
awk -F'\t' 'BEGIN {OFS="\t"} {print $1,$2,$9,$11,$7,$8}' "$DATA_FILE" >"$MIN_DATA_FILE" | |
vim "$MIN_DATA_FILE" | |
DATA_FILE="$MIN_DATA_FILE" | |
# Copy file to postgres home directory before loading | |
sudo cp "$DATA_FILE" ~postgres/ | |
sudo chown -R postgres:postgres ~postgres/ | |
# Use the postgres home directory path in pg_bulkload | |
sudo -u postgres /usr/pgsql-16/bin/pg_bulkload \ | |
-d "$DB_NAME" \ | |
-i ~postgres/allCountries.min.txt \ | |
-O "$TABLE_NAME" \ | |
-o $'DELIMITER=\t' \ | |
-o "TYPE=CSV" \ | |
-o "VERBOSE=YES" \ | |
-o "TRUNCATE=YES" \ | |
-o "ENCODING=UTF8" \ | |
-o "PARSE_ERRORS=INFINITE" \ | |
-o "DUPLICATE_ERRORS=INFINITE" \ | |
-l ~postgres/import.log \ | |
-P ~postgres/bad_rows.txt \ | |
-u ~postgres/duplicates.txt || error_exit "Bulk load failed" | |
echo "Data import completed successfully!" | |
} | |
# Run main function with optional directory argument | |
main "$@" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Configuration | |
DOWNLOAD_URL="https://download.geonames.org/export/dump/allCountries.zip" | |
DEFAULT_DOWNLOAD_DIR="$HOME/.geonames" | |
DB_NAME="${GEONAMES_DB:-central_db}" | |
TABLE_NAME="${GEONAMES_TB:-geonames}" | |
LOG_FILE="$DEFAULT_DOWNLOAD_DIR/geonames_import.log" | |
# Error handling function | |
error_exit() { | |
echo "Error: $1" >&2 | |
echo "Error: $1" >> "$LOG_FILE" | |
exit 1 | |
} | |
# Logging function | |
log() { | |
echo "$1" | |
echo "$1" >> "$LOG_FILE" | |
} | |
# Determine download location | |
get_download_location() { | |
DOWNLOAD_DIR="${1:-$DEFAULT_DOWNLOAD_DIR}" | |
mkdir -p "$DOWNLOAD_DIR" || error_exit "Failed to create download directory: $DOWNLOAD_DIR" | |
ZIP_FILE="$DOWNLOAD_DIR/allCountries.zip" | |
DATA_FILE="$DOWNLOAD_DIR/allCountries.txt" | |
MIN_DATA_FILE="$DOWNLOAD_DIR/allCountries.min.txt" | |
} | |
# Check PostgreSQL connection | |
check_postgres_connection() { | |
log "Checking PostgreSQL connection..." | |
sudo -u postgres psql -d "$DB_NAME" -c "SELECT 1;" > /dev/null 2>&1 || error_exit "Failed to connect to PostgreSQL database: $DB_NAME" | |
} | |
# Download file with prompt | |
download_file() { | |
if [[ -f "$ZIP_FILE" ]]; then | |
read -rp "File already exists. Redownload? (y/n) [n]: " redownload | |
[[ "$redownload" != "y" ]] && return 1 | |
fi | |
log "Downloading allCountries.zip..." | |
wget -O "$ZIP_FILE" "$DOWNLOAD_URL" >> "$LOG_FILE" 2>&1 || error_exit "Download failed" | |
} | |
# Extract file | |
extract_file() { | |
if [[ -f "$DATA_FILE" ]]; then | |
read -rp "Data file already exists. Re-extract? (y/n) [n]: " reextract | |
[[ "$reextract" != "y" ]] && return 1 | |
fi | |
rm -f "$DATA_FILE" | |
unzip -q "$ZIP_FILE" -d "$(dirname "$ZIP_FILE")" >> "$LOG_FILE" 2>&1 || error_exit "Unzip failed" | |
} | |
# Create table structure | |
create_table() { | |
log "Creating table $TABLE_NAME in database $DB_NAME..." | |
sudo -u postgres psql -d "$DB_NAME" <<EOF >> "$LOG_FILE" 2>&1 | |
DROP TABLE IF EXISTS $TABLE_NAME; | |
CREATE TABLE IF NOT EXISTS $TABLE_NAME ( | |
geonameid BIGINT PRIMARY KEY, | |
name TEXT, | |
country_code TEXT, | |
admin1_code TEXT, | |
feature_class TEXT, | |
feature_code TEXT | |
); | |
CREATE INDEX idx_geonameid ON $TABLE_NAME(geonameid); | |
CREATE INDEX idx_name ON $TABLE_NAME(name); | |
CREATE INDEX idx_country_code ON $TABLE_NAME(country_code); | |
CREATE INDEX idx_admin1_code ON $TABLE_NAME(admin1_code); | |
CREATE INDEX idx_feature_class ON $TABLE_NAME(feature_class); | |
CREATE INDEX idx_feature_code ON $TABLE_NAME(feature_code); | |
EOF | |
} | |
# Prepare data file | |
prepare_data_file() { | |
log "Preparing data file..." | |
awk -F'\t' 'BEGIN {OFS="\t"} {print $1,$2,$9,$11,$7,$8}' "$DATA_FILE" > "$MIN_DATA_FILE" || error_exit "Failed to prepare data file" | |
} | |
# Copy file to postgres home directory | |
copy_to_postgres_home() { | |
log "Copying data file to PostgreSQL home directory..." | |
sudo cp "$MIN_DATA_FILE" ~postgres/ || error_exit "Failed to copy data file to PostgreSQL home directory" | |
sudo chown postgres:postgres ~postgres/allCountries.min.txt || error_exit "Failed to change ownership of data file" | |
} | |
# Bulk load data into PostgreSQL | |
bulk_load_data() { | |
log "Starting bulk load..." | |
sudo -u postgres /usr/pgsql-16/bin/pg_bulkload \ | |
-d "$DB_NAME" \ | |
-i ~postgres/allCountries.min.txt \ | |
-O "$TABLE_NAME" \ | |
-o $'DELIMITER=\t' \ | |
-o "TYPE=CSV" \ | |
-o "VERBOSE=YES" \ | |
-o "TRUNCATE=YES" \ | |
-o "ENCODING=UTF8" \ | |
-o "PARSE_ERRORS=INFINITE" \ | |
-o "DUPLICATE_ERRORS=INFINITE" \ | |
-l ~postgres/import.log \ | |
-P ~postgres/bad_rows.txt \ | |
-u ~postgres/duplicates.txt >> "$LOG_FILE" 2>&1 || error_exit "Bulk load failed" | |
} | |
# Cleanup temporary files | |
cleanup() { | |
log "Cleaning up temporary files..." | |
rm -f "$ZIP_FILE" "$DATA_FILE" "$MIN_DATA_FILE" | |
sudo rm -f ~postgres/allCountries.min.txt ~postgres/import.log ~postgres/bad_rows.txt ~postgres/duplicates.txt | |
} | |
# Main script | |
main() { | |
get_download_location "$1" | |
check_postgres_connection | |
download_file | |
extract_file | |
create_table | |
prepare_data_file | |
copy_to_postgres_home | |
bulk_load_data | |
cleanup | |
log "Data import completed successfully!" | |
} | |
# Run main function with optional directory argument | |
main "$@" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Configuration | |
DOWNLOAD_URL="https://download.geonames.org/export/dump/allCountries.zip" | |
ALTERNATE_NAMES_URL="https://download.geonames.org/export/dump/alternateNamesV2.zip" | |
DEFAULT_DOWNLOAD_DIR="$HOME/.geonames" | |
DB_NAME="${GEONAMES_DB:-central_db}" | |
TABLE_NAME="${GEONAMES_TB:-geonames}" | |
ALTERNATE_NAMES_TABLE="alternate_names" | |
LOG_FILE="$DEFAULT_DOWNLOAD_DIR/geonames_import.log" | |
# Error handling function | |
error_exit() { | |
echo "Error: $1" >&2 | |
echo "Error: $1" >> "$LOG_FILE" | |
exit 1 | |
} | |
# Logging function | |
log() { | |
echo "$1" | |
echo "$1" >> "$LOG_FILE" | |
} | |
# Determine download location | |
get_download_location() { | |
DOWNLOAD_DIR="${1:-$DEFAULT_DOWNLOAD_DIR}" | |
mkdir -p "$DOWNLOAD_DIR" || error_exit "Failed to create download directory: $DOWNLOAD_DIR" | |
ZIP_FILE="$DOWNLOAD_DIR/allCountries.zip" | |
DATA_FILE="$DOWNLOAD_DIR/allCountries.txt" | |
MIN_DATA_FILE="$DOWNLOAD_DIR/allCountries.min.txt" | |
ALT_NAMES_ZIP="$DOWNLOAD_DIR/alternateNamesV2.zip" | |
ALT_NAMES_FILE="$DOWNLOAD_DIR/alternateNamesV2.txt" | |
} | |
# Check PostgreSQL connection | |
check_postgres_connection() { | |
log "Checking PostgreSQL connection..." | |
sudo -u postgres psql -d "$DB_NAME" -c "SELECT 1;" > /dev/null 2>&1 || error_exit "Failed to connect to PostgreSQL database: $DB_NAME" | |
} | |
# Download file with prompt | |
download_file() { | |
local url="$1" | |
local output_file="$2" | |
local description="$3" | |
if [[ -f "$output_file" ]]; then | |
read -rp "File already exists. Redownload? (y/n) [n]: " redownload | |
[[ "$redownload" != "y" ]] && return 1 | |
fi | |
log "Downloading $description..." | |
wget -O "$output_file" "$url" >> "$LOG_FILE" 2>&1 || error_exit "Download failed for $description" | |
} | |
# Extract file | |
extract_file() { | |
local zip_file="$1" | |
local output_file="$2" | |
local description="$3" | |
if [[ -f "$output_file" ]]; then | |
read -rp "Data file already exists. Re-extract? (y/n) [n]: " reextract | |
[[ "$reextract" != "y" ]] && return 1 | |
fi | |
rm -f "$output_file" | |
unzip -q "$zip_file" -d "$(dirname "$zip_file")" >> "$LOG_FILE" 2>&1 || error_exit "Unzip failed for $description" | |
} | |
# Create geonames table structure | |
create_geonames_table() { | |
log "Creating table $TABLE_NAME in database $DB_NAME..." | |
sudo -u postgres psql -d "$DB_NAME" <<EOF >> "$LOG_FILE" 2>&1 | |
DROP TABLE IF EXISTS $TABLE_NAME; | |
CREATE TABLE IF NOT EXISTS $TABLE_NAME ( | |
geonameid BIGINT PRIMARY KEY, | |
name TEXT, | |
country_code TEXT, | |
admin1_code TEXT, | |
feature_class TEXT, | |
feature_code TEXT | |
); | |
CREATE INDEX idx_geonameid ON $TABLE_NAME(geonameid); | |
CREATE INDEX idx_name ON $TABLE_NAME(name); | |
CREATE INDEX idx_country_code ON $TABLE_NAME(country_code); | |
CREATE INDEX idx_admin1_code ON $TABLE_NAME(admin1_code); | |
CREATE INDEX idx_feature_class ON $TABLE_NAME(feature_class); | |
CREATE INDEX idx_feature_code ON $TABLE_NAME(feature_code); | |
EOF | |
} | |
# Create alternate names table structure | |
create_alternate_names_table() { | |
log "Creating table $ALTERNATE_NAMES_TABLE in database $DB_NAME..." | |
sudo -u postgres psql -d "$DB_NAME" <<EOF >> "$LOG_FILE" 2>&1 | |
DROP TABLE IF EXISTS $ALTERNATE_NAMES_TABLE; | |
CREATE TABLE IF NOT EXISTS $ALTERNATE_NAMES_TABLE ( | |
alternateNameId SERIAL PRIMARY KEY, | |
geonameid INT, | |
isolanguage VARCHAR(7), | |
alternateName VARCHAR(400), | |
isPreferredName BOOLEAN, | |
isShortName BOOLEAN, | |
isColloquial BOOLEAN, | |
isHistoric BOOLEAN, | |
fromPeriod TEXT, | |
toPeriod TEXT | |
); | |
CREATE INDEX idx_geonameid_alternate ON $ALTERNATE_NAMES_TABLE(geonameid); | |
CREATE INDEX idx_isolanguage ON $ALTERNATE_NAMES_TABLE(isolanguage); | |
EOF | |
} | |
# Prepare data file | |
prepare_data_file() { | |
log "Preparing data file..." | |
awk -F'\t' 'BEGIN {OFS="\t"} {print $1,$2,$9,$11,$7,$8}' "$DATA_FILE" > "$MIN_DATA_FILE" || error_exit "Failed to prepare data file" | |
} | |
# Copy file to postgres home directory | |
copy_to_postgres_home() { | |
local file="$1" | |
local description="$2" | |
log "Copying $description to PostgreSQL home directory..." | |
sudo cp "$file" ~postgres/ || error_exit "Failed to copy $description to PostgreSQL home directory" | |
sudo chown postgres:postgres ~postgres/"$(basename "$file")" || error_exit "Failed to change ownership of $description" | |
} | |
# Bulk load data into PostgreSQL | |
bulk_load_data() { | |
local input_file="$1" | |
local table_name="$2" | |
local description="$3" | |
log "Starting bulk load for $description..." | |
sudo -u postgres /usr/pgsql-16/bin/pg_bulkload \ | |
-d "$DB_NAME" \ | |
-i ~postgres/"$(basename "$input_file")" \ | |
-O "$table_name" \ | |
-o $'DELIMITER=\t' \ | |
-o "TYPE=CSV" \ | |
-o "VERBOSE=YES" \ | |
-o "TRUNCATE=YES" \ | |
-o "ENCODING=UTF8" \ | |
-o "PARSE_ERRORS=INFINITE" \ | |
-o "DUPLICATE_ERRORS=INFINITE" \ | |
-l ~postgres/import.log \ | |
-P ~postgres/bad_rows.txt \ | |
-u ~postgres/duplicates.txt >> "$LOG_FILE" 2>&1 || error_exit "Bulk load failed for $description" | |
} | |
# Cleanup temporary files with user prompt | |
cleanup() { | |
read -rp "Do you want to delete temporary files (ZIP, TXT, etc.)? (y/n) [n]: " cleanup_choice | |
if [[ "$cleanup_choice" == "y" ]]; then | |
log "Cleaning up temporary files..." | |
rm -f "$ZIP_FILE" "$DATA_FILE" "$MIN_DATA_FILE" "$ALT_NAMES_ZIP" "$ALT_NAMES_FILE" | |
sudo rm -f ~postgres/allCountries.min.txt ~postgres/alternateNamesV2.txt ~postgres/import.log ~postgres/bad_rows.txt ~postgres/duplicates.txt | |
log "Temporary files deleted." | |
else | |
log "Temporary files retained." | |
fi | |
} | |
# Main script | |
main() { | |
get_download_location "$1" | |
check_postgres_connection | |
# Download and import allCountries data | |
download_file "$DOWNLOAD_URL" "$ZIP_FILE" "allCountries.zip" | |
extract_file "$ZIP_FILE" "$DATA_FILE" "allCountries.txt" | |
create_geonames_table | |
prepare_data_file | |
copy_to_postgres_home "$MIN_DATA_FILE" "allCountries.min.txt" | |
bulk_load_data "$MIN_DATA_FILE" "$TABLE_NAME" "geonames data" | |
# Prompt to download and import alternateNamesV2 data | |
read -rp "Do you want to download and import alternate names data? (y/n) [n]: " import_alternate_names | |
if [[ "$import_alternate_names" == "y" ]]; then | |
download_file "$ALTERNATE_NAMES_URL" "$ALT_NAMES_ZIP" "alternateNamesV2.zip" | |
extract_file "$ALT_NAMES_ZIP" "$ALT_NAMES_FILE" "alternateNamesV2.txt" | |
create_alternate_names_table | |
copy_to_postgres_home "$ALT_NAMES_FILE" "alternateNamesV2.txt" | |
bulk_load_data "$ALT_NAMES_FILE" "$ALTERNATE_NAMES_TABLE" "alternate names data" | |
else | |
log "Skipping alternate names data import." | |
fi | |
cleanup | |
log "Data import completed successfully!" | |
} | |
# Run main function with optional directory argument | |
main "$@" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Query 1: Fetch All Countries | |
SELECT | |
geonameid AS country_id, | |
name AS country_name, | |
country_code | |
FROM | |
geonames | |
WHERE | |
feature_code IN ('PCLI', 'PCLD') | |
ORDER BY | |
country_name; | |
-- 1. Query to Fetch All Countries with Preferred Alternate Name | |
SELECT DISTINCT | |
g.country_code AS country_code, | |
COALESCE(a.alternateName, g.name) AS country_name -- Use alternate name if available, otherwise use default name | |
FROM | |
geonames g | |
LEFT JOIN | |
alternate_names a | |
ON | |
g.geonameid = a.geonameid | |
WHERE | |
g.feature_class = 'A' -- Assuming 'A' represents country-level features | |
AND g.feature_code = 'PCLI' -- Assuming 'PCLI' represents independent political entities (countries) | |
AND a.isolanguage = 'en' -- Replace 'fr' with the desired language code | |
AND a.isPreferredName = '1' -- Fetch only preferred names | |
ORDER BY | |
country_name; | |
-- Query 2: Fetch All States/Regions for a Country | |
SELECT | |
geonameid AS state_id, | |
name AS state_name, | |
admin1_code AS state_code | |
FROM | |
geonames | |
WHERE | |
feature_code = 'ADM1' | |
AND country_code = 'US' -- Replace 'US' with the desired country code | |
ORDER BY | |
state | |
-- 2. Query to Fetch All States/Regions for a Country with Preferred Alternate Name | |
SELECT DISTINCT | |
g.admin1_code AS state_code, | |
COALESCE(a.alternateName, g.name) AS state_name -- Use alternate name if available, otherwise use default name | |
FROM | |
geonames g | |
LEFT JOIN | |
alternate_names a | |
ON | |
g.geonameid = a.geonameid | |
AND a.isolanguage = 'en' -- Replace 'fr' with the desired language code | |
AND a.isPreferredName = '1' -- Fetch only preferred names | |
WHERE | |
g.country_code = 'GH' -- Replace 'US' with the desired country code | |
AND g.feature_class = 'A' -- Assuming 'A' represents administrative divisions | |
AND g.feature_code = 'ADM1' -- Assuming 'ADM1' represents first-level administrative divisions (states/regions) | |
ORDER BY | |
state_name; | |
-- 3. Query to Fetch All Cities for a State/Region | |
SELECT | |
geonameid AS city_id, | |
name AS city_name | |
FROM | |
geonames | |
WHERE | |
country_code = 'US' -- Replace 'US' with the desired country code | |
AND admin1_code = 'CA' -- Replace 'CA' with the desired state/region code | |
AND feature_class = 'P' -- Assuming 'P' represents populated places (cities, towns, etc.) | |
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLC'); -- Assuming these represent cities/towns | |
ORDER BY | |
city_name; | |
-- 3. Query to Fetch All Cities for a State/Region with Preferred Alternate Name | |
SELECT | |
g.geonameid AS city_id, | |
COALESCE(a.alternateName, g.name) AS city_name -- Use alternate name if available, otherwise use default name | |
FROM | |
geonames g | |
LEFT JOIN | |
alternate_names a | |
ON | |
g.geonameid = a.geonameid | |
AND a.isolanguage = 'en' -- Replace 'fr' with the desired language code | |
AND a.isPreferredName = '1' -- Fetch only preferred names | |
WHERE | |
g.country_code = 'GH' -- Replace 'US' with the desired country code | |
AND g.admin1_code = '09' -- Replace 'CA' with the desired state/region code | |
AND g.feature_class = 'P' -- Assuming 'P' represents populated places (cities, towns, etc.) | |
AND g.feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLC') -- Assuming these represent cities/towns | |
ORDER BY | |
city_name; | |
SELECT * FROM geonames WHERE feature_class = 'A' AND feature_code = 'PCLI'; | |
SELECT * FROM geonames WHERE feature_class = 'A' AND feature_code = 'ADM2' AND country_code= 'GH' AND admin1_code = '09' | |
SELECT * FROM geonames WHERE feature_class = 'A' AND feature_code = 'ADM1' AND country_code= 'GH' ; | |
SELECT * FROM geonames WHERE country_code= 'GH' AND feature_class = 'P' AND feature_code = 'PPL' AND admin1_code = '09'; | |
SELECT * FROM geonames WHERE country_code= 'GH' AND feature_class = 'P' AND feature_code = 'PPLA' | |
-- Get all cities in US | |
SELECT * FROM geonames WHERE country_code= 'GH' AND feature_class = 'P' AND feature_code = 'PPL'; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment