Skip to content

Instantly share code, notes, and snippets.

@Parables
Last active January 28, 2025 17:09
Show Gist options
  • Save Parables/e5a1f8d440b56cb7d525cb4efe4fa172 to your computer and use it in GitHub Desktop.
Save Parables/e5a1f8d440b56cb7d525cb4efe4fa172 to your computer and use it in GitHub Desktop.
Import allCountries.zip/txt from Geonames.org into PostgresDB using pg_bulkload
#!/bin/bash
# Configuration
DOWNLOAD_URL="https://download.geonames.org/export/dump/allCountries.zip"
DEFAULT_DOWNLOAD_DIR="$HOME/.geonames"
DB_NAME="${GEONAMES_DB:-central_db}"
TABLE_NAME="${GEONAMES_TB:-geonames}"
# Error handling function
error_exit() {
echo "Error: $1" >&2
exit 1
}
# Determine download location
get_download_location() {
DOWNLOAD_DIR="${1:-$DEFAULT_DOWNLOAD_DIR}"
mkdir -p "$DOWNLOAD_DIR"
ZIP_FILE="$DOWNLOAD_DIR/allCountries.zip"
DATA_FILE="$DOWNLOAD_DIR/allCountries.txt"
MIN_DATA_FILE="$DOWNLOAD_DIR/allCountries.min.txt"
}
# Download file with prompt
download_file() {
if [[ -f "$ZIP_FILE" ]]; then
read -rp "File already exists. Redownload? (y/n): " redownload
[[ "$redownload" != "y" ]] && return 1
fi
echo "Downloading allCountries.zip..."
wget -O "$ZIP_FILE" "$DOWNLOAD_URL" || error_exit "Download failed"
}
extract_file() {
if [[ -f "$DATA_FILE" ]]; then
read -rp "Data file already exists. Re-extract? (y/n): " reextract
[[ "$reextract" != "y" ]] && return 1
fi
rm -f "$DATA_FILE"
unzip -q "$ZIP_FILE" -d "$(dirname "$ZIP_FILE")" || error_exit "Unzip failed"
}
# Create table structure
create_table() {
sudo -u postgres psql -d "$DB_NAME" <<EOF
DROP TABLE IF EXISTS geonames;
CREATE TABLE IF NOT EXISTS $TABLE_NAME (
geonameid BIGINT PRIMARY KEY,
name TEXT,
country_code TEXT,
admin1_code TEXT,
feature_class TEXT,
feature_code TEXT
);
CREATE INDEX idx_geonameid ON geonames(geonameid);
CREATE INDEX idx_name ON geonames(name);
CREATE INDEX idx_country_code ON geonames(country_code);
CREATE INDEX idx_admin1_code ON geonames(admin1_code);
CREATE INDEX idx_feature_class ON geonames(feature_class);
CREATE INDEX idx_feature_code ON geonames(feature_code);
EOF
}
# Main script
main() {
get_download_location "$1"
download_file
extract_file
create_table
echo "Starting bulk load..."
# Cut out the 4th column from the allCountries.txt file
awk -F'\t' 'BEGIN {OFS="\t"} {print $1,$2,$9,$11,$7,$8}' "$DATA_FILE" >"$MIN_DATA_FILE"
vim "$MIN_DATA_FILE"
DATA_FILE="$MIN_DATA_FILE"
# Copy file to postgres home directory before loading
sudo cp "$DATA_FILE" ~postgres/
sudo chown -R postgres:postgres ~postgres/
# Use the postgres home directory path in pg_bulkload
sudo -u postgres /usr/pgsql-16/bin/pg_bulkload \
-d "$DB_NAME" \
-i ~postgres/allCountries.min.txt \
-O "$TABLE_NAME" \
-o $'DELIMITER=\t' \
-o "TYPE=CSV" \
-o "VERBOSE=YES" \
-o "TRUNCATE=YES" \
-o "ENCODING=UTF8" \
-o "PARSE_ERRORS=INFINITE" \
-o "DUPLICATE_ERRORS=INFINITE" \
-l ~postgres/import.log \
-P ~postgres/bad_rows.txt \
-u ~postgres/duplicates.txt || error_exit "Bulk load failed"
echo "Data import completed successfully!"
}
# Run main function with optional directory argument
main "$@"
#!/bin/bash
# Configuration
DOWNLOAD_URL="https://download.geonames.org/export/dump/allCountries.zip"
DEFAULT_DOWNLOAD_DIR="$HOME/.geonames"
DB_NAME="${GEONAMES_DB:-central_db}"
TABLE_NAME="${GEONAMES_TB:-geonames}"
LOG_FILE="$DEFAULT_DOWNLOAD_DIR/geonames_import.log"
# Error handling function
error_exit() {
echo "Error: $1" >&2
echo "Error: $1" >> "$LOG_FILE"
exit 1
}
# Logging function
log() {
echo "$1"
echo "$1" >> "$LOG_FILE"
}
# Determine download location
get_download_location() {
DOWNLOAD_DIR="${1:-$DEFAULT_DOWNLOAD_DIR}"
mkdir -p "$DOWNLOAD_DIR" || error_exit "Failed to create download directory: $DOWNLOAD_DIR"
ZIP_FILE="$DOWNLOAD_DIR/allCountries.zip"
DATA_FILE="$DOWNLOAD_DIR/allCountries.txt"
MIN_DATA_FILE="$DOWNLOAD_DIR/allCountries.min.txt"
}
# Check PostgreSQL connection
check_postgres_connection() {
log "Checking PostgreSQL connection..."
sudo -u postgres psql -d "$DB_NAME" -c "SELECT 1;" > /dev/null 2>&1 || error_exit "Failed to connect to PostgreSQL database: $DB_NAME"
}
# Download file with prompt
download_file() {
if [[ -f "$ZIP_FILE" ]]; then
read -rp "File already exists. Redownload? (y/n) [n]: " redownload
[[ "$redownload" != "y" ]] && return 1
fi
log "Downloading allCountries.zip..."
wget -O "$ZIP_FILE" "$DOWNLOAD_URL" >> "$LOG_FILE" 2>&1 || error_exit "Download failed"
}
# Extract file
extract_file() {
if [[ -f "$DATA_FILE" ]]; then
read -rp "Data file already exists. Re-extract? (y/n) [n]: " reextract
[[ "$reextract" != "y" ]] && return 1
fi
rm -f "$DATA_FILE"
unzip -q "$ZIP_FILE" -d "$(dirname "$ZIP_FILE")" >> "$LOG_FILE" 2>&1 || error_exit "Unzip failed"
}
# Create table structure
create_table() {
log "Creating table $TABLE_NAME in database $DB_NAME..."
sudo -u postgres psql -d "$DB_NAME" <<EOF >> "$LOG_FILE" 2>&1
DROP TABLE IF EXISTS $TABLE_NAME;
CREATE TABLE IF NOT EXISTS $TABLE_NAME (
geonameid BIGINT PRIMARY KEY,
name TEXT,
country_code TEXT,
admin1_code TEXT,
feature_class TEXT,
feature_code TEXT
);
CREATE INDEX idx_geonameid ON $TABLE_NAME(geonameid);
CREATE INDEX idx_name ON $TABLE_NAME(name);
CREATE INDEX idx_country_code ON $TABLE_NAME(country_code);
CREATE INDEX idx_admin1_code ON $TABLE_NAME(admin1_code);
CREATE INDEX idx_feature_class ON $TABLE_NAME(feature_class);
CREATE INDEX idx_feature_code ON $TABLE_NAME(feature_code);
EOF
}
# Prepare data file
prepare_data_file() {
log "Preparing data file..."
awk -F'\t' 'BEGIN {OFS="\t"} {print $1,$2,$9,$11,$7,$8}' "$DATA_FILE" > "$MIN_DATA_FILE" || error_exit "Failed to prepare data file"
}
# Copy file to postgres home directory
copy_to_postgres_home() {
log "Copying data file to PostgreSQL home directory..."
sudo cp "$MIN_DATA_FILE" ~postgres/ || error_exit "Failed to copy data file to PostgreSQL home directory"
sudo chown postgres:postgres ~postgres/allCountries.min.txt || error_exit "Failed to change ownership of data file"
}
# Bulk load data into PostgreSQL
bulk_load_data() {
log "Starting bulk load..."
sudo -u postgres /usr/pgsql-16/bin/pg_bulkload \
-d "$DB_NAME" \
-i ~postgres/allCountries.min.txt \
-O "$TABLE_NAME" \
-o $'DELIMITER=\t' \
-o "TYPE=CSV" \
-o "VERBOSE=YES" \
-o "TRUNCATE=YES" \
-o "ENCODING=UTF8" \
-o "PARSE_ERRORS=INFINITE" \
-o "DUPLICATE_ERRORS=INFINITE" \
-l ~postgres/import.log \
-P ~postgres/bad_rows.txt \
-u ~postgres/duplicates.txt >> "$LOG_FILE" 2>&1 || error_exit "Bulk load failed"
}
# Cleanup temporary files
cleanup() {
log "Cleaning up temporary files..."
rm -f "$ZIP_FILE" "$DATA_FILE" "$MIN_DATA_FILE"
sudo rm -f ~postgres/allCountries.min.txt ~postgres/import.log ~postgres/bad_rows.txt ~postgres/duplicates.txt
}
# Main script
main() {
get_download_location "$1"
check_postgres_connection
download_file
extract_file
create_table
prepare_data_file
copy_to_postgres_home
bulk_load_data
cleanup
log "Data import completed successfully!"
}
# Run main function with optional directory argument
main "$@"
#!/bin/bash
# Configuration
DOWNLOAD_URL="https://download.geonames.org/export/dump/allCountries.zip"
ALTERNATE_NAMES_URL="https://download.geonames.org/export/dump/alternateNamesV2.zip"
DEFAULT_DOWNLOAD_DIR="$HOME/.geonames"
DB_NAME="${GEONAMES_DB:-central_db}"
TABLE_NAME="${GEONAMES_TB:-geonames}"
ALTERNATE_NAMES_TABLE="alternate_names"
LOG_FILE="$DEFAULT_DOWNLOAD_DIR/geonames_import.log"
# Error handling function
error_exit() {
echo "Error: $1" >&2
echo "Error: $1" >> "$LOG_FILE"
exit 1
}
# Logging function
log() {
echo "$1"
echo "$1" >> "$LOG_FILE"
}
# Determine download location
get_download_location() {
DOWNLOAD_DIR="${1:-$DEFAULT_DOWNLOAD_DIR}"
mkdir -p "$DOWNLOAD_DIR" || error_exit "Failed to create download directory: $DOWNLOAD_DIR"
ZIP_FILE="$DOWNLOAD_DIR/allCountries.zip"
DATA_FILE="$DOWNLOAD_DIR/allCountries.txt"
MIN_DATA_FILE="$DOWNLOAD_DIR/allCountries.min.txt"
ALT_NAMES_ZIP="$DOWNLOAD_DIR/alternateNamesV2.zip"
ALT_NAMES_FILE="$DOWNLOAD_DIR/alternateNamesV2.txt"
}
# Check PostgreSQL connection
check_postgres_connection() {
log "Checking PostgreSQL connection..."
sudo -u postgres psql -d "$DB_NAME" -c "SELECT 1;" > /dev/null 2>&1 || error_exit "Failed to connect to PostgreSQL database: $DB_NAME"
}
# Download file with prompt
download_file() {
local url="$1"
local output_file="$2"
local description="$3"
if [[ -f "$output_file" ]]; then
read -rp "File already exists. Redownload? (y/n) [n]: " redownload
[[ "$redownload" != "y" ]] && return 1
fi
log "Downloading $description..."
wget -O "$output_file" "$url" >> "$LOG_FILE" 2>&1 || error_exit "Download failed for $description"
}
# Extract file
extract_file() {
local zip_file="$1"
local output_file="$2"
local description="$3"
if [[ -f "$output_file" ]]; then
read -rp "Data file already exists. Re-extract? (y/n) [n]: " reextract
[[ "$reextract" != "y" ]] && return 1
fi
rm -f "$output_file"
unzip -q "$zip_file" -d "$(dirname "$zip_file")" >> "$LOG_FILE" 2>&1 || error_exit "Unzip failed for $description"
}
# Create geonames table structure
create_geonames_table() {
log "Creating table $TABLE_NAME in database $DB_NAME..."
sudo -u postgres psql -d "$DB_NAME" <<EOF >> "$LOG_FILE" 2>&1
DROP TABLE IF EXISTS $TABLE_NAME;
CREATE TABLE IF NOT EXISTS $TABLE_NAME (
geonameid BIGINT PRIMARY KEY,
name TEXT,
country_code TEXT,
admin1_code TEXT,
feature_class TEXT,
feature_code TEXT
);
CREATE INDEX idx_geonameid ON $TABLE_NAME(geonameid);
CREATE INDEX idx_name ON $TABLE_NAME(name);
CREATE INDEX idx_country_code ON $TABLE_NAME(country_code);
CREATE INDEX idx_admin1_code ON $TABLE_NAME(admin1_code);
CREATE INDEX idx_feature_class ON $TABLE_NAME(feature_class);
CREATE INDEX idx_feature_code ON $TABLE_NAME(feature_code);
EOF
}
# Create alternate names table structure
create_alternate_names_table() {
log "Creating table $ALTERNATE_NAMES_TABLE in database $DB_NAME..."
sudo -u postgres psql -d "$DB_NAME" <<EOF >> "$LOG_FILE" 2>&1
DROP TABLE IF EXISTS $ALTERNATE_NAMES_TABLE;
CREATE TABLE IF NOT EXISTS $ALTERNATE_NAMES_TABLE (
alternateNameId SERIAL PRIMARY KEY,
geonameid INT,
isolanguage VARCHAR(7),
alternateName VARCHAR(400),
isPreferredName BOOLEAN,
isShortName BOOLEAN,
isColloquial BOOLEAN,
isHistoric BOOLEAN,
fromPeriod TEXT,
toPeriod TEXT
);
CREATE INDEX idx_geonameid_alternate ON $ALTERNATE_NAMES_TABLE(geonameid);
CREATE INDEX idx_isolanguage ON $ALTERNATE_NAMES_TABLE(isolanguage);
EOF
}
# Prepare data file
prepare_data_file() {
log "Preparing data file..."
awk -F'\t' 'BEGIN {OFS="\t"} {print $1,$2,$9,$11,$7,$8}' "$DATA_FILE" > "$MIN_DATA_FILE" || error_exit "Failed to prepare data file"
}
# Copy file to postgres home directory
copy_to_postgres_home() {
local file="$1"
local description="$2"
log "Copying $description to PostgreSQL home directory..."
sudo cp "$file" ~postgres/ || error_exit "Failed to copy $description to PostgreSQL home directory"
sudo chown postgres:postgres ~postgres/"$(basename "$file")" || error_exit "Failed to change ownership of $description"
}
# Bulk load data into PostgreSQL
bulk_load_data() {
local input_file="$1"
local table_name="$2"
local description="$3"
log "Starting bulk load for $description..."
sudo -u postgres /usr/pgsql-16/bin/pg_bulkload \
-d "$DB_NAME" \
-i ~postgres/"$(basename "$input_file")" \
-O "$table_name" \
-o $'DELIMITER=\t' \
-o "TYPE=CSV" \
-o "VERBOSE=YES" \
-o "TRUNCATE=YES" \
-o "ENCODING=UTF8" \
-o "PARSE_ERRORS=INFINITE" \
-o "DUPLICATE_ERRORS=INFINITE" \
-l ~postgres/import.log \
-P ~postgres/bad_rows.txt \
-u ~postgres/duplicates.txt >> "$LOG_FILE" 2>&1 || error_exit "Bulk load failed for $description"
}
# Cleanup temporary files with user prompt
cleanup() {
read -rp "Do you want to delete temporary files (ZIP, TXT, etc.)? (y/n) [n]: " cleanup_choice
if [[ "$cleanup_choice" == "y" ]]; then
log "Cleaning up temporary files..."
rm -f "$ZIP_FILE" "$DATA_FILE" "$MIN_DATA_FILE" "$ALT_NAMES_ZIP" "$ALT_NAMES_FILE"
sudo rm -f ~postgres/allCountries.min.txt ~postgres/alternateNamesV2.txt ~postgres/import.log ~postgres/bad_rows.txt ~postgres/duplicates.txt
log "Temporary files deleted."
else
log "Temporary files retained."
fi
}
# Main script
main() {
get_download_location "$1"
check_postgres_connection
# Download and import allCountries data
download_file "$DOWNLOAD_URL" "$ZIP_FILE" "allCountries.zip"
extract_file "$ZIP_FILE" "$DATA_FILE" "allCountries.txt"
create_geonames_table
prepare_data_file
copy_to_postgres_home "$MIN_DATA_FILE" "allCountries.min.txt"
bulk_load_data "$MIN_DATA_FILE" "$TABLE_NAME" "geonames data"
# Prompt to download and import alternateNamesV2 data
read -rp "Do you want to download and import alternate names data? (y/n) [n]: " import_alternate_names
if [[ "$import_alternate_names" == "y" ]]; then
download_file "$ALTERNATE_NAMES_URL" "$ALT_NAMES_ZIP" "alternateNamesV2.zip"
extract_file "$ALT_NAMES_ZIP" "$ALT_NAMES_FILE" "alternateNamesV2.txt"
create_alternate_names_table
copy_to_postgres_home "$ALT_NAMES_FILE" "alternateNamesV2.txt"
bulk_load_data "$ALT_NAMES_FILE" "$ALTERNATE_NAMES_TABLE" "alternate names data"
else
log "Skipping alternate names data import."
fi
cleanup
log "Data import completed successfully!"
}
# Run main function with optional directory argument
main "$@"
-- Query 1: Fetch All Countries
SELECT
geonameid AS country_id,
name AS country_name,
country_code
FROM
geonames
WHERE
feature_code IN ('PCLI', 'PCLD')
ORDER BY
country_name;
-- 1. Query to Fetch All Countries with Preferred Alternate Name
SELECT DISTINCT
g.country_code AS country_code,
COALESCE(a.alternateName, g.name) AS country_name -- Use alternate name if available, otherwise use default name
FROM
geonames g
LEFT JOIN
alternate_names a
ON
g.geonameid = a.geonameid
WHERE
g.feature_class = 'A' -- Assuming 'A' represents country-level features
AND g.feature_code = 'PCLI' -- Assuming 'PCLI' represents independent political entities (countries)
AND a.isolanguage = 'en' -- Replace 'fr' with the desired language code
AND a.isPreferredName = '1' -- Fetch only preferred names
ORDER BY
country_name;
-- Query 2: Fetch All States/Regions for a Country
SELECT
geonameid AS state_id,
name AS state_name,
admin1_code AS state_code
FROM
geonames
WHERE
feature_code = 'ADM1'
AND country_code = 'US' -- Replace 'US' with the desired country code
ORDER BY
state
-- 2. Query to Fetch All States/Regions for a Country with Preferred Alternate Name
SELECT DISTINCT
g.admin1_code AS state_code,
COALESCE(a.alternateName, g.name) AS state_name -- Use alternate name if available, otherwise use default name
FROM
geonames g
LEFT JOIN
alternate_names a
ON
g.geonameid = a.geonameid
AND a.isolanguage = 'en' -- Replace 'fr' with the desired language code
AND a.isPreferredName = '1' -- Fetch only preferred names
WHERE
g.country_code = 'GH' -- Replace 'US' with the desired country code
AND g.feature_class = 'A' -- Assuming 'A' represents administrative divisions
AND g.feature_code = 'ADM1' -- Assuming 'ADM1' represents first-level administrative divisions (states/regions)
ORDER BY
state_name;
-- 3. Query to Fetch All Cities for a State/Region
SELECT
geonameid AS city_id,
name AS city_name
FROM
geonames
WHERE
country_code = 'US' -- Replace 'US' with the desired country code
AND admin1_code = 'CA' -- Replace 'CA' with the desired state/region code
AND feature_class = 'P' -- Assuming 'P' represents populated places (cities, towns, etc.)
AND feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLC'); -- Assuming these represent cities/towns
ORDER BY
city_name;
-- 3. Query to Fetch All Cities for a State/Region with Preferred Alternate Name
SELECT
g.geonameid AS city_id,
COALESCE(a.alternateName, g.name) AS city_name -- Use alternate name if available, otherwise use default name
FROM
geonames g
LEFT JOIN
alternate_names a
ON
g.geonameid = a.geonameid
AND a.isolanguage = 'en' -- Replace 'fr' with the desired language code
AND a.isPreferredName = '1' -- Fetch only preferred names
WHERE
g.country_code = 'GH' -- Replace 'US' with the desired country code
AND g.admin1_code = '09' -- Replace 'CA' with the desired state/region code
AND g.feature_class = 'P' -- Assuming 'P' represents populated places (cities, towns, etc.)
AND g.feature_code IN ('PPL', 'PPLA', 'PPLA2', 'PPLC') -- Assuming these represent cities/towns
ORDER BY
city_name;
SELECT * FROM geonames WHERE feature_class = 'A' AND feature_code = 'PCLI';
SELECT * FROM geonames WHERE feature_class = 'A' AND feature_code = 'ADM2' AND country_code= 'GH' AND admin1_code = '09'
SELECT * FROM geonames WHERE feature_class = 'A' AND feature_code = 'ADM1' AND country_code= 'GH' ;
SELECT * FROM geonames WHERE country_code= 'GH' AND feature_class = 'P' AND feature_code = 'PPL' AND admin1_code = '09';
SELECT * FROM geonames WHERE country_code= 'GH' AND feature_class = 'P' AND feature_code = 'PPLA'
-- Get all cities in US
SELECT * FROM geonames WHERE country_code= 'GH' AND feature_class = 'P' AND feature_code = 'PPL';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment