A simple CLI for interacting with a SLURM HPC

#!/bin/bash
# vim: ft=sh
#
# HPC job management tool, run from the local machine
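#
# Usage overview (mirrors the help text at the bottom of this script):
#   hpc new <job_name> [time] [partition] [account]
#   hpc submit <script_name>
#   hpc fetch <id> [--filetypes t1,t2,...]
#   hpc log <id>
#   hpc status <id>
#   hpc cancel <id>
#   hpc list   (alias: ls)
#   hpc jupyter [partition] [time]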

# Remote configuration
HPC_HOST="<host>"
HPC_USER="<user>"
HPC_CODE_DIR="/home/${HPC_USER}/code"

# Local directory structure (project-specific)
HPC_DIR=".hpc"
HPC_SCRIPTS_DIR="${HPC_DIR}/scripts"
HPC_JOBS_DIR="${HPC_DIR}/jobs"
HPC_RESULTS_DIR="${HPC_DIR}/results"

# Job metadata storage
HPC_JOBS_META_DIR="${HPC_JOBS_DIR}/metadata"
HPC_JOBS_ALIAS_DIR="${HPC_JOBS_DIR}/aliases"
HPC_JOBS_SEQUENCE="${HPC_JOBS_DIR}/next_alias"
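
# Resulting on-disk layout under the project root (sketch):
#   .hpc/scripts/             job scripts created by `hpc new`
#   .hpc/results/<alias>/     fetched stdout/stderr and other outputs
#   .hpc/jobs/metadata/<id>   one record per SLURM job ID
#   .hpc/jobs/aliases/jNNN    symlink to the metadata record
#   .hpc/jobs/next_alias      counter used to mint the next jNNN alias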

# Initialize local storage in the project directory
init_local_storage() {
    # Create directory structure
    mkdir -p "${HPC_SCRIPTS_DIR}"
    mkdir -p "${HPC_JOBS_META_DIR}"
    mkdir -p "${HPC_JOBS_ALIAS_DIR}"
    mkdir -p "${HPC_RESULTS_DIR}"

    # Initialize the job sequence counter if it does not exist
    [[ ! -f "${HPC_JOBS_SEQUENCE}" ]] && echo "1" > "${HPC_JOBS_SEQUENCE}"
}

# Get the next available short alias
get_next_alias() {
    local next=$(cat "${HPC_JOBS_SEQUENCE}")
    echo $((next + 1)) > "${HPC_JOBS_SEQUENCE}"
    printf "j%03d" "$next"
}

# Save job information and create alias
save_job_info() {
    local job_id=$1
    local job_name=$2
    local remote_dir=$3
    local output_dir=$4
    local alias=$5

    # Create job info file in the metadata directory
    echo "${job_name}|${remote_dir}|${output_dir}|$(date +%s)" > "${HPC_JOBS_META_DIR}/${job_id}"

    # Create alias symlink
    ln -sf "${HPC_JOBS_META_DIR}/${job_id}" "${HPC_JOBS_ALIAS_DIR}/${alias}"
    echo "$alias"
}
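
# A metadata record is a single pipe-delimited line (hypothetical values):
#   myproject|/home/<user>/code/myproject|/home/<user>/code/myproject/.hpc/results/j001|1708956000
# Fields: job_name|remote_dir|output_dir|submit_epoch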

# Resolve job ID or alias to full job ID
# NOTE: callers invoke this in a command substitution, so `exit 1` only
# aborts the subshell; callers must also check for empty output.
resolve_job_id() {
    local id=$1
    if [[ $id =~ ^j[0-9]+$ ]]; then
        # It's an alias - get the real job ID from the symlink
        local link_target=$(readlink "${HPC_JOBS_ALIAS_DIR}/${id}")
        if [[ -n "$link_target" ]]; then
            basename "$link_target"
        else
            echo "Error: Alias $id not found" >&2
            exit 1
        fi
    else
        # It's a job ID - verify it exists
        if [[ -f "${HPC_JOBS_META_DIR}/${id}" ]]; then
            echo "$id"
        else
            echo "Error: Job ID $id not found" >&2
            exit 1
        fi
    fi
}
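
# Both forms are accepted wherever an <id> is expected (hypothetical IDs):
#   hpc status j001       # short alias
#   hpc status 12345678   # full SLURM job ID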

# Command: hpc new <job_name> [time] [partition] [account]
cmd_new() {
    local job_name=$1
    local time=${2:-"02:00:00"}
    local partition=${3:-"icelake"}
    local account=${4:-"<account>"}
    local output_file="${HPC_SCRIPTS_DIR}/${job_name}"

    echo "Creating new job script: ${output_file}"
    cat > "$output_file" <<EOF
#!/bin/bash
#SBATCH -J ${job_name}
#SBATCH -A ${account}
#SBATCH -p ${partition}
#SBATCH --mem=2G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=${time}
#SBATCH --mail-type=NONE

#! Load required modules:
. /etc/profile.d/modules.sh
module purge
module load rhel8/default-icl

#! Echo useful information to stdout
echo "Job script information:"
echo "================================================================================"
echo "This job is allocated on \${SLURM_JOB_NUM_NODES} node(s)"
echo "Job is running on node(s):"
echo \$SLURM_JOB_NODELIST
echo "Job started on: \$(date)"
echo "Current directory: \$(pwd)"
echo "================================================================================"
echo
echo

# Your commands go here:

echo
echo "--- Job finished ---"
echo "Job finished on: \$(date)"
EOF

    chmod +x "$output_file"
    echo -e "\033[32mCreated job script:\033[0m $output_file"
}
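
# Example (hypothetical account name): create a 4-hour job script for the
# icelake partition:
#   hpc new train 04:00:00 icelake MYPROJECT-CPU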

# Command: hpc submit <script_name>
cmd_submit() {
    local script_name=$1
    local script_path="${HPC_SCRIPTS_DIR}/${script_name}"
    if [[ ! -f "$script_path" ]]; then
        echo "Error: Script not found: $script_path" >&2
        exit 1
    fi

    # Get the alias early so we can use it for paths
    local alias=$(get_next_alias)
    local dir_name=$(basename "$(pwd)")
    local remote_dir="${HPC_CODE_DIR}/${dir_name}"
    local remote_results="${remote_dir}/.hpc/results/${alias}"

    echo "Setting up directories..."
    ssh "${HPC_USER}@${HPC_HOST}" "mkdir -p '${remote_dir}' '${remote_results}'" || {
        echo "Error: Failed to create directories" >&2
        exit 1
    }

    echo "Syncing project files to HPC:${remote_dir}..."
    rsync -a --progress \
        --exclude='.hpc/results/*' \
        --exclude='.venv' \
        --exclude='data' \
        --exclude='.git' \
        --exclude='results' \
        --exclude='__pycache__' \
        --exclude='.pytest_cache' \
        --exclude='*.pyc' \
        . "${HPC_USER}@${HPC_HOST}:${remote_dir}/" || {
        echo "Error: Failed to sync files to HPC" >&2
        exit 1
    }

    echo "Running dependency sync on HPC..."
    ssh "${HPC_USER}@${HPC_HOST}" "cd '${remote_dir}' && just sync" || {
        echo "Error: Failed to run dependency sync on HPC" >&2
        exit 1
    }

    echo "Submitting job ${HPC_SCRIPTS_DIR}/${script_name}..."
    # ALL, keeps the job's normal environment in addition to the custom
    # variable (a bare --export=VAR=value would strip everything else)
    local submission_output=$(ssh "${HPC_USER}@${HPC_HOST}" "cd '${remote_dir}' && sbatch \
        --output='${remote_results}/stdout' \
        --error='${remote_results}/stderr' \
        --export=ALL,FLYWHL_RESULTS_DIR='${remote_results}' \
        '${HPC_SCRIPTS_DIR}/${script_name}'" 2>&1)

    local job_id=$(echo "$submission_output" | awk '/Submitted batch job/ {print $4}')
    if [[ -z "$job_id" ]]; then
        echo "Error submitting job: $submission_output" >&2
        exit 1
    fi

    # Use the existing alias when saving job info; discard its echoed alias
    save_job_info "$job_id" "$dir_name" "$remote_dir" "$remote_results" "$alias" >/dev/null

    echo "Job submitted successfully:"
    echo "  Alias:             ${alias}"
    echo "  Full ID:           ${job_id}"
    echo "  Name:              ${dir_name}"
    echo "  Remote directory:  ${remote_dir}"
    echo "  Results directory: ${remote_results}"
}
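
# Example: submit the script created above; the project is rsynced to the HPC,
# dependencies are synced with `just sync` (assumed to exist in the project's
# justfile), and the job is recorded under a fresh jNNN alias:
#   hpc submit train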

# Command: hpc log <job_id/alias>
cmd_log() {
    local alias=$1

    # First ensure we have the latest outputs
    cmd_fetch "$alias"
    local results_dir="${HPC_RESULTS_DIR}/${alias}"

    # Print stderr in red, stdout in the default color
    echo "=== stderr ==="
    if [[ -f "${results_dir}/stderr" ]]; then
        # printf/cat rather than echo -e, so escape sequences in the job's
        # own output are not interpreted
        printf '\033[0;31m'
        cat "${results_dir}/stderr"
        printf '\033[0m'
    fi

    echo -e "\n=== stdout ==="
    if [[ -f "${results_dir}/stdout" ]]; then
        cat "${results_dir}/stdout"
    fi
}

# Command: hpc fetch <job_id/alias> [--filetypes type1,type2,...]
cmd_fetch() {
    local alias=$1
    shift

    # Resolve the real job ID (resolve_job_id runs in a subshell, so check
    # for empty output rather than relying on its exit status)
    local job_id=$(resolve_job_id "$alias")
    [[ -n "$job_id" ]] || exit 1

    local filetypes=""
    # Parse arguments
    while [[ $# -gt 0 ]]; do
        case $1 in
            --filetypes)
                filetypes=$2
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done

    # Read job info
    local job_info=$(cat "${HPC_JOBS_META_DIR}/${job_id}")
    local remote_results=$(echo "$job_info" | cut -d'|' -f3)

    # Create local directory for job results using the alias
    local results_dir="${HPC_RESULTS_DIR}/${alias}"
    mkdir -p "$results_dir"

    # Fetch stdout and stderr with standardized names
    echo "Fetching job output files..."
    scp "${HPC_USER}@${HPC_HOST}:${remote_results}/stdout" "${results_dir}/stdout" 2>/dev/null
    scp "${HPC_USER}@${HPC_HOST}:${remote_results}/stderr" "${results_dir}/stderr" 2>/dev/null

    # Fetch additional file types if specified
    if [[ -n "$filetypes" ]]; then
        echo "Fetching files with extensions: $filetypes"
        local IFS=','
        for ext in $filetypes; do
            echo "Fetching *.$ext files..."
            scp "${HPC_USER}@${HPC_HOST}:${remote_results}/*.${ext}" "$results_dir/" 2>/dev/null
        done
    fi

    echo "Files downloaded to: $results_dir"
}
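
# Example (hypothetical alias): fetch stdout/stderr plus any CSV and PNG outputs:
#   hpc fetch j001 --filetypes csv,png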

# Command: hpc list/ls
cmd_list() {
    local format="%-6s %-12s %-20s %-10s %s\n"
    printf "$format" "ALIAS" "JOB ID" "NAME" "STATUS" "SUBMITTED"
    printf "%s\n" "------------------------------------------------------------------------"

    for alias_file in "${HPC_JOBS_ALIAS_DIR}"/j*; do
        [[ -L "$alias_file" ]] || continue
        local alias=$(basename "$alias_file")
        local job_id=$(basename "$(readlink "$alias_file")")
        local job_info=$(cat "${HPC_JOBS_META_DIR}/${job_id}")
        local job_name=$(echo "$job_info" | cut -d'|' -f1)
        local submit_time=$(echo "$job_info" | cut -d'|' -f4)

        # Jobs that squeue no longer knows about are assumed to have completed
        local status=$(ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -h -o %T" 2>/dev/null || echo "COMPLETED")

        # BSD/macOS `date -r`; on GNU coreutils use: date -d "@${submit_time}"
        local formatted_date=$(date -r "$submit_time" "+%Y-%m-%d %H:%M")

        printf "$format" \
            "$alias" \
            "$job_id" \
            "$job_name" \
            "$status" \
            "$formatted_date"
    done
}

# Command: hpc status <job_id/alias>
cmd_status() {
    local job_id=$(resolve_job_id "$1")
    [[ -n "$job_id" ]] || exit 1
    echo "Fetching status for job ${job_id}..."
    ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -o '%.18i %.8j %.8u %.8T %.10M %.9l %.6D %R'"
}

# Command: hpc cancel <job_id/alias>
cmd_cancel() {
    local job_id=$(resolve_job_id "$1")
    [[ -n "$job_id" ]] || exit 1
    ssh "${HPC_USER}@${HPC_HOST}" "scancel ${job_id}"
    echo "Cancelled job ${job_id}"
}

### Jupyter ####################################################################

# Command: hpc jupyter [partition] [time]
cmd_jupyter() {
    local partition=${1:-"icelake"}
    local time=${2:-"04:00:00"}
    local script_name="jupyter_notebook"
    local output_file="${HPC_SCRIPTS_DIR}/${script_name}"

    echo "Creating and submitting Jupyter notebook job..."

    # Create the Jupyter launch script
    cat > "$output_file" <<EOF
#!/bin/bash
#SBATCH -J jupyter
#SBATCH -A <account>
#SBATCH -p ${partition}
#SBATCH --mem=16G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=${time}
#SBATCH --mail-type=NONE

#! Load modules
. /etc/profile.d/modules.sh
module purge
module load rhel8/default-icl

# Set up Jupyter runtime directory
export JUPYTER_RUNTIME_DIR=./jupyter_runtime
mkdir -p \$JUPYTER_RUNTIME_DIR

# Get port and IP for tunneling
XDG_RUNTIME_DIR=""
ipnport=\$(shuf -i8000-9999 -n1)
ipnip=\$(hostname -i)

# Print tunneling instructions
echo "JUPYTER_NOTEBOOK_INFO"
echo "PORT=\${ipnport}"
echo "IP=\${ipnip}"
echo "NODE=\$(hostname)"
echo "END_JUPYTER_NOTEBOOK_INFO"

# Launch Jupyter Lab
jupyter lab --no-browser --port=\$ipnport --ip=\$ipnip
EOF

    chmod +x "$output_file"

    # cmd_submit prints progress plus an "Alias: jNNN" line; show its output
    # and pull the alias out of it (capturing cmd_submit directly would
    # otherwise swallow all of its output)
    local submit_output
    submit_output=$(cmd_submit "$script_name")
    echo "$submit_output"
    local alias=$(echo "$submit_output" | awk '/Alias:/ {print $2}')

    # Wait for the notebook to start and print connection instructions
    wait_for_jupyter "$alias"
    echo "Job submitted as ${alias}"
}

# Helper function to wait for Jupyter to start and show connection info
wait_for_jupyter() {
    local alias=$1
    local job_id=$(resolve_job_id "$alias")
    [[ -n "$job_id" ]] || return 1
    local results_dir="${HPC_RESULTS_DIR}/${alias}"

    echo "Waiting for Jupyter server to start..."
    local connected=false
    while ! $connected; do
        # Fetch the latest output
        cmd_fetch "$alias" >/dev/null 2>&1

        if [[ -f "${results_dir}/stdout" ]]; then
            # Look for our marker lines and extract the connection info
            if grep -q "JUPYTER_NOTEBOOK_INFO" "${results_dir}/stdout"; then
                local port=$(grep "PORT=" "${results_dir}/stdout" | cut -d'=' -f2)
                local ip=$(grep "IP=" "${results_dir}/stdout" | cut -d'=' -f2)
                local node=$(grep "NODE=" "${results_dir}/stdout" | cut -d'=' -f2)
                if [[ -n "$port" && -n "$ip" && -n "$node" ]]; then
                    echo -e "\nJupyter server is running!"
                    echo -e "\nTo connect, run this command in a new terminal:"
                    echo -e "\033[1m  ssh -N -L ${port}:${ip}:${port} ${HPC_USER}@${HPC_HOST}\033[0m"
                    echo -e "\nThen open this URL in your browser:"
                    echo -e "\033[1m  http://localhost:${port}\033[0m"
                    echo -e "\nUse \033[1mhpc cancel ${alias}\033[0m to stop the Jupyter server"
                    connected=true
                fi
            fi
        fi

        # Check that the job is still running
        local status=$(ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -h -o %T" 2>/dev/null)
        if [[ -z "$status" ]]; then
            echo "Error: Job terminated unexpectedly" >&2
            return 1
        fi
        sleep 2
    done
}

# Ensure local storage exists (idempotent; runs on every invocation)
init_local_storage

# Main command router
case "$1" in
    new)
        shift
        cmd_new "$@"
        ;;
    submit)
        shift
        cmd_submit "$@"
        ;;
    fetch)
        shift
        cmd_fetch "$@"
        ;;
    log)
        shift
        cmd_log "$@"
        ;;
    status)
        shift
        cmd_status "$@"
        ;;
    cancel)
        shift
        cmd_cancel "$@"
        ;;
    list|ls)
        cmd_list
        ;;
    jupyter)
        shift
        cmd_jupyter "$@"
        ;;
    *)
        echo "Usage: hpc <command> [options]"
        echo "Commands:"
        echo "  new <job_name> [time] [partition] [account]  Create a new job script"
        echo "  submit <script_name>                         Submit a job script to the HPC"
        echo "  fetch <id> [--filetypes t1,t2,...]           Fetch job outputs and specified file types"
        echo "  log <id>                                     Display stdout and stderr from a job"
        echo "  status <id>                                  Check job status"
        echo "  cancel <id>                                  Cancel a running job"
        echo "  list (ls)                                    List tracked jobs"
        echo "  jupyter [partition] [time]                   Launch a Jupyter notebook server (default: icelake, 4h)"
        ;;
esac