Last active
February 26, 2025 16:03
-
-
Save rorybyrne/3c2e0d0eacc00552f8feb23f0ad0456c to your computer and use it in GitHub Desktop.
A simple CLI for interacting with a SLURM HPC
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# vim: ft=sh
#
# HPC job management tool for the local machine.
#
# Syncs the current project to a SLURM cluster, submits batch scripts, and
# tracks jobs locally under ./.hpc using short aliases (j001, j002, ...).
#
# NOTE(review): "<host>" and "<user>" below are placeholders — fill in real
# values before use.

# Remote configuration (constants; readonly guards accidental reassignment).
readonly HPC_HOST="<host>"
readonly HPC_USER="<user>"
readonly HPC_CODE_DIR="/home/${HPC_USER}/code"

# Local directory structure (project-specific, rooted at the CWD).
readonly HPC_DIR=".hpc"
readonly HPC_SCRIPTS_DIR="${HPC_DIR}/scripts"
readonly HPC_JOBS_DIR="${HPC_DIR}/jobs"
readonly HPC_RESULTS_DIR="${HPC_DIR}/results"

# Job metadata storage: one pipe-delimited file per job ID, alias symlinks,
# and a counter file holding the next alias number.
readonly HPC_JOBS_META_DIR="${HPC_JOBS_DIR}/metadata"
readonly HPC_JOBS_ALIAS_DIR="${HPC_JOBS_DIR}/aliases"
readonly HPC_JOBS_SEQUENCE="${HPC_JOBS_DIR}/next_alias"
# Create the project-local .hpc directory tree and seed the alias counter.
# Safe to call repeatedly; existing files are left untouched.
init_local_storage() {
  # One mkdir -p call covers the whole tree (parents are created as needed).
  mkdir -p \
    "${HPC_SCRIPTS_DIR}" \
    "${HPC_JOBS_META_DIR}" \
    "${HPC_JOBS_ALIAS_DIR}" \
    "${HPC_RESULTS_DIR}"

  # Seed the alias sequence on first run only.
  [[ ! -f "${HPC_JOBS_SEQUENCE}" ]] && echo "1" > "${HPC_JOBS_SEQUENCE}"
}
# Print the next short alias (e.g. "j001") on stdout and advance the counter.
#
# Robustness fix: if the counter file is missing or holds a non-number, the
# original fed "" to printf %d (error) — now we fall back to 1.
get_next_alias() {
  local next
  next=$(cat "${HPC_JOBS_SEQUENCE}" 2>/dev/null)
  [[ "$next" =~ ^[0-9]+$ ]] || next=1
  echo $((next + 1)) > "${HPC_JOBS_SEQUENCE}"
  printf "j%03d" "$next"
}
# Record job metadata and create a short-alias symlink pointing at it.
#
# Arguments:
#   $1 - scheduler job ID
#   $2 - job name
#   $3 - remote project directory
#   $4 - remote results directory
#   $5 - short alias (e.g. "j001")
# Outputs: prints the alias on stdout.
save_job_info() {
  local job_id=$1
  local job_name=$2
  local remote_dir=$3
  local output_dir=$4
  local alias=$5

  # Metadata record: pipe-separated fields, read back via cut -d'|'.
  echo "${job_name}|${remote_dir}|${output_dir}|$(date +%s)" > "${HPC_JOBS_META_DIR}/${job_id}"

  # Alias symlink. Fix: the original used a CWD-relative target
  # ("${HPC_JOBS_META_DIR}/${job_id}"), which dangles when placed inside the
  # alias directory. A target relative to the alias dir resolves correctly,
  # and basename "$(readlink ...)" still yields the job ID for consumers.
  ln -sf "../metadata/${job_id}" "${HPC_JOBS_ALIAS_DIR}/${alias}"
  echo "$alias"
}
# Map a short alias (jNNN) or a raw scheduler job ID to the full job ID.
# Prints the job ID on stdout; exits 1 with a message on stderr when the
# reference is unknown.
resolve_job_id() {
  local ref=$1
  local link_target

  if [[ ! $ref =~ ^j[0-9]+$ ]]; then
    # Raw job ID: accept it only if we hold metadata for it.
    if [[ -f "${HPC_JOBS_META_DIR}/${ref}" ]]; then
      echo "$ref"
      return
    fi
    echo "Error: Job ID $ref not found" >&2
    exit 1
  fi

  # Short alias: the symlink's target filename is the real job ID.
  link_target=$(readlink "${HPC_JOBS_ALIAS_DIR}/${ref}")
  if [[ -z "$link_target" ]]; then
    echo "Error: Alias $ref not found" >&2
    exit 1
  fi
  basename "$link_target"
}
# Command: hpc new <job_name> [time] [partition] [account]
# Generate an executable SLURM batch-script skeleton in ${HPC_SCRIPTS_DIR}.
cmd_new() {
  local job_name=$1
  local time=${2:-"02:00:00"}       # SLURM walltime limit
  local partition=${3:-"icelake"}
  local account=${4:-"<account>"}   # NOTE(review): placeholder default

  # Fix: without a name the original wrote to "${HPC_SCRIPTS_DIR}/" — refuse.
  if [[ -z "$job_name" ]]; then
    echo "Error: job name required" >&2
    exit 1
  fi

  local output_file="${HPC_SCRIPTS_DIR}/${job_name}"
  echo "Creating new job script: ${output_file}"

  # Unquoted EOF: ${job_name}/${account}/... expand now; \$-escaped parts are
  # left literal for SLURM to evaluate on the compute node.
  cat > "$output_file" <<EOF
#!/bin/bash
#SBATCH -J ${job_name}
#SBATCH -A ${account}
#SBATCH -p ${partition}
#SBATCH --mem=2G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=${time}
#SBATCH --mail-type=NONE
#! Load modules required:
. /etc/profile.d/modules.sh
module purge
module load rhel8/default-icl
#! Echo useful information to stdout
echo "Job script information:"
echo "================================================================================"
echo "This job is allocated on \${SLURM_JOB_NUM_NODES} node(s)"
echo "Job is running on node(s):"
echo \$SLURM_JOB_NODELIST
echo "Job started on: \$(date)"
echo "Current directory: \$(pwd)"
echo "================================================================================"
echo
echo
# Your commands go here:
echo
echo "--- Job finished ---"
echo "Job finished on: \$(date)"
EOF
  chmod +x "$output_file"
  echo -e "\033[32mCreated job script:\033[0m $output_file"
}
# Command: hpc submit <script_name>
# Sync the project to the HPC, run the remote dependency sync, and sbatch
# the named script from ${HPC_SCRIPTS_DIR}.
#
# stdout: ONLY the short alias of the submitted job. This is a bug fix: the
#         original printed all progress messages to stdout, so
#         cmd_jupyter's `alias=$(cmd_submit ...)` captured multiline garbage
#         instead of the alias. Progress/summary now goes to stderr, which
#         keeps it visible in a terminal while making the alias capturable.
# Exits non-zero if any step fails.
cmd_submit() {
  local script_name=$1
  local script_path="${HPC_SCRIPTS_DIR}/${script_name}"

  if [[ ! -f "$script_path" ]]; then
    echo "Error: Script not found: $script_path" >&2
    exit 1
  fi

  # Get alias early so we can use it for remote result paths.
  local alias
  alias=$(get_next_alias)
  local dir_name
  dir_name=$(basename "$(pwd)")
  local remote_dir="${HPC_CODE_DIR}/${dir_name}"
  local remote_results="${remote_dir}/.hpc/results/${alias}"

  echo "Setting up directories..." >&2
  ssh "${HPC_USER}@${HPC_HOST}" "mkdir -p '${remote_dir}' '${remote_results}'" || {
    echo "Error: Failed to create directories" >&2
    exit 1
  }

  echo "Syncing project files to HPC:${remote_dir}..." >&2
  # rsync --progress writes to stdout; redirect so it cannot pollute the
  # alias that we emit on stdout at the end.
  rsync -a --progress \
    --exclude='.hpc/results/*' \
    --exclude='.venv' \
    --exclude='data' \
    --exclude='.git' \
    --exclude='results' \
    --exclude='__pycache__' \
    --exclude='.pytest_cache' \
    --exclude='*.pyc' \
    . "${HPC_USER}@${HPC_HOST}:${remote_dir}/" >&2 || {
    echo "Error: Failed to sync files to HPC" >&2
    exit 1
  }

  echo "Running dependency sync on HPC..." >&2
  ssh "${HPC_USER}@${HPC_HOST}" "cd '${remote_dir}' && just sync" >&2 || {
    echo "Error: Failed to run dependency sync on HPC" >&2
    exit 1
  }

  echo "Submitting job ${HPC_SCRIPTS_DIR}/${script_name}..." >&2
  local submission_output
  submission_output=$(ssh "${HPC_USER}@${HPC_HOST}" "cd '${remote_dir}' && sbatch \
    --output='${remote_results}/stdout' \
    --error='${remote_results}/stderr' \
    --export=FLYWHL_RESULTS_DIR='${remote_results}' \
    '${HPC_SCRIPTS_DIR}/${script_name}'" 2>&1)

  # sbatch reports: "Submitted batch job <id>".
  local job_id
  job_id=$(echo "$submission_output" | awk '/Submitted batch job/ {print $4}')
  if [[ -z "$job_id" ]]; then
    echo "Error submitting job: $submission_output" >&2
    exit 1
  fi

  # save_job_info echoes the alias itself; discard so stdout stays clean.
  save_job_info "$job_id" "$dir_name" "$remote_dir" "$remote_results" "$alias" >/dev/null

  {
    echo "Job submitted successfully:"
    echo "  Alias:             ${alias}"
    echo "  Full ID:           ${job_id}"
    echo "  Name:              ${dir_name}"
    echo "  Remote directory:  ${remote_dir}"
    echo "  Results directory: ${remote_results}"
  } >&2

  # The alias is the command's sole stdout, for capture by callers.
  echo "$alias"
}
# Command: hpc log <job_id/alias>
# Refresh the local copies of the job's output files, then print stderr
# (colored red) followed by stdout.
cmd_log() {
  local ref=$1

  # Pull the latest outputs from the HPC before displaying anything.
  cmd_fetch "$ref"

  local results_dir="${HPC_RESULTS_DIR}/${ref}"

  echo "=== stderr ==="
  if [[ -f "${results_dir}/stderr" ]]; then
    # Wrap the whole stderr dump in ANSI red.
    echo -e "\033[0;31m$(cat "${results_dir}/stderr")\033[0m"
  fi

  echo -e "\n=== stdout ==="
  if [[ -f "${results_dir}/stdout" ]]; then
    cat "${results_dir}/stdout"
  fi
}
# Command: hpc fetch <job_id/alias> [--filetypes type1,type2,...]
# Download the job's stdout/stderr (and optionally files with the listed
# extensions) from the remote results directory into ${HPC_RESULTS_DIR}/<ref>.
cmd_fetch() {
  local ref=$1
  shift

  # Resolve the real job ID (runs in a subshell; failure leaves it empty).
  local job_id
  job_id=$(resolve_job_id "$ref")

  # Parse remaining arguments.
  local filetypes=""
  while [[ $# -gt 0 ]]; do
    case $1 in
      --filetypes)
        filetypes=$2
        shift 2
        ;;
      *)
        echo "Unknown option: $1"
        exit 1
        ;;
    esac
  done

  # Metadata record format: name|remote_dir|remote_results|timestamp.
  local record
  record=$(cat "${HPC_JOBS_META_DIR}/${job_id}")
  local remote_results
  remote_results=$(echo "$record" | cut -d'|' -f3)

  # Local destination is keyed by the user-supplied reference.
  local dest="${HPC_RESULTS_DIR}/${ref}"
  mkdir -p "$dest"

  # Fetch stdout and stderr with standardized names; missing files are fine.
  echo "Fetching job output files..."
  local stream
  for stream in stdout stderr; do
    scp "${HPC_USER}@${HPC_HOST}:${remote_results}/${stream}" "${dest}/${stream}" 2>/dev/null
  done

  # Fetch any additional requested file types.
  if [[ -n "$filetypes" ]]; then
    echo "Fetching files with extensions: $filetypes"
    local IFS=','
    local ext
    for ext in $filetypes; do
      echo "Fetching *.$ext files..."
      scp "${HPC_USER}@${HPC_HOST}:${remote_results}/*.${ext}" "$dest/" 2>/dev/null
    done
  fi

  echo "Files downloaded to: $dest"
}
# Command: hpc list/ls
# Print one row per known job: alias, scheduler ID, name, live status from
# squeue, and submission time.
cmd_list() {
  local format="%-6s %-12s %-20s %-10s %s\n"
  printf "$format" "ALIAS" "JOB ID" "NAME" "STATUS" "SUBMITTED"
  printf "%s\n" "------------------------------------------------------------------------"

  local alias_file
  for alias_file in "${HPC_JOBS_ALIAS_DIR}"/j*; do
    [[ -L "$alias_file" ]] || continue
    local alias=$(basename "$alias_file")
    local job_id=$(basename "$(readlink "$alias_file")")
    local job_info=$(cat "${HPC_JOBS_META_DIR}/${job_id}")
    local job_name=$(echo "$job_info" | cut -d'|' -f1)
    local submit_time=$(echo "$job_info" | cut -d'|' -f4)

    # A job squeue no longer knows about is assumed finished.
    local status=$(ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -h -o %T" 2>/dev/null || echo "COMPLETED")

    # Fix: epoch -> human-readable portably. BSD/macOS date takes an epoch
    # via -r, while GNU date interprets -r as a file reference and needs
    # -d @<epoch>; try BSD first, then fall back to GNU.
    local formatted_date
    formatted_date=$(date -r "$submit_time" "+%Y-%m-%d %H:%M" 2>/dev/null) \
      || formatted_date=$(date -d "@$submit_time" "+%Y-%m-%d %H:%M")

    printf "$format" \
      "$alias" \
      "$job_id" \
      "$job_name" \
      "$status" \
      "$formatted_date"
  done
}
# Command: hpc status <job_id/alias>
# Show the scheduler's queue entry for a single job.
cmd_status() {
  local job_id
  job_id=$(resolve_job_id "$1")
  echo "Fetching status for job ${job_id}..."
  # Wide squeue format: id, name, user, state, elapsed, limit, nodes, reason.
  ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -o '%.18i %.8j %.8u %.8T %.10M %.9l %.6D %R'"
}
# Command: hpc cancel <job_id/alias>
# Ask the scheduler to cancel the job, then report locally.
cmd_cancel() {
  local job_id
  job_id=$(resolve_job_id "$1")
  ssh "${HPC_USER}@${HPC_HOST}" "scancel ${job_id}"
  echo "Cancelled job ${job_id}"
}
### Jupyter ####################################################################
# Command: hpc jupyter [partition] [time]
# Write a batch script that launches Jupyter Lab on a compute node, submit it
# via cmd_submit, then poll for the connection info and print SSH-tunnel
# instructions.
cmd_jupyter() {
  local partition=${1:-"icelake"}
  local time=${2:-"04:00:00"}
  local script_name="jupyter_notebook"
  local output_file="${HPC_SCRIPTS_DIR}/${script_name}"
  echo "Creating and submitting Jupyter notebook job..."
  # Create the Jupyter launch script.
  # Unquoted EOF: ${partition}/${time} expand now; the \$-escaped parts are
  # left literal for the batch job to evaluate on the compute node.
  # NOTE(review): "-A <account>" is a hard-coded placeholder here, unlike
  # cmd_new which accepts the account as an argument — confirm before use.
  cat > "$output_file" <<EOF
#!/bin/bash
#SBATCH -J jupyter
#SBATCH -A <account>
#SBATCH -p ${partition}
#SBATCH --mem=16G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=${time}
#SBATCH --mail-type=NONE
#! Load modules
. /etc/profile.d/modules.sh
module purge
module load rhel8/default-icl
# Set up Jupyter runtime directory
export JUPYTER_RUNTIME_DIR=./jupyter_runtime
mkdir -p \$JUPYTER_RUNTIME_DIR
# Get port and IP for tunneling
XDG_RUNTIME_DIR=""
ipnport=\$(shuf -i8000-9999 -n1)
ipnip=\$(hostname -i)
# Print tunneling instructions
echo "JUPYTER_NOTEBOOK_INFO"
echo "PORT=\${ipnport}"
echo "IP=\${ipnip}"
echo "NODE=\$(hostname)"
echo "END_JUPYTER_NOTEBOOK_INFO"
# Launch Jupyter Lab
jupyter lab --no-browser --port=\$ipnport --ip=\$ipnip
EOF
  chmod +x "$output_file"
  # Submit the job and capture the alias.
  # NOTE(review): $(cmd_submit ...) captures *everything* cmd_submit writes
  # to stdout — this only yields a clean jNNN alias if cmd_submit emits
  # nothing but the alias on stdout; verify against cmd_submit's output.
  local alias=$(cmd_submit "$script_name")
  # Poll until the notebook reports its connection info, then show it.
  wait_for_jupyter "$alias"
  echo "Job submitted as ${alias}"
}
# Poll until the Jupyter job writes its connection markers to stdout, then
# print SSH-tunnel instructions.
#
# Arguments: $1 - job alias (jNNN)
# Returns:   0 once connection info has been shown;
#            1 if the job leaves the queue before any info appears.
#
# Fix: return immediately after printing the instructions — the original set
# a flag and still performed one more scheduler poll plus a 2-second sleep
# before the loop condition was re-checked.
wait_for_jupyter() {
  local alias=$1
  local job_id
  job_id=$(resolve_job_id "$alias")
  local results_dir="${HPC_RESULTS_DIR}/${alias}"

  echo "Waiting for Jupyter server to start..."
  while true; do
    # Pull the latest stdout from the HPC (errors ignored while starting).
    cmd_fetch "$alias" >/dev/null 2>&1

    if [[ -f "${results_dir}/stdout" ]] \
        && grep -q "JUPYTER_NOTEBOOK_INFO" "${results_dir}/stdout"; then
      # Extract the marker lines written by the batch script.
      local port=$(grep "PORT=" "${results_dir}/stdout" | cut -d'=' -f2)
      local ip=$(grep "IP=" "${results_dir}/stdout" | cut -d'=' -f2)
      local node=$(grep "NODE=" "${results_dir}/stdout" | cut -d'=' -f2)
      if [[ -n "$port" && -n "$ip" && -n "$node" ]]; then
        echo -e "\nJupyter server is running!"
        echo -e "\nTo connect, run these commands in a new terminal:"
        echo -e "\033[1m  ssh -N -L ${port}:${ip}:${port} ${HPC_USER}@${HPC_HOST}\033[0m"
        echo -e "\nThen open this URL in your browser:"
        echo -e "\033[1m  http://localhost:${port}\033[0m"
        echo -e "\nUse \033[1mhpc cancel ${alias}\033[0m to stop the Jupyter server"
        return 0
      fi
    fi

    # Bail out if the job has left the queue without producing info.
    local status
    status=$(ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -h -o %T" 2>/dev/null)
    if [[ -z "$status" ]]; then
      echo "Error: Job terminated unexpectedly" >&2
      return 1
    fi
    sleep 2
  done
}
# Initialize storage on first run (creates the .hpc/ tree in the CWD).
init_local_storage

# Print the command summary.
usage() {
  echo "Usage: hpc <command> [options]"
  echo "Commands:"
  echo "  new <job_name> [time] [partition] [account]  Create a new job script"
  echo "  submit <script_name>                         Submit a job script to HPC"
  echo "  fetch <id> [--filetypes t1,t2,...]           Fetch job outputs and specified file types"
  echo "  log <id>                                     Display stdout and stderr from job"
  echo "  status <id>                                  Check job status"
  echo "  cancel <id>                                  Cancel a running job"
  echo "  list (ls)                                    List known jobs and their status"
  echo "  jupyter [partition] [time]                   Launch Jupyter notebook server (default: icelake, 4h)"
}

# Main command router.
case "$1" in
  new)
    shift
    cmd_new "$@"
    ;;
  submit)
    shift
    cmd_submit "$@"
    ;;
  fetch)
    shift
    cmd_fetch "$@"
    ;;
  log)
    shift
    cmd_log "$@"
    ;;
  status)
    shift
    cmd_status "$@"
    ;;
  cancel)
    shift
    cmd_cancel "$@"
    ;;
  list|ls)
    cmd_list
    ;;
  jupyter)
    shift
    cmd_jupyter "$@"
    ;;
  ""|help|-h|--help)
    # No command (or explicit help): print usage and succeed.
    usage
    ;;
  *)
    # Fix: an unknown command previously printed usage to stdout and exited
    # 0; fail loudly so typos are detectable by callers.
    echo "Unknown command: $1" >&2
    usage >&2
    exit 1
    ;;
esac
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment