A simple CLI for interacting with a SLURM HPC

#!/bin/bash
# vim: ft=sh
#
# HPC job management tool, run from the local machine
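#
# Usage overview (mirrors the help text at the bottom of this script):
#   hpc new <job_name> [time] [partition] [account]
#   hpc submit <script_name>
#   hpc fetch <id> [--filetypes t1,t2,...]
#   hpc log <id>
#   hpc status <id>
#   hpc cancel <id>
#   hpc list   (alias: ls)
#   hpc jupyter [partition] [time]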

# Remote configuration
HPC_HOST="<host>"
HPC_USER="<user>"
HPC_CODE_DIR="/home/${HPC_USER}/code"

# Local directory structure (project-specific)
HPC_DIR=".hpc"
HPC_SCRIPTS_DIR="${HPC_DIR}/scripts"
HPC_JOBS_DIR="${HPC_DIR}/jobs"
HPC_RESULTS_DIR="${HPC_DIR}/results"

# Job metadata storage
HPC_JOBS_META_DIR="${HPC_JOBS_DIR}/metadata"
HPC_JOBS_ALIAS_DIR="${HPC_JOBS_DIR}/aliases"
HPC_JOBS_SEQUENCE="${HPC_JOBS_DIR}/next_alias"
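
# Resulting on-disk layout under the project root (sketch):
#   .hpc/scripts/             job scripts created by `hpc new`
#   .hpc/results/<alias>/     fetched stdout/stderr and other outputs
#   .hpc/jobs/metadata/<id>   one record per SLURM job ID
#   .hpc/jobs/aliases/jNNN    symlink to the metadata record
#   .hpc/jobs/next_alias      counter used to mint the next jNNN alias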

# Initialize local storage in the project directory
init_local_storage() {
    # Create directory structure
    mkdir -p "${HPC_SCRIPTS_DIR}"
    mkdir -p "${HPC_JOBS_META_DIR}"
    mkdir -p "${HPC_JOBS_ALIAS_DIR}"
    mkdir -p "${HPC_RESULTS_DIR}"

    # Initialize the job sequence counter if it does not exist
    [[ ! -f "${HPC_JOBS_SEQUENCE}" ]] && echo "1" > "${HPC_JOBS_SEQUENCE}"
}

# Get the next available short alias
get_next_alias() {
    local next=$(cat "${HPC_JOBS_SEQUENCE}")
    echo $((next + 1)) > "${HPC_JOBS_SEQUENCE}"
    printf "j%03d" "$next"
}

# Save job information and create alias
save_job_info() {
    local job_id=$1
    local job_name=$2
    local remote_dir=$3
    local output_dir=$4
    local alias=$5

    # Create job info file in the metadata directory
    echo "${job_name}|${remote_dir}|${output_dir}|$(date +%s)" > "${HPC_JOBS_META_DIR}/${job_id}"

    # Create alias symlink
    ln -sf "${HPC_JOBS_META_DIR}/${job_id}" "${HPC_JOBS_ALIAS_DIR}/${alias}"
    echo "$alias"
}
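
# A metadata record is a single pipe-delimited line (hypothetical values):
#   myproject|/home/<user>/code/myproject|/home/<user>/code/myproject/.hpc/results/j001|1708956000
# Fields: job_name|remote_dir|output_dir|submit_epoch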

# Resolve job ID or alias to full job ID
# NOTE: callers invoke this in a command substitution, so `exit 1` only
# aborts the subshell; callers must also check for empty output.
resolve_job_id() {
    local id=$1
    if [[ $id =~ ^j[0-9]+$ ]]; then
        # It's an alias - get the real job ID from the symlink
        local link_target=$(readlink "${HPC_JOBS_ALIAS_DIR}/${id}")
        if [[ -n "$link_target" ]]; then
            basename "$link_target"
        else
            echo "Error: Alias $id not found" >&2
            exit 1
        fi
    else
        # It's a job ID - verify it exists
        if [[ -f "${HPC_JOBS_META_DIR}/${id}" ]]; then
            echo "$id"
        else
            echo "Error: Job ID $id not found" >&2
            exit 1
        fi
    fi
}
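
# Both forms are accepted wherever an <id> is expected (hypothetical IDs):
#   hpc status j001       # short alias
#   hpc status 12345678   # full SLURM job ID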

# Command: hpc new <job_name> [time] [partition] [account]
cmd_new() {
    local job_name=$1
    local time=${2:-"02:00:00"}
    local partition=${3:-"icelake"}
    local account=${4:-"<account>"}
    local output_file="${HPC_SCRIPTS_DIR}/${job_name}"

    echo "Creating new job script: ${output_file}"
    cat > "$output_file" <<EOF
#!/bin/bash
#SBATCH -J ${job_name}
#SBATCH -A ${account}
#SBATCH -p ${partition}
#SBATCH --mem=2G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=${time}
#SBATCH --mail-type=NONE

#! Load required modules:
. /etc/profile.d/modules.sh
module purge
module load rhel8/default-icl

#! Echo useful information to stdout
echo "Job script information:"
echo "================================================================================"
echo "This job is allocated on \${SLURM_JOB_NUM_NODES} node(s)"
echo "Job is running on node(s):"
echo \$SLURM_JOB_NODELIST
echo "Job started on: \$(date)"
echo "Current directory: \$(pwd)"
echo "================================================================================"
echo
echo

# Your commands go here:

echo
echo "--- Job finished ---"
echo "Job finished on: \$(date)"
EOF

    chmod +x "$output_file"
    echo -e "\033[32mCreated job script:\033[0m $output_file"
}
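
# Example (hypothetical account name): create a 4-hour job script for the
# icelake partition:
#   hpc new train 04:00:00 icelake MYPROJECT-CPU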

# Command: hpc submit <script_name>
cmd_submit() {
    local script_name=$1
    local script_path="${HPC_SCRIPTS_DIR}/${script_name}"
    if [[ ! -f "$script_path" ]]; then
        echo "Error: Script not found: $script_path" >&2
        exit 1
    fi

    # Get the alias early so we can use it for paths
    local alias=$(get_next_alias)
    local dir_name=$(basename "$(pwd)")
    local remote_dir="${HPC_CODE_DIR}/${dir_name}"
    local remote_results="${remote_dir}/.hpc/results/${alias}"

    echo "Setting up directories..."
    ssh "${HPC_USER}@${HPC_HOST}" "mkdir -p '${remote_dir}' '${remote_results}'" || {
        echo "Error: Failed to create directories" >&2
        exit 1
    }

    echo "Syncing project files to HPC:${remote_dir}..."
    rsync -a --progress \
        --exclude='.hpc/results/*' \
        --exclude='.venv' \
        --exclude='data' \
        --exclude='.git' \
        --exclude='results' \
        --exclude='__pycache__' \
        --exclude='.pytest_cache' \
        --exclude='*.pyc' \
        . "${HPC_USER}@${HPC_HOST}:${remote_dir}/" || {
        echo "Error: Failed to sync files to HPC" >&2
        exit 1
    }

    echo "Running dependency sync on HPC..."
    ssh "${HPC_USER}@${HPC_HOST}" "cd '${remote_dir}' && just sync" || {
        echo "Error: Failed to run dependency sync on HPC" >&2
        exit 1
    }

    echo "Submitting job ${HPC_SCRIPTS_DIR}/${script_name}..."
    # ALL, keeps the job's normal environment in addition to the custom
    # variable (a bare --export=VAR=value would strip everything else)
    local submission_output=$(ssh "${HPC_USER}@${HPC_HOST}" "cd '${remote_dir}' && sbatch \
        --output='${remote_results}/stdout' \
        --error='${remote_results}/stderr' \
        --export=ALL,FLYWHL_RESULTS_DIR='${remote_results}' \
        '${HPC_SCRIPTS_DIR}/${script_name}'" 2>&1)

    local job_id=$(echo "$submission_output" | awk '/Submitted batch job/ {print $4}')
    if [[ -z "$job_id" ]]; then
        echo "Error submitting job: $submission_output" >&2
        exit 1
    fi

    # Use the existing alias when saving job info; discard its echoed alias
    save_job_info "$job_id" "$dir_name" "$remote_dir" "$remote_results" "$alias" >/dev/null

    echo "Job submitted successfully:"
    echo "  Alias:             ${alias}"
    echo "  Full ID:           ${job_id}"
    echo "  Name:              ${dir_name}"
    echo "  Remote directory:  ${remote_dir}"
    echo "  Results directory: ${remote_results}"
}
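
# Example: submit the script created above; the project is rsynced to the HPC,
# dependencies are synced with `just sync` (assumed to exist in the project's
# justfile), and the job is recorded under a fresh jNNN alias:
#   hpc submit train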

# Command: hpc log <job_id/alias>
cmd_log() {
    local alias=$1

    # First ensure we have the latest outputs
    cmd_fetch "$alias"
    local results_dir="${HPC_RESULTS_DIR}/${alias}"

    # Print stderr in red, stdout in the default color
    echo "=== stderr ==="
    if [[ -f "${results_dir}/stderr" ]]; then
        # printf/cat rather than echo -e, so escape sequences in the job's
        # own output are not interpreted
        printf '\033[0;31m'
        cat "${results_dir}/stderr"
        printf '\033[0m'
    fi

    echo -e "\n=== stdout ==="
    if [[ -f "${results_dir}/stdout" ]]; then
        cat "${results_dir}/stdout"
    fi
}

# Command: hpc fetch <job_id/alias> [--filetypes type1,type2,...]
cmd_fetch() {
    local alias=$1
    shift

    # Resolve the real job ID (resolve_job_id runs in a subshell, so check
    # for empty output rather than relying on its exit status)
    local job_id=$(resolve_job_id "$alias")
    [[ -n "$job_id" ]] || exit 1

    local filetypes=""
    # Parse arguments
    while [[ $# -gt 0 ]]; do
        case $1 in
            --filetypes)
                filetypes=$2
                shift 2
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done

    # Read job info
    local job_info=$(cat "${HPC_JOBS_META_DIR}/${job_id}")
    local remote_results=$(echo "$job_info" | cut -d'|' -f3)

    # Create local directory for job results using the alias
    local results_dir="${HPC_RESULTS_DIR}/${alias}"
    mkdir -p "$results_dir"

    # Fetch stdout and stderr with standardized names
    echo "Fetching job output files..."
    scp "${HPC_USER}@${HPC_HOST}:${remote_results}/stdout" "${results_dir}/stdout" 2>/dev/null
    scp "${HPC_USER}@${HPC_HOST}:${remote_results}/stderr" "${results_dir}/stderr" 2>/dev/null

    # Fetch additional file types if specified
    if [[ -n "$filetypes" ]]; then
        echo "Fetching files with extensions: $filetypes"
        local IFS=','
        for ext in $filetypes; do
            echo "Fetching *.$ext files..."
            scp "${HPC_USER}@${HPC_HOST}:${remote_results}/*.${ext}" "$results_dir/" 2>/dev/null
        done
    fi

    echo "Files downloaded to: $results_dir"
}
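
# Example (hypothetical alias): fetch stdout/stderr plus any CSV and PNG outputs:
#   hpc fetch j001 --filetypes csv,png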

# Command: hpc list/ls
cmd_list() {
    local format="%-6s %-12s %-20s %-10s %s\n"
    printf "$format" "ALIAS" "JOB ID" "NAME" "STATUS" "SUBMITTED"
    printf "%s\n" "------------------------------------------------------------------------"

    for alias_file in "${HPC_JOBS_ALIAS_DIR}"/j*; do
        [[ -L "$alias_file" ]] || continue
        local alias=$(basename "$alias_file")
        local job_id=$(basename "$(readlink "$alias_file")")
        local job_info=$(cat "${HPC_JOBS_META_DIR}/${job_id}")
        local job_name=$(echo "$job_info" | cut -d'|' -f1)
        local submit_time=$(echo "$job_info" | cut -d'|' -f4)

        # Jobs that squeue no longer knows about are assumed to have completed
        local status=$(ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -h -o %T" 2>/dev/null || echo "COMPLETED")

        # BSD/macOS `date -r`; on GNU coreutils use: date -d "@${submit_time}"
        local formatted_date=$(date -r "$submit_time" "+%Y-%m-%d %H:%M")

        printf "$format" \
            "$alias" \
            "$job_id" \
            "$job_name" \
            "$status" \
            "$formatted_date"
    done
}

# Command: hpc status <job_id/alias>
cmd_status() {
    local job_id=$(resolve_job_id "$1")
    [[ -n "$job_id" ]] || exit 1
    echo "Fetching status for job ${job_id}..."
    ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -o '%.18i %.8j %.8u %.8T %.10M %.9l %.6D %R'"
}

# Command: hpc cancel <job_id/alias>
cmd_cancel() {
    local job_id=$(resolve_job_id "$1")
    [[ -n "$job_id" ]] || exit 1
    ssh "${HPC_USER}@${HPC_HOST}" "scancel ${job_id}"
    echo "Cancelled job ${job_id}"
}

### Jupyter ####################################################################

# Command: hpc jupyter [partition] [time]
cmd_jupyter() {
    local partition=${1:-"icelake"}
    local time=${2:-"04:00:00"}
    local script_name="jupyter_notebook"
    local output_file="${HPC_SCRIPTS_DIR}/${script_name}"

    echo "Creating and submitting Jupyter notebook job..."

    # Create the Jupyter launch script
    cat > "$output_file" <<EOF
#!/bin/bash
#SBATCH -J jupyter
#SBATCH -A <account>
#SBATCH -p ${partition}
#SBATCH --mem=16G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=${time}
#SBATCH --mail-type=NONE

#! Load modules
. /etc/profile.d/modules.sh
module purge
module load rhel8/default-icl

# Set up Jupyter runtime directory
export JUPYTER_RUNTIME_DIR=./jupyter_runtime
mkdir -p \$JUPYTER_RUNTIME_DIR

# Get port and IP for tunneling
XDG_RUNTIME_DIR=""
ipnport=\$(shuf -i8000-9999 -n1)
ipnip=\$(hostname -i)

# Print tunneling instructions
echo "JUPYTER_NOTEBOOK_INFO"
echo "PORT=\${ipnport}"
echo "IP=\${ipnip}"
echo "NODE=\$(hostname)"
echo "END_JUPYTER_NOTEBOOK_INFO"

# Launch Jupyter Lab
jupyter lab --no-browser --port=\$ipnport --ip=\$ipnip
EOF

    chmod +x "$output_file"

    # cmd_submit prints progress plus an "Alias: jNNN" line; show its output
    # and pull the alias out of it (capturing cmd_submit directly would
    # otherwise swallow all of its output)
    local submit_output
    submit_output=$(cmd_submit "$script_name")
    echo "$submit_output"
    local alias=$(echo "$submit_output" | awk '/Alias:/ {print $2}')

    # Wait for the notebook to start and print connection instructions
    wait_for_jupyter "$alias"
    echo "Job submitted as ${alias}"
}

# Helper function to wait for Jupyter to start and show connection info
wait_for_jupyter() {
    local alias=$1
    local job_id=$(resolve_job_id "$alias")
    [[ -n "$job_id" ]] || return 1
    local results_dir="${HPC_RESULTS_DIR}/${alias}"

    echo "Waiting for Jupyter server to start..."
    local connected=false
    while ! $connected; do
        # Fetch the latest output
        cmd_fetch "$alias" >/dev/null 2>&1

        if [[ -f "${results_dir}/stdout" ]]; then
            # Look for our marker lines and extract the connection info
            if grep -q "JUPYTER_NOTEBOOK_INFO" "${results_dir}/stdout"; then
                local port=$(grep "PORT=" "${results_dir}/stdout" | cut -d'=' -f2)
                local ip=$(grep "IP=" "${results_dir}/stdout" | cut -d'=' -f2)
                local node=$(grep "NODE=" "${results_dir}/stdout" | cut -d'=' -f2)
                if [[ -n "$port" && -n "$ip" && -n "$node" ]]; then
                    echo -e "\nJupyter server is running!"
                    echo -e "\nTo connect, run this command in a new terminal:"
                    echo -e "\033[1m  ssh -N -L ${port}:${ip}:${port} ${HPC_USER}@${HPC_HOST}\033[0m"
                    echo -e "\nThen open this URL in your browser:"
                    echo -e "\033[1m  http://localhost:${port}\033[0m"
                    echo -e "\nUse \033[1mhpc cancel ${alias}\033[0m to stop the Jupyter server"
                    connected=true
                fi
            fi
        fi

        # Check that the job is still running
        local status=$(ssh "${HPC_USER}@${HPC_HOST}" "squeue -j ${job_id} -h -o %T" 2>/dev/null)
        if [[ -z "$status" ]]; then
            echo "Error: Job terminated unexpectedly" >&2
            return 1
        fi
        sleep 2
    done
}

# Ensure local storage exists (idempotent; runs on every invocation)
init_local_storage

# Main command router
case "$1" in
    new)
        shift
        cmd_new "$@"
        ;;
    submit)
        shift
        cmd_submit "$@"
        ;;
    fetch)
        shift
        cmd_fetch "$@"
        ;;
    log)
        shift
        cmd_log "$@"
        ;;
    status)
        shift
        cmd_status "$@"
        ;;
    cancel)
        shift
        cmd_cancel "$@"
        ;;
    list|ls)
        cmd_list
        ;;
    jupyter)
        shift
        cmd_jupyter "$@"
        ;;
    *)
        echo "Usage: hpc <command> [options]"
        echo "Commands:"
        echo "  new <job_name> [time] [partition] [account]  Create a new job script"
        echo "  submit <script_name>                         Submit a job script to the HPC"
        echo "  fetch <id> [--filetypes t1,t2,...]           Fetch job outputs and specified file types"
        echo "  log <id>                                     Display stdout and stderr from a job"
        echo "  status <id>                                  Check job status"
        echo "  cancel <id>                                  Cancel a running job"
        echo "  list (ls)                                    List tracked jobs"
        echo "  jupyter [partition] [time]                   Launch a Jupyter notebook server (default: icelake, 4h)"
        ;;
esac