Skip to content

Instantly share code, notes, and snippets.

@JustinShenk
Last active January 15, 2025 11:37
Show Gist options
  • Save JustinShenk/312b5e0ab7acc3b116f7bf3b6d888fa4 to your computer and use it in GitHub Desktop.
Save JustinShenk/312b5e0ab7acc3b116f7bf3b6d888fa4 to your computer and use it in GitHub Desktop.
Google Cloud Platform (GCP) instance idle shutdown
#!/bin/bash
# Powers off the instance once the 1-minute load average has stayed below
# `threshold` for more than `wait_minutes` consecutive one-minute checks.
#
# Add to instance metadata with `gcloud compute instances add-metadata \
# instance-name --metadata-from-file startup-script=idle-shutdown.sh` and reboot
# Modified from https://stackoverflow.com/questions/30556920/how-can-i-automatically-kill-idle-gce-instances-based-on-cpu-usage
threshold=0.1   # a 1-minute load average below this counts as idle
count=0         # consecutive idle checks seen so far
wait_minutes=60 # number of idle checks that must be exceeded before shutdown
while true; do
  # 1-minute average load. LC_ALL=C keeps "." as the decimal separator so that
  # stripping the trailing comma below cannot corrupt the number.
  load=$(LC_ALL=C uptime | sed -e 's/.*load average: //g' | awk '{ print $1 }')
  load="${load//,}" # remove trailing comma
  # Only a value that parses as a number can count as idle; the comparison is
  # done in awk because bash arithmetic has no floating point.
  if [[ "$load" =~ ^[0-9]+([.][0-9]+)?$ ]] \
    && awk -v load="$load" -v threshold="$threshold" \
      'BEGIN { exit !(load + 0 < threshold + 0) }'; then
    echo "Idling.."
    ((count+=1))
  else
    # A busy (or unreadable) sample restarts the idle window, so only an
    # uninterrupted idle period can trigger the shutdown.
    count=0
  fi
  echo "Idle minutes count = $count"
  if (( count > wait_minutes )); then
    echo Shutting down
    # wait a little bit more before actually pulling the plug
    sleep 300
    sudo poweroff
  fi
  sleep 60
done
@wise-hyunsoo
Copy link

wise-hyunsoo commented Aug 8, 2024

Thanks @OptogeneticsandNeuralEngineeringCore!

If you use a custom idle-shutdown script, Vertex AI Workbench will not terminate instances normally. To resolve this, you can modify the default idle-shutdown script instead. As you know, Vertex AI Workbench instances have the Idle Shutdown feature built in by default.

However, if you use an editor such as VSCode to execute code over a remote SSH connection, the default idle shutdown may trigger while the code is still running and terminate the instance. To prevent this, I modified the script as shown below, based on @OptogeneticsandNeuralEngineeringCore's script.

The modification checks the CPU and GPU utilization; if both are below their configured thresholds, the instance is considered idle. If you modify this script and paste it into /opt/deeplearning/bin/check_idle_shutdown.sh, the check_idle_shutdown.sh process will take the CPU and GPU utilization into account and shut down the instance when the criteria are met. (Note that you will need sudo privileges to modify the script.)

#!/bin/bash -eu
#
# Copyright 2022 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Scripts that run idle shutdown logic when the right metadata is set.

# Load the helper libraries unless running under unit tests (TEST_SRCDIR set).
# The script aborts if either file cannot be sourced.

if [[ -z ${TEST_SRCDIR:-""} ]]; then
  # shellcheck disable=SC1091
  source /opt/c2d/c2d-utils || exit 1
  source /etc/profile.d/env.sh || exit 1
fi

# User modification settings
## Flags
check_gpu=true        # Flag to enable/disable GPU utilization check
check_cpu=true        # Flag to enable/disable CPU utilization check
## Settings
threshold_cpu=10     # CPU threshold (logged as %); the CPU check counts as idle while the measured load is below it
threshold_gpu=5      # GPU utilization percentage threshold; the GPU check counts as idle while the measured value is below it
# wait_minutes=10       # Not used by this script: the idle timeout is read from instance metadata in shutdown_if_idle_timeout.
# Timestamp captured once at script start; used only in the CPU/GPU log lines.
current_time=$(date +"%Y-%m-%d %H:%M:%S")
# Initialization of variables. Both flags are recomputed by the CPU/GPU checks:
# true means "considered idle", false means "busy".
cpu_resolution_flag=false
gpu_resolution_flag=false

# Obtain Notebook type: DLVM, USER_MANAGED_NOTEBOOK, GOOGLE_MANAGED_NOTEBOOK
# NOTE(review): notebook_type is not defined in this file; presumably it comes
# from one of the files sourced above - confirm.
notebook=$(notebook_type)

# System performs idle checks every minute. (Defined by cron schedule * * * * *)
# NOTE(review): this comment originally said the sessions endpoint uses port
# 8081 (local and remote sessions from Mixer), but the URLs below use port
# 8080 - confirm which one is intended.

# NOTE(review): the original comment said to bind to `localhost` since Mixer
# starts on [::1], yet the URLs below use the IPv4 loopback 127.0.0.1 - confirm.
URL_SESSIONS="http://127.0.0.1:8080/api/sessions"
URL_TERMINALS="http://127.0.0.1:8080/api/terminals"
# Prefix for the guest attribute keys used below (origin_time, last_activity).
NAMESPACE=$(guest_attributes_namespace)

#######################################
# Prints the timestamp that idle calculations should start from: the later of
# the given activity time and the instance origin time.
# Using the max prevents idle shutdown from triggering due to ghost sessions
# unrelated to the instance that might have existed before the instance was
# created. See b/241870602.
# Arguments:
#   $1 - latest activity timestamp (integer seconds); may be empty
#   $2 - instance origin timestamp (integer seconds); may be empty
# Outputs:
#   an empty line when $2 is empty; $2 when $1 is empty or $2 > $1;
#   otherwise $1
# Returns:
#   0
#######################################
function calculate_idle_origin_time() {
  local time_given="${1:-}"
  local time_instance="${2:-}"
  # `return` (not `exit`) so that a direct call, outside of a command
  # substitution, does not terminate the calling script.
  if [[ -z "${time_instance}" ]]; then
    echo ""
    return 0
  fi
  if [[ -z "${time_given}" ]]; then
    echo "${time_instance}"
    return 0
  fi
  if [[ "${time_instance}" -gt "${time_given}" ]]; then
    echo "${time_instance}"
    return 0
  fi
  echo "${time_given}"
}

function set_origin_time_attribute() {
  # Records an immutable origin timestamp (similar to the instance creation
  # time) in the guest attributes, but only if none has been stored yet.
  local existing_origin fresh_origin
  existing_origin=$(get_origin_time_attribute)
  if [[ -n "${existing_origin}" ]]; then
    # Already recorded: leave it untouched.
    return 0
  fi
  fresh_origin=$(get_current_time_in_sec)
  echo "Setting an origin timestamp ${fresh_origin} as a minimum base for idle calculations."
  set_guest_attributes "${NAMESPACE}/origin_time" "${fresh_origin}"
}

function get_origin_time_attribute() {
  # Prints the value stored under the "${NAMESPACE}/origin_time" guest
  # attribute (an empty line when nothing is returned).
  local stored_origin
  stored_origin=$(get_guest_attributes "${NAMESPACE}/origin_time")
  echo "${stored_origin}"
}

#######################################
# Stores the last-activity timestamp in the guest attributes, never earlier
# than the recorded origin time, and logs why it was set.
# Arguments:
#   $1 - requested last-activity timestamp (integer seconds)
#   $2 - human-readable reason used in the log line
# Outputs:
#   one log line with the stored timestamp, the current time and the reason
#######################################
function set_last_activity_attribute() {
  local now requested_timestamp reason origin_timestamp timestamp_to_set
  now=$(get_current_time_in_sec)
  requested_timestamp="${1:-}"
  reason="${2:-}"
  origin_timestamp=$(get_origin_time_attribute)
  timestamp_to_set=$(calculate_idle_origin_time "${requested_timestamp}" "${origin_timestamp}")
  # calculate_idle_origin_time returns the max of both values, so comparing its
  # result against the origin is always true. The reason is only amended when
  # the requested value was actually replaced by the origin time.
  if [[ -n "${origin_timestamp}" && "${timestamp_to_set}" != "${requested_timestamp}" ]]; then
    reason="Reason - ${reason} - replaced by later instance creation date"
  fi
  echo "Setting last notebook activity timestamp ${timestamp_to_set} at ${now} for the following reason: ${reason}"
  set_guest_attributes "${NAMESPACE}/last_activity" "${timestamp_to_set}"
}

function get_last_activity_attribute() {
  # Prints the value stored under the "${NAMESPACE}/last_activity" guest
  # attribute (an empty line when nothing is returned).
  local stored_activity
  stored_activity=$(get_guest_attributes "${NAMESPACE}/last_activity")
  echo "${stored_activity}"
}

function get_terminals() {
  # Fetches URL_TERMINALS quietly and writes the response body to stdout.
  curl --silent "${URL_TERMINALS}"
}

function get_sessions() {
  # Fetches URL_SESSIONS quietly and writes the response body to stdout.
  local sessions_url="${URL_SESSIONS}"
  curl --silent "${sessions_url}"
}

#######################################
# Prints the 1-minute load average multiplied by 100 and rounded to an integer
# (load 0.52 -> 52), i.e. CPU demand as a percentage of one fully busy core.
# Prints nothing when uptime does not report a load average.
#######################################
function _idle_cpu_load_percent() {
  # LC_ALL=C keeps "." as the decimal separator regardless of the host locale.
  # awk converts the leading numeric part of "0.52," so the trailing comma is
  # harmless.
  LC_ALL=C uptime 2>/dev/null \
    | sed -n -e 's/.*load average: //p' \
    | awk '{ printf("%.0f", $1 * 100) }'
}

#######################################
# Prints the sum of utilization.gpu over all GPUs reported by nvidia-smi.
# Prints nothing when the query fails or yields no numeric value.
#######################################
function _idle_gpu_load_percent() {
  local raw
  raw=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null) || return 0
  awk '$1 ~ /^[0-9]+$/ { total += $1; seen = 1 }
       END { if (seen) printf("%d", total) }' <<< "${raw}"
}

#######################################
# Measures CPU and GPU load and records the verdicts in the global flags
# cpu_resolution_flag and gpu_resolution_flag (true = considered idle).
# Globals:
#   check_cpu, check_gpu, threshold_cpu, threshold_gpu, current_time (read)
#   cpu_resolution_flag, gpu_resolution_flag (written)
# Outputs:
#   log lines describing each measurement and verdict
#######################################
function _refresh_idle_flags() {
  local cpu_utilization gpu_utilization

  if $check_cpu; then
    cpu_utilization=$(_idle_cpu_load_percent)
    echo "CPU flag set to true."
    echo "  At time: $current_time, cpu load: $cpu_utilization %"
    echo "  CPU threshold set to: $threshold_cpu %"
    if [[ ! "${cpu_utilization}" =~ ^[0-9]+$ ]]; then
      # Unknown load: stay on the safe side and do not treat it as idle.
      cpu_resolution_flag=false
      echo "  CPU load could not be determined. Not idling"
    elif (( cpu_utilization < threshold_cpu )); then
      cpu_resolution_flag=true
      echo "  CPU found to be below threshold. Considered idling"
    else
      cpu_resolution_flag=false
      echo "  CPU found to be above threshold. Not idling"
    fi
  else # Skip CPU check if the flag is disabled
    cpu_resolution_flag=true
    echo "CPU flag set to false, will not check. Considered idling"
  fi

  if $check_gpu; then
    if ! command -v nvidia-smi > /dev/null 2>&1; then
      # No NVIDIA tooling on this instance: there is no GPU load to wait for,
      # and aborting here would disable idle shutdown altogether.
      gpu_resolution_flag=true
      echo "GPU flag set to true, but nvidia-smi was not found. Considered idling"
    else
      gpu_utilization=$(_idle_gpu_load_percent)
      echo "GPU flag set to true."
      echo "  At time: $current_time, GPU load: $gpu_utilization %"
      echo "  GPU threshold set to: $threshold_gpu %"
      if [[ ! "${gpu_utilization}" =~ ^[0-9]+$ ]]; then
        # Unknown load: stay on the safe side and do not treat it as idle.
        gpu_resolution_flag=false
        echo "  GPU load could not be determined. Not idling"
      elif (( gpu_utilization < threshold_gpu )); then
        gpu_resolution_flag=true
        echo "  GPU found to be below threshold. Considered idleing"
      else
        gpu_resolution_flag=false
        echo "  GPU found to be above threshold. Not idling"
      fi
    fi
  else # Skip GPU check if the flag is disabled
    gpu_resolution_flag=true
    echo "GPU flag set to false, will not check"
  fi
}

#######################################
# Determines the most recent notebook activity and stores it through
# set_last_activity_attribute.
# Globals:
#   cpu_resolution_flag, gpu_resolution_flag (written via _refresh_idle_flags)
# Returns:
#   0 once a timestamp has been handed to set_last_activity_attribute
#######################################
function update_last_activity() {
  # Sessions `.kernel.last_activity` gets updated only when there's cell outputs
  # e.g. `time.sleep(60)` only updates session at start and after 60s
  # e.g. a for loop with print every 10s updates sessions at start and every 10s
  # Sessions with a cell running have `.kernel.state` == busy. Otherwise "idle".
  local sessions latest_activity latest_activity_previous
  local terminals latest_activity_terminals status
  sessions=$(get_sessions)

  _refresh_idle_flags

  # When there's a running cell, uses "now" which addresses the potential no
  # output use case (ex: time.sleep). CPU or GPU load above the threshold is
  # handled the same way, so work started outside Jupyter (e.g. over SSH)
  # also counts as activity.
  latest_activity=$(jq -r --raw-output '[ .[] | select(.kernel.execution_state == "busy").kernel.last_activity | sub(".[0-9]+Z$"; "Z") | fromdate ] | max | values' <<< "${sessions}")
  if [[ -n "${latest_activity}" || "${gpu_resolution_flag}" == false || "${cpu_resolution_flag}" == false ]]; then
    latest_activity=$(get_current_time_in_sec)
    set_last_activity_attribute "${latest_activity}" "running cell"
    return 0
  fi

  latest_activity_previous=$(get_last_activity_attribute)
  latest_activity=$(jq -r --raw-output '[ .[] | .kernel.last_activity | sub(".[0-9]+Z$"; "Z") | fromdate ] | max | values' <<< "${sessions}")

  # When there is no sessions and we never set an activity attribute, uses "now"
  # which is more or less the creation date.
  if [[ -z "${latest_activity}${latest_activity_previous}" ]]; then
    latest_activity=$(get_current_time_in_sec)
    set_last_activity_attribute "${latest_activity}" "no known previous activity"
    return 0
  fi

  # When there is no sessions during the following cron calls, uses the previous
  # recorded time.
  if [[ -z "${latest_activity}" ]]; then
    latest_activity="${latest_activity_previous}"
  fi

  # Checks if there is any active terminals. Terminal API doesn't provide a idle
  # or busy like kernels so we can not check cases like `sleep 5`. But we still
  # want to minimize shutdown case when terminals might run.
  # Skips when terminals are not enabled in JupyterLab (curl returns a 404 error)
  terminals=$(get_terminals)
  # Capture the jq status with `||` so that, under `set -e`, an unparsable
  # terminals response is skipped instead of aborting the whole script.
  status=0
  latest_activity_terminals=$(jq -r --raw-output '[ .[] | .last_activity | sub(".[0-9]+Z$"; "Z") | fromdate ] | max | values' <<< "${terminals}") || status=$?
  # Timestamps are epoch seconds, so they are compared numerically.
  if [[ ${status} -eq 0 && -n "${latest_activity_terminals}" \
    && "${latest_activity_terminals}" -gt "${latest_activity}" ]]; then
    latest_activity="${latest_activity_terminals}"
    set_last_activity_attribute "${latest_activity}" "terminals active after sessions"
    return 0
  fi

  # Uses latest activity of all sessions when there's at least one idle session.
  set_last_activity_attribute "${latest_activity}" "latest session activity"
  return 0
}

#######################################
# Reports an IDLE event when the idle timeout has elapsed since the last
# recorded activity and both the CPU and GPU checks consider the instance idle.
# Globals:
#   NAMESPACE (read)
#   cpu_resolution_flag, gpu_resolution_flag (written via _refresh_idle_flags)
# Outputs:
#   measurement log lines, plus shutdown log lines when the event is reported
#######################################
function shutdown_if_idle_timeout() {
  local now idle_timeout_seconds last_activity expected_idle_shutdown
  now=$(date +'%s')
  # Condition for WBI idle's metadata and old Runtime idle's metadata: use the
  # first non-empty value and fall back to 0 when neither is set.
  idle_timeout_seconds=$(get_attribute_value idle-timeout-seconds || true)
  if [[ -z "${idle_timeout_seconds}" ]]; then
    idle_timeout_seconds=$(get_attribute_value idle-shutdown-timeout || true)
  fi
  idle_timeout_seconds="${idle_timeout_seconds:-0}"
  last_activity=$(get_last_activity_attribute)

  _refresh_idle_flags

  # Without a recorded activity timestamp the arithmetic below would treat it
  # as 0 and shut down immediately, so skip the check instead.
  if [[ ! "${last_activity}" =~ ^[0-9]+$ ]]; then
    echo "No usable last_activity timestamp ('${last_activity}'); skipping idle shutdown check"
    return 0
  fi

  expected_idle_shutdown=$((last_activity + idle_timeout_seconds))
  if [[ ${now} -gt ${expected_idle_shutdown} ]] && $gpu_resolution_flag && $cpu_resolution_flag; then
    echo "Shutting down with last_activity ${last_activity} at ${now}"
    echo "Reporting IDLE event"
    # Ensures that at the next instance start, origin_time won't be the creation
    # time but the start time.
    delete_guest_attributes "${NAMESPACE}/origin_time"
    report_event "IDLE" "{\"last_activity\": \"${last_activity}\", \"expected_idle_shutdown\": \"${expected_idle_shutdown}\"}"
  fi
}

# Entry point. The idle check only runs when all of the following hold:
#   - not running under unit tests (TEST_SRCDIR is unset or empty);
#   - an idle timeout is configured, via either WBI idle's metadata
#     (idle-timeout-seconds) or old Runtime idle's metadata
#     (idle-shutdown-timeout);
#   - the notebook type is not DLVM. Although
#     google-wi-promote-premium.service removes code related to idle shutdown,
#     this condition is kept to skip the run for DLVM.
if [[ -z "${TEST_SRCDIR:-}" ]] \
  && [[ -n "$(get_attribute_value idle-timeout-seconds)" || -n "$(get_attribute_value idle-shutdown-timeout)" ]] \
  && [[ "${notebook}" != "DLVM" ]]; then
  set_origin_time_attribute
  update_last_activity
  shutdown_if_idle_timeout
fi

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment