Skip to content

Instantly share code, notes, and snippets.

@jperkin
Created March 20, 2025 08:43

Revisions

  1. jperkin created this gist Mar 20, 2025.
    138 changes: 138 additions & 0 deletions kill-hung-procs.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,138 @@
    #!/bin/sh
    #
    # Look for any known-to-hang processes that have been running for longer than
    # 2 hours.  These are not caught by the ulimit -t set by pbulk as they are not
    # using any CPU time.
    #
    # Run from cron and redirect output to a log
    #

    # Dry-run mode: when the first argument is "-n", only report what would be
    # done instead of signalling any process.
    case "$1" in
    -n)
        dry_run=true
        ;;
    *)
        dry_run=false
        ;;
    esac

    # Platform-specific settings:
    #
    #   ps_args_pbulk    ps arguments used to list pbulk's processes
    #                    (elapsed time, PID, command line).
    #   ps_args_log      ps arguments used when logging a single process.
    #   process_restart  true to stop/continue a process instead of killing it.
    #
    # Start with the generic values and override them on NetBSD.
    ps_args_pbulk="-o etime= -o pid= -o args= -U pbulk"
    ps_args_log="-fo user,pid,etime,args"
    process_restart=false

    if [ "$(uname -s)" = "NetBSD" ]; then
        ps_args_pbulk="${ps_args_pbulk} -x"
        ps_args_log="-ww -o user,pid,lstart,etime,args"
        process_restart=true
    fi

    #
    # Act on a hung process: kill it outright or, when process_restart is
    # true, stop it and then let it continue.
    #
    # Arguments: $1 - PID of the process (decimal digits only)
    # Globals:   dry_run (read), process_restart (read), pid (written)
    # Outputs:   in dry-run mode, a line describing the action on stdout
    # Returns:   non-zero if the PID is missing or malformed, otherwise the
    #            status of the last command run
    #
    kill_or_restart()
    {
        # No "shift" here: with no arguments it is an error, and "shift" is a
        # special builtin, so a strict /bin/sh may abort the whole script.
        pid=${1:-}

        # Refuse anything that is not a plain number so that a missing or
        # malformed PID (for example "-1") can never reach kill(1).
        case "${pid}" in
        ''|*[!0-9]*)
            echo "kill_or_restart: invalid PID '${pid}'" >&2
            return 1
            ;;
        esac

        if ${dry_run}; then
            if ${process_restart}; then
                echo "Would stop/start PID ${pid}"
            else
                echo "Would kill PID ${pid}"
            fi
            return
        fi

        #
        # On some OS it's enough to stop and restart processes to get them
        # running again (notably NetBSD with its broken libpthread).
        #
        if ${process_restart}; then
            kill -STOP "${pid}"
            sleep 1
            kill -CONT "${pid}"
        else
            kill -9 "${pid}"
        fi
    }

    #
    # Log a timestamp and the details of a process, then pass it to
    # kill_or_restart.
    #
    # Arguments: $1 - PID of the process
    # Globals:   ps_args_log (read), pid (written)
    # Outputs:   the date, the lsof cwd listing (Darwin only) and the ps
    #            listing for the process, all on stdout
    # Returns:   non-zero if no PID was given, otherwise the status of
    #            kill_or_restart
    #
    log_and_kill()
    {
        # No "shift" here: with no arguments it is an error, and "shift" is a
        # special builtin, so a strict /bin/sh may abort the whole script.
        pid=${1:-}

        if [ -z "${pid}" ]; then
            echo "log_and_kill: missing PID" >&2
            return 1
        fi

        # Output date and running command for the log.
        date '+%Y-%m-%d-%H:%M:%S'
        case "$(uname -s)" in
        Darwin)
            /usr/sbin/lsof -d cwd -a -p "${pid}"
            ;;
        esac
        # ps_args_log is deliberately unquoted: it holds several arguments.
        ps ${ps_args_log} -p "${pid}"
        kill_or_restart "${pid}"
    }

    #
    # fseventsd on macOS often ends up spinning during bulk builds.  Just kill
    # it once it's hit a certain amount of user time.
    #
    # Globals:   fspid (written)
    # Returns:   0 when fseventsd is not found or is below the threshold,
    #            otherwise the status of log_and_kill
    #
    restart_system_processes_macos()
    {
        # "launchctl list" prints PID, status and label columns.  Match the
        # label column only, insist on a numeric PID (the column is not a
        # number when the job is not running) and take the first match so
        # that fspid is always either empty or a single PID.
        fspid=$(launchctl list |
            awk '$3 ~ /com\.apple\.fseventsd/ && $1 ~ /^[0-9]+$/ {print $1; exit}')

        [ -n "${fspid}" ] || return 0

        # Matches when at least two digits precede a colon in the utime
        # field.  NOTE(review): assumes utime is printed as
        # minutes:seconds, making this "ten minutes or more" -- confirm.
        case "$(ps -o utime= -p "${fspid}" 2>/dev/null)" in
        *[0-9][0-9]:*)
            log_and_kill "${fspid}"
            ;;
        esac
    }

    #
    # Decide whether a process should be handed to log_and_kill.
    #
    # Arguments: $1 - elapsed time as printed by "ps -o etime="
    #            $2 - command line as printed by "ps -o args="
    # Returns:   0 if the process should be killed, 1 otherwise
    #
    is_hung_candidate()
    {
        case "$1" in
        *-*:*:*)
            #
            # If anything has been running for over a day just kill it, it's
            # highly unlikely to be making forward progress.  Except for
            # known false positives.
            #
            case "$2" in
            /usr/libexec/lsd*|/usr/sbin/distnoted*)
                return 1
                ;;
            esac
            return 0
            ;;
        0[2-9]:*:*|\
        [2-9]:*:*|\
        [1-9][0-9]:*:*)
            #
            # The ETIME field is 2 hours or longer.  Account for OS
            # differences, most have leading 0, NetBSD does not.
            #
            # Only match either known fail processes or anything running
            # from within the work directory, skipping known false
            # positives such as Rust.
            #
            case "$2" in
            *lang/rust*|*ghc9[46]*)
                # Do nothing, likely legitimate long-running process.
                return 1
                ;;
            /Users/pbulk/*|\
            /home/pbulk/*|\
            *Configure*|\
            *cmake_autogen*|\
            mplayer|\
            *py-scipy*|\
            *" ./configure "*|\
            ./*|../*)
                return 0
                ;;
            esac
            ;;
        esac
        return 1
    }

    #
    # Scan pbulk's processes and deal with every hung candidate.
    #
    # ps_args_pbulk is deliberately unquoted: it holds several arguments.
    # read -r keeps any backslashes in the command line intact.
    #
    ps ${ps_args_pbulk} | while read -r time pid cmd; do
        if is_hung_candidate "${time}" "${cmd}"; then
            log_and_kill "${pid}"
        fi
    done

    #
    # Perform any OS-specific system cleanup; only Darwin has any.
    #
    if [ "$(uname -s)" = "Darwin" ]; then
        restart_system_processes_macos
    fi