Last active
April 3, 2020 08:41
-
-
Save giovtorres/a26bcd754bf0eaa4b4e10b8e48bdfa22 to your computer and use it in GitHub Desktop.
Graph Slurm's sdiag with PySlurm and Graphite
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# vim: set ts=4 sw=4 et
"""
slurm_sched_stats.py

Poll the Slurm scheduler statistics through PySlurm and forward them to
Graphite's carbon-cache using the pickle protocol.
"""
import pickle | |
import socket | |
import struct | |
import sys | |
import time | |
import pyslurm | |
__author__ = "Giovanni Torres"

# Host running the carbon-cache daemon that receives the metrics.
CARBON_SERVER = "127.0.0.1"
# TCP port of carbon's pickle receiver.
CARBON_PICKLE_PORT = 2004
# Seconds to sleep between two consecutive samples.
DELAY = 30
# (metric name, sdiag key) pairs whose value is copied as-is.
_RAW_FIELDS = (
    # Slurmctld stats
    ("server_thread_count", "server_thread_count"),
    ("agent_queue_size", "agent_queue_size"),
    # Jobs stats
    ("jobs_submitted", "jobs_submitted"),
    ("jobs_started", "jobs_started"),
    ("jobs_completed", "jobs_completed"),
    ("jobs_canceled", "jobs_canceled"),
    ("jobs_failed", "jobs_failed"),
    # Main scheduler stats
    ("main_last_cycle", "schedule_cycle_last"),
    ("main_max_cycle", "schedule_cycle_max"),
    ("main_total_cycles", "schedule_cycle_counter"),
    ("main_last_queue_length", "schedule_queue_len"),
    # Backfilling stats
    ("bf_total_jobs_since_slurm_start", "bf_backfilled_jobs"),
    ("bf_total_jobs_since_cycle_start", "bf_last_backfilled_jobs"),
    ("bf_total_cycles", "bf_cycle_counter"),
    ("bf_last_cycle", "bf_cycle_last"),
    ("bf_max_cycle", "bf_cycle_max"),
    ("bf_queue_length", "bf_queue_len"),
    ("bf_last_depth_cycle", "bf_last_depth"),
    ("bf_last_depth_cycle_try", "bf_last_depth_try"),
)

# (metric name, sdiag key of the running total, sdiag key of the counter)
# triples; each metric is total / counter.
_MEAN_FIELDS = (
    ("main_mean_cycle", "schedule_cycle_sum", "schedule_cycle_counter"),
    ("main_mean_depth_cycle", "schedule_cycle_depth", "schedule_cycle_counter"),
    ("bf_mean_cycle", "bf_cycle_sum", "bf_cycle_counter"),
    ("bf_depth_mean", "bf_depth_sum", "bf_cycle_counter"),
    ("bf_depth_mean_try", "bf_depth_try_sum", "bf_cycle_counter"),
    ("bf_queue_length_mean", "bf_queue_len_sum", "bf_cycle_counter"),
)


def _mean(total, count):
    """Return total / count, or None when a value is missing or count <= 0."""
    if total is None or count is None or count <= 0:
        return None
    return total / count


def get_sched_stats():
    """Collect the Slurm scheduler statistics as a flat dict of metrics.

    Queries ``pyslurm.statistics().get()``, copies the raw counters under
    their metric names and adds the derived values: per-cycle means (only
    when the matching cycle counter is positive) and main scheduler cycles
    per minute (only when more than 60 seconds separate ``req_time_start``
    from ``req_time``).

    Returns:
        A dict mapping metric name to value, or ``None`` when the statistics
        query raises. A raw field absent from the sdiag dict is stored as
        ``None``; a derived metric whose inputs are absent is left out.
    """
    try:
        sdiag = pyslurm.statistics().get()
    except Exception:
        # Best effort: the caller skips this sample and tries again later.
        # Catching Exception rather than everything lets KeyboardInterrupt
        # and SystemExit propagate.
        return None

    stats = {}
    for metric, key in _RAW_FIELDS:
        stats[metric] = sdiag.get(key)

    for metric, total_key, count_key in _MEAN_FIELDS:
        mean = _mean(sdiag.get(total_key), sdiag.get(count_key))
        if mean is not None:
            stats[metric] = mean

    cycles = sdiag.get("schedule_cycle_counter")
    req_time = sdiag.get("req_time")
    req_time_start = sdiag.get("req_time_start")
    if cycles is not None and req_time is not None and req_time_start is not None:
        elapsed = req_time - req_time_start
        # Requiring more than a minute keeps the divisor below from being 0
        # even where "/" on integers truncates.
        if elapsed > 60:
            stats["main_cycles_per_minute"] = cycles / (elapsed / 60)

    return stats
def run(sock, delay):
    """Sample the scheduler statistics forever and ship them over *sock*.

    Each iteration takes one timestamped sample from ``get_sched_stats()``,
    pickles it as a list of ``(path, (timestamp, value))`` tuples and writes
    it to *sock* preceded by a 4-byte network-order length header, then
    sleeps. Nothing is sent when no statistics are available.

    Args:
        sock: connected socket; only ``sendall`` is called on it.
        delay: seconds to sleep between iterations (given to ``time.sleep``).

    The loop has no exit condition: the function returns only by letting an
    exception propagate. ``socket.error`` raised while sending is reported
    on stderr and does not stop the loop.
    """
    prefix = "cluster.slurm_sched_stats.gauge-"
    while True:
        now = int(time.time())
        stats = get_sched_stats()

        if stats is not None:
            # A None value carries no measurement, so it is not shipped.
            datapoints = [
                (prefix + name, (now, value))
                for name, value in stats.items()
                if value is not None
            ]
            if datapoints:
                payload = pickle.dumps(datapoints, 1)
                header = struct.pack('!L', len(payload))
                try:
                    sock.sendall(header + payload)
                except socket.error as err:
                    # Keep sampling, but make the failure visible instead of
                    # silently dropping every sample.
                    sys.stderr.write(
                        "Failed to send stats to carbon: %s\n" % err
                    )

        time.sleep(delay)
def main():
    """Connect to carbon-cache and run the collection loop until interrupted.

    Raises:
        SystemExit: with an error message when the connection to
            ``CARBON_SERVER`` on ``CARBON_PICKLE_PORT`` fails, and with
            status 0 when the loop is interrupted with CTRL-C.

    The socket is closed on every exit path.
    """
    sock = socket.socket()
    try:
        try:
            sock.connect((CARBON_SERVER, CARBON_PICKLE_PORT))
        except socket.error:
            # Adjacent literals instead of a backslash continuation, so
            # re-indenting the code cannot inject spaces into the message.
            raise SystemExit(
                "Couldn't connect to %s on port %d. Is carbon-cache "
                "running" % (CARBON_SERVER, CARBON_PICKLE_PORT)
            )

        try:
            run(sock, DELAY)
        except KeyboardInterrupt:
            sys.stderr.write("\nExiting on CTRL-c\n")
            sys.exit(0)
    finally:
        sock.close()
# Start the collector only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment