Created
August 13, 2020 12:55
-
-
Save corenel/6312c37040eec418c09f41909af33fae to your computer and use it in GitHub Desktop.
GPU-Nodes-Metrics-Nvidia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"annotations": { | |
"list": [ | |
{ | |
"builtIn": 1, | |
"datasource": "-- Grafana --", | |
"enable": true, | |
"hide": true, | |
"iconColor": "rgba(0, 211, 255, 1)", | |
"limit": 100, | |
"name": "Annotations & Alerts", | |
"showIn": 0, | |
"type": "dashboard" | |
} | |
] | |
}, | |
"description": "使用NVIDIA Data Center GPU Manager (DCGM) dcgm-exporter 通过Prometheus绘制的GPU Nvidia 基础监控信息.", | |
"editable": true, | |
"gnetId": 12639, | |
"graphTooltip": 0, | |
"id": 1, | |
"iteration": 1597321805205, | |
"links": [], | |
"panels": [ | |
{ | |
"cacheTimeout": null, | |
"colorBackground": false, | |
"colorValue": false, | |
"colors": [ | |
"#299c46", | |
"rgba(237, 129, 40, 0.89)", | |
"#d44a3a" | |
], | |
"datasource": "Prometheus", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {} | |
}, | |
"overrides": [] | |
}, | |
"format": "hertz", | |
"gauge": { | |
"maxValue": 100, | |
"minValue": 0, | |
"show": false, | |
"thresholdLabels": false, | |
"thresholdMarkers": true | |
}, | |
"gridPos": { | |
"h": 2, | |
"w": 12, | |
"x": 0, | |
"y": 0 | |
}, | |
"id": 44, | |
"interval": null, | |
"links": [], | |
"mappingType": 1, | |
"mappingTypes": [ | |
{ | |
"name": "value to text", | |
"value": 1 | |
}, | |
{ | |
"name": "range to text", | |
"value": 2 | |
} | |
], | |
"maxDataPoints": 100, | |
"nullPointMode": "connected", | |
"nullText": null, | |
"postfix": "", | |
"postfixFontSize": "50%", | |
"prefix": "", | |
"prefixFontSize": "50%", | |
"rangeMaps": [ | |
{ | |
"from": "null", | |
"text": "N/A", | |
"to": "null" | |
} | |
], | |
"sparkline": { | |
"fillColor": "rgba(31, 118, 189, 0.18)", | |
"full": false, | |
"lineColor": "rgb(31, 120, 193)", | |
"show": false | |
}, | |
"tableColumn": "GPU SM 时钟", | |
"targets": [ | |
{ | |
"expr": "avg(DCGM_FI_DEV_SM_CLOCK{instance=~\"$hostname\"}*1000000)", | |
"format": "time_series", | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "GPU SM 时钟", | |
"refId": "A" | |
} | |
], | |
"thresholds": "", | |
"title": "GPU SM Clocks", | |
"type": "singlestat", | |
"valueFontSize": "80%", | |
"valueMaps": [ | |
{ | |
"op": "=", | |
"text": "N/A", | |
"value": "null" | |
} | |
], | |
"valueName": "current" | |
}, | |
{ | |
"cacheTimeout": null, | |
"colorBackground": false, | |
"colorValue": false, | |
"colors": [ | |
"#299c46", | |
"rgba(237, 129, 40, 0.89)", | |
"#d44a3a" | |
], | |
"datasource": "Prometheus", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {} | |
}, | |
"overrides": [] | |
}, | |
"format": "hertz", | |
"gauge": { | |
"maxValue": 100, | |
"minValue": 0, | |
"show": false, | |
"thresholdLabels": false, | |
"thresholdMarkers": true | |
}, | |
"gridPos": { | |
"h": 2, | |
"w": 12, | |
"x": 12, | |
"y": 0 | |
}, | |
"id": 45, | |
"interval": null, | |
"links": [], | |
"mappingType": 1, | |
"mappingTypes": [ | |
{ | |
"name": "value to text", | |
"value": 1 | |
}, | |
{ | |
"name": "range to text", | |
"value": 2 | |
} | |
], | |
"maxDataPoints": 100, | |
"nullPointMode": "connected", | |
"nullText": null, | |
"postfix": "", | |
"postfixFontSize": "50%", | |
"prefix": "", | |
"prefixFontSize": "50%", | |
"rangeMaps": [ | |
{ | |
"from": "null", | |
"text": "N/A", | |
"to": "null" | |
} | |
], | |
"sparkline": { | |
"fillColor": "rgba(31, 118, 189, 0.18)", | |
"full": false, | |
"lineColor": "rgb(31, 120, 193)", | |
"show": false | |
}, | |
"tableColumn": "GPU 内存时钟", | |
"targets": [ | |
{ | |
"expr": "avg(DCGM_FI_DEV_MEM_CLOCK{instance=~\"$hostname\"}*1000000)", | |
"format": "time_series", | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "GPU 内存时钟", | |
"refId": "A" | |
} | |
], | |
"thresholds": "", | |
"title": "GPU Memory Clocks", | |
"type": "singlestat", | |
"valueFontSize": "80%", | |
"valueMaps": [ | |
{ | |
"op": "=", | |
"text": "N/A", | |
"value": "null" | |
} | |
], | |
"valueName": "current" | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "Prometheus", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {}, | |
"links": [] | |
}, | |
"overrides": [] | |
}, | |
"fill": 0, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 7, | |
"w": 8, | |
"x": 0, | |
"y": 2 | |
}, | |
"hiddenSeries": false, | |
"id": 57, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"links": [], | |
"nullPointMode": "connected", | |
"percentage": false, | |
"pluginVersion": "7.1.3", | |
"pointradius": 5, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$hostname\"}", | |
"format": "time_series", | |
"hide": false, | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "{{instance}}.{{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU 使用率", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "percent", | |
"label": null, | |
"logBase": 1, | |
"max": "100", | |
"min": "0", | |
"show": true | |
}, | |
{ | |
"format": "watt", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "Prometheus", | |
"description": "内存使用", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {}, | |
"links": [], | |
"mappings": [], | |
"thresholds": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "red", | |
"value": 80 | |
} | |
] | |
} | |
}, | |
"overrides": [] | |
}, | |
"fill": 1, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 9, | |
"w": 8, | |
"x": 8, | |
"y": 2 | |
}, | |
"hiddenSeries": false, | |
"id": 60, | |
"legend": { | |
"avg": false, | |
"current": false, | |
"max": false, | |
"min": false, | |
"show": true, | |
"total": false, | |
"values": false | |
}, | |
"lines": true, | |
"linewidth": 1, | |
"nullPointMode": "null", | |
"percentage": false, | |
"pluginVersion": "7.1.3", | |
"pointradius": 2, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}", | |
"interval": "", | |
"legendFormat": "{{instance}}.{{gpu}}", | |
"refId": "B" | |
}, | |
{ | |
"expr": "DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}+DCGM_FI_DEV_FB_FREE{instance=~\"$hostname\"}", | |
"hide": true, | |
"interval": "", | |
"legendFormat": "GPU 总内存 {{instance}}.{{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU 内存用量", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "decmbytes", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
}, | |
{ | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"cacheTimeout": null, | |
"colorBackground": false, | |
"colorValue": false, | |
"colors": [ | |
"#299c46", | |
"rgba(237, 129, 40, 0.89)", | |
"#d44a3a" | |
], | |
"datasource": "Prometheus", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {} | |
}, | |
"overrides": [] | |
}, | |
"format": "celsius", | |
"gauge": { | |
"maxValue": 90, | |
"minValue": 0, | |
"show": true, | |
"thresholdLabels": false, | |
"thresholdMarkers": true | |
}, | |
"gridPos": { | |
"h": 5, | |
"w": 4, | |
"x": 16, | |
"y": 2 | |
}, | |
"id": 31, | |
"interval": null, | |
"links": [], | |
"mappingType": 1, | |
"mappingTypes": [ | |
{ | |
"name": "value to text", | |
"value": 1 | |
}, | |
{ | |
"name": "range to text", | |
"value": 2 | |
} | |
], | |
"maxDataPoints": 100, | |
"nullPointMode": "connected", | |
"nullText": null, | |
"postfix": "", | |
"postfixFontSize": "50%", | |
"prefix": "", | |
"prefixFontSize": "50%", | |
"rangeMaps": [ | |
{ | |
"from": "null", | |
"text": "N/A", | |
"to": "null" | |
} | |
], | |
"sparkline": { | |
"fillColor": "rgba(31, 118, 189, 0.18)", | |
"full": false, | |
"lineColor": "rgb(31, 120, 193)", | |
"show": false | |
}, | |
"tableColumn": "", | |
"targets": [ | |
{ | |
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$hostname\"})", | |
"format": "time_series", | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "", | |
"refId": "A" | |
} | |
], | |
"thresholds": "83,87", | |
"title": "GPU 平均温度", | |
"type": "singlestat", | |
"valueFontSize": "80%", | |
"valueMaps": [ | |
{ | |
"op": "=", | |
"text": "N/A", | |
"value": "null" | |
} | |
], | |
"valueName": "current" | |
}, | |
{ | |
"cacheTimeout": null, | |
"colorBackground": false, | |
"colorValue": false, | |
"colors": [ | |
"#299c46", | |
"rgba(237, 129, 40, 0.89)", | |
"#d44a3a" | |
], | |
"datasource": "Prometheus", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {} | |
}, | |
"overrides": [] | |
}, | |
"format": "watt", | |
"gauge": { | |
"maxValue": 2400, | |
"minValue": 0, | |
"show": true, | |
"thresholdLabels": false, | |
"thresholdMarkers": true | |
}, | |
"gridPos": { | |
"h": 5, | |
"w": 4, | |
"x": 20, | |
"y": 2 | |
}, | |
"id": 30, | |
"interval": null, | |
"links": [], | |
"mappingType": 1, | |
"mappingTypes": [ | |
{ | |
"name": "value to text", | |
"value": 1 | |
}, | |
{ | |
"name": "range to text", | |
"value": 2 | |
} | |
], | |
"maxDataPoints": 100, | |
"nullPointMode": "connected", | |
"nullText": null, | |
"postfix": "", | |
"postfixFontSize": "50%", | |
"prefix": "", | |
"prefixFontSize": "50%", | |
"rangeMaps": [ | |
{ | |
"from": "null", | |
"text": "N/A", | |
"to": "null" | |
} | |
], | |
"sparkline": { | |
"fillColor": "rgba(31, 118, 189, 0.18)", | |
"full": false, | |
"lineColor": "rgb(31, 120, 193)", | |
"show": false | |
}, | |
"tableColumn": "", | |
"targets": [ | |
{ | |
"expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$hostname\"})", | |
"format": "time_series", | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "", | |
"refId": "A" | |
} | |
], | |
"thresholds": "1800,2200", | |
"title": "GPU 总功率", | |
"type": "singlestat", | |
"valueFontSize": "80%", | |
"valueMaps": [ | |
{ | |
"op": "=", | |
"text": "N/A", | |
"value": "null" | |
} | |
], | |
"valueName": "current" | |
}, | |
{ | |
"cacheTimeout": null, | |
"colorBackground": false, | |
"colorValue": false, | |
"colors": [ | |
"#299c46", | |
"rgba(237, 129, 40, 0.89)", | |
"#d44a3a" | |
], | |
"datasource": "Prometheus", | |
"description": "", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {} | |
}, | |
"overrides": [] | |
}, | |
"format": "percent", | |
"gauge": { | |
"maxValue": 100, | |
"minValue": 0, | |
"show": true, | |
"thresholdLabels": false, | |
"thresholdMarkers": true | |
}, | |
"gridPos": { | |
"h": 5, | |
"w": 4, | |
"x": 16, | |
"y": 7 | |
}, | |
"id": 40, | |
"interval": null, | |
"links": [], | |
"mappingType": 1, | |
"mappingTypes": [ | |
{ | |
"name": "value to text", | |
"value": 1 | |
}, | |
{ | |
"name": "range to text", | |
"value": 2 | |
} | |
], | |
"maxDataPoints": 100, | |
"nullPointMode": "connected", | |
"nullText": null, | |
"postfix": "", | |
"postfixFontSize": "50%", | |
"prefix": "", | |
"prefixFontSize": "50%", | |
"rangeMaps": [ | |
{ | |
"from": "null", | |
"text": "N/A", | |
"to": "null" | |
} | |
], | |
"sparkline": { | |
"fillColor": "rgba(31, 118, 189, 0.18)", | |
"full": false, | |
"lineColor": "rgb(31, 120, 193)", | |
"show": false | |
}, | |
"tableColumn": "", | |
"targets": [ | |
{ | |
"expr": "avg(DCGM_FI_DEV_MEM_COPY_UTIL{instance=~\"$hostname\"})", | |
"format": "time_series", | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "", | |
"refId": "A" | |
} | |
], | |
"thresholds": "70,90", | |
"title": "GPU 总内存利用率", | |
"type": "singlestat", | |
"valueFontSize": "80%", | |
"valueMaps": [ | |
{ | |
"op": "=", | |
"text": "N/A", | |
"value": "null" | |
} | |
], | |
"valueName": "current" | |
}, | |
{ | |
"cacheTimeout": null, | |
"colorBackground": false, | |
"colorValue": false, | |
"colors": [ | |
"#299c46", | |
"rgba(237, 129, 40, 0.89)", | |
"#d44a3a" | |
], | |
"datasource": "Prometheus", | |
"description": "", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {} | |
}, | |
"overrides": [] | |
}, | |
"format": "percent", | |
"gauge": { | |
"maxValue": 100, | |
"minValue": 0, | |
"show": true, | |
"thresholdLabels": false, | |
"thresholdMarkers": true | |
}, | |
"gridPos": { | |
"h": 5, | |
"w": 4, | |
"x": 20, | |
"y": 7 | |
}, | |
"id": 58, | |
"interval": null, | |
"links": [], | |
"mappingType": 1, | |
"mappingTypes": [ | |
{ | |
"name": "value to text", | |
"value": 1 | |
}, | |
{ | |
"name": "range to text", | |
"value": 2 | |
} | |
], | |
"maxDataPoints": 100, | |
"nullPointMode": "connected", | |
"nullText": null, | |
"postfix": "", | |
"postfixFontSize": "50%", | |
"prefix": "", | |
"prefixFontSize": "50%", | |
"rangeMaps": [ | |
{ | |
"from": "null", | |
"text": "N/A", | |
"to": "null" | |
} | |
], | |
"sparkline": { | |
"fillColor": "rgba(31, 118, 189, 0.18)", | |
"full": false, | |
"lineColor": "rgb(31, 120, 193)", | |
"show": false | |
}, | |
"tableColumn": "", | |
"targets": [ | |
{ | |
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{instance=~\"$hostname\"})", | |
"format": "time_series", | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "", | |
"refId": "A" | |
} | |
], | |
"thresholds": "80,90", | |
"title": "GPU 总利用率", | |
"type": "singlestat", | |
"valueFontSize": "80%", | |
"valueMaps": [ | |
{ | |
"op": "=", | |
"text": "N/A", | |
"value": "null" | |
} | |
], | |
"valueName": "current" | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "Prometheus", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {}, | |
"links": [] | |
}, | |
"overrides": [] | |
}, | |
"fill": 1, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 6, | |
"w": 8, | |
"x": 0, | |
"y": 9 | |
}, | |
"hiddenSeries": false, | |
"id": 24, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"links": [], | |
"nullPointMode": "connected", | |
"percentage": false, | |
"pluginVersion": "7.1.3", | |
"pointradius": 5, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": true, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$hostname\"}", | |
"format": "time_series", | |
"hide": false, | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "{{instance}}.{{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU 功率", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "watt", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
}, | |
{ | |
"format": "watt", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "Prometheus", | |
"description": "内存利用率\n", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {}, | |
"links": [] | |
}, | |
"overrides": [] | |
}, | |
"fill": 0, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 10, | |
"w": 8, | |
"x": 8, | |
"y": 11 | |
}, | |
"hiddenSeries": false, | |
"id": 39, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"links": [], | |
"nullPointMode": "connected", | |
"percentage": false, | |
"pluginVersion": "7.1.3", | |
"pointradius": 5, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_MEM_COPY_UTIL{instance=~\"$hostname\"}", | |
"format": "time_series", | |
"hide": false, | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "{{instance}}.{{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU 内存利用率", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "percent", | |
"label": null, | |
"logBase": 1, | |
"max": "100", | |
"min": "0", | |
"show": true | |
}, | |
{ | |
"format": "watt", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
}, | |
{ | |
"datasource": "Prometheus", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {}, | |
"links": [], | |
"mappings": [], | |
"thresholds": { | |
"mode": "absolute", | |
"steps": [ | |
{ | |
"color": "green", | |
"value": null | |
}, | |
{ | |
"color": "#6ED0E0", | |
"value": 25 | |
}, | |
{ | |
"color": "#EAB839", | |
"value": 50 | |
}, | |
{ | |
"color": "red", | |
"value": 75 | |
} | |
] | |
}, | |
"unit": "percent" | |
}, | |
"overrides": [] | |
}, | |
"gridPos": { | |
"h": 9, | |
"w": 8, | |
"x": 16, | |
"y": 12 | |
}, | |
"id": 42, | |
"links": [], | |
"options": { | |
"displayMode": "lcd", | |
"orientation": "horizontal", | |
"reduceOptions": { | |
"calcs": [ | |
"mean" | |
], | |
"fields": "", | |
"values": false | |
}, | |
"showUnfilled": true | |
}, | |
"pluginVersion": "7.1.3", | |
"targets": [ | |
{ | |
"expr": "(DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}/(DCGM_FI_DEV_FB_USED{instance=~\"$hostname\"}+DCGM_FI_DEV_FB_FREE{instance=~\"$hostname\"}))*100", | |
"format": "time_series", | |
"hide": false, | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "{{instance}}.{{gpu}}", | |
"refId": "A" | |
} | |
], | |
"timeFrom": null, | |
"timeShift": null, | |
"title": "GPU 内存使用率", | |
"type": "bargauge" | |
}, | |
{ | |
"aliasColors": {}, | |
"bars": false, | |
"dashLength": 10, | |
"dashes": false, | |
"datasource": "Prometheus", | |
"fieldConfig": { | |
"defaults": { | |
"custom": {}, | |
"links": [] | |
}, | |
"overrides": [] | |
}, | |
"fill": 0, | |
"fillGradient": 0, | |
"gridPos": { | |
"h": 6, | |
"w": 8, | |
"x": 0, | |
"y": 15 | |
}, | |
"hiddenSeries": false, | |
"id": 25, | |
"legend": { | |
"alignAsTable": true, | |
"avg": true, | |
"current": true, | |
"max": true, | |
"min": true, | |
"show": true, | |
"total": false, | |
"values": true | |
}, | |
"lines": true, | |
"linewidth": 2, | |
"links": [], | |
"nullPointMode": "connected", | |
"percentage": false, | |
"pluginVersion": "7.1.3", | |
"pointradius": 5, | |
"points": false, | |
"renderer": "flot", | |
"seriesOverrides": [], | |
"spaceLength": 10, | |
"stack": false, | |
"steppedLine": false, | |
"targets": [ | |
{ | |
"expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$hostname\"} ", | |
"format": "time_series", | |
"hide": false, | |
"interval": "", | |
"intervalFactor": 1, | |
"legendFormat": "{{instance}}.{{gpu}}", | |
"refId": "A" | |
} | |
], | |
"thresholds": [], | |
"timeFrom": null, | |
"timeRegions": [], | |
"timeShift": null, | |
"title": "GPU 温度", | |
"tooltip": { | |
"shared": true, | |
"sort": 0, | |
"value_type": "individual" | |
}, | |
"type": "graph", | |
"xaxis": { | |
"buckets": null, | |
"mode": "time", | |
"name": null, | |
"show": true, | |
"values": [] | |
}, | |
"yaxes": [ | |
{ | |
"format": "celsius", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
}, | |
{ | |
"format": "short", | |
"label": null, | |
"logBase": 1, | |
"max": null, | |
"min": null, | |
"show": true | |
} | |
], | |
"yaxis": { | |
"align": false, | |
"alignLevel": null | |
} | |
} | |
], | |
"refresh": "5s", | |
"schemaVersion": 26, | |
"style": "dark", | |
"tags": [ | |
"GPU" | |
], | |
"templating": { | |
"list": [ | |
{ | |
"allValue": null, | |
"current": { | |
"selected": true, | |
"tags": [], | |
"text": "lab-81 + lab-80 + lab-73 + lab-72 + lab-71 + lab-70 + lab-61 + lab-60", | |
"value": [ | |
"lab-81", | |
"lab-80", | |
"lab-73", | |
"lab-72", | |
"lab-71", | |
"lab-70", | |
"lab-61", | |
"lab-60" | |
] | |
}, | |
"datasource": "Prometheus", | |
"definition": "label_values(instance)", | |
"hide": 0, | |
"includeAll": false, | |
"label": "host", | |
"multi": true, | |
"name": "hostname", | |
"options": [], | |
"query": "label_values(instance)", | |
"refresh": 1, | |
"regex": "", | |
"skipUrlSync": false, | |
"sort": 6, | |
"tagValuesQuery": "", | |
"tags": [], | |
"tagsQuery": "", | |
"type": "query", | |
"useTags": false | |
} | |
] | |
}, | |
"time": { | |
"from": "now-30m", | |
"to": "now" | |
}, | |
"timepicker": { | |
"refresh_intervals": [ | |
"10s", | |
"30s", | |
"1m", | |
"5m", | |
"15m", | |
"30m", | |
"1h", | |
"2h", | |
"1d" | |
], | |
"time_options": [ | |
"5m", | |
"15m", | |
"1h", | |
"6h", | |
"12h", | |
"24h", | |
"2d", | |
"7d", | |
"30d" | |
] | |
}, | |
"timezone": "browser", | |
"title": "GPU-Nodes-Metrics-Nvidia", | |
"uid": "hpcsyl6zhqk", | |
"version": 6 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment