Manage Monitoring Dashboard
Updated 2024.09.30
Topics
Detailed Steps
For detailed explanations of {variables}, refer to the Terminology page.
- Monitoring Dashboard Description
- This dashboard shows how much CPU and memory is consumed by the training runs executed by AI Conductor.
1. Prerequisite
Before setting up the monitoring dashboard, [Monitoring Infrastructure & Services](../../installation/ai_conductor/resource_monitoring) must be installed.
2. Add Dashboard
- The Monitoring Dashboard is installed in the following sequence:
- Create Grafana User
- Create Grafana Folder
- Create and Import Monitoring Dashboard JSON
2-1. Create Grafana User
- Log in to Grafana
- Log in to the installed Grafana with admin privileges.
- Create Grafana User
- Click Administration from the left menu.
- Click Users and Access.
- Click Users.
- Click New User.
- Enter all the user information and click Create user.
2-2. Create Grafana Folder
- Create Grafana Folder
- Click Dashboards from the left menu.
- Click New at the top right.
- From the dropdown, click New folder.
- Enter 01. {PROJECT_NAME}_monitoring in the Folder name field of the window that opens on the right, then click Create.
2-3. Create and Import Monitoring Dashboard JSON
- Create Monitoring Dashboard JSON
-
Run the following command to create the JSON file to import the dashboard.
[Expand create-monitoring-dashboard.json]
# Writes the Grafana dashboard model to create-monitoring-dashboard.json.
# The heredoc delimiter is unquoted, so the shell expands ${PROJECT_NAME} while
# writing the file (PROJECT_NAME must be set in the current shell). References
# escaped as \$ are Grafana template variables and are written out literally.
# The "Maximum pod memory" query selects resource="memory" so that CPU or other
# resource limits are not plotted on the bytes axis of the memory panel.
# The closing EOT must stand alone on its line, otherwise the heredoc never ends.
cat <<EOT > create-monitoring-dashboard.json
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": 1025,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 0
      },
      "id": 3,
      "panels": [],
      "repeat": "stream_name",
      "repeatDirection": "h",
      "title": "\${stream_name}",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "\${Datasource}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "max": 1,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "percentunit",
          "unitScale": true
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 24,
        "x": 0,
        "y": 1
      },
      "id": 1,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "\${Datasource}"
          },
          "editorMode": "code",
          "exemplar": false,
          "expr": "avg by (pod) (irate(container_cpu_usage_seconds_total {container=\"main\", namespace=~\"aic-ns-${PROJECT_NAME}.*\"}[90s]))\r\n* on (pod) group_left(annotation_pipelines_kubeflow_org_task_display_name) \r\nkube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}",
          "instant": false,
          "legendFormat": "{{annotation_pipelines_kubeflow_org_task_display_name}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "CPU usage",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "\${Datasource}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "fieldMinMax": false,
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "decbytes",
          "unitScale": true
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 24,
        "x": 0,
        "y": 9
      },
      "id": 2,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "\${Datasource}"
          },
          "editorMode": "code",
          "expr": "container_memory_working_set_bytes{container=\"main\", namespace=~\"aic-ns-${PROJECT_NAME}.*\"}\r\n* on (pod) group_left (annotation_pipelines_kubeflow_org_task_display_name) \r\nkube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}",
          "instant": false,
          "legendFormat": "{{annotation_pipelines_kubeflow_org_task_display_name}}",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "\${Datasource}"
          },
          "editorMode": "code",
          "expr": "kube_pod_container_resource_limits{container=\"main\", resource=\"memory\", namespace=~\"aic-ns-${PROJECT_NAME}.*\"}\r\n* on (pod) group_left (annotation_pipelines_kubeflow_org_task_display_name) \r\nkube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}",
          "hide": false,
          "instant": false,
          "legendFormat": "Maximum pod memory",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "Memory usage",
      "type": "timeseries"
    }
  ],
  "refresh": "",
  "schemaVersion": 39,
  "tags": [],
  "templating": {
    "list": [
      {
        "current": {
          "isNone": true,
          "selected": false,
          "text": "None",
          "value": ""
        },
        "datasource": {
          "type": "prometheus",
          "uid": "\${Datasource}"
        },
        "definition": "label_values(kube_pod_annotations, annotation_stream_history_id)",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "stream_history_id",
        "options": [],
        "query": {
          "qryType": 1,
          "query": "label_values(kube_pod_annotations,annotation_stream_history_id)",
          "refId": "PrometheusVariableQueryEditor-VariableQuery"
        },
        "refresh": 2,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "type": "query"
      },
      {
        "current": {
          "isNone": true,
          "selected": false,
          "text": "None",
          "value": ""
        },
        "datasource": {
          "type": "prometheus",
          "uid": "\${Datasource}"
        },
        "definition": "label_values(kube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}, annotation_pipelines_kubeflow_org_task_display_name)",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "stream_name",
        "options": [],
        "query": {
          "qryType": 1,
          "query": "label_values(kube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}, annotation_pipelines_kubeflow_org_task_display_name)",
          "refId": "PrometheusVariableQueryEditor-VariableQuery"
        },
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "type": "query"
      },
      {
        "current": {
          "selected": false,
          "text": "",
          "value": ""
        },
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "Datasource",
        "options": [],
        "query": "prometheus",
        "queryValue": "",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "type": "datasource"
      }
    ]
  },
  "time": {
    "from": "now-5m",
    "to": "now"
  },
  "timeRangeUpdatedDuringEditOrView": false,
  "timepicker": {},
  "timezone": "browser",
  "title": "Training Pod monitor",
  "uid": "",
  "version": 1,
  "weekStart": ""
}
EOT
-
Copy the contents of the generated JSON file to the clipboard, either manually or with the command below:
xclip -selection clipboard < create-monitoring-dashboard.json
-
- Import the Dashboard JSON
- Click Dashboards from the Grafana left menu.
- Click New at the top right.
- From the displayed menu, click Import.
- Paste the JSON content from your clipboard into the Import via dashboard JSON model field.
- Click Load.
- Select the folder created in step 2-2 (Create Grafana Folder).
- Click Import.
- Click Save dashboard at the top right.
- Click Save.
3. Delete Monitoring Dashboard
- Log in to Grafana as admin.
- Click Dashboards from the left menu.
- Select the Dashboard you want to delete from the list.
- After selecting the Dashboard, click the Delete button.