Skip to main content
Version: Next

Manage Monitoring Dashboard

Updated 2024.09.30

Topics



Detailed Steps

For detailed explanations of {variables}, refer to the Terminology page.

  • Monitoring Dashboard Description
    • This dashboard shows how much CPU and memory is consumed by the training jobs that AI Conductor runs.

1. Prerequisite

Before setting up the monitoring dashboard, [Monitoring Infrastructure & Services](../../installation/ai_conductor/resource_monitoring) must be installed.


2. Add Dashboard

  • The Monitoring Dashboard is installed in the following sequence:
    • Create Grafana User
    • Create Grafana Folder
    • Create and Import Monitoring Dashboard JSON

2-1. Create Grafana User

  • Log in to Grafana
    • Log in to the installed Grafana with admin privileges.
  • Create Grafana User
    • Click Administration from the left menu.
    • Click Users and Access.
    • Click Users.
    • Click New User.
    • Enter all the user information and click Create user.

2-2. Create Grafana Folder

  • Create Grafana Folder
    • Click Dashboards from the left menu.
    • Click New at the top right.
    • From the dropdown, click New folder.
    • Enter 01. {PROJECT_NAME}_monitoring in the Folder name field of the window on the right.

2-3. Create and Import Monitoring Dashboard JSON

  • Create Monitoring Dashboard JSON
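    • Make sure the PROJECT_NAME shell variable is set (the shell substitutes ${PROJECT_NAME} into the dashboard queries when the file is written), then run the following command to create the JSON file used to import the dashboard.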

      [Expand create-monitoring-dashboard.json]
      cat <<EOT > create-monitoring-dashboard.json
      {
      "annotations": {
      "list": [
      {
      "builtIn": 1,
      "datasource": {
      "type": "grafana",
      "uid": "-- Grafana --"
      },
      "enable": true,
      "hide": true,
      "iconColor": "rgba(0, 211, 255, 1)",
      "name": "Annotations & Alerts",
      "type": "dashboard"
      }
      ]
      },
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": 1025,
      "links": [],
      "liveNow": false,
      "panels": [
      {
      "gridPos": {
      "h": 1,
      "w": 24,
      "x": 0,
      "y": 0
      },
      "id": 3,
      "panels": [],
      "repeat": "stream_name",
      "repeatDirection": "h",
      "title": "\${stream_name}",
      "type": "row"
      },
      {
      "datasource": {
      "type": "prometheus",
      "uid": "\${Datasource}"
      },
      "fieldConfig": {
      "defaults": {
      "color": {
      "mode": "palette-classic"
      },
      "custom": {
      "axisBorderShow": false,
      "axisCenteredZero": false,
      "axisColorMode": "text",
      "axisLabel": "",
      "axisPlacement": "auto",
      "barAlignment": 0,
      "drawStyle": "line",
      "fillOpacity": 0,
      "gradientMode": "none",
      "hideFrom": {
      "legend": false,
      "tooltip": false,
      "viz": false
      },
      "insertNulls": false,
      "lineInterpolation": "linear",
      "lineWidth": 1,
      "pointSize": 5,
      "scaleDistribution": {
      "type": "linear"
      },
      "showPoints": "auto",
      "spanNulls": false,
      "stacking": {
      "group": "A",
      "mode": "none"
      },
      "thresholdsStyle": {
      "mode": "off"
      }
      },
      "mappings": [],
      "max": 1,
      "thresholds": {
      "mode": "absolute",
      "steps": [
      {
      "color": "green",
      "value": null
      },
      {
      "color": "red",
      "value": 80
      }
      ]
      },
      "unit": "percentunit",
      "unitScale": true
      },
      "overrides": []
      },
      "gridPos": {
      "h": 8,
      "w": 24,
      "x": 0,
      "y": 1
      },
      "id": 1,
      "options": {
      "legend": {
      "calcs": [],
      "displayMode": "list",
      "placement": "bottom",
      "showLegend": true
      },
      "tooltip": {
      "mode": "single",
      "sort": "none"
      }
      },
      "targets": [
      {
      "datasource": {
      "type": "prometheus",
      "uid": "\${Datasource}"
      },
      "editorMode": "code",
      "exemplar": false,
      "expr": "avg by (pod) (irate(container_cpu_usage_seconds_total {container=\"main\", namespace=~\"aic-ns-${PROJECT_NAME}.*\"}[90s]))\r\n* on (pod) group_left(annotation_pipelines_kubeflow_org_task_display_name) \r\nkube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}",
      "instant": false,
      "legendFormat": "{{annotation_pipelines_kubeflow_org_task_display_name}}",
      "range": true,
      "refId": "A"
      }
      ],
      "title": "CPU usage",
      "type": "timeseries"
      },
      {
      "datasource": {
      "type": "prometheus",
      "uid": "\${Datasource}"
      },
      "fieldConfig": {
      "defaults": {
      "color": {
      "mode": "palette-classic"
      },
      "custom": {
      "axisBorderShow": false,
      "axisCenteredZero": false,
      "axisColorMode": "text",
      "axisLabel": "",
      "axisPlacement": "auto",
      "barAlignment": 0,
      "drawStyle": "line",
      "fillOpacity": 0,
      "gradientMode": "none",
      "hideFrom": {
      "legend": false,
      "tooltip": false,
      "viz": false
      },
      "insertNulls": false,
      "lineInterpolation": "linear",
      "lineWidth": 1,
      "pointSize": 5,
      "scaleDistribution": {
      "type": "linear"
      },
      "showPoints": "auto",
      "spanNulls": false,
      "stacking": {
      "group": "A",
      "mode": "none"
      },
      "thresholdsStyle": {
      "mode": "off"
      }
      },
      "fieldMinMax": false,
      "mappings": [],
      "thresholds": {
      "mode": "absolute",
      "steps": [
      {
      "color": "green",
      "value": null
      },
      {
      "color": "red",
      "value": 80
      }
      ]
      },
      "unit": "decbytes",
      "unitScale": true
      },
      "overrides": []
      },
      "gridPos": {
      "h": 8,
      "w": 24,
      "x": 0,
      "y": 9
      },
      "id": 2,
      "options": {
      "legend": {
      "calcs": [],
      "displayMode": "list",
      "placement": "bottom",
      "showLegend": true
      },
      "tooltip": {
      "mode": "single",
      "sort": "none"
      }
      },
      "targets": [
      {
      "datasource": {
      "type": "prometheus",
      "uid": "\${Datasource}"
      },
      "editorMode": "code",
      "expr": "container_memory_working_set_bytes{container=\"main\", namespace=~\"aic-ns-${PROJECT_NAME}.*\"}\r\n* on (pod) group_left (annotation_pipelines_kubeflow_org_task_display_name) \r\nkube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}",
      "instant": false,
      "legendFormat": "{{annotation_pipelines_kubeflow_org_task_display_name}}",
      "range": true,
      "refId": "A"
      },
      {
      "datasource": {
      "type": "prometheus",
      "uid": "\${Datasource}"
      },
      "editorMode": "code",
      "expr": "kube_pod_container_resource_limits{container=\"main\", resource=\"memory\", namespace=~\"aic-ns-${PROJECT_NAME}.*\"}\r\n* on (pod) group_left (annotation_pipelines_kubeflow_org_task_display_name) \r\nkube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}",
      "hide": false,
      "instant": false,
      "legendFormat": "Maximum pod memory",
      "range": true,
      "refId": "B"
      }
      ],
      "title": "Memory usage",
      "type": "timeseries"
      }
      ],
      "refresh": "",
      "schemaVersion": 39,
      "tags": [],
      "templating": {
      "list": [
      {
      "current": {
      "isNone": true,
      "selected": false,
      "text": "None",
      "value": ""
      },
      "datasource": {
      "type": "prometheus",
      "uid": "\${Datasource}"
      },
      "definition": "label_values(kube_pod_annotations, annotation_stream_history_id)",
      "hide": 0,
      "includeAll": false,
      "multi": false,
      "name": "stream_history_id",
      "options": [],
      "query": {
      "qryType": 1,
      "query": "label_values(kube_pod_annotations,annotation_stream_history_id)",
      "refId": "PrometheusVariableQueryEditor-VariableQuery"
      },
      "refresh": 2,
      "regex": "",
      "skipUrlSync": false,
      "sort": 0,
      "type": "query"
      },
      {
      "current": {
      "isNone": true,
      "selected": false,
      "text": "None",
      "value": ""
      },
      "datasource": {
      "type": "prometheus",
      "uid": "\${Datasource}"
      },
      "definition": "label_values(kube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}, annotation_pipelines_kubeflow_org_task_display_name)",
      "hide": 0,
      "includeAll": false,
      "multi": false,
      "name": "stream_name",
      "options": [],
      "query": {
      "qryType": 1,
      "query": "label_values(kube_pod_annotations {annotation_stream_history_id=\"\$stream_history_id\"}, annotation_pipelines_kubeflow_org_task_display_name)",
      "refId": "PrometheusVariableQueryEditor-VariableQuery"
      },
      "refresh": 1,
      "regex": "",
      "skipUrlSync": false,
      "sort": 0,
      "type": "query"
      },
      {
      "current": {
      "selected": false,
      "text": "",
      "value": ""
      },
      "hide": 0,
      "includeAll": false,
      "multi": false,
      "name": "Datasource",
      "options": [],
      "query": "prometheus",
      "queryValue": "",
      "refresh": 1,
      "regex": "",
      "skipUrlSync": false,
      "type": "datasource"
      }
      ]
      },
      "time": {
      "from": "now-5m",
      "to": "now"
      },
      "timeRangeUpdatedDuringEditOrView": false,
      "timepicker": {},
      "timezone": "browser",
      "title": "Training Pod monitor",
      "uid": "",
      "version": 1,
      "weekStart": ""
      }
      EOT
    • Copy the generated JSON content directly or use the command below:

      xclip -selection clipboard < create-monitoring-dashboard.json
  • Import the Dashboard JSON
    • Click Dashboards from the Grafana left menu.
    • Click New at the top right.
    • From the displayed menu, click Import.
    • Paste the JSON content from your clipboard into the Import via dashboard JSON model field.
    • Click Load.
    • Select the folder created in step 2-2. Create Grafana Folder.
    • Click Import.
    • Click Save dashboard at the top right.
    • Click Save.

3. Delete Monitoring Dashboard

  • Log in to Grafana as admin.
  • Click Dashboards from the left menu.
  • Select the Dashboard you want to delete from the list.
  • After selecting the Dashboard, click the Delete button.