diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go index b954560ce..c7b4cb025 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi_test.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go @@ -271,6 +271,137 @@ func TestGatherValidXML(t *testing.T) { time.Unix(1689872450, 0)), }, }, + { + name: "A100-SXM4 schema v12", + filename: "a100-sxm4-v12.xml", + expected: []telegraf.Metric{ + testutil.MustMetric( + "nvidia_smi", + map[string]string{ + "compute_mode": "Default", + "index": "0", + "name": "NVIDIA A100-SXM4-80GB", + "arch": "Ampere", + "pstate": "P0", + "uuid": "GPU-513536b6-7d19-9063-b049-1e69664bb298", + }, + map[string]interface{}{ + "clocks_current_graphics": 1275, + "clocks_current_memory": 1593, + "clocks_current_sm": 1275, + "clocks_current_video": 1275, + "cuda_version": "12.2", + "driver_version": "535.54.03", + "encoder_stats_average_fps": 0, + "encoder_stats_average_latency": 0, + "encoder_stats_session_count": 0, + "fbc_stats_average_fps": 0, + "fbc_stats_average_latency": 0, + "fbc_stats_session_count": 0, + "power_draw": 67.03, + "memory_free": 80999, + "memory_total": 81920, + "memory_used": 50, + "memory_reserved": 869, + "pcie_link_gen_current": 4, + "pcie_link_width_current": 16, + "temperature_gpu": 27, + }, + time.Unix(1689872450, 0)), + testutil.MustMetric( + "nvidia_smi_mig", + map[string]string{ + "compute_mode": "Default", + "index": "0", + "name": "NVIDIA A100-SXM4-80GB", + "arch": "Ampere", + "pstate": "P0", + "uuid": "GPU-513536b6-7d19-9063-b049-1e69664bb298", + "compute_index": "0", + "gpu_index": "3", + }, + map[string]interface{}{ + "memory_bar1_free": 32767, + "memory_bar1_total": 32767, + "memory_bar1_used": 0, + "memory_fb_free": 19955, + "memory_fb_reserved": 0, + "memory_fb_total": 19968, + "memory_fb_used": 12, + "sram_uncorrectable": 0, + }, + time.Unix(1689872450, 0)), + testutil.MustMetric( + "nvidia_smi_mig", + map[string]string{ + "compute_mode": "Default", + "index": "1", + "name": "NVIDIA A100-SXM4-80GB", + "arch": "Ampere", + "pstate": "P0", + "uuid": "GPU-513536b6-7d19-9063-b049-1e69664bb298", + "compute_index": "0", + "gpu_index": "4", + }, + map[string]interface{}{ + "memory_bar1_free": 32767, + "memory_bar1_total": 32767, + "memory_bar1_used": 0, + "memory_fb_free": 19955, + "memory_fb_reserved": 0, + "memory_fb_total": 19968, + "memory_fb_used": 12, + "sram_uncorrectable": 0, + }, + time.Unix(1689872450, 0)), + testutil.MustMetric( + "nvidia_smi_mig", + map[string]string{ + "compute_mode": "Default", + "index": "2", + "name": "NVIDIA A100-SXM4-80GB", + "arch": "Ampere", + "pstate": "P0", + "uuid": "GPU-513536b6-7d19-9063-b049-1e69664bb298", + "compute_index": "0", + "gpu_index": "5", + }, + map[string]interface{}{ + "memory_bar1_free": 32767, + "memory_bar1_total": 32767, + "memory_bar1_used": 0, + "memory_fb_free": 19955, + "memory_fb_reserved": 0, + "memory_fb_total": 19968, + "memory_fb_used": 12, + "sram_uncorrectable": 0, + }, + time.Unix(1689872450, 0)), + testutil.MustMetric( + "nvidia_smi_mig", + map[string]string{ + "compute_mode": "Default", + "index": "3", + "name": "NVIDIA A100-SXM4-80GB", + "arch": "Ampere", + "pstate": "P0", + "uuid": "GPU-513536b6-7d19-9063-b049-1e69664bb298", + "compute_index": "0", + "gpu_index": "6", + }, + map[string]interface{}{ + "memory_bar1_free": 32767, + "memory_bar1_total": 32767, + "memory_bar1_used": 0, + "memory_fb_free": 19955, + "memory_fb_reserved": 0, + "memory_fb_total": 19968, + "memory_fb_used": 12, + "sram_uncorrectable": 0, + }, + time.Unix(1689872450, 0)), + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/plugins/inputs/nvidia_smi/schema_v12/parser.go b/plugins/inputs/nvidia_smi/schema_v12/parser.go index ce351991e..1175e5d47 100644 --- a/plugins/inputs/nvidia_smi/schema_v12/parser.go +++ b/plugins/inputs/nvidia_smi/schema_v12/parser.go @@ -69,6 +69,30 @@ func Parse(acc telegraf.Accumulator, buf []byte) error { common.SetIfUsed("float", fields, "power_draw", gpu.GpuPowerReadings.PowerDraw) common.SetIfUsed("float", fields, "module_power_draw", gpu.ModulePowerReadings.PowerDraw) acc.AddFields("nvidia_smi", fields, tags, timestamp) + + for _, device := range gpu.MigDevices.MigDevice { + tags := map[string]string{} + common.SetTagIfUsed(tags, "index", device.Index) + common.SetTagIfUsed(tags, "gpu_index", device.GpuInstanceID) + common.SetTagIfUsed(tags, "compute_index", device.ComputeInstanceID) + common.SetTagIfUsed(tags, "pstate", gpu.PerformanceState) + common.SetTagIfUsed(tags, "name", gpu.ProductName) + common.SetTagIfUsed(tags, "arch", gpu.ProductArchitecture) + common.SetTagIfUsed(tags, "uuid", gpu.UUID) + common.SetTagIfUsed(tags, "compute_mode", gpu.ComputeMode) + + fields := map[string]interface{}{} + common.SetIfUsed("int", fields, "sram_uncorrectable", device.EccErrorCount.VolatileCount.SramUncorrectable) + common.SetIfUsed("int", fields, "memory_fb_total", device.FbMemoryUsage.Total) + common.SetIfUsed("int", fields, "memory_fb_reserved", device.FbMemoryUsage.Reserved) + common.SetIfUsed("int", fields, "memory_fb_used", device.FbMemoryUsage.Used) + common.SetIfUsed("int", fields, "memory_fb_free", device.FbMemoryUsage.Free) + common.SetIfUsed("int", fields, "memory_bar1_total", device.Bar1MemoryUsage.Total) + common.SetIfUsed("int", fields, "memory_bar1_used", device.Bar1MemoryUsage.Used) + common.SetIfUsed("int", fields, "memory_bar1_free", device.Bar1MemoryUsage.Free) + + acc.AddFields("nvidia_smi_mig", fields, tags, timestamp) + } } return nil diff --git a/plugins/inputs/nvidia_smi/schema_v12/types.go b/plugins/inputs/nvidia_smi/schema_v12/types.go index 40843e7f6..3eeefe6e0 100644 --- a/plugins/inputs/nvidia_smi/schema_v12/types.go +++ b/plugins/inputs/nvidia_smi/schema_v12/types.go @@ -144,8 +144,31 @@ type smi struct { MaxCustomerBoostClocks struct { GraphicsClock string `xml:"graphics_clock"` } `xml:"max_customer_boost_clocks"` - MigDevices string `xml:"mig_devices"` - MigMode struct { + MigDevices struct { + MigDevice []struct { + Index string `xml:"index"` + GpuInstanceID string `xml:"gpu_instance_id"` + ComputeInstanceID string `xml:"compute_instance_id"` + EccErrorCount struct { + Text string `xml:",chardata" json:"text"` + VolatileCount struct { + SramUncorrectable string `xml:"sram_uncorrectable"` + } `xml:"volatile_count" json:"volatile_count"` + } `xml:"ecc_error_count" json:"ecc_error_count"` + FbMemoryUsage struct { + Total string `xml:"total"` + Reserved string `xml:"reserved"` + Used string `xml:"used"` + Free string `xml:"free"` + } `xml:"fb_memory_usage" json:"fb_memory_usage"` + Bar1MemoryUsage struct { + Total string `xml:"total"` + Used string `xml:"used"` + Free string `xml:"free"` + } `xml:"bar1_memory_usage" json:"bar1_memory_usage"` + } `xml:"mig_device" json:"mig_device"` + } `xml:"mig_devices" json:"mig_devices"` + MigMode struct { CurrentMig string `xml:"current_mig"` PendingMig string `xml:"pending_mig"` } `xml:"mig_mode"` diff --git a/plugins/inputs/nvidia_smi/testdata/a100-sxm4-v12.xml b/plugins/inputs/nvidia_smi/testdata/a100-sxm4-v12.xml new file mode 100644 index 000000000..29bf40cae --- /dev/null +++ b/plugins/inputs/nvidia_smi/testdata/a100-sxm4-v12.xml @@ -0,0 +1,452 @@ + + + + Fri Aug 4 11:44:30 2023 + 535.54.03 + 12.2 + 4 + + NVIDIA A100-SXM4-80GB + NVIDIA + Ampere + Enabled + Disabled + Disabled + None + + Enabled + Enabled + + + + 0 + 3 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 12 MiB + 19955 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 1 + 4 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 12 MiB + 19955 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 2 + 5 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 12 MiB + 19955 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 3 + 6 + 0 + + + 14 + 1 + 0 + 1 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 12 MiB + 19955 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + Disabled + 4000 + + N/A + N/A + + 1650522003820 + GPU-513536b6-7d19-9063-b049-1e69664bb298 + 1 + 92.00.36.00.02 + No + 0x100 + 692-2G506-0212-002 + 20B2-895-A1 + N/A + 4 + + G506.0212.00.01 + 2.0 + 6.16 + N/A + + + N/A + N/A + + 535.54.03 + + None + N/A + + + No + No + + + N/A + + + 01 + 00 + 0000 + 20B210DE + 00000000:01:00.0 + 147F10DE + + + 4 + 4 + 4 + 4 + 4 + + + 16x + 16x + + + + N/A + N/A + + 0 + 0 + 4000 KB/s + 0 KB/s + N/A + N/A + + N/A + P0 + + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + + + 81920 MiB + 869 MiB + 50 MiB + 80999 MiB + + + 131072 MiB + 1 MiB + 131071 MiB + + + 0 MiB + 0 MiB + 0 MiB + + Default + + N/A + N/A + N/A + N/A + N/A + N/A + + + 0 + 0 + 0 + + + 0 + 0 + 0 + + + Enabled + Enabled + + + + 0 + 0 + 0 + 0 + + + 0 + 0 + 0 + 0 + + + + + N/A + N/A + + + N/A + N/A + + N/A + N/A + + N/A + + 27 C + N/A + 92 C + 89 C + 85 C + N/A + 44 C + 95 C + + + N/A + N/A + + + P0 + 67.03 W + 500.00 W + 500.00 W + 500.00 W + 100.00 W + 500.00 W + + + P0 + N/A + N/A + N/A + N/A + N/A + N/A + + + 1275 MHz + 1275 MHz + 1593 MHz + 1275 MHz + + + 1275 MHz + 1593 MHz + + + 1275 MHz + 1593 MHz + + + N/A + + + 1410 MHz + 1410 MHz + 1593 MHz + 1290 MHz + + + 1410 MHz + + + N/A + N/A + + + 912.500 mV + + + N/A + N/A + + + + 1593 MHz + 1410 MHz + 1395 MHz + 1380 MHz + 1365 MHz + 1350 MHz + 1335 MHz + 1320 MHz + 1305 MHz + 1290 MHz + 1275 MHz + 1260 MHz + 1245 MHz + 1230 MHz + 1215 MHz + 1200 MHz + 1185 MHz + 1170 MHz + 1155 MHz + 1140 MHz + 1125 MHz + 1110 MHz + 1095 MHz + 1080 MHz + 1065 MHz + 1050 MHz + 1035 MHz + 1020 MHz + 1005 MHz + 990 MHz + 975 MHz + 960 MHz + 945 MHz + 930 MHz + 915 MHz + 900 MHz + 885 MHz + 870 MHz + 855 MHz + 840 MHz + 825 MHz + 810 MHz + 795 MHz + 780 MHz + 765 MHz + 750 MHz + 735 MHz + 720 MHz + 705 MHz + 690 MHz + 675 MHz + 660 MHz + 645 MHz + 630 MHz + 615 MHz + 600 MHz + 585 MHz + 570 MHz + 555 MHz + 540 MHz + 525 MHz + 510 MHz + 495 MHz + 480 MHz + 465 MHz + 450 MHz + 435 MHz + 420 MHz + 405 MHz + 390 MHz + 375 MHz + 360 MHz + 345 MHz + 330 MHz + 315 MHz + 300 MHz + 285 MHz + 270 MHz + 255 MHz + 240 MHz + 225 MHz + 210 MHz + + + + + +