feat(inputs.nvidia_smi): Add additional fields (#13783)

This commit is contained in:
Joshua Powers 2023-08-28 14:06:20 -06:00 committed by GitHub
parent ebceed6157
commit cb488ad0f8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 60 additions and 15 deletions

View File

@ -70,6 +70,8 @@ func TestGatherValidXML(t *testing.T) {
"clocks_current_sm": 300,
"clocks_current_video": 540,
"cuda_version": "10.1",
"display_active": "Disabled",
"display_mode": "Disabled",
"driver_version": "418.43",
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
@ -89,6 +91,7 @@ func TestGatherValidXML(t *testing.T) {
"utilization_memory": 1,
"utilization_encoder": 0,
"utilization_decoder": 0,
"vbios_version": "90.16.25.00.4C",
},
time.Unix(0, 0)),
},
@ -112,6 +115,8 @@ func TestGatherValidXML(t *testing.T) {
"clocks_current_sm": 139,
"clocks_current_video": 544,
"cuda_version": "10.1",
"display_active": "Disabled",
"display_mode": "Disabled",
"driver_version": "418.43",
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
@ -125,11 +130,13 @@ func TestGatherValidXML(t *testing.T) {
"memory_used": 0,
"pcie_link_gen_current": 1,
"pcie_link_width_current": 16,
"serial": "0424418054852",
"temperature_gpu": 33,
"utilization_gpu": 0,
"utilization_memory": 3,
"utilization_encoder": 0,
"utilization_decoder": 0,
"vbios_version": "86.07.3B.00.4A",
},
time.Unix(0, 0)),
},
@ -153,6 +160,9 @@ func TestGatherValidXML(t *testing.T) {
"clocks_current_sm": 585,
"clocks_current_video": 810,
"cuda_version": "11.7",
"current_ecc": "Enabled",
"display_active": "Disabled",
"display_mode": "Disabled",
"driver_version": "515.105.01",
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
@ -171,11 +181,13 @@ func TestGatherValidXML(t *testing.T) {
"retired_pages_pending": "No",
"pcie_link_gen_current": 3,
"pcie_link_width_current": 8,
"serial": "0000000000000",
"temperature_gpu": 40,
"utilization_gpu": 0,
"utilization_memory": 0,
"utilization_encoder": 0,
"utilization_decoder": 0,
"vbios_version": "90.04.84.00.06",
},
time.Unix(0, 0)),
},
@ -199,6 +211,9 @@ func TestGatherValidXML(t *testing.T) {
"clocks_current_sm": 210,
"clocks_current_video": 555,
"cuda_version": "11.7",
"current_ecc": "Enabled",
"display_active": "Disabled",
"display_mode": "Disabled",
"driver_version": "515.105.01",
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
@ -218,11 +233,13 @@ func TestGatherValidXML(t *testing.T) {
"remapped_rows_failure": "No",
"pcie_link_gen_current": 1,
"pcie_link_width_current": 8,
"serial": "0000000000000",
"temperature_gpu": 17,
"utilization_gpu": 0,
"utilization_memory": 0,
"utilization_encoder": 0,
"utilization_decoder": 0,
"vbios_version": "94.02.75.00.01",
},
time.Unix(0, 0)),
},
@ -247,6 +264,8 @@ func TestGatherValidXML(t *testing.T) {
"clocks_current_sm": 210,
"clocks_current_video": 555,
"cuda_version": "12.2",
"display_active": "Enabled",
"display_mode": "Enabled",
"driver_version": "536.40",
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
@ -264,9 +283,12 @@ func TestGatherValidXML(t *testing.T) {
"pcie_link_width_current": 16,
"temperature_gpu": 31,
"utilization_gpu": 0,
"utilization_jpeg": 0,
"utilization_memory": 37,
"utilization_encoder": 0,
"utilization_decoder": 0,
"utilization_ofa": 0,
"vbios_version": "94.02.71.40.72",
},
time.Unix(1689872450, 0)),
},

View File

@ -27,6 +27,11 @@ func Parse(acc telegraf.Accumulator, buf []byte) error {
common.SetIfUsed("str", fields, "driver_version", s.DriverVersion)
common.SetIfUsed("str", fields, "cuda_version", s.CUDAVersion)
common.SetIfUsed("str", fields, "serial", gpu.Serial)
common.SetIfUsed("str", fields, "vbios_version", gpu.VbiosVersion)
common.SetIfUsed("str", fields, "display_active", gpu.DisplayActive)
common.SetIfUsed("str", fields, "display_mode", gpu.DisplayMode)
common.SetIfUsed("str", fields, "current_ecc", gpu.EccMode.CurrentEcc)
common.SetIfUsed("int", fields, "fan_speed", gpu.FanSpeed)
common.SetIfUsed("int", fields, "memory_total", gpu.Memory.Total)
common.SetIfUsed("int", fields, "memory_used", gpu.Memory.Used)

View File

@ -9,21 +9,32 @@ type smi struct {
// GPU defines the structure of the GPU portion of the smi output.
type GPU struct {
FanSpeed string `xml:"fan_speed"` // int
Memory MemoryStats `xml:"fb_memory_usage"`
RetiredPages MemoryRetiredPages `xml:"retired_pages"`
RemappedRows MemoryRemappedRows `xml:"remapped_rows"`
PState string `xml:"performance_state"`
Temp TempStats `xml:"temperature"`
ProdName string `xml:"product_name"`
UUID string `xml:"uuid"`
ComputeMode string `xml:"compute_mode"`
Utilization UtilizationStats `xml:"utilization"`
Power PowerReadings `xml:"power_readings"`
PCI PCI `xml:"pci"`
Encoder EncoderStats `xml:"encoder_stats"`
FBC FBCStats `xml:"fbc_stats"`
Clocks ClockStats `xml:"clocks"`
Clocks ClockStats `xml:"clocks"`
ComputeMode string `xml:"compute_mode"`
DisplayActive string `xml:"display_active"`
DisplayMode string `xml:"display_mode"`
EccMode ECCMode `xml:"ecc_mode"`
Encoder EncoderStats `xml:"encoder_stats"`
FanSpeed string `xml:"fan_speed"` // int
FBC FBCStats `xml:"fbc_stats"`
Memory MemoryStats `xml:"fb_memory_usage"`
PCI PCI `xml:"pci"`
Power PowerReadings `xml:"power_readings"`
ProdName string `xml:"product_name"`
PState string `xml:"performance_state"`
RemappedRows MemoryRemappedRows `xml:"remapped_rows"`
RetiredPages MemoryRetiredPages `xml:"retired_pages"`
Serial string `xml:"serial"`
Temp TempStats `xml:"temperature"`
Utilization UtilizationStats `xml:"utilization"`
UUID string `xml:"uuid"`
VbiosVersion string `xml:"vbios_version"`
}
// ECCMode defines the structure of the ECC portions in the smi output.
// Both values are stored as the text found in the XML document.
type ECCMode struct {
	// CurrentEcc is read from <current_ecc>: Enabled, Disabled, or N/A.
	CurrentEcc string `xml:"current_ecc"`
	// PendingEcc is read from <pending_ecc>: Enabled, Disabled, or N/A.
	PendingEcc string `xml:"pending_ecc"`
}
// MemoryStats defines the structure of the memory portions in the smi output.

View File

@ -36,6 +36,11 @@ func Parse(acc telegraf.Accumulator, buf []byte) error {
common.SetIfUsed("str", fields, "driver_version", s.DriverVersion)
common.SetIfUsed("str", fields, "cuda_version", s.CudaVersion)
common.SetIfUsed("str", fields, "serial", gpu.Serial)
common.SetIfUsed("str", fields, "vbios_version", gpu.VbiosVersion)
common.SetIfUsed("str", fields, "display_active", gpu.DisplayActive)
common.SetIfUsed("str", fields, "display_mode", gpu.DisplayMode)
common.SetIfUsed("str", fields, "current_ecc", gpu.EccMode.CurrentEcc)
common.SetIfUsed("int", fields, "fan_speed", gpu.FanSpeed)
common.SetIfUsed("int", fields, "memory_total", gpu.FbMemoryUsage.Total)
common.SetIfUsed("int", fields, "memory_used", gpu.FbMemoryUsage.Used)
@ -54,6 +59,8 @@ func Parse(acc telegraf.Accumulator, buf []byte) error {
common.SetIfUsed("int", fields, "utilization_memory", gpu.Utilization.MemoryUtil)
common.SetIfUsed("int", fields, "utilization_encoder", gpu.Utilization.EncoderUtil)
common.SetIfUsed("int", fields, "utilization_decoder", gpu.Utilization.DecoderUtil)
common.SetIfUsed("int", fields, "utilization_jpeg", gpu.Utilization.JpegUtil)
common.SetIfUsed("int", fields, "utilization_ofa", gpu.Utilization.OfaUtil)
common.SetIfUsed("int", fields, "pcie_link_gen_current", gpu.Pci.PciGpuLinkInfo.PcieGen.CurrentLinkGen)
common.SetIfUsed("int", fields, "pcie_link_width_current", gpu.Pci.PciGpuLinkInfo.LinkWidths.CurrentLinkWidth)
common.SetIfUsed("int", fields, "encoder_stats_session_count", gpu.EncoderStats.SessionCount)