From 04c3e9bb24feb36f24e8da75f6b764e44d6e58cf Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Thu, 2 Sep 2021 16:57:17 +0200 Subject: [PATCH] feat: Add rocm_smi input to monitor AMD GPUs (#9602) --- etc/telegraf.conf | 15 +- plugins/inputs/all/all.go | 1 + plugins/inputs/amd_rocm_smi/README.md | 58 ++++ plugins/inputs/amd_rocm_smi/amd_rocm_smi.go | 294 ++++++++++++++++++ .../inputs/amd_rocm_smi/amd_rocm_smi_test.go | 90 ++++++ .../amd_rocm_smi/testdata/vega-10-XT.json | 77 +++++ .../testdata/vega-20-WKS-GL-XE.json | 165 ++++++++++ 7 files changed, 696 insertions(+), 4 deletions(-) create mode 100644 plugins/inputs/amd_rocm_smi/README.md create mode 100644 plugins/inputs/amd_rocm_smi/amd_rocm_smi.go create mode 100644 plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go create mode 100644 plugins/inputs/amd_rocm_smi/testdata/vega-10-XT.json create mode 100644 plugins/inputs/amd_rocm_smi/testdata/vega-20-WKS-GL-XE.json diff --git a/etc/telegraf.conf b/etc/telegraf.conf index c49761c94..43b1f8f3a 100644 --- a/etc/telegraf.conf +++ b/etc/telegraf.conf @@ -1870,7 +1870,7 @@ # ## Print Warp 10 error body # # print_error_body = false # -# ## Max string error size +# ## Max string error size # # max_string_error_size = 511 # # ## Optional TLS Config @@ -4343,19 +4343,19 @@ # ## List of metrics collected on above servers # ## Each metric consists in a name, a jmx path and either # ## a pass or drop slice attribute. -# ## This collect all heap memory usage metrics. +# ## This collect all heap memory usage metrics. # [[inputs.jolokia.metrics]] # name = "heap_memory_usage" # mbean = "java.lang:type=Memory" # attribute = "HeapMemoryUsage" # -# ## This collect thread counts metrics. +# ## This collect thread counts metrics. # [[inputs.jolokia.metrics]] # name = "thread_count" # mbean = "java.lang:type=Threading" # attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" # -# ## This collect number of class loaded/unloaded counts metrics. 
+# ## This collect number of class loaded/unloaded counts metrics. # [[inputs.jolokia.metrics]] # name = "class_count" # mbean = "java.lang:type=ClassLoading" @@ -5785,6 +5785,13 @@ # # Specify a list of one or more riak http servers # servers = ["http://localhost:8098"] +# # Query statistics from AMD Graphics cards using rocm-smi binary +# [[inputs.amd_rocm_smi]] +# ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath +# # bin_path = "/opt/rocm/bin/rocm-smi" +# +# ## Optional: timeout for GPU polling +# # timeout = "5s" # # Read API usage and limits for a Salesforce organisation # [[inputs.salesforce]] diff --git a/plugins/inputs/all/all.go b/plugins/inputs/all/all.go index 350a8cca0..781e04e60 100644 --- a/plugins/inputs/all/all.go +++ b/plugins/inputs/all/all.go @@ -5,6 +5,7 @@ import ( _ "github.com/influxdata/telegraf/plugins/inputs/activemq" _ "github.com/influxdata/telegraf/plugins/inputs/aerospike" _ "github.com/influxdata/telegraf/plugins/inputs/aliyuncms" + _ "github.com/influxdata/telegraf/plugins/inputs/amd_rocm_smi" _ "github.com/influxdata/telegraf/plugins/inputs/amqp_consumer" _ "github.com/influxdata/telegraf/plugins/inputs/apache" _ "github.com/influxdata/telegraf/plugins/inputs/apcupsd" diff --git a/plugins/inputs/amd_rocm_smi/README.md b/plugins/inputs/amd_rocm_smi/README.md new file mode 100644 index 000000000..89a5b0630 --- /dev/null +++ b/plugins/inputs/amd_rocm_smi/README.md @@ -0,0 +1,58 @@ +# ROCm System Management Interface (SMI) Input Plugin + +This plugin uses a query on the [`rocm-smi`](https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools) binary to pull GPU stats including memory and GPU usage, temperatures and other. 
+ +### Configuration + +```toml +# Pulls statistics from AMD GPUs attached to the host +[[inputs.amd_rocm_smi]] + ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath + # bin_path = "/opt/rocm/bin/rocm-smi" + + ## Optional: timeout for GPU polling + # timeout = "5s" +``` + +### Metrics +- measurement: `amd_rocm_smi` + - tags + - `name` (entry name assigned by rocm-smi executable) + - `gpu_id` (id of the GPU according to rocm-smi) + - `gpu_unique_id` (unique id of the GPU) + + - fields + - `driver_version` (integer) + - `fan_speed`(integer) + - `memory_total`(integer B) + - `memory_used`(integer B) + - `memory_free`(integer B) + - `temperature_sensor_edge` (float, Celsius) + - `temperature_sensor_junction` (float, Celsius) + - `temperature_sensor_memory` (float, Celsius) + - `utilization_gpu` (integer, percentage) + - `utilization_memory` (integer, percentage) + - `clocks_current_sm` (integer, Mhz) + - `clocks_current_memory` (integer, Mhz) + - `power_draw` (float, Watt) + +### Troubleshooting +Check the full output by running the `rocm-smi` binary manually. + +Linux: +```sh +rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json +``` +Please include the output of this command if opening a GitHub issue, together with the ROCm version. 
+### Example Output +``` +amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=28,temperature_sensor_junction=29,temperature_sensor_memory=92,utilization_gpu=0i 1630572551000000000 +amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=29,temperature_sensor_junction=30,temperature_sensor_memory=91,utilization_gpu=0i 1630572701000000000 +amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=29,temperature_sensor_junction=29,temperature_sensor_memory=92,utilization_gpu=0i 1630572749000000000 +``` +### Limitations and notices +Please notice that this plugin has been developed and tested on a limited number of versions and small set of GPUs. Currently the latest ROCm version tested is 4.3.0. +Notice that depending on the device and driver versions the amount of information provided by `rocm-smi` can vary so that some fields would start/stop appearing in the metrics upon updates. +The `rocm-smi` JSON output is not perfectly homogeneous and is possibly changing in the future, hence parsing and unmarshaling can start failing upon updating ROCm. + +Inspired by the current state of the art of the `nvidia-smi` plugin. 
diff --git a/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go b/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go new file mode 100644 index 000000000..7fdd32f46 --- /dev/null +++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go @@ -0,0 +1,294 @@ +package amd_rocm_smi + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "strconv" + "strings" + "time" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/config" + "github.com/influxdata/telegraf/internal" + "github.com/influxdata/telegraf/plugins/inputs" +) + +const measurement = "amd_rocm_smi" + +type ROCmSMI struct { + BinPath string + Timeout config.Duration +} + +// Description returns the description of the ROCmSMI plugin +func (rsmi *ROCmSMI) Description() string { + return "Query statistics from AMD Graphics cards using rocm-smi binary" +} + +var ROCmSMIConfig = ` +## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath +# bin_path = "/opt/rocm/bin/rocm-smi" + +## Optional: timeout for GPU polling +# timeout = "5s" +` + +// SampleConfig returns the sample configuration for the ROCmSMI plugin +func (rsmi *ROCmSMI) SampleConfig() string { + return ROCmSMIConfig +} + +// Gather implements the telegraf interface +func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error { + if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) { + return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath) + } + + data, err := rsmi.pollROCmSMI() + if err != nil { + return err + } + + err = gatherROCmSMI(data, acc) + if err != nil { + return err + } + + return nil +} + +func init() { + inputs.Add("amd_rocm_smi", func() telegraf.Input { + return &ROCmSMI{ + BinPath: "/opt/rocm/bin/rocm-smi", + Timeout: config.Duration(5 * time.Second), + } + }) +} + +func (rsmi *ROCmSMI) pollROCmSMI() ([]byte, error) { + // Construct and execute metrics query, there currently exist (ROCm v4.3.x) a "-a" option + // that does not provide all the information, so each needed 
parameter is set manually + cmd := exec.Command(rsmi.BinPath, + "-o", + "-l", + "-m", + "-M", + "-g", + "-c", + "-t", + "-u", + "-i", + "-f", + "-p", + "-P", + "-s", + "-S", + "-v", + "--showreplaycount", + "--showpids", + "--showdriverversion", + "--showmemvendor", + "--showfwinfo", + "--showproductname", + "--showserial", + "--showuniqueid", + "--showbus", + "--showpendingpages", + "--showpagesinfo", + "--showmeminfo", + "all", + "--showretiredpages", + "--showunreservablepages", + "--showmemuse", + "--showvoltage", + "--showtopo", + "--showtopoweight", + "--showtopohops", + "--showtopotype", + "--showtoponuma", + "--json") + + ret, _ := internal.StdOutputTimeout(cmd, + time.Duration(rsmi.Timeout)) + return ret, nil +} + +func gatherROCmSMI(ret []byte, acc telegraf.Accumulator) error { + var gpus map[string]GPU + var sys map[string]sysInfo + + err1 := json.Unmarshal(ret, &gpus) + if err1 != nil { + return err1 + } + + err2 := json.Unmarshal(ret, &sys) + if err2 != nil { + return err2 + } + + metrics := genTagsFields(gpus, sys) + for _, metric := range metrics { + acc.AddFields(measurement, metric.fields, metric.tags) + } + + return nil +} + +type metric struct { + tags map[string]string + fields map[string]interface{} +} + +func genTagsFields(gpus map[string]GPU, system map[string]sysInfo) []metric { + metrics := []metric{} + for cardID, payload := range gpus { + if strings.Contains(cardID, "card") { + tags := map[string]string{ + "name": cardID, + } + fields := map[string]interface{}{} + + totVRAM, _ := strconv.ParseInt(payload.GpuVRAMTotalMemory, 10, 64) + usdVRAM, _ := strconv.ParseInt(payload.GpuVRAMTotalUsedMemory, 10, 64) + strFree := strconv.FormatInt(totVRAM-usdVRAM, 10) + + setTagIfUsed(tags, "gpu_id", payload.GpuID) + setTagIfUsed(tags, "gpu_unique_id", payload.GpuUniqueID) + + setIfUsed("int", fields, "driver_version", strings.Replace(system["system"].DriverVersion, ".", "", -1)) + setIfUsed("int", fields, "fan_speed", payload.GpuFanSpeedPercentage) + 
setIfUsed("int64", fields, "memory_total", payload.GpuVRAMTotalMemory) + setIfUsed("int64", fields, "memory_used", payload.GpuVRAMTotalUsedMemory) + setIfUsed("int64", fields, "memory_free", strFree) + setIfUsed("float", fields, "temperature_sensor_edge", payload.GpuTemperatureSensorEdge) + setIfUsed("float", fields, "temperature_sensor_junction", payload.GpuTemperatureSensorJunction) + setIfUsed("float", fields, "temperature_sensor_memory", payload.GpuTemperatureSensorMemory) + setIfUsed("int", fields, "utilization_gpu", payload.GpuUsePercentage) + setIfUsed("int", fields, "utilization_memory", payload.GpuMemoryUsePercentage) + setIfUsed("int", fields, "clocks_current_sm", strings.Trim(payload.GpuSclkClockSpeed, "(Mhz)")) + setIfUsed("int", fields, "clocks_current_memory", strings.Trim(payload.GpuMclkClockSpeed, "(Mhz)")) + setIfUsed("float", fields, "power_draw", payload.GpuAveragePower) + + metrics = append(metrics, metric{tags, fields}) + } + } + return metrics +} + +func setTagIfUsed(m map[string]string, k, v string) { + if v != "" { + m[k] = v + } +} + +func setIfUsed(t string, m map[string]interface{}, k, v string) { + vals := strings.Fields(v) + if len(vals) < 1 { + return + } + + val := vals[0] + + switch t { + case "float": + if val != "" { + f, err := strconv.ParseFloat(val, 64) + if err == nil { + m[k] = f + } + } + case "int": + if val != "" { + i, err := strconv.Atoi(val) + if err == nil { + m[k] = i + } + } + case "int64": + if val != "" { + i, err := strconv.ParseInt(val, 10, 64) + if err == nil { + m[k] = i + } + } + case "str": + if val != "" { + m[k] = val + } + } +} + +type sysInfo struct { + DriverVersion string `json:"Driver version"` +} + +type GPU struct { + GpuID string `json:"GPU ID"` + GpuUniqueID string `json:"Unique ID"` + GpuVBIOSVersion string `json:"VBIOS version"` + GpuTemperatureSensorEdge string `json:"Temperature (Sensor edge) (C)"` + GpuTemperatureSensorJunction string `json:"Temperature (Sensor junction) (C)"` + 
GpuTemperatureSensorMemory string `json:"Temperature (Sensor memory) (C)"` + GpuDcefClkClockSpeed string `json:"dcefclk clock speed"` + GpuDcefClkClockLevel string `json:"dcefclk clock level"` + GpuFclkClockSpeed string `json:"fclk clock speed"` + GpuFclkClockLevel string `json:"fclk clock level"` + GpuMclkClockSpeed string `json:"mclk clock speed:"` + GpuMclkClockLevel string `json:"mclk clock level:"` + GpuSclkClockSpeed string `json:"sclk clock speed:"` + GpuSclkClockLevel string `json:"sclk clock level:"` + GpuSocclkClockSpeed string `json:"socclk clock speed"` + GpuSocclkClockLevel string `json:"socclk clock level"` + GpuPcieClock string `json:"pcie clock level"` + GpuFanSpeedLevel string `json:"Fan speed (level)"` + GpuFanSpeedPercentage string `json:"Fan speed (%)"` + GpuFanRPM string `json:"Fan RPM"` + GpuPerformanceLevel string `json:"Performance Level"` + GpuOverdrive string `json:"GPU OverDrive value (%)"` + GpuMaxPower string `json:"Max Graphics Package Power (W)"` + GpuAveragePower string `json:"Average Graphics Package Power (W)"` + GpuUsePercentage string `json:"GPU use (%)"` + GpuMemoryUsePercentage string `json:"GPU memory use (%)"` + GpuMemoryVendor string `json:"GPU memory vendor"` + GpuPCIeReplay string `json:"PCIe Replay Count"` + GpuSerialNumber string `json:"Serial Number"` + GpuVoltagemV string `json:"Voltage (mV)"` + GpuPCIBus string `json:"PCI Bus"` + GpuASDDirmware string `json:"ASD firmware version"` + GpuCEFirmware string `json:"CE firmware version"` + GpuDMCUFirmware string `json:"DMCU firmware version"` + GpuMCFirmware string `json:"MC firmware version"` + GpuMEFirmware string `json:"ME firmware version"` + GpuMECFirmware string `json:"MEC firmware version"` + GpuMEC2Firmware string `json:"MEC2 firmware version"` + GpuPFPFirmware string `json:"PFP firmware version"` + GpuRLCFirmware string `json:"RLC firmware version"` + GpuRLCSRLC string `json:"RLC SRLC firmware version"` + GpuRLCSRLG string `json:"RLC SRLG firmware version"` + 
GpuRLCSRLS string `json:"RLC SRLS firmware version"` + GpuSDMAFirmware string `json:"SDMA firmware version"` + GpuSDMA2Firmware string `json:"SDMA2 firmware version"` + GpuSMCFirmware string `json:"SMC firmware version"` + GpuSOSFirmware string `json:"SOS firmware version"` + GpuTARAS string `json:"TA RAS firmware version"` + GpuTAXGMI string `json:"TA XGMI firmware version"` + GpuUVDFirmware string `json:"UVD firmware version"` + GpuVCEFirmware string `json:"VCE firmware version"` + GpuVCNFirmware string `json:"VCN firmware version"` + GpuCardSeries string `json:"Card series"` + GpuCardModel string `json:"Card model"` + GpuCardVendor string `json:"Card vendor"` + GpuCardSKU string `json:"Card SKU"` + GpuNUMANode string `json:"(Topology) Numa Node"` + GpuNUMAAffinity string `json:"(Topology) Numa Affinity"` + GpuVisVRAMTotalMemory string `json:"VIS_VRAM Total Memory (B)"` + GpuVisVRAMTotalUsedMemory string `json:"VIS_VRAM Total Used Memory (B)"` + GpuVRAMTotalMemory string `json:"VRAM Total Memory (B)"` + GpuVRAMTotalUsedMemory string `json:"VRAM Total Used Memory (B)"` + GpuGTTTotalMemory string `json:"GTT Total Memory (B)"` + GpuGTTTotalUsedMemory string `json:"GTT Total Used Memory (B)"` +} diff --git a/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go b/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go new file mode 100644 index 000000000..7893760bd --- /dev/null +++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go @@ -0,0 +1,90 @@ +package amd_rocm_smi + +import ( + "io/ioutil" + "path/filepath" + "testing" + "time" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/require" +) + +func TestGatherValidJSON(t *testing.T) { + tests := []struct { + name string + filename string + expected []telegraf.Metric + }{ + { + name: "Vega 10 XT", + filename: "vega-10-XT.json", + expected: []telegraf.Metric{ + testutil.MustMetric( + "amd_rocm_smi", + map[string]string{ + "gpu_id": "0x6861", + "gpu_unique_id": 
"0x2150e7d042a1124", + "name": "card0", + }, + map[string]interface{}{ + "driver_version": 5925, + "fan_speed": 13, + "memory_total": int64(17163091968), + "memory_used": int64(17776640), + "memory_free": int64(17145315328), + "temperature_sensor_edge": 39.0, + "temperature_sensor_junction": 40.0, + "temperature_sensor_memory": 92.0, + "utilization_gpu": 0, + "clocks_current_sm": 1269, + "clocks_current_memory": 167, + "power_draw": 15.0, + }, + time.Unix(0, 0)), + }, + }, + { + name: "Vega 20 WKS GL-XE [Radeon Pro VII]", + filename: "vega-20-WKS-GL-XE.json", + expected: []telegraf.Metric{ + testutil.MustMetric( + "amd_rocm_smi", + map[string]string{ + "gpu_id": "0x66a1", + "gpu_unique_id": "0x2f048617326b1ea", + "name": "card0", + }, + map[string]interface{}{ + "driver_version": 5917, + "fan_speed": 0, + "memory_total": int64(34342961152), + "memory_used": int64(10850304), + "memory_free": int64(34332110848), + "temperature_sensor_edge": 36.0, + "temperature_sensor_junction": 38.0, + "temperature_sensor_memory": 35.0, + "utilization_gpu": 0, + "utilization_memory": 0, + "clocks_current_sm": 1725, + "clocks_current_memory": 1000, + "power_draw": 26.0, + }, + time.Unix(0, 0)), + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var acc testutil.Accumulator + octets, err := ioutil.ReadFile(filepath.Join("testdata", tt.filename)) + require.NoError(t, err) + + err = gatherROCmSMI(octets, &acc) + require.NoError(t, err) + + testutil.RequireMetricsEqual(t, tt.expected, acc.GetTelegrafMetrics(), testutil.IgnoreTime()) + }) + } +} diff --git a/plugins/inputs/amd_rocm_smi/testdata/vega-10-XT.json b/plugins/inputs/amd_rocm_smi/testdata/vega-10-XT.json new file mode 100644 index 000000000..c4d51f525 --- /dev/null +++ b/plugins/inputs/amd_rocm_smi/testdata/vega-10-XT.json @@ -0,0 +1,77 @@ +{ + "card0": { + "GPU ID": "0x6861", + "Unique ID": "0x2150e7d042a1124", + "VBIOS version": "113-D0510100-106", + "Temperature (Sensor edge) (C)": "39.0", + 
"Temperature (Sensor junction) (C)": "40.0", + "Temperature (Sensor memory) (C)": "92.0", + "dcefclk clock speed:": "(600Mhz)", + "dcefclk clock level:": "0", + "mclk clock speed:": "(167Mhz)", + "mclk clock level:": "0", + "sclk clock speed:": "(1269Mhz)", + "sclk clock level:": "3", + "socclk clock speed:": "(960Mhz)", + "socclk clock level:": "3", + "pcie clock level": "1 (8.0GT/s x16)", + "sclk clock level": "3 (1269Mhz)", + "Fan speed (level)": "33", + "Fan speed (%)": "13", + "Fan RPM": "682", + "Performance Level": "auto", + "GPU OverDrive value (%)": "0", + "GPU Memory OverDrive value (%)": "0", + "Max Graphics Package Power (W)": "170.0", + "Average Graphics Package Power (W)": "15.0", + "0": "8.0GT/s x16", + "1": "8.0GT/s x16 *", + "2": "847Mhz", + "3": "960Mhz *", + "4": "1028Mhz", + "5": "1107Mhz", + "6": "1440Mhz", + "7": "1500Mhz", + "GPU use (%)": "0", + "GPU memory vendor": "samsung", + "PCIe Replay Count": "0", + "Serial Number": "N/A", + "Voltage (mV)": "906", + "PCI Bus": "0000:04:00.0", + "VRAM Total Memory (B)": "17163091968", + "VRAM Total Used Memory (B)": "17776640", + "VIS_VRAM Total Memory (B)": "268435456", + "VIS_VRAM Total Used Memory (B)": "13557760", + "GTT Total Memory (B)": "17163091968", + "GTT Total Used Memory (B)": "25608192", + "ASD firmware version": "553648152", + "CE firmware version": "79", + "DMCU firmware version": "0", + "MC firmware version": "0", + "ME firmware version": "163", + "MEC firmware version": "432", + "MEC2 firmware version": "432", + "PFP firmware version": "186", + "RLC firmware version": "93", + "RLC SRLC firmware version": "0", + "RLC SRLG firmware version": "0", + "RLC SRLS firmware version": "0", + "SDMA firmware version": "430", + "SDMA2 firmware version": "430", + "SMC firmware version": "00.28.54.00", + "SOS firmware version": "0x0008015d", + "TA RAS firmware version": "00.00.00.00", + "TA XGMI firmware version": "00.00.00.00", + "UVD firmware version": "0x422b1100", + "VCE firmware version": 
"0x39060400", + "VCN firmware version": "0x00000000", + "Card model": "0xc1e", + "Card vendor": "Advanced Micro Devices, Inc. [AMD/ATI]", + "Card SKU": "D05101", + "(Topology) Numa Node": "0", + "(Topology) Numa Affinity": "0" + }, + "system": { + "Driver version": "5.9.25" + } +} \ No newline at end of file diff --git a/plugins/inputs/amd_rocm_smi/testdata/vega-20-WKS-GL-XE.json b/plugins/inputs/amd_rocm_smi/testdata/vega-20-WKS-GL-XE.json new file mode 100644 index 000000000..771565a60 --- /dev/null +++ b/plugins/inputs/amd_rocm_smi/testdata/vega-20-WKS-GL-XE.json @@ -0,0 +1,165 @@ +{ + "card0": { + "GPU ID": "0x66a1", + "Unique ID": "0x2f048617326b1ea", + "VBIOS version": "113-D1631700-111", + "Temperature (Sensor edge) (C)": "36.0", + "Temperature (Sensor junction) (C)": "38.0", + "Temperature (Sensor memory) (C)": "35.0", + "dcefclk clock speed:": "(357Mhz)", + "dcefclk clock level:": "0", + "fclk clock speed:": "(1080Mhz)", + "fclk clock level:": "6", + "mclk clock speed:": "(1000Mhz)", + "mclk clock level:": "2", + "sclk clock speed:": "(1725Mhz)", + "sclk clock level:": "8", + "socclk clock speed:": "(971Mhz)", + "socclk clock level:": "7", + "pcie clock level": "1 (16.0GT/s x16)", + "sclk clock level": "8 (1725Mhz)", + "Fan speed (level)": "0", + "Fan speed (%)": "0", + "Fan RPM": "0", + "Performance Level": "high", + "GPU OverDrive value (%)": "0", + "Max Graphics Package Power (W)": "225.0", + "Average Graphics Package Power (W)": "26.0", + "0": "2.5GT/s x16", + "1": "16.0GT/s x16 *", + "2": "566Mhz", + "3": "618Mhz", + "4": "680Mhz", + "5": "755Mhz", + "6": "850Mhz", + "7": "971Mhz *", + "8": "1725Mhz *", + "GPU use (%)": "0", + "GPU memory use (%)": "0", + "GPU memory vendor": "samsung", + "PCIe Replay Count": "0", + "Serial Number": "692024000810", + "Voltage (mV)": "1000", + "PCI Bus": "0000:63:00.0", + "VRAM Total Memory (B)": "34342961152", + "VRAM Total Used Memory (B)": "10850304", + "VIS_VRAM Total Memory (B)": "34342961152", + "VIS_VRAM Total 
Used Memory (B)": "10850304", + "GTT Total Memory (B)": "54974742528", + "GTT Total Used Memory (B)": "11591680", + "ASD firmware version": "553648199", + "CE firmware version": "79", + "DMCU firmware version": "0", + "MC firmware version": "0", + "ME firmware version": "164", + "MEC firmware version": "448", + "MEC2 firmware version": "448", + "PFP firmware version": "188", + "RLC firmware version": "50", + "RLC SRLC firmware version": "1", + "RLC SRLG firmware version": "1", + "RLC SRLS firmware version": "1", + "SDMA firmware version": "144", + "SDMA2 firmware version": "144", + "SMC firmware version": "00.40.59.00", + "SOS firmware version": "0x00080b67", + "TA RAS firmware version": "27.00.01.36", + "TA XGMI firmware version": "32.00.00.02", + "UVD firmware version": "0x42002b13", + "VCE firmware version": "0x39060400", + "VCN firmware version": "0x00000000", + "Card series": "Radeon Instinct MI50 32GB", + "Card model": "0x834", + "Card vendor": "Advanced Micro Devices, Inc. [AMD/ATI]", + "Card SKU": "D16317", + "(Topology) Numa Node": "0", + "(Topology) Numa Affinity": "0" + }, + "system": { + "Driver version": "5.9.17", + "(Topology) Weight between DRM devices 0 and 1": "40", + "(Topology) Weight between DRM devices 0 and 2": "40", + "(Topology) Weight between DRM devices 0 and 3": "40", + "(Topology) Weight between DRM devices 0 and 4": "72", + "(Topology) Weight between DRM devices 0 and 5": "72", + "(Topology) Weight between DRM devices 0 and 6": "72", + "(Topology) Weight between DRM devices 0 and 7": "72", + "(Topology) Weight between DRM devices 1 and 2": "40", + "(Topology) Weight between DRM devices 1 and 3": "40", + "(Topology) Weight between DRM devices 1 and 4": "72", + "(Topology) Weight between DRM devices 1 and 5": "72", + "(Topology) Weight between DRM devices 1 and 6": "72", + "(Topology) Weight between DRM devices 1 and 7": "72", + "(Topology) Weight between DRM devices 2 and 3": "40", + "(Topology) Weight between DRM devices 2 and 4": "72", 
+ "(Topology) Weight between DRM devices 2 and 5": "72", + "(Topology) Weight between DRM devices 2 and 6": "72", + "(Topology) Weight between DRM devices 2 and 7": "72", + "(Topology) Weight between DRM devices 3 and 4": "72", + "(Topology) Weight between DRM devices 3 and 5": "72", + "(Topology) Weight between DRM devices 3 and 6": "72", + "(Topology) Weight between DRM devices 3 and 7": "72", + "(Topology) Weight between DRM devices 4 and 5": "40", + "(Topology) Weight between DRM devices 4 and 6": "40", + "(Topology) Weight between DRM devices 4 and 7": "40", + "(Topology) Weight between DRM devices 5 and 6": "40", + "(Topology) Weight between DRM devices 5 and 7": "40", + "(Topology) Weight between DRM devices 6 and 7": "40", + "(Topology) Hops between DRM devices 0 and 1": "2", + "(Topology) Hops between DRM devices 0 and 2": "2", + "(Topology) Hops between DRM devices 0 and 3": "2", + "(Topology) Hops between DRM devices 0 and 4": "3", + "(Topology) Hops between DRM devices 0 and 5": "3", + "(Topology) Hops between DRM devices 0 and 6": "3", + "(Topology) Hops between DRM devices 0 and 7": "3", + "(Topology) Hops between DRM devices 1 and 2": "2", + "(Topology) Hops between DRM devices 1 and 3": "2", + "(Topology) Hops between DRM devices 1 and 4": "3", + "(Topology) Hops between DRM devices 1 and 5": "3", + "(Topology) Hops between DRM devices 1 and 6": "3", + "(Topology) Hops between DRM devices 1 and 7": "3", + "(Topology) Hops between DRM devices 2 and 3": "2", + "(Topology) Hops between DRM devices 2 and 4": "3", + "(Topology) Hops between DRM devices 2 and 5": "3", + "(Topology) Hops between DRM devices 2 and 6": "3", + "(Topology) Hops between DRM devices 2 and 7": "3", + "(Topology) Hops between DRM devices 3 and 4": "3", + "(Topology) Hops between DRM devices 3 and 5": "3", + "(Topology) Hops between DRM devices 3 and 6": "3", + "(Topology) Hops between DRM devices 3 and 7": "3", + "(Topology) Hops between DRM devices 4 and 5": "2", + "(Topology) 
Hops between DRM devices 4 and 6": "2", + "(Topology) Hops between DRM devices 4 and 7": "2", + "(Topology) Hops between DRM devices 5 and 6": "2", + "(Topology) Hops between DRM devices 5 and 7": "2", + "(Topology) Hops between DRM devices 6 and 7": "2", + "(Topology) Link type between DRM devices 0 and 1": "PCIE", + "(Topology) Link type between DRM devices 0 and 2": "PCIE", + "(Topology) Link type between DRM devices 0 and 3": "PCIE", + "(Topology) Link type between DRM devices 0 and 4": "PCIE", + "(Topology) Link type between DRM devices 0 and 5": "PCIE", + "(Topology) Link type between DRM devices 0 and 6": "PCIE", + "(Topology) Link type between DRM devices 0 and 7": "PCIE", + "(Topology) Link type between DRM devices 1 and 2": "PCIE", + "(Topology) Link type between DRM devices 1 and 3": "PCIE", + "(Topology) Link type between DRM devices 1 and 4": "PCIE", + "(Topology) Link type between DRM devices 1 and 5": "PCIE", + "(Topology) Link type between DRM devices 1 and 6": "PCIE", + "(Topology) Link type between DRM devices 1 and 7": "PCIE", + "(Topology) Link type between DRM devices 2 and 3": "PCIE", + "(Topology) Link type between DRM devices 2 and 4": "PCIE", + "(Topology) Link type between DRM devices 2 and 5": "PCIE", + "(Topology) Link type between DRM devices 2 and 6": "PCIE", + "(Topology) Link type between DRM devices 2 and 7": "PCIE", + "(Topology) Link type between DRM devices 3 and 4": "PCIE", + "(Topology) Link type between DRM devices 3 and 5": "PCIE", + "(Topology) Link type between DRM devices 3 and 6": "PCIE", + "(Topology) Link type between DRM devices 3 and 7": "PCIE", + "(Topology) Link type between DRM devices 4 and 5": "PCIE", + "(Topology) Link type between DRM devices 4 and 6": "PCIE", + "(Topology) Link type between DRM devices 4 and 7": "PCIE", + "(Topology) Link type between DRM devices 5 and 6": "PCIE", + "(Topology) Link type between DRM devices 5 and 7": "PCIE", + "(Topology) Link type between DRM devices 6 and 7": "PCIE" + } +} \ 
No newline at end of file