feat: Add rocm_smi input to monitor AMD GPUs (#9602)

This commit is contained in:
Matteo Concas 2021-09-02 16:57:17 +02:00 committed by GitHub
parent 514a942a6c
commit 04c3e9bb24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 696 additions and 4 deletions

View File

@ -1870,7 +1870,7 @@
# ## Print Warp 10 error body # ## Print Warp 10 error body
# # print_error_body = false # # print_error_body = false
# #
# ## Max string error size # ## Max string error size
# # max_string_error_size = 511 # # max_string_error_size = 511
# #
# ## Optional TLS Config # ## Optional TLS Config
@ -4343,19 +4343,19 @@
# ## List of metrics collected on above servers # ## List of metrics collected on above servers
# ## Each metric consists in a name, a jmx path and either # ## Each metric consists in a name, a jmx path and either
# ## a pass or drop slice attribute. # ## a pass or drop slice attribute.
# ## This collect all heap memory usage metrics. # ## This collect all heap memory usage metrics.
# [[inputs.jolokia.metrics]] # [[inputs.jolokia.metrics]]
# name = "heap_memory_usage" # name = "heap_memory_usage"
# mbean = "java.lang:type=Memory" # mbean = "java.lang:type=Memory"
# attribute = "HeapMemoryUsage" # attribute = "HeapMemoryUsage"
# #
# ## This collect thread counts metrics. # ## This collect thread counts metrics.
# [[inputs.jolokia.metrics]] # [[inputs.jolokia.metrics]]
# name = "thread_count" # name = "thread_count"
# mbean = "java.lang:type=Threading" # mbean = "java.lang:type=Threading"
# attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" # attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount"
# #
# ## This collect number of class loaded/unloaded counts metrics. # ## This collect number of class loaded/unloaded counts metrics.
# [[inputs.jolokia.metrics]] # [[inputs.jolokia.metrics]]
# name = "class_count" # name = "class_count"
# mbean = "java.lang:type=ClassLoading" # mbean = "java.lang:type=ClassLoading"
@ -5785,6 +5785,13 @@
# # Specify a list of one or more riak http servers # # Specify a list of one or more riak http servers
# servers = ["http://localhost:8098"] # servers = ["http://localhost:8098"]
# # Query statistics from AMD Graphics cards using rocm-smi binary
# [[inputs.amd_rocm_smi]]
# ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
# # bin_path = "/opt/rocm/bin/rocm-smi"
#
# ## Optional: timeout for GPU polling
# # timeout = "5s"
# # Read API usage and limits for a Salesforce organisation # # Read API usage and limits for a Salesforce organisation
# [[inputs.salesforce]] # [[inputs.salesforce]]

View File

@ -5,6 +5,7 @@ import (
_ "github.com/influxdata/telegraf/plugins/inputs/activemq" _ "github.com/influxdata/telegraf/plugins/inputs/activemq"
_ "github.com/influxdata/telegraf/plugins/inputs/aerospike" _ "github.com/influxdata/telegraf/plugins/inputs/aerospike"
_ "github.com/influxdata/telegraf/plugins/inputs/aliyuncms" _ "github.com/influxdata/telegraf/plugins/inputs/aliyuncms"
_ "github.com/influxdata/telegraf/plugins/inputs/amd_rocm_smi"
_ "github.com/influxdata/telegraf/plugins/inputs/amqp_consumer" _ "github.com/influxdata/telegraf/plugins/inputs/amqp_consumer"
_ "github.com/influxdata/telegraf/plugins/inputs/apache" _ "github.com/influxdata/telegraf/plugins/inputs/apache"
_ "github.com/influxdata/telegraf/plugins/inputs/apcupsd" _ "github.com/influxdata/telegraf/plugins/inputs/apcupsd"

View File

@ -0,0 +1,58 @@
# ROCm System Management Interface (SMI) Input Plugin
This plugin uses a query on the [`rocm-smi`](https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools) binary to pull GPU stats including memory and GPU usage, temperatures and more.
### Configuration
```toml
# Query statistics from AMD Graphics cards using rocm-smi binary
[[inputs.amd_rocm_smi]]
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
# bin_path = "/opt/rocm/bin/rocm-smi"
## Optional: timeout for GPU polling
# timeout = "5s"
```
### Metrics
- measurement: `amd_rocm_smi`
- tags
- `name` (entry name assigned by rocm-smi executable)
- `gpu_id` (id of the GPU according to rocm-smi)
- `gpu_unique_id` (unique id of the GPU)
- fields
- `driver_version` (integer)
- `fan_speed` (integer, percentage)
- `memory_total` (integer, B)
- `memory_used` (integer, B)
- `memory_free` (integer, B)
- `temperature_sensor_edge` (float, Celsius)
- `temperature_sensor_junction` (float, Celsius)
- `temperature_sensor_memory` (float, Celsius)
- `utilization_gpu` (integer, percentage)
- `utilization_memory` (integer, percentage)
- `clocks_current_sm` (integer, MHz)
- `clocks_current_memory` (integer, MHz)
- `power_draw` (float, Watt)
### Troubleshooting
Check the full output by running `rocm-smi` binary manually.
Linux:
```sh
rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json
```
Please include the output of this command if opening a GitHub issue, together with ROCm version.
### Example Output
```
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=28,temperature_sensor_junction=29,temperature_sensor_memory=92,utilization_gpu=0i 1630572551000000000
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=29,temperature_sensor_junction=30,temperature_sensor_memory=91,utilization_gpu=0i 1630572701000000000
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=29,temperature_sensor_junction=29,temperature_sensor_memory=92,utilization_gpu=0i 1630572749000000000
```
### Limitations and notices
Please note that this plugin has been developed and tested on a limited number of ROCm versions and a small set of GPUs. Currently the latest ROCm version tested is 4.3.0.
Depending on the device and driver versions, the amount of information provided by `rocm-smi` can vary, so some fields may start or stop appearing in the metrics after updates.
The `rocm-smi` JSON output is not perfectly homogeneous and may change in the future, so parsing and unmarshaling can start failing after a ROCm update.
Inspired by the current state of the art of the `nvidia-smi` plugin.

View File

@ -0,0 +1,294 @@
package amd_rocm_smi
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"strconv"
"strings"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs"
)
// measurement is the measurement name passed to acc.AddFields for every metric
// this plugin emits.
const measurement = "amd_rocm_smi"
// ROCmSMI holds the settings of the amd_rocm_smi input plugin.
type ROCmSMI struct {
// BinPath is the path of the rocm-smi executable; Gather checks that it exists
// and pollROCmSMI executes it.
BinPath string
// Timeout is the duration handed to internal.StdOutputTimeout for one rocm-smi run.
Timeout config.Duration
}
// Description gives the one-line summary of the ROCmSMI plugin
func (*ROCmSMI) Description() string {
	const summary = "Query statistics from AMD Graphics cards using rocm-smi binary"
	return summary
}
// ROCmSMIConfig is the sample configuration text returned by SampleConfig.
var ROCmSMIConfig = `
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
# bin_path = "/opt/rocm/bin/rocm-smi"
## Optional: timeout for GPU polling
# timeout = "5s"
`
// SampleConfig hands back the example configuration snippet (ROCmSMIConfig)
// for the ROCmSMI plugin
func (*ROCmSMI) SampleConfig() string {
	sample := ROCmSMIConfig
	return sample
}
// Gather implements the telegraf interface: it verifies that the configured
// rocm-smi binary exists, runs it, and forwards the decoded metrics to acc.
func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
	// Only a "does not exist" result aborts here; any other stat outcome falls
	// through to the actual invocation.
	_, statErr := os.Stat(rsmi.BinPath)
	if os.IsNotExist(statErr) {
		return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath)
	}

	output, err := rsmi.pollROCmSMI()
	if err != nil {
		return err
	}

	return gatherROCmSMI(output, acc)
}
func init() {
inputs.Add("amd_rocm_smi", func() telegraf.Input {
return &ROCmSMI{
BinPath: "/opt/rocm/bin/rocm-smi",
Timeout: config.Duration(5 * time.Second),
}
})
}
// pollROCmSMI runs the rocm-smi binary at rsmi.BinPath with an explicit list of
// "show" flags plus --json and returns what the command wrote to stdout.
//
// The run is bounded by rsmi.Timeout. Any error reported by
// internal.StdOutputTimeout is returned (wrapped) instead of being discarded, so
// the caller sees the real failure rather than a JSON decoding error on empty
// output.
func (rsmi *ROCmSMI) pollROCmSMI() ([]byte, error) {
	// Construct and execute metrics query, there currently exist (ROCm v4.3.x) a "-a" option
	// that does not provide all the information, so each needed parameter is set manually
	cmd := exec.Command(rsmi.BinPath,
		"-o",
		"-l",
		"-m",
		"-M",
		"-g",
		"-c",
		"-t",
		"-u",
		"-i",
		"-f",
		"-p",
		"-P",
		"-s",
		"-S",
		"-v",
		"--showreplaycount",
		"--showpids",
		"--showdriverversion",
		"--showmemvendor",
		"--showfwinfo",
		"--showproductname",
		"--showserial",
		"--showuniqueid",
		"--showbus",
		"--showpendingpages",
		"--showpagesinfo",
		"--showmeminfo",
		"all",
		"--showretiredpages",
		"--showunreservablepages",
		"--showmemuse",
		"--showvoltage",
		"--showtopo",
		"--showtopoweight",
		"--showtopohops",
		"--showtopotype",
		"--showtoponuma",
		"--json")

	ret, err := internal.StdOutputTimeout(cmd, time.Duration(rsmi.Timeout))
	if err != nil {
		return nil, fmt.Errorf("running %s failed: %w", rsmi.BinPath, err)
	}
	return ret, nil
}
// gatherROCmSMI decodes the JSON document in ret and adds one set of fields per
// entry produced by genTagsFields to acc under the plugin's measurement name.
// It returns the first JSON decoding error, if any.
func gatherROCmSMI(ret []byte, acc telegraf.Accumulator) error {
	var (
		cards  map[string]GPU
		system map[string]sysInfo
	)

	// The same document is decoded twice, once per value type.
	if err := json.Unmarshal(ret, &cards); err != nil {
		return err
	}
	if err := json.Unmarshal(ret, &system); err != nil {
		return err
	}

	for _, m := range genTagsFields(cards, system) {
		acc.AddFields(measurement, m.fields, m.tags)
	}
	return nil
}
// metric pairs the tag set and field set that gatherROCmSMI passes to
// acc.AddFields for one entry.
type metric struct {
// tags are the string tags built by genTagsFields (name, gpu_id, gpu_unique_id).
tags map[string]string
// fields are the typed values filled in through setIfUsed.
fields map[string]interface{}
}
// genTagsFields turns the decoded rocm-smi output into one metric per entry
// whose key contains "card". Entries with other keys (such as "system") are
// skipped.
//
// gpus is the document decoded as GPU payloads; system is the same document
// decoded as sysInfo, from which the "system" entry supplies the driver version.
// Tags and fields are only set when the corresponding value is present and
// parseable (see setTagIfUsed and setIfUsed).
//
// memory_free is derived as total minus used VRAM and is only emitted when both
// figures parse; previously the parse errors were ignored, so a card without
// VRAM figures reported memory_free=0.
func genTagsFields(gpus map[string]GPU, system map[string]sysInfo) []metric {
	metrics := []metric{}

	// Loop-invariant: the driver version with its dots removed, e.g. "5.9.25" -> "5925".
	driverVersion := strings.Replace(system["system"].DriverVersion, ".", "", -1)

	for cardID, payload := range gpus {
		if !strings.Contains(cardID, "card") {
			continue
		}

		tags := map[string]string{
			"name": cardID,
		}
		fields := map[string]interface{}{}

		setTagIfUsed(tags, "gpu_id", payload.GpuID)
		setTagIfUsed(tags, "gpu_unique_id", payload.GpuUniqueID)

		setIfUsed("int", fields, "driver_version", driverVersion)
		setIfUsed("int", fields, "fan_speed", payload.GpuFanSpeedPercentage)
		setIfUsed("int64", fields, "memory_total", payload.GpuVRAMTotalMemory)
		setIfUsed("int64", fields, "memory_used", payload.GpuVRAMTotalUsedMemory)

		// Only report free memory when both operands are real numbers.
		totVRAM, totErr := strconv.ParseInt(payload.GpuVRAMTotalMemory, 10, 64)
		usdVRAM, usdErr := strconv.ParseInt(payload.GpuVRAMTotalUsedMemory, 10, 64)
		if totErr == nil && usdErr == nil {
			setIfUsed("int64", fields, "memory_free", strconv.FormatInt(totVRAM-usdVRAM, 10))
		}

		setIfUsed("float", fields, "temperature_sensor_edge", payload.GpuTemperatureSensorEdge)
		setIfUsed("float", fields, "temperature_sensor_junction", payload.GpuTemperatureSensorJunction)
		setIfUsed("float", fields, "temperature_sensor_memory", payload.GpuTemperatureSensorMemory)
		setIfUsed("int", fields, "utilization_gpu", payload.GpuUsePercentage)
		setIfUsed("int", fields, "utilization_memory", payload.GpuMemoryUsePercentage)
		// The second argument of strings.Trim is a set of characters to strip from
		// both ends, which reduces "(1269Mhz)" to "1269".
		setIfUsed("int", fields, "clocks_current_sm", strings.Trim(payload.GpuSclkClockSpeed, "(Mhz)"))
		setIfUsed("int", fields, "clocks_current_memory", strings.Trim(payload.GpuMclkClockSpeed, "(Mhz)"))
		setIfUsed("float", fields, "power_draw", payload.GpuAveragePower)

		metrics = append(metrics, metric{tags, fields})
	}
	return metrics
}
// setTagIfUsed stores v under key k in m, unless v is the empty string, in which
// case m is left untouched.
func setTagIfUsed(m map[string]string, k, v string) {
	if v == "" {
		return
	}
	m[k] = v
}
// setIfUsed parses the first whitespace-separated token of v according to t and
// stores the result in m under key k.
//
// Supported values of t are "float" (float64), "int" (int), "int64" (int64) and
// "str" (the token itself). Nothing is stored when v contains no token, when
// parsing fails, or when t is any other value.
func setIfUsed(t string, m map[string]interface{}, k, v string) {
	tokens := strings.Fields(v)
	if len(tokens) == 0 {
		return
	}

	// strings.Fields never yields empty tokens, so first is always non-empty.
	first := tokens[0]

	switch t {
	case "float":
		if parsed, err := strconv.ParseFloat(first, 64); err == nil {
			m[k] = parsed
		}
	case "int":
		if parsed, err := strconv.Atoi(first); err == nil {
			m[k] = parsed
		}
	case "int64":
		if parsed, err := strconv.ParseInt(first, 10, 64); err == nil {
			m[k] = parsed
		}
	case "str":
		m[k] = first
	}
}
// sysInfo is the subset of the "system" entry of the rocm-smi JSON output that
// genTagsFields reads.
type sysInfo struct {
// DriverVersion is the raw "Driver version" string, e.g. "5.9.25".
DriverVersion string `json:"Driver version"`
}
type GPU struct {
GpuID string `json:"GPU ID"`
GpuUniqueID string `json:"Unique ID"`
GpuVBIOSVersion string `json:"VBIOS version"`
GpuTemperatureSensorEdge string `json:"Temperature (Sensor edge) (C)"`
GpuTemperatureSensorJunction string `json:"Temperature (Sensor junction) (C)"`
GpuTemperatureSensorMemory string `json:"Temperature (Sensor memory) (C)"`
GpuDcefClkClockSpeed string `json:"dcefclk clock speed"`
GpuDcefClkClockLevel string `json:"dcefclk clock level"`
GpuFclkClockSpeed string `json:"fclk clock speed"`
GpuFclkClockLevel string `json:"fclk clock level"`
GpuMclkClockSpeed string `json:"mclk clock speed:"`
GpuMclkClockLevel string `json:"mclk clock level:"`
GpuSclkClockSpeed string `json:"sclk clock speed:"`
GpuSclkClockLevel string `json:"sclk clock level:"`
GpuSocclkClockSpeed string `json:"socclk clock speed"`
GpuSocclkClockLevel string `json:"socclk clock level"`
GpuPcieClock string `json:"pcie clock level"`
GpuFanSpeedLevel string `json:"Fan speed (level)"`
GpuFanSpeedPercentage string `json:"Fan speed (%)"`
GpuFanRPM string `json:"Fan RPM"`
GpuPerformanceLevel string `json:"Performance Level"`
GpuOverdrive string `json:"GPU OverDrive value (%)"`
GpuMaxPower string `json:"Max Graphics Package Power (W)"`
GpuAveragePower string `json:"Average Graphics Package Power (W)"`
GpuUsePercentage string `json:"GPU use (%)"`
GpuMemoryUsePercentage string `json:"GPU memory use (%)"`
GpuMemoryVendor string `json:"GPU memory vendor"`
GpuPCIeReplay string `json:"PCIe Replay Count"`
GpuSerialNumber string `json:"Serial Number"`
GpuVoltagemV string `json:"Voltage (mV)"`
GpuPCIBus string `json:"PCI Bus"`
GpuASDDirmware string `json:"ASD firmware version"`
GpuCEFirmware string `json:"CE firmware version"`
GpuDMCUFirmware string `json:"DMCU firmware version"`
GpuMCFirmware string `json:"MC firmware version"`
GpuMEFirmware string `json:"ME firmware version"`
GpuMECFirmware string `json:"MEC firmware version"`
GpuMEC2Firmware string `json:"MEC2 firmware version"`
GpuPFPFirmware string `json:"PFP firmware version"`
GpuRLCFirmware string `json:"RLC firmware version"`
GpuRLCSRLC string `json:"RLC SRLC firmware version"`
GpuRLCSRLG string `json:"RLC SRLG firmware version"`
GpuRLCSRLS string `json:"RLC SRLS firmware version"`
GpuSDMAFirmware string `json:"SDMA firmware version"`
GpuSDMA2Firmware string `json:"SDMA2 firmware version"`
GpuSMCFirmware string `json:"SMC firmware version"`
GpuSOSFirmware string `json:"SOS firmware version"`
GpuTARAS string `json:"TA RAS firmware version"`
GpuTAXGMI string `json:"TA XGMI firmware version"`
GpuUVDFirmware string `json:"UVD firmware version"`
GpuVCEFirmware string `json:"VCE firmware version"`
GpuVCNFirmware string `json:"VCN firmware version"`
GpuCardSeries string `json:"Card series"`
GpuCardModel string `json:"Card model"`
GpuCardVendor string `json:"Card vendor"`
GpuCardSKU string `json:"Card SKU"`
GpuNUMANode string `json:"(Topology) Numa Node"`
GpuNUMAAffinity string `json:"(Topology) Numa Affinity"`
GpuVisVRAMTotalMemory string `json:"VIS_VRAM Total Memory (B)"`
GpuVisVRAMTotalUsedMemory string `json:"VIS_VRAM Total Used Memory (B)"`
GpuVRAMTotalMemory string `json:"VRAM Total Memory (B)"`
GpuVRAMTotalUsedMemory string `json:"VRAM Total Used Memory (B)"`
GpuGTTTotalMemory string `json:"GTT Total Memory (B)"`
GpuGTTTotalUsedMemory string `json:"GTT Total Used Memory (B)"`
}

View File

@ -0,0 +1,90 @@
package amd_rocm_smi
import (
"io/ioutil"
"path/filepath"
"testing"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)
// TestGatherValidJSON feeds recorded rocm-smi JSON fixtures from the testdata
// directory through gatherROCmSMI and compares the accumulated metrics with the
// expected ones, ignoring timestamps.
func TestGatherValidJSON(t *testing.T) {
	type fixtureCase struct {
		label   string
		fixture string
		want    []telegraf.Metric
	}

	cases := []fixtureCase{
		{
			label:   "Vega 10 XT",
			fixture: "vega-10-XT.json",
			want: []telegraf.Metric{
				testutil.MustMetric(
					"amd_rocm_smi",
					map[string]string{
						"gpu_id":        "0x6861",
						"gpu_unique_id": "0x2150e7d042a1124",
						"name":          "card0",
					},
					map[string]interface{}{
						"driver_version":              5925,
						"fan_speed":                   13,
						"memory_total":                int64(17163091968),
						"memory_used":                 int64(17776640),
						"memory_free":                 int64(17145315328),
						"temperature_sensor_edge":     39.0,
						"temperature_sensor_junction": 40.0,
						"temperature_sensor_memory":   92.0,
						"utilization_gpu":             0,
						"clocks_current_sm":           1269,
						"clocks_current_memory":       167,
						"power_draw":                  15.0,
					},
					time.Unix(0, 0)),
			},
		},
		{
			label:   "Vega 20 WKS GL-XE [Radeon Pro VII]",
			fixture: "vega-20-WKS-GL-XE.json",
			want: []telegraf.Metric{
				testutil.MustMetric(
					"amd_rocm_smi",
					map[string]string{
						"gpu_id":        "0x66a1",
						"gpu_unique_id": "0x2f048617326b1ea",
						"name":          "card0",
					},
					map[string]interface{}{
						"driver_version":              5917,
						"fan_speed":                   0,
						"memory_total":                int64(34342961152),
						"memory_used":                 int64(10850304),
						"memory_free":                 int64(34332110848),
						"temperature_sensor_edge":     36.0,
						"temperature_sensor_junction": 38.0,
						"temperature_sensor_memory":   35.0,
						"utilization_gpu":             0,
						"utilization_memory":          0,
						"clocks_current_sm":           1725,
						"clocks_current_memory":       1000,
						"power_draw":                  26.0,
					},
					time.Unix(0, 0)),
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			var acc testutil.Accumulator

			// Load the recorded rocm-smi output for this case.
			raw, err := ioutil.ReadFile(filepath.Join("testdata", tc.fixture))
			require.NoError(t, err)

			require.NoError(t, gatherROCmSMI(raw, &acc))

			got := acc.GetTelegrafMetrics()
			testutil.RequireMetricsEqual(t, tc.want, got, testutil.IgnoreTime())
		})
	}
}

View File

@ -0,0 +1,77 @@
{
"card0": {
"GPU ID": "0x6861",
"Unique ID": "0x2150e7d042a1124",
"VBIOS version": "113-D0510100-106",
"Temperature (Sensor edge) (C)": "39.0",
"Temperature (Sensor junction) (C)": "40.0",
"Temperature (Sensor memory) (C)": "92.0",
"dcefclk clock speed:": "(600Mhz)",
"dcefclk clock level:": "0",
"mclk clock speed:": "(167Mhz)",
"mclk clock level:": "0",
"sclk clock speed:": "(1269Mhz)",
"sclk clock level:": "3",
"socclk clock speed:": "(960Mhz)",
"socclk clock level:": "3",
"pcie clock level": "1 (8.0GT/s x16)",
"sclk clock level": "3 (1269Mhz)",
"Fan speed (level)": "33",
"Fan speed (%)": "13",
"Fan RPM": "682",
"Performance Level": "auto",
"GPU OverDrive value (%)": "0",
"GPU Memory OverDrive value (%)": "0",
"Max Graphics Package Power (W)": "170.0",
"Average Graphics Package Power (W)": "15.0",
"0": "8.0GT/s x16",
"1": "8.0GT/s x16 *",
"2": "847Mhz",
"3": "960Mhz *",
"4": "1028Mhz",
"5": "1107Mhz",
"6": "1440Mhz",
"7": "1500Mhz",
"GPU use (%)": "0",
"GPU memory vendor": "samsung",
"PCIe Replay Count": "0",
"Serial Number": "N/A",
"Voltage (mV)": "906",
"PCI Bus": "0000:04:00.0",
"VRAM Total Memory (B)": "17163091968",
"VRAM Total Used Memory (B)": "17776640",
"VIS_VRAM Total Memory (B)": "268435456",
"VIS_VRAM Total Used Memory (B)": "13557760",
"GTT Total Memory (B)": "17163091968",
"GTT Total Used Memory (B)": "25608192",
"ASD firmware version": "553648152",
"CE firmware version": "79",
"DMCU firmware version": "0",
"MC firmware version": "0",
"ME firmware version": "163",
"MEC firmware version": "432",
"MEC2 firmware version": "432",
"PFP firmware version": "186",
"RLC firmware version": "93",
"RLC SRLC firmware version": "0",
"RLC SRLG firmware version": "0",
"RLC SRLS firmware version": "0",
"SDMA firmware version": "430",
"SDMA2 firmware version": "430",
"SMC firmware version": "00.28.54.00",
"SOS firmware version": "0x0008015d",
"TA RAS firmware version": "00.00.00.00",
"TA XGMI firmware version": "00.00.00.00",
"UVD firmware version": "0x422b1100",
"VCE firmware version": "0x39060400",
"VCN firmware version": "0x00000000",
"Card model": "0xc1e",
"Card vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
"Card SKU": "D05101",
"(Topology) Numa Node": "0",
"(Topology) Numa Affinity": "0"
},
"system": {
"Driver version": "5.9.25"
}
}

View File

@ -0,0 +1,165 @@
{
"card0": {
"GPU ID": "0x66a1",
"Unique ID": "0x2f048617326b1ea",
"VBIOS version": "113-D1631700-111",
"Temperature (Sensor edge) (C)": "36.0",
"Temperature (Sensor junction) (C)": "38.0",
"Temperature (Sensor memory) (C)": "35.0",
"dcefclk clock speed:": "(357Mhz)",
"dcefclk clock level:": "0",
"fclk clock speed:": "(1080Mhz)",
"fclk clock level:": "6",
"mclk clock speed:": "(1000Mhz)",
"mclk clock level:": "2",
"sclk clock speed:": "(1725Mhz)",
"sclk clock level:": "8",
"socclk clock speed:": "(971Mhz)",
"socclk clock level:": "7",
"pcie clock level": "1 (16.0GT/s x16)",
"sclk clock level": "8 (1725Mhz)",
"Fan speed (level)": "0",
"Fan speed (%)": "0",
"Fan RPM": "0",
"Performance Level": "high",
"GPU OverDrive value (%)": "0",
"Max Graphics Package Power (W)": "225.0",
"Average Graphics Package Power (W)": "26.0",
"0": "2.5GT/s x16",
"1": "16.0GT/s x16 *",
"2": "566Mhz",
"3": "618Mhz",
"4": "680Mhz",
"5": "755Mhz",
"6": "850Mhz",
"7": "971Mhz *",
"8": "1725Mhz *",
"GPU use (%)": "0",
"GPU memory use (%)": "0",
"GPU memory vendor": "samsung",
"PCIe Replay Count": "0",
"Serial Number": "692024000810",
"Voltage (mV)": "1000",
"PCI Bus": "0000:63:00.0",
"VRAM Total Memory (B)": "34342961152",
"VRAM Total Used Memory (B)": "10850304",
"VIS_VRAM Total Memory (B)": "34342961152",
"VIS_VRAM Total Used Memory (B)": "10850304",
"GTT Total Memory (B)": "54974742528",
"GTT Total Used Memory (B)": "11591680",
"ASD firmware version": "553648199",
"CE firmware version": "79",
"DMCU firmware version": "0",
"MC firmware version": "0",
"ME firmware version": "164",
"MEC firmware version": "448",
"MEC2 firmware version": "448",
"PFP firmware version": "188",
"RLC firmware version": "50",
"RLC SRLC firmware version": "1",
"RLC SRLG firmware version": "1",
"RLC SRLS firmware version": "1",
"SDMA firmware version": "144",
"SDMA2 firmware version": "144",
"SMC firmware version": "00.40.59.00",
"SOS firmware version": "0x00080b67",
"TA RAS firmware version": "27.00.01.36",
"TA XGMI firmware version": "32.00.00.02",
"UVD firmware version": "0x42002b13",
"VCE firmware version": "0x39060400",
"VCN firmware version": "0x00000000",
"Card series": "Radeon Instinct MI50 32GB",
"Card model": "0x834",
"Card vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
"Card SKU": "D16317",
"(Topology) Numa Node": "0",
"(Topology) Numa Affinity": "0"
},
"system": {
"Driver version": "5.9.17",
"(Topology) Weight between DRM devices 0 and 1": "40",
"(Topology) Weight between DRM devices 0 and 2": "40",
"(Topology) Weight between DRM devices 0 and 3": "40",
"(Topology) Weight between DRM devices 0 and 4": "72",
"(Topology) Weight between DRM devices 0 and 5": "72",
"(Topology) Weight between DRM devices 0 and 6": "72",
"(Topology) Weight between DRM devices 0 and 7": "72",
"(Topology) Weight between DRM devices 1 and 2": "40",
"(Topology) Weight between DRM devices 1 and 3": "40",
"(Topology) Weight between DRM devices 1 and 4": "72",
"(Topology) Weight between DRM devices 1 and 5": "72",
"(Topology) Weight between DRM devices 1 and 6": "72",
"(Topology) Weight between DRM devices 1 and 7": "72",
"(Topology) Weight between DRM devices 2 and 3": "40",
"(Topology) Weight between DRM devices 2 and 4": "72",
"(Topology) Weight between DRM devices 2 and 5": "72",
"(Topology) Weight between DRM devices 2 and 6": "72",
"(Topology) Weight between DRM devices 2 and 7": "72",
"(Topology) Weight between DRM devices 3 and 4": "72",
"(Topology) Weight between DRM devices 3 and 5": "72",
"(Topology) Weight between DRM devices 3 and 6": "72",
"(Topology) Weight between DRM devices 3 and 7": "72",
"(Topology) Weight between DRM devices 4 and 5": "40",
"(Topology) Weight between DRM devices 4 and 6": "40",
"(Topology) Weight between DRM devices 4 and 7": "40",
"(Topology) Weight between DRM devices 5 and 6": "40",
"(Topology) Weight between DRM devices 5 and 7": "40",
"(Topology) Weight between DRM devices 6 and 7": "40",
"(Topology) Hops between DRM devices 0 and 1": "2",
"(Topology) Hops between DRM devices 0 and 2": "2",
"(Topology) Hops between DRM devices 0 and 3": "2",
"(Topology) Hops between DRM devices 0 and 4": "3",
"(Topology) Hops between DRM devices 0 and 5": "3",
"(Topology) Hops between DRM devices 0 and 6": "3",
"(Topology) Hops between DRM devices 0 and 7": "3",
"(Topology) Hops between DRM devices 1 and 2": "2",
"(Topology) Hops between DRM devices 1 and 3": "2",
"(Topology) Hops between DRM devices 1 and 4": "3",
"(Topology) Hops between DRM devices 1 and 5": "3",
"(Topology) Hops between DRM devices 1 and 6": "3",
"(Topology) Hops between DRM devices 1 and 7": "3",
"(Topology) Hops between DRM devices 2 and 3": "2",
"(Topology) Hops between DRM devices 2 and 4": "3",
"(Topology) Hops between DRM devices 2 and 5": "3",
"(Topology) Hops between DRM devices 2 and 6": "3",
"(Topology) Hops between DRM devices 2 and 7": "3",
"(Topology) Hops between DRM devices 3 and 4": "3",
"(Topology) Hops between DRM devices 3 and 5": "3",
"(Topology) Hops between DRM devices 3 and 6": "3",
"(Topology) Hops between DRM devices 3 and 7": "3",
"(Topology) Hops between DRM devices 4 and 5": "2",
"(Topology) Hops between DRM devices 4 and 6": "2",
"(Topology) Hops between DRM devices 4 and 7": "2",
"(Topology) Hops between DRM devices 5 and 6": "2",
"(Topology) Hops between DRM devices 5 and 7": "2",
"(Topology) Hops between DRM devices 6 and 7": "2",
"(Topology) Link type between DRM devices 0 and 1": "PCIE",
"(Topology) Link type between DRM devices 0 and 2": "PCIE",
"(Topology) Link type between DRM devices 0 and 3": "PCIE",
"(Topology) Link type between DRM devices 0 and 4": "PCIE",
"(Topology) Link type between DRM devices 0 and 5": "PCIE",
"(Topology) Link type between DRM devices 0 and 6": "PCIE",
"(Topology) Link type between DRM devices 0 and 7": "PCIE",
"(Topology) Link type between DRM devices 1 and 2": "PCIE",
"(Topology) Link type between DRM devices 1 and 3": "PCIE",
"(Topology) Link type between DRM devices 1 and 4": "PCIE",
"(Topology) Link type between DRM devices 1 and 5": "PCIE",
"(Topology) Link type between DRM devices 1 and 6": "PCIE",
"(Topology) Link type between DRM devices 1 and 7": "PCIE",
"(Topology) Link type between DRM devices 2 and 3": "PCIE",
"(Topology) Link type between DRM devices 2 and 4": "PCIE",
"(Topology) Link type between DRM devices 2 and 5": "PCIE",
"(Topology) Link type between DRM devices 2 and 6": "PCIE",
"(Topology) Link type between DRM devices 2 and 7": "PCIE",
"(Topology) Link type between DRM devices 3 and 4": "PCIE",
"(Topology) Link type between DRM devices 3 and 5": "PCIE",
"(Topology) Link type between DRM devices 3 and 6": "PCIE",
"(Topology) Link type between DRM devices 3 and 7": "PCIE",
"(Topology) Link type between DRM devices 4 and 5": "PCIE",
"(Topology) Link type between DRM devices 4 and 6": "PCIE",
"(Topology) Link type between DRM devices 4 and 7": "PCIE",
"(Topology) Link type between DRM devices 5 and 6": "PCIE",
"(Topology) Link type between DRM devices 5 and 7": "PCIE",
"(Topology) Link type between DRM devices 6 and 7": "PCIE"
}
}