feat: Add rocm_smi input to monitor AMD GPUs (#9602)
This commit is contained in:
parent
514a942a6c
commit
04c3e9bb24
|
|
@ -1870,7 +1870,7 @@
|
|||
# ## Print Warp 10 error body
|
||||
# # print_error_body = false
|
||||
#
|
||||
# ## Max string error size
|
||||
# ## Max string error size
|
||||
# # max_string_error_size = 511
|
||||
#
|
||||
# ## Optional TLS Config
|
||||
|
|
@ -4343,19 +4343,19 @@
|
|||
# ## List of metrics collected on above servers
|
||||
# ## Each metric consists in a name, a jmx path and either
|
||||
# ## a pass or drop slice attribute.
|
||||
# ## This collect all heap memory usage metrics.
|
||||
# ## This collect all heap memory usage metrics.
|
||||
# [[inputs.jolokia.metrics]]
|
||||
# name = "heap_memory_usage"
|
||||
# mbean = "java.lang:type=Memory"
|
||||
# attribute = "HeapMemoryUsage"
|
||||
#
|
||||
# ## This collect thread counts metrics.
|
||||
# ## This collect thread counts metrics.
|
||||
# [[inputs.jolokia.metrics]]
|
||||
# name = "thread_count"
|
||||
# mbean = "java.lang:type=Threading"
|
||||
# attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount"
|
||||
#
|
||||
# ## This collect number of class loaded/unloaded counts metrics.
|
||||
# ## This collect number of class loaded/unloaded counts metrics.
|
||||
# [[inputs.jolokia.metrics]]
|
||||
# name = "class_count"
|
||||
# mbean = "java.lang:type=ClassLoading"
|
||||
|
|
@ -5785,6 +5785,13 @@
|
|||
# # Specify a list of one or more riak http servers
|
||||
# servers = ["http://localhost:8098"]
|
||||
|
||||
# # Query statistics from AMD Graphics cards using rocm-smi binary
|
||||
# [[inputs.amd_rocm_smi]]
|
||||
# ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
||||
# # bin_path = "/opt/rocm/bin/rocm-smi"
|
||||
#
|
||||
# ## Optional: timeout for GPU polling
|
||||
# # timeout = "5s"
|
||||
|
||||
# # Read API usage and limits for a Salesforce organisation
|
||||
# [[inputs.salesforce]]
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import (
|
|||
_ "github.com/influxdata/telegraf/plugins/inputs/activemq"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/aerospike"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/aliyuncms"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/amd_rocm_smi"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/amqp_consumer"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/apache"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/apcupsd"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,58 @@
|
|||
# ROCm System Management Interface (SMI) Input Plugin
|
||||
|
||||
This plugin queries the [`rocm-smi`](https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/master/python_smi_tools) binary to pull GPU stats including memory and GPU usage, temperatures and more.
|
||||
|
||||
### Configuration
|
||||
|
||||
```toml
|
||||
# Query statistics from AMD Graphics cards using rocm-smi binary
|
||||
[[inputs.amd_rocm_smi]]
|
||||
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
||||
# bin_path = "/opt/rocm/bin/rocm-smi"
|
||||
|
||||
## Optional: timeout for GPU polling
|
||||
# timeout = "5s"
|
||||
```
|
||||
|
||||
### Metrics
|
||||
- measurement: `amd_rocm_smi`
|
||||
- tags
|
||||
- `name` (entry name assigned by rocm-smi executable)
|
||||
- `gpu_id` (id of the GPU according to rocm-smi)
|
||||
- `gpu_unique_id` (unique id of the GPU)
|
||||
|
||||
- fields
|
||||
- `driver_version` (integer)
|
||||
- `fan_speed` (integer, percentage)
|
||||
- `memory_total`(integer B)
|
||||
- `memory_used`(integer B)
|
||||
- `memory_free`(integer B)
|
||||
- `temperature_sensor_edge` (float, Celsius)
|
||||
- `temperature_sensor_junction` (float, Celsius)
|
||||
- `temperature_sensor_memory` (float, Celsius)
|
||||
- `utilization_gpu` (integer, percentage)
|
||||
- `utilization_memory` (integer, percentage)
|
||||
- `clocks_current_sm` (integer, Mhz)
|
||||
- `clocks_current_memory` (integer, Mhz)
|
||||
- `power_draw` (float, Watt)
|
||||
|
||||
### Troubleshooting
|
||||
Check the full output by running the `rocm-smi` binary manually.
|
||||
|
||||
Linux:
|
||||
```sh
|
||||
rocm-smi -o -l -m -M -g -c -t -u -i -f -p -P -s -S -v --showreplaycount --showpids --showdriverversion --showmemvendor --showfwinfo --showproductname --showserial --showuniqueid --showbus --showpendingpages --showpagesinfo --showretiredpages --showunreservablepages --showmemuse --showvoltage --showtopo --showtopoweight --showtopohops --showtopotype --showtoponuma --showmeminfo all --json
|
||||
```
|
||||
Please include the output of this command if opening a GitHub issue, together with ROCm version.
|
||||
### Example Output
|
||||
```
|
||||
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=28,temperature_sensor_junction=29,temperature_sensor_memory=92,utilization_gpu=0i 1630572551000000000
|
||||
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=29,temperature_sensor_junction=30,temperature_sensor_memory=91,utilization_gpu=0i 1630572701000000000
|
||||
amd_rocm_smi,gpu_id=0x6861,gpu_unique_id=0x2150e7d042a1124,host=ali47xl,name=card0 clocks_current_memory=167i,clocks_current_sm=852i,driver_version=51114i,fan_speed=14i,memory_free=17145282560i,memory_total=17163091968i,memory_used=17809408i,power_draw=7,temperature_sensor_edge=29,temperature_sensor_junction=29,temperature_sensor_memory=92,utilization_gpu=0i 1630572749000000000
|
||||
```
|
||||
### Limitations and notices
|
||||
Please note that this plugin has been developed and tested on a limited number of versions and a small set of GPUs. Currently the latest ROCm version tested is 4.3.0.
|
||||
Notice that depending on the device and driver versions the amount of information provided by `rocm-smi` can vary so that some fields would start/stop appearing in the metrics upon updates.
|
||||
The `rocm-smi` JSON output is not perfectly homogeneous and may change in the future, hence parsing and unmarshaling can start failing after a ROCm update.
|
||||
|
||||
Inspired by the current state of the art of the `nvidia-smi` plugin.
|
||||
|
|
@ -0,0 +1,294 @@
|
|||
package amd_rocm_smi
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/config"
|
||||
"github.com/influxdata/telegraf/internal"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
||||
const measurement = "amd_rocm_smi"
|
||||
|
||||
type ROCmSMI struct {
|
||||
BinPath string
|
||||
Timeout config.Duration
|
||||
}
|
||||
|
||||
// Description returns the description of the ROCmSMI plugin
|
||||
func (rsmi *ROCmSMI) Description() string {
|
||||
return "Query statistics from AMD Graphics cards using rocm-smi binary"
|
||||
}
|
||||
|
||||
var ROCmSMIConfig = `
|
||||
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
||||
# bin_path = "/opt/rocm/bin/rocm-smi"
|
||||
|
||||
## Optional: timeout for GPU polling
|
||||
# timeout = "5s"
|
||||
`
|
||||
|
||||
// SampleConfig returns the sample configuration for the ROCmSMI plugin
|
||||
func (rsmi *ROCmSMI) SampleConfig() string {
|
||||
return ROCmSMIConfig
|
||||
}
|
||||
|
||||
// Gather implements the telegraf interface
|
||||
func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
|
||||
if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
|
||||
return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath)
|
||||
}
|
||||
|
||||
data, err := rsmi.pollROCmSMI()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = gatherROCmSMI(data, acc)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
inputs.Add("amd_rocm_smi", func() telegraf.Input {
|
||||
return &ROCmSMI{
|
||||
BinPath: "/opt/rocm/bin/rocm-smi",
|
||||
Timeout: config.Duration(5 * time.Second),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (rsmi *ROCmSMI) pollROCmSMI() ([]byte, error) {
|
||||
// Construct and execute metrics query, there currently exist (ROCm v4.3.x) a "-a" option
|
||||
// that does not provide all the information, so each needed parameter is set manually
|
||||
cmd := exec.Command(rsmi.BinPath,
|
||||
"-o",
|
||||
"-l",
|
||||
"-m",
|
||||
"-M",
|
||||
"-g",
|
||||
"-c",
|
||||
"-t",
|
||||
"-u",
|
||||
"-i",
|
||||
"-f",
|
||||
"-p",
|
||||
"-P",
|
||||
"-s",
|
||||
"-S",
|
||||
"-v",
|
||||
"--showreplaycount",
|
||||
"--showpids",
|
||||
"--showdriverversion",
|
||||
"--showmemvendor",
|
||||
"--showfwinfo",
|
||||
"--showproductname",
|
||||
"--showserial",
|
||||
"--showuniqueid",
|
||||
"--showbus",
|
||||
"--showpendingpages",
|
||||
"--showpagesinfo",
|
||||
"--showmeminfo",
|
||||
"all",
|
||||
"--showretiredpages",
|
||||
"--showunreservablepages",
|
||||
"--showmemuse",
|
||||
"--showvoltage",
|
||||
"--showtopo",
|
||||
"--showtopoweight",
|
||||
"--showtopohops",
|
||||
"--showtopotype",
|
||||
"--showtoponuma",
|
||||
"--json")
|
||||
|
||||
ret, _ := internal.StdOutputTimeout(cmd,
|
||||
time.Duration(rsmi.Timeout))
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func gatherROCmSMI(ret []byte, acc telegraf.Accumulator) error {
|
||||
var gpus map[string]GPU
|
||||
var sys map[string]sysInfo
|
||||
|
||||
err1 := json.Unmarshal(ret, &gpus)
|
||||
if err1 != nil {
|
||||
return err1
|
||||
}
|
||||
|
||||
err2 := json.Unmarshal(ret, &sys)
|
||||
if err2 != nil {
|
||||
return err2
|
||||
}
|
||||
|
||||
metrics := genTagsFields(gpus, sys)
|
||||
for _, metric := range metrics {
|
||||
acc.AddFields(measurement, metric.fields, metric.tags)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// metric pairs the tag set and the field set of one GPU sample before it is
// handed to the accumulator.
type metric struct {
	// tags identify the card the sample belongs to.
	tags map[string]string
	// fields carry the parsed values of the sample.
	fields map[string]interface{}
}
|
||||
|
||||
func genTagsFields(gpus map[string]GPU, system map[string]sysInfo) []metric {
|
||||
metrics := []metric{}
|
||||
for cardID, payload := range gpus {
|
||||
if strings.Contains(cardID, "card") {
|
||||
tags := map[string]string{
|
||||
"name": cardID,
|
||||
}
|
||||
fields := map[string]interface{}{}
|
||||
|
||||
totVRAM, _ := strconv.ParseInt(payload.GpuVRAMTotalMemory, 10, 64)
|
||||
usdVRAM, _ := strconv.ParseInt(payload.GpuVRAMTotalUsedMemory, 10, 64)
|
||||
strFree := strconv.FormatInt(totVRAM-usdVRAM, 10)
|
||||
|
||||
setTagIfUsed(tags, "gpu_id", payload.GpuID)
|
||||
setTagIfUsed(tags, "gpu_unique_id", payload.GpuUniqueID)
|
||||
|
||||
setIfUsed("int", fields, "driver_version", strings.Replace(system["system"].DriverVersion, ".", "", -1))
|
||||
setIfUsed("int", fields, "fan_speed", payload.GpuFanSpeedPercentage)
|
||||
setIfUsed("int64", fields, "memory_total", payload.GpuVRAMTotalMemory)
|
||||
setIfUsed("int64", fields, "memory_used", payload.GpuVRAMTotalUsedMemory)
|
||||
setIfUsed("int64", fields, "memory_free", strFree)
|
||||
setIfUsed("float", fields, "temperature_sensor_edge", payload.GpuTemperatureSensorEdge)
|
||||
setIfUsed("float", fields, "temperature_sensor_junction", payload.GpuTemperatureSensorJunction)
|
||||
setIfUsed("float", fields, "temperature_sensor_memory", payload.GpuTemperatureSensorMemory)
|
||||
setIfUsed("int", fields, "utilization_gpu", payload.GpuUsePercentage)
|
||||
setIfUsed("int", fields, "utilization_memory", payload.GpuMemoryUsePercentage)
|
||||
setIfUsed("int", fields, "clocks_current_sm", strings.Trim(payload.GpuSclkClockSpeed, "(Mhz)"))
|
||||
setIfUsed("int", fields, "clocks_current_memory", strings.Trim(payload.GpuMclkClockSpeed, "(Mhz)"))
|
||||
setIfUsed("float", fields, "power_draw", payload.GpuAveragePower)
|
||||
|
||||
metrics = append(metrics, metric{tags, fields})
|
||||
}
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
// setTagIfUsed stores v under key k in m, unless v is the empty string, in
// which case m is left untouched.
func setTagIfUsed(m map[string]string, k, v string) {
	if v == "" {
		return
	}
	m[k] = v
}
|
||||
|
||||
// setIfUsed parses the first whitespace-separated token of v according to the
// type name t and, on success, stores the result in m under key k.
//
// Supported values of t are "float" (float64), "int" (int), "int64" (int64)
// and "str" (string). Nothing is stored when v holds no token, when t is not
// one of these names, or when the token cannot be parsed as the requested type.
func setIfUsed(t string, m map[string]interface{}, k, v string) {
	tokens := strings.Fields(v)
	if len(tokens) == 0 {
		return
	}
	first := tokens[0]

	var (
		parsed interface{}
		err    error
	)
	switch t {
	case "float":
		parsed, err = strconv.ParseFloat(first, 64)
	case "int":
		parsed, err = strconv.Atoi(first)
	case "int64":
		parsed, err = strconv.ParseInt(first, 10, 64)
	case "str":
		parsed = first
	default:
		return
	}

	if err == nil {
		m[k] = parsed
	}
}
|
||||
|
||||
type sysInfo struct {
|
||||
DriverVersion string `json:"Driver version"`
|
||||
}
|
||||
|
||||
type GPU struct {
|
||||
GpuID string `json:"GPU ID"`
|
||||
GpuUniqueID string `json:"Unique ID"`
|
||||
GpuVBIOSVersion string `json:"VBIOS version"`
|
||||
GpuTemperatureSensorEdge string `json:"Temperature (Sensor edge) (C)"`
|
||||
GpuTemperatureSensorJunction string `json:"Temperature (Sensor junction) (C)"`
|
||||
GpuTemperatureSensorMemory string `json:"Temperature (Sensor memory) (C)"`
|
||||
GpuDcefClkClockSpeed string `json:"dcefclk clock speed"`
|
||||
GpuDcefClkClockLevel string `json:"dcefclk clock level"`
|
||||
GpuFclkClockSpeed string `json:"fclk clock speed"`
|
||||
GpuFclkClockLevel string `json:"fclk clock level"`
|
||||
GpuMclkClockSpeed string `json:"mclk clock speed:"`
|
||||
GpuMclkClockLevel string `json:"mclk clock level:"`
|
||||
GpuSclkClockSpeed string `json:"sclk clock speed:"`
|
||||
GpuSclkClockLevel string `json:"sclk clock level:"`
|
||||
GpuSocclkClockSpeed string `json:"socclk clock speed"`
|
||||
GpuSocclkClockLevel string `json:"socclk clock level"`
|
||||
GpuPcieClock string `json:"pcie clock level"`
|
||||
GpuFanSpeedLevel string `json:"Fan speed (level)"`
|
||||
GpuFanSpeedPercentage string `json:"Fan speed (%)"`
|
||||
GpuFanRPM string `json:"Fan RPM"`
|
||||
GpuPerformanceLevel string `json:"Performance Level"`
|
||||
GpuOverdrive string `json:"GPU OverDrive value (%)"`
|
||||
GpuMaxPower string `json:"Max Graphics Package Power (W)"`
|
||||
GpuAveragePower string `json:"Average Graphics Package Power (W)"`
|
||||
GpuUsePercentage string `json:"GPU use (%)"`
|
||||
GpuMemoryUsePercentage string `json:"GPU memory use (%)"`
|
||||
GpuMemoryVendor string `json:"GPU memory vendor"`
|
||||
GpuPCIeReplay string `json:"PCIe Replay Count"`
|
||||
GpuSerialNumber string `json:"Serial Number"`
|
||||
GpuVoltagemV string `json:"Voltage (mV)"`
|
||||
GpuPCIBus string `json:"PCI Bus"`
|
||||
GpuASDDirmware string `json:"ASD firmware version"`
|
||||
GpuCEFirmware string `json:"CE firmware version"`
|
||||
GpuDMCUFirmware string `json:"DMCU firmware version"`
|
||||
GpuMCFirmware string `json:"MC firmware version"`
|
||||
GpuMEFirmware string `json:"ME firmware version"`
|
||||
GpuMECFirmware string `json:"MEC firmware version"`
|
||||
GpuMEC2Firmware string `json:"MEC2 firmware version"`
|
||||
GpuPFPFirmware string `json:"PFP firmware version"`
|
||||
GpuRLCFirmware string `json:"RLC firmware version"`
|
||||
GpuRLCSRLC string `json:"RLC SRLC firmware version"`
|
||||
GpuRLCSRLG string `json:"RLC SRLG firmware version"`
|
||||
GpuRLCSRLS string `json:"RLC SRLS firmware version"`
|
||||
GpuSDMAFirmware string `json:"SDMA firmware version"`
|
||||
GpuSDMA2Firmware string `json:"SDMA2 firmware version"`
|
||||
GpuSMCFirmware string `json:"SMC firmware version"`
|
||||
GpuSOSFirmware string `json:"SOS firmware version"`
|
||||
GpuTARAS string `json:"TA RAS firmware version"`
|
||||
GpuTAXGMI string `json:"TA XGMI firmware version"`
|
||||
GpuUVDFirmware string `json:"UVD firmware version"`
|
||||
GpuVCEFirmware string `json:"VCE firmware version"`
|
||||
GpuVCNFirmware string `json:"VCN firmware version"`
|
||||
GpuCardSeries string `json:"Card series"`
|
||||
GpuCardModel string `json:"Card model"`
|
||||
GpuCardVendor string `json:"Card vendor"`
|
||||
GpuCardSKU string `json:"Card SKU"`
|
||||
GpuNUMANode string `json:"(Topology) Numa Node"`
|
||||
GpuNUMAAffinity string `json:"(Topology) Numa Affinity"`
|
||||
GpuVisVRAMTotalMemory string `json:"VIS_VRAM Total Memory (B)"`
|
||||
GpuVisVRAMTotalUsedMemory string `json:"VIS_VRAM Total Used Memory (B)"`
|
||||
GpuVRAMTotalMemory string `json:"VRAM Total Memory (B)"`
|
||||
GpuVRAMTotalUsedMemory string `json:"VRAM Total Used Memory (B)"`
|
||||
GpuGTTTotalMemory string `json:"GTT Total Memory (B)"`
|
||||
GpuGTTTotalUsedMemory string `json:"GTT Total Used Memory (B)"`
|
||||
}
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
package amd_rocm_smi
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestGatherValidJSON(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
filename string
|
||||
expected []telegraf.Metric
|
||||
}{
|
||||
{
|
||||
name: "Vega 10 XT",
|
||||
filename: "vega-10-XT.json",
|
||||
expected: []telegraf.Metric{
|
||||
testutil.MustMetric(
|
||||
"amd_rocm_smi",
|
||||
map[string]string{
|
||||
"gpu_id": "0x6861",
|
||||
"gpu_unique_id": "0x2150e7d042a1124",
|
||||
"name": "card0",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"driver_version": 5925,
|
||||
"fan_speed": 13,
|
||||
"memory_total": int64(17163091968),
|
||||
"memory_used": int64(17776640),
|
||||
"memory_free": int64(17145315328),
|
||||
"temperature_sensor_edge": 39.0,
|
||||
"temperature_sensor_junction": 40.0,
|
||||
"temperature_sensor_memory": 92.0,
|
||||
"utilization_gpu": 0,
|
||||
"clocks_current_sm": 1269,
|
||||
"clocks_current_memory": 167,
|
||||
"power_draw": 15.0,
|
||||
},
|
||||
time.Unix(0, 0)),
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Vega 20 WKS GL-XE [Radeon Pro VII]",
|
||||
filename: "vega-20-WKS-GL-XE.json",
|
||||
expected: []telegraf.Metric{
|
||||
testutil.MustMetric(
|
||||
"amd_rocm_smi",
|
||||
map[string]string{
|
||||
"gpu_id": "0x66a1",
|
||||
"gpu_unique_id": "0x2f048617326b1ea",
|
||||
"name": "card0",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"driver_version": 5917,
|
||||
"fan_speed": 0,
|
||||
"memory_total": int64(34342961152),
|
||||
"memory_used": int64(10850304),
|
||||
"memory_free": int64(34332110848),
|
||||
"temperature_sensor_edge": 36.0,
|
||||
"temperature_sensor_junction": 38.0,
|
||||
"temperature_sensor_memory": 35.0,
|
||||
"utilization_gpu": 0,
|
||||
"utilization_memory": 0,
|
||||
"clocks_current_sm": 1725,
|
||||
"clocks_current_memory": 1000,
|
||||
"power_draw": 26.0,
|
||||
},
|
||||
time.Unix(0, 0)),
|
||||
},
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
var acc testutil.Accumulator
|
||||
octets, err := ioutil.ReadFile(filepath.Join("testdata", tt.filename))
|
||||
require.NoError(t, err)
|
||||
|
||||
err = gatherROCmSMI(octets, &acc)
|
||||
require.NoError(t, err)
|
||||
|
||||
testutil.RequireMetricsEqual(t, tt.expected, acc.GetTelegrafMetrics(), testutil.IgnoreTime())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
{
|
||||
"card0": {
|
||||
"GPU ID": "0x6861",
|
||||
"Unique ID": "0x2150e7d042a1124",
|
||||
"VBIOS version": "113-D0510100-106",
|
||||
"Temperature (Sensor edge) (C)": "39.0",
|
||||
"Temperature (Sensor junction) (C)": "40.0",
|
||||
"Temperature (Sensor memory) (C)": "92.0",
|
||||
"dcefclk clock speed:": "(600Mhz)",
|
||||
"dcefclk clock level:": "0",
|
||||
"mclk clock speed:": "(167Mhz)",
|
||||
"mclk clock level:": "0",
|
||||
"sclk clock speed:": "(1269Mhz)",
|
||||
"sclk clock level:": "3",
|
||||
"socclk clock speed:": "(960Mhz)",
|
||||
"socclk clock level:": "3",
|
||||
"pcie clock level": "1 (8.0GT/s x16)",
|
||||
"sclk clock level": "3 (1269Mhz)",
|
||||
"Fan speed (level)": "33",
|
||||
"Fan speed (%)": "13",
|
||||
"Fan RPM": "682",
|
||||
"Performance Level": "auto",
|
||||
"GPU OverDrive value (%)": "0",
|
||||
"GPU Memory OverDrive value (%)": "0",
|
||||
"Max Graphics Package Power (W)": "170.0",
|
||||
"Average Graphics Package Power (W)": "15.0",
|
||||
"0": "8.0GT/s x16",
|
||||
"1": "8.0GT/s x16 *",
|
||||
"2": "847Mhz",
|
||||
"3": "960Mhz *",
|
||||
"4": "1028Mhz",
|
||||
"5": "1107Mhz",
|
||||
"6": "1440Mhz",
|
||||
"7": "1500Mhz",
|
||||
"GPU use (%)": "0",
|
||||
"GPU memory vendor": "samsung",
|
||||
"PCIe Replay Count": "0",
|
||||
"Serial Number": "N/A",
|
||||
"Voltage (mV)": "906",
|
||||
"PCI Bus": "0000:04:00.0",
|
||||
"VRAM Total Memory (B)": "17163091968",
|
||||
"VRAM Total Used Memory (B)": "17776640",
|
||||
"VIS_VRAM Total Memory (B)": "268435456",
|
||||
"VIS_VRAM Total Used Memory (B)": "13557760",
|
||||
"GTT Total Memory (B)": "17163091968",
|
||||
"GTT Total Used Memory (B)": "25608192",
|
||||
"ASD firmware version": "553648152",
|
||||
"CE firmware version": "79",
|
||||
"DMCU firmware version": "0",
|
||||
"MC firmware version": "0",
|
||||
"ME firmware version": "163",
|
||||
"MEC firmware version": "432",
|
||||
"MEC2 firmware version": "432",
|
||||
"PFP firmware version": "186",
|
||||
"RLC firmware version": "93",
|
||||
"RLC SRLC firmware version": "0",
|
||||
"RLC SRLG firmware version": "0",
|
||||
"RLC SRLS firmware version": "0",
|
||||
"SDMA firmware version": "430",
|
||||
"SDMA2 firmware version": "430",
|
||||
"SMC firmware version": "00.28.54.00",
|
||||
"SOS firmware version": "0x0008015d",
|
||||
"TA RAS firmware version": "00.00.00.00",
|
||||
"TA XGMI firmware version": "00.00.00.00",
|
||||
"UVD firmware version": "0x422b1100",
|
||||
"VCE firmware version": "0x39060400",
|
||||
"VCN firmware version": "0x00000000",
|
||||
"Card model": "0xc1e",
|
||||
"Card vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
|
||||
"Card SKU": "D05101",
|
||||
"(Topology) Numa Node": "0",
|
||||
"(Topology) Numa Affinity": "0"
|
||||
},
|
||||
"system": {
|
||||
"Driver version": "5.9.25"
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
{
|
||||
"card0": {
|
||||
"GPU ID": "0x66a1",
|
||||
"Unique ID": "0x2f048617326b1ea",
|
||||
"VBIOS version": "113-D1631700-111",
|
||||
"Temperature (Sensor edge) (C)": "36.0",
|
||||
"Temperature (Sensor junction) (C)": "38.0",
|
||||
"Temperature (Sensor memory) (C)": "35.0",
|
||||
"dcefclk clock speed:": "(357Mhz)",
|
||||
"dcefclk clock level:": "0",
|
||||
"fclk clock speed:": "(1080Mhz)",
|
||||
"fclk clock level:": "6",
|
||||
"mclk clock speed:": "(1000Mhz)",
|
||||
"mclk clock level:": "2",
|
||||
"sclk clock speed:": "(1725Mhz)",
|
||||
"sclk clock level:": "8",
|
||||
"socclk clock speed:": "(971Mhz)",
|
||||
"socclk clock level:": "7",
|
||||
"pcie clock level": "1 (16.0GT/s x16)",
|
||||
"sclk clock level": "8 (1725Mhz)",
|
||||
"Fan speed (level)": "0",
|
||||
"Fan speed (%)": "0",
|
||||
"Fan RPM": "0",
|
||||
"Performance Level": "high",
|
||||
"GPU OverDrive value (%)": "0",
|
||||
"Max Graphics Package Power (W)": "225.0",
|
||||
"Average Graphics Package Power (W)": "26.0",
|
||||
"0": "2.5GT/s x16",
|
||||
"1": "16.0GT/s x16 *",
|
||||
"2": "566Mhz",
|
||||
"3": "618Mhz",
|
||||
"4": "680Mhz",
|
||||
"5": "755Mhz",
|
||||
"6": "850Mhz",
|
||||
"7": "971Mhz *",
|
||||
"8": "1725Mhz *",
|
||||
"GPU use (%)": "0",
|
||||
"GPU memory use (%)": "0",
|
||||
"GPU memory vendor": "samsung",
|
||||
"PCIe Replay Count": "0",
|
||||
"Serial Number": "692024000810",
|
||||
"Voltage (mV)": "1000",
|
||||
"PCI Bus": "0000:63:00.0",
|
||||
"VRAM Total Memory (B)": "34342961152",
|
||||
"VRAM Total Used Memory (B)": "10850304",
|
||||
"VIS_VRAM Total Memory (B)": "34342961152",
|
||||
"VIS_VRAM Total Used Memory (B)": "10850304",
|
||||
"GTT Total Memory (B)": "54974742528",
|
||||
"GTT Total Used Memory (B)": "11591680",
|
||||
"ASD firmware version": "553648199",
|
||||
"CE firmware version": "79",
|
||||
"DMCU firmware version": "0",
|
||||
"MC firmware version": "0",
|
||||
"ME firmware version": "164",
|
||||
"MEC firmware version": "448",
|
||||
"MEC2 firmware version": "448",
|
||||
"PFP firmware version": "188",
|
||||
"RLC firmware version": "50",
|
||||
"RLC SRLC firmware version": "1",
|
||||
"RLC SRLG firmware version": "1",
|
||||
"RLC SRLS firmware version": "1",
|
||||
"SDMA firmware version": "144",
|
||||
"SDMA2 firmware version": "144",
|
||||
"SMC firmware version": "00.40.59.00",
|
||||
"SOS firmware version": "0x00080b67",
|
||||
"TA RAS firmware version": "27.00.01.36",
|
||||
"TA XGMI firmware version": "32.00.00.02",
|
||||
"UVD firmware version": "0x42002b13",
|
||||
"VCE firmware version": "0x39060400",
|
||||
"VCN firmware version": "0x00000000",
|
||||
"Card series": "Radeon Instinct MI50 32GB",
|
||||
"Card model": "0x834",
|
||||
"Card vendor": "Advanced Micro Devices, Inc. [AMD/ATI]",
|
||||
"Card SKU": "D16317",
|
||||
"(Topology) Numa Node": "0",
|
||||
"(Topology) Numa Affinity": "0"
|
||||
},
|
||||
"system": {
|
||||
"Driver version": "5.9.17",
|
||||
"(Topology) Weight between DRM devices 0 and 1": "40",
|
||||
"(Topology) Weight between DRM devices 0 and 2": "40",
|
||||
"(Topology) Weight between DRM devices 0 and 3": "40",
|
||||
"(Topology) Weight between DRM devices 0 and 4": "72",
|
||||
"(Topology) Weight between DRM devices 0 and 5": "72",
|
||||
"(Topology) Weight between DRM devices 0 and 6": "72",
|
||||
"(Topology) Weight between DRM devices 0 and 7": "72",
|
||||
"(Topology) Weight between DRM devices 1 and 2": "40",
|
||||
"(Topology) Weight between DRM devices 1 and 3": "40",
|
||||
"(Topology) Weight between DRM devices 1 and 4": "72",
|
||||
"(Topology) Weight between DRM devices 1 and 5": "72",
|
||||
"(Topology) Weight between DRM devices 1 and 6": "72",
|
||||
"(Topology) Weight between DRM devices 1 and 7": "72",
|
||||
"(Topology) Weight between DRM devices 2 and 3": "40",
|
||||
"(Topology) Weight between DRM devices 2 and 4": "72",
|
||||
"(Topology) Weight between DRM devices 2 and 5": "72",
|
||||
"(Topology) Weight between DRM devices 2 and 6": "72",
|
||||
"(Topology) Weight between DRM devices 2 and 7": "72",
|
||||
"(Topology) Weight between DRM devices 3 and 4": "72",
|
||||
"(Topology) Weight between DRM devices 3 and 5": "72",
|
||||
"(Topology) Weight between DRM devices 3 and 6": "72",
|
||||
"(Topology) Weight between DRM devices 3 and 7": "72",
|
||||
"(Topology) Weight between DRM devices 4 and 5": "40",
|
||||
"(Topology) Weight between DRM devices 4 and 6": "40",
|
||||
"(Topology) Weight between DRM devices 4 and 7": "40",
|
||||
"(Topology) Weight between DRM devices 5 and 6": "40",
|
||||
"(Topology) Weight between DRM devices 5 and 7": "40",
|
||||
"(Topology) Weight between DRM devices 6 and 7": "40",
|
||||
"(Topology) Hops between DRM devices 0 and 1": "2",
|
||||
"(Topology) Hops between DRM devices 0 and 2": "2",
|
||||
"(Topology) Hops between DRM devices 0 and 3": "2",
|
||||
"(Topology) Hops between DRM devices 0 and 4": "3",
|
||||
"(Topology) Hops between DRM devices 0 and 5": "3",
|
||||
"(Topology) Hops between DRM devices 0 and 6": "3",
|
||||
"(Topology) Hops between DRM devices 0 and 7": "3",
|
||||
"(Topology) Hops between DRM devices 1 and 2": "2",
|
||||
"(Topology) Hops between DRM devices 1 and 3": "2",
|
||||
"(Topology) Hops between DRM devices 1 and 4": "3",
|
||||
"(Topology) Hops between DRM devices 1 and 5": "3",
|
||||
"(Topology) Hops between DRM devices 1 and 6": "3",
|
||||
"(Topology) Hops between DRM devices 1 and 7": "3",
|
||||
"(Topology) Hops between DRM devices 2 and 3": "2",
|
||||
"(Topology) Hops between DRM devices 2 and 4": "3",
|
||||
"(Topology) Hops between DRM devices 2 and 5": "3",
|
||||
"(Topology) Hops between DRM devices 2 and 6": "3",
|
||||
"(Topology) Hops between DRM devices 2 and 7": "3",
|
||||
"(Topology) Hops between DRM devices 3 and 4": "3",
|
||||
"(Topology) Hops between DRM devices 3 and 5": "3",
|
||||
"(Topology) Hops between DRM devices 3 and 6": "3",
|
||||
"(Topology) Hops between DRM devices 3 and 7": "3",
|
||||
"(Topology) Hops between DRM devices 4 and 5": "2",
|
||||
"(Topology) Hops between DRM devices 4 and 6": "2",
|
||||
"(Topology) Hops between DRM devices 4 and 7": "2",
|
||||
"(Topology) Hops between DRM devices 5 and 6": "2",
|
||||
"(Topology) Hops between DRM devices 5 and 7": "2",
|
||||
"(Topology) Hops between DRM devices 6 and 7": "2",
|
||||
"(Topology) Link type between DRM devices 0 and 1": "PCIE",
|
||||
"(Topology) Link type between DRM devices 0 and 2": "PCIE",
|
||||
"(Topology) Link type between DRM devices 0 and 3": "PCIE",
|
||||
"(Topology) Link type between DRM devices 0 and 4": "PCIE",
|
||||
"(Topology) Link type between DRM devices 0 and 5": "PCIE",
|
||||
"(Topology) Link type between DRM devices 0 and 6": "PCIE",
|
||||
"(Topology) Link type between DRM devices 0 and 7": "PCIE",
|
||||
"(Topology) Link type between DRM devices 1 and 2": "PCIE",
|
||||
"(Topology) Link type between DRM devices 1 and 3": "PCIE",
|
||||
"(Topology) Link type between DRM devices 1 and 4": "PCIE",
|
||||
"(Topology) Link type between DRM devices 1 and 5": "PCIE",
|
||||
"(Topology) Link type between DRM devices 1 and 6": "PCIE",
|
||||
"(Topology) Link type between DRM devices 1 and 7": "PCIE",
|
||||
"(Topology) Link type between DRM devices 2 and 3": "PCIE",
|
||||
"(Topology) Link type between DRM devices 2 and 4": "PCIE",
|
||||
"(Topology) Link type between DRM devices 2 and 5": "PCIE",
|
||||
"(Topology) Link type between DRM devices 2 and 6": "PCIE",
|
||||
"(Topology) Link type between DRM devices 2 and 7": "PCIE",
|
||||
"(Topology) Link type between DRM devices 3 and 4": "PCIE",
|
||||
"(Topology) Link type between DRM devices 3 and 5": "PCIE",
|
||||
"(Topology) Link type between DRM devices 3 and 6": "PCIE",
|
||||
"(Topology) Link type between DRM devices 3 and 7": "PCIE",
|
||||
"(Topology) Link type between DRM devices 4 and 5": "PCIE",
|
||||
"(Topology) Link type between DRM devices 4 and 6": "PCIE",
|
||||
"(Topology) Link type between DRM devices 4 and 7": "PCIE",
|
||||
"(Topology) Link type between DRM devices 5 and 6": "PCIE",
|
||||
"(Topology) Link type between DRM devices 5 and 7": "PCIE",
|
||||
"(Topology) Link type between DRM devices 6 and 7": "PCIE"
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue