feat(inputs.amd_rocm_smi): Add startup_error_behavior config option (#14872)
This commit is contained in:
parent
32b8ad5662
commit
c4069a4bff
|
|
@ -22,6 +22,12 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
||||||
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
||||||
# bin_path = "/opt/rocm/bin/rocm-smi"
|
# bin_path = "/opt/rocm/bin/rocm-smi"
|
||||||
|
|
||||||
|
## Optional: specifies plugin behavior regarding missing rocm-smi binary
|
||||||
|
## Available choices:
|
||||||
|
## - error: telegraf will return an error on startup
|
||||||
|
## - ignore: telegraf will ignore this plugin
|
||||||
|
# startup_error_behavior = "error"
|
||||||
|
|
||||||
## Optional: timeout for GPU polling
|
## Optional: timeout for GPU polling
|
||||||
# timeout = "5s"
|
# timeout = "5s"
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -23,8 +23,12 @@ var sampleConfig string
|
||||||
const measurement = "amd_rocm_smi"
|
const measurement = "amd_rocm_smi"
|
||||||
|
|
||||||
type ROCmSMI struct {
|
type ROCmSMI struct {
|
||||||
BinPath string
|
BinPath string `toml:"bin_path"`
|
||||||
Timeout config.Duration
|
Timeout config.Duration `toml:"timeout"`
|
||||||
|
StartupErrorBehavior string `toml:"startup_error_behavior"`
|
||||||
|
Log telegraf.Logger `toml:"-"`
|
||||||
|
|
||||||
|
ignorePlugin bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func (*ROCmSMI) SampleConfig() string {
|
func (*ROCmSMI) SampleConfig() string {
|
||||||
|
|
@ -33,8 +37,8 @@ func (*ROCmSMI) SampleConfig() string {
|
||||||
|
|
||||||
// Gather implements the telegraf interface
|
// Gather implements the telegraf interface
|
||||||
func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
|
func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
|
||||||
if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
|
if rsmi.ignorePlugin {
|
||||||
return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath)
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
data := rsmi.pollROCmSMI()
|
data := rsmi.pollROCmSMI()
|
||||||
|
|
@ -46,6 +50,27 @@ func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (rsmi *ROCmSMI) Init() error {
|
||||||
|
if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
|
||||||
|
binPath, err := exec.LookPath("rocm-smi")
|
||||||
|
if err != nil {
|
||||||
|
switch rsmi.StartupErrorBehavior {
|
||||||
|
case "ignore":
|
||||||
|
rsmi.ignorePlugin = true
|
||||||
|
rsmi.Log.Warnf("rocm-smi not found on the system, ignoring: %s", err)
|
||||||
|
return nil
|
||||||
|
case "", "error":
|
||||||
|
return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath)
|
||||||
|
default:
|
||||||
|
return fmt.Errorf("unknown startup behavior setting: %s", rsmi.StartupErrorBehavior)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rsmi.BinPath = binPath
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
inputs.Add("amd_rocm_smi", func() telegraf.Input {
|
inputs.Add("amd_rocm_smi", func() telegraf.Input {
|
||||||
return &ROCmSMI{
|
return &ROCmSMI{
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,51 @@ import (
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func TestErrorBehaviorError(t *testing.T) {
|
||||||
|
// make sure we can't find rocm-smi in $PATH somewhere
|
||||||
|
os.Unsetenv("PATH")
|
||||||
|
plugin := &ROCmSMI{
|
||||||
|
BinPath: "/random/non-existent/path",
|
||||||
|
Log: &testutil.Logger{},
|
||||||
|
StartupErrorBehavior: "error",
|
||||||
|
}
|
||||||
|
require.Error(t, plugin.Init())
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestErrorBehaviorDefault(t *testing.T) {
|
||||||
|
// make sure we can't find rocm-smi in $PATH somewhere
|
||||||
|
os.Unsetenv("PATH")
|
||||||
|
plugin := &ROCmSMI{
|
||||||
|
BinPath: "/random/non-existent/path",
|
||||||
|
Log: &testutil.Logger{},
|
||||||
|
}
|
||||||
|
require.Error(t, plugin.Init())
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestErrorBehaviorIgnore(t *testing.T) {
|
||||||
|
// make sure we can't find rocm-smi in $PATH somewhere
|
||||||
|
os.Unsetenv("PATH")
|
||||||
|
plugin := &ROCmSMI{
|
||||||
|
BinPath: "/random/non-existent/path",
|
||||||
|
Log: &testutil.Logger{},
|
||||||
|
StartupErrorBehavior: "ignore",
|
||||||
|
}
|
||||||
|
require.NoError(t, plugin.Init())
|
||||||
|
acc := testutil.Accumulator{}
|
||||||
|
require.NoError(t, plugin.Gather(&acc))
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestErrorBehaviorInvalidOption(t *testing.T) {
|
||||||
|
// make sure we can't find rocm-smi in $PATH somewhere
|
||||||
|
os.Unsetenv("PATH")
|
||||||
|
plugin := &ROCmSMI{
|
||||||
|
BinPath: "/random/non-existent/path",
|
||||||
|
Log: &testutil.Logger{},
|
||||||
|
StartupErrorBehavior: "giveup",
|
||||||
|
}
|
||||||
|
require.Error(t, plugin.Init())
|
||||||
|
}
|
||||||
|
|
||||||
func TestGatherValidJSON(t *testing.T) {
|
func TestGatherValidJSON(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
|
|
|
||||||
|
|
@ -3,5 +3,11 @@
|
||||||
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
||||||
# bin_path = "/opt/rocm/bin/rocm-smi"
|
# bin_path = "/opt/rocm/bin/rocm-smi"
|
||||||
|
|
||||||
|
## Optional: specifies plugin behavior regarding missing rocm-smi binary
|
||||||
|
## Available choices:
|
||||||
|
## - error: telegraf will return an error on startup
|
||||||
|
## - ignore: telegraf will ignore this plugin
|
||||||
|
# startup_error_behavior = "error"
|
||||||
|
|
||||||
## Optional: timeout for GPU polling
|
## Optional: timeout for GPU polling
|
||||||
# timeout = "5s"
|
# timeout = "5s"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue