From c4069a4bff26f430a8a4614a5fab62abc24a8722 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Thu, 22 Feb 2024 09:09:04 -0500 Subject: [PATCH] feat(inputs.amd_rocm_smi): Add startup_error_behavior config option (#14872) --- plugins/inputs/amd_rocm_smi/README.md | 6 +++ plugins/inputs/amd_rocm_smi/amd_rocm_smi.go | 33 ++++++++++++-- .../inputs/amd_rocm_smi/amd_rocm_smi_test.go | 45 +++++++++++++++++++ plugins/inputs/amd_rocm_smi/sample.conf | 6 +++ 4 files changed, 86 insertions(+), 4 deletions(-) diff --git a/plugins/inputs/amd_rocm_smi/README.md b/plugins/inputs/amd_rocm_smi/README.md index c8439bbc3..ae0cbd56e 100644 --- a/plugins/inputs/amd_rocm_smi/README.md +++ b/plugins/inputs/amd_rocm_smi/README.md @@ -22,6 +22,12 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath # bin_path = "/opt/rocm/bin/rocm-smi" + ## Optional: specifies plugin behavior regarding missing rocm-smi binary + ## Available choices: + ## - error: telegraf will return an error on startup + ## - ignore: telegraf will ignore this plugin + # startup_error_behavior = "error" + ## Optional: timeout for GPU polling # timeout = "5s" ``` diff --git a/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go b/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go index fa9dddb40..eb890259e 100644 --- a/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go +++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go @@ -23,8 +23,12 @@ var sampleConfig string const measurement = "amd_rocm_smi" type ROCmSMI struct { - BinPath string - Timeout config.Duration + BinPath string `toml:"bin_path"` + Timeout config.Duration `toml:"timeout"` + StartupErrorBehavior string `toml:"startup_error_behavior"` + Log telegraf.Logger `toml:"-"` + + ignorePlugin bool } func (*ROCmSMI) SampleConfig() string { @@ -33,8 +37,8 @@ func (*ROCmSMI) SampleConfig() string { // Gather implements the telegraf interface func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error { - if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) { - return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath) + if rsmi.ignorePlugin { + return nil } data := rsmi.pollROCmSMI() @@ -46,6 +50,27 @@ func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error { return nil } +func (rsmi *ROCmSMI) Init() error { + if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) { + binPath, err := exec.LookPath("rocm-smi") + if err != nil { + switch rsmi.StartupErrorBehavior { + case "ignore": + rsmi.ignorePlugin = true + rsmi.Log.Warnf("rocm-smi not found on the system, ignoring: %s", err) + return nil + case "", "error": + return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath) + default: + return fmt.Errorf("unknown startup behavior setting: %s", rsmi.StartupErrorBehavior) + } + } + rsmi.BinPath = binPath + } + + return nil +} + func init() { inputs.Add("amd_rocm_smi", func() telegraf.Input { return &ROCmSMI{ diff --git a/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go b/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go index e38e0ff89..9d508cf25 100644 --- a/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go +++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go @@ -11,6 +11,51 @@ import ( "github.com/stretchr/testify/require" ) +func TestErrorBehaviorError(t *testing.T) { + // make sure we can't find rocm-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &ROCmSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + StartupErrorBehavior: "error", + } + require.Error(t, plugin.Init()) +} + +func TestErrorBehaviorDefault(t *testing.T) { + // make sure we can't find rocm-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &ROCmSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + } + require.Error(t, plugin.Init()) +} + +func TestErrorBehaviorIgnore(t *testing.T) { + // make sure we can't find rocm-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &ROCmSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + StartupErrorBehavior: "ignore", + } + require.NoError(t, plugin.Init()) + acc := testutil.Accumulator{} + require.NoError(t, plugin.Gather(&acc)) +} + +func TestErrorBehaviorInvalidOption(t *testing.T) { + // make sure we can't find rocm-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &ROCmSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + StartupErrorBehavior: "giveup", + } + require.Error(t, plugin.Init()) +} + func TestGatherValidJSON(t *testing.T) { tests := []struct { name string diff --git a/plugins/inputs/amd_rocm_smi/sample.conf b/plugins/inputs/amd_rocm_smi/sample.conf index aed15aae9..b4fb9ebb1 100644 --- a/plugins/inputs/amd_rocm_smi/sample.conf +++ b/plugins/inputs/amd_rocm_smi/sample.conf @@ -3,5 +3,11 @@ ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath # bin_path = "/opt/rocm/bin/rocm-smi" + ## Optional: specifies plugin behavior regarding missing rocm-smi binary + ## Available choices: + ## - error: telegraf will return an error on startup + ## - ignore: telegraf will ignore this plugin + # startup_error_behavior = "error" + ## Optional: timeout for GPU polling # timeout = "5s"