From 1e57608a2167de681db89cb32461c630fd97b16e Mon Sep 17 00:00:00 2001 From: Sven Rebhan <36194019+srebhan@users.noreply.github.com> Date: Wed, 24 Apr 2024 14:34:08 -0400 Subject: [PATCH] chore(inputs.amd_rocm_smi): Consolidate startup_retry_behavior to model implementation (#15216) --- plugins/inputs/amd_rocm_smi/README.md | 18 ++-- plugins/inputs/amd_rocm_smi/amd_rocm_smi.go | 35 ++----- .../inputs/amd_rocm_smi/amd_rocm_smi_test.go | 91 +++++++++++++------ plugins/inputs/amd_rocm_smi/sample.conf | 6 -- 4 files changed, 82 insertions(+), 68 deletions(-) diff --git a/plugins/inputs/amd_rocm_smi/README.md b/plugins/inputs/amd_rocm_smi/README.md index ae0cbd56e..2e3c6f436 100644 --- a/plugins/inputs/amd_rocm_smi/README.md +++ b/plugins/inputs/amd_rocm_smi/README.md @@ -14,6 +14,18 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. [CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins +## Startup error behavior options + +In addition to the plugin-specific and global configuration settings the plugin +supports options for specifying the behavior when experiencing startup errors +using the `startup_error_behavior` setting. Available values are: + +- `error`: Telegraf with stop and exit in case of startup errors. This is the + default behavior. +- `ignore`: Telegraf will ignore startup errors for this plugin and disables it + but continues processing for all other plugins. +- `retry`: NOT AVAILABLE + ## Configuration ```toml @sample.conf @@ -22,12 +34,6 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath # bin_path = "/opt/rocm/bin/rocm-smi" - ## Optional: specifies plugin behavior regarding missing rocm-smi binary - ## Available choices: - ## - error: telegraf will return an error on startup - ## - ignore: telegraf will ignore this plugin - # startup_error_behavior = "error" - ## Optional: timeout for GPU polling # timeout = "5s" ``` diff --git a/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go b/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go index eb890259e..6ca2a7aaa 100644 --- a/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go +++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go @@ -4,7 +4,6 @@ package amd_rocm_smi import ( _ "embed" "encoding/json" - "fmt" "os" "os/exec" "strconv" @@ -23,12 +22,9 @@ var sampleConfig string const measurement = "amd_rocm_smi" type ROCmSMI struct { - BinPath string `toml:"bin_path"` - Timeout config.Duration `toml:"timeout"` - StartupErrorBehavior string `toml:"startup_error_behavior"` - Log telegraf.Logger `toml:"-"` - - ignorePlugin bool + BinPath string `toml:"bin_path"` + Timeout config.Duration `toml:"timeout"` + Log telegraf.Logger `toml:"-"` } func (*ROCmSMI) SampleConfig() string { @@ -37,33 +33,16 @@ func (*ROCmSMI) SampleConfig() string { // Gather implements the telegraf interface func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error { - if rsmi.ignorePlugin { - return nil - } - data := rsmi.pollROCmSMI() - err := gatherROCmSMI(data, acc) - if err != nil { - return err - } - return nil + return gatherROCmSMI(data, acc) } -func (rsmi *ROCmSMI) Init() error { +func (rsmi *ROCmSMI) Start(telegraf.Accumulator) error { if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) { binPath, err := exec.LookPath("rocm-smi") if err != nil { - switch rsmi.StartupErrorBehavior { - case "ignore": - rsmi.ignorePlugin = true - rsmi.Log.Warnf("rocm-smi not found on the system, ignoring: %s", err) - return nil - case "", "error": - return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath) - default: - return fmt.Errorf("unknown startup behavior setting: %s", rsmi.StartupErrorBehavior) - } + return &internal.StartupError{Err: err} } rsmi.BinPath = binPath } @@ -71,6 +50,8 @@ func (rsmi *ROCmSMI) Init() error { return nil } +func (rsmi *ROCmSMI) Stop() {} + func init() { inputs.Add("amd_rocm_smi", func() telegraf.Input { return &ROCmSMI{ diff --git a/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go b/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go index 9d508cf25..b832cf08e 100644 --- a/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go +++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go @@ -1,27 +1,19 @@ package amd_rocm_smi import ( + "errors" "os" "path/filepath" "testing" "time" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" + "github.com/influxdata/telegraf/models" "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/require" ) -func TestErrorBehaviorError(t *testing.T) { - // make sure we can't find rocm-smi in $PATH somewhere - os.Unsetenv("PATH") - plugin := &ROCmSMI{ - BinPath: "/random/non-existent/path", - Log: &testutil.Logger{}, - StartupErrorBehavior: "error", - } - require.Error(t, plugin.Init()) -} - func TestErrorBehaviorDefault(t *testing.T) { // make sure we can't find rocm-smi in $PATH somewhere os.Unsetenv("PATH") @@ -29,31 +21,72 @@ func TestErrorBehaviorDefault(t *testing.T) { BinPath: "/random/non-existent/path", Log: &testutil.Logger{}, } - require.Error(t, plugin.Init()) + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "amd_rocm_smi", + }) + require.NoError(t, model.Init()) + + var acc testutil.Accumulator + var ferr *internal.FatalError + require.False(t, errors.As(model.Start(&acc), &ferr)) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) +} + +func TestErrorBehaviorError(t *testing.T) { + // make sure we can't find rocm-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &ROCmSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + } + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "amd_rocm_smi", + StartupErrorBehavior: "error", + }) + require.NoError(t, model.Init()) + + var acc testutil.Accumulator + var ferr *internal.FatalError + require.False(t, errors.As(model.Start(&acc), &ferr)) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) +} + +func TestErrorBehaviorRetry(t *testing.T) { + // make sure we can't find nvidia-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &ROCmSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + } + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "amd_rocm_smi", + StartupErrorBehavior: "retry", + }) + require.NoError(t, model.Init()) + + var acc testutil.Accumulator + var ferr *internal.FatalError + require.False(t, errors.As(model.Start(&acc), &ferr)) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) } func TestErrorBehaviorIgnore(t *testing.T) { - // make sure we can't find rocm-smi in $PATH somewhere + // make sure we can't find nvidia-smi in $PATH somewhere os.Unsetenv("PATH") plugin := &ROCmSMI{ - BinPath: "/random/non-existent/path", - Log: &testutil.Logger{}, + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + } + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "amd_rocm_smi", StartupErrorBehavior: "ignore", - } - require.NoError(t, plugin.Init()) - acc := testutil.Accumulator{} - require.NoError(t, plugin.Gather(&acc)) -} + }) + require.NoError(t, model.Init()) -func TestErrorBehaviorInvalidOption(t *testing.T) { - // make sure we can't find rocm-smi in $PATH somewhere - os.Unsetenv("PATH") - plugin := &ROCmSMI{ - BinPath: "/random/non-existent/path", - Log: &testutil.Logger{}, - StartupErrorBehavior: "giveup", - } - require.Error(t, plugin.Init()) + var acc testutil.Accumulator + var ferr *internal.FatalError + require.ErrorAs(t, model.Start(&acc), &ferr) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) } func TestGatherValidJSON(t *testing.T) { diff --git a/plugins/inputs/amd_rocm_smi/sample.conf b/plugins/inputs/amd_rocm_smi/sample.conf index b4fb9ebb1..aed15aae9 100644 --- a/plugins/inputs/amd_rocm_smi/sample.conf +++ b/plugins/inputs/amd_rocm_smi/sample.conf @@ -3,11 +3,5 @@ ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath # bin_path = "/opt/rocm/bin/rocm-smi" - ## Optional: specifies plugin behavior regarding missing rocm-smi binary - ## Available choices: - ## - error: telegraf will return an error on startup - ## - ignore: telegraf will ignore this plugin - # startup_error_behavior = "error" - ## Optional: timeout for GPU polling # timeout = "5s"