diff --git a/models/running_input.go b/models/running_input.go index 0b2197f05..865c784e8 100644 --- a/models/running_input.go +++ b/models/running_input.go @@ -132,7 +132,7 @@ func (r *RunningInput) Start(acc telegraf.Accumulator) error { // Check if the plugin reports a retry-able error, otherwise we exit. var serr *internal.StartupError - if !errors.As(err, &serr) || !serr.Retry { + if !errors.As(err, &serr) { return err } @@ -140,6 +140,9 @@ func (r *RunningInput) Start(acc telegraf.Accumulator) error { switch r.Config.StartupErrorBehavior { case "", "error": // fall-trough to return the actual error case "retry": + if !serr.Retry { + return err + } r.log.Infof("Startup failed: %v; retrying...", err) return nil case "ignore": diff --git a/plugins/inputs/nvidia_smi/README.md b/plugins/inputs/nvidia_smi/README.md index 58dabcf79..c493ff357 100644 --- a/plugins/inputs/nvidia_smi/README.md +++ b/plugins/inputs/nvidia_smi/README.md @@ -13,6 +13,18 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. [CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins +## Startup error behavior options + +In addition to the plugin-specific and global configuration settings the plugin +supports options for specifying the behavior when experiencing startup errors +using the `startup_error_behavior` setting. Available values are: + +- `error`: Telegraf with stop and exit in case of startup errors. This is the + default behavior. +- `ignore`: Telegraf will ignore startup errors for this plugin and disables it + but continues processing for all other plugins. +- `retry`: NOT AVAILABLE + ## Configuration ```toml @sample.conf @@ -23,12 +35,6 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. ## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned # bin_path = "/usr/bin/nvidia-smi" - ## Optional: specifies plugin behavior regarding missing nvidia-smi binary - ## Available choices: - ## - error: telegraf will return an error on startup - ## - ignore: telegraf will ignore this plugin - # startup_error_behavior = "error" - ## Optional: timeout for GPU polling # timeout = "5s" ``` diff --git a/plugins/inputs/nvidia_smi/nvidia_smi.go b/plugins/inputs/nvidia_smi/nvidia_smi.go index 78953678a..695b8c6f6 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi.go @@ -27,10 +27,9 @@ var sampleConfig string // NvidiaSMI holds the methods for this plugin type NvidiaSMI struct { - BinPath string `toml:"bin_path"` - Timeout config.Duration `toml:"timeout"` - StartupErrorBehavior string `toml:"startup_error_behavior"` - Log telegraf.Logger `toml:"-"` + BinPath string `toml:"bin_path"` + Timeout config.Duration `toml:"timeout"` + Log telegraf.Logger `toml:"-"` ignorePlugin bool once sync.Once @@ -40,20 +39,11 @@ func (*NvidiaSMI) SampleConfig() string { return sampleConfig } -func (smi *NvidiaSMI) Init() error { +func (smi *NvidiaSMI) Start(telegraf.Accumulator) error { if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) { binPath, err := exec.LookPath("nvidia-smi") if err != nil { - switch smi.StartupErrorBehavior { - case "ignore": - smi.ignorePlugin = true - smi.Log.Warnf("nvidia-smi not found on the system, ignoring: %s", err) - return nil - case "", "error": - return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath) - default: - return fmt.Errorf("unknown startup behavior setting: %s", smi.StartupErrorBehavior) - } + return &internal.StartupError{Err: err} } smi.BinPath = binPath } @@ -61,6 +51,8 @@ func (smi *NvidiaSMI) Init() error { return nil } +func (smi *NvidiaSMI) Stop() {} + // Gather implements the telegraf interface func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error { if smi.ignorePlugin { diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go index 578f1da9f..1478b6c33 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi_test.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go @@ -1,27 +1,19 @@ package nvidia_smi import ( + "errors" "os" "path/filepath" "testing" "time" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" + "github.com/influxdata/telegraf/models" "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/require" ) -func TestErrorBehaviorError(t *testing.T) { - // make sure we can't find nvidia-smi in $PATH somewhere - os.Unsetenv("PATH") - plugin := &NvidiaSMI{ - BinPath: "/random/non-existent/path", - Log: &testutil.Logger{}, - StartupErrorBehavior: "error", - } - require.Error(t, plugin.Init()) -} - func TestErrorBehaviorDefault(t *testing.T) { // make sure we can't find nvidia-smi in $PATH somewhere os.Unsetenv("PATH") @@ -29,31 +21,72 @@ func TestErrorBehaviorDefault(t *testing.T) { BinPath: "/random/non-existent/path", Log: &testutil.Logger{}, } - require.Error(t, plugin.Init()) + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "nvidia_smi", + }) + require.NoError(t, model.Init()) + + var acc testutil.Accumulator + var ferr *internal.FatalError + require.False(t, errors.As(model.Start(&acc), &ferr)) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) +} + +func TestErrorBehaviorError(t *testing.T) { + // make sure we can't find nvidia-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &NvidiaSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + } + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "nvidia_smi", + StartupErrorBehavior: "error", + }) + require.NoError(t, model.Init()) + + var acc testutil.Accumulator + var ferr *internal.FatalError + require.False(t, errors.As(model.Start(&acc), &ferr)) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) +} + +func TestErrorBehaviorRetry(t *testing.T) { + // make sure we can't find nvidia-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &NvidiaSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + } + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "nvidia_smi", + StartupErrorBehavior: "retry", + }) + require.NoError(t, model.Init()) + + var acc testutil.Accumulator + var ferr *internal.FatalError + require.False(t, errors.As(model.Start(&acc), &ferr)) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) } func TestErrorBehaviorIgnore(t *testing.T) { // make sure we can't find nvidia-smi in $PATH somewhere os.Unsetenv("PATH") plugin := &NvidiaSMI{ - BinPath: "/random/non-existent/path", - Log: &testutil.Logger{}, + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + } + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "nvidia_smi", StartupErrorBehavior: "ignore", - } - require.NoError(t, plugin.Init()) - acc := testutil.Accumulator{} - require.NoError(t, plugin.Gather(&acc)) -} + }) + require.NoError(t, model.Init()) -func TestErrorBehaviorInvalidOption(t *testing.T) { - // make sure we can't find nvidia-smi in $PATH somewhere - os.Unsetenv("PATH") - plugin := &NvidiaSMI{ - BinPath: "/random/non-existent/path", - Log: &testutil.Logger{}, - StartupErrorBehavior: "giveup", - } - require.Error(t, plugin.Init()) + var acc testutil.Accumulator + var ferr *internal.FatalError + require.ErrorAs(t, model.Start(&acc), &ferr) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) } func TestGatherValidXML(t *testing.T) { diff --git a/plugins/inputs/nvidia_smi/sample.conf b/plugins/inputs/nvidia_smi/sample.conf index dee34936d..8879b3923 100644 --- a/plugins/inputs/nvidia_smi/sample.conf +++ b/plugins/inputs/nvidia_smi/sample.conf @@ -5,11 +5,5 @@ ## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned # bin_path = "/usr/bin/nvidia-smi" - ## Optional: specifies plugin behavior regarding missing nvidia-smi binary - ## Available choices: - ## - error: telegraf will return an error on startup - ## - ignore: telegraf will ignore this plugin - # startup_error_behavior = "error" - ## Optional: timeout for GPU polling # timeout = "5s"