chore(inputs.nvidia_smi): Consolidate startup_retry_behavior to model implementation (#15215)

This commit is contained in:
Sven Rebhan 2024-04-24 14:33:33 -04:00 committed by GitHub
parent e8d3fc9efc
commit e7e2d1aeee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 84 additions and 56 deletions

View File

@ -132,7 +132,7 @@ func (r *RunningInput) Start(acc telegraf.Accumulator) error {
// Check if the plugin reports a retry-able error, otherwise we exit. // Check if the plugin reports a retry-able error, otherwise we exit.
var serr *internal.StartupError var serr *internal.StartupError
if !errors.As(err, &serr) || !serr.Retry { if !errors.As(err, &serr) {
return err return err
} }
@ -140,6 +140,9 @@ func (r *RunningInput) Start(acc telegraf.Accumulator) error {
switch r.Config.StartupErrorBehavior { switch r.Config.StartupErrorBehavior {
case "", "error": // fall-trough to return the actual error case "", "error": // fall-trough to return the actual error
case "retry": case "retry":
if !serr.Retry {
return err
}
r.log.Infof("Startup failed: %v; retrying...", err) r.log.Infof("Startup failed: %v; retrying...", err)
return nil return nil
case "ignore": case "ignore":

View File

@ -13,6 +13,18 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins [CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
## Startup error behavior options
In addition to the plugin-specific and global configuration settings the plugin
supports options for specifying the behavior when experiencing startup errors
using the `startup_error_behavior` setting. Available values are:
- `error`: Telegraf will stop and exit in case of startup errors. This is the
default behavior.
- `ignore`: Telegraf will ignore startup errors for this plugin and disable it
but continue processing for all other plugins.
- `retry`: NOT AVAILABLE
## Configuration ## Configuration
```toml @sample.conf ```toml @sample.conf
@ -23,12 +35,6 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned ## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned
# bin_path = "/usr/bin/nvidia-smi" # bin_path = "/usr/bin/nvidia-smi"
## Optional: specifies plugin behavior regarding missing nvidia-smi binary
## Available choices:
## - error: telegraf will return an error on startup
## - ignore: telegraf will ignore this plugin
# startup_error_behavior = "error"
## Optional: timeout for GPU polling ## Optional: timeout for GPU polling
# timeout = "5s" # timeout = "5s"
``` ```

View File

@ -27,10 +27,9 @@ var sampleConfig string
// NvidiaSMI holds the methods for this plugin // NvidiaSMI holds the methods for this plugin
type NvidiaSMI struct { type NvidiaSMI struct {
BinPath string `toml:"bin_path"` BinPath string `toml:"bin_path"`
Timeout config.Duration `toml:"timeout"` Timeout config.Duration `toml:"timeout"`
StartupErrorBehavior string `toml:"startup_error_behavior"` Log telegraf.Logger `toml:"-"`
Log telegraf.Logger `toml:"-"`
ignorePlugin bool ignorePlugin bool
once sync.Once once sync.Once
@ -40,20 +39,11 @@ func (*NvidiaSMI) SampleConfig() string {
return sampleConfig return sampleConfig
} }
func (smi *NvidiaSMI) Init() error { func (smi *NvidiaSMI) Start(telegraf.Accumulator) error {
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) { if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
binPath, err := exec.LookPath("nvidia-smi") binPath, err := exec.LookPath("nvidia-smi")
if err != nil { if err != nil {
switch smi.StartupErrorBehavior { return &internal.StartupError{Err: err}
case "ignore":
smi.ignorePlugin = true
smi.Log.Warnf("nvidia-smi not found on the system, ignoring: %s", err)
return nil
case "", "error":
return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath)
default:
return fmt.Errorf("unknown startup behavior setting: %s", smi.StartupErrorBehavior)
}
} }
smi.BinPath = binPath smi.BinPath = binPath
} }
@ -61,6 +51,8 @@ func (smi *NvidiaSMI) Init() error {
return nil return nil
} }
// Stop is a no-op; its body is empty.
// NOTE(review): presumably present only to pair with Start for the input
// interface the plugin has to satisfy — confirm.
func (smi *NvidiaSMI) Stop() {}
// Gather implements the telegraf interface // Gather implements the telegraf interface
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error { func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
if smi.ignorePlugin { if smi.ignorePlugin {

View File

@ -1,27 +1,19 @@
package nvidia_smi package nvidia_smi
import ( import (
"errors"
"os" "os"
"path/filepath" "path/filepath"
"testing" "testing"
"time" "time"
"github.com/influxdata/telegraf" "github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/models"
"github.com/influxdata/telegraf/testutil" "github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
func TestErrorBehaviorError(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &NvidiaSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
StartupErrorBehavior: "error",
}
require.Error(t, plugin.Init())
}
func TestErrorBehaviorDefault(t *testing.T) { func TestErrorBehaviorDefault(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere // make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH") os.Unsetenv("PATH")
@ -29,31 +21,72 @@ func TestErrorBehaviorDefault(t *testing.T) {
BinPath: "/random/non-existent/path", BinPath: "/random/non-existent/path",
Log: &testutil.Logger{}, Log: &testutil.Logger{},
} }
require.Error(t, plugin.Init()) model := models.NewRunningInput(plugin, &models.InputConfig{
Name: "nvidia_smi",
})
require.NoError(t, model.Init())
var acc testutil.Accumulator
var ferr *internal.FatalError
require.False(t, errors.As(model.Start(&acc), &ferr))
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
}
// TestErrorBehaviorError runs the plugin through a models.RunningInput with
// StartupErrorBehavior set to "error" while nvidia-smi cannot be located.
func TestErrorBehaviorError(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &NvidiaSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
}
// The startup behavior is configured on the model config, not on the plugin.
model := models.NewRunningInput(plugin, &models.InputConfig{
Name: "nvidia_smi",
StartupErrorBehavior: "error",
})
require.NoError(t, model.Init())
var acc testutil.Accumulator
var ferr *internal.FatalError
// The result of Start must not be (or wrap) an *internal.FatalError.
require.False(t, errors.As(model.Start(&acc), &ferr))
// Gather on the model is expected to report internal.ErrNotConnected.
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
}
func TestErrorBehaviorRetry(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &NvidiaSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
}
model := models.NewRunningInput(plugin, &models.InputConfig{
Name: "nvidia_smi",
StartupErrorBehavior: "retry",
})
require.NoError(t, model.Init())
var acc testutil.Accumulator
var ferr *internal.FatalError
require.False(t, errors.As(model.Start(&acc), &ferr))
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
} }
func TestErrorBehaviorIgnore(t *testing.T) { func TestErrorBehaviorIgnore(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere // make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH") os.Unsetenv("PATH")
plugin := &NvidiaSMI{ plugin := &NvidiaSMI{
BinPath: "/random/non-existent/path", BinPath: "/random/non-existent/path",
Log: &testutil.Logger{}, Log: &testutil.Logger{},
}
model := models.NewRunningInput(plugin, &models.InputConfig{
Name: "nvidia_smi",
StartupErrorBehavior: "ignore", StartupErrorBehavior: "ignore",
} })
require.NoError(t, plugin.Init()) require.NoError(t, model.Init())
acc := testutil.Accumulator{}
require.NoError(t, plugin.Gather(&acc))
}
func TestErrorBehaviorInvalidOption(t *testing.T) { var acc testutil.Accumulator
// make sure we can't find nvidia-smi in $PATH somewhere var ferr *internal.FatalError
os.Unsetenv("PATH") require.ErrorAs(t, model.Start(&acc), &ferr)
plugin := &NvidiaSMI{ require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
StartupErrorBehavior: "giveup",
}
require.Error(t, plugin.Init())
} }
func TestGatherValidXML(t *testing.T) { func TestGatherValidXML(t *testing.T) {

View File

@ -5,11 +5,5 @@
## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned ## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned
# bin_path = "/usr/bin/nvidia-smi" # bin_path = "/usr/bin/nvidia-smi"
## Optional: specifies plugin behavior regarding missing nvidia-smi binary
## Available choices:
## - error: telegraf will return an error on startup
## - ignore: telegraf will ignore this plugin
# startup_error_behavior = "error"
## Optional: timeout for GPU polling ## Optional: timeout for GPU polling
# timeout = "5s" # timeout = "5s"