chore(inputs.nvidia_smi): Consolidate startup_retry_behavior to model implementation (#15215)
This commit is contained in:
parent
e8d3fc9efc
commit
e7e2d1aeee
|
|
@ -132,7 +132,7 @@ func (r *RunningInput) Start(acc telegraf.Accumulator) error {
|
||||||
|
|
||||||
// Check if the plugin reports a retry-able error, otherwise we exit.
|
// Check if the plugin reports a retry-able error, otherwise we exit.
|
||||||
var serr *internal.StartupError
|
var serr *internal.StartupError
|
||||||
if !errors.As(err, &serr) || !serr.Retry {
|
if !errors.As(err, &serr) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -140,6 +140,9 @@ func (r *RunningInput) Start(acc telegraf.Accumulator) error {
|
||||||
switch r.Config.StartupErrorBehavior {
|
switch r.Config.StartupErrorBehavior {
|
||||||
case "", "error": // fall-through to return the actual error
|
case "", "error": // fall-through to return the actual error
|
||||||
case "retry":
|
case "retry":
|
||||||
|
if !serr.Retry {
|
||||||
|
return err
|
||||||
|
}
|
||||||
r.log.Infof("Startup failed: %v; retrying...", err)
|
r.log.Infof("Startup failed: %v; retrying...", err)
|
||||||
return nil
|
return nil
|
||||||
case "ignore":
|
case "ignore":
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,18 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
||||||
|
|
||||||
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
|
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
|
||||||
|
|
||||||
|
## Startup error behavior options
|
||||||
|
|
||||||
|
In addition to the plugin-specific and global configuration settings the plugin
|
||||||
|
supports options for specifying the behavior when experiencing startup errors
|
||||||
|
using the `startup_error_behavior` setting. Available values are:
|
||||||
|
|
||||||
|
- `error`: Telegraf will stop and exit in case of startup errors. This is the
|
||||||
|
default behavior.
|
||||||
|
- `ignore`: Telegraf will ignore startup errors for this plugin and disable it
|
||||||
|
but continues processing for all other plugins.
|
||||||
|
- `retry`: NOT AVAILABLE
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
```toml @sample.conf
|
```toml @sample.conf
|
||||||
|
|
@ -23,12 +35,6 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
||||||
## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned
|
## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned
|
||||||
# bin_path = "/usr/bin/nvidia-smi"
|
# bin_path = "/usr/bin/nvidia-smi"
|
||||||
|
|
||||||
## Optional: specifies plugin behavior regarding missing nvidia-smi binary
|
|
||||||
## Available choices:
|
|
||||||
## - error: telegraf will return an error on startup
|
|
||||||
## - ignore: telegraf will ignore this plugin
|
|
||||||
# startup_error_behavior = "error"
|
|
||||||
|
|
||||||
## Optional: timeout for GPU polling
|
## Optional: timeout for GPU polling
|
||||||
# timeout = "5s"
|
# timeout = "5s"
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -27,10 +27,9 @@ var sampleConfig string
|
||||||
|
|
||||||
// NvidiaSMI holds the methods for this plugin
|
// NvidiaSMI holds the methods for this plugin
|
||||||
type NvidiaSMI struct {
|
type NvidiaSMI struct {
|
||||||
BinPath string `toml:"bin_path"`
|
BinPath string `toml:"bin_path"`
|
||||||
Timeout config.Duration `toml:"timeout"`
|
Timeout config.Duration `toml:"timeout"`
|
||||||
StartupErrorBehavior string `toml:"startup_error_behavior"`
|
Log telegraf.Logger `toml:"-"`
|
||||||
Log telegraf.Logger `toml:"-"`
|
|
||||||
|
|
||||||
ignorePlugin bool
|
ignorePlugin bool
|
||||||
once sync.Once
|
once sync.Once
|
||||||
|
|
@ -40,20 +39,11 @@ func (*NvidiaSMI) SampleConfig() string {
|
||||||
return sampleConfig
|
return sampleConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
func (smi *NvidiaSMI) Init() error {
|
func (smi *NvidiaSMI) Start(telegraf.Accumulator) error {
|
||||||
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
||||||
binPath, err := exec.LookPath("nvidia-smi")
|
binPath, err := exec.LookPath("nvidia-smi")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
switch smi.StartupErrorBehavior {
|
return &internal.StartupError{Err: err}
|
||||||
case "ignore":
|
|
||||||
smi.ignorePlugin = true
|
|
||||||
smi.Log.Warnf("nvidia-smi not found on the system, ignoring: %s", err)
|
|
||||||
return nil
|
|
||||||
case "", "error":
|
|
||||||
return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath)
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("unknown startup behavior setting: %s", smi.StartupErrorBehavior)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
smi.BinPath = binPath
|
smi.BinPath = binPath
|
||||||
}
|
}
|
||||||
|
|
@ -61,6 +51,8 @@ func (smi *NvidiaSMI) Init() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (smi *NvidiaSMI) Stop() {}
|
||||||
|
|
||||||
// Gather implements the telegraf interface
|
// Gather implements the telegraf interface
|
||||||
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
||||||
if smi.ignorePlugin {
|
if smi.ignorePlugin {
|
||||||
|
|
|
||||||
|
|
@ -1,27 +1,19 @@
|
||||||
package nvidia_smi
|
package nvidia_smi
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/influxdata/telegraf"
|
"github.com/influxdata/telegraf"
|
||||||
|
"github.com/influxdata/telegraf/internal"
|
||||||
|
"github.com/influxdata/telegraf/models"
|
||||||
"github.com/influxdata/telegraf/testutil"
|
"github.com/influxdata/telegraf/testutil"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestErrorBehaviorError(t *testing.T) {
|
|
||||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
|
||||||
os.Unsetenv("PATH")
|
|
||||||
plugin := &NvidiaSMI{
|
|
||||||
BinPath: "/random/non-existent/path",
|
|
||||||
Log: &testutil.Logger{},
|
|
||||||
StartupErrorBehavior: "error",
|
|
||||||
}
|
|
||||||
require.Error(t, plugin.Init())
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestErrorBehaviorDefault(t *testing.T) {
|
func TestErrorBehaviorDefault(t *testing.T) {
|
||||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||||
os.Unsetenv("PATH")
|
os.Unsetenv("PATH")
|
||||||
|
|
@ -29,31 +21,72 @@ func TestErrorBehaviorDefault(t *testing.T) {
|
||||||
BinPath: "/random/non-existent/path",
|
BinPath: "/random/non-existent/path",
|
||||||
Log: &testutil.Logger{},
|
Log: &testutil.Logger{},
|
||||||
}
|
}
|
||||||
require.Error(t, plugin.Init())
|
model := models.NewRunningInput(plugin, &models.InputConfig{
|
||||||
|
Name: "nvidia_smi",
|
||||||
|
})
|
||||||
|
require.NoError(t, model.Init())
|
||||||
|
|
||||||
|
var acc testutil.Accumulator
|
||||||
|
var ferr *internal.FatalError
|
||||||
|
require.False(t, errors.As(model.Start(&acc), &ferr))
|
||||||
|
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestErrorBehaviorError(t *testing.T) {
|
||||||
|
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||||
|
os.Unsetenv("PATH")
|
||||||
|
plugin := &NvidiaSMI{
|
||||||
|
BinPath: "/random/non-existent/path",
|
||||||
|
Log: &testutil.Logger{},
|
||||||
|
}
|
||||||
|
model := models.NewRunningInput(plugin, &models.InputConfig{
|
||||||
|
Name: "nvidia_smi",
|
||||||
|
StartupErrorBehavior: "error",
|
||||||
|
})
|
||||||
|
require.NoError(t, model.Init())
|
||||||
|
|
||||||
|
var acc testutil.Accumulator
|
||||||
|
var ferr *internal.FatalError
|
||||||
|
require.False(t, errors.As(model.Start(&acc), &ferr))
|
||||||
|
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestErrorBehaviorRetry(t *testing.T) {
|
||||||
|
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||||
|
os.Unsetenv("PATH")
|
||||||
|
plugin := &NvidiaSMI{
|
||||||
|
BinPath: "/random/non-existent/path",
|
||||||
|
Log: &testutil.Logger{},
|
||||||
|
}
|
||||||
|
model := models.NewRunningInput(plugin, &models.InputConfig{
|
||||||
|
Name: "nvidia_smi",
|
||||||
|
StartupErrorBehavior: "retry",
|
||||||
|
})
|
||||||
|
require.NoError(t, model.Init())
|
||||||
|
|
||||||
|
var acc testutil.Accumulator
|
||||||
|
var ferr *internal.FatalError
|
||||||
|
require.False(t, errors.As(model.Start(&acc), &ferr))
|
||||||
|
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestErrorBehaviorIgnore(t *testing.T) {
|
func TestErrorBehaviorIgnore(t *testing.T) {
|
||||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||||
os.Unsetenv("PATH")
|
os.Unsetenv("PATH")
|
||||||
plugin := &NvidiaSMI{
|
plugin := &NvidiaSMI{
|
||||||
BinPath: "/random/non-existent/path",
|
BinPath: "/random/non-existent/path",
|
||||||
Log: &testutil.Logger{},
|
Log: &testutil.Logger{},
|
||||||
|
}
|
||||||
|
model := models.NewRunningInput(plugin, &models.InputConfig{
|
||||||
|
Name: "nvidia_smi",
|
||||||
StartupErrorBehavior: "ignore",
|
StartupErrorBehavior: "ignore",
|
||||||
}
|
})
|
||||||
require.NoError(t, plugin.Init())
|
require.NoError(t, model.Init())
|
||||||
acc := testutil.Accumulator{}
|
|
||||||
require.NoError(t, plugin.Gather(&acc))
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestErrorBehaviorInvalidOption(t *testing.T) {
|
var acc testutil.Accumulator
|
||||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
var ferr *internal.FatalError
|
||||||
os.Unsetenv("PATH")
|
require.ErrorAs(t, model.Start(&acc), &ferr)
|
||||||
plugin := &NvidiaSMI{
|
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
||||||
BinPath: "/random/non-existent/path",
|
|
||||||
Log: &testutil.Logger{},
|
|
||||||
StartupErrorBehavior: "giveup",
|
|
||||||
}
|
|
||||||
require.Error(t, plugin.Init())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestGatherValidXML(t *testing.T) {
|
func TestGatherValidXML(t *testing.T) {
|
||||||
|
|
|
||||||
|
|
@ -5,11 +5,5 @@
|
||||||
## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned
|
## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned
|
||||||
# bin_path = "/usr/bin/nvidia-smi"
|
# bin_path = "/usr/bin/nvidia-smi"
|
||||||
|
|
||||||
## Optional: specifies plugin behavior regarding missing nvidia-smi binary
|
|
||||||
## Available choices:
|
|
||||||
## - error: telegraf will return an error on startup
|
|
||||||
## - ignore: telegraf will ignore this plugin
|
|
||||||
# startup_error_behavior = "error"
|
|
||||||
|
|
||||||
## Optional: timeout for GPU polling
|
## Optional: timeout for GPU polling
|
||||||
# timeout = "5s"
|
# timeout = "5s"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue