feat(inputs.nvidia_smi): Add startup_error_behavior config option (#14680)
This commit is contained in:
parent
75efce9201
commit
2c815e4d8a
|
|
@ -23,6 +23,12 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
|||
## if it is not found, we will try to locate it on PATH (exec.LookPath); if it is still not found, an error will be returned unless startup_error_behavior is set to "ignore"
|
||||
# bin_path = "/usr/bin/nvidia-smi"
|
||||
|
||||
## Optional: specifies plugin behavior regarding missing nvidia-smi binary
|
||||
## Available choices:
|
||||
## - error: telegraf will return an error on startup
|
||||
## - ignore: telegraf will ignore this plugin
|
||||
# startup_error_behavior = "error"
|
||||
|
||||
## Optional: timeout for GPU polling
|
||||
# timeout = "5s"
|
||||
```
|
||||
|
|
|
|||
|
|
@ -27,11 +27,13 @@ var sampleConfig string
|
|||
|
||||
// NvidiaSMI holds the methods for this plugin
|
||||
type NvidiaSMI struct {
|
||||
BinPath string `toml:"bin_path"`
|
||||
Timeout config.Duration `toml:"timeout"`
|
||||
Log telegraf.Logger `toml:"-"`
|
||||
BinPath string `toml:"bin_path"`
|
||||
Timeout config.Duration `toml:"timeout"`
|
||||
StartupErrorBehavior string `toml:"startup_error_behavior"`
|
||||
Log telegraf.Logger `toml:"-"`
|
||||
|
||||
once sync.Once
|
||||
ignorePlugin bool
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
func (*NvidiaSMI) SampleConfig() string {
|
||||
|
|
@ -41,9 +43,17 @@ func (*NvidiaSMI) SampleConfig() string {
|
|||
func (smi *NvidiaSMI) Init() error {
|
||||
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
||||
binPath, err := exec.LookPath("nvidia-smi")
|
||||
// fail-fast
|
||||
if err != nil {
|
||||
return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath)
|
||||
switch smi.StartupErrorBehavior {
|
||||
case "ignore":
|
||||
smi.ignorePlugin = true
|
||||
smi.Log.Warnf("nvidia-smi not found on the system, ignoring: %s", err)
|
||||
return nil
|
||||
case "", "error":
|
||||
return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath)
|
||||
default:
|
||||
return fmt.Errorf("unknown startup behavior setting: %s", smi.StartupErrorBehavior)
|
||||
}
|
||||
}
|
||||
smi.BinPath = binPath
|
||||
}
|
||||
|
|
@ -53,6 +63,10 @@ func (smi *NvidiaSMI) Init() error {
|
|||
|
||||
// Gather implements the telegraf interface
|
||||
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
||||
if smi.ignorePlugin {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Construct and execute metrics query
|
||||
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout))
|
||||
if err != nil {
|
||||
|
|
|
|||
|
|
@ -11,6 +11,51 @@ import (
|
|||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestErrorBehaviorError(t *testing.T) {
|
||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &NvidiaSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
StartupErrorBehavior: "error",
|
||||
}
|
||||
require.Error(t, plugin.Init())
|
||||
}
|
||||
|
||||
func TestErrorBehaviorDefault(t *testing.T) {
|
||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &NvidiaSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
}
|
||||
require.Error(t, plugin.Init())
|
||||
}
|
||||
|
||||
func TestErorBehaviorIgnore(t *testing.T) {
|
||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &NvidiaSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
StartupErrorBehavior: "ignore",
|
||||
}
|
||||
require.NoError(t, plugin.Init())
|
||||
acc := testutil.Accumulator{}
|
||||
require.NoError(t, plugin.Gather(&acc))
|
||||
}
|
||||
|
||||
func TestErrorBehaviorInvalidOption(t *testing.T) {
|
||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &NvidiaSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
StartupErrorBehavior: "giveup",
|
||||
}
|
||||
require.Error(t, plugin.Init())
|
||||
}
|
||||
|
||||
func TestGatherValidXML(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
|
|
|
|||
|
|
@ -5,5 +5,11 @@
|
|||
## if it is not found, we will try to locate it on PATH (exec.LookPath); if it is still not found, an error will be returned unless startup_error_behavior is set to "ignore"
|
||||
# bin_path = "/usr/bin/nvidia-smi"
|
||||
|
||||
## Optional: specifies plugin behavior regarding missing nvidia-smi binary
|
||||
## Available choices:
|
||||
## - error: telegraf will return an error on startup
|
||||
## - ignore: telegraf will ignore this plugin
|
||||
# startup_error_behavior = "error"
|
||||
|
||||
## Optional: timeout for GPU polling
|
||||
# timeout = "5s"
|
||||
|
|
|
|||
Loading…
Reference in New Issue