feat(inputs.nvidia_smi): Add startup_error_behavior config option (#14680)

This commit is contained in:
Serguei Mokhov 2024-02-15 04:23:05 -05:00 committed by GitHub
parent 75efce9201
commit 2c815e4d8a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 77 additions and 6 deletions

View File

@ -23,6 +23,12 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned
# bin_path = "/usr/bin/nvidia-smi"
## Optional: specifies plugin behavior regarding missing nvidia-smi binary
## Available choices:
## - error: telegraf will return an error on startup
## - ignore: telegraf will ignore this plugin
# startup_error_behavior = "error"
## Optional: timeout for GPU polling
# timeout = "5s"
```

View File

@ -27,11 +27,13 @@ var sampleConfig string
// NvidiaSMI holds the configuration and runtime state of the nvidia-smi
// input plugin.
//
// NOTE(review): this hunk is rendered without +/- markers, so the pre-change
// field lines and their post-change replacements both appear below; the
// comments are attached to the post-change lines.
type NvidiaSMI struct {
BinPath string `toml:"bin_path"`
Timeout config.Duration `toml:"timeout"`
Log telegraf.Logger `toml:"-"`
// BinPath is the location of the nvidia-smi binary. Init stats this path and,
// if it does not exist, falls back to exec.LookPath("nvidia-smi").
BinPath string `toml:"bin_path"`
// Timeout is converted to a time.Duration and passed to
// internal.CombinedOutputTimeout when Gather runs the binary.
Timeout config.Duration `toml:"timeout"`
// StartupErrorBehavior selects what Init does when the binary cannot be found:
// "error" (or empty) returns an error, "ignore" logs a warning and disables
// the plugin, and any other value is rejected as an unknown setting.
StartupErrorBehavior string `toml:"startup_error_behavior"`
// Log is the plugin logger; the "-" tag excludes it from the TOML config.
Log telegraf.Logger `toml:"-"`
once sync.Once
// ignorePlugin is set by Init when the binary is missing and
// StartupErrorBehavior is "ignore"; Gather returns nil immediately when set.
ignorePlugin bool
// once: usage is not visible in this excerpt.
once sync.Once
}
func (*NvidiaSMI) SampleConfig() string {
@ -41,9 +43,17 @@ func (*NvidiaSMI) SampleConfig() string {
func (smi *NvidiaSMI) Init() error {
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
binPath, err := exec.LookPath("nvidia-smi")
// fail-fast
if err != nil {
return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath)
switch smi.StartupErrorBehavior {
case "ignore":
smi.ignorePlugin = true
smi.Log.Warnf("nvidia-smi not found on the system, ignoring: %s", err)
return nil
case "", "error":
return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath)
default:
return fmt.Errorf("unknown startup behavior setting: %s", smi.StartupErrorBehavior)
}
}
smi.BinPath = binPath
}
@ -53,6 +63,10 @@ func (smi *NvidiaSMI) Init() error {
// Gather implements the telegraf interface
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
if smi.ignorePlugin {
return nil
}
// Construct and execute metrics query
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout))
if err != nil {

View File

@ -11,6 +11,51 @@ import (
"github.com/stretchr/testify/require"
)
func TestErrorBehaviorError(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &NvidiaSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
StartupErrorBehavior: "error",
}
require.Error(t, plugin.Init())
}
func TestErrorBehaviorDefault(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &NvidiaSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
}
require.Error(t, plugin.Init())
}
func TestErorBehaviorIgnore(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &NvidiaSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
StartupErrorBehavior: "ignore",
}
require.NoError(t, plugin.Init())
acc := testutil.Accumulator{}
require.NoError(t, plugin.Gather(&acc))
}
func TestErrorBehaviorInvalidOption(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &NvidiaSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
StartupErrorBehavior: "giveup",
}
require.Error(t, plugin.Init())
}
func TestGatherValidXML(t *testing.T) {
tests := []struct {
name string

View File

@ -5,5 +5,11 @@
## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned
# bin_path = "/usr/bin/nvidia-smi"
## Optional: specifies plugin behavior regarding missing nvidia-smi binary
## Available choices:
## - error: telegraf will return an error on startup
## - ignore: telegraf will ignore this plugin
# startup_error_behavior = "error"
## Optional: timeout for GPU polling
# timeout = "5s"