From 2c815e4d8a01deeb2c181ec62cfa876930716482 Mon Sep 17 00:00:00 2001 From: Serguei Mokhov Date: Thu, 15 Feb 2024 04:23:05 -0500 Subject: [PATCH] feat(inputs.nvidia_smi): Add startup_error_behavior config option (#14680) --- plugins/inputs/nvidia_smi/README.md | 6 +++ plugins/inputs/nvidia_smi/nvidia_smi.go | 26 ++++++++--- plugins/inputs/nvidia_smi/nvidia_smi_test.go | 45 ++++++++++++++++++++ plugins/inputs/nvidia_smi/sample.conf | 6 +++ 4 files changed, 77 insertions(+), 6 deletions(-) diff --git a/plugins/inputs/nvidia_smi/README.md b/plugins/inputs/nvidia_smi/README.md index 8e2881c25..58dabcf79 100644 --- a/plugins/inputs/nvidia_smi/README.md +++ b/plugins/inputs/nvidia_smi/README.md @@ -23,6 +23,12 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. ## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned # bin_path = "/usr/bin/nvidia-smi" + ## Optional: specifies plugin behavior regarding missing nvidia-smi binary + ## Available choices: + ## - error: telegraf will return an error on startup + ## - ignore: telegraf will ignore this plugin + # startup_error_behavior = "error" + ## Optional: timeout for GPU polling # timeout = "5s" ``` diff --git a/plugins/inputs/nvidia_smi/nvidia_smi.go b/plugins/inputs/nvidia_smi/nvidia_smi.go index 45e8a2c2c..78953678a 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi.go @@ -27,11 +27,13 @@ var sampleConfig string // NvidiaSMI holds the methods for this plugin type NvidiaSMI struct { - BinPath string `toml:"bin_path"` - Timeout config.Duration `toml:"timeout"` - Log telegraf.Logger `toml:"-"` + BinPath string `toml:"bin_path"` + Timeout config.Duration `toml:"timeout"` + StartupErrorBehavior string `toml:"startup_error_behavior"` + Log telegraf.Logger `toml:"-"` - once sync.Once + ignorePlugin bool + once sync.Once } func (*NvidiaSMI) SampleConfig() string { @@ -41,9 +43,17 @@ func (*NvidiaSMI) 
SampleConfig() string { func (smi *NvidiaSMI) Init() error { if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) { binPath, err := exec.LookPath("nvidia-smi") - // fail-fast if err != nil { - return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath) + switch smi.StartupErrorBehavior { + case "ignore": + smi.ignorePlugin = true + smi.Log.Warnf("nvidia-smi not found on the system, ignoring: %s", err) + return nil + case "", "error": + return fmt.Errorf("nvidia-smi not found in %q and not in PATH; please make sure nvidia-smi is installed and/or is in PATH", smi.BinPath) + default: + return fmt.Errorf("unknown startup behavior setting: %s", smi.StartupErrorBehavior) + } } smi.BinPath = binPath } @@ -53,6 +63,10 @@ func (smi *NvidiaSMI) Init() error { // Gather implements the telegraf interface func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error { + if smi.ignorePlugin { + return nil + } + // Construct and execute metrics query data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout)) if err != nil { diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go index 039e73900..f08d6fb41 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi_test.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go @@ -11,6 +11,51 @@ import ( "github.com/stretchr/testify/require" ) +func TestErrorBehaviorError(t *testing.T) { + // make sure we can't find nvidia-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &NvidiaSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + StartupErrorBehavior: "error", + } + require.Error(t, plugin.Init()) +} + +func TestErrorBehaviorDefault(t *testing.T) { + // make sure we can't find nvidia-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &NvidiaSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + } + require.Error(t, 
plugin.Init()) +} + +func TestErrorBehaviorIgnore(t *testing.T) { + // make sure we can't find nvidia-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &NvidiaSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + StartupErrorBehavior: "ignore", + } + require.NoError(t, plugin.Init()) + acc := testutil.Accumulator{} + require.NoError(t, plugin.Gather(&acc)) +} + +func TestErrorBehaviorInvalidOption(t *testing.T) { + // make sure we can't find nvidia-smi in $PATH somewhere + os.Unsetenv("PATH") + plugin := &NvidiaSMI{ + BinPath: "/random/non-existent/path", + Log: &testutil.Logger{}, + StartupErrorBehavior: "giveup", + } + require.Error(t, plugin.Init()) +} + func TestGatherValidXML(t *testing.T) { tests := []struct { name string diff --git a/plugins/inputs/nvidia_smi/sample.conf b/plugins/inputs/nvidia_smi/sample.conf index 8879b3923..dee34936d 100644 --- a/plugins/inputs/nvidia_smi/sample.conf +++ b/plugins/inputs/nvidia_smi/sample.conf @@ -5,5 +5,11 @@ ## if it is not found, we will try to locate it on PATH(exec.LookPath), if it is still not found, an error will be returned # bin_path = "/usr/bin/nvidia-smi" + ## Optional: specifies plugin behavior regarding missing nvidia-smi binary + ## Available choices: + ## - error: telegraf will return an error on startup + ## - ignore: telegraf will ignore this plugin + # startup_error_behavior = "error" + ## Optional: timeout for GPU polling # timeout = "5s"