feat(inputs.nvidia-smi): Add `probe_on_startup` option (#15916)

This commit is contained in:
Landon Clipp 2024-10-03 11:51:54 -05:00 committed by GitHub
parent bcbecb03f1
commit b6e59aac59
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 77 additions and 8 deletions

View File

@ -37,6 +37,12 @@ using the `startup_error_behavior` setting. Available values are:
## Optional: timeout for GPU polling
# timeout = "5s"
## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi cannot be
## executed, times out, or returns a non-zero exit code, the plugin fails to
## start with an error. This is particularly useful in conjunction with
## `startup_error_behavior`, e.g. to allow the plugin to be disabled if
## nvidia-smi cannot run successfully.
# probe_on_startup = false
```
### Linux

View File

@ -27,12 +27,14 @@ var sampleConfig string
// NvidiaSMI holds the user-facing settings (TOML-tagged fields) and the
// internal state of the nvidia-smi input plugin.
type NvidiaSMI struct {
BinPath string `toml:"bin_path"`
Timeout config.Duration `toml:"timeout"`
Log telegraf.Logger `toml:"-"`
// BinPath is the executable that Start (when probing) and Gather run.
BinPath string `toml:"bin_path"`
// Timeout is handed to internal.CombinedOutputTimeout for every invocation of BinPath.
Timeout config.Duration `toml:"timeout"`
// ProbeOnStartup makes Start run BinPath once and return an
// internal.StartupError if that invocation reports an error.
ProbeOnStartup bool `toml:"probe_on_startup"`
// Log is excluded from the TOML configuration (tag "-").
Log telegraf.Logger `toml:"-"`
ignorePlugin bool
once sync.Once
ignorePlugin bool
once sync.Once
// nvidiaSMIArgs are the command-line arguments passed to BinPath; the
// plugin factory sets them to "-q", "-x" and tests replace them.
nvidiaSMIArgs []string
}
func (*NvidiaSMI) SampleConfig() string {
@ -47,6 +49,11 @@ func (smi *NvidiaSMI) Start(telegraf.Accumulator) error {
}
smi.BinPath = binPath
}
if smi.ProbeOnStartup {
if _, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)); err != nil {
return &internal.StartupError{Err: err}
}
}
return nil
}
@ -60,7 +67,7 @@ func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
}
// Construct and execute metrics query
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout))
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout))
if err != nil {
return fmt.Errorf("calling %q failed: %w", smi.BinPath, err)
}
@ -119,8 +126,9 @@ func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error {
// init registers the plugin under the name "nvidia_smi" together with a
// factory that returns an instance pre-populated with default settings.
func init() {
inputs.Add("nvidia_smi", func() telegraf.Input {
return &NvidiaSMI{
BinPath: "/usr/bin/nvidia-smi",
Timeout: config.Duration(5 * time.Second),
// Defaults: binary location, a 5 second timeout, and the "-q -x"
// arguments used for both the startup probe and Gather.
BinPath: "/usr/bin/nvidia-smi",
Timeout: config.Duration(5 * time.Second),
nvidiaSMIArgs: []string{"-q", "-x"},
}
})
}

View File

@ -4,16 +4,65 @@ import (
"errors"
"os"
"path/filepath"
"runtime"
"testing"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/models"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)
// TestOnStartupError verifies the effect of the `probe_on_startup` option by
// replacing nvidia-smi with a shell command that always exits with status 1.
//
// With probing enabled, starting the plugin must report a non-fatal error and
// a subsequent Gather must report internal.ErrNotConnected. With probing
// disabled, the failing command is never executed during startup, so Start
// must succeed.
func TestOnStartupError(t *testing.T) {
	// Pick a platform-specific command that is guaranteed to exit non-zero.
	var binPath string
	var nvidiaSMIArgs []string
	if runtime.GOOS == "windows" {
		binPath = `C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe`
		nvidiaSMIArgs = []string{"-Command", "exit 1"}
	} else {
		binPath = "/bin/bash"
		nvidiaSMIArgs = []string{"-c", "exit 1"}
	}

	tests := []struct {
		name           string
		ProbeOnStartup bool
	}{
		{
			name:           "probe enabled",
			ProbeOnStartup: true,
		},
		{
			name:           "probe disabled",
			ProbeOnStartup: false,
		},
	}

	for _, tt := range tests {
		// Run every case as a named subtest so a failure identifies the case.
		t.Run(tt.name, func(t *testing.T) {
			plugin := &NvidiaSMI{
				BinPath:        binPath,
				ProbeOnStartup: tt.ProbeOnStartup,
				Timeout:        config.Duration(time.Second),
				Log:            &testutil.Logger{},
				nvidiaSMIArgs:  nvidiaSMIArgs,
			}
			model := models.NewRunningInput(plugin, &models.InputConfig{
				Name: "nvidia_smi",
			})
			require.NoError(t, model.Init())

			var acc testutil.Accumulator
			err := model.Start(&acc)
			if !tt.ProbeOnStartup {
				require.NoError(t, err)
				return
			}

			// The probe must actually fail; without this check the
			// "not fatal" assertion below would also pass for a nil error.
			require.Error(t, err)
			var ferr *internal.FatalError
			require.False(t, errors.As(err, &ferr))
			require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
		})
	}
}
func TestErrorBehaviorDefault(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")

View File

@ -7,3 +7,9 @@
## Optional: timeout for GPU polling
# timeout = "5s"
## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi cannot be
## executed, times out, or returns a non-zero exit code, the plugin fails to
## start with an error. This is particularly useful in conjunction with
## `startup_error_behavior`, e.g. to allow the plugin to be disabled if
## nvidia-smi cannot run successfully.
# probe_on_startup = false