feat(inputs.nvidia-smi): REVERT -- Add `probe_on_startup` option (#16165)
This commit is contained in:
parent
942d2b3f6f
commit
1cfc65abef
|
|
@ -37,12 +37,6 @@ using the `startup_error_behavior` setting. Available values are:
|
||||||
|
|
||||||
## Optional: timeout for GPU polling
|
## Optional: timeout for GPU polling
|
||||||
# timeout = "5s"
|
# timeout = "5s"
|
||||||
|
|
||||||
## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero
|
|
||||||
## exit code, the plugin will return an error. This is particularly useful
|
|
||||||
## if used in conjunction with `startup_error_behavior` to allow the plugin to be
|
|
||||||
## disabled if nvidia-smi cannot run successfully.
|
|
||||||
# probe_on_startup = false
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Linux
|
### Linux
|
||||||
|
|
|
||||||
|
|
@ -27,14 +27,12 @@ var sampleConfig string
|
||||||
|
|
||||||
// NvidiaSMI holds the methods for this plugin
|
// NvidiaSMI holds the methods for this plugin
|
||||||
type NvidiaSMI struct {
|
type NvidiaSMI struct {
|
||||||
BinPath string `toml:"bin_path"`
|
BinPath string `toml:"bin_path"`
|
||||||
Timeout config.Duration `toml:"timeout"`
|
Timeout config.Duration `toml:"timeout"`
|
||||||
ProbeOnStartup bool `toml:"probe_on_startup"`
|
Log telegraf.Logger `toml:"-"`
|
||||||
Log telegraf.Logger `toml:"-"`
|
|
||||||
|
|
||||||
ignorePlugin bool
|
ignorePlugin bool
|
||||||
once sync.Once
|
once sync.Once
|
||||||
nvidiaSMIArgs []string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (*NvidiaSMI) SampleConfig() string {
|
func (*NvidiaSMI) SampleConfig() string {
|
||||||
|
|
@ -49,11 +47,6 @@ func (smi *NvidiaSMI) Start(telegraf.Accumulator) error {
|
||||||
}
|
}
|
||||||
smi.BinPath = binPath
|
smi.BinPath = binPath
|
||||||
}
|
}
|
||||||
if smi.ProbeOnStartup {
|
|
||||||
if _, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)); err != nil {
|
|
||||||
return &internal.StartupError{Err: err}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
@ -67,7 +60,7 @@ func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Construct and execute metrics query
|
// Construct and execute metrics query
|
||||||
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout))
|
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("calling %q failed: %w", smi.BinPath, err)
|
return fmt.Errorf("calling %q failed: %w", smi.BinPath, err)
|
||||||
}
|
}
|
||||||
|
|
@ -126,9 +119,8 @@ func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error {
|
||||||
func init() {
|
func init() {
|
||||||
inputs.Add("nvidia_smi", func() telegraf.Input {
|
inputs.Add("nvidia_smi", func() telegraf.Input {
|
||||||
return &NvidiaSMI{
|
return &NvidiaSMI{
|
||||||
BinPath: "/usr/bin/nvidia-smi",
|
BinPath: "/usr/bin/nvidia-smi",
|
||||||
Timeout: config.Duration(5 * time.Second),
|
Timeout: config.Duration(5 * time.Second),
|
||||||
nvidiaSMIArgs: []string{"-q", "-x"},
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,65 +4,16 @@ import (
|
||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/influxdata/telegraf"
|
"github.com/influxdata/telegraf"
|
||||||
"github.com/influxdata/telegraf/config"
|
|
||||||
"github.com/influxdata/telegraf/internal"
|
"github.com/influxdata/telegraf/internal"
|
||||||
"github.com/influxdata/telegraf/models"
|
"github.com/influxdata/telegraf/models"
|
||||||
"github.com/influxdata/telegraf/testutil"
|
"github.com/influxdata/telegraf/testutil"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestOnStartupError(t *testing.T) {
|
|
||||||
var binPath string
|
|
||||||
var nvidiaSMIArgs []string
|
|
||||||
if runtime.GOOS == "windows" {
|
|
||||||
binPath = `C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe`
|
|
||||||
nvidiaSMIArgs = []string{"-Command", "exit 1"}
|
|
||||||
} else {
|
|
||||||
binPath = "/bin/bash"
|
|
||||||
nvidiaSMIArgs = []string{"-c", "exit 1"}
|
|
||||||
}
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
ProbeOnStartup bool
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
ProbeOnStartup: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
ProbeOnStartup: false,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
for _, tt := range tests {
|
|
||||||
plugin := &NvidiaSMI{
|
|
||||||
BinPath: binPath,
|
|
||||||
ProbeOnStartup: tt.ProbeOnStartup,
|
|
||||||
Timeout: config.Duration(time.Second),
|
|
||||||
Log: &testutil.Logger{},
|
|
||||||
nvidiaSMIArgs: nvidiaSMIArgs,
|
|
||||||
}
|
|
||||||
model := models.NewRunningInput(plugin, &models.InputConfig{
|
|
||||||
Name: "nvidia_smi",
|
|
||||||
})
|
|
||||||
require.NoError(t, model.Init())
|
|
||||||
|
|
||||||
var acc testutil.Accumulator
|
|
||||||
var ferr *internal.FatalError
|
|
||||||
err := model.Start(&acc)
|
|
||||||
|
|
||||||
if tt.ProbeOnStartup {
|
|
||||||
require.False(t, errors.As(err, &ferr))
|
|
||||||
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
|
||||||
} else {
|
|
||||||
require.NoError(t, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestErrorBehaviorDefault(t *testing.T) {
|
func TestErrorBehaviorDefault(t *testing.T) {
|
||||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||||
os.Unsetenv("PATH")
|
os.Unsetenv("PATH")
|
||||||
|
|
|
||||||
|
|
@ -7,9 +7,3 @@
|
||||||
|
|
||||||
## Optional: timeout for GPU polling
|
## Optional: timeout for GPU polling
|
||||||
# timeout = "5s"
|
# timeout = "5s"
|
||||||
|
|
||||||
## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero
|
|
||||||
## exit code, the plugin will return an error. This is particularly useful
|
|
||||||
## if used in conjunction with `startup_error_behavior` to allow the plugin to be
|
|
||||||
## disabled if nvidia-smi cannot run successfully.
|
|
||||||
# probe_on_startup = false
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue