chore(inputs.amd_rocm_smi): Consolidate startup_retry_behavior to model implementation (#15216)
This commit is contained in:
parent
e7e2d1aeee
commit
1e57608a21
|
|
@ -14,6 +14,18 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
|||
|
||||
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
|
||||
|
||||
## Startup error behavior options
|
||||
|
||||
In addition to the plugin-specific and global configuration settings the plugin
|
||||
supports options for specifying the behavior when experiencing startup errors
|
||||
using the `startup_error_behavior` setting. Available values are:
|
||||
|
||||
- `error`: Telegraf with stop and exit in case of startup errors. This is the
|
||||
default behavior.
|
||||
- `ignore`: Telegraf will ignore startup errors for this plugin and disables it
|
||||
but continues processing for all other plugins.
|
||||
- `retry`: NOT AVAILABLE
|
||||
|
||||
## Configuration
|
||||
|
||||
```toml @sample.conf
|
||||
|
|
@ -22,12 +34,6 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
|||
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
||||
# bin_path = "/opt/rocm/bin/rocm-smi"
|
||||
|
||||
## Optional: specifies plugin behavior regarding missing rocm-smi binary
|
||||
## Available choices:
|
||||
## - error: telegraf will return an error on startup
|
||||
## - ignore: telegraf will ignore this plugin
|
||||
# startup_error_behavior = "error"
|
||||
|
||||
## Optional: timeout for GPU polling
|
||||
# timeout = "5s"
|
||||
```
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ package amd_rocm_smi
|
|||
import (
|
||||
_ "embed"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
|
|
@ -23,12 +22,9 @@ var sampleConfig string
|
|||
const measurement = "amd_rocm_smi"
|
||||
|
||||
type ROCmSMI struct {
|
||||
BinPath string `toml:"bin_path"`
|
||||
Timeout config.Duration `toml:"timeout"`
|
||||
StartupErrorBehavior string `toml:"startup_error_behavior"`
|
||||
Log telegraf.Logger `toml:"-"`
|
||||
|
||||
ignorePlugin bool
|
||||
BinPath string `toml:"bin_path"`
|
||||
Timeout config.Duration `toml:"timeout"`
|
||||
Log telegraf.Logger `toml:"-"`
|
||||
}
|
||||
|
||||
func (*ROCmSMI) SampleConfig() string {
|
||||
|
|
@ -37,33 +33,16 @@ func (*ROCmSMI) SampleConfig() string {
|
|||
|
||||
// Gather implements the telegraf interface
|
||||
func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
|
||||
if rsmi.ignorePlugin {
|
||||
return nil
|
||||
}
|
||||
|
||||
data := rsmi.pollROCmSMI()
|
||||
err := gatherROCmSMI(data, acc)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
return gatherROCmSMI(data, acc)
|
||||
}
|
||||
|
||||
func (rsmi *ROCmSMI) Init() error {
|
||||
func (rsmi *ROCmSMI) Start(telegraf.Accumulator) error {
|
||||
if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
|
||||
binPath, err := exec.LookPath("rocm-smi")
|
||||
if err != nil {
|
||||
switch rsmi.StartupErrorBehavior {
|
||||
case "ignore":
|
||||
rsmi.ignorePlugin = true
|
||||
rsmi.Log.Warnf("rocm-smi not found on the system, ignoring: %s", err)
|
||||
return nil
|
||||
case "", "error":
|
||||
return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath)
|
||||
default:
|
||||
return fmt.Errorf("unknown startup behavior setting: %s", rsmi.StartupErrorBehavior)
|
||||
}
|
||||
return &internal.StartupError{Err: err}
|
||||
}
|
||||
rsmi.BinPath = binPath
|
||||
}
|
||||
|
|
@ -71,6 +50,8 @@ func (rsmi *ROCmSMI) Init() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (rsmi *ROCmSMI) Stop() {}
|
||||
|
||||
func init() {
|
||||
inputs.Add("amd_rocm_smi", func() telegraf.Input {
|
||||
return &ROCmSMI{
|
||||
|
|
|
|||
|
|
@ -1,27 +1,19 @@
|
|||
package amd_rocm_smi
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/internal"
|
||||
"github.com/influxdata/telegraf/models"
|
||||
"github.com/influxdata/telegraf/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestErrorBehaviorError(t *testing.T) {
|
||||
// make sure we can't find rocm-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &ROCmSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
StartupErrorBehavior: "error",
|
||||
}
|
||||
require.Error(t, plugin.Init())
|
||||
}
|
||||
|
||||
func TestErrorBehaviorDefault(t *testing.T) {
|
||||
// make sure we can't find rocm-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
|
|
@ -29,31 +21,72 @@ func TestErrorBehaviorDefault(t *testing.T) {
|
|||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
}
|
||||
require.Error(t, plugin.Init())
|
||||
model := models.NewRunningInput(plugin, &models.InputConfig{
|
||||
Name: "amd_rocm_smi",
|
||||
})
|
||||
require.NoError(t, model.Init())
|
||||
|
||||
var acc testutil.Accumulator
|
||||
var ferr *internal.FatalError
|
||||
require.False(t, errors.As(model.Start(&acc), &ferr))
|
||||
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
||||
}
|
||||
|
||||
func TestErrorBehaviorError(t *testing.T) {
|
||||
// make sure we can't find rocm-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &ROCmSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
}
|
||||
model := models.NewRunningInput(plugin, &models.InputConfig{
|
||||
Name: "amd_rocm_smi",
|
||||
StartupErrorBehavior: "error",
|
||||
})
|
||||
require.NoError(t, model.Init())
|
||||
|
||||
var acc testutil.Accumulator
|
||||
var ferr *internal.FatalError
|
||||
require.False(t, errors.As(model.Start(&acc), &ferr))
|
||||
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
||||
}
|
||||
|
||||
func TestErrorBehaviorRetry(t *testing.T) {
|
||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &ROCmSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
}
|
||||
model := models.NewRunningInput(plugin, &models.InputConfig{
|
||||
Name: "amd_rocm_smi",
|
||||
StartupErrorBehavior: "retry",
|
||||
})
|
||||
require.NoError(t, model.Init())
|
||||
|
||||
var acc testutil.Accumulator
|
||||
var ferr *internal.FatalError
|
||||
require.False(t, errors.As(model.Start(&acc), &ferr))
|
||||
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
||||
}
|
||||
|
||||
func TestErrorBehaviorIgnore(t *testing.T) {
|
||||
// make sure we can't find rocm-smi in $PATH somewhere
|
||||
// make sure we can't find nvidia-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &ROCmSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
}
|
||||
model := models.NewRunningInput(plugin, &models.InputConfig{
|
||||
Name: "amd_rocm_smi",
|
||||
StartupErrorBehavior: "ignore",
|
||||
}
|
||||
require.NoError(t, plugin.Init())
|
||||
acc := testutil.Accumulator{}
|
||||
require.NoError(t, plugin.Gather(&acc))
|
||||
}
|
||||
})
|
||||
require.NoError(t, model.Init())
|
||||
|
||||
func TestErrorBehaviorInvalidOption(t *testing.T) {
|
||||
// make sure we can't find rocm-smi in $PATH somewhere
|
||||
os.Unsetenv("PATH")
|
||||
plugin := &ROCmSMI{
|
||||
BinPath: "/random/non-existent/path",
|
||||
Log: &testutil.Logger{},
|
||||
StartupErrorBehavior: "giveup",
|
||||
}
|
||||
require.Error(t, plugin.Init())
|
||||
var acc testutil.Accumulator
|
||||
var ferr *internal.FatalError
|
||||
require.ErrorAs(t, model.Start(&acc), &ferr)
|
||||
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
|
||||
}
|
||||
|
||||
func TestGatherValidJSON(t *testing.T) {
|
||||
|
|
|
|||
|
|
@ -3,11 +3,5 @@
|
|||
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
|
||||
# bin_path = "/opt/rocm/bin/rocm-smi"
|
||||
|
||||
## Optional: specifies plugin behavior regarding missing rocm-smi binary
|
||||
## Available choices:
|
||||
## - error: telegraf will return an error on startup
|
||||
## - ignore: telegraf will ignore this plugin
|
||||
# startup_error_behavior = "error"
|
||||
|
||||
## Optional: timeout for GPU polling
|
||||
# timeout = "5s"
|
||||
|
|
|
|||
Loading…
Reference in New Issue