feat(inputs.amd_rocm_smi): Add startup_error_behavior config option (#14872)

This commit is contained in:
Serguei Mokhov 2024-02-22 09:09:04 -05:00 committed by GitHub
parent 32b8ad5662
commit c4069a4bff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 86 additions and 4 deletions

View File

@ -22,6 +22,12 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
# bin_path = "/opt/rocm/bin/rocm-smi"
## Optional: specifies plugin behavior regarding missing rocm-smi binary
## Available choices:
## - error: telegraf will return an error on startup
## - ignore: telegraf will ignore this plugin
# startup_error_behavior = "error"
## Optional: timeout for GPU polling
# timeout = "5s"
```

View File

@ -23,8 +23,12 @@ var sampleConfig string
const measurement = "amd_rocm_smi"
// ROCmSMI holds the configuration and runtime state of the amd_rocm_smi input.
type ROCmSMI struct {
BinPath string
Timeout config.Duration
// BinPath is the location of the rocm-smi binary (TOML key "bin_path").
BinPath string `toml:"bin_path"`
// Timeout is the timeout for GPU polling (TOML key "timeout").
Timeout config.Duration `toml:"timeout"`
// StartupErrorBehavior selects what Init does when rocm-smi cannot be
// found: "error" (or unset) fails, "ignore" disables the plugin.
StartupErrorBehavior string `toml:"startup_error_behavior"`
// Log is the logger used for plugin warnings; it is excluded from TOML.
Log telegraf.Logger `toml:"-"`
// ignorePlugin is set by Init when the binary is missing and the startup
// behavior is "ignore"; Gather then returns nil without polling.
ignorePlugin bool
}
func (*ROCmSMI) SampleConfig() string {
@ -33,8 +37,8 @@ func (*ROCmSMI) SampleConfig() string {
// Gather implements the telegraf interface
func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath)
if rsmi.ignorePlugin {
return nil
}
data := rsmi.pollROCmSMI()
@ -46,6 +50,27 @@ func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
return nil
}
// Init resolves the location of the rocm-smi binary before the first Gather.
//
// If the configured BinPath does not exist, the binary is looked up in $PATH
// via exec.LookPath and BinPath is replaced with the result. If that lookup
// fails as well, StartupErrorBehavior decides the outcome:
//   - "ignore": a warning is logged, the plugin is flagged as ignored (Gather
//     then returns nil immediately) and Init succeeds;
//   - "error" or unset: Init returns an error wrapping the lookup failure;
//   - anything else: Init returns an error naming the unknown setting.
func (rsmi *ROCmSMI) Init() error {
	// Only a definite "does not exist" triggers the $PATH fallback; any other
	// os.Stat outcome leaves BinPath untouched.
	if _, err := os.Stat(rsmi.BinPath); !os.IsNotExist(err) {
		return nil
	}

	binPath, err := exec.LookPath("rocm-smi")
	if err == nil {
		rsmi.BinPath = binPath
		return nil
	}

	switch rsmi.StartupErrorBehavior {
	case "ignore":
		rsmi.ignorePlugin = true
		rsmi.Log.Warnf("rocm-smi not found on the system, ignoring: %s", err)
		return nil
	case "", "error":
		// Wrap the lookup error so the cause of the failed $PATH fallback is
		// reported, not just the missing configured path.
		return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics: %w", rsmi.BinPath, err)
	default:
		return fmt.Errorf("unknown startup behavior setting: %s", rsmi.StartupErrorBehavior)
	}
}
func init() {
inputs.Add("amd_rocm_smi", func() telegraf.Input {
return &ROCmSMI{

View File

@ -11,6 +11,51 @@ import (
"github.com/stretchr/testify/require"
)
func TestErrorBehaviorError(t *testing.T) {
// make sure we can't find rocm-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &ROCmSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
StartupErrorBehavior: "error",
}
require.Error(t, plugin.Init())
}
func TestErrorBehaviorDefault(t *testing.T) {
// make sure we can't find rocm-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &ROCmSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
}
require.Error(t, plugin.Init())
}
func TestErrorBehaviorIgnore(t *testing.T) {
// make sure we can't find rocm-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &ROCmSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
StartupErrorBehavior: "ignore",
}
require.NoError(t, plugin.Init())
acc := testutil.Accumulator{}
require.NoError(t, plugin.Gather(&acc))
}
func TestErrorBehaviorInvalidOption(t *testing.T) {
// make sure we can't find rocm-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &ROCmSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
StartupErrorBehavior: "giveup",
}
require.Error(t, plugin.Init())
}
func TestGatherValidJSON(t *testing.T) {
tests := []struct {
name string

View File

@ -3,5 +3,11 @@
## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
# bin_path = "/opt/rocm/bin/rocm-smi"
## Optional: specifies plugin behavior regarding missing rocm-smi binary
## Available choices:
## - error: telegraf will return an error on startup
## - ignore: telegraf will ignore this plugin
# startup_error_behavior = "error"
## Optional: timeout for GPU polling
# timeout = "5s"