chore(inputs.amd_rocm_smi): Consolidate startup_retry_behavior to model implementation (#15216)

2024-04-24 14:34:08 -04:00 · 2024-04-24 14:34:08 -04:00 · 1e57608a21
parent e7e2d1aeee
commit 1e57608a21
4 changed files with 82 additions and 68 deletions
--- a/plugins/inputs/amd_rocm_smi/README.md
+++ b/plugins/inputs/amd_rocm_smi/README.md
@ -14,6 +14,18 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
 [CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
 ## Startup error behavior options
 In addition to the plugin-specific and global configuration settings the plugin
 supports options for specifying the behavior when experiencing startup errors
 using the `startup_error_behavior` setting. Available values are:
 - `error`:  Telegraf with stop and exit in case of startup errors. This is the
            default behavior.
 - `ignore`: Telegraf will ignore startup errors for this plugin and disables it
            but continues processing for all other plugins.
 - `retry`:  NOT AVAILABLE
 ## Configuration
 ```toml @sample.conf
@ -22,12 +34,6 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
  ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
  # bin_path = "/opt/rocm/bin/rocm-smi"
  ## Optional: specifies plugin behavior regarding missing rocm-smi binary
  ## Available choices:
  ##   - error: telegraf will return an error on startup
  ##   - ignore: telegraf will ignore this plugin
  # startup_error_behavior = "error"
  ## Optional: timeout for GPU polling
  # timeout = "5s"
 ```
--- a/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go
+++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go
@ -4,7 +4,6 @@ package amd_rocm_smi
 import (
 	_ "embed"
 	"encoding/json"
 	"fmt"
 	"os"
 	"os/exec"
 	"strconv"
@ -23,12 +22,9 @@ var sampleConfig string
 const measurement = "amd_rocm_smi"
 type ROCmSMI struct {
-	BinPath              string          `toml:"bin_path"`
+	BinPath string          `toml:"bin_path"`
-	Timeout              config.Duration `toml:"timeout"`
+	Timeout config.Duration `toml:"timeout"`
-	StartupErrorBehavior string          `toml:"startup_error_behavior"`
+	Log     telegraf.Logger `toml:"-"`
 	Log                  telegraf.Logger `toml:"-"`
 	ignorePlugin bool
 }
 func (*ROCmSMI) SampleConfig() string {
@ -37,33 +33,16 @@ func (*ROCmSMI) SampleConfig() string {
 // Gather implements the telegraf interface
 func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
 	if rsmi.ignorePlugin {
 		return nil
 	}
 	data := rsmi.pollROCmSMI()
 	err := gatherROCmSMI(data, acc)
 	if err != nil {
 		return err
 	}
-	return nil
+	return gatherROCmSMI(data, acc)
 }
-func (rsmi *ROCmSMI) Init() error {
+func (rsmi *ROCmSMI) Start(telegraf.Accumulator) error {
 	if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
 		binPath, err := exec.LookPath("rocm-smi")
 		if err != nil {
-			switch rsmi.StartupErrorBehavior {
+			return &internal.StartupError{Err: err}
 			case "ignore":
 				rsmi.ignorePlugin = true
 				rsmi.Log.Warnf("rocm-smi not found on the system, ignoring: %s", err)
 				return nil
 			case "", "error":
 				return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath)
 			default:
 				return fmt.Errorf("unknown startup behavior setting: %s", rsmi.StartupErrorBehavior)
 			}
 		}
 		rsmi.BinPath = binPath
 	}
@ -71,6 +50,8 @@ func (rsmi *ROCmSMI) Init() error {
 	return nil
 }
 func (rsmi *ROCmSMI) Stop() {}
 func init() {
 	inputs.Add("amd_rocm_smi", func() telegraf.Input {
 		return &ROCmSMI{
--- a/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go
+++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go
@ -1,27 +1,19 @@
 package amd_rocm_smi
 import (
 	"errors"
 	"os"
 	"path/filepath"
 	"testing"
 	"time"
 	"github.com/influxdata/telegraf"
 	"github.com/influxdata/telegraf/internal"
 	"github.com/influxdata/telegraf/models"
 	"github.com/influxdata/telegraf/testutil"
 	"github.com/stretchr/testify/require"
 )
 func TestErrorBehaviorError(t *testing.T) {
 	// make sure we can't find rocm-smi in $PATH somewhere
 	os.Unsetenv("PATH")
 	plugin := &ROCmSMI{
 		BinPath:              "/random/non-existent/path",
 		Log:                  &testutil.Logger{},
 		StartupErrorBehavior: "error",
 	}
 	require.Error(t, plugin.Init())
 }
 func TestErrorBehaviorDefault(t *testing.T) {
 	// make sure we can't find rocm-smi in $PATH somewhere
 	os.Unsetenv("PATH")
@ -29,31 +21,72 @@ func TestErrorBehaviorDefault(t *testing.T) {
 		BinPath: "/random/non-existent/path",
 		Log:     &testutil.Logger{},
 	}
-	require.Error(t, plugin.Init())
+	model := models.NewRunningInput(plugin, &models.InputConfig{
 		Name: "amd_rocm_smi",
 	})
 	require.NoError(t, model.Init())
 	var acc testutil.Accumulator
 	var ferr *internal.FatalError
 	require.False(t, errors.As(model.Start(&acc), &ferr))
 	require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
 }
 func TestErrorBehaviorError(t *testing.T) {
 	// make sure we can't find rocm-smi in $PATH somewhere
 	os.Unsetenv("PATH")
 	plugin := &ROCmSMI{
 		BinPath: "/random/non-existent/path",
 		Log:     &testutil.Logger{},
 	}
 	model := models.NewRunningInput(plugin, &models.InputConfig{
 		Name:                 "amd_rocm_smi",
 		StartupErrorBehavior: "error",
 	})
 	require.NoError(t, model.Init())
 	var acc testutil.Accumulator
 	var ferr *internal.FatalError
 	require.False(t, errors.As(model.Start(&acc), &ferr))
 	require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
 }
 func TestErrorBehaviorRetry(t *testing.T) {
 	// make sure we can't find nvidia-smi in $PATH somewhere
 	os.Unsetenv("PATH")
 	plugin := &ROCmSMI{
 		BinPath: "/random/non-existent/path",
 		Log:     &testutil.Logger{},
 	}
 	model := models.NewRunningInput(plugin, &models.InputConfig{
 		Name:                 "amd_rocm_smi",
 		StartupErrorBehavior: "retry",
 	})
 	require.NoError(t, model.Init())
 	var acc testutil.Accumulator
 	var ferr *internal.FatalError
 	require.False(t, errors.As(model.Start(&acc), &ferr))
 	require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
 }
 func TestErrorBehaviorIgnore(t *testing.T) {
-	// make sure we can't find rocm-smi in $PATH somewhere
+	// make sure we can't find nvidia-smi in $PATH somewhere
 	os.Unsetenv("PATH")
 	plugin := &ROCmSMI{
-		BinPath:              "/random/non-existent/path",
+		BinPath: "/random/non-existent/path",
-		Log:                  &testutil.Logger{},
+		Log:     &testutil.Logger{},
 	}
 	model := models.NewRunningInput(plugin, &models.InputConfig{
 		Name:                 "amd_rocm_smi",
 		StartupErrorBehavior: "ignore",
-	}
+	})
-	require.NoError(t, plugin.Init())
+	require.NoError(t, model.Init())
 	acc := testutil.Accumulator{}
 	require.NoError(t, plugin.Gather(&acc))
 }
-func TestErrorBehaviorInvalidOption(t *testing.T) {
+	var acc testutil.Accumulator
-	// make sure we can't find rocm-smi in $PATH somewhere
+	var ferr *internal.FatalError
-	os.Unsetenv("PATH")
+	require.ErrorAs(t, model.Start(&acc), &ferr)
-	plugin := &ROCmSMI{
+	require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
 		BinPath:              "/random/non-existent/path",
 		Log:                  &testutil.Logger{},
 		StartupErrorBehavior: "giveup",
 	}
 	require.Error(t, plugin.Init())
 }
 func TestGatherValidJSON(t *testing.T) {
--- a/plugins/inputs/amd_rocm_smi/sample.conf
+++ b/plugins/inputs/amd_rocm_smi/sample.conf
@ -3,11 +3,5 @@
  ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
  # bin_path = "/opt/rocm/bin/rocm-smi"
  ## Optional: specifies plugin behavior regarding missing rocm-smi binary
  ## Available choices:
  ##   - error: telegraf will return an error on startup
  ##   - ignore: telegraf will ignore this plugin
  # startup_error_behavior = "error"
  ## Optional: timeout for GPU polling
  # timeout = "5s"