chore(inputs.amd_rocm_smi): Consolidate startup_retry_behavior to model implementation (#15216)

2024-04-24 14:34:08 -04:00 · 2024-04-24 14:34:08 -04:00 · 1e57608a21
parent e7e2d1aeee
commit 1e57608a21
4 changed files with 82 additions and 68 deletions
--- a/plugins/inputs/amd_rocm_smi/README.md
+++ b/plugins/inputs/amd_rocm_smi/README.md
@ -14,6 +14,18 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.

 [CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins

+## Startup error behavior options
+
+In addition to the plugin-specific and global configuration settings the plugin
+supports options for specifying the behavior when experiencing startup errors
+using the `startup_error_behavior` setting. Available values are:
+
+- `error`:  Telegraf with stop and exit in case of startup errors. This is the
+            default behavior.
+- `ignore`: Telegraf will ignore startup errors for this plugin and disables it
+            but continues processing for all other plugins.
+- `retry`:  NOT AVAILABLE
+
 ## Configuration

 ```toml @sample.conf
@ -22,12 +34,6 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
  ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
  # bin_path = "/opt/rocm/bin/rocm-smi"

-  ## Optional: specifies plugin behavior regarding missing rocm-smi binary
-  ## Available choices:
-  ##   - error: telegraf will return an error on startup
-  ##   - ignore: telegraf will ignore this plugin
-  # startup_error_behavior = "error"
-
  ## Optional: timeout for GPU polling
  # timeout = "5s"
 ```
--- a/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go
+++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go
@ -4,7 +4,6 @@ package amd_rocm_smi
 import (
 	_ "embed"
 	"encoding/json"
-	"fmt"
 	"os"
 	"os/exec"
 	"strconv"
@ -23,12 +22,9 @@ var sampleConfig string
 const measurement = "amd_rocm_smi"

 type ROCmSMI struct {
-	BinPath              string          `toml:"bin_path"`
-	Timeout              config.Duration `toml:"timeout"`
-	StartupErrorBehavior string          `toml:"startup_error_behavior"`
-	Log                  telegraf.Logger `toml:"-"`
-
-	ignorePlugin bool
+	BinPath string          `toml:"bin_path"`
+	Timeout config.Duration `toml:"timeout"`
+	Log     telegraf.Logger `toml:"-"`
 }

 func (*ROCmSMI) SampleConfig() string {
@ -37,33 +33,16 @@ func (*ROCmSMI) SampleConfig() string {

 // Gather implements the telegraf interface
 func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
-	if rsmi.ignorePlugin {
-		return nil
-	}
-
 	data := rsmi.pollROCmSMI()
-	err := gatherROCmSMI(data, acc)
-	if err != nil {
-		return err
-	}

-	return nil
+	return gatherROCmSMI(data, acc)
 }

-func (rsmi *ROCmSMI) Init() error {
+func (rsmi *ROCmSMI) Start(telegraf.Accumulator) error {
 	if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
 		binPath, err := exec.LookPath("rocm-smi")
 		if err != nil {
-			switch rsmi.StartupErrorBehavior {
-			case "ignore":
-				rsmi.ignorePlugin = true
-				rsmi.Log.Warnf("rocm-smi not found on the system, ignoring: %s", err)
-				return nil
-			case "", "error":
-				return fmt.Errorf("rocm-smi binary not found in path %s, cannot query GPUs statistics", rsmi.BinPath)
-			default:
-				return fmt.Errorf("unknown startup behavior setting: %s", rsmi.StartupErrorBehavior)
-			}
+			return &internal.StartupError{Err: err}
 		}
 		rsmi.BinPath = binPath
 	}
@ -71,6 +50,8 @@ func (rsmi *ROCmSMI) Init() error {
 	return nil
 }

+func (rsmi *ROCmSMI) Stop() {}
+
 func init() {
 	inputs.Add("amd_rocm_smi", func() telegraf.Input {
 		return &ROCmSMI{
--- a/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go
+++ b/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go
@ -1,27 +1,19 @@
 package amd_rocm_smi

 import (
+	"errors"
 	"os"
 	"path/filepath"
 	"testing"
 	"time"

 	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/internal"
+	"github.com/influxdata/telegraf/models"
 	"github.com/influxdata/telegraf/testutil"
 	"github.com/stretchr/testify/require"
 )

-func TestErrorBehaviorError(t *testing.T) {
-	// make sure we can't find rocm-smi in $PATH somewhere
-	os.Unsetenv("PATH")
-	plugin := &ROCmSMI{
-		BinPath:              "/random/non-existent/path",
-		Log:                  &testutil.Logger{},
-		StartupErrorBehavior: "error",
-	}
-	require.Error(t, plugin.Init())
-}
-
 func TestErrorBehaviorDefault(t *testing.T) {
 	// make sure we can't find rocm-smi in $PATH somewhere
 	os.Unsetenv("PATH")
@ -29,31 +21,72 @@ func TestErrorBehaviorDefault(t *testing.T) {
 		BinPath: "/random/non-existent/path",
 		Log:     &testutil.Logger{},
 	}
-	require.Error(t, plugin.Init())
+	model := models.NewRunningInput(plugin, &models.InputConfig{
+		Name: "amd_rocm_smi",
+	})
+	require.NoError(t, model.Init())
+
+	var acc testutil.Accumulator
+	var ferr *internal.FatalError
+	require.False(t, errors.As(model.Start(&acc), &ferr))
+	require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
+}
+
+func TestErrorBehaviorError(t *testing.T) {
+	// make sure we can't find rocm-smi in $PATH somewhere
+	os.Unsetenv("PATH")
+	plugin := &ROCmSMI{
+		BinPath: "/random/non-existent/path",
+		Log:     &testutil.Logger{},
+	}
+	model := models.NewRunningInput(plugin, &models.InputConfig{
+		Name:                 "amd_rocm_smi",
+		StartupErrorBehavior: "error",
+	})
+	require.NoError(t, model.Init())
+
+	var acc testutil.Accumulator
+	var ferr *internal.FatalError
+	require.False(t, errors.As(model.Start(&acc), &ferr))
+	require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
+}
+
+func TestErrorBehaviorRetry(t *testing.T) {
+	// make sure we can't find nvidia-smi in $PATH somewhere
+	os.Unsetenv("PATH")
+	plugin := &ROCmSMI{
+		BinPath: "/random/non-existent/path",
+		Log:     &testutil.Logger{},
+	}
+	model := models.NewRunningInput(plugin, &models.InputConfig{
+		Name:                 "amd_rocm_smi",
+		StartupErrorBehavior: "retry",
+	})
+	require.NoError(t, model.Init())
+
+	var acc testutil.Accumulator
+	var ferr *internal.FatalError
+	require.False(t, errors.As(model.Start(&acc), &ferr))
+	require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
 }

 func TestErrorBehaviorIgnore(t *testing.T) {
-	// make sure we can't find rocm-smi in $PATH somewhere
+	// make sure we can't find nvidia-smi in $PATH somewhere
 	os.Unsetenv("PATH")
 	plugin := &ROCmSMI{
-		BinPath:              "/random/non-existent/path",
-		Log:                  &testutil.Logger{},
+		BinPath: "/random/non-existent/path",
+		Log:     &testutil.Logger{},
+	}
+	model := models.NewRunningInput(plugin, &models.InputConfig{
+		Name:                 "amd_rocm_smi",
 		StartupErrorBehavior: "ignore",
-	}
-	require.NoError(t, plugin.Init())
-	acc := testutil.Accumulator{}
-	require.NoError(t, plugin.Gather(&acc))
-}
+	})
+	require.NoError(t, model.Init())

-func TestErrorBehaviorInvalidOption(t *testing.T) {
-	// make sure we can't find rocm-smi in $PATH somewhere
-	os.Unsetenv("PATH")
-	plugin := &ROCmSMI{
-		BinPath:              "/random/non-existent/path",
-		Log:                  &testutil.Logger{},
-		StartupErrorBehavior: "giveup",
-	}
-	require.Error(t, plugin.Init())
+	var acc testutil.Accumulator
+	var ferr *internal.FatalError
+	require.ErrorAs(t, model.Start(&acc), &ferr)
+	require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
 }

 func TestGatherValidJSON(t *testing.T) {
--- a/plugins/inputs/amd_rocm_smi/sample.conf
+++ b/plugins/inputs/amd_rocm_smi/sample.conf
@ -3,11 +3,5 @@
  ## Optional: path to rocm-smi binary, defaults to $PATH via exec.LookPath
  # bin_path = "/opt/rocm/bin/rocm-smi"

-  ## Optional: specifies plugin behavior regarding missing rocm-smi binary
-  ## Available choices:
-  ##   - error: telegraf will return an error on startup
-  ##   - ignore: telegraf will ignore this plugin
-  # startup_error_behavior = "error"
-
  ## Optional: timeout for GPU polling
  # timeout = "5s"