fix(parsers/nagios): metrics will always return a supported status code (#11062)

Co-authored-by: Morten Urban <morten.urban@sectornord.de>
This commit is contained in:
Sakerdotes 2022-05-17 22:14:26 +02:00 committed by GitHub
parent d3abbc0897
commit 42f954fc88
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 70 additions and 42 deletions

View File

@ -127,9 +127,9 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync
defer wg.Done() defer wg.Done()
_, isNagios := e.parser.(*nagios.NagiosParser) _, isNagios := e.parser.(*nagios.NagiosParser)
out, errbuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout)) out, errBuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout))
if !isNagios && runErr != nil { if !isNagios && runErr != nil {
err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errbuf)) err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errBuf))
acc.AddError(err) acc.AddError(err)
return return
} }
@ -141,10 +141,7 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync
} }
if isNagios { if isNagios {
metrics, err = nagios.TryAddState(runErr, metrics) metrics = nagios.AddState(runErr, errBuf, metrics)
if err != nil {
e.Log.Errorf("Failed to add nagios state: %s", err)
}
} }
for _, m := range metrics { for _, m := range metrics {

View File

@ -4,7 +4,6 @@ import (
"bufio" "bufio"
"bytes" "bytes"
"errors" "errors"
"fmt"
"os/exec" "os/exec"
"regexp" "regexp"
"strconv" "strconv"
@ -16,6 +15,10 @@ import (
"github.com/influxdata/telegraf/metric" "github.com/influxdata/telegraf/metric"
) )
// unknownExitCode is the nagios unknown status code
// the exit code should be used if an error occurs or something unexpected happens
const unknownExitCode = 3
// getExitCode get the exit code from an error value which is the result // getExitCode get the exit code from an error value which is the result
// of running a command through exec package api. // of running a command through exec package api.
func getExitCode(err error) (int, error) { func getExitCode(err error) (int, error) {
@ -25,10 +28,7 @@ func getExitCode(err error) (int, error) {
ee, ok := err.(*exec.ExitError) ee, ok := err.(*exec.ExitError)
if !ok { if !ok {
// If it is not an *exec.ExitError, then it must be return unknownExitCode, err
// an io error, but docs do not say anything about the
// exit code in this case.
return 0, err
} }
ws, ok := ee.Sys().(syscall.WaitStatus) ws, ok := ee.Sys().(syscall.WaitStatus)
@ -39,19 +39,35 @@ func getExitCode(err error) (int, error) {
return ws.ExitStatus(), nil return ws.ExitStatus(), nil
} }
// TryAddState attempts to add a state derived from the runErr. // AddState adds a state derived from the runErr. Unknown state will be set as fallback.
// If any error occurs, it is guaranteed to be returned along with // If any error occurs, it is guaranteed to be added to the service output.
// the initial metric slice. // An updated slice of metrics will be returned.
func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, error) { func AddState(runErr error, errMessage []byte, metrics []telegraf.Metric) []telegraf.Metric {
state, err := getExitCode(runErr) state, exitErr := getExitCode(runErr)
if err != nil { // This will ensure that in every error case the valid nagios state 'unknown' will be returned.
return metrics, fmt.Errorf("exec: get exit code: %s", err) // No error needs to be thrown because the output will contain the error information.
// Description found at 'Plugin Return Codes' https://nagios-plugins.org/doc/guidelines.html
if exitErr != nil || state < 0 || state > unknownExitCode {
state = unknownExitCode
} }
for _, m := range metrics { for _, m := range metrics {
if m.Name() == "nagios_state" { if m.Name() == "nagios_state" {
m.AddField("state", state) m.AddField("state", state)
return metrics, nil
if state == unknownExitCode {
errorMessage := string(errMessage)
if exitErr != nil && exitErr.Error() != "" {
errorMessage = exitErr.Error()
}
value, ok := m.GetField("service_output")
if !ok || value == "" {
// By adding the error message as output, the metric contains all needed information to understand
// the problem and fix it
m.AddField("service_output", errorMessage)
}
}
return metrics
} }
} }
@ -66,8 +82,7 @@ func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, er
} }
m := metric.New("nagios_state", nil, f, ts) m := metric.New("nagios_state", nil, f, ts)
metrics = append(metrics, m) return append(metrics, m)
return metrics, nil
} }
type NagiosParser struct { type NagiosParser struct {

View File

@ -32,7 +32,7 @@ func TestGetExitCode(t *testing.T) {
errF: func() error { errF: func() error {
return errors.New("I am not *exec.ExitError") return errors.New("I am not *exec.ExitError")
}, },
expCode: 0, expCode: 3,
expErr: errors.New("I am not *exec.ExitError"), expErr: errors.New("I am not *exec.ExitError"),
}, },
} }
@ -89,10 +89,11 @@ func assertEqual(t *testing.T, exp, actual []telegraf.Metric) {
func TestTryAddState(t *testing.T) { func TestTryAddState(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
runErrF func() error runErrF func() error
metrics []telegraf.Metric runErrMessage []byte
assertF func(*testing.T, []telegraf.Metric, error) metrics []telegraf.Metric
assertF func(*testing.T, []telegraf.Metric)
}{ }{
{ {
name: "should append state=0 field to existing metric", name: "should append state=0 field to existing metric",
@ -107,7 +108,7 @@ func TestTryAddState(t *testing.T) {
n("nagios_state"). n("nagios_state").
f("service_output", "OK: system working").b(), f("service_output", "OK: system working").b(),
}, },
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { assertF: func(t *testing.T, metrics []telegraf.Metric) {
exp := []telegraf.Metric{ exp := []telegraf.Metric{
mb(). mb().
n("nagios"). n("nagios").
@ -118,7 +119,6 @@ func TestTryAddState(t *testing.T) {
f("state", 0).b(), f("state", 0).b(),
} }
assertEqual(t, exp, metrics) assertEqual(t, exp, metrics)
require.NoError(t, err)
}, },
}, },
{ {
@ -131,7 +131,7 @@ func TestTryAddState(t *testing.T) {
n("nagios"). n("nagios").
f("perfdata", 0).b(), f("perfdata", 0).b(),
}, },
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { assertF: func(t *testing.T, metrics []telegraf.Metric) {
exp := []telegraf.Metric{ exp := []telegraf.Metric{
mb(). mb().
n("nagios"). n("nagios").
@ -141,7 +141,6 @@ func TestTryAddState(t *testing.T) {
f("state", 0).b(), f("state", 0).b(),
} }
assertEqual(t, exp, metrics) assertEqual(t, exp, metrics)
require.NoError(t, err)
}, },
}, },
{ {
@ -150,7 +149,7 @@ func TestTryAddState(t *testing.T) {
return nil return nil
}, },
metrics: []telegraf.Metric{}, metrics: []telegraf.Metric{},
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { assertF: func(t *testing.T, metrics []telegraf.Metric) {
require.Len(t, metrics, 1) require.Len(t, metrics, 1)
m := metrics[0] m := metrics[0]
require.Equal(t, "nagios_state", m.Name()) require.Equal(t, "nagios_state", m.Name())
@ -158,37 +157,54 @@ func TestTryAddState(t *testing.T) {
require.True(t, ok) require.True(t, ok)
require.Equal(t, int64(0), s) require.Equal(t, int64(0), s)
require.WithinDuration(t, time.Now().UTC(), m.Time(), 10*time.Second) require.WithinDuration(t, time.Now().UTC(), m.Time(), 10*time.Second)
require.NoError(t, err)
}, },
}, },
{ {
name: "should return original metrics and an error", name: "should return metrics with state unknown and thrown error is service_output",
runErrF: func() error { runErrF: func() error {
return errors.New("non parsable error") return errors.New("non parsable error")
}, },
metrics: []telegraf.Metric{ metrics: []telegraf.Metric{
mb(). mb().
n("nagios"). n("nagios_state").b(),
f("perfdata", 0).b(),
}, },
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { assertF: func(t *testing.T, metrics []telegraf.Metric) {
exp := []telegraf.Metric{ exp := []telegraf.Metric{
mb(). mb().
n("nagios"). n("nagios_state").
f("perfdata", 0).b(), f("state", 3).
f("service_output", "non parsable error").b(),
} }
expErr := "exec: get exit code: non parsable error"
assertEqual(t, exp, metrics) assertEqual(t, exp, metrics)
require.Equal(t, expErr, err.Error()) },
},
{
name: "should return metrics with state unknown and service_output error from error message parameter",
runErrF: func() error {
return errors.New("")
},
runErrMessage: []byte("some error message"),
metrics: []telegraf.Metric{
mb().
n("nagios_state").b(),
},
assertF: func(t *testing.T, metrics []telegraf.Metric) {
exp := []telegraf.Metric{
mb().
n("nagios_state").
f("state", 3).
f("service_output", "some error message").b(),
}
assertEqual(t, exp, metrics)
}, },
}, },
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
metrics, err := TryAddState(tt.runErrF(), tt.metrics) metrics := AddState(tt.runErrF(), tt.runErrMessage, tt.metrics)
tt.assertF(t, metrics, err) tt.assertF(t, metrics)
}) })
} }
} }