fix(parsers/nagios): metrics will always return a supported status co… (#11062)
Co-authored-by: Morten Urban <morten.urban@sectornord.de>
This commit is contained in:
parent
d3abbc0897
commit
42f954fc88
|
|
@ -127,9 +127,9 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
_, isNagios := e.parser.(*nagios.NagiosParser)
|
_, isNagios := e.parser.(*nagios.NagiosParser)
|
||||||
|
|
||||||
out, errbuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout))
|
out, errBuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout))
|
||||||
if !isNagios && runErr != nil {
|
if !isNagios && runErr != nil {
|
||||||
err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errbuf))
|
err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errBuf))
|
||||||
acc.AddError(err)
|
acc.AddError(err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
@ -141,10 +141,7 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync
|
||||||
}
|
}
|
||||||
|
|
||||||
if isNagios {
|
if isNagios {
|
||||||
metrics, err = nagios.TryAddState(runErr, metrics)
|
metrics = nagios.AddState(runErr, errBuf, metrics)
|
||||||
if err != nil {
|
|
||||||
e.Log.Errorf("Failed to add nagios state: %s", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, m := range metrics {
|
for _, m := range metrics {
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,6 @@ import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
|
@ -16,6 +15,10 @@ import (
|
||||||
"github.com/influxdata/telegraf/metric"
|
"github.com/influxdata/telegraf/metric"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// unknownExitCode is the nagios unknown status code
|
||||||
|
// the exit code should be used if an error occurs or something unexpected happens
|
||||||
|
const unknownExitCode = 3
|
||||||
|
|
||||||
// getExitCode get the exit code from an error value which is the result
|
// getExitCode get the exit code from an error value which is the result
|
||||||
// of running a command through exec package api.
|
// of running a command through exec package api.
|
||||||
func getExitCode(err error) (int, error) {
|
func getExitCode(err error) (int, error) {
|
||||||
|
|
@ -25,10 +28,7 @@ func getExitCode(err error) (int, error) {
|
||||||
|
|
||||||
ee, ok := err.(*exec.ExitError)
|
ee, ok := err.(*exec.ExitError)
|
||||||
if !ok {
|
if !ok {
|
||||||
// If it is not an *exec.ExitError, then it must be
|
return unknownExitCode, err
|
||||||
// an io error, but docs do not say anything about the
|
|
||||||
// exit code in this case.
|
|
||||||
return 0, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ws, ok := ee.Sys().(syscall.WaitStatus)
|
ws, ok := ee.Sys().(syscall.WaitStatus)
|
||||||
|
|
@ -39,19 +39,35 @@ func getExitCode(err error) (int, error) {
|
||||||
return ws.ExitStatus(), nil
|
return ws.ExitStatus(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// TryAddState attempts to add a state derived from the runErr.
|
// AddState adds a state derived from the runErr. Unknown state will be set as fallback.
|
||||||
// If any error occurs, it is guaranteed to be returned along with
|
// If any error occurs, it is guaranteed to be added to the service output.
|
||||||
// the initial metric slice.
|
// An updated slice of metrics will be returned.
|
||||||
func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, error) {
|
func AddState(runErr error, errMessage []byte, metrics []telegraf.Metric) []telegraf.Metric {
|
||||||
state, err := getExitCode(runErr)
|
state, exitErr := getExitCode(runErr)
|
||||||
if err != nil {
|
// This will ensure that in every error case the valid nagios state 'unknown' will be returned.
|
||||||
return metrics, fmt.Errorf("exec: get exit code: %s", err)
|
// No error needs to be thrown because the output will contain the error information.
|
||||||
|
// Description found at 'Plugin Return Codes' https://nagios-plugins.org/doc/guidelines.html
|
||||||
|
if exitErr != nil || state < 0 || state > unknownExitCode {
|
||||||
|
state = unknownExitCode
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, m := range metrics {
|
for _, m := range metrics {
|
||||||
if m.Name() == "nagios_state" {
|
if m.Name() == "nagios_state" {
|
||||||
m.AddField("state", state)
|
m.AddField("state", state)
|
||||||
return metrics, nil
|
|
||||||
|
if state == unknownExitCode {
|
||||||
|
errorMessage := string(errMessage)
|
||||||
|
if exitErr != nil && exitErr.Error() != "" {
|
||||||
|
errorMessage = exitErr.Error()
|
||||||
|
}
|
||||||
|
value, ok := m.GetField("service_output")
|
||||||
|
if !ok || value == "" {
|
||||||
|
// By adding the error message as output, the metric contains all needed information to understand
|
||||||
|
// the problem and fix it
|
||||||
|
m.AddField("service_output", errorMessage)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return metrics
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -66,8 +82,7 @@ func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, er
|
||||||
}
|
}
|
||||||
m := metric.New("nagios_state", nil, f, ts)
|
m := metric.New("nagios_state", nil, f, ts)
|
||||||
|
|
||||||
metrics = append(metrics, m)
|
return append(metrics, m)
|
||||||
return metrics, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type NagiosParser struct {
|
type NagiosParser struct {
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ func TestGetExitCode(t *testing.T) {
|
||||||
errF: func() error {
|
errF: func() error {
|
||||||
return errors.New("I am not *exec.ExitError")
|
return errors.New("I am not *exec.ExitError")
|
||||||
},
|
},
|
||||||
expCode: 0,
|
expCode: 3,
|
||||||
expErr: errors.New("I am not *exec.ExitError"),
|
expErr: errors.New("I am not *exec.ExitError"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
@ -89,10 +89,11 @@ func assertEqual(t *testing.T, exp, actual []telegraf.Metric) {
|
||||||
|
|
||||||
func TestTryAddState(t *testing.T) {
|
func TestTryAddState(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
runErrF func() error
|
runErrF func() error
|
||||||
metrics []telegraf.Metric
|
runErrMessage []byte
|
||||||
assertF func(*testing.T, []telegraf.Metric, error)
|
metrics []telegraf.Metric
|
||||||
|
assertF func(*testing.T, []telegraf.Metric)
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "should append state=0 field to existing metric",
|
name: "should append state=0 field to existing metric",
|
||||||
|
|
@ -107,7 +108,7 @@ func TestTryAddState(t *testing.T) {
|
||||||
n("nagios_state").
|
n("nagios_state").
|
||||||
f("service_output", "OK: system working").b(),
|
f("service_output", "OK: system working").b(),
|
||||||
},
|
},
|
||||||
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) {
|
assertF: func(t *testing.T, metrics []telegraf.Metric) {
|
||||||
exp := []telegraf.Metric{
|
exp := []telegraf.Metric{
|
||||||
mb().
|
mb().
|
||||||
n("nagios").
|
n("nagios").
|
||||||
|
|
@ -118,7 +119,6 @@ func TestTryAddState(t *testing.T) {
|
||||||
f("state", 0).b(),
|
f("state", 0).b(),
|
||||||
}
|
}
|
||||||
assertEqual(t, exp, metrics)
|
assertEqual(t, exp, metrics)
|
||||||
require.NoError(t, err)
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -131,7 +131,7 @@ func TestTryAddState(t *testing.T) {
|
||||||
n("nagios").
|
n("nagios").
|
||||||
f("perfdata", 0).b(),
|
f("perfdata", 0).b(),
|
||||||
},
|
},
|
||||||
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) {
|
assertF: func(t *testing.T, metrics []telegraf.Metric) {
|
||||||
exp := []telegraf.Metric{
|
exp := []telegraf.Metric{
|
||||||
mb().
|
mb().
|
||||||
n("nagios").
|
n("nagios").
|
||||||
|
|
@ -141,7 +141,6 @@ func TestTryAddState(t *testing.T) {
|
||||||
f("state", 0).b(),
|
f("state", 0).b(),
|
||||||
}
|
}
|
||||||
assertEqual(t, exp, metrics)
|
assertEqual(t, exp, metrics)
|
||||||
require.NoError(t, err)
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -150,7 +149,7 @@ func TestTryAddState(t *testing.T) {
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
metrics: []telegraf.Metric{},
|
metrics: []telegraf.Metric{},
|
||||||
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) {
|
assertF: func(t *testing.T, metrics []telegraf.Metric) {
|
||||||
require.Len(t, metrics, 1)
|
require.Len(t, metrics, 1)
|
||||||
m := metrics[0]
|
m := metrics[0]
|
||||||
require.Equal(t, "nagios_state", m.Name())
|
require.Equal(t, "nagios_state", m.Name())
|
||||||
|
|
@ -158,37 +157,54 @@ func TestTryAddState(t *testing.T) {
|
||||||
require.True(t, ok)
|
require.True(t, ok)
|
||||||
require.Equal(t, int64(0), s)
|
require.Equal(t, int64(0), s)
|
||||||
require.WithinDuration(t, time.Now().UTC(), m.Time(), 10*time.Second)
|
require.WithinDuration(t, time.Now().UTC(), m.Time(), 10*time.Second)
|
||||||
require.NoError(t, err)
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "should return original metrics and an error",
|
name: "should return metrics with state unknown and thrown error is service_output",
|
||||||
runErrF: func() error {
|
runErrF: func() error {
|
||||||
return errors.New("non parsable error")
|
return errors.New("non parsable error")
|
||||||
},
|
},
|
||||||
metrics: []telegraf.Metric{
|
metrics: []telegraf.Metric{
|
||||||
mb().
|
mb().
|
||||||
n("nagios").
|
n("nagios_state").b(),
|
||||||
f("perfdata", 0).b(),
|
|
||||||
},
|
},
|
||||||
assertF: func(t *testing.T, metrics []telegraf.Metric, err error) {
|
assertF: func(t *testing.T, metrics []telegraf.Metric) {
|
||||||
exp := []telegraf.Metric{
|
exp := []telegraf.Metric{
|
||||||
mb().
|
mb().
|
||||||
n("nagios").
|
n("nagios_state").
|
||||||
f("perfdata", 0).b(),
|
f("state", 3).
|
||||||
|
f("service_output", "non parsable error").b(),
|
||||||
}
|
}
|
||||||
expErr := "exec: get exit code: non parsable error"
|
|
||||||
|
|
||||||
assertEqual(t, exp, metrics)
|
assertEqual(t, exp, metrics)
|
||||||
require.Equal(t, expErr, err.Error())
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "should return metrics with state unknown and service_output error from error message parameter",
|
||||||
|
runErrF: func() error {
|
||||||
|
return errors.New("")
|
||||||
|
},
|
||||||
|
runErrMessage: []byte("some error message"),
|
||||||
|
metrics: []telegraf.Metric{
|
||||||
|
mb().
|
||||||
|
n("nagios_state").b(),
|
||||||
|
},
|
||||||
|
assertF: func(t *testing.T, metrics []telegraf.Metric) {
|
||||||
|
exp := []telegraf.Metric{
|
||||||
|
mb().
|
||||||
|
n("nagios_state").
|
||||||
|
f("state", 3).
|
||||||
|
f("service_output", "some error message").b(),
|
||||||
|
}
|
||||||
|
assertEqual(t, exp, metrics)
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
metrics, err := TryAddState(tt.runErrF(), tt.metrics)
|
metrics := AddState(tt.runErrF(), tt.runErrMessage, tt.metrics)
|
||||||
tt.assertF(t, metrics, err)
|
tt.assertF(t, metrics)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue