feat(inputs.execd): Add option to not restart program on error (#15271)
This commit is contained in:
parent
5e830fb29a
commit
12ab6dfb33
|
|
@ -23,6 +23,7 @@ type Process struct {
|
||||||
ReadStdoutFn func(io.Reader)
|
ReadStdoutFn func(io.Reader)
|
||||||
ReadStderrFn func(io.Reader)
|
ReadStderrFn func(io.Reader)
|
||||||
RestartDelay time.Duration
|
RestartDelay time.Duration
|
||||||
|
StopOnError bool
|
||||||
Log telegraf.Logger
|
Log telegraf.Logger
|
||||||
|
|
||||||
name string
|
name string
|
||||||
|
|
@ -31,6 +32,8 @@ type Process struct {
|
||||||
pid int32
|
pid int32
|
||||||
cancel context.CancelFunc
|
cancel context.CancelFunc
|
||||||
mainLoopWg sync.WaitGroup
|
mainLoopWg sync.WaitGroup
|
||||||
|
|
||||||
|
sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a new process wrapper
|
// New creates a new process wrapper
|
||||||
|
|
@ -65,10 +68,10 @@ func (p *Process) Start() error {
|
||||||
|
|
||||||
p.mainLoopWg.Add(1)
|
p.mainLoopWg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
|
defer p.mainLoopWg.Done()
|
||||||
if err := p.cmdLoop(ctx); err != nil {
|
if err := p.cmdLoop(ctx); err != nil {
|
||||||
p.Log.Errorf("Process quit with message: %v", err)
|
p.Log.Errorf("Process quit with message: %v", err)
|
||||||
}
|
}
|
||||||
p.mainLoopWg.Done()
|
|
||||||
}()
|
}()
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
|
@ -81,12 +84,24 @@ func (p *Process) Stop() {
|
||||||
p.cancel()
|
p.cancel()
|
||||||
}
|
}
|
||||||
// close stdin so the app can shut down gracefully.
|
// close stdin so the app can shut down gracefully.
|
||||||
if err := p.Stdin.Close(); err != nil {
|
if err := p.Stdin.Close(); err != nil && !errors.Is(err, os.ErrClosed) {
|
||||||
p.Log.Errorf("Stdin closed with message: %v", err)
|
p.Log.Errorf("Stdin closed with message: %v", err)
|
||||||
}
|
}
|
||||||
p.mainLoopWg.Wait()
|
p.mainLoopWg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (p *Process) Pid() int {
|
||||||
|
pid := atomic.LoadInt32(&p.pid)
|
||||||
|
return int(pid)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *Process) State() (state *os.ProcessState, running bool) {
|
||||||
|
p.Lock()
|
||||||
|
defer p.Unlock()
|
||||||
|
|
||||||
|
return p.Cmd.ProcessState, p.Cmd.ProcessState.ExitCode() == -1
|
||||||
|
}
|
||||||
|
|
||||||
func (p *Process) cmdStart() error {
|
func (p *Process) cmdStart() error {
|
||||||
p.Cmd = exec.Command(p.name, p.args...)
|
p.Cmd = exec.Command(p.name, p.args...)
|
||||||
|
|
||||||
|
|
@ -119,15 +134,13 @@ func (p *Process) cmdStart() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Process) Pid() int {
|
|
||||||
pid := atomic.LoadInt32(&p.pid)
|
|
||||||
return int(pid)
|
|
||||||
}
|
|
||||||
|
|
||||||
// cmdLoop watches an already running process, restarting it when appropriate.
|
// cmdLoop watches an already running process, restarting it when appropriate.
|
||||||
func (p *Process) cmdLoop(ctx context.Context) error {
|
func (p *Process) cmdLoop(ctx context.Context) error {
|
||||||
for {
|
for {
|
||||||
err := p.cmdWait(ctx)
|
err := p.cmdWait(ctx)
|
||||||
|
if err != nil && p.StopOnError {
|
||||||
|
return err
|
||||||
|
}
|
||||||
if isQuitting(ctx) {
|
if isQuitting(ctx) {
|
||||||
p.Log.Infof("Process %s shut down", p.Cmd.Path)
|
p.Log.Infof("Process %s shut down", p.Cmd.Path)
|
||||||
return nil
|
return nil
|
||||||
|
|
@ -184,7 +197,9 @@ func (p *Process) cmdWait(ctx context.Context) error {
|
||||||
wg.Done()
|
wg.Done()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
p.Lock()
|
||||||
err := p.Cmd.Wait()
|
err := p.Cmd.Wait()
|
||||||
|
p.Unlock()
|
||||||
processCancel()
|
processCancel()
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
return err
|
return err
|
||||||
|
|
|
||||||
|
|
@ -59,20 +59,24 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
||||||
## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended)
|
## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended)
|
||||||
## "SIGUSR1" : Send a USR1 signal. Not available on Windows.
|
## "SIGUSR1" : Send a USR1 signal. Not available on Windows.
|
||||||
## "SIGUSR2" : Send a USR2 signal. Not available on Windows.
|
## "SIGUSR2" : Send a USR2 signal. Not available on Windows.
|
||||||
signal = "none"
|
# signal = "none"
|
||||||
|
|
||||||
## Delay before the process is restarted after an unexpected termination
|
## Delay before the process is restarted after an unexpected termination
|
||||||
restart_delay = "10s"
|
# restart_delay = "10s"
|
||||||
|
|
||||||
## Buffer size used to read from the command output stream
|
## Buffer size used to read from the command output stream
|
||||||
## Optional parameter. Default is 64 Kib, minimum is 16 bytes
|
## Optional parameter. Default is 64 Kib, minimum is 16 bytes
|
||||||
# buffer_size = "64Kib"
|
# buffer_size = "64Kib"
|
||||||
|
|
||||||
|
## Disable automatic restart of the program and stop if the program exits
|
||||||
|
## with an error (i.e. non-zero error code)
|
||||||
|
# stop_on_error = false
|
||||||
|
|
||||||
## Data format to consume.
|
## Data format to consume.
|
||||||
## Each data format has its own unique set of configuration options, read
|
## Each data format has its own unique set of configuration options, read
|
||||||
## more about them here:
|
## more about them here:
|
||||||
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
|
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
|
||||||
data_format = "influx"
|
# data_format = "influx"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Example
|
## Example
|
||||||
|
|
|
||||||
|
|
@ -25,10 +25,11 @@ var sampleConfig string
|
||||||
type Execd struct {
|
type Execd struct {
|
||||||
Command []string `toml:"command"`
|
Command []string `toml:"command"`
|
||||||
Environment []string `toml:"environment"`
|
Environment []string `toml:"environment"`
|
||||||
|
BufferSize config.Size `toml:"buffer_size"`
|
||||||
Signal string `toml:"signal"`
|
Signal string `toml:"signal"`
|
||||||
RestartDelay config.Duration `toml:"restart_delay"`
|
RestartDelay config.Duration `toml:"restart_delay"`
|
||||||
|
StopOnError bool `toml:"stop_on_error"`
|
||||||
Log telegraf.Logger `toml:"-"`
|
Log telegraf.Logger `toml:"-"`
|
||||||
BufferSize config.Size `toml:"buffer_size"`
|
|
||||||
|
|
||||||
process *process.Process
|
process *process.Process
|
||||||
acc telegraf.Accumulator
|
acc telegraf.Accumulator
|
||||||
|
|
@ -59,10 +60,11 @@ func (e *Execd) Start(acc telegraf.Accumulator) error {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error creating new process: %w", err)
|
return fmt.Errorf("error creating new process: %w", err)
|
||||||
}
|
}
|
||||||
e.process.Log = e.Log
|
|
||||||
e.process.RestartDelay = time.Duration(e.RestartDelay)
|
|
||||||
e.process.ReadStdoutFn = e.outputReader
|
e.process.ReadStdoutFn = e.outputReader
|
||||||
e.process.ReadStderrFn = e.cmdReadErr
|
e.process.ReadStderrFn = e.cmdReadErr
|
||||||
|
e.process.RestartDelay = time.Duration(e.RestartDelay)
|
||||||
|
e.process.StopOnError = e.StopOnError
|
||||||
|
e.process.Log = e.Log
|
||||||
|
|
||||||
if err = e.process.Start(); err != nil {
|
if err = e.process.Start(); err != nil {
|
||||||
// if there was only one argument, and it contained spaces, warn the user
|
// if there was only one argument, and it contained spaces, warn the user
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ func TestExternalInputWorks(t *testing.T) {
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
e := &Execd{
|
e := &Execd{
|
||||||
Command: []string{exe, "-counter"},
|
Command: []string{exe, "-mode", "counter"},
|
||||||
Environment: []string{"PLUGINS_INPUTS_EXECD_MODE=application", "METRIC_NAME=counter"},
|
Environment: []string{"PLUGINS_INPUTS_EXECD_MODE=application", "METRIC_NAME=counter"},
|
||||||
RestartDelay: config.Duration(5 * time.Second),
|
RestartDelay: config.Duration(5 * time.Second),
|
||||||
Signal: "STDIN",
|
Signal: "STDIN",
|
||||||
|
|
@ -159,6 +159,62 @@ test{handler="execd",quantile="0.5"} 42.0
|
||||||
testutil.RequireMetricsEqual(t, expected, actual, testutil.IgnoreTime())
|
testutil.RequireMetricsEqual(t, expected, actual, testutil.IgnoreTime())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestStopOnError(t *testing.T) {
|
||||||
|
exe, err := os.Executable()
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
plugin := &Execd{
|
||||||
|
Command: []string{exe, "-mode", "fail"},
|
||||||
|
Environment: []string{"PLUGINS_INPUTS_EXECD_MODE=application"},
|
||||||
|
StopOnError: true,
|
||||||
|
RestartDelay: config.Duration(5 * time.Second),
|
||||||
|
Log: testutil.Logger{},
|
||||||
|
}
|
||||||
|
|
||||||
|
parser := models.NewRunningParser(&influx.Parser{}, &models.ParserConfig{})
|
||||||
|
require.NoError(t, parser.Init())
|
||||||
|
plugin.SetParser(parser)
|
||||||
|
|
||||||
|
var acc testutil.Accumulator
|
||||||
|
require.NoError(t, plugin.Start(&acc))
|
||||||
|
defer plugin.Stop()
|
||||||
|
|
||||||
|
require.Eventually(t, func() bool {
|
||||||
|
_, running := plugin.process.State()
|
||||||
|
return !running
|
||||||
|
}, 3*time.Second, 100*time.Millisecond)
|
||||||
|
|
||||||
|
state, running := plugin.process.State()
|
||||||
|
require.False(t, running)
|
||||||
|
require.Equal(t, 42, state.ExitCode())
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStopOnErrorSuccess(t *testing.T) {
|
||||||
|
exe, err := os.Executable()
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
plugin := &Execd{
|
||||||
|
Command: []string{exe, "-mode", "success"},
|
||||||
|
Environment: []string{"PLUGINS_INPUTS_EXECD_MODE=application"},
|
||||||
|
StopOnError: true,
|
||||||
|
RestartDelay: config.Duration(100 * time.Millisecond),
|
||||||
|
Log: testutil.Logger{},
|
||||||
|
}
|
||||||
|
|
||||||
|
parser := models.NewRunningParser(&influx.Parser{}, &models.ParserConfig{})
|
||||||
|
require.NoError(t, parser.Init())
|
||||||
|
plugin.SetParser(parser)
|
||||||
|
|
||||||
|
var acc testutil.Accumulator
|
||||||
|
require.NoError(t, plugin.Start(&acc))
|
||||||
|
defer plugin.Stop()
|
||||||
|
|
||||||
|
// Wait for at least two metric as this indicates the process was restarted
|
||||||
|
require.Eventually(t, func() bool {
|
||||||
|
return acc.NMetrics() > 1
|
||||||
|
}, 3*time.Second, 100*time.Millisecond)
|
||||||
|
}
|
||||||
|
|
||||||
func readChanWithTimeout(t *testing.T, metrics chan telegraf.Metric, timeout time.Duration) telegraf.Metric {
|
func readChanWithTimeout(t *testing.T, metrics chan telegraf.Metric, timeout time.Duration) telegraf.Metric {
|
||||||
to := time.NewTimer(timeout)
|
to := time.NewTimer(timeout)
|
||||||
defer to.Stop()
|
defer to.Stop()
|
||||||
|
|
@ -189,20 +245,32 @@ func (tm *TestMetricMaker) Log() telegraf.Logger {
|
||||||
return logger.NewLogger("TestPlugin", "test", "")
|
return logger.NewLogger("TestPlugin", "test", "")
|
||||||
}
|
}
|
||||||
|
|
||||||
var counter = flag.Bool("counter", false,
|
|
||||||
"if true, act like line input program instead of test")
|
|
||||||
|
|
||||||
func TestMain(m *testing.M) {
|
func TestMain(m *testing.M) {
|
||||||
|
var mode string
|
||||||
|
|
||||||
|
flag.StringVar(&mode, "mode", "counter", "determines the output when run as mockup program")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
runMode := os.Getenv("PLUGINS_INPUTS_EXECD_MODE")
|
|
||||||
if *counter && runMode == "application" {
|
operationMode := os.Getenv("PLUGINS_INPUTS_EXECD_MODE")
|
||||||
|
if operationMode != "application" {
|
||||||
|
// Run the normal test mode
|
||||||
|
os.Exit(m.Run())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run as a mock program
|
||||||
|
switch mode {
|
||||||
|
case "counter":
|
||||||
if err := runCounterProgram(); err != nil {
|
if err := runCounterProgram(); err != nil {
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
os.Exit(0)
|
os.Exit(0)
|
||||||
|
case "fail":
|
||||||
|
os.Exit(42)
|
||||||
|
case "success":
|
||||||
|
fmt.Println("test value=42i")
|
||||||
|
os.Exit(0)
|
||||||
}
|
}
|
||||||
code := m.Run()
|
os.Exit(23)
|
||||||
os.Exit(code)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func runCounterProgram() error {
|
func runCounterProgram() error {
|
||||||
|
|
@ -217,9 +285,7 @@ func runCounterProgram() error {
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
m := metric.New(envMetricName,
|
m := metric.New(envMetricName,
|
||||||
map[string]string{},
|
map[string]string{},
|
||||||
map[string]interface{}{
|
map[string]interface{}{"count": i},
|
||||||
"count": i,
|
|
||||||
},
|
|
||||||
time.Now(),
|
time.Now(),
|
||||||
)
|
)
|
||||||
i++
|
i++
|
||||||
|
|
|
||||||
|
|
@ -18,17 +18,21 @@
|
||||||
## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended)
|
## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended)
|
||||||
## "SIGUSR1" : Send a USR1 signal. Not available on Windows.
|
## "SIGUSR1" : Send a USR1 signal. Not available on Windows.
|
||||||
## "SIGUSR2" : Send a USR2 signal. Not available on Windows.
|
## "SIGUSR2" : Send a USR2 signal. Not available on Windows.
|
||||||
signal = "none"
|
# signal = "none"
|
||||||
|
|
||||||
## Delay before the process is restarted after an unexpected termination
|
## Delay before the process is restarted after an unexpected termination
|
||||||
restart_delay = "10s"
|
# restart_delay = "10s"
|
||||||
|
|
||||||
## Buffer size used to read from the command output stream
|
## Buffer size used to read from the command output stream
|
||||||
## Optional parameter. Default is 64 Kib, minimum is 16 bytes
|
## Optional parameter. Default is 64 Kib, minimum is 16 bytes
|
||||||
# buffer_size = "64Kib"
|
# buffer_size = "64Kib"
|
||||||
|
|
||||||
|
## Disable automatic restart of the program and stop if the program exits
|
||||||
|
## with an error (i.e. non-zero error code)
|
||||||
|
# stop_on_error = false
|
||||||
|
|
||||||
## Data format to consume.
|
## Data format to consume.
|
||||||
## Each data format has its own unique set of configuration options, read
|
## Each data format has its own unique set of configuration options, read
|
||||||
## more about them here:
|
## more about them here:
|
||||||
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
|
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
|
||||||
data_format = "influx"
|
# data_format = "influx"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue