feat(inputs.vsphere): Flag for more lenient behavior when connect fails on startup (#12828)

This commit is contained in:
Pontus Rydin 2023-03-13 07:19:49 -04:00 committed by GitHub
parent 2006086262
commit 7daf7bb38f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 109 additions and 58 deletions

View File

@ -234,6 +234,12 @@ to use them.
## The Historical Interval value must match EXACTLY the interval in the daily ## The Historical Interval value must match EXACTLY the interval in the daily
# "Interval Duration" found on the VCenter server under Configure > General > Statistics > Statistic intervals # "Interval Duration" found on the VCenter server under Configure > General > Statistics > Statistic intervals
# historical_interval = "5m" # historical_interval = "5m"
## Specifies plugin behavior regarding disconnected servers
## Available choices :
## - error: telegraf will return an error on startup if one of the servers is unreachable
## - ignore: telegraf will ignore unreachable servers on both startup and gather
# disconnected_servers_behavior = "error"
``` ```
NOTE: To disable collection of a specific resource type, simply exclude all NOTE: To disable collection of a specific resource type, simply exclude all

View File

@ -304,7 +304,17 @@ func (e *Endpoint) initalDiscovery(ctx context.Context) {
func (e *Endpoint) init(ctx context.Context) error { func (e *Endpoint) init(ctx context.Context) error {
client, err := e.clientFactory.GetClient(ctx) client, err := e.clientFactory.GetClient(ctx)
if err != nil { if err != nil {
return err switch e.Parent.DisconnectedServersBehavior {
case "error":
return err
case "ignore":
// Ignore the error and postpone the init until next collection cycle
e.log.Warnf("Error connecting to vCenter on init: %s", err)
return nil
default:
return fmt.Errorf("%q is not a valid value for disconnected_servers_behavior",
e.Parent.DisconnectedServersBehavior)
}
} }
// Initial load of custom field metadata // Initial load of custom field metadata
@ -889,6 +899,15 @@ func (e *Endpoint) Close() {
// Collect runs a round of data collections as specified in the configuration. // Collect runs a round of data collections as specified in the configuration.
func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error { func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error {
// Connection could have failed on init, so we need to check for a deferred
// init request.
if !e.initialized {
e.log.Debug("Performing deferred init")
err := e.init(ctx)
if err != nil {
return err
}
}
// If we never managed to do a discovery, collection will be a no-op. Therefore, // If we never managed to do a discovery, collection will be a no-op. Therefore,
// we need to check that a connection is available, or the collection will // we need to check that a connection is available, or the collection will
// silently fail. // silently fail.

View File

@ -193,3 +193,9 @@
## The Historical Interval value must match EXACTLY the interval in the daily ## The Historical Interval value must match EXACTLY the interval in the daily
# "Interval Duration" found on the VCenter server under Configure > General > Statistics > Statistic intervals # "Interval Duration" found on the VCenter server under Configure > General > Statistics > Statistic intervals
# historical_interval = "5m" # historical_interval = "5m"
## Specifies plugin behavior regarding disconnected servers
## Available choices :
## - error: telegraf will return an error on startup if one of the servers is unreachable
## - ignore: telegraf will ignore unreachable servers on both startup and gather
# disconnected_servers_behavior = "error"

View File

@ -22,45 +22,46 @@ var sampleConfig string
// VSphere is the top level type for the vSphere input plugin. It contains all the configuration // VSphere is the top level type for the vSphere input plugin. It contains all the configuration
// and a list of connected vSphere endpoints // and a list of connected vSphere endpoints
type VSphere struct { type VSphere struct {
Vcenters []string Vcenters []string
Username config.Secret `toml:"username"` Username config.Secret `toml:"username"`
Password config.Secret `toml:"password"` Password config.Secret `toml:"password"`
DatacenterInstances bool DatacenterInstances bool
DatacenterMetricInclude []string DatacenterMetricInclude []string
DatacenterMetricExclude []string DatacenterMetricExclude []string
DatacenterInclude []string DatacenterInclude []string
DatacenterExclude []string DatacenterExclude []string
ClusterInstances bool ClusterInstances bool
ClusterMetricInclude []string ClusterMetricInclude []string
ClusterMetricExclude []string ClusterMetricExclude []string
ClusterInclude []string ClusterInclude []string
ClusterExclude []string ClusterExclude []string
ResourcePoolInstances bool ResourcePoolInstances bool
ResourcePoolMetricInclude []string ResourcePoolMetricInclude []string
ResourcePoolMetricExclude []string ResourcePoolMetricExclude []string
ResourcePoolInclude []string ResourcePoolInclude []string
ResourcePoolExclude []string ResourcePoolExclude []string
HostInstances bool HostInstances bool
HostMetricInclude []string HostMetricInclude []string
HostMetricExclude []string HostMetricExclude []string
HostInclude []string HostInclude []string
HostExclude []string HostExclude []string
VMInstances bool `toml:"vm_instances"` VMInstances bool `toml:"vm_instances"`
VMMetricInclude []string `toml:"vm_metric_include"` VMMetricInclude []string `toml:"vm_metric_include"`
VMMetricExclude []string `toml:"vm_metric_exclude"` VMMetricExclude []string `toml:"vm_metric_exclude"`
VMInclude []string `toml:"vm_include"` VMInclude []string `toml:"vm_include"`
VMExclude []string `toml:"vm_exclude"` VMExclude []string `toml:"vm_exclude"`
DatastoreInstances bool DatastoreInstances bool
DatastoreMetricInclude []string DatastoreMetricInclude []string
DatastoreMetricExclude []string DatastoreMetricExclude []string
DatastoreInclude []string DatastoreInclude []string
DatastoreExclude []string DatastoreExclude []string
Separator string Separator string
CustomAttributeInclude []string CustomAttributeInclude []string
CustomAttributeExclude []string CustomAttributeExclude []string
UseIntSamples bool UseIntSamples bool
IPAddresses []string IPAddresses []string
MetricLookback int MetricLookback int
DisconnectedServersBehavior string
MaxQueryObjects int MaxQueryObjects int
MaxQueryMetrics int MaxQueryMetrics int
@ -186,15 +187,16 @@ func init() {
UseIntSamples: true, UseIntSamples: true,
IPAddresses: []string{}, IPAddresses: []string{},
MaxQueryObjects: 256, MaxQueryObjects: 256,
MaxQueryMetrics: 256, MaxQueryMetrics: 256,
CollectConcurrency: 1, CollectConcurrency: 1,
DiscoverConcurrency: 1, DiscoverConcurrency: 1,
MetricLookback: 3, MetricLookback: 3,
ForceDiscoverOnInit: true, ForceDiscoverOnInit: true,
ObjectDiscoveryInterval: config.Duration(time.Second * 300), ObjectDiscoveryInterval: config.Duration(time.Second * 300),
Timeout: config.Duration(time.Second * 60), Timeout: config.Duration(time.Second * 60),
HistoricalInterval: config.Duration(time.Second * 300), HistoricalInterval: config.Duration(time.Second * 300),
DisconnectedServersBehavior: "error",
} }
}) })
} }

View File

@ -4,6 +4,7 @@ import (
"context" "context"
"crypto/tls" "crypto/tls"
"fmt" "fmt"
"net/url"
"os" "os"
"strings" "strings"
"testing" "testing"
@ -132,15 +133,16 @@ func defaultVSphere() *VSphere {
DatacenterInclude: []string{"/**"}, DatacenterInclude: []string{"/**"},
ClientConfig: itls.ClientConfig{InsecureSkipVerify: true}, ClientConfig: itls.ClientConfig{InsecureSkipVerify: true},
MaxQueryObjects: 256, MaxQueryObjects: 256,
MaxQueryMetrics: 256, MaxQueryMetrics: 256,
ObjectDiscoveryInterval: config.Duration(time.Second * 300), ObjectDiscoveryInterval: config.Duration(time.Second * 300),
Timeout: config.Duration(time.Second * 20), Timeout: config.Duration(time.Second * 20),
ForceDiscoverOnInit: true, ForceDiscoverOnInit: true,
DiscoverConcurrency: 1, DiscoverConcurrency: 1,
CollectConcurrency: 1, CollectConcurrency: 1,
Separator: ".", Separator: ".",
HistoricalInterval: config.Duration(time.Second * 300), HistoricalInterval: config.Duration(time.Second * 300),
DisconnectedServersBehavior: "error",
} }
} }
@ -449,6 +451,22 @@ func TestCollectionNoClusterMetrics(t *testing.T) {
testCollection(t, true) testCollection(t, true)
} }
// TestDisconnectedServerBehavior checks the result of NewEndpoint for each
// value of DisconnectedServersBehavior when it is pointed at a host that is
// expected to be unreachable: "error" must surface an error, "ignore" must
// not, and any other value must be rejected with a descriptive message.
func TestDisconnectedServerBehavior(t *testing.T) {
	u, err := url.Parse("https://definitely.not.a.valid.host")
	require.NoError(t, err)
	v := defaultVSphere()

	// "error": the connection failure is reported to the caller.
	v.DisconnectedServersBehavior = "error"
	_, err = NewEndpoint(context.Background(), v, u, v.Log)
	require.Error(t, err)

	// "ignore": no error is returned despite the unreachable host.
	v.DisconnectedServersBehavior = "ignore"
	_, err = NewEndpoint(context.Background(), v, u, v.Log)
	require.NoError(t, err)

	// Any other value is a configuration error. EqualError asserts both that
	// err is non-nil and that its message matches, and keeps the
	// expected/actual roles unambiguous in failure output.
	v.DisconnectedServersBehavior = "something else"
	_, err = NewEndpoint(context.Background(), v, u, v.Log)
	require.EqualError(t, err, `"something else" is not a valid value for disconnected_servers_behavior`)
}
func testCollection(t *testing.T, excludeClusters bool) { func testCollection(t *testing.T, excludeClusters bool) {
mustHaveMetrics := map[string]struct{}{ mustHaveMetrics := map[string]struct{}{
"vsphere.vm.cpu": {}, "vsphere.vm.cpu": {},