feat(outputs.datadog): Add support for submitting alongside dd-agent (#15702)

Joseph Heyburn 2024-08-07 15:58:25 +01:00 committed by GitHub
parent 61efaee971
commit 66a042f592
4 changed files with 660 additions and 18 deletions

@ -36,6 +36,13 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## Override the default (none) compression used to send data.
## Supports: "zlib", "none"
# compression = "none"
## When non-zero, converts count metrics submitted by inputs.statsd
## into rates, dividing the metric value by this interval in seconds.
## Note that in order for metrics to be submitted simultaneously alongside
## a Datadog agent, rate_interval has to match the interval used by the
## agent, which defaults to 10s.
# rate_interval = 0s
```
## Metrics
@ -46,11 +53,13 @@ field key with a `.` character.
Field values are converted to floating point numbers. Strings and floats that
cannot be sent over JSON, namely NaN and Inf, are ignored.
We do not send `Rate` types. Counts are sent as `count`, with an
interval hard-coded to 1. Note that this behavior does *not* play
super-well if running simultaneously with current Datadog agents; they
will attempt to change to `Rate` with `interval=10`. We prefer this
method, however, as it reflects the raw data more accurately.
Setting `rate_interval` to a non-zero value converts `count` metrics to `rate`
and divides their values by this interval before submitting to Datadog.
This allows Telegraf to submit metrics alongside Datadog agents when their rate
intervals match (the Datadog agent defaults to `10s`).
Note that this only applies to metrics ingested via `inputs.statsd`, since the
conversion depends on the `metric_type` tag that plugin creates. Only `counter`
metrics, and the `count` values of `timing` and `histogram` metrics, are
converted.
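As a rough illustration of the arithmetic described above (a minimal sketch with
hypothetical values, not the plugin's actual code path):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical example: rate_interval matches the dd-agent default of 10s.
	rateInterval := 10 * time.Second
	counterValue := 100.0 // raw statsd counter value collected by inputs.statsd

	// count -> rate: divide the value by the interval length in seconds and
	// report that interval alongside the point.
	intervalSeconds := rateInterval.Seconds()
	rate := counterValue / intervalSeconds

	fmt.Printf("type=rate interval=%d value=%g\n", int64(intervalSeconds), rate)
	// prints: type=rate interval=10 value=10
}
```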
[metrics]: https://docs.datadoghq.com/api/v1/metrics/#submit-metrics
[apikey]: https://app.datadoghq.com/account/settings#api

@ -25,11 +25,12 @@ import (
var sampleConfig string
type Datadog struct {
Apikey string `toml:"apikey"`
Timeout config.Duration `toml:"timeout"`
URL string `toml:"url"`
Compression string `toml:"compression"`
Log telegraf.Logger `toml:"-"`
Apikey string `toml:"apikey"`
Timeout config.Duration `toml:"timeout"`
URL string `toml:"url"`
Compression string `toml:"compression"`
RateInterval config.Duration `toml:"rate_interval"`
Log telegraf.Logger `toml:"-"`
client *http.Client
proxy.HTTPProxy
@ -75,15 +76,15 @@ func (d *Datadog) Connect() error {
return nil
}
func (d *Datadog) Write(metrics []telegraf.Metric) error {
ts := TimeSeries{}
func (d *Datadog) convertToDatadogMetric(metrics []telegraf.Metric) []*Metric {
tempSeries := []*Metric{}
metricCounter := 0
for _, m := range metrics {
if dogMs, err := buildMetrics(m); err == nil {
metricTags := buildTags(m.TagList())
host, _ := m.GetTag("host")
// Retrieve the metric_type tag created by inputs.statsd
statsDMetricType, _ := m.GetTag("metric_type")
if len(dogMs) == 0 {
continue
@ -99,9 +100,21 @@ func (d *Datadog) Write(metrics []telegraf.Metric) error {
dname = m.Name() + "." + fieldName
}
var tname string
var interval int64
interval = 1
switch m.Type() {
case telegraf.Counter:
tname = "count"
case telegraf.Counter, telegraf.Untyped:
if d.RateInterval > 0 && isRateable(statsDMetricType, fieldName) {
// interval is expected to be in seconds
rateIntervalSeconds := time.Duration(d.RateInterval).Seconds()
interval = int64(rateIntervalSeconds)
dogM[1] = dogM[1] / rateIntervalSeconds
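// Note: per the README above, rate_interval must match the interval the
// dd-agent uses for the same statsd metrics (10s by default) so that both
// agents report comparable rates.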
tname = "rate"
} else if m.Type() == telegraf.Counter {
tname = "count"
} else {
tname = ""
}
case telegraf.Gauge:
tname = "gauge"
default:
@ -112,23 +125,28 @@ func (d *Datadog) Write(metrics []telegraf.Metric) error {
Tags: metricTags,
Host: host,
Type: tname,
Interval: 1,
Interval: interval,
}
metric.Points[0] = dogM
tempSeries = append(tempSeries, metric)
metricCounter++
}
} else {
d.Log.Infof("Unable to build Metric for %s due to error '%v', skipping", m.Name(), err)
}
}
return tempSeries
}
func (d *Datadog) Write(metrics []telegraf.Metric) error {
ts := TimeSeries{}
tempSeries := d.convertToDatadogMetric(metrics)
if len(tempSeries) == 0 {
return nil
}
redactedAPIKey := "****************"
ts.Series = make([]*Metric, metricCounter)
ts.Series = make([]*Metric, len(tempSeries))
copy(ts.Series, tempSeries[0:])
tsBytes, err := json.Marshal(ts)
if err != nil {
@ -220,6 +238,20 @@ func verifyValue(v interface{}) bool {
return true
}
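// isRateable reports whether a statsd-generated metric can be converted to a
// Datadog rate: plain counters always qualify, while timing and histogram
// metrics qualify only for their "count" field.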
func isRateable(statsDMetricType string, fieldName string) bool {
switch statsDMetricType {
case
"counter":
return true
case
"timing",
"histogram":
return fieldName == "count"
default:
return false
}
}
func (p *Point) setValue(v interface{}) error {
switch d := v.(type) {
case int64:

@ -13,6 +13,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/testutil"
)
@ -305,3 +306,596 @@ func TestInfIsSkipped(t *testing.T) {
})
require.NoError(t, err)
}
func TestNonZeroRateIntervalConvertsCountsToRates(t *testing.T) {
d := &Datadog{
Apikey: "123456",
RateInterval: config.Duration(10 * time.Second),
}
var tests = []struct {
name string
metricsIn []telegraf.Metric
metricsOut []*Metric
}{
{
"convert counter metrics to rate",
[]telegraf.Metric{
testutil.MustMetric(
"count_metric",
map[string]string{
"metric_type": "counter",
},
map[string]interface{}{
"value": 100,
},
time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC),
telegraf.Counter,
),
},
[]*Metric{
{
Metric: "count_metric",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
10,
},
},
Type: "rate",
Tags: []string{
"metric_type:counter",
},
Interval: 10,
},
},
},
{
"convert count value in timing metrics to rate",
[]telegraf.Metric{
testutil.MustMetric(
"timing_metric",
map[string]string{
"metric_type": "timing",
},
map[string]interface{}{
"count": 1,
"lower": float64(10),
"mean": float64(10),
"median": float64(10),
"stddev": float64(0),
"sum": float64(10),
"upper": float64(10),
},
time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC),
telegraf.Untyped,
),
},
[]*Metric{
{
Metric: "timing_metric.count",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
0.1,
},
},
Type: "rate",
Tags: []string{
"metric_type:timing",
},
Interval: 10,
},
{
Metric: "timing_metric.lower",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.mean",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.median",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.stddev",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(0),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.sum",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.upper",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
},
},
{
"convert count value in histogram metrics to rate",
[]telegraf.Metric{
testutil.MustMetric(
"histogram_metric",
map[string]string{
"metric_type": "histogram",
},
map[string]interface{}{
"count": 1,
"lower": float64(10),
"mean": float64(10),
"median": float64(10),
"stddev": float64(0),
"sum": float64(10),
"upper": float64(10),
},
time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC),
telegraf.Untyped,
),
},
[]*Metric{
{
Metric: "histogram_metric.count",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
0.1,
},
},
Type: "rate",
Tags: []string{
"metric_type:histogram",
},
Interval: 10,
},
{
Metric: "histogram_metric.lower",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.mean",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.median",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.stddev",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(0),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.sum",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.upper",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actualMetricsOut := d.convertToDatadogMetric(tt.metricsIn)
require.ElementsMatch(t, tt.metricsOut, actualMetricsOut)
})
}
}
func TestZeroRateIntervalDoesNotConvertCountsToRates(t *testing.T) {
d := &Datadog{
Apikey: "123456",
}
var tests = []struct {
name string
metricsIn []telegraf.Metric
metricsOut []*Metric
}{
{
"does not convert counter metrics to rate",
[]telegraf.Metric{
testutil.MustMetric(
"count_metric",
map[string]string{
"metric_type": "counter",
},
map[string]interface{}{
"value": 100,
},
time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC),
telegraf.Counter,
),
},
[]*Metric{
{
Metric: "count_metric",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
100,
},
},
Type: "count",
Tags: []string{
"metric_type:counter",
},
Interval: 1,
},
},
},
{
"does not convert count value in timing metrics to rate",
[]telegraf.Metric{
testutil.MustMetric(
"timing_metric",
map[string]string{
"metric_type": "timing",
},
map[string]interface{}{
"count": 1,
"lower": float64(10),
"mean": float64(10),
"median": float64(10),
"stddev": float64(0),
"sum": float64(10),
"upper": float64(10),
},
time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC),
telegraf.Untyped,
),
},
[]*Metric{
{
Metric: "timing_metric.count",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
1,
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.lower",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.mean",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.median",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.stddev",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(0),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.sum",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
{
Metric: "timing_metric.upper",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:timing",
},
Interval: 1,
},
},
},
{
"does not convert count value in histogram metrics to rate",
[]telegraf.Metric{
testutil.MustMetric(
"histogram_metric",
map[string]string{
"metric_type": "histogram",
},
map[string]interface{}{
"count": 1,
"lower": float64(10),
"mean": float64(10),
"median": float64(10),
"stddev": float64(0),
"sum": float64(10),
"upper": float64(10),
},
time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC),
telegraf.Untyped,
),
},
[]*Metric{
{
Metric: "histogram_metric.count",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
1,
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.lower",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.mean",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.median",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.stddev",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(0),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.sum",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
{
Metric: "histogram_metric.upper",
Points: [1]Point{
{
float64(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC).Unix()),
float64(10),
},
},
Type: "",
Tags: []string{
"metric_type:histogram",
},
Interval: 1,
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actualMetricsOut := d.convertToDatadogMetric(tt.metricsIn)
require.ElementsMatch(t, tt.metricsOut, actualMetricsOut)
})
}
}

@ -18,3 +18,10 @@
## Override the default (none) compression used to send data.
## Supports: "zlib", "none"
# compression = "none"
## When non-zero, converts count metrics submitted by inputs.statsd
## into rates, dividing the metric value by this interval in seconds.
## Note that in order for metrics to be submitted simultaneously alongside
## a Datadog agent, rate_interval has to match the interval used by the
## agent, which defaults to 10s.
# rate_interval = 0s