feat(outputs.influxdb_v2): Add rate limit implementation (#15742)

This commit is contained in:
Sven Rebhan 2024-12-06 17:50:21 +01:00 committed by GitHub
parent be2d5efed1
commit 11709858e3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 245 additions and 55 deletions

View File

@ -1,6 +1,7 @@
package ratelimiter package ratelimiter
import ( import (
"errors"
"time" "time"
"github.com/influxdata/telegraf/config" "github.com/influxdata/telegraf/config"
@ -11,9 +12,12 @@ type RateLimitConfig struct {
Period config.Duration `toml:"rate_limit_period"` Period config.Duration `toml:"rate_limit_period"`
} }
func (cfg *RateLimitConfig) CreateRateLimiter() *RateLimiter { func (cfg *RateLimitConfig) CreateRateLimiter() (*RateLimiter, error) {
if cfg.Limit > 0 && cfg.Period <= 0 {
return nil, errors.New("invalid period for rate-limit")
}
return &RateLimiter{ return &RateLimiter{
limit: int64(cfg.Limit), limit: int64(cfg.Limit),
period: time.Duration(cfg.Period), period: time.Duration(cfg.Period),
} }, nil
} }

View File

@ -9,9 +9,16 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
func TestInvalidPeriod(t *testing.T) {
cfg := &RateLimitConfig{Limit: config.Size(1024)}
_, err := cfg.CreateRateLimiter()
require.ErrorContains(t, err, "invalid period for rate-limit")
}
func TestUnlimited(t *testing.T) { func TestUnlimited(t *testing.T) {
cfg := &RateLimitConfig{} cfg := &RateLimitConfig{}
limiter := cfg.CreateRateLimiter() limiter, err := cfg.CreateRateLimiter()
require.NoError(t, err)
start := time.Now() start := time.Now()
end := start.Add(30 * time.Minute) end := start.Add(30 * time.Minute)
@ -24,7 +31,8 @@ func TestUnlimitedWithPeriod(t *testing.T) {
cfg := &RateLimitConfig{ cfg := &RateLimitConfig{
Period: config.Duration(5 * time.Minute), Period: config.Duration(5 * time.Minute),
} }
limiter := cfg.CreateRateLimiter() limiter, err := cfg.CreateRateLimiter()
require.NoError(t, err)
start := time.Now() start := time.Now()
end := start.Add(30 * time.Minute) end := start.Add(30 * time.Minute)
@ -67,7 +75,8 @@ func TestLimited(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name+" at period", func(t *testing.T) { t.Run(tt.name+" at period", func(t *testing.T) {
// Setup the limiter // Setup the limiter
limiter := tt.cfg.CreateRateLimiter() limiter, err := tt.cfg.CreateRateLimiter()
require.NoError(t, err)
// Compute the actual values // Compute the actual values
start := time.Now().Truncate(tt.step) start := time.Now().Truncate(tt.step)
@ -85,7 +94,8 @@ func TestLimited(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
// Setup the limiter // Setup the limiter
limiter := tt.cfg.CreateRateLimiter() limiter, err := tt.cfg.CreateRateLimiter()
require.NoError(t, err)
// Compute the actual values // Compute the actual values
start := time.Now().Truncate(tt.step).Add(1 * time.Second) start := time.Now().Truncate(tt.step).Add(1 * time.Second)
@ -134,7 +144,8 @@ func TestUndo(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name+" at period", func(t *testing.T) { t.Run(tt.name+" at period", func(t *testing.T) {
// Setup the limiter // Setup the limiter
limiter := tt.cfg.CreateRateLimiter() limiter, err := tt.cfg.CreateRateLimiter()
require.NoError(t, err)
// Compute the actual values // Compute the actual values
start := time.Now().Truncate(tt.step) start := time.Now().Truncate(tt.step)
@ -156,7 +167,8 @@ func TestUndo(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
// Setup the limiter // Setup the limiter
limiter := tt.cfg.CreateRateLimiter() limiter, err := tt.cfg.CreateRateLimiter()
require.NoError(t, err)
// Compute the actual values // Compute the actual values
start := time.Now().Truncate(tt.step).Add(1 * time.Second) start := time.Now().Truncate(tt.step).Add(1 * time.Second)

View File

@ -101,6 +101,12 @@ to use them.
# tls_key = "/etc/telegraf/key.pem" # tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification ## Use TLS but skip chain & host verification
# insecure_skip_verify = false # insecure_skip_verify = false
## Rate limits for sending data (disabled by default)
## Available, uncompressed payload size e.g. "5Mb"
# rate_limit = "unlimited"
## Fixed time-window for the available payload size e.g. "5m"
# rate_limit_period = "0s"
``` ```
## Metrics ## Metrics

View File

@ -22,7 +22,7 @@ import (
"github.com/influxdata/telegraf" "github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config" "github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/serializers/influx" "github.com/influxdata/telegraf/plugins/common/ratelimiter"
) )
type APIError struct { type APIError struct {
@ -59,8 +59,9 @@ type httpClient struct {
pingTimeout config.Duration pingTimeout config.Duration
readIdleTimeout config.Duration readIdleTimeout config.Duration
tlsConfig *tls.Config tlsConfig *tls.Config
serializer *influx.Serializer
encoder internal.ContentEncoder encoder internal.ContentEncoder
serializer ratelimiter.Serializer
rateLimiter *ratelimiter.RateLimiter
client *http.Client client *http.Client
params url.Values params url.Values
retryTime time.Time retryTime time.Time
@ -160,52 +161,69 @@ func (c *httpClient) Write(ctx context.Context, metrics []telegraf.Metric) error
} }
batches := make(map[string][]telegraf.Metric) batches := make(map[string][]telegraf.Metric)
batchIndices := make(map[string][]int)
if c.bucketTag == "" { if c.bucketTag == "" {
err := c.writeBatch(ctx, c.bucket, metrics) batches[c.bucket] = metrics
if err != nil { batchIndices[c.bucket] = make([]int, len(metrics))
var apiErr *APIError for i := range metrics {
if errors.As(err, &apiErr) { batchIndices[c.bucket][i] = i
if apiErr.StatusCode == http.StatusRequestEntityTooLarge {
return c.splitAndWriteBatch(ctx, c.bucket, metrics)
}
}
return err
} }
} else { } else {
for _, metric := range metrics { for i, metric := range metrics {
bucket, ok := metric.GetTag(c.bucketTag) bucket, ok := metric.GetTag(c.bucketTag)
if !ok { if !ok {
bucket = c.bucket bucket = c.bucket
} } else if c.excludeBucketTag {
// Avoid modifying the metric if we do remove the tag
if _, ok := batches[bucket]; !ok {
batches[bucket] = make([]telegraf.Metric, 0)
}
if c.excludeBucketTag {
// Avoid modifying the metric in case we need to retry the request.
metric = metric.Copy() metric = metric.Copy()
metric.Accept() metric.Accept()
metric.RemoveTag(c.bucketTag) metric.RemoveTag(c.bucketTag)
} }
batches[bucket] = append(batches[bucket], metric) batches[bucket] = append(batches[bucket], metric)
batchIndices[bucket] = append(batchIndices[bucket], i)
}
}
var wErr internal.PartialWriteError
for bucket, batch := range batches {
err := c.writeBatch(ctx, bucket, batch)
if err == nil {
wErr.MetricsAccept = append(wErr.MetricsAccept, batchIndices[bucket]...)
continue
} }
for bucket, batch := range batches { // Check if the request was too large and split it
err := c.writeBatch(ctx, bucket, batch) var apiErr *APIError
if err != nil { if errors.As(err, &apiErr) {
var apiErr *APIError if apiErr.StatusCode == http.StatusRequestEntityTooLarge {
if errors.As(err, &apiErr) { return c.splitAndWriteBatch(ctx, c.bucket, metrics)
if apiErr.StatusCode == http.StatusRequestEntityTooLarge {
return c.splitAndWriteBatch(ctx, c.bucket, metrics)
}
}
return err
} }
wErr.Err = err
wErr.MetricsReject = append(wErr.MetricsReject, batchIndices[bucket]...)
return &wErr
} }
// Check if we got a write error and if so, translate the returned
// metric indices to return the original indices in case of bucketing
var writeErr *internal.PartialWriteError
if errors.As(err, &writeErr) {
wErr.Err = writeErr.Err
for _, idx := range writeErr.MetricsAccept {
wErr.MetricsAccept = append(wErr.MetricsAccept, batchIndices[bucket][idx])
}
for _, idx := range writeErr.MetricsReject {
wErr.MetricsReject = append(wErr.MetricsReject, batchIndices[bucket][idx])
}
if !errors.Is(writeErr.Err, internal.ErrSizeLimitReached) {
continue
}
return &wErr
}
// Return the error without special treatment
wErr.Err = err
return &wErr
} }
return nil return nil
} }
@ -222,11 +240,16 @@ func (c *httpClient) splitAndWriteBatch(ctx context.Context, bucket string, metr
} }
func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []telegraf.Metric) error { func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []telegraf.Metric) error {
// Serialize the metrics // Get the current limit for the outbound data
body, err := c.serializer.SerializeBatch(metrics) ratets := time.Now()
if err != nil { limit := c.rateLimiter.Remaining(ratets)
return err
// Serialize the metrics with the remaining limit, exit early if nothing was serialized
body, werr := c.serializer.SerializeBatch(metrics, limit)
if werr != nil && !errors.Is(werr, internal.ErrSizeLimitReached) || len(body) == 0 {
return werr
} }
used := int64(len(body))
// Encode the content if requested // Encode the content if requested
if c.encoder != nil { if c.encoder != nil {
@ -249,6 +272,7 @@ func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []te
c.addHeaders(req) c.addHeaders(req)
// Execute the request // Execute the request
c.rateLimiter.Accept(ratets, used)
resp, err := c.client.Do(req.WithContext(ctx)) resp, err := c.client.Do(req.WithContext(ctx))
if err != nil { if err != nil {
internal.OnClientError(c.client, err) internal.OnClientError(c.client, err)
@ -269,7 +293,7 @@ func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []te
http.StatusMultiStatus, http.StatusMultiStatus,
http.StatusAlreadyReported: http.StatusAlreadyReported:
c.retryCount = 0 c.retryCount = 0
return nil return werr
} }
// We got an error and now try to decode further // We got an error and now try to decode further
@ -294,11 +318,18 @@ func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []te
http.StatusBadRequest, http.StatusBadRequest,
// request was received but server refused to process it due to a semantic problem with the request. // request was received but server refused to process it due to a semantic problem with the request.
// for example, submitting metrics outside the retention period. // for example, submitting metrics outside the retention period.
// Clients should *not* repeat the request and the metrics should be dropped.
http.StatusUnprocessableEntity, http.StatusUnprocessableEntity,
http.StatusNotAcceptable: http.StatusNotAcceptable:
c.log.Errorf("Failed to write metric to %s (will be dropped: %s): %s\n", bucket, resp.Status, desc)
return nil // Clients should *not* repeat the request and the metrics should be dropped.
rejected := make([]int, 0, len(metrics))
for i := range len(metrics) {
rejected = append(rejected, i)
}
return &internal.PartialWriteError{
Err: fmt.Errorf("failed to write metric to %s (will be dropped: %s): %s", bucket, resp.Status, desc),
MetricsReject: rejected,
}
case http.StatusUnauthorized, http.StatusForbidden: case http.StatusUnauthorized, http.StatusForbidden:
return fmt.Errorf("failed to write metric to %s (%s): %s", bucket, resp.Status, desc) return fmt.Errorf("failed to write metric to %s (%s): %s", bucket, resp.Status, desc)
case http.StatusTooManyRequests, case http.StatusTooManyRequests,
@ -316,8 +347,14 @@ func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []te
// if it's any other 4xx code, the client should not retry as it's the client's mistake. // if it's any other 4xx code, the client should not retry as it's the client's mistake.
// retrying will not make the request magically work. // retrying will not make the request magically work.
if len(resp.Status) > 0 && resp.Status[0] == '4' { if len(resp.Status) > 0 && resp.Status[0] == '4' {
c.log.Errorf("Failed to write metric to %s (will be dropped: %s): %s\n", bucket, resp.Status, desc) rejected := make([]int, 0, len(metrics))
return nil for i := range len(metrics) {
rejected = append(rejected, i)
}
return &internal.PartialWriteError{
Err: fmt.Errorf("failed to write metric to %s (will be dropped: %s): %s", bucket, resp.Status, desc),
MetricsReject: rejected,
}
} }
// This is only until platform spec is fully implemented. As of the // This is only until platform spec is fully implemented. As of the

View File

@ -17,6 +17,7 @@ import (
"github.com/influxdata/telegraf" "github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config" "github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/common/ratelimiter"
commontls "github.com/influxdata/telegraf/plugins/common/tls" commontls "github.com/influxdata/telegraf/plugins/common/tls"
"github.com/influxdata/telegraf/plugins/outputs" "github.com/influxdata/telegraf/plugins/outputs"
"github.com/influxdata/telegraf/plugins/serializers/influx" "github.com/influxdata/telegraf/plugins/serializers/influx"
@ -44,10 +45,11 @@ type InfluxDB struct {
ReadIdleTimeout config.Duration `toml:"read_idle_timeout"` ReadIdleTimeout config.Duration `toml:"read_idle_timeout"`
Log telegraf.Logger `toml:"-"` Log telegraf.Logger `toml:"-"`
commontls.ClientConfig commontls.ClientConfig
ratelimiter.RateLimitConfig
clients []*httpClient clients []*httpClient
encoder internal.ContentEncoder encoder internal.ContentEncoder
serializer *influx.Serializer serializer ratelimiter.Serializer
tlsCfg *tls.Config tlsCfg *tls.Config
} }
@ -65,7 +67,7 @@ func (i *InfluxDB) Init() error {
i.URLs = append(i.URLs, "http://localhost:8086") i.URLs = append(i.URLs, "http://localhost:8086")
} }
// Check options // Init encoding if configured
switch i.ContentEncoding { switch i.ContentEncoding {
case "", "gzip": case "", "gzip":
i.ContentEncoding = "gzip" i.ContentEncoding = "gzip"
@ -80,13 +82,14 @@ func (i *InfluxDB) Init() error {
} }
// Setup the limited serializer // Setup the limited serializer
i.serializer = &influx.Serializer{ serializer := &influx.Serializer{
UintSupport: i.UintSupport, UintSupport: i.UintSupport,
OmitTimestamp: i.OmitTimestamp, OmitTimestamp: i.OmitTimestamp,
} }
if err := i.serializer.Init(); err != nil { if err := serializer.Init(); err != nil {
return fmt.Errorf("setting up serializer failed: %w", err) return fmt.Errorf("setting up serializer failed: %w", err)
} }
i.serializer = ratelimiter.NewIndividualSerializer(serializer)
// Setup the client config // Setup the client config
tlsCfg, err := i.ClientConfig.TLSConfig() tlsCfg, err := i.ClientConfig.TLSConfig()
@ -142,6 +145,10 @@ func (i *InfluxDB) Connect() error {
switch parts.Scheme { switch parts.Scheme {
case "http", "https", "unix": case "http", "https", "unix":
limiter, err := i.RateLimitConfig.CreateRateLimiter()
if err != nil {
return err
}
c := &httpClient{ c := &httpClient{
url: parts, url: parts,
localAddr: localAddr, localAddr: localAddr,
@ -158,8 +165,9 @@ func (i *InfluxDB) Connect() error {
tlsConfig: i.tlsCfg, tlsConfig: i.tlsCfg,
pingTimeout: i.PingTimeout, pingTimeout: i.PingTimeout,
readIdleTimeout: i.ReadIdleTimeout, readIdleTimeout: i.ReadIdleTimeout,
serializer: i.serializer,
encoder: i.encoder, encoder: i.encoder,
rateLimiter: limiter,
serializer: i.serializer,
log: i.Log, log: i.Log,
} }
@ -191,6 +199,10 @@ func (i *InfluxDB) Write(metrics []telegraf.Metric) error {
for _, n := range rand.Perm(len(i.clients)) { for _, n := range rand.Perm(len(i.clients)) {
client := i.clients[n] client := i.clients[n]
if err := client.Write(ctx, metrics); err != nil { if err := client.Write(ctx, metrics); err != nil {
var werr *internal.PartialWriteError
if errors.As(err, &werr) || errors.Is(err, internal.ErrSizeLimitReached) {
return err
}
i.Log.Errorf("When writing to [%s]: %v", client.url, err) i.Log.Errorf("When writing to [%s]: %v", client.url, err)
continue continue
} }

View File

@ -7,6 +7,7 @@ import (
"net/http/httptest" "net/http/httptest"
"reflect" "reflect"
"strings" "strings"
"sync/atomic"
"testing" "testing"
"time" "time"
@ -14,7 +15,9 @@ import (
"github.com/influxdata/telegraf" "github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config" "github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/metric" "github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/plugins/common/ratelimiter"
"github.com/influxdata/telegraf/plugins/common/tls" "github.com/influxdata/telegraf/plugins/common/tls"
"github.com/influxdata/telegraf/plugins/outputs" "github.com/influxdata/telegraf/plugins/outputs"
influxdb "github.com/influxdata/telegraf/plugins/outputs/influxdb_v2" influxdb "github.com/influxdata/telegraf/plugins/outputs/influxdb_v2"
@ -373,3 +376,113 @@ func TestTooLargeWriteRetry(t *testing.T) {
} }
require.Error(t, plugin.Write(hugeMetrics)) require.Error(t, plugin.Write(hugeMetrics))
} }
func TestRateLimit(t *testing.T) {
// Setup a test server
var received atomic.Uint64
ts := httptest.NewServer(
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.Path {
case "/api/v2/write":
if err := r.ParseForm(); err != nil {
w.WriteHeader(http.StatusUnprocessableEntity)
return
}
body, err := io.ReadAll(r.Body)
if err != nil {
w.WriteHeader(http.StatusUnprocessableEntity)
return
}
received.Add(uint64(len(body)))
w.WriteHeader(http.StatusNoContent)
return
default:
w.WriteHeader(http.StatusNotFound)
return
}
}),
)
defer ts.Close()
// Setup plugin and connect
plugin := &influxdb.InfluxDB{
URLs: []string{"http://" + ts.Listener.Addr().String()},
Bucket: "telegraf",
ContentEncoding: "identity",
RateLimitConfig: ratelimiter.RateLimitConfig{
Limit: 50,
Period: config.Duration(time.Second),
},
Log: &testutil.Logger{},
}
require.NoError(t, plugin.Init())
require.NoError(t, plugin.Connect())
defer plugin.Close()
// Together the metric batch size is too big, split up, we get success
metrics := []telegraf.Metric{
metric.New(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42.0,
},
time.Unix(0, 1),
),
metric.New(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 99.0,
},
time.Unix(0, 2),
),
metric.New(
"operating_hours",
map[string]string{
"machine": "A",
},
map[string]interface{}{
"value": 123.456,
},
time.Unix(0, 3),
),
metric.New(
"status",
map[string]string{
"machine": "B",
},
map[string]interface{}{
"temp": 48.235,
"remaining": 999.999,
},
time.Unix(0, 4),
),
}
// Write the metrics the first time. Only the first two metrics should be
// received by the server due to the rate limit.
require.ErrorIs(t, plugin.Write(metrics), internal.ErrSizeLimitReached)
require.LessOrEqual(t, received.Load(), uint64(30))
// A direct follow-up write attempt with the remaining metrics should fail
// due to the rate limit being reached
require.ErrorIs(t, plugin.Write(metrics[2:]), internal.ErrSizeLimitReached)
require.LessOrEqual(t, received.Load(), uint64(30))
// Wait for at least the period (plus some safety margin) to write the third metric
time.Sleep(time.Duration(plugin.RateLimitConfig.Period) + 100*time.Millisecond)
require.ErrorIs(t, plugin.Write(metrics[2:]), internal.ErrSizeLimitReached)
require.Greater(t, received.Load(), uint64(30))
require.LessOrEqual(t, received.Load(), uint64(72))
// Wait again for at least the period (plus some safety margin)
// to write the last metric. This should finally succeed as all metrics
// are written.
time.Sleep(time.Duration(plugin.RateLimitConfig.Period) + 100*time.Millisecond)
require.NoError(t, plugin.Write(metrics[3:]))
require.Equal(t, uint64(121), received.Load())
}

View File

@ -71,3 +71,9 @@
# tls_key = "/etc/telegraf/key.pem" # tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification ## Use TLS but skip chain & host verification
# insecure_skip_verify = false # insecure_skip_verify = false
## Rate limits for sending data (disabled by default)
## Available, uncompressed payload size e.g. "5Mb"
# rate_limit = "unlimited"
## Fixed time-window for the available payload size e.g. "5m"
# rate_limit_period = "0s"