[outputs.influxdb_v2] add exponential backoff, and respect client error responses (#8662)
* [outputs.influxdb_v2] add exponential backoff, and respect client error responses * add test * Update to 60 seconds * fix test
This commit is contained in:
parent
cbe99ef596
commit
9c7cf99fa7
|
|
@ -9,6 +9,7 @@ import (
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
|
"math"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
|
@ -36,7 +37,7 @@ func (e APIError) Error() string {
|
||||||
|
|
||||||
const (
|
const (
|
||||||
defaultRequestTimeout = time.Second * 5
|
defaultRequestTimeout = time.Second * 5
|
||||||
defaultMaxWait = 10 // seconds
|
defaultMaxWait = 60 // seconds
|
||||||
defaultDatabase = "telegraf"
|
defaultDatabase = "telegraf"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -70,6 +71,7 @@ type httpClient struct {
|
||||||
serializer *influx.Serializer
|
serializer *influx.Serializer
|
||||||
url *url.URL
|
url *url.URL
|
||||||
retryTime time.Time
|
retryTime time.Time
|
||||||
|
retryCount int
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewHTTPClient(config *HTTPConfig) (*httpClient, error) {
|
func NewHTTPClient(config *HTTPConfig) (*httpClient, error) {
|
||||||
|
|
@ -233,7 +235,18 @@ func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []te
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
if resp.StatusCode == http.StatusNoContent {
|
switch resp.StatusCode {
|
||||||
|
case
|
||||||
|
// this is the expected response:
|
||||||
|
http.StatusNoContent,
|
||||||
|
// but if we get these we should still accept it as delivered:
|
||||||
|
http.StatusOK,
|
||||||
|
http.StatusCreated,
|
||||||
|
http.StatusAccepted,
|
||||||
|
http.StatusPartialContent,
|
||||||
|
http.StatusMultiStatus,
|
||||||
|
http.StatusAlreadyReported:
|
||||||
|
c.retryCount = 0
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -245,33 +258,37 @@ func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []te
|
||||||
}
|
}
|
||||||
|
|
||||||
switch resp.StatusCode {
|
switch resp.StatusCode {
|
||||||
case http.StatusBadRequest, http.StatusRequestEntityTooLarge:
|
case
|
||||||
log.Printf("E! [outputs.influxdb_v2] Failed to write metric: %s\n", desc)
|
// request was malformed:
|
||||||
|
http.StatusBadRequest,
|
||||||
|
// request was too large:
|
||||||
|
http.StatusRequestEntityTooLarge,
|
||||||
|
// request was received but server refused to process it due to a semantic problem with the request.
|
||||||
|
// for example, submitting metrics outside the retention period.
|
||||||
|
// Clients should *not* repeat the request and the metrics should be dropped.
|
||||||
|
http.StatusUnprocessableEntity,
|
||||||
|
http.StatusNotAcceptable:
|
||||||
|
log.Printf("E! [outputs.influxdb_v2] Failed to write metric (will be dropped: %s): %s\n", resp.Status, desc)
|
||||||
return nil
|
return nil
|
||||||
case http.StatusUnauthorized, http.StatusForbidden:
|
case http.StatusUnauthorized, http.StatusForbidden:
|
||||||
return fmt.Errorf("failed to write metric: %s", desc)
|
return fmt.Errorf("failed to write metric (%s): %s", resp.Status, desc)
|
||||||
case http.StatusTooManyRequests:
|
case http.StatusTooManyRequests,
|
||||||
retryAfter := resp.Header.Get("Retry-After")
|
http.StatusServiceUnavailable,
|
||||||
retry, err := strconv.Atoi(retryAfter)
|
http.StatusBadGateway,
|
||||||
if err != nil {
|
http.StatusGatewayTimeout:
|
||||||
return errors.New("rate limit exceeded")
|
// ^ these handle the cases where the server is likely overloaded, and may not be able to say so.
|
||||||
}
|
c.retryCount++
|
||||||
if retry > defaultMaxWait {
|
retryDuration := c.getRetryDuration(resp.Header)
|
||||||
retry = defaultMaxWait
|
c.retryTime = time.Now().Add(retryDuration)
|
||||||
}
|
log.Printf("W! [outputs.influxdb_v2] Failed to write; will retry in %s. (%s)\n", retryDuration, resp.Status)
|
||||||
c.retryTime = time.Now().Add(time.Duration(retry) * time.Second)
|
return fmt.Errorf("waiting %s for server before sending metric again", retryDuration)
|
||||||
return fmt.Errorf("waiting %ds for server before sending metric again", retry)
|
}
|
||||||
case http.StatusServiceUnavailable:
|
|
||||||
retryAfter := resp.Header.Get("Retry-After")
|
// if it's any other 4xx code, the client should not retry as it's the client's mistake.
|
||||||
retry, err := strconv.Atoi(retryAfter)
|
// retrying will not make the request magically work.
|
||||||
if err != nil {
|
if len(resp.Status) > 0 && resp.Status[0] == '4' {
|
||||||
return errors.New("server responded: service unavailable")
|
log.Printf("E! [outputs.influxdb_v2] Failed to write metric (will be dropped: %s): %s\n", resp.Status, desc)
|
||||||
}
|
return nil
|
||||||
if retry > defaultMaxWait {
|
|
||||||
retry = defaultMaxWait
|
|
||||||
}
|
|
||||||
c.retryTime = time.Now().Add(time.Duration(retry) * time.Second)
|
|
||||||
return fmt.Errorf("waiting %ds for server before sending metric again", retry)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is only until platform spec is fully implemented. As of the
|
// This is only until platform spec is fully implemented. As of the
|
||||||
|
|
@ -287,6 +304,29 @@ func (c *httpClient) writeBatch(ctx context.Context, bucket string, metrics []te
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// retryDuration takes the longer of the Retry-After header and our own back-off calculation
|
||||||
|
func (c *httpClient) getRetryDuration(headers http.Header) time.Duration {
|
||||||
|
// basic exponential backoff (x^2)/40 (denominator to widen the slope)
|
||||||
|
// at 40 denominator, it'll take 35 retries to hit the max defaultMaxWait of 30s
|
||||||
|
backoff := math.Pow(float64(c.retryCount), 2) / 40
|
||||||
|
|
||||||
|
// get any value from the header, if available
|
||||||
|
retryAfterHeader := float64(0)
|
||||||
|
retryAfterHeaderString := headers.Get("Retry-After")
|
||||||
|
if len(retryAfterHeaderString) > 0 {
|
||||||
|
var err error
|
||||||
|
retryAfterHeader, err = strconv.ParseFloat(retryAfterHeaderString, 64)
|
||||||
|
if err != nil {
|
||||||
|
// there was a value but we couldn't parse it? guess minimum 10 sec
|
||||||
|
retryAfterHeader = 10
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// take the highest value from both, but not over the max wait.
|
||||||
|
retry := math.Max(backoff, retryAfterHeader)
|
||||||
|
retry = math.Min(retry, defaultMaxWait)
|
||||||
|
return time.Duration(retry) * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
func (c *httpClient) makeWriteRequest(url string, body io.Reader) (*http.Request, error) {
|
func (c *httpClient) makeWriteRequest(url string, body io.Reader) (*http.Request, error) {
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,11 @@
|
||||||
package influxdb_v2
|
package influxdb_v2
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
@ -45,3 +48,27 @@ func TestMakeWriteURL(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestExponentialBackoffCalculation(t *testing.T) {
|
||||||
|
c := &httpClient{}
|
||||||
|
tests := []struct {
|
||||||
|
retryCount int
|
||||||
|
expected time.Duration
|
||||||
|
}{
|
||||||
|
{retryCount: 0, expected: 0},
|
||||||
|
{retryCount: 1, expected: 0},
|
||||||
|
{retryCount: 5, expected: 0},
|
||||||
|
{retryCount: 10, expected: 2 * time.Second},
|
||||||
|
{retryCount: 30, expected: 22 * time.Second},
|
||||||
|
{retryCount: 40, expected: 40 * time.Second},
|
||||||
|
{retryCount: 50, expected: 60 * time.Second},
|
||||||
|
{retryCount: 100, expected: 60 * time.Second},
|
||||||
|
{retryCount: 1000, expected: 60 * time.Second},
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
t.Run(fmt.Sprintf("%d_retries", test.retryCount), func(t *testing.T) {
|
||||||
|
c.retryCount = test.retryCount
|
||||||
|
require.EqualValues(t, test.expected, c.getRetryDuration(http.Header{}))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue