Add UTF-8 sanitizer to Strings plugin (#9118)

Adds a new option for the Strings processor to sanitize strings so that they conform to UTF-8.
This commit is contained in:
Logan 2021-04-29 18:46:36 -06:00 committed by GitHub
parent 4fc849d73f
commit 370836d436
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 130 additions and 0 deletions

View File

@ -14,6 +14,7 @@ Implemented functions are:
- replace - replace
- left - left
- base64decode - base64decode
- valid_utf8
Please note that in this implementation these are processed in the order that they appear above. Please note that in this implementation these are processed in the order that they appear above.
@ -78,6 +79,12 @@ If you'd like to apply multiple processings to the same `tag_key` or `field_key`
## Decode a base64 encoded utf-8 string ## Decode a base64 encoded utf-8 string
# [[processors.strings.base64decode]] # [[processors.strings.base64decode]]
# field = "message" # field = "message"
## Sanitize a string to ensure it is a valid utf-8 string
## Each run of invalid UTF-8 byte sequences is replaced by the replacement string, which may be empty
# [[processors.strings.valid_utf8]]
# field = "message"
# replacement = ""
``` ```
#### Trim, TrimLeft, TrimRight #### Trim, TrimLeft, TrimRight

View File

@ -22,6 +22,7 @@ type Strings struct {
Replace []converter `toml:"replace"` Replace []converter `toml:"replace"`
Left []converter `toml:"left"` Left []converter `toml:"left"`
Base64Decode []converter `toml:"base64decode"` Base64Decode []converter `toml:"base64decode"`
ValidUTF8 []converter `toml:"valid_utf8"`
converters []converter converters []converter
init bool init bool
@ -42,6 +43,7 @@ type converter struct {
Old string Old string
New string New string
Width int Width int
Replacement string
fn ConvertFunc fn ConvertFunc
} }
@ -98,6 +100,12 @@ const sampleConfig = `
## Decode a base64 encoded utf-8 string ## Decode a base64 encoded utf-8 string
# [[processors.strings.base64decode]] # [[processors.strings.base64decode]]
# field = "message" # field = "message"
## Sanitize a string to ensure it is a valid utf-8 string
## Each run of invalid UTF-8 byte sequences is replaced by the replacement string, which may be empty
# [[processors.strings.valid_utf8]]
# field = "message"
# replacement = ""
` `
func (s *Strings) SampleConfig() string { func (s *Strings) SampleConfig() string {
@ -318,6 +326,11 @@ func (s *Strings) initOnce() {
} }
s.converters = append(s.converters, c) s.converters = append(s.converters, c)
} }
for _, c := range s.ValidUTF8 {
c := c
c.fn = func(s string) string { return strings.ToValidUTF8(s, c.Replacement) }
s.converters = append(s.converters, c)
}
s.init = true s.init = true
} }

View File

@ -1047,3 +1047,113 @@ func TestBase64Decode(t *testing.T) {
}) })
} }
} }
// TestValidUTF8 exercises the valid_utf8 converter of the Strings processor:
// a valid string must pass through unchanged, while an invalid byte must be
// swapped for the configured replacement (or dropped when it is empty).
func TestValidUTF8(t *testing.T) {
	// A lone 0xff byte is never valid UTF-8, so this input always needs sanitizing.
	invalidInput := "ho" + string([]byte{0xff}) + "wdy"

	// sanitizer returns a plugin with a single valid_utf8 converter on "message".
	sanitizer := func(replacement string) *Strings {
		return &Strings{
			ValidUTF8: []converter{
				{
					Field:       "message",
					Replacement: replacement,
				},
			},
		}
	}

	// cpuMetrics returns a one-element metric slice whose "message" field holds msg.
	cpuMetrics := func(msg string) []telegraf.Metric {
		return []telegraf.Metric{
			testutil.MustMetric(
				"cpu",
				map[string]string{},
				map[string]interface{}{
					"message": msg,
				},
				time.Unix(0, 0),
			),
		}
	}

	tests := []struct {
		name     string
		plugin   *Strings
		metric   []telegraf.Metric
		expected []telegraf.Metric
	}{
		{
			name:     "valid utf-8 keeps original string",
			plugin:   sanitizer("r"),
			metric:   cpuMetrics("howdy"),
			expected: cpuMetrics("howdy"),
		},
		{
			name:     "non-valid utf-8 modifies original string",
			plugin:   sanitizer("r"),
			metric:   cpuMetrics(invalidInput),
			expected: cpuMetrics("horwdy"),
		},
		{
			name:     "non-valid utf-8 and empty replacement removes invalid characters",
			plugin:   sanitizer(""),
			metric:   cpuMetrics(invalidInput),
			expected: cpuMetrics("howdy"),
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			actual := tt.plugin.Apply(tt.metric...)
			testutil.RequireMetricsEqual(t, tt.expected, actual)
		})
	}
}