Add UTF-8 sanitizer to Strings plugin (#9118)

Adds a new option for the Strings processor to sanitize strings so that they conform to UTF-8.
This commit is contained in:
Logan 2021-04-29 18:46:36 -06:00 committed by GitHub
parent 4fc849d73f
commit 370836d436
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 130 additions and 0 deletions

View File

@ -14,6 +14,7 @@ Implemented functions are:
- replace
- left
- base64decode
- valid_utf8
Please note that in this implementation these are processed in the order that they appear above.
@ -78,6 +79,12 @@ If you'd like to apply multiple processings to the same `tag_key` or `field_key`
## Decode a base64 encoded utf-8 string
# [[processors.strings.base64decode]]
# field = "message"
## Sanitize a string to ensure it is a valid utf-8 string
## Each run of invalid UTF-8 byte sequences is replaced by the replacement string, which may be empty
# [[processors.strings.valid_utf8]]
# field = "message"
# replacement = ""
```
#### Trim, TrimLeft, TrimRight

View File

@ -22,6 +22,7 @@ type Strings struct {
Replace []converter `toml:"replace"`
Left []converter `toml:"left"`
Base64Decode []converter `toml:"base64decode"`
ValidUTF8 []converter `toml:"valid_utf8"`
converters []converter
init bool
@ -42,6 +43,7 @@ type converter struct {
Old string
New string
Width int
Replacement string
fn ConvertFunc
}
@ -98,6 +100,12 @@ const sampleConfig = `
## Decode a base64 encoded utf-8 string
# [[processors.strings.base64decode]]
# field = "message"
## Sanitize a string to ensure it is a valid utf-8 string
## Each run of invalid UTF-8 byte sequences is replaced by the replacement string, which may be empty
# [[processors.strings.valid_utf8]]
# field = "message"
# replacement = ""
`
func (s *Strings) SampleConfig() string {
@ -318,6 +326,11 @@ func (s *Strings) initOnce() {
}
s.converters = append(s.converters, c)
}
for _, c := range s.ValidUTF8 {
c := c
c.fn = func(s string) string { return strings.ToValidUTF8(s, c.Replacement) }
s.converters = append(s.converters, c)
}
s.init = true
}

View File

@ -1047,3 +1047,113 @@ func TestBase64Decode(t *testing.T) {
})
}
}
// TestValidUTF8 exercises the valid_utf8 converter of the Strings processor.
// It covers three situations: a string that is already valid UTF-8 is left
// alone, an invalid byte is swapped for a non-empty replacement, and an
// invalid byte is dropped when the replacement is empty.
func TestValidUTF8(t *testing.T) {
	// invalid is "howdy" with a single 0xff byte spliced in after "ho";
	// the cases below expect that byte to be replaced or removed.
	invalid := "ho" + string([]byte{0xff}) + "wdy"

	// sanitizer builds a plugin that runs valid_utf8 on the "message" field
	// using the given replacement string.
	sanitizer := func(replacement string) *Strings {
		return &Strings{
			ValidUTF8: []converter{
				{Field: "message", Replacement: replacement},
			},
		}
	}

	// cpuMetric wraps message in a single tagless "cpu" metric stamped at
	// the Unix epoch. Each call yields a fresh metric so that cases never
	// share state.
	cpuMetric := func(message string) []telegraf.Metric {
		return []telegraf.Metric{
			testutil.MustMetric(
				"cpu",
				map[string]string{},
				map[string]interface{}{"message": message},
				time.Unix(0, 0),
			),
		}
	}

	cases := []struct {
		name     string
		plugin   *Strings
		metric   []telegraf.Metric
		expected []telegraf.Metric
	}{
		{
			name:     "valid utf-8 keeps original string",
			plugin:   sanitizer("r"),
			metric:   cpuMetric("howdy"),
			expected: cpuMetric("howdy"),
		},
		{
			name:     "non-valid utf-8 modifies original string",
			plugin:   sanitizer("r"),
			metric:   cpuMetric(invalid),
			expected: cpuMetric("horwdy"),
		},
		{
			name:     "non-valid utf-8 and empty replacement removes invalid characters",
			plugin:   sanitizer(""),
			metric:   cpuMetric(invalid),
			expected: cpuMetric("howdy"),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.plugin.Apply(tc.metric...)
			testutil.RequireMetricsEqual(t, tc.expected, got)
		})
	}
}