feat(inputs.tail): Allow handling of quoted strings spanning multiple lines (#11762)

This commit is contained in:
Sven Rebhan 2022-11-15 18:58:15 +01:00 committed by GitHub
parent cdc622e9db
commit 9acbf23ebb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 303 additions and 14 deletions

View File

@ -95,6 +95,15 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## If true, a message not matching the pattern will constitute a match of the multiline filter and the action selected by match_which_line will be applied (and vice versa).
#invert_match = false
## The handling method for quoted text (defaults to 'ignore').
## The following methods are available:
## ignore -- do not consider quotation (default)
## single-quotes -- consider text quoted by single quotes (')
## double-quotes -- consider text quoted by double quotes (")
## backticks -- consider text quoted by backticks (`)
## When handling quotes, escaped quotes (e.g. \") are handled correctly.
#quotation = "ignore"
#After the specified timeout, this plugin sends the multiline event even if no new pattern is found to start a new event. The default is 5s.
#timeout = 5s
```
@ -103,3 +112,7 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
Metrics are produced according to the `data_format` option. Additionally a
tag labeled `path` is added to the metric containing the filename being tailed.
## Example Output
There is no predefined metric format, so output depends on plugin input.

View File

@ -2,6 +2,7 @@ package tail
import (
"bytes"
"errors"
"fmt"
"regexp"
"strings"
@ -17,13 +18,16 @@ type Multiline struct {
config *MultilineConfig
enabled bool
patternRegexp *regexp.Regexp
quote byte
inQuote bool
}
type MultilineConfig struct {
Pattern string
Pattern string `toml:"pattern"`
MatchWhichLine MultilineMatchWhichLine `toml:"match_which_line"`
InvertMatch bool
Timeout *config.Duration
InvertMatch bool `toml:"invert_match"`
Quotation string `toml:"quotation"`
Timeout *config.Duration `toml:"timeout"`
}
const (
@ -34,25 +38,41 @@ const (
)
// NewMultiline builds a Multiline from the configuration. It compiles Pattern
// when one is set, translates Quotation into the quote byte to track and fills
// in a 5s Timeout when none (or a zero one) is configured. An error is returned
// for a pattern that does not compile and for an unknown 'quotation' value.
//
// NOTE(review): the statements below appear to mix two revisions of this
// function (e.g. two `enabled` declarations and two timeout-default blocks) --
// confirm against the repository before relying on the exact statement order.
func (m *MultilineConfig) NewMultiline() (*Multiline, error) {
enabled := false
var r *regexp.Regexp
var err error
if m.Pattern != "" {
enabled = true
var err error
if r, err = regexp.Compile(m.Pattern); err != nil {
return nil, err
}
if m.Timeout == nil || time.Duration(*m.Timeout).Nanoseconds() == int64(0) {
d := config.Duration(5 * time.Second)
m.Timeout = &d
}
}
// Map the configured quotation method to the byte to look for. quote keeps
// its zero value when quotation is ignored.
var quote byte
switch m.Quotation {
case "", "ignore":
// Normalize the unset value so the setting always reads "ignore" afterwards.
m.Quotation = "ignore"
case "single-quotes":
quote = '\''
case "double-quotes":
quote = '"'
case "backticks":
quote = '`'
default:
return nil, errors.New("invalid 'quotation' setting")
}
// Processing is active as soon as a pattern or a quotation method is set.
enabled := m.Pattern != "" || quote != 0
// A missing or zero timeout falls back to 5 seconds.
if m.Timeout == nil || time.Duration(*m.Timeout).Nanoseconds() == int64(0) {
d := config.Duration(5 * time.Second)
m.Timeout = &d
}
return &Multiline{
config: m,
enabled: enabled,
patternRegexp: r}, nil
patternRegexp: r,
quote: quote,
}, nil
}
func (m *Multiline) IsEnabled() bool {
@ -60,10 +80,14 @@ func (m *Multiline) IsEnabled() bool {
}
func (m *Multiline) ProcessLine(text string, buffer *bytes.Buffer) string {
if m.matchQuotation(text) {
// Ignore the returned error as we cannot do anything about it anyway
_, _ = buffer.WriteString(text + "\n")
return ""
}
if m.matchString(text) {
// Ignore the returned error as we cannot do anything about it anyway
//nolint:errcheck,revive
buffer.WriteString(text)
_, _ = buffer.WriteString(text)
return ""
}
@ -97,8 +121,39 @@ func (m *Multiline) Flush(buffer *bytes.Buffer) string {
return text
}
// matchQuotation updates the quote-tracking state with one more line of input
// and reports whether a quotation is still open once that line is consumed.
// It always returns false when the quotation setting is "ignore".
func (m *Multiline) matchQuotation(text string) bool {
	if m.config.Quotation == "ignore" {
		return false
	}

	backslashes := 0 // length of the backslash run directly before the current byte
	unescaped := 0   // quote characters that are not escaped
	for _, c := range []byte(text) {
		switch {
		case c == '\\':
			backslashes++
			continue
		case c == m.quote && backslashes%2 == 0:
			// An even run of backslashes (including none) stands for literal
			// backslashes only, so the quote itself is not escaped. An odd
			// run ends in a backslash escaping the quote, which is skipped.
			unescaped++
		}
		// Any byte other than a backslash ends the current run.
		backslashes = 0
	}

	// Only an odd number of unescaped quotes changes whether we are inside a
	// quotation; an even number opens and closes in pairs.
	if unescaped%2 != 0 {
		m.inQuote = !m.inQuote
	}
	return m.inQuote
}
// matchString reports whether text counts as a match of the configured
// pattern, with the result flipped when invert_match is set. Without a
// compiled pattern nothing can match and false is returned.
func (m *Multiline) matchString(text string) bool {
	// No pattern is compiled when only quotation handling is configured, so
	// the regexp must not be called in that case.
	if m.patternRegexp == nil {
		return false
	}
	return m.patternRegexp.MatchString(text) != m.config.InvertMatch
}
func (w MultilineMatchWhichLine) String() string {

View File

@ -1,7 +1,11 @@
package tail
import (
"bufio"
"bytes"
"fmt"
"os"
"path/filepath"
"testing"
"time"
@ -234,3 +238,169 @@ func TestMultilineWhat(t *testing.T) {
require.Error(t, w7.UnmarshalTOML([]byte(`nope`)))
require.Equal(t, MultilineMatchWhichLine(-1), w7)
}
// TestMultiLineQuoted feeds the quoted CSV fixtures line by line through
// ProcessLine and checks that lines belonging to one quoted text are joined
// into a single event for each supported quote character.
func TestMultiLineQuoted(t *testing.T) {
	tests := []struct {
		name      string
		quotation string
		quote     string
		filename  string
	}{
		{
			name:      "single-quotes",
			quotation: "single-quotes",
			quote:     `'`,
			filename:  "multiline_quoted_single.csv",
		},
		{
			name:      "double-quotes",
			quotation: "double-quotes",
			quote:     `"`,
			filename:  "multiline_quoted_double.csv",
		},
		{
			name:      "backticks",
			quotation: "backticks",
			quote:     "`",
			filename:  "multiline_quoted_backticks.csv",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// The fixtures only differ in the quote character, so the
			// expectation is derived from it.
			expected := []string{
				`1660819827410,1,some text without quotes,A`,
				fmt.Sprintf("1660819827411,1,%ssome text all quoted%s,A", tt.quote, tt.quote),
				fmt.Sprintf("1660819827412,1,%ssome text all quoted\nbut wrapped%s,A", tt.quote, tt.quote),
				fmt.Sprintf("1660819827420,2,some text with %squotes%s,B", tt.quote, tt.quote),
				"1660819827430,3,some text with 'multiple \"quotes\" in `one` line',C",
				fmt.Sprintf("1660819827440,4,some multiline text with %squotes\n", tt.quote) +
					fmt.Sprintf("spanning \\%smultiple\\%s\n", tt.quote, tt.quote) +
					fmt.Sprintf("lines%s but do not %send\ndirectly%s,D", tt.quote, tt.quote, tt.quote),
				fmt.Sprintf("1660819827450,5,all of %sthis%s should %sbasically%s work...,E", tt.quote, tt.quote, tt.quote, tt.quote),
			}

			c := &MultilineConfig{
				MatchWhichLine: Next,
				Quotation:      tt.quotation,
			}
			m, err := c.NewMultiline()
			require.NoError(t, err)

			f, err := os.Open(filepath.Join("testdata", tt.filename))
			require.NoError(t, err)
			// Release the fixture once the subtest is done.
			defer f.Close()

			scanner := bufio.NewScanner(f)

			var buffer bytes.Buffer
			var result []string
			for scanner.Scan() {
				line := scanner.Text()

				// An empty return value means the line was buffered.
				text := m.ProcessLine(line, &buffer)
				if text == "" {
					continue
				}
				result = append(result, text)
			}
			// Scan returns false on read errors as well as at the end of the
			// input, so make sure the whole fixture was consumed.
			require.NoError(t, scanner.Err())
			require.EqualValues(t, expected, result)
		})
	}
}
// TestMultiLineQuotedError checks how input with unbalanced quotes is
// grouped when fed line by line through ProcessLine.
func TestMultiLineQuotedError(t *testing.T) {
	tests := []struct {
		name      string
		filename  string
		quotation string
		expected  []string
	}{
		{
			// The second and third fixture lines each contain a single
			// quote, so they end up joined into one event.
			name:      "messed up quoting",
			filename:  "multiline_quoted_messed_up.csv",
			quotation: "single-quotes",
			expected: []string{
				"1660819827410,1,some text without quotes,A",
				"1660819827411,1,'some text all quoted,A\n1660819827412,1,'some text all quoted",
				"but wrapped,A"},
		},
		{
			// The quote opened in the first line is never closed, so no
			// event is emitted by ProcessLine.
			name:      "missing closing quote",
			filename:  "multiline_quoted_missing_close.csv",
			quotation: "single-quotes",
			expected:  nil,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			c := &MultilineConfig{
				MatchWhichLine: Next,
				Quotation:      tt.quotation,
			}
			m, err := c.NewMultiline()
			require.NoError(t, err)

			f, err := os.Open(filepath.Join("testdata", tt.filename))
			require.NoError(t, err)
			// Release the fixture once the subtest is done.
			defer f.Close()

			scanner := bufio.NewScanner(f)

			var buffer bytes.Buffer
			var result []string
			for scanner.Scan() {
				line := scanner.Text()

				// An empty return value means the line was buffered.
				text := m.ProcessLine(line, &buffer)
				if text == "" {
					continue
				}
				result = append(result, text)
			}
			// Scan returns false on read errors as well as at the end of the
			// input, so make sure the whole fixture was consumed.
			require.NoError(t, scanner.Err())
			require.EqualValues(t, tt.expected, result)
		})
	}
}
// TestMultiLineQuotedAndPattern combines a continuation pattern with
// double-quote handling and checks that both hold lines back until a line
// arrives that neither matches the pattern nor leaves a quotation open.
func TestMultiLineQuotedAndPattern(t *testing.T) {
	cfg := &MultilineConfig{
		Pattern:        "=>$",
		MatchWhichLine: Next,
		Quotation:      "double-quotes",
	}
	m, err := cfg.NewMultiline()
	require.NoError(t, err, "Configuration was OK.")

	var buffer bytes.Buffer

	// Each of these lines must be buffered instead of emitted.
	held := []string{
		"1=>",
		"2=>",
		`"a quoted`,
		`multiline string"=>`,
		"3=>",
	}
	for _, line := range held {
		out := m.ProcessLine(line, &buffer)
		require.Empty(t, out)
		require.NotZero(t, buffer.Len())
	}

	// The first line that is not held back releases everything buffered so far.
	out := m.ProcessLine("4", &buffer)
	require.Equal(t, "1=>2=>\"a quoted\nmultiline string\"=>3=>4", out)
	require.Zero(t, buffer.Len())

	// With an empty buffer such a line passes through unchanged.
	out = m.ProcessLine("5", &buffer)
	require.Equal(t, "5", out)
	require.Zero(t, buffer.Len())
}

View File

@ -62,5 +62,14 @@
## If true, a message not matching the pattern will constitute a match of the multiline filter and the action selected by match_which_line will be applied (and vice versa).
#invert_match = false
## The handling method for quoted text (defaults to 'ignore').
## The following methods are available:
## ignore -- do not consider quotation (default)
## single-quotes -- consider text quoted by single quotes (')
## double-quotes -- consider text quoted by double quotes (")
## backticks -- consider text quoted by backticks (`)
## When handling quotes, escaped quotes (e.g. \") are handled correctly.
#quotation = "ignore"
#After the specified timeout, this plugin sends the multiline event even if no new pattern is found to start a new event. The default is 5s.
#timeout = 5s

View File

@ -0,0 +1,12 @@
1660819827410,1,some text without quotes,A
1660819827411,1,`some text all quoted`,A
1660819827412,1,`some text all quoted
but wrapped`,A
1660819827420,2,some text with `quotes`,B
1660819827430,3,some text with 'multiple "quotes" in `one` line',C
1660819827440,4,some multiline text with `quotes
spanning \`multiple\`
lines` but do not `end
directly`,D
1660819827450,5,all of `this` should `basically` work...,E
Can't render this file because it contains an unexpected character in line 6 and column 42.

View File

@ -0,0 +1,12 @@
1660819827410,1,some text without quotes,A
1660819827411,1,"some text all quoted",A
1660819827412,1,"some text all quoted
but wrapped",A
1660819827420,2,some text with "quotes",B
1660819827430,3,some text with 'multiple "quotes" in `one` line',C
1660819827440,4,some multiline text with "quotes
spanning \"multiple\"
lines" but do not "end
directly",D
1660819827450,5,all of "this" should "basically" work...,E
Can't render this file because it contains an unexpected character in line 5 and column 32.

View File

@ -0,0 +1,4 @@
1660819827410,1,some text without quotes,A
1660819827411,1,'some text all quoted,A
1660819827412,1,'some text all quoted
but wrapped,A
1 1660819827410,1,some text without quotes,A
2 1660819827411,1,'some text all quoted,A
3 1660819827412,1,'some text all quoted
4 but wrapped,A

View File

@ -0,0 +1,2 @@
1660819827411,2,'some text all quoted,B
1660819827410,1,some text without quotes,A
1 1660819827411 2 'some text all quoted B
2 1660819827410 1 some text without quotes A

View File

@ -0,0 +1,12 @@
1660819827410,1,some text without quotes,A
1660819827411,1,'some text all quoted',A
1660819827412,1,'some text all quoted
but wrapped',A
1660819827420,2,some text with 'quotes',B
1660819827430,3,some text with 'multiple "quotes" in `one` line',C
1660819827440,4,some multiline text with 'quotes
spanning \'multiple\'
lines' but do not 'end
directly',D
1660819827450,5,all of 'this' should 'basically' work...,E
Can't render this file because it contains an unexpected character in line 6 and column 42.