feat(inputs.tail): Allow handling of quoted strings spanning multiple lines (#11762)

This commit is contained in:
Sven Rebhan 2022-11-15 18:58:15 +01:00 committed by GitHub
parent cdc622e9db
commit 9acbf23ebb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 303 additions and 14 deletions

View File

@ -95,6 +95,15 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## If true, a message not matching the pattern will constitute a match of the multiline filter and the what will be applied. (vice-versa is also true) ## If true, a message not matching the pattern will constitute a match of the multiline filter and the what will be applied. (vice-versa is also true)
#invert_match = false #invert_match = false
## The handling method for quoted text (defaults to 'ignore').
## The following methods are available:
## ignore -- do not consider quotation (default)
## single-quotes -- consider text quoted by single quotes (')
## double-quotes -- consider text quoted by double quotes (")
## backticks -- consider text quoted by backticks (`)
## When handling quotes, escaped quotes (e.g. \") are handled correctly.
#quotation = "ignore"
#After the specified timeout, this plugin sends the multiline event even if no new pattern is found to start a new event. The default is 5s. #After the specified timeout, this plugin sends the multiline event even if no new pattern is found to start a new event. The default is 5s.
#timeout = 5s #timeout = 5s
``` ```
@ -103,3 +112,7 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
Metrics are produced according to the `data_format` option. Additionally a Metrics are produced according to the `data_format` option. Additionally a
tag labeled `path` is added to the metric containing the filename being tailed. tag labeled `path` is added to the metric containing the filename being tailed.
## Example Output
There is no predefined metric format, so output depends on plugin input.

View File

@ -2,6 +2,7 @@ package tail
import ( import (
"bytes" "bytes"
"errors"
"fmt" "fmt"
"regexp" "regexp"
"strings" "strings"
@ -17,13 +18,16 @@ type Multiline struct {
config *MultilineConfig config *MultilineConfig
enabled bool enabled bool
patternRegexp *regexp.Regexp patternRegexp *regexp.Regexp
quote byte
inQuote bool
} }
type MultilineConfig struct { type MultilineConfig struct {
Pattern string Pattern string `toml:"pattern"`
MatchWhichLine MultilineMatchWhichLine `toml:"match_which_line"` MatchWhichLine MultilineMatchWhichLine `toml:"match_which_line"`
InvertMatch bool InvertMatch bool `toml:"invert_match"`
Timeout *config.Duration Quotation string `toml:"quotation"`
Timeout *config.Duration `toml:"timeout"`
} }
const ( const (
@ -34,25 +38,41 @@ const (
) )
func (m *MultilineConfig) NewMultiline() (*Multiline, error) { func (m *MultilineConfig) NewMultiline() (*Multiline, error) {
enabled := false
var r *regexp.Regexp var r *regexp.Regexp
var err error
if m.Pattern != "" { if m.Pattern != "" {
enabled = true var err error
if r, err = regexp.Compile(m.Pattern); err != nil { if r, err = regexp.Compile(m.Pattern); err != nil {
return nil, err return nil, err
} }
}
var quote byte
switch m.Quotation {
case "", "ignore":
m.Quotation = "ignore"
case "single-quotes":
quote = '\''
case "double-quotes":
quote = '"'
case "backticks":
quote = '`'
default:
return nil, errors.New("invalid 'quotation' setting")
}
enabled := m.Pattern != "" || quote != 0
if m.Timeout == nil || time.Duration(*m.Timeout).Nanoseconds() == int64(0) { if m.Timeout == nil || time.Duration(*m.Timeout).Nanoseconds() == int64(0) {
d := config.Duration(5 * time.Second) d := config.Duration(5 * time.Second)
m.Timeout = &d m.Timeout = &d
} }
}
return &Multiline{ return &Multiline{
config: m, config: m,
enabled: enabled, enabled: enabled,
patternRegexp: r}, nil patternRegexp: r,
quote: quote,
}, nil
} }
func (m *Multiline) IsEnabled() bool { func (m *Multiline) IsEnabled() bool {
@ -60,10 +80,14 @@ func (m *Multiline) IsEnabled() bool {
} }
func (m *Multiline) ProcessLine(text string, buffer *bytes.Buffer) string { func (m *Multiline) ProcessLine(text string, buffer *bytes.Buffer) string {
if m.matchQuotation(text) {
// Ignore the returned error as we cannot do anything about it anyway
_, _ = buffer.WriteString(text + "\n")
return ""
}
if m.matchString(text) { if m.matchString(text) {
// Ignore the returned error as we cannot do anything about it anyway // Ignore the returned error as we cannot do anything about it anyway
//nolint:errcheck,revive _, _ = buffer.WriteString(text)
buffer.WriteString(text)
return "" return ""
} }
@ -97,9 +121,40 @@ func (m *Multiline) Flush(buffer *bytes.Buffer) string {
return text return text
} }
// matchQuotation scans text for unescaped occurrences of the configured quote
// character, updates the quoting state that is carried from line to line and
// reports whether the end of text lies inside a quoted section. It always
// returns false (and leaves the state untouched) when the quotation setting
// is "ignore".
func (m *Multiline) matchQuotation(text string) bool {
	if m.config.Quotation == "ignore" {
		return false
	}

	// pendingEscape is true while an odd number of consecutive backslashes
	// directly precedes the current byte. oddQuotes tracks the parity of the
	// unescaped quote characters seen in this line.
	pendingEscape, oddQuotes := false, false
	for _, c := range []byte(text) {
		switch {
		case c == '\\':
			// Two backslashes in a row form a literal backslash, so every
			// additional backslash flips the escape state.
			pendingEscape = !pendingEscape
		case c == m.quote && !pendingEscape:
			// Only a quote that is not escaped opens or closes a section.
			oddQuotes = !oddQuotes
		default:
			// An escaped quote or any other character ends the backslash run.
			pendingEscape = false
		}
	}

	// An odd number of unescaped quotes toggles the carried-over state, an
	// even number leaves it as it was.
	m.inQuote = m.inQuote != oddQuotes
	return m.inQuote
}
func (m *Multiline) matchString(text string) bool { func (m *Multiline) matchString(text string) bool {
if m.patternRegexp != nil {
return m.patternRegexp.MatchString(text) != m.config.InvertMatch return m.patternRegexp.MatchString(text) != m.config.InvertMatch
} }
return false
}
func (w MultilineMatchWhichLine) String() string { func (w MultilineMatchWhichLine) String() string {
switch w { switch w {

View File

@ -1,7 +1,11 @@
package tail package tail
import ( import (
"bufio"
"bytes" "bytes"
"fmt"
"os"
"path/filepath"
"testing" "testing"
"time" "time"
@ -234,3 +238,169 @@ func TestMultilineWhat(t *testing.T) {
require.Error(t, w7.UnmarshalTOML([]byte(`nope`))) require.Error(t, w7.UnmarshalTOML([]byte(`nope`)))
require.Equal(t, MultilineMatchWhichLine(-1), w7) require.Equal(t, MultilineMatchWhichLine(-1), w7)
} }
// TestMultiLineQuoted feeds the quoted CSV fixtures line by line through
// ProcessLine, once per supported quotation style, and checks that lines
// belonging to an open quoted section are joined into a single record.
func TestMultiLineQuoted(t *testing.T) {
	tests := []struct {
		name      string
		quotation string
		quote     string
		filename  string
	}{
		{
			name:      "single-quotes",
			quotation: "single-quotes",
			quote:     `'`,
			filename:  "multiline_quoted_single.csv",
		},
		{
			name:      "double-quotes",
			quotation: "double-quotes",
			quote:     `"`,
			filename:  "multiline_quoted_double.csv",
		},
		{
			name:      "backticks",
			quotation: "backticks",
			quote:     "`",
			filename:  "multiline_quoted_backticks.csv",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// The fixtures only differ in the quote character in use, so the
			// expectation is built from the per-case quote string.
			expected := []string{
				`1660819827410,1,some text without quotes,A`,
				fmt.Sprintf("1660819827411,1,%ssome text all quoted%s,A", tt.quote, tt.quote),
				fmt.Sprintf("1660819827412,1,%ssome text all quoted\nbut wrapped%s,A", tt.quote, tt.quote),
				fmt.Sprintf("1660819827420,2,some text with %squotes%s,B", tt.quote, tt.quote),
				"1660819827430,3,some text with 'multiple \"quotes\" in `one` line',C",
				fmt.Sprintf("1660819827440,4,some multiline text with %squotes\n", tt.quote) +
					fmt.Sprintf("spanning \\%smultiple\\%s\n", tt.quote, tt.quote) +
					fmt.Sprintf("lines%s but do not %send\ndirectly%s,D", tt.quote, tt.quote, tt.quote),
				fmt.Sprintf("1660819827450,5,all of %sthis%s should %sbasically%s work...,E", tt.quote, tt.quote, tt.quote, tt.quote),
			}

			c := &MultilineConfig{
				MatchWhichLine: Next,
				Quotation:      tt.quotation,
			}
			m, err := c.NewMultiline()
			require.NoError(t, err)

			f, err := os.Open(filepath.Join("testdata", tt.filename))
			require.NoError(t, err)
			// Release the file handle once the subtest is done.
			defer f.Close()

			scanner := bufio.NewScanner(f)

			var buffer bytes.Buffer
			var result []string
			for scanner.Scan() {
				line := scanner.Text()

				// An empty return value means the line was buffered.
				text := m.ProcessLine(line, &buffer)
				if text == "" {
					continue
				}
				result = append(result, text)
			}
			// Make sure the loop ended because of EOF and not a read error,
			// otherwise a truncated result could go unnoticed.
			require.NoError(t, scanner.Err())

			require.EqualValues(t, expected, result)
		})
	}
}
// TestMultiLineQuotedError runs fixtures with broken quoting (unbalanced or
// never-closed quotes) through ProcessLine and pins the records that are
// emitted in those situations.
func TestMultiLineQuotedError(t *testing.T) {
	tests := []struct {
		name      string
		filename  string
		quotation string
		expected  []string
	}{
		{
			name:      "messed up quoting",
			filename:  "multiline_quoted_messed_up.csv",
			quotation: "single-quotes",
			expected: []string{
				"1660819827410,1,some text without quotes,A",
				"1660819827411,1,'some text all quoted,A\n1660819827412,1,'some text all quoted",
				"but wrapped,A"},
		},
		{
			name:      "missing closing quote",
			filename:  "multiline_quoted_missing_close.csv",
			quotation: "single-quotes",
			// The quote opened on the first line is never closed, so every
			// line is buffered and ProcessLine never returns a record.
			expected: nil,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			c := &MultilineConfig{
				MatchWhichLine: Next,
				Quotation:      tt.quotation,
			}
			m, err := c.NewMultiline()
			require.NoError(t, err)

			f, err := os.Open(filepath.Join("testdata", tt.filename))
			require.NoError(t, err)
			// Release the file handle once the subtest is done.
			defer f.Close()

			scanner := bufio.NewScanner(f)

			var buffer bytes.Buffer
			var result []string
			for scanner.Scan() {
				line := scanner.Text()

				// An empty return value means the line was buffered.
				text := m.ProcessLine(line, &buffer)
				if text == "" {
					continue
				}
				result = append(result, text)
			}
			// Make sure the loop ended because of EOF and not a read error,
			// otherwise a truncated result could go unnoticed.
			require.NoError(t, scanner.Err())

			require.EqualValues(t, tt.expected, result)
		})
	}
}
// TestMultiLineQuotedAndPattern combines a continuation pattern ("=>" at the
// end of a line) with double-quote handling and verifies that both mechanisms
// keep lines buffered until a line arrives that satisfies neither.
func TestMultiLineQuotedAndPattern(t *testing.T) {
	cfg := &MultilineConfig{
		Pattern:        "=>$",
		MatchWhichLine: Next,
		Quotation:      "double-quotes",
	}
	m, err := cfg.NewMultiline()
	require.NoError(t, err, "Configuration was OK.")

	var buffer bytes.Buffer

	// Each of these lines either ends with the continuation pattern or
	// leaves a quote open, so nothing is emitted and the buffer keeps growing.
	buffered := []string{
		"1=>",
		"2=>",
		`"a quoted`,
		`multiline string"=>`,
		"3=>",
	}
	for _, line := range buffered {
		out := m.ProcessLine(line, &buffer)
		require.Empty(t, out)
		require.NotZero(t, buffer.Len())
	}

	// These lines match neither rule: the first one flushes everything that
	// was collected so far, the second one passes straight through.
	emitted := []struct {
		line     string
		expected string
	}{
		{line: "4", expected: "1=>2=>\"a quoted\nmultiline string\"=>3=>4"},
		{line: "5", expected: "5"},
	}
	for _, step := range emitted {
		out := m.ProcessLine(step.line, &buffer)
		require.Equal(t, step.expected, out)
		require.Zero(t, buffer.Len())
	}
}

View File

@ -62,5 +62,14 @@
## If true, a message not matching the pattern will constitute a match of the multiline filter and the what will be applied. (vice-versa is also true) ## If true, a message not matching the pattern will constitute a match of the multiline filter and the what will be applied. (vice-versa is also true)
#invert_match = false #invert_match = false
## The handling method for quoted text (defaults to 'ignore').
## The following methods are available:
## ignore -- do not consider quotation (default)
## single-quotes -- consider text quoted by single quotes (')
## double-quotes -- consider text quoted by double quotes (")
## backticks -- consider text quoted by backticks (`)
## When handling quotes, escaped quotes (e.g. \") are handled correctly.
#quotation = "ignore"
#After the specified timeout, this plugin sends the multiline event even if no new pattern is found to start a new event. The default is 5s. #After the specified timeout, this plugin sends the multiline event even if no new pattern is found to start a new event. The default is 5s.
#timeout = 5s #timeout = 5s

View File

@ -0,0 +1,12 @@
1660819827410,1,some text without quotes,A
1660819827411,1,`some text all quoted`,A
1660819827412,1,`some text all quoted
but wrapped`,A
1660819827420,2,some text with `quotes`,B
1660819827430,3,some text with 'multiple "quotes" in `one` line',C
1660819827440,4,some multiline text with `quotes
spanning \`multiple\`
lines` but do not `end
directly`,D
1660819827450,5,all of `this` should `basically` work...,E
Can't render this file because it contains an unexpected character in line 6 and column 42.

View File

@ -0,0 +1,12 @@
1660819827410,1,some text without quotes,A
1660819827411,1,"some text all quoted",A
1660819827412,1,"some text all quoted
but wrapped",A
1660819827420,2,some text with "quotes",B
1660819827430,3,some text with 'multiple "quotes" in `one` line',C
1660819827440,4,some multiline text with "quotes
spanning \"multiple\"
lines" but do not "end
directly",D
1660819827450,5,all of "this" should "basically" work...,E
Can't render this file because it contains an unexpected character in line 5 and column 32.

View File

@ -0,0 +1,4 @@
1660819827410,1,some text without quotes,A
1660819827411,1,'some text all quoted,A
1660819827412,1,'some text all quoted
but wrapped,A
1 1660819827410,1,some text without quotes,A
2 1660819827411,1,'some text all quoted,A
3 1660819827412,1,'some text all quoted
4 but wrapped,A

View File

@ -0,0 +1,2 @@
1660819827411,2,'some text all quoted,B
1660819827410,1,some text without quotes,A
1 1660819827411 2 'some text all quoted B
2 1660819827410 1 some text without quotes A

View File

@ -0,0 +1,12 @@
1660819827410,1,some text without quotes,A
1660819827411,1,'some text all quoted',A
1660819827412,1,'some text all quoted
but wrapped',A
1660819827420,2,some text with 'quotes',B
1660819827430,3,some text with 'multiple "quotes" in `one` line',C
1660819827440,4,some multiline text with 'quotes
spanning \'multiple\'
lines' but do not 'end
directly',D
1660819827450,5,all of 'this' should 'basically' work...,E
Can't render this file because it contains an unexpected character in line 6 and column 42.