diff --git a/plugins/parsers/csv/README.md b/plugins/parsers/csv/README.md index 7d68f6d1c..fa4acba1a 100644 --- a/plugins/parsers/csv/README.md +++ b/plugins/parsers/csv/README.md @@ -59,6 +59,10 @@ values. ## The separator between csv fields ## By default, the parser assumes a comma (",") + ## Please note that an invalid delimiter (e.g. "\u0000") is handled by + ## substitution: during parsing, commas are changed to "\ufffd" and the + ## invalid delimiter is changed to a comma; afterwards, the invalid + ## characters and commas are returned to their original values. csv_delimiter = "," ## The character reserved for marking a row as a comment row diff --git a/plugins/parsers/csv/parser.go b/plugins/parsers/csv/parser.go index 050d47bc2..8e56df88f 100644 --- a/plugins/parsers/csv/parser.go +++ b/plugins/parsers/csv/parser.go @@ -11,6 +11,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" _ "time/tzdata" // needed to bundle timezone info into the binary for Windows @@ -23,6 +24,9 @@ import ( type TimeFunc func() time.Time +const replacementByte = "\ufffd" +const commaByte = "\u002C" + type Parser struct { ColumnNames []string `toml:"csv_column_names"` ColumnTypes []string `toml:"csv_column_types"` @@ -51,6 +55,8 @@ type Parser struct { gotColumnNames bool + invalidDelimiter bool + TimeFunc func() time.Time DefaultTags map[string]string metadataTags map[string]string @@ -141,6 +147,7 @@ func (p *Parser) Init() error { if len(runeStr) > 1 { return fmt.Errorf("csv_delimiter must be a single character, got: %s", p.Delimiter) } + p.invalidDelimiter = !validDelim(runeStr[0]) } if p.Comment != "" { @@ -182,9 +189,13 @@ func (p *Parser) compile(r io.Reader) *csv.Reader { csvReader := csv.NewReader(r) // ensures that the reader reads records of different lengths without an error csvReader.FieldsPerRecord = -1 - if p.Delimiter != "" { + if !p.invalidDelimiter && p.Delimiter != "" { csvReader.Comma = []rune(p.Delimiter)[0] } + // If the delimiter is invalid, have the reader use a comma instead + if p.invalidDelimiter && 
p.Delimiter != "" { + csvReader.Comma = []rune(commaByte)[0] + } if p.Comment != "" { csvReader.Comment = []rune(p.Comment)[0] } @@ -192,12 +203,23 @@ func (p *Parser) compile(r io.Reader) *csv.Reader { return csvReader } +// validDelim is taken from the upstream Go encoding/csv code; see +// https://github.com/golang/go/blob/release-branch.go1.19/src/encoding/csv/reader.go#L95 +func validDelim(r rune) bool { + return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError +} + func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) { // Reset the parser according to the specified mode if p.ResetMode == "always" { p.Reset() } - + // If using an invalid delimiter, replace commas with the replacement + // character and the invalid delimiter with commas + if p.invalidDelimiter { + buf = bytes.Replace(buf, []byte(commaByte), []byte(replacementByte), -1) + buf = bytes.Replace(buf, []byte(p.Delimiter), []byte(commaByte), -1) + } r := bytes.NewReader(buf) metrics, err := parseCSV(p, r) if err != nil && errors.Is(err, io.EOF) { diff --git a/plugins/parsers/csv/parser_test.go b/plugins/parsers/csv/parser_test.go index ad3db905a..d8dbcbe02 100644 --- a/plugins/parsers/csv/parser_test.go +++ b/plugins/parsers/csv/parser_test.go @@ -2,6 +2,7 @@ package csv import ( "fmt" + "strings" "testing" "time" @@ -221,6 +222,24 @@ func TestDelimiter(t *testing.T) { require.Equal(t, "3,4", metrics[0].Fields()["first"]) } +func TestNullDelimiter(t *testing.T) { + p := &Parser{ + HeaderRowCount: 0, + Delimiter: "\u0000", + ColumnNames: []string{"first", "second", "third"}, + TimeFunc: DefaultTime, + } + err := p.Init() + require.NoError(t, err) + + testCSV := strings.Join([]string{"3.4", "70", "test_name"}, "\u0000") + metrics, err := p.Parse([]byte(testCSV)) + require.NoError(t, err) + require.Equal(t, float64(3.4), metrics[0].Fields()["first"]) + require.Equal(t, int64(70), metrics[0].Fields()["second"]) + require.Equal(t, "test_name", metrics[0].Fields()["third"]) +} + func 
TestValueConversion(t *testing.T) { p := &Parser{ HeaderRowCount: 0,