feat(parsers.csv): support null delimiters (#12247)

2022-12-12 17:55:47 +01:00 · 2022-12-12 17:55:47 +01:00 · e264721cb9
parent 35b5476c83
commit e264721cb9
3 changed files with 47 additions and 2 deletions
--- a/plugins/parsers/csv/README.md
+++ b/plugins/parsers/csv/README.md
@ -59,6 +59,10 @@ values.

  ## The separator between csv fields
  ## By default, the parser assumes a comma (",")
+  ## Please note that if you use an invalid delimiter (e.g. "\u0000"), commas
+  ## in the input are changed to "\ufffd" and the invalid delimiter is changed
+  ## to a comma during parsing; afterwards, the invalid delimiter and the
+  ## commas are returned to their original values.
  csv_delimiter = ","

  ## The character reserved for marking a row as a comment row
--- a/plugins/parsers/csv/parser.go
+++ b/plugins/parsers/csv/parser.go
@ -11,6 +11,7 @@ import (
 	"strconv"
 	"strings"
 	"time"
+	"unicode/utf8"

 	_ "time/tzdata" // needed to bundle timezone info into the binary for Windows

@ -23,6 +24,9 @@ import (

 type TimeFunc func() time.Time

+const replacementByte = "\ufffd"
+const commaByte = "\u002C"
+
 type Parser struct {
 	ColumnNames        []string        `toml:"csv_column_names"`
 	ColumnTypes        []string        `toml:"csv_column_types"`
@ -51,6 +55,8 @@ type Parser struct {

 	gotColumnNames bool

+	invalidDelimiter bool
+
 	TimeFunc     func() time.Time
 	DefaultTags  map[string]string
 	metadataTags map[string]string
@ -141,6 +147,7 @@ func (p *Parser) Init() error {
 		if len(runeStr) > 1 {
 			return fmt.Errorf("csv_delimiter must be a single character, got: %s", p.Delimiter)
 		}
+		p.invalidDelimiter = !validDelim(runeStr[0])
 	}

 	if p.Comment != "" {
@ -182,9 +189,13 @@ func (p *Parser) compile(r io.Reader) *csv.Reader {
 	csvReader := csv.NewReader(r)
 	// ensures that the reader reads records of different lengths without an error
 	csvReader.FieldsPerRecord = -1
-	if p.Delimiter != "" {
+	if !p.invalidDelimiter && p.Delimiter != "" {
 		csvReader.Comma = []rune(p.Delimiter)[0]
 	}
+	// Check if delimiter is invalid
+	if p.invalidDelimiter && p.Delimiter != "" {
+		csvReader.Comma = []rune(commaByte)[0]
+	}
 	if p.Comment != "" {
 		csvReader.Comment = []rune(p.Comment)[0]
 	}
@ -192,12 +203,23 @@ func (p *Parser) compile(r io.Reader) *csv.Reader {
 	return csvReader
 }

+// Taken from the upstream Go standard library (encoding/csv); see
+// https://github.com/golang/go/blob/release-branch.go1.19/src/encoding/csv/reader.go#L95
+func validDelim(r rune) bool {
+	return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
+}
+
 func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
 	// Reset the parser according to the specified mode
 	if p.ResetMode == "always" {
 		p.Reset()
 	}
-
+	// If using an invalid delimiter, replace commas with replacement and
+	// invalid delimiter with commas
+	if p.invalidDelimiter {
+		buf = bytes.Replace(buf, []byte(commaByte), []byte(replacementByte), -1)
+		buf = bytes.Replace(buf, []byte(p.Delimiter), []byte(commaByte), -1)
+	}
 	r := bytes.NewReader(buf)
 	metrics, err := parseCSV(p, r)
 	if err != nil && errors.Is(err, io.EOF) {
--- a/plugins/parsers/csv/parser_test.go
+++ b/plugins/parsers/csv/parser_test.go
@ -2,6 +2,7 @@ package csv

 import (
 	"fmt"
+	"strings"
 	"testing"
 	"time"

@ -221,6 +222,24 @@ func TestDelimiter(t *testing.T) {
 	require.Equal(t, "3,4", metrics[0].Fields()["first"])
 }

+func TestNullDelimiter(t *testing.T) {
+	p := &Parser{
+		HeaderRowCount: 0,
+		Delimiter:      "\u0000",
+		ColumnNames:    []string{"first", "second", "third"},
+		TimeFunc:       DefaultTime,
+	}
+	err := p.Init()
+	require.NoError(t, err)
+
+	testCSV := strings.Join([]string{"3.4", "70", "test_name"}, "\u0000")
+	metrics, err := p.Parse([]byte(testCSV))
+	require.NoError(t, err)
+	require.Equal(t, float64(3.4), metrics[0].Fields()["first"])
+	require.Equal(t, int64(70), metrics[0].Fields()["second"])
+	require.Equal(t, "test_name", metrics[0].Fields()["third"])
+}
+
 func TestValueConversion(t *testing.T) {
 	p := &Parser{
 		HeaderRowCount: 0,