feat(parsers.csv): support null delimiters (#12247)

This commit is contained in:
Meceron 2022-12-12 17:55:47 +01:00 committed by GitHub
parent 35b5476c83
commit e264721cb9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 2 deletions

View File

@ -59,6 +59,10 @@ values.
## The separator between csv fields
## By default, the parser assumes a comma (",")
## Please note that if you use an invalid delimiter (e.g. "\u0000"), commas
## in the input are temporarily replaced with "\ufffd" and the invalid
## delimiter is replaced with a comma for parsing; afterwards, the original
## commas and delimiter characters are restored.
csv_delimiter = ","
## The character reserved for marking a row as a comment row

View File

@ -11,6 +11,7 @@ import (
"strconv"
"strings"
"time"
"unicode/utf8"
_ "time/tzdata" // needed to bundle timezone info into the binary for Windows
@ -23,6 +24,9 @@ import (
type TimeFunc func() time.Time
// Placeholder strings used when the configured delimiter is one that
// validDelim rejects. Despite the "Byte" suffix, both are strings.
const (
	// replacementByte stands in for literal commas in the input while the
	// comma is borrowed as the working delimiter.
	replacementByte = "\ufffd"
	// commaByte is the comma that takes the place of the invalid delimiter.
	commaByte = "\u002C"
)
type Parser struct {
ColumnNames []string `toml:"csv_column_names"`
ColumnTypes []string `toml:"csv_column_types"`
@ -51,6 +55,8 @@ type Parser struct {
gotColumnNames bool
invalidDelimiter bool
TimeFunc func() time.Time
DefaultTags map[string]string
metadataTags map[string]string
@ -141,6 +147,7 @@ func (p *Parser) Init() error {
if len(runeStr) > 1 {
return fmt.Errorf("csv_delimiter must be a single character, got: %s", p.Delimiter)
}
p.invalidDelimiter = !validDelim(runeStr[0])
}
if p.Comment != "" {
@ -182,9 +189,13 @@ func (p *Parser) compile(r io.Reader) *csv.Reader {
csvReader := csv.NewReader(r)
// ensures that the reader reads records of different lengths without an error
csvReader.FieldsPerRecord = -1
if p.Delimiter != "" {
if !p.invalidDelimiter && p.Delimiter != "" {
csvReader.Comma = []rune(p.Delimiter)[0]
}
// Check if delimiter is invalid
if p.invalidDelimiter && p.Delimiter != "" {
csvReader.Comma = []rune(commaByte)[0]
}
if p.Comment != "" {
csvReader.Comment = []rune(p.Comment)[0]
}
@ -192,12 +203,23 @@ func (p *Parser) compile(r io.Reader) *csv.Reader {
return csvReader
}
// validDelim reports whether r is usable as a CSV field delimiter.
// NUL, the double quote, carriage return, line feed, the Unicode replacement
// character and any rune that is not valid UTF-8 are all rejected.
//
// The rule set is copied from the Go standard library, see
// https://github.com/golang/go/blob/release-branch.go1.19/src/encoding/csv/reader.go#L95
func validDelim(r rune) bool {
	switch r {
	case 0, '"', '\r', '\n', utf8.RuneError:
		// Explicitly reserved characters.
		return false
	}
	return utf8.ValidRune(r)
}
func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
// Reset the parser according to the specified mode
if p.ResetMode == "always" {
p.Reset()
}
// If using an invalid delimiter, replace commas with replacement and
// invalid delimiter with commas
if p.invalidDelimiter {
buf = bytes.Replace(buf, []byte(commaByte), []byte(replacementByte), -1)
buf = bytes.Replace(buf, []byte(p.Delimiter), []byte(commaByte), -1)
}
r := bytes.NewReader(buf)
metrics, err := parseCSV(p, r)
if err != nil && errors.Is(err, io.EOF) {

View File

@ -2,6 +2,7 @@ package csv
import (
"fmt"
"strings"
"testing"
"time"
@ -221,6 +222,24 @@ func TestDelimiter(t *testing.T) {
require.Equal(t, "3,4", metrics[0].Fields()["first"])
}
// TestNullDelimiter verifies that a header-less row whose fields are separated
// by NUL ("\u0000") is split into the configured column names and that each
// value is converted to the expected field type.
func TestNullDelimiter(t *testing.T) {
	const nul = "\u0000"

	parser := &Parser{
		HeaderRowCount: 0,
		Delimiter:      nul,
		ColumnNames:    []string{"first", "second", "third"},
		TimeFunc:       DefaultTime,
	}
	require.NoError(t, parser.Init())

	// Build a single record: 3.4<NUL>70<NUL>test_name
	input := strings.Join([]string{"3.4", "70", "test_name"}, nul)
	metrics, err := parser.Parse([]byte(input))
	require.NoError(t, err)

	fields := metrics[0].Fields()
	require.Equal(t, float64(3.4), fields["first"])
	require.Equal(t, int64(70), fields["second"])
	require.Equal(t, "test_name", fields["third"])
}
func TestValueConversion(t *testing.T) {
p := &Parser{
HeaderRowCount: 0,