fix: directory monitor input plugin when data format is CSV and csv_skip_rows>0 and csv_header_row_count>=1 (#9865)

This commit is contained in:
Ehsan 2021-11-17 08:05:48 +10:00 committed by GitHub
parent b9e4978b17
commit db86904759
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 501 additions and 94 deletions

View File

@ -261,15 +261,12 @@ func (monitor *DirectoryMonitor) ingestFile(filePath string) error {
}
func (monitor *DirectoryMonitor) parseFile(parser parsers.Parser, reader io.Reader, fileName string) error {
// Read the file line-by-line and parse with the configured parse method.
firstLine := true
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
metrics, err := monitor.parseLine(parser, scanner.Bytes(), firstLine)
metrics, err := monitor.parseLine(parser, scanner.Bytes())
if err != nil {
return err
}
firstLine = false
if monitor.FileTag != "" {
for _, m := range metrics {
@ -285,24 +282,17 @@ func (monitor *DirectoryMonitor) parseFile(parser parsers.Parser, reader io.Read
return nil
}
func (monitor *DirectoryMonitor) parseLine(parser parsers.Parser, line []byte, firstLine bool) ([]telegraf.Metric, error) {
func (monitor *DirectoryMonitor) parseLine(parser parsers.Parser, line []byte) ([]telegraf.Metric, error) {
switch parser.(type) {
case *csv.Parser:
// The CSV parser parses headers in Parse and skips them in ParseLine.
if firstLine {
return parser.Parse(line)
}
m, err := parser.ParseLine(string(line))
m, err := parser.Parse(line)
if err != nil {
if errors.Is(err, io.EOF) {
return nil, nil
}
return nil, err
}
if m != nil {
return []telegraf.Metric{m}, nil
}
return []telegraf.Metric{}, nil
return m, err
default:
return parser.Parse(line)
}

View File

@ -3,12 +3,11 @@ package directory_monitor
import (
"bytes"
"compress/gzip"
"github.com/stretchr/testify/require"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/influxdata/telegraf/testutil"
)
@ -193,3 +192,224 @@ func TestFileTag(t *testing.T) {
}
}
}
// TestCSVNoSkipRows verifies the baseline case for the directory monitor's
// CSV handling: a single header row, csv_skip_rows = 0, and one data row
// must yield exactly one metric with the configured tag column applied.
func TestCSVNoSkipRows(t *testing.T) {
	acc := testutil.Accumulator{}
	testCsvFile := "test.csv"

	// Establish process directory and finished directory.
	finishedDirectory, err := os.MkdirTemp("", "finished")
	require.NoError(t, err)
	processDirectory, err := os.MkdirTemp("", "test")
	require.NoError(t, err)
	defer os.RemoveAll(processDirectory)
	defer os.RemoveAll(finishedDirectory)

	// Init plugin.
	r := DirectoryMonitor{
		Directory:          processDirectory,
		FinishedDirectory:  finishedDirectory,
		MaxBufferedMetrics: 1000,
		FileQueueSize:      100000,
	}
	err = r.Init()
	require.NoError(t, err)

	parserConfig := parsers.Config{
		DataFormat:        "csv",
		CSVHeaderRowCount: 1,
		CSVSkipRows:       0,
		CSVTagColumns:     []string{"line1"},
	}
	require.NoError(t, err)
	r.SetParserFunc(func() (parsers.Parser, error) {
		return parsers.NewParser(&parserConfig)
	})
	r.Log = testutil.Logger{}

	// One header row ("line1,line2,line3") followed by one data row.
	testCSV := `line1,line2,line3
hello,80,test_name2`

	// "line1" becomes a tag (see CSVTagColumns), so only line2/line3 are fields.
	expectedFields := map[string]interface{}{
		"line2": int64(80),
		"line3": "test_name2",
	}

	// Write csv file to process into the 'process' directory.
	f, err := os.Create(filepath.Join(processDirectory, testCsvFile))
	require.NoError(t, err)
	_, err = f.WriteString(testCSV)
	require.NoError(t, err)
	err = f.Close()
	require.NoError(t, err)

	// Start plugin before adding file.
	err = r.Start(&acc)
	require.NoError(t, err)
	err = r.Gather(&acc)
	require.NoError(t, err)
	acc.Wait(1)
	r.Stop()

	// Verify that exactly one metric was produced from the single data row.
	require.Equal(t, len(acc.Metrics), 1)

	// File should have been moved to the finished directory, as we configured.
	_, err = os.Stat(filepath.Join(finishedDirectory, testCsvFile))
	require.NoError(t, err)

	for _, m := range acc.Metrics {
		for key, value := range m.Tags {
			require.Equal(t, "line1", key)
			require.Equal(t, "hello", value)
		}
		require.Equal(t, expectedFields, m.Fields)
	}
}
// TestCSVSkipRows covers the regression from #9865: with csv_skip_rows > 0
// AND csv_header_row_count >= 1, the directory monitor must skip the leading
// garbage rows once, then read the header, then parse the data row — yielding
// exactly one metric.
func TestCSVSkipRows(t *testing.T) {
	acc := testutil.Accumulator{}
	testCsvFile := "test.csv"

	// Establish process directory and finished directory.
	finishedDirectory, err := os.MkdirTemp("", "finished")
	require.NoError(t, err)
	processDirectory, err := os.MkdirTemp("", "test")
	require.NoError(t, err)
	defer os.RemoveAll(processDirectory)
	defer os.RemoveAll(finishedDirectory)

	// Init plugin.
	r := DirectoryMonitor{
		Directory:          processDirectory,
		FinishedDirectory:  finishedDirectory,
		MaxBufferedMetrics: 1000,
		FileQueueSize:      100000,
	}
	err = r.Init()
	require.NoError(t, err)

	parserConfig := parsers.Config{
		DataFormat:        "csv",
		CSVHeaderRowCount: 1,
		CSVSkipRows:       2,
		CSVTagColumns:     []string{"line1"},
	}
	require.NoError(t, err)
	r.SetParserFunc(func() (parsers.Parser, error) {
		return parsers.NewParser(&parserConfig)
	})
	r.Log = testutil.Logger{}

	// Two garbage rows to skip, then the header, then one data row.
	testCSV := `garbage nonsense 1
garbage,nonsense,2
line1,line2,line3
hello,80,test_name2`

	// "line1" becomes a tag (see CSVTagColumns), so only line2/line3 are fields.
	expectedFields := map[string]interface{}{
		"line2": int64(80),
		"line3": "test_name2",
	}

	// Write csv file to process into the 'process' directory.
	f, err := os.Create(filepath.Join(processDirectory, testCsvFile))
	require.NoError(t, err)
	_, err = f.WriteString(testCSV)
	require.NoError(t, err)
	err = f.Close()
	require.NoError(t, err)

	// Start plugin before adding file.
	err = r.Start(&acc)
	require.NoError(t, err)
	err = r.Gather(&acc)
	require.NoError(t, err)
	acc.Wait(1)
	r.Stop()

	// Verify that exactly one metric was produced from the single data row.
	require.Equal(t, len(acc.Metrics), 1)

	// File should have been moved to the finished directory, as we configured.
	_, err = os.Stat(filepath.Join(finishedDirectory, testCsvFile))
	require.NoError(t, err)

	for _, m := range acc.Metrics {
		for key, value := range m.Tags {
			require.Equal(t, "line1", key)
			require.Equal(t, "hello", value)
		}
		require.Equal(t, expectedFields, m.Fields)
	}
}
// TestCSVMultiHeader verifies that with csv_header_row_count = 2 the two
// header rows are read once and concatenated per column ("line"+"1" ->
// "line1", etc.) before the data row is parsed into a single metric.
func TestCSVMultiHeader(t *testing.T) {
	acc := testutil.Accumulator{}
	testCsvFile := "test.csv"

	// Establish process directory and finished directory.
	finishedDirectory, err := os.MkdirTemp("", "finished")
	require.NoError(t, err)
	processDirectory, err := os.MkdirTemp("", "test")
	require.NoError(t, err)
	defer os.RemoveAll(processDirectory)
	defer os.RemoveAll(finishedDirectory)

	// Init plugin.
	r := DirectoryMonitor{
		Directory:          processDirectory,
		FinishedDirectory:  finishedDirectory,
		MaxBufferedMetrics: 1000,
		FileQueueSize:      100000,
	}
	err = r.Init()
	require.NoError(t, err)

	parserConfig := parsers.Config{
		DataFormat:        "csv",
		CSVHeaderRowCount: 2,
		CSVTagColumns:     []string{"line1"},
	}
	require.NoError(t, err)
	r.SetParserFunc(func() (parsers.Parser, error) {
		return parsers.NewParser(&parserConfig)
	})
	r.Log = testutil.Logger{}

	// Two header rows concatenate column-wise: "line"+"1", "line"+"2", "line"+"3".
	testCSV := `line,line,line
1,2,3
hello,80,test_name2`

	// "line1" becomes a tag (see CSVTagColumns), so only line2/line3 are fields.
	expectedFields := map[string]interface{}{
		"line2": int64(80),
		"line3": "test_name2",
	}

	// Write csv file to process into the 'process' directory.
	f, err := os.Create(filepath.Join(processDirectory, testCsvFile))
	require.NoError(t, err)
	_, err = f.WriteString(testCSV)
	require.NoError(t, err)
	err = f.Close()
	require.NoError(t, err)

	// Start plugin before adding file.
	err = r.Start(&acc)
	require.NoError(t, err)
	err = r.Gather(&acc)
	require.NoError(t, err)
	acc.Wait(1)
	r.Stop()

	// Verify that exactly one metric was produced from the single data row.
	require.Equal(t, len(acc.Metrics), 1)

	// File should have been moved to the finished directory, as we configured.
	_, err = os.Stat(filepath.Join(finishedDirectory, testCsvFile))
	require.NoError(t, err)

	for _, m := range acc.Metrics {
		for key, value := range m.Tags {
			require.Equal(t, "line1", key)
			require.Equal(t, "hello", value)
		}
		require.Equal(t, expectedFields, m.Fields)
	}
}

View File

@ -288,25 +288,17 @@ func (t *Tail) tailNewFiles(fromBeginning bool) error {
}
// ParseLine parses a line of text.
func parseLine(parser parsers.Parser, line string, firstLine bool) ([]telegraf.Metric, error) {
func parseLine(parser parsers.Parser, line string) ([]telegraf.Metric, error) {
switch parser.(type) {
case *csv.Parser:
// The csv parser parses headers in Parse and skips them in ParseLine.
// As a temporary solution call Parse only when getting the first
// line from the file.
if firstLine {
return parser.Parse([]byte(line))
}
m, err := parser.ParseLine(line)
m, err := parser.Parse([]byte(line))
if err != nil {
if errors.Is(err, io.EOF) {
return nil, nil
}
return nil, err
}
if m != nil {
return []telegraf.Metric{m}, nil
}
return []telegraf.Metric{}, nil
return m, err
default:
return parser.Parse([]byte(line))
}
@ -315,8 +307,6 @@ func parseLine(parser parsers.Parser, line string, firstLine bool) ([]telegraf.M
// Receiver is launched as a goroutine to continuously watch a tailed logfile
// for changes, parse any incoming msgs, and add to the accumulator.
func (t *Tail) receiver(parser parsers.Parser, tailer *tail.Tail) {
var firstLine = true
// holds the individual lines of multi-line log entries.
var buffer bytes.Buffer
@ -378,13 +368,12 @@ func (t *Tail) receiver(parser parsers.Parser, tailer *tail.Tail) {
continue
}
metrics, err := parseLine(parser, text, firstLine)
metrics, err := parseLine(parser, text)
if err != nil {
t.Log.Errorf("Malformed log line in %q: [%q]: %s",
tailer.Filename, text, err.Error())
continue
}
firstLine = false
if t.PathTag != "" {
for _, metric := range metrics {

View File

@ -342,6 +342,67 @@ cpu,42
testutil.RequireMetricsEqual(t, expected, acc.GetTelegrafMetrics())
}
// TestCSVMultiHeaderWithSkipRowANDColumn verifies the tail plugin's CSV
// handling when skip_rows, a multi-row header, and skip_columns are combined:
// the first row is discarded, the next two rows are concatenated into column
// names, the first column is dropped, and each remaining data row yields one
// metric.
func TestCSVMultiHeaderWithSkipRowANDColumn(t *testing.T) {
	tmpfile, err := os.CreateTemp("", "")
	require.NoError(t, err)
	defer os.Remove(tmpfile.Name())

	// Row 1 is skipped; rows 2-3 form the header ("skip"+"row",
	// "measurement"+"1", "value"+"2"); rows 4-5 are data.
	_, err = tmpfile.WriteString(`garbage nonsense
skip,measurement,value
row,1,2
skip1,cpu,42
skip2,mem,100
`)
	require.NoError(t, err)
	require.NoError(t, tmpfile.Close())

	plugin := NewTestTail()
	plugin.Log = testutil.Logger{}
	plugin.FromBeginning = true
	plugin.Files = []string{tmpfile.Name()}
	plugin.SetParserFunc(func() (parsers.Parser, error) {
		return csv.NewParser(&csv.Config{
			// Concatenated header "measurement"+"1" names the measurement column.
			MeasurementColumn: "measurement1",
			HeaderRowCount:    2,
			SkipRows:          1,
			SkipColumns:       1,
			TimeFunc:          func() time.Time { return time.Unix(0, 0) },
		})
	})

	err = plugin.Init()
	require.NoError(t, err)

	acc := testutil.Accumulator{}
	err = plugin.Start(&acc)
	require.NoError(t, err)
	defer plugin.Stop()
	err = plugin.Gather(&acc)
	require.NoError(t, err)
	acc.Wait(2)
	plugin.Stop()

	// Two data rows -> two metrics; the first column is skipped, the
	// measurement column is consumed, leaving only the "value2" field.
	expected := []telegraf.Metric{
		testutil.MustMetric("cpu",
			map[string]string{
				"path": tmpfile.Name(),
			},
			map[string]interface{}{
				"value2": 42,
			},
			time.Unix(0, 0)),
		testutil.MustMetric("mem",
			map[string]string{
				"path": tmpfile.Name(),
			},
			map[string]interface{}{
				"value2": 100,
			},
			time.Unix(0, 0)),
	}
	testutil.RequireMetricsEqual(t, expected, acc.GetTelegrafMetrics())
}
// Ensure that the first line can produce multiple metrics (#6138)
func TestMultipleMetricsOnFirstLine(t *testing.T) {
tmpfile, err := os.CreateTemp("", "")

View File

@ -96,48 +96,68 @@ func (p *Parser) compile(r io.Reader) *csv.Reader {
func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
r := bytes.NewReader(buf)
return parseCSV(p, r)
}
// ParseLine does not use any information in header and assumes DataColumns is set
// it will also not skip any rows
func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
r := bytes.NewReader([]byte(line))
metrics, err := parseCSV(p, r)
if err != nil {
return nil, err
}
if len(metrics) == 1 {
return metrics[0], nil
}
if len(metrics) > 1 {
return nil, fmt.Errorf("expected 1 metric found %d", len(metrics))
}
return nil, nil
}
func parseCSV(p *Parser, r io.Reader) ([]telegraf.Metric, error) {
csvReader := p.compile(r)
// skip first rows
for i := 0; i < p.SkipRows; i++ {
for p.SkipRows > 0 {
_, err := csvReader.Read()
if err != nil {
return nil, err
}
p.SkipRows--
}
// if there is a header and we did not get DataColumns
// if there is a header, and we did not get DataColumns
// set DataColumns to names extracted from the header
// we always reread the header to avoid side effects
// in cases where multiple files with different
// headers are read
for p.HeaderRowCount > 0 {
header, err := csvReader.Read()
if err != nil {
return nil, err
}
p.HeaderRowCount--
if p.gotColumnNames {
// Ignore header lines if columns are named
continue
}
//concatenate header names
for i, h := range header {
name := h
if p.TrimSpace {
name = strings.Trim(name, " ")
}
if len(p.ColumnNames) <= i {
p.ColumnNames = append(p.ColumnNames, name)
} else {
p.ColumnNames[i] = p.ColumnNames[i] + name
}
}
}
if !p.gotColumnNames {
headerNames := make([]string, 0)
for i := 0; i < p.HeaderRowCount; i++ {
header, err := csvReader.Read()
if err != nil {
return nil, err
}
//concatenate header names
for i := range header {
name := header[i]
if p.TrimSpace {
name = strings.Trim(name, " ")
}
if len(headerNames) <= i {
headerNames = append(headerNames, name)
} else {
headerNames[i] = headerNames[i] + name
}
}
}
p.ColumnNames = headerNames[p.SkipColumns:]
} else {
// if columns are named, just skip header rows
for i := 0; i < p.HeaderRowCount; i++ {
_, err := csvReader.Read()
if err != nil {
return nil, err
}
}
// skip first rows
p.ColumnNames = p.ColumnNames[p.SkipColumns:]
p.gotColumnNames = true
}
table, err := csvReader.ReadAll()
@ -156,27 +176,6 @@ func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
return metrics, nil
}
// ParseLine does not use any information in header and assumes DataColumns is set
// it will also not skip any rows
func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
r := bytes.NewReader([]byte(line))
csvReader := p.compile(r)
// if there is nothing in DataColumns, ParseLine will fail
if len(p.ColumnNames) == 0 {
return nil, fmt.Errorf("[parsers.csv] data columns must be specified")
}
record, err := csvReader.Read()
if err != nil {
return nil, err
}
m, err := p.parseRecord(record)
if err != nil {
return nil, err
}
return m, nil
}
func (p *Parser) parseRecord(record []string) (telegraf.Metric, error) {
recordFields := make(map[string]interface{})
tags := make(map[string]string)
@ -289,7 +288,7 @@ outer:
// will be the current timestamp, else it will try to parse the time according
// to the format.
func parseTimestamp(timeFunc func() time.Time, recordFields map[string]interface{},
timestampColumn, timestampFormat string, Timezone string,
timestampColumn, timestampFormat string, timezone string,
) (time.Time, error) {
if timestampColumn != "" {
if recordFields[timestampColumn] == nil {
@ -300,7 +299,7 @@ func parseTimestamp(timeFunc func() time.Time, recordFields map[string]interface
case "":
return time.Time{}, fmt.Errorf("timestamp format must be specified")
default:
metricTime, err := internal.ParseTimestamp(timestampFormat, recordFields[timestampColumn], Timezone)
metricTime, err := internal.ParseTimestamp(timestampFormat, recordFields[timestampColumn], timezone)
if err != nil {
return time.Time{}, err
}

View File

@ -2,6 +2,7 @@ package csv
import (
"fmt"
"io"
"testing"
"time"
@ -59,9 +60,33 @@ func TestHeaderOverride(t *testing.T) {
require.NoError(t, err)
testCSV := `line1,line2,line3
3.4,70,test_name`
expectedFields := map[string]interface{}{
"first": 3.4,
"second": int64(70),
}
metrics, err := p.Parse([]byte(testCSV))
require.NoError(t, err)
require.Equal(t, "test_name", metrics[0].Name())
require.Equal(t, expectedFields, metrics[0].Fields())
testCSVRows := []string{"line1,line2,line3\r\n", "3.4,70,test_name\r\n"}
p, err = NewParser(
&Config{
HeaderRowCount: 1,
ColumnNames: []string{"first", "second", "third"},
MeasurementColumn: "third",
TimeFunc: DefaultTime,
},
)
require.NoError(t, err)
metrics, err = p.Parse([]byte(testCSVRows[0]))
require.NoError(t, err)
require.Equal(t, []telegraf.Metric{}, metrics)
m, err := p.ParseLine(testCSVRows[1])
require.NoError(t, err)
require.Equal(t, "test_name", m.Name())
require.Equal(t, expectedFields, m.Fields())
}
func TestTimestamp(t *testing.T) {
@ -293,6 +318,22 @@ func TestTrimSpace(t *testing.T) {
metrics, err := p.Parse([]byte(testCSV))
require.NoError(t, err)
require.Equal(t, expectedFields, metrics[0].Fields())
p, err = NewParser(
&Config{
HeaderRowCount: 2,
TrimSpace: true,
TimeFunc: DefaultTime,
},
)
require.NoError(t, err)
testCSV = " col , col ,col\n" +
" 1 , 2 ,3\n" +
" test space , 80 ,test_name"
metrics, err = p.Parse([]byte(testCSV))
require.NoError(t, err)
require.Equal(t, map[string]interface{}{"col1": "test space", "col2": int64(80), "col3": "test_name"}, metrics[0].Fields())
}
func TestTrimSpaceDelimitedBySpace(t *testing.T) {
@ -332,6 +373,7 @@ func TestSkipRows(t *testing.T) {
TimeFunc: DefaultTime,
},
)
require.NoError(t, err)
testCSV := `garbage nonsense
line1,line2,line3
hello,80,test_name2`
@ -339,10 +381,39 @@ hello,80,test_name2`
expectedFields := map[string]interface{}{
"line2": int64(80),
}
expectedTags := map[string]string{
"line1": "hello",
}
metrics, err := p.Parse([]byte(testCSV))
require.NoError(t, err)
require.Equal(t, "test_name2", metrics[0].Name())
require.Equal(t, expectedFields, metrics[0].Fields())
require.Equal(t, expectedTags, metrics[0].Tags())
p, err = NewParser(
&Config{
HeaderRowCount: 1,
SkipRows: 1,
TagColumns: []string{"line1"},
MeasurementColumn: "line3",
TimeFunc: DefaultTime,
},
)
require.NoError(t, err)
testCSVRows := []string{"garbage nonsense\r\n", "line1,line2,line3\r\n", "hello,80,test_name2\r\n"}
metrics, err = p.Parse([]byte(testCSVRows[0]))
require.Error(t, io.EOF, err)
require.Error(t, err)
require.Nil(t, metrics)
m, err := p.ParseLine(testCSVRows[1])
require.NoError(t, err)
require.Nil(t, m)
m, err = p.ParseLine(testCSVRows[2])
require.NoError(t, err)
require.Equal(t, "test_name2", m.Name())
require.Equal(t, expectedFields, m.Fields())
require.Equal(t, expectedTags, m.Tags())
}
func TestSkipColumns(t *testing.T) {
@ -375,8 +446,8 @@ func TestSkipColumnsWithHeader(t *testing.T) {
)
require.NoError(t, err)
testCSV := `col,col,col
1,2,3
trash,80,test_name`
1,2,3
trash,80,test_name`
// we should expect an error if we try to get col1
metrics, err := p.Parse([]byte(testCSV))
@ -384,6 +455,44 @@ func TestSkipColumnsWithHeader(t *testing.T) {
require.Equal(t, map[string]interface{}{"col2": int64(80), "col3": "test_name"}, metrics[0].Fields())
}
// TestMultiHeader verifies that two header rows are concatenated per column
// ("col"+"1" -> "col1", "col"+"2" -> "col2"), both when the whole input goes
// through a single Parse call and when it is fed line-by-line via
// Parse/ParseLine as line-oriented plugins do.
func TestMultiHeader(t *testing.T) {
	p, err := NewParser(
		&Config{
			HeaderRowCount: 2,
			TimeFunc:       DefaultTime,
		},
	)
	require.NoError(t, err)

	testCSV := `col,col
1,2
80,test_name`

	metrics, err := p.Parse([]byte(testCSV))
	require.NoError(t, err)
	require.Equal(t, map[string]interface{}{"col1": int64(80), "col2": "test_name"}, metrics[0].Fields())

	// Same input split into individual lines; a fresh parser keeps header
	// state independent of the run above.
	testCSVRows := []string{"col,col\r\n", "1,2\r\n", "80,test_name\r\n"}

	p, err = NewParser(
		&Config{
			HeaderRowCount: 2,
			TimeFunc:       DefaultTime,
		},
	)
	require.NoError(t, err)

	// First line is only the first header row, so parsing cannot complete yet.
	metrics, err = p.Parse([]byte(testCSVRows[0]))
	// NOTE(review): require.Error's second parameter is a failure message, not
	// an expected error — this line does not actually compare err to io.EOF;
	// confirm intent.
	require.Error(t, io.EOF, err)
	require.Error(t, err)
	require.Nil(t, metrics)

	// Second header row completes the header; no metric is produced.
	m, err := p.ParseLine(testCSVRows[1])
	require.NoError(t, err)
	require.Nil(t, m)

	// Data row produces the metric with concatenated column names.
	m, err = p.ParseLine(testCSVRows[2])
	require.NoError(t, err)
	require.Equal(t, map[string]interface{}{"col1": int64(80), "col2": "test_name"}, m.Fields())
}
func TestParseStream(t *testing.T) {
p, err := NewParser(
&Config{
@ -400,7 +509,8 @@ func TestParseStream(t *testing.T) {
metrics, err := p.Parse([]byte(csvHeader))
require.NoError(t, err)
require.Len(t, metrics, 0)
metric, err := p.ParseLine(csvBody)
m, err := p.ParseLine(csvBody)
require.NoError(t, err)
testutil.RequireMetricEqual(t,
testutil.MustMetric(
"csv",
@ -411,7 +521,45 @@ func TestParseStream(t *testing.T) {
"c": int64(3),
},
DefaultTime(),
), metric)
), m)
}
// TestParseLineMultiMetricErrorMessage verifies ParseLine's single-metric
// contract: one data row parses to one metric, while a multi-row input makes
// ParseLine fail ("expected 1 metric found 2") even though Parse accepts the
// same input and returns both metrics.
func TestParseLineMultiMetricErrorMessage(t *testing.T) {
	p, err := NewParser(
		&Config{
			MetricName:     "csv",
			HeaderRowCount: 1,
			TimeFunc:       DefaultTime,
		},
	)
	require.NoError(t, err)

	csvHeader := "a,b,c"
	csvOneRow := "1,2,3"
	csvTwoRows := "4,5,6\n7,8,9"

	// Header line alone yields no metrics.
	metrics, err := p.Parse([]byte(csvHeader))
	require.NoError(t, err)
	require.Len(t, metrics, 0)

	// A single data row through ParseLine yields exactly one metric.
	m, err := p.ParseLine(csvOneRow)
	require.NoError(t, err)
	testutil.RequireMetricEqual(t,
		testutil.MustMetric(
			"csv",
			map[string]string{},
			map[string]interface{}{
				"a": int64(1),
				"b": int64(2),
				"c": int64(3),
			},
			DefaultTime(),
		), m)

	// Two rows through ParseLine must error and return no metric.
	// NOTE(review): require.Errorf's second parameter is a failure message,
	// not an expected error string — this does not assert the error text;
	// confirm intent.
	m, err = p.ParseLine(csvTwoRows)
	require.Errorf(t, err, "expected 1 metric found 2")
	require.Nil(t, m)

	// Parse, by contrast, accepts multiple rows and returns both metrics.
	metrics, err = p.Parse([]byte(csvTwoRows))
	require.NoError(t, err)
	require.Len(t, metrics, 2)
}
func TestTimestampUnixFloatPrecision(t *testing.T) {