Add csv parser (#4439)

This commit is contained in:
maxunt
2018-08-24 16:40:41 -07:00
committed by Daniel Nelson
parent 80346b2e93
commit 889745a112
5 changed files with 733 additions and 20 deletions

View File

@@ -0,0 +1,196 @@
package csv
import (
"bytes"
"encoding/csv"
"fmt"
"strconv"
"strings"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
)
// Parser turns CSV input into telegraf metrics. The exported fields are its
// configuration; Parse may additionally fill in ColumnNames from header rows.
type Parser struct {
	// MetricName is the measurement name used when MeasurementColumn does
	// not supply one for a record.
	MetricName string
	// HeaderRowCount is the number of header rows that follow the skipped
	// rows. Parse concatenates them per column to build ColumnNames when
	// ColumnNames is empty, and discards them otherwise.
	HeaderRowCount int
	// SkipRows is the number of leading rows Parse discards before the header.
	SkipRows int
	// SkipColumns is the number of leading columns dropped from the header
	// names and from every record.
	SkipColumns int
	// Delimiter supplies the field separator; only its first rune is used.
	// When empty, the csv.Reader default is kept.
	Delimiter string
	// Comment supplies the comment character; only its first rune is used.
	// When empty, the csv.Reader default is kept.
	Comment string
	// TrimSpace strips leading and trailing space characters (' ') from
	// header names and from record values.
	TrimSpace bool
	// ColumnNames names the columns that remain after SkipColumns, in order.
	ColumnNames []string
	// TagColumns lists the columns whose values are stored as tags instead
	// of fields.
	TagColumns []string
	// MeasurementColumn names the field whose value becomes the metric name.
	MeasurementColumn string
	// TimestampColumn names the field holding the metric timestamp; when
	// empty, the current time is used.
	TimestampColumn string
	// TimestampFormat is the time.Parse layout applied to TimestampColumn.
	TimestampFormat string
	// DefaultTags are added to every metric and override column tags that
	// share a key.
	DefaultTags map[string]string
}
// compile returns a csv.Reader over r configured from the parser settings:
// records of any length are accepted, and the first rune of Delimiter and of
// Comment (when non-empty) replaces the reader's separator and comment
// character. The returned error is always nil.
func (p *Parser) compile(r *bytes.Reader) (*csv.Reader, error) {
	reader := csv.NewReader(r)

	// A negative FieldsPerRecord lets records have differing numbers of
	// fields without the reader reporting an error.
	reader.FieldsPerRecord = -1

	if delim := []rune(p.Delimiter); len(delim) > 0 {
		reader.Comma = delim[0]
	}
	if comment := []rune(p.Comment); len(comment) > 0 {
		reader.Comment = comment[0]
	}

	return reader, nil
}
// Parse reads buf as CSV and returns one metric per data record.
//
// The first SkipRows records are discarded. If ColumnNames is empty, the next
// HeaderRowCount records are read as header rows: the cells of each column
// are concatenated across the header rows to form that column's name, the
// first SkipColumns names are dropped, and the result is stored in
// p.ColumnNames, where later Parse and ParseLine calls will see it. If
// ColumnNames is already set, the HeaderRowCount header records are read and
// discarded instead.
//
// An error is returned when a header row or the data rows cannot be read,
// when SkipColumns is negative or larger than the number of header columns,
// or when a record cannot be converted. In the last case the metrics built
// before the failing record are returned together with the error.
func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
	csvReader, err := p.compile(bytes.NewReader(buf))
	if err != nil {
		return nil, err
	}

	// Skipped rows are thrown away, so neither their contents nor an error
	// from reading them matters here; if the input runs out, the reads below
	// simply find nothing left.
	for i := 0; i < p.SkipRows; i++ {
		_, _ = csvReader.Read()
	}

	if len(p.ColumnNames) == 0 {
		// No names configured: derive them from the header rows.
		var headerNames []string
		for row := 0; row < p.HeaderRowCount; row++ {
			header, err := csvReader.Read()
			if err != nil {
				return nil, err
			}
			// Concatenate this row's cells onto the names collected so far;
			// columns first seen in this row start a new name.
			for col, name := range header {
				if p.TrimSpace {
					name = strings.Trim(name, " ")
				}
				if col < len(headerNames) {
					headerNames[col] += name
				} else {
					headerNames = append(headerNames, name)
				}
			}
		}
		// Slicing past the end of headerNames would panic, so reject a
		// SkipColumns the header cannot satisfy.
		if p.SkipColumns < 0 || p.SkipColumns > len(headerNames) {
			return nil, fmt.Errorf("[parsers.csv] cannot skip %d columns: header defines %d columns", p.SkipColumns, len(headerNames))
		}
		p.ColumnNames = headerNames[p.SkipColumns:]
	} else {
		// Names were configured explicitly, so header rows are only skipped;
		// as with SkipRows, errors on discarded rows are ignored on purpose.
		for i := 0; i < p.HeaderRowCount; i++ {
			_, _ = csvReader.Read()
		}
	}

	table, err := csvReader.ReadAll()
	if err != nil {
		return nil, err
	}

	metrics := make([]telegraf.Metric, 0, len(table))
	for _, record := range table {
		m, err := p.parseRecord(record)
		if err != nil {
			return metrics, err
		}
		metrics = append(metrics, m)
	}
	return metrics, nil
}
// ParseLine parses a single CSV line into one metric. It never reads header
// rows and never skips rows, so ColumnNames must already be populated;
// otherwise an error is returned. Only the first record found in line is
// converted.
func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
	csvReader, err := p.compile(bytes.NewReader([]byte(line)))
	if err != nil {
		return nil, err
	}

	// Without column names there is nothing to map the values onto.
	if len(p.ColumnNames) == 0 {
		return nil, fmt.Errorf("[parsers.csv] data columns must be specified")
	}

	values, err := csvReader.Read()
	if err != nil {
		return nil, err
	}
	return p.parseRecord(values)
}
// parseRecord converts one CSV record into a metric.
//
// The first SkipColumns cells are dropped and the remaining cells are paired
// by position with ColumnNames; names without a matching cell are ignored. A
// cell whose column is listed in TagColumns becomes a tag. Every other cell
// becomes a field, converted to int64, float64 or bool when it parses as one
// (tried in that order) and kept as a string otherwise. DefaultTags are
// applied last and therefore override column tags with the same key.
//
// The metric name is MetricName unless MeasurementColumn names a field of
// this record, in which case that field's value is used. The timestamp is the
// current time unless TimestampColumn is set, in which case that field's
// value is parsed with TimestampFormat. Both lookups consult fields only, so
// a column listed in TagColumns cannot serve as measurement or timestamp.
//
// An error is returned when the record has fewer cells than SkipColumns (or
// SkipColumns is negative), when the timestamp column is missing from the
// fields, when TimestampFormat is empty, when the timestamp does not parse,
// or when the metric cannot be constructed.
func (p *Parser) parseRecord(record []string) (telegraf.Metric, error) {
	// The reader accepts records of any length, so a short record must be
	// rejected here; slicing it unchecked would panic.
	if p.SkipColumns < 0 || p.SkipColumns > len(record) {
		return nil, fmt.Errorf("[parsers.csv] cannot skip %d columns in a record with %d columns", p.SkipColumns, len(record))
	}
	record = record[p.SkipColumns:]

	recordFields := make(map[string]interface{})
	tags := make(map[string]string)

	for i, fieldName := range p.ColumnNames {
		if i >= len(record) {
			// The record has no more cells for the remaining names.
			break
		}
		value := record[i]
		if p.TrimSpace {
			value = strings.Trim(value, " ")
		}

		isTag := false
		for _, tagName := range p.TagColumns {
			if tagName == fieldName {
				isTag = true
				break
			}
		}
		if isTag {
			tags[fieldName] = value
			continue
		}

		// Attempt type conversions, from most to least specific.
		if iValue, err := strconv.ParseInt(value, 10, 64); err == nil {
			recordFields[fieldName] = iValue
		} else if fValue, err := strconv.ParseFloat(value, 64); err == nil {
			recordFields[fieldName] = fValue
		} else if bValue, err := strconv.ParseBool(value); err == nil {
			recordFields[fieldName] = bValue
		} else {
			recordFields[fieldName] = value
		}
	}

	// Default tags go in last so they win over column tags.
	for k, v := range p.DefaultTags {
		tags[k] = v
	}

	// Fall back to the configured metric name when the record does not
	// provide one.
	measurementName := p.MetricName
	if v, ok := recordFields[p.MeasurementColumn]; ok {
		measurementName = fmt.Sprintf("%v", v)
	}

	metricTime := time.Now()
	if p.TimestampColumn != "" {
		rawTime, ok := recordFields[p.TimestampColumn]
		if !ok {
			return nil, fmt.Errorf("timestamp column: %v could not be found", p.TimestampColumn)
		}
		if p.TimestampFormat == "" {
			return nil, fmt.Errorf("timestamp format must be specified")
		}
		// The field may already have been converted to a number above, so
		// it is formatted back to text before parsing.
		parsed, err := time.Parse(p.TimestampFormat, fmt.Sprintf("%v", rawTime))
		if err != nil {
			return nil, err
		}
		metricTime = parsed
	}

	m, err := metric.New(measurementName, tags, recordFields, metricTime)
	if err != nil {
		return nil, err
	}
	return m, nil
}
// SetDefaultTags replaces the parser's default tags with tags. The map is
// stored as given, not copied.
func (p *Parser) SetDefaultTags(tags map[string]string) {
	p.DefaultTags = tags
}

View File

@@ -0,0 +1,231 @@
package csv
import (
"fmt"
"testing"
"time"
"github.com/influxdata/telegraf/metric"
"github.com/stretchr/testify/require"
)
// TestBasicCSV checks that ParseLine accepts a simple line when the column
// names are configured up front and one of the columns is a tag.
func TestBasicCSV(t *testing.T) {
	parser := &Parser{
		ColumnNames: []string{"first", "second", "third"},
		TagColumns:  []string{"third"},
	}

	const line = "1.4,true,hi"
	_, parseErr := parser.ParseLine(line)
	require.NoError(t, parseErr)
}
// TestHeaderConcatenationCSV checks that multiple header rows are merged per
// column. The two header rows give the third column the name "3", which is
// then used as the measurement column.
func TestHeaderConcatenationCSV(t *testing.T) {
	parser := &Parser{
		HeaderRowCount:    2,
		MeasurementColumn: "3",
	}
	input := []byte(`first,second
1,2,3
3.4,70,test_name`)

	got, err := parser.Parse(input)
	require.NoError(t, err)
	require.Equal(t, "test_name", got[0].Name())
}
// TestHeaderOverride checks that explicitly configured column names take
// precedence over the names in the header row, which is only skipped.
func TestHeaderOverride(t *testing.T) {
	parser := &Parser{
		HeaderRowCount:    1,
		ColumnNames:       []string{"first", "second", "third"},
		MeasurementColumn: "third",
	}
	input := []byte(`line1,line2,line3
3.4,70,test_name`)

	got, err := parser.Parse(input)
	require.NoError(t, err)
	require.Equal(t, "test_name", got[0].Name())
}
// TestTimestamp checks that the timestamp column is parsed with the
// configured layout and becomes the metric time. The expected values are the
// Unix nanosecond timestamps of the two input rows.
func TestTimestamp(t *testing.T) {
	p := Parser{
		HeaderRowCount:    1,
		ColumnNames:       []string{"first", "second", "third"},
		MeasurementColumn: "third",
		TimestampColumn:   "first",
		TimestampFormat:   "02/01/06 03:04:05 PM",
	}
	testCSV := `line1,line2,line3
23/05/09 04:05:06 PM,70,test_name
07/11/09 04:05:06 PM,80,test_name2`
	metrics, err := p.Parse([]byte(testCSV))
	require.NoError(t, err)
	// Guard the indexing below so a short result fails the test cleanly
	// instead of panicking.
	require.Len(t, metrics, 2)

	// require.Equal takes (expected, actual); keeping that order makes the
	// failure output label the values correctly.
	require.Equal(t, int64(1243094706000000000), metrics[0].Time().UnixNano())
	require.Equal(t, int64(1257609906000000000), metrics[1].Time().UnixNano())
}
// TestTimestampError checks that Parse fails with a descriptive error when a
// timestamp column is configured without a timestamp format.
func TestTimestampError(t *testing.T) {
	parser := &Parser{
		HeaderRowCount:    1,
		ColumnNames:       []string{"first", "second", "third"},
		MeasurementColumn: "third",
		TimestampColumn:   "first",
	}
	input := []byte(`line1,line2,line3
23/05/09 04:05:06 PM,70,test_name
07/11/09 04:05:06 PM,80,test_name2`)

	wantErr := fmt.Errorf("timestamp format must be specified")
	_, gotErr := parser.Parse(input)
	require.Equal(t, wantErr, gotErr)
}
// TestQuotedCharacter checks that a delimiter inside a quoted cell is kept as
// part of the value instead of splitting the cell.
func TestQuotedCharacter(t *testing.T) {
	parser := &Parser{
		HeaderRowCount:    1,
		ColumnNames:       []string{"first", "second", "third"},
		MeasurementColumn: "third",
	}
	input := []byte(`line1,line2,line3
"3,4",70,test_name`)

	got, err := parser.Parse(input)
	require.NoError(t, err)

	fields := got[0].Fields()
	require.Equal(t, "3,4", fields["first"])
}
// TestDelimiter checks that a custom delimiter is honoured, so commas in the
// input are treated as ordinary characters.
func TestDelimiter(t *testing.T) {
	parser := &Parser{
		HeaderRowCount:    1,
		Delimiter:         "%",
		ColumnNames:       []string{"first", "second", "third"},
		MeasurementColumn: "third",
	}
	input := []byte(`line1%line2%line3
3,4%70%test_name`)

	got, err := parser.Parse(input)
	require.NoError(t, err)

	fields := got[0].Fields()
	require.Equal(t, "3,4", fields["first"])
}
// TestValueConversion checks that cells are converted to float, integer,
// boolean and string fields. Both the expected and the parsed fields are
// passed through metric.New before they are compared.
func TestValueConversion(t *testing.T) {
	parser := &Parser{
		HeaderRowCount: 0,
		Delimiter:      ",",
		ColumnNames:    []string{"first", "second", "third", "fourth"},
		MetricName:     "test_value",
	}
	input := []byte(`3.3,4,true,hello`)

	wantTags := map[string]string{}
	wantFields := map[string]interface{}{
		"first":  3.3,
		"second": 4,
		"third":  true,
		"fourth": "hello",
	}

	got, err := parser.Parse(input)
	require.NoError(t, err)

	epoch := time.Unix(0, 0)
	wantMetric, wantErr := metric.New("test_value", wantTags, wantFields, epoch)
	gotMetric, gotErr := metric.New(got[0].Name(), got[0].Tags(), got[0].Fields(), time.Unix(0, 0))
	require.NoError(t, wantErr)
	require.NoError(t, gotErr)

	// Compare the field sets of the two rebuilt metrics.
	require.Equal(t, wantMetric.Fields(), gotMetric.Fields())
}
// TestSkipComment checks that a line starting with the configured comment
// character is ignored, so the first metric comes from the second line.
func TestSkipComment(t *testing.T) {
	parser := &Parser{
		HeaderRowCount: 0,
		Comment:        "#",
		ColumnNames:    []string{"first", "second", "third", "fourth"},
		MetricName:     "test_value",
	}
	input := []byte(`#3.3,4,true,hello
4,9.9,true,name_this`)

	want := map[string]interface{}{
		"first":  int64(4),
		"second": 9.9,
		"third":  true,
		"fourth": "name_this",
	}

	got, err := parser.Parse(input)
	require.NoError(t, err)
	require.Equal(t, want, got[0].Fields())
}
// TestTrimSpace checks that, with TrimSpace enabled, leading spaces are
// removed before type conversion so padded numbers and booleans still parse.
func TestTrimSpace(t *testing.T) {
	parser := &Parser{
		HeaderRowCount: 0,
		TrimSpace:      true,
		ColumnNames:    []string{"first", "second", "third", "fourth"},
		MetricName:     "test_value",
	}
	input := []byte(` 3.3, 4, true,hello`)

	want := map[string]interface{}{
		"first":  3.3,
		"second": int64(4),
		"third":  true,
		"fourth": "hello",
	}

	got, err := parser.Parse(input)
	require.NoError(t, err)
	require.Equal(t, want, got[0].Fields())
}
// TestSkipRows checks that rows before the header are discarded and that the
// header after them supplies the column names. The tag column is absent from
// the fields, while the measurement column is still present.
func TestSkipRows(t *testing.T) {
	parser := &Parser{
		HeaderRowCount:    1,
		SkipRows:          1,
		TagColumns:        []string{"line1"},
		MeasurementColumn: "line3",
	}
	input := []byte(`garbage nonsense
line1,line2,line3
hello,80,test_name2`)

	want := map[string]interface{}{
		"line2": int64(80),
		"line3": "test_name2",
	}

	got, err := parser.Parse(input)
	require.NoError(t, err)
	require.Equal(t, want, got[0].Fields())
}
// TestSkipColumns checks that the leading column of a record is dropped
// before the configured column names are applied to the remaining cells.
func TestSkipColumns(t *testing.T) {
	parser := &Parser{
		SkipColumns: 1,
		ColumnNames: []string{"line1", "line2"},
	}
	input := []byte(`hello,80,test_name`)

	want := map[string]interface{}{
		"line1": int64(80),
		"line2": "test_name",
	}

	got, err := parser.Parse(input)
	require.NoError(t, err)
	require.Equal(t, want, got[0].Fields())
}
// TestSkipColumnsWithHeader checks that SkipColumns applies to names derived
// from header rows as well as to records. The two header rows produce col1,
// col2 and col3; the first column is skipped, so only col2 and col3 appear
// in the fields.
func TestSkipColumnsWithHeader(t *testing.T) {
	parser := &Parser{
		SkipColumns:    1,
		HeaderRowCount: 2,
	}
	input := []byte(`col,col,col
1,2,3
trash,80,test_name`)

	want := map[string]interface{}{"col2": int64(80), "col3": "test_name"}

	got, err := parser.Parse(input)
	require.NoError(t, err)
	require.Equal(t, want, got[0].Fields())
}