Implement telegraf's own full metric type

main reasons behind this: - make adding/removing tags cheap - make adding/removing fields cheap - make parsing cheaper - make parse -> decorate -> write out bytes metric flow much faster Refactor serializer to use byte buffer
2016-11-22 12:51:57 +00:00
parent 332f678afb
commit db7a4b24b6
40 changed files with 1376 additions and 398 deletions
--- a/metric/parse.go
+++ b/metric/parse.go
@@ -0,0 +1,624 @@
+package metric
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/influxdata/telegraf"
+)
+
+var (
+	ErrInvalidNumber = errors.New("invalid number")
+)
+
+const (
+	// the number of characters for the largest possible int64 (9223372036854775807)
+	maxInt64Digits = 19
+
+	// the number of characters for the smallest possible int64 (-9223372036854775808)
+	minInt64Digits = 20
+
+	// the number of characters required for the largest float64 before a range check
+	// would occur during parsing
+	maxFloat64Digits = 25
+
+	// the number of characters required for smallest float64 before a range check occur
+	// would occur during parsing
+	minFloat64Digits = 27
+
+	MaxKeyLength = 65535
+)
+
+// The following constants allow us to specify which state to move to
+// next, when scanning sections of a Point.
+const (
+	tagKeyState = iota
+	tagValueState
+	fieldsState
+)
+
+func Parse(buf []byte) ([]telegraf.Metric, error) {
+	return ParseWithDefaultTime(buf, time.Now())
+}
+
+func ParseWithDefaultTime(buf []byte, t time.Time) ([]telegraf.Metric, error) {
+	metrics := make([]telegraf.Metric, 0, bytes.Count(buf, []byte("\n"))+1)
+	var (
+		errStr string
+		line   []byte
+		err    error
+	)
+	b := bytes.NewBuffer(buf)
+	for {
+		line, err = b.ReadBytes('\n')
+		if err != nil {
+			break
+		}
+		if len(line) < 2 {
+			continue
+		}
+		// trim the newline:
+		line = line[0 : len(line)-1]
+
+		m, err := parseMetric(line, t)
+		if err != nil {
+			errStr += " " + err.Error()
+			continue
+		}
+
+		metrics = append(metrics, m)
+	}
+
+	if len(errStr) > 0 {
+		return metrics, fmt.Errorf(errStr)
+	}
+	return metrics, nil
+}
+
+func parseMetric(buf []byte, defaultTime time.Time) (telegraf.Metric, error) {
+	var dTime string
+	// scan the first block which is measurement[,tag1=value1,tag2=value=2...]
+	pos, key, err := scanKey(buf, 0)
+	if err != nil {
+		return nil, err
+	}
+
+	// measurement name is required
+	if len(key) == 0 {
+		return nil, fmt.Errorf("missing measurement")
+	}
+
+	if len(key) > MaxKeyLength {
+		return nil, fmt.Errorf("max key length exceeded: %v > %v", len(key), MaxKeyLength)
+	}
+
+	// scan the second block is which is field1=value1[,field2=value2,...]
+	pos, fields, err := scanFields(buf, pos)
+	if err != nil {
+		return nil, err
+	}
+
+	// at least one field is required
+	if len(fields) == 0 {
+		return nil, fmt.Errorf("missing fields")
+	}
+
+	// scan the last block which is an optional integer timestamp
+	pos, ts, err := scanTime(buf, pos)
+	if err != nil {
+		return nil, err
+	}
+
+	m := &metric{
+		fields: fields,
+		t:      ts,
+	}
+
+	// parse out the measurement name
+	// namei is the index at which the "name" ends
+	namei := indexUnescapedByte(key, ',')
+	if namei < 1 {
+		// no tags
+		m.name = key
+	} else {
+		m.name = key[0:namei]
+		m.tags = key[namei:]
+	}
+
+	if len(m.t) == 0 {
+		if len(dTime) == 0 {
+			dTime = fmt.Sprint(defaultTime.UnixNano())
+		}
+		// use default time
+		m.t = []byte(dTime)
+	}
+
+	return m, nil
+}
+
+// scanKey scans buf starting at i for the measurement and tag portion of the point.
+// It returns the ending position and the byte slice of key within buf.  If there
+// are tags, they will be sorted if they are not already.
+func scanKey(buf []byte, i int) (int, []byte, error) {
+	start := skipWhitespace(buf, i)
+	i = start
+
+	// First scan the Point's measurement.
+	state, i, err := scanMeasurement(buf, i)
+	if err != nil {
+		return i, buf[start:i], err
+	}
+
+	// Optionally scan tags if needed.
+	if state == tagKeyState {
+		i, err = scanTags(buf, i)
+		if err != nil {
+			return i, buf[start:i], err
+		}
+	}
+
+	return i, buf[start:i], nil
+}
+
+// scanMeasurement examines the measurement part of a Point, returning
+// the next state to move to, and the current location in the buffer.
+func scanMeasurement(buf []byte, i int) (int, int, error) {
+	// Check first byte of measurement, anything except a comma is fine.
+	// It can't be a space, since whitespace is stripped prior to this
+	// function call.
+	if i >= len(buf) || buf[i] == ',' {
+		return -1, i, fmt.Errorf("missing measurement")
+	}
+
+	for {
+		i++
+		if i >= len(buf) {
+			// cpu
+			return -1, i, fmt.Errorf("missing fields")
+		}
+
+		if buf[i-1] == '\\' {
+			// Skip character (it's escaped).
+			continue
+		}
+
+		// Unescaped comma; move onto scanning the tags.
+		if buf[i] == ',' {
+			return tagKeyState, i + 1, nil
+		}
+
+		// Unescaped space; move onto scanning the fields.
+		if buf[i] == ' ' {
+			// cpu value=1.0
+			return fieldsState, i, nil
+		}
+	}
+}
+
+// scanTags examines all the tags in a Point, keeping track of and
+// returning the updated indices slice, number of commas and location
+// in buf where to start examining the Point fields.
+func scanTags(buf []byte, i int) (int, error) {
+	var (
+		err   error
+		state = tagKeyState
+	)
+
+	for {
+		switch state {
+		case tagKeyState:
+			i, err = scanTagsKey(buf, i)
+			state = tagValueState // tag value always follows a tag key
+		case tagValueState:
+			state, i, err = scanTagsValue(buf, i)
+		case fieldsState:
+			return i, nil
+		}
+
+		if err != nil {
+			return i, err
+		}
+	}
+}
+
+// scanTagsKey scans each character in a tag key.
+func scanTagsKey(buf []byte, i int) (int, error) {
+	// First character of the key.
+	if i >= len(buf) || buf[i] == ' ' || buf[i] == ',' || buf[i] == '=' {
+		// cpu,{'', ' ', ',', '='}
+		return i, fmt.Errorf("missing tag key")
+	}
+
+	// Examine each character in the tag key until we hit an unescaped
+	// equals (the tag value), or we hit an error (i.e., unescaped
+	// space or comma).
+	for {
+		i++
+
+		// Either we reached the end of the buffer or we hit an
+		// unescaped comma or space.
+		if i >= len(buf) ||
+			((buf[i] == ' ' || buf[i] == ',') && buf[i-1] != '\\') {
+			// cpu,tag{'', ' ', ','}
+			return i, fmt.Errorf("missing tag value")
+		}
+
+		if buf[i] == '=' && buf[i-1] != '\\' {
+			// cpu,tag=
+			return i + 1, nil
+		}
+	}
+}
+
+// scanTagsValue scans each character in a tag value.
+func scanTagsValue(buf []byte, i int) (int, int, error) {
+	// Tag value cannot be empty.
+	if i >= len(buf) || buf[i] == ',' || buf[i] == ' ' {
+		// cpu,tag={',', ' '}
+		return -1, i, fmt.Errorf("missing tag value")
+	}
+
+	// Examine each character in the tag value until we hit an unescaped
+	// comma (move onto next tag key), an unescaped space (move onto
+	// fields), or we error out.
+	for {
+		i++
+		if i >= len(buf) {
+			// cpu,tag=value
+			return -1, i, fmt.Errorf("missing fields")
+		}
+
+		// An unescaped equals sign is an invalid tag value.
+		if buf[i] == '=' && buf[i-1] != '\\' {
+			// cpu,tag={'=', 'fo=o'}
+			return -1, i, fmt.Errorf("invalid tag format")
+		}
+
+		if buf[i] == ',' && buf[i-1] != '\\' {
+			// cpu,tag=foo,
+			return tagKeyState, i + 1, nil
+		}
+
+		// cpu,tag=foo value=1.0
+		// cpu, tag=foo\= value=1.0
+		if buf[i] == ' ' && buf[i-1] != '\\' {
+			return fieldsState, i, nil
+		}
+	}
+}
+
+// scanFields scans buf, starting at i for the fields section of a point.  It returns
+// the ending position and the byte slice of the fields within buf
+func scanFields(buf []byte, i int) (int, []byte, error) {
+	start := skipWhitespace(buf, i)
+	i = start
+	quoted := false
+
+	// tracks how many '=' we've seen
+	equals := 0
+
+	// tracks how many commas we've seen
+	commas := 0
+
+	for {
+		// reached the end of buf?
+		if i >= len(buf) {
+			break
+		}
+
+		// escaped characters?
+		if buf[i] == '\\' && i+1 < len(buf) {
+			i += 2
+			continue
+		}
+
+		// If the value is quoted, scan until we get to the end quote
+		// Only quote values in the field value since quotes are not significant
+		// in the field key
+		if buf[i] == '"' && equals > commas {
+			quoted = !quoted
+			i++
+			continue
+		}
+
+		// If we see an =, ensure that there is at least on char before and after it
+		if buf[i] == '=' && !quoted {
+			equals++
+
+			// check for "... =123" but allow "a\ =123"
+			if buf[i-1] == ' ' && buf[i-2] != '\\' {
+				return i, buf[start:i], fmt.Errorf("missing field key")
+			}
+
+			// check for "...a=123,=456" but allow "a=123,a\,=456"
+			if buf[i-1] == ',' && buf[i-2] != '\\' {
+				return i, buf[start:i], fmt.Errorf("missing field key")
+			}
+
+			// check for "... value="
+			if i+1 >= len(buf) {
+				return i, buf[start:i], fmt.Errorf("missing field value")
+			}
+
+			// check for "... value=,value2=..."
+			if buf[i+1] == ',' || buf[i+1] == ' ' {
+				return i, buf[start:i], fmt.Errorf("missing field value")
+			}
+
+			if isNumeric(buf[i+1]) || buf[i+1] == '-' || buf[i+1] == 'N' || buf[i+1] == 'n' {
+				var err error
+				i, err = scanNumber(buf, i+1)
+				if err != nil {
+					return i, buf[start:i], err
+				}
+				continue
+			}
+			// If next byte is not a double-quote, the value must be a boolean
+			if buf[i+1] != '"' {
+				var err error
+				i, _, err = scanBoolean(buf, i+1)
+				if err != nil {
+					return i, buf[start:i], err
+				}
+				continue
+			}
+		}
+
+		if buf[i] == ',' && !quoted {
+			commas++
+		}
+
+		// reached end of block?
+		if buf[i] == ' ' && !quoted {
+			break
+		}
+		i++
+	}
+
+	if quoted {
+		return i, buf[start:i], fmt.Errorf("unbalanced quotes")
+	}
+
+	// check that all field sections had key and values (e.g. prevent "a=1,b"
+	if equals == 0 || commas != equals-1 {
+		return i, buf[start:i], fmt.Errorf("invalid field format")
+	}
+
+	return i, buf[start:i], nil
+}
+
+// scanTime scans buf, starting at i for the time section of a point. It
+// returns the ending position and the byte slice of the timestamp within buf
+// and and error if the timestamp is not in the correct numeric format.
+func scanTime(buf []byte, i int) (int, []byte, error) {
+	start := skipWhitespace(buf, i)
+	i = start
+
+	for {
+		// reached the end of buf?
+		if i >= len(buf) {
+			break
+		}
+
+		// Reached end of block or trailing whitespace?
+		if buf[i] == '\n' || buf[i] == ' ' {
+			break
+		}
+
+		// Handle negative timestamps
+		if i == start && buf[i] == '-' {
+			i++
+			continue
+		}
+
+		// Timestamps should be integers, make sure they are so we don't need
+		// to actually  parse the timestamp until needed.
+		if buf[i] < '0' || buf[i] > '9' {
+			return i, buf[start:i], fmt.Errorf("bad timestamp")
+		}
+		i++
+	}
+	return i, buf[start:i], nil
+}
+
+func isNumeric(b byte) bool {
+	return (b >= '0' && b <= '9') || b == '.'
+}
+
+// scanNumber returns the end position within buf, start at i after
+// scanning over buf for an integer, or float.  It returns an
+// error if a invalid number is scanned.
+func scanNumber(buf []byte, i int) (int, error) {
+	start := i
+	var isInt bool
+
+	// Is negative number?
+	if i < len(buf) && buf[i] == '-' {
+		i++
+		// There must be more characters now, as just '-' is illegal.
+		if i == len(buf) {
+			return i, ErrInvalidNumber
+		}
+	}
+
+	// how many decimal points we've see
+	decimal := false
+
+	// indicates the number is float in scientific notation
+	scientific := false
+
+	for {
+		if i >= len(buf) {
+			break
+		}
+
+		if buf[i] == ',' || buf[i] == ' ' {
+			break
+		}
+
+		if buf[i] == 'i' && i > start && !isInt {
+			isInt = true
+			i++
+			continue
+		}
+
+		if buf[i] == '.' {
+			// Can't have more than 1 decimal (e.g. 1.1.1 should fail)
+			if decimal {
+				return i, ErrInvalidNumber
+			}
+			decimal = true
+		}
+
+		// `e` is valid for floats but not as the first char
+		if i > start && (buf[i] == 'e' || buf[i] == 'E') {
+			scientific = true
+			i++
+			continue
+		}
+
+		// + and - are only valid at this point if they follow an e (scientific notation)
+		if (buf[i] == '+' || buf[i] == '-') && (buf[i-1] == 'e' || buf[i-1] == 'E') {
+			i++
+			continue
+		}
+
+		// NaN is an unsupported value
+		if i+2 < len(buf) && (buf[i] == 'N' || buf[i] == 'n') {
+			return i, ErrInvalidNumber
+		}
+
+		if !isNumeric(buf[i]) {
+			return i, ErrInvalidNumber
+		}
+		i++
+	}
+
+	if isInt && (decimal || scientific) {
+		return i, ErrInvalidNumber
+	}
+
+	numericDigits := i - start
+	if isInt {
+		numericDigits--
+	}
+	if decimal {
+		numericDigits--
+	}
+	if buf[start] == '-' {
+		numericDigits--
+	}
+
+	if numericDigits == 0 {
+		return i, ErrInvalidNumber
+	}
+
+	// It's more common that numbers will be within min/max range for their type but we need to prevent
+	// out or range numbers from being parsed successfully.  This uses some simple heuristics to decide
+	// if we should parse the number to the actual type.  It does not do it all the time because it incurs
+	// extra allocations and we end up converting the type again when writing points to disk.
+	if isInt {
+		// Make sure the last char is an 'i' for integers (e.g. 9i10 is not valid)
+		if buf[i-1] != 'i' {
+			return i, ErrInvalidNumber
+		}
+		// Parse the int to check bounds the number of digits could be larger than the max range
+		// We subtract 1 from the index to remove the `i` from our tests
+		if len(buf[start:i-1]) >= maxInt64Digits || len(buf[start:i-1]) >= minInt64Digits {
+			if _, err := parseIntBytes(buf[start:i-1], 10, 64); err != nil {
+				return i, fmt.Errorf("unable to parse integer %s: %s", buf[start:i-1], err)
+			}
+		}
+	} else {
+		// Parse the float to check bounds if it's scientific or the number of digits could be larger than the max range
+		if scientific || len(buf[start:i]) >= maxFloat64Digits || len(buf[start:i]) >= minFloat64Digits {
+			if _, err := parseFloatBytes(buf[start:i], 10); err != nil {
+				return i, fmt.Errorf("invalid float")
+			}
+		}
+	}
+
+	return i, nil
+}
+
+// scanBoolean returns the end position within buf, start at i after
+// scanning over buf for boolean. Valid values for a boolean are
+// t, T, true, TRUE, f, F, false, FALSE. It returns an error if a invalid boolean
+// is scanned.
+func scanBoolean(buf []byte, i int) (int, []byte, error) {
+	start := i
+
+	if i < len(buf) && (buf[i] != 't' && buf[i] != 'f' && buf[i] != 'T' && buf[i] != 'F') {
+		return i, buf[start:i], fmt.Errorf("invalid boolean")
+	}
+
+	i++
+	for {
+		if i >= len(buf) {
+			break
+		}
+
+		if buf[i] == ',' || buf[i] == ' ' {
+			break
+		}
+		i++
+	}
+
+	// Single char bool (t, T, f, F) is ok
+	if i-start == 1 {
+		return i, buf[start:i], nil
+	}
+
+	// length must be 4 for true or TRUE
+	if (buf[start] == 't' || buf[start] == 'T') && i-start != 4 {
+		return i, buf[start:i], fmt.Errorf("invalid boolean")
+	}
+
+	// length must be 5 for false or FALSE
+	if (buf[start] == 'f' || buf[start] == 'F') && i-start != 5 {
+		return i, buf[start:i], fmt.Errorf("invalid boolean")
+	}
+
+	// Otherwise
+	valid := false
+	switch buf[start] {
+	case 't':
+		valid = bytes.Equal(buf[start:i], []byte("true"))
+	case 'f':
+		valid = bytes.Equal(buf[start:i], []byte("false"))
+	case 'T':
+		valid = bytes.Equal(buf[start:i], []byte("TRUE")) || bytes.Equal(buf[start:i], []byte("True"))
+	case 'F':
+		valid = bytes.Equal(buf[start:i], []byte("FALSE")) || bytes.Equal(buf[start:i], []byte("False"))
+	}
+
+	if !valid {
+		return i, buf[start:i], fmt.Errorf("invalid boolean")
+	}
+
+	return i, buf[start:i], nil
+
+}
+
+// skipWhitespace returns the end position within buf, starting at i after
+// scanning over spaces in tags
+func skipWhitespace(buf []byte, i int) int {
+	for i < len(buf) {
+		if buf[i] != ' ' && buf[i] != '\t' && buf[i] != 0 {
+			break
+		}
+		i++
+	}
+	return i
+}
+
+// makeError is a helper function for making a metric parsing error.
+//   reason is the reason that the error occured.
+//   buf should be the current buffer we are parsing.
+//   i is the current index, to give some context on where in the buffer we are.
+func makeError(reason string, buf []byte, i int) error {
+	return fmt.Errorf("metric parsing error, reason: [%s], buffer: [%s], index: [%d]",
+		reason, buf, i)
+}