From ec7f13111f7a12e0b0b7668882379af145b3cefa Mon Sep 17 00:00:00 2001
From: Max U
Date: Thu, 21 Jun 2018 16:06:36 -0700
Subject: [PATCH] add more test files

---
 plugins/inputs/reader/testfiles/grok_a.log |   2 +
 plugins/parsers/grok/influx-patterns       |  73 +++
 plugins/parsers/grok/influx_patterns.go    |  78 +++
 plugins/parsers/grok/parser.go             | 545 +++++++++++++++++++++
 plugins/parsers/grok/parser_test.go        |  25 +
 5 files changed, 723 insertions(+)
 create mode 100644 plugins/inputs/reader/testfiles/grok_a.log
 create mode 100644 plugins/parsers/grok/influx-patterns
 create mode 100644 plugins/parsers/grok/influx_patterns.go
 create mode 100644 plugins/parsers/grok/parser.go
 create mode 100644 plugins/parsers/grok/parser_test.go

diff --git a/plugins/inputs/reader/testfiles/grok_a.log b/plugins/inputs/reader/testfiles/grok_a.log
new file mode 100644
index 000000000..5295fcb75
--- /dev/null
+++ b/plugins/inputs/reader/testfiles/grok_a.log
@@ -0,0 +1,2 @@
+127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
+128.0.0.1 user-identifier tony [10/Oct/2000:13:55:36 -0800] "GET /apache_pb.gif HTTP/1.0" 300 45
\ No newline at end of file
diff --git a/plugins/parsers/grok/influx-patterns b/plugins/parsers/grok/influx-patterns
new file mode 100644
index 000000000..931b61bc8
--- /dev/null
+++ b/plugins/parsers/grok/influx-patterns
@@ -0,0 +1,73 @@
+# Captures are a slightly modified version of logstash "grok" patterns, with
+#  the format %{<capture_syntax>[:<semantic_name>][:<modifier>]}
+# By default all named captures are converted into string fields.
+# Modifiers can be used to convert captures to other types or tags.
+# Timestamp modifiers can be used to convert captures to the timestamp of the
+# parsed metric.
+
+# View logstash grok pattern docs here:
+#   https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html
+# All default logstash patterns are supported, these can be viewed here:
+#   https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/grok-patterns
+
+# Available modifiers:
+#   string   (default if nothing is specified)
+#   int
+#   float
+#   duration (ie, 5.23ms gets converted to int nanoseconds)
+#   tag      (converts the field into a tag)
+#   drop     (drops the field completely)
+# Timestamp modifiers:
+#   ts-ansic         ("Mon Jan _2 15:04:05 2006")
+#   ts-unix          ("Mon Jan _2 15:04:05 MST 2006")
+#   ts-ruby          ("Mon Jan 02 15:04:05 -0700 2006")
+#   ts-rfc822        ("02 Jan 06 15:04 MST")
+#   ts-rfc822z       ("02 Jan 06 15:04 -0700")
+#   ts-rfc850        ("Monday, 02-Jan-06 15:04:05 MST")
+#   ts-rfc1123       ("Mon, 02 Jan 2006 15:04:05 MST")
+#   ts-rfc1123z      ("Mon, 02 Jan 2006 15:04:05 -0700")
+#   ts-rfc3339       ("2006-01-02T15:04:05Z07:00")
+#   ts-rfc3339nano   ("2006-01-02T15:04:05.999999999Z07:00")
+#   ts-httpd         ("02/Jan/2006:15:04:05 -0700")
+#   ts-epoch         (seconds since unix epoch)
+#   ts-epochnano     (nanoseconds since unix epoch)
+#   ts-"CUSTOM"
+# CUSTOM time layouts must be within quotes and be the representation of the
+# "reference time", which is Mon Jan 2 15:04:05 -0700 MST 2006
+# See https://golang.org/pkg/time/#Parse for more details.
+
+# Example log file pattern, example log looks like this:
+#   [04/Jun/2016:12:41:45 +0100] 1.25 200 192.168.1.1 5.432µs
+# Breakdown of the DURATION pattern below:
+#   NUMBER is a builtin logstash grok pattern matching float & int numbers.
+#   [nuµm]? is a regex specifying 0 or 1 of the characters within brackets.
+#   s is also regex, this pattern must end in "s".
+# so DURATION will match something like '5.324ms' or '6.1µs' or '10s'
+DURATION %{NUMBER}[nuµm]?s
+RESPONSE_CODE %{NUMBER:response_code:tag}
+RESPONSE_TIME %{DURATION:response_time_ns:duration}
+EXAMPLE_LOG \[%{HTTPDATE:ts:ts-httpd}\] %{NUMBER:myfloat:float} %{RESPONSE_CODE} %{IPORHOST:clientip} %{RESPONSE_TIME}
+
+# Wider-ranging username matching vs. logstash built-in %{USER}
+NGUSERNAME [a-zA-Z0-9\.\@\-\+_%]+
+NGUSER %{NGUSERNAME}
+# Wider-ranging client IP matching
+CLIENT (?:%{IPV6}|%{IPV4}|%{HOSTNAME}|%{HOSTPORT})
+
+##
+## COMMON LOG PATTERNS
+##
+
+# apache & nginx logs, this is also known as the "common log format"
+#   see https://en.wikipedia.org/wiki/Common_Log_Format
+COMMON_LOG_FORMAT %{CLIENT:client_ip} %{NOTSPACE:ident} %{NOTSPACE:auth} \[%{HTTPDATE:ts:ts-httpd}\] "(?:%{WORD:verb:tag} %{NOTSPACE:request}(?: HTTP/%{NUMBER:http_version:float})?|%{DATA})" %{NUMBER:resp_code:tag} (?:%{NUMBER:resp_bytes:int}|-)
+
+# Combined log format is the same as the common log format but with the addition
+#   of two quoted strings at the end for "referrer" and "agent"
+#   See Examples at http://httpd.apache.org/docs/current/mod/mod_log_config.html
+COMBINED_LOG_FORMAT %{COMMON_LOG_FORMAT} %{QS:referrer} %{QS:agent}
+
+# HTTPD log formats
+HTTPD20_ERRORLOG \[%{HTTPDERROR_DATE:timestamp}\] \[%{LOGLEVEL:loglevel:tag}\] (?:\[client %{IPORHOST:clientip}\] ){0,1}%{GREEDYDATA:errormsg}
+HTTPD24_ERRORLOG \[%{HTTPDERROR_DATE:timestamp}\] \[%{WORD:module}:%{LOGLEVEL:loglevel:tag}\] \[pid %{POSINT:pid:int}:tid %{NUMBER:tid:int}\]( \(%{POSINT:proxy_errorcode:int}\)%{DATA:proxy_errormessage}:)?( \[client %{IPORHOST:client}:%{POSINT:clientport}\])? %{DATA:errorcode}: %{GREEDYDATA:message}
+HTTPD_ERRORLOG %{HTTPD20_ERRORLOG}|%{HTTPD24_ERRORLOG}
diff --git a/plugins/parsers/grok/influx_patterns.go b/plugins/parsers/grok/influx_patterns.go
new file mode 100644
index 000000000..6dc990622
--- /dev/null
+++ b/plugins/parsers/grok/influx_patterns.go
@@ -0,0 +1,78 @@
+package grok
+
+// DEFAULT_PATTERNS SHOULD BE KEPT IN-SYNC WITH the influx-patterns file in this directory
+const DEFAULT_PATTERNS = `
+# Captures are a slightly modified version of logstash "grok" patterns, with
+#  the format %{<capture_syntax>[:<semantic_name>][:<modifier>]}
+# By default all named captures are converted into string fields.
+# Modifiers can be used to convert captures to other types or tags.
+# Timestamp modifiers can be used to convert captures to the timestamp of the
+# parsed metric.
+
+# View logstash grok pattern docs here:
+#   https://www.elastic.co/guide/en/logstash/current/plugins-filters-grok.html
+# All default logstash patterns are supported, these can be viewed here:
+#   https://github.com/logstash-plugins/logstash-patterns-core/blob/master/patterns/grok-patterns
+
+# Available modifiers:
+#   string   (default if nothing is specified)
+#   int
+#   float
+#   duration (ie, 5.23ms gets converted to int nanoseconds)
+#   tag      (converts the field into a tag)
+#   drop     (drops the field completely)
+# Timestamp modifiers:
+#   ts-ansic         ("Mon Jan _2 15:04:05 2006")
+#   ts-unix          ("Mon Jan _2 15:04:05 MST 2006")
+#   ts-ruby          ("Mon Jan 02 15:04:05 -0700 2006")
+#   ts-rfc822        ("02 Jan 06 15:04 MST")
+#   ts-rfc822z       ("02 Jan 06 15:04 -0700")
+#   ts-rfc850        ("Monday, 02-Jan-06 15:04:05 MST")
+#   ts-rfc1123       ("Mon, 02 Jan 2006 15:04:05 MST")
+#   ts-rfc1123z      ("Mon, 02 Jan 2006 15:04:05 -0700")
+#   ts-rfc3339       ("2006-01-02T15:04:05Z07:00")
+#   ts-rfc3339nano   ("2006-01-02T15:04:05.999999999Z07:00")
+#   ts-httpd         ("02/Jan/2006:15:04:05 -0700")
+#   ts-epoch         (seconds since unix epoch)
+#   ts-epochnano     (nanoseconds since unix epoch)
+#   ts-"CUSTOM"
+# CUSTOM time layouts must be within quotes and be the representation of the
+# "reference time", which is Mon Jan 2 15:04:05 -0700 MST 2006
+# See https://golang.org/pkg/time/#Parse for more details.
+
+# Example log file pattern, example log looks like this:
+#   [04/Jun/2016:12:41:45 +0100] 1.25 200 192.168.1.1 5.432µs
+# Breakdown of the DURATION pattern below:
+#   NUMBER is a builtin logstash grok pattern matching float & int numbers.
+#   [nuµm]? is a regex specifying 0 or 1 of the characters within brackets.
+#   s is also regex, this pattern must end in "s".
+# so DURATION will match something like '5.324ms' or '6.1µs' or '10s'
+DURATION %{NUMBER}[nuµm]?s
+RESPONSE_CODE %{NUMBER:response_code:tag}
+RESPONSE_TIME %{DURATION:response_time_ns:duration}
+EXAMPLE_LOG \[%{HTTPDATE:ts:ts-httpd}\] %{NUMBER:myfloat:float} %{RESPONSE_CODE} %{IPORHOST:clientip} %{RESPONSE_TIME}
+
+# Wider-ranging username matching vs. logstash built-in %{USER}
+NGUSERNAME [a-zA-Z0-9\.\@\-\+_%]+
+NGUSER %{NGUSERNAME}
+# Wider-ranging client IP matching
+CLIENT (?:%{IPV6}|%{IPV4}|%{HOSTNAME}|%{HOSTPORT})
+
+##
+## COMMON LOG PATTERNS
+##
+
+# apache & nginx logs, this is also known as the "common log format"
+#   see https://en.wikipedia.org/wiki/Common_Log_Format
+COMMON_LOG_FORMAT %{CLIENT:client_ip} %{NOTSPACE:ident} %{NOTSPACE:auth} \[%{HTTPDATE:ts:ts-httpd}\] "(?:%{WORD:verb:tag} %{NOTSPACE:request}(?: HTTP/%{NUMBER:http_version:float})?|%{DATA})" %{NUMBER:resp_code:tag} (?:%{NUMBER:resp_bytes:int}|-)
+
+# Combined log format is the same as the common log format but with the addition
+#   of two quoted strings at the end for "referrer" and "agent"
+#   See Examples at http://httpd.apache.org/docs/current/mod/mod_log_config.html
+COMBINED_LOG_FORMAT %{COMMON_LOG_FORMAT} %{QS:referrer} %{QS:agent}
+
+# HTTPD log formats
+HTTPD20_ERRORLOG \[%{HTTPDERROR_DATE:timestamp}\] \[%{LOGLEVEL:loglevel:tag}\] (?:\[client %{IPORHOST:clientip}\] ){0,1}%{GREEDYDATA:errormsg}
+HTTPD24_ERRORLOG \[%{HTTPDERROR_DATE:timestamp}\] \[%{WORD:module}:%{LOGLEVEL:loglevel:tag}\] \[pid %{POSINT:pid:int}:tid %{NUMBER:tid:int}\]( \(%{POSINT:proxy_errorcode:int}\)%{DATA:proxy_errormessage}:)?( \[client %{IPORHOST:client}:%{POSINT:clientport}\])? %{DATA:errorcode}: %{GREEDYDATA:message}
+HTTPD_ERRORLOG %{HTTPD20_ERRORLOG}|%{HTTPD24_ERRORLOG}
+`
diff --git a/plugins/parsers/grok/parser.go b/plugins/parsers/grok/parser.go
new file mode 100644
index 000000000..e3d2acf3b
--- /dev/null
+++ b/plugins/parsers/grok/parser.go
@@ -0,0 +1,545 @@
+package grok
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/vjeantet/grok"
+
+	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/metric"
+)
+
+var timeLayouts = map[string]string{
+	"ts-ansic":       "Mon Jan _2 15:04:05 2006",
+	"ts-unix":        "Mon Jan _2 15:04:05 MST 2006",
+	"ts-ruby":        "Mon Jan 02 15:04:05 -0700 2006",
+	"ts-rfc822":      "02 Jan 06 15:04 MST",
+	"ts-rfc822z":     "02 Jan 06 15:04 -0700", // RFC822 with numeric zone
+	"ts-rfc850":      "Monday, 02-Jan-06 15:04:05 MST",
+	"ts-rfc1123":     "Mon, 02 Jan 2006 15:04:05 MST",
+	"ts-rfc1123z":    "Mon, 02 Jan 2006 15:04:05 -0700", // RFC1123 with numeric zone
+	"ts-rfc3339":     "2006-01-02T15:04:05Z07:00",
+	"ts-rfc3339nano": "2006-01-02T15:04:05.999999999Z07:00",
+	"ts-httpd":       "02/Jan/2006:15:04:05 -0700",
+	// These are not exactly "layouts", but they are special cases that
+	// will get handled in the ParseLine function.
+	"ts-epoch":     "EPOCH",
+	"ts-epochnano": "EPOCH_NANO",
+	"ts-syslog":    "SYSLOG_TIMESTAMP",
+	"ts":           "GENERIC_TIMESTAMP", // try parsing all known timestamp layouts.
+}
+
+const (
+	INT               = "int"
+	TAG               = "tag"
+	FLOAT             = "float"
+	STRING            = "string"
+	DURATION          = "duration"
+	DROP              = "drop"
+	EPOCH             = "EPOCH"
+	EPOCH_NANO        = "EPOCH_NANO"
+	SYSLOG_TIMESTAMP  = "SYSLOG_TIMESTAMP"
+	GENERIC_TIMESTAMP = "GENERIC_TIMESTAMP"
+)
+
+var (
+	// matches named captures that contain a modifier.
+	//   ie,
+	//     %{NUMBER:bytes:int}
+	//     %{IPORHOST:clientip:tag}
+	//     %{HTTPDATE:ts1:ts-http}
+	//     %{HTTPDATE:ts2:ts-"02 Jan 06 15:04"}
+	modifierRe = regexp.MustCompile(`%{\w+:(\w+):(ts-".+"|t?s?-?\w+)}`)
+	// matches a plain pattern name. ie, %{NUMBER}
+	patternOnlyRe = regexp.MustCompile(`%{(\w+)}`)
+)
+
+// Parser is the primary struct to handle the grok-patterns defined in the config toml
+type Parser struct {
+	Patterns []string
+	// namedPatterns is a list of internally-assigned names to the patterns
+	// specified by the user in Patterns.
+	// They will look like:
+	//   GROK_INTERNAL_PATTERN_0, GROK_INTERNAL_PATTERN_1, etc.
+	NamedPatterns      []string
+	CustomPatterns     string
+	CustomPatternFiles []string
+	Measurement        string
+
+	// Timezone is an optional component to help render log dates to
+	// your chosen zone.
+	// Default: "" which renders UTC
+	// Options are as follows:
+	// 1. Local             -- interpret based on machine localtime
+	// 2. "America/Chicago" -- Unix TZ values like those found in https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
+	// 3. UTC               -- or blank/unspecified, will return timestamp in UTC
+	Timezone string
+	loc      *time.Location
+
+	// typeMap is a map of patterns -> capture name -> modifier,
+	//   ie, {
+	//          "%{TESTLOG}":
+	//             {
+	//                "bytes": "int",
+	//                "clientip": "tag"
+	//             }
+	//       }
+	typeMap map[string]map[string]string
+	// tsMap is a map of patterns -> capture name -> timestamp layout.
+	//   ie, {
+	//          "%{TESTLOG}":
+	//             {
+	//                "httptime": "02/Jan/2006:15:04:05 -0700"
+	//             }
+	//       }
+	tsMap map[string]map[string]string
+	// patterns is a map of all of the parsed patterns from CustomPatterns
+	// and CustomPatternFiles.
+	//   ie, {
+	//          "DURATION":      "%{NUMBER}[nuµm]?s"
+	//          "RESPONSE_CODE": "%{NUMBER:rc:tag}"
+	//       }
+	patterns map[string]string
+	// foundTsLayouts is a slice of timestamp patterns that have been found
+	// in the log lines. This slice gets updated if the user uses the generic
+	// 'ts' modifier for timestamps. This slice is checked first for matches,
+	// so that previously-matched layouts get priority over all other timestamp
+	// layouts.
+	foundTsLayouts []string
+
+	timeFunc func() time.Time
+	g        *grok.Grok
+	tsModder *tsModder
+}
+
+// Compile is a bound method to Parser which will process the options for our parser
+func (p *Parser) Compile() error {
+	p.typeMap = make(map[string]map[string]string)
+	p.tsMap = make(map[string]map[string]string)
+	p.patterns = make(map[string]string)
+	p.tsModder = &tsModder{}
+	var err error
+	p.g, err = grok.NewWithConfig(&grok.Config{NamedCapturesOnly: true})
+	if err != nil {
+		return err
+	}
+
+	// Give Patterns fake names so that they can be treated as named
+	// "custom patterns"
+	p.NamedPatterns = make([]string, 0, len(p.Patterns))
+	for i, pattern := range p.Patterns {
+		pattern = strings.TrimSpace(pattern)
+		if pattern == "" {
+			continue
+		}
+		name := fmt.Sprintf("GROK_INTERNAL_PATTERN_%d", i)
+		p.CustomPatterns += "\n" + name + " " + pattern + "\n"
+		p.NamedPatterns = append(p.NamedPatterns, "%{"+name+"}")
+	}
+
+	if len(p.NamedPatterns) == 0 {
+		return fmt.Errorf("pattern required")
+	}
+
+	// Combine user-supplied CustomPatterns with DEFAULT_PATTERNS and parse
+	// them together as the same type of pattern.
+	p.CustomPatterns = DEFAULT_PATTERNS + p.CustomPatterns
+	if len(p.CustomPatterns) != 0 {
+		scanner := bufio.NewScanner(strings.NewReader(p.CustomPatterns))
+		p.addCustomPatterns(scanner)
+	}
+
+	// Parse any custom pattern files supplied.
+	for _, filename := range p.CustomPatternFiles {
+		file, fileErr := os.Open(filename)
+		if fileErr != nil {
+			return fileErr
+		}
+
+		scanner := bufio.NewScanner(bufio.NewReader(file))
+		p.addCustomPatterns(scanner)
+		// Close now rather than with defer: a defer inside this loop would
+		// keep every file open until Compile returns. Read-only, so the error is moot.
+		file.Close()
+	}
+
+	if p.Measurement == "" {
+		p.Measurement = "logparser_grok"
+	}
+
+	p.loc, err = time.LoadLocation(p.Timezone)
+	if err != nil {
+		log.Printf("W! improper timezone supplied (%s), setting loc to UTC", p.Timezone)
+		p.loc, _ = time.LoadLocation("UTC")
+	}
+
+	if p.timeFunc == nil {
+		p.timeFunc = time.Now
+	}
+
+	return p.compileCustomPatterns()
+}
+
+// ParseLine is the primary function to process individual lines, returning the metrics
+func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
+	var err error
+	// values are the parsed fields from the log line
+	var values map[string]string
+	// the matching pattern string
+	var patternName string
+	for _, pattern := range p.NamedPatterns {
+		if values, err = p.g.Parse(pattern, line); err != nil {
+			return nil, err
+		}
+		if len(values) != 0 {
+			patternName = pattern
+			break
+		}
+	}
+
+	if len(values) == 0 {
+		log.Printf("D! Grok no match found for: %q", line)
+		return nil, nil
+	}
+
+	fields := make(map[string]interface{})
+	tags := make(map[string]string)
+	timestamp := time.Now()
+	for k, v := range values {
+		if k == "" || v == "" {
+			continue
+		}
+
+		// t is the modifier of the field
+		var t string
+		// check if pattern has some modifiers
+		if types, ok := p.typeMap[patternName]; ok {
+			t = types[k]
+		}
+		// if we didn't find a modifier, check if we have a timestamp layout
+		if t == "" {
+			if ts, ok := p.tsMap[patternName]; ok {
+				// check if the modifier is a timestamp layout
+				if layout, ok := ts[k]; ok {
+					t = layout
+				}
+			}
+		}
+		// if we didn't find a type OR timestamp modifier, assume string
+		if t == "" {
+			t = STRING
+		}
+
+		switch t {
+		case INT:
+			iv, err := strconv.ParseInt(v, 10, 64)
+			if err != nil {
+				log.Printf("E! Error parsing %s to int: %s", v, err)
+			} else {
+				fields[k] = iv
+			}
+		case FLOAT:
+			fv, err := strconv.ParseFloat(v, 64)
+			if err != nil {
+				log.Printf("E! Error parsing %s to float: %s", v, err)
+			} else {
+				fields[k] = fv
+			}
+		case DURATION:
+			d, err := time.ParseDuration(v)
+			if err != nil {
+				log.Printf("E! Error parsing %s to duration: %s", v, err)
+			} else {
+				fields[k] = int64(d)
+			}
+		case TAG:
+			tags[k] = v
+		case STRING:
+			fields[k] = strings.Trim(v, `"`)
+		case EPOCH:
+			parts := strings.SplitN(v, ".", 2)
+			if len(parts) == 0 {
+				log.Printf("E! Error parsing %s to timestamp: %s", v, err)
+				break
+			}
+
+			sec, err := strconv.ParseInt(parts[0], 10, 64)
+			if err != nil {
+				log.Printf("E! Error parsing %s to timestamp: %s", v, err)
+				break
+			}
+			ts := time.Unix(sec, 0)
+
+			if len(parts) == 2 {
+				padded := fmt.Sprintf("%-9s", parts[1])
+				nsString := strings.Replace(padded[:9], " ", "0", -1)
+				nanosec, err := strconv.ParseInt(nsString, 10, 64)
+				if err != nil {
+					log.Printf("E! Error parsing %s to timestamp: %s", v, err)
+					break
+				}
+				ts = ts.Add(time.Duration(nanosec) * time.Nanosecond)
+			}
+			timestamp = ts
+		case EPOCH_NANO:
+			iv, err := strconv.ParseInt(v, 10, 64)
+			if err != nil {
+				log.Printf("E! Error parsing %s to int: %s", v, err)
+			} else {
+				timestamp = time.Unix(0, iv)
+			}
+		case SYSLOG_TIMESTAMP:
+			ts, err := time.ParseInLocation("Jan 02 15:04:05", v, p.loc)
+			if err == nil {
+				if ts.Year() == 0 {
+					ts = ts.AddDate(timestamp.Year(), 0, 0)
+				}
+				timestamp = ts
+			} else {
+				log.Printf("E! Error parsing %s to time layout [%s]: %s", v, t, err)
+			}
+		case GENERIC_TIMESTAMP:
+			var foundTs bool
+			// first try timestamp layouts that we've already found
+			for _, layout := range p.foundTsLayouts {
+				ts, err := time.ParseInLocation(layout, v, p.loc)
+				if err == nil {
+					timestamp = ts
+					foundTs = true
+					break
+				}
+			}
+			// if we haven't found a timestamp layout yet, try all timestamp
+			// layouts.
+			if !foundTs {
+				for _, layout := range timeLayouts {
+					ts, err := time.ParseInLocation(layout, v, p.loc)
+					if err == nil {
+						timestamp = ts
+						foundTs = true
+						p.foundTsLayouts = append(p.foundTsLayouts, layout)
+						break
+					}
+				}
+			}
+			// if we still haven't found a timestamp layout, log it and we will
+			// just use time.Now()
+			if !foundTs {
+				log.Printf("E! Error parsing timestamp [%s], could not find any "+
+					"suitable time layouts.", v)
+			}
+		case DROP:
+		// goodbye!
+		default:
+			ts, err := time.ParseInLocation(t, v, p.loc)
+			if err == nil {
+				timestamp = ts
+			} else {
+				log.Printf("E! Error parsing %s to time layout [%s]: %s", v, t, err)
+			}
+		}
+	}
+
+	if len(fields) == 0 {
+		return nil, fmt.Errorf("logparser_grok: must have one or more fields")
+	}
+
+	return metric.New(p.Measurement, tags, fields, p.tsModder.tsMod(timestamp))
+}
+
+// Parse splits buf into lines, runs ParseLine on each of them and returns the
+// resulting metrics. Lines that match none of the patterns are skipped.
+func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
+	lines := strings.Split(string(buf), "\n")
+	var metrics []telegraf.Metric
+
+	for _, line := range lines {
+		m, err := p.ParseLine(line)
+		if err != nil {
+			return nil, err
+		}
+		// ParseLine returns (nil, nil) when no pattern matches; skip those
+		// lines so callers never receive a nil metric.
+		if m == nil {
+			continue
+		}
+		metrics = append(metrics, m)
+	}
+
+	return metrics, nil
+}
+
+// SetDefaultTags is currently a no-op: the tags passed in are not stored and
+// are not applied to parsed metrics.
+func (p *Parser) SetDefaultTags(tags map[string]string) {
+	//needs implementation
+}
+
+// addCustomPatterns reads "NAME pattern" definitions from scanner, one per
+// line, into p.patterns. Blank lines and lines starting with '#' are skipped,
+// and so are lines that consist of a name with no pattern after it.
+func (p *Parser) addCustomPatterns(scanner *bufio.Scanner) {
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if len(line) > 0 && line[0] != '#' {
+			names := strings.SplitN(line, " ", 2)
+			// a bare name has no pattern part; names[1] would panic
+			if len(names) != 2 {
+				continue
+			}
+			p.patterns[names[0]] = names[1]
+		}
+	}
+}
+
+func (p *Parser) compileCustomPatterns() error {
+	var err error
+	// check if the pattern contains a subpattern that is already defined
+	// replace it with the subpattern for modifier inheritance.
+	for i := 0; i < 2; i++ {
+		for name, pattern := range p.patterns {
+			subNames := patternOnlyRe.FindAllStringSubmatch(pattern, -1)
+			for _, subName := range subNames {
+				if subPattern, ok := p.patterns[subName[1]]; ok {
+					pattern = strings.Replace(pattern, subName[0], subPattern, 1)
+				}
+			}
+			p.patterns[name] = pattern
+		}
+	}
+
+	// check if pattern contains modifiers. Parse them out if it does.
+	for name, pattern := range p.patterns {
+		if modifierRe.MatchString(pattern) {
+			// this pattern has modifiers, so parse out the modifiers
+			pattern, err = p.parseTypedCaptures(name, pattern)
+			if err != nil {
+				return err
+			}
+			p.patterns[name] = pattern
+		}
+	}
+
+	return p.g.AddPatternsFromMap(p.patterns)
+}
+
+// parseTypedCaptures parses the capture modifiers, and then deletes the
+// modifier from the line so that it is a valid "grok" pattern again.
+//   ie,
+//     %{NUMBER:bytes:int}      => %{NUMBER:bytes}      (stores %{NUMBER}->bytes->int)
+//     %{IPORHOST:clientip:tag} => %{IPORHOST:clientip} (stores %{IPORHOST}->clientip->tag)
+func (p *Parser) parseTypedCaptures(name, pattern string) (string, error) {
+	matches := modifierRe.FindAllStringSubmatch(pattern, -1)
+
+	// grab the name of the capture pattern
+	patternName := "%{" + name + "}"
+	// create type map for this pattern
+	p.typeMap[patternName] = make(map[string]string)
+	p.tsMap[patternName] = make(map[string]string)
+
+	// boolean to verify that each pattern only has a single ts- data type.
+	hasTimestamp := false
+	for _, match := range matches {
+		// regex capture 1 is the name of the capture
+		// regex capture 2 is the modifier of the capture
+		if strings.HasPrefix(match[2], "ts") {
+			if hasTimestamp {
+				return pattern, fmt.Errorf("logparser pattern compile error: "+
+					"Each pattern is allowed only one named "+
+					"timestamp data type. pattern: %s", pattern)
+			}
+			if layout, ok := timeLayouts[match[2]]; ok {
+				// built-in time format
+				p.tsMap[patternName][match[1]] = layout
+			} else {
+				// custom time format
+				p.tsMap[patternName][match[1]] = strings.TrimSuffix(strings.TrimPrefix(match[2], `ts-"`), `"`)
+			}
+			hasTimestamp = true
+		} else {
+			p.typeMap[patternName][match[1]] = match[2]
+		}
+
+		// the modifier is not a valid part of a "grok" pattern, so remove it
+		// from the pattern.
+		pattern = strings.Replace(pattern, ":"+match[2]+"}", "}", 1)
+	}
+
+	return pattern, nil
+}
+
+// tsModder is a struct for incrementing identical timestamps of log lines
+// so that we don't push identical metrics that will get overwritten.
+type tsModder struct {
+	dupe     time.Time
+	last     time.Time
+	incr     time.Duration
+	incrn    time.Duration
+	rollover time.Duration
+}
+
+// tsMod increments the given timestamp one unit more from the previous
+// duplicate timestamp.
+// the increment unit is determined as the next smallest time unit below the
+// most significant time unit of ts.
+//   ie, if the input is at ms precision, it will increment it 1µs.
+func (t *tsModder) tsMod(ts time.Time) time.Time {
+	defer func() { t.last = ts }()
+	// don't mod the time if we don't need to
+	if t.last.IsZero() || ts.IsZero() {
+		t.incrn = 0
+		t.rollover = 0
+		return ts
+	}
+	if !ts.Equal(t.last) && !ts.Equal(t.dupe) {
+		t.incr = 0
+		t.incrn = 0
+		t.rollover = 0
+		return ts
+	}
+
+	if ts.Equal(t.last) {
+		t.dupe = ts
+	}
+
+	if ts.Equal(t.dupe) && t.incr == time.Duration(0) {
+		tsNano := ts.UnixNano()
+
+		d := int64(10)
+		counter := 1
+		// count trailing decimal zeros; bounded because more than 9 always
+		// means millisecond, and a zero or negative value never breaks out
+		// of an unbounded scan.
+		for counter <= 9 && tsNano%d == 0 {
+			d = d * 10
+			counter++
+		}
+
+		switch {
+		case counter <= 6:
+			t.incr = time.Nanosecond
+		case counter <= 9:
+			t.incr = time.Microsecond
+		case counter > 9:
+			t.incr = time.Millisecond
+		}
+	}
+
+	t.incrn++
+	if t.incrn == 999 && t.incr > time.Nanosecond {
+		t.rollover = t.incr * t.incrn
+		t.incrn = 1
+		t.incr = t.incr / 1000
+		if t.incr < time.Nanosecond {
+			t.incr = time.Nanosecond
+		}
+	}
+	return ts.Add(t.incr*t.incrn + t.rollover)
+}
diff --git a/plugins/parsers/grok/parser_test.go b/plugins/parsers/grok/parser_test.go
new file mode 100644
index 000000000..5dc01911c
--- /dev/null
+++ b/plugins/parsers/grok/parser_test.go
@@ -0,0 +1,25 @@
+package grok
+
+import (
+	"log"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestGrokParse(t *testing.T) {
+	parser := Parser{
+		Measurement: "t_met",
+		Patterns:    []string{"%{COMMON_LOG_FORMAT}"},
+	}
+	if !assert.NoError(t, parser.Compile()) {
+		return
+	}
+	metrics, err := parser.Parse([]byte(`127.0.0.1 user-identifier frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326`))
+	// Check the error and the length before indexing, so a failed parse
+	// reports a test failure instead of panicking on metrics[0].
+	if !assert.NoError(t, err) || !assert.Len(t, metrics, 1) {
+		return
+	}
+	log.Printf("metric_tags: %v, metric_fields: %v", metrics[0].Tags(), metrics[0].Fields())
+}