diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4bc7daf71..0015cd5eb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,489 +1,52 @@ -## Steps for Contributing: +### Contributing -1. [Sign the CLA](http://influxdb.com/community/cla.html) -1. Make changes or write plugin (see below for details) -1. Add your plugin to one of: `plugins/{inputs,outputs,aggregators,processors}/all/all.go` -1. If your plugin requires a new Go package, -[add it](https://github.com/influxdata/telegraf/blob/master/CONTRIBUTING.md#adding-a-dependency) -1. Write a README for your plugin, if it's an input plugin, it should be structured -like the [input example here](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/EXAMPLE_README.md). -Output plugins READMEs are less structured, -but any information you can provide on how the data will look is appreciated. -See the [OpenTSDB output](https://github.com/influxdata/telegraf/tree/master/plugins/outputs/opentsdb) -for a good example. -1. **Optional:** Help users of your plugin by including example queries for populating dashboards. Include these sample queries in the `README.md` for the plugin. -1. **Optional:** Write a [tickscript](https://docs.influxdata.com/kapacitor/v1.0/tick/syntax/) for your plugin and add it to [Kapacitor](https://github.com/influxdata/kapacitor/tree/master/examples/telegraf). +1. [Sign the CLA][cla]. +1. Open a [new issue][] to discuss the changes you would like to make. This is + not strictly required but it may help reduce the amount of rework you need + to do later. +1. Make changes or write plugin using the guidelines in the following + documents: + - [Input Plugins][inputs] + - [Processor Plugins][processors] + - [Aggregator Plugins][aggregators] + - [Output Plugins][outputs] +1. Ensure you have added proper unit tests and documentation. +1. Open a new [pull request][]. 
-## GoDoc +### GoDoc Public interfaces for inputs, outputs, processors, aggregators, metrics, -and the accumulator can be found on the GoDoc +and the accumulator can be found in the GoDoc: [![GoDoc](https://godoc.org/github.com/influxdata/telegraf?status.svg)](https://godoc.org/github.com/influxdata/telegraf) -## Sign the CLA +### Common development tasks -Before we can merge a pull request, you will need to sign the CLA, -which can be found [on our website](http://influxdb.com/community/cla.html) - -## Adding a dependency +**Adding a dependency:** Assuming you can already build the project, run these in the telegraf directory: 1. `dep ensure -vendor-only` 2. `dep ensure -add github.com/[dependency]/[new-package]` -## Input Plugins - -This section is for developers who want to create new collection inputs. -Telegraf is entirely plugin driven. This interface allows for operators to -pick and chose what is gathered and makes it easy for developers -to create new ways of generating metrics. - -Plugin authorship is kept as simple as possible to promote people to develop -and submit new inputs. - -### Input Plugin Guidelines - -* A plugin must conform to the [`telegraf.Input`](https://godoc.org/github.com/influxdata/telegraf#Input) interface. -* Input Plugins should call `inputs.Add` in their `init` function to register themselves. -See below for a quick example. -* Input Plugins must be added to the -`github.com/influxdata/telegraf/plugins/inputs/all/all.go` file. -* The `SampleConfig` function should return valid toml that describes how the -plugin can be configured. This is included in `telegraf config`. Please -consult the [SampleConfig](https://github.com/influxdata/telegraf/wiki/SampleConfig) -page for the latest style guidelines. -* The `Description` function should say in one line what this plugin does. - -Let's say you've written a plugin that emits metrics about processes on the -current host. 
- -### Input Plugin Example - -```go -package simple - -// simple.go - -import ( - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/plugins/inputs" -) - -type Simple struct { - Ok bool -} - -func (s *Simple) Description() string { - return "a demo plugin" -} - -func (s *Simple) SampleConfig() string { - return ` - ## Indicate if everything is fine - ok = true -` -} - -func (s *Simple) Gather(acc telegraf.Accumulator) error { - if s.Ok { - acc.AddFields("state", map[string]interface{}{"value": "pretty good"}, nil) - } else { - acc.AddFields("state", map[string]interface{}{"value": "not great"}, nil) - } - - return nil -} - -func init() { - inputs.Add("simple", func() telegraf.Input { return &Simple{} }) -} -``` - -### Input Plugin Development - -* Run `make static` followed by `make plugin-[pluginName]` to spin up a docker dev environment -using docker-compose. -* ***[Optional]*** When developing a plugin, add a `dev` directory with a `docker-compose.yml` and `telegraf.conf` -as well as any other supporting files, where sensible. - -## Adding Typed Metrics - -In addition the the `AddFields` function, the accumulator also supports an -`AddGauge` and `AddCounter` function. These functions are for adding _typed_ -metrics. Metric types are ignored for the InfluxDB output, but can be used -for other outputs, such as [prometheus](https://prometheus.io/docs/concepts/metric_types/). - -## Input Plugins Accepting Arbitrary Data Formats - -Some input plugins (such as -[exec](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/exec)) -accept arbitrary input data formats. An overview of these data formats can -be found -[here](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md). - -In order to enable this, you must specify a `SetParser(parser parsers.Parser)` -function on the plugin object (see the exec plugin for an example), as well as -defining `parser` as a field of the object. 
- -You can then utilize the parser internally in your plugin, parsing data as you -see fit. Telegraf's configuration layer will take care of instantiating and -creating the `Parser` object. - -You should also add the following to your SampleConfig() return: - -```toml - ## Data format to consume. - ## Each data format has its own unique set of configuration options, read - ## more about them here: - ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md - data_format = "influx" -``` - -Below is the `Parser` interface. - -```go -// Parser is an interface defining functions that a parser plugin must satisfy. -type Parser interface { - // Parse takes a byte buffer separated by newlines - // ie, `cpu.usage.idle 90\ncpu.usage.busy 10` - // and parses it into telegraf metrics - Parse(buf []byte) ([]telegraf.Metric, error) - - // ParseLine takes a single string metric - // ie, "cpu.usage.idle 90" - // and parses it into a telegraf metric. - ParseLine(line string) (telegraf.Metric, error) -} -``` - -And you can view the code -[here.](https://github.com/influxdata/telegraf/blob/henrypfhu-master/plugins/parsers/registry.go) - -## Service Input Plugins - -This section is for developers who want to create new "service" collection -inputs. A service plugin differs from a regular plugin in that it operates -a background service while Telegraf is running. One example would be the `statsd` -plugin, which operates a statsd server. - -Service Input Plugins are substantially more complicated than a regular plugin, as they -will require threads and locks to verify data integrity. Service Input Plugins should -be avoided unless there is no way to create their behavior with a regular plugin. - -Their interface is quite similar to a regular plugin, with the addition of `Start()` -and `Stop()` methods. 
- -### Service Plugin Guidelines - -* Same as the `Plugin` guidelines, except that they must conform to the -[`telegraf.ServiceInput`](https://godoc.org/github.com/influxdata/telegraf#ServiceInput) interface. - -## Output Plugins - -This section is for developers who want to create a new output sink. Outputs -are created in a similar manner as collection plugins, and their interface has -similar constructs. - -### Output Plugin Guidelines - -* An output must conform to the [`telegraf.Output`](https://godoc.org/github.com/influxdata/telegraf#Output) interface. -* Outputs should call `outputs.Add` in their `init` function to register themselves. -See below for a quick example. -* To be available within Telegraf itself, plugins must add themselves to the -`github.com/influxdata/telegraf/plugins/outputs/all/all.go` file. -* The `SampleConfig` function should return valid toml that describes how the -plugin can be configured. This is included in `telegraf config`. Please -consult the [SampleConfig](https://github.com/influxdata/telegraf/wiki/SampleConfig) -page for the latest style guidelines. -* The `Description` function should say in one line what this output does. 
- -### Output Example - -```go -package simpleoutput - -// simpleoutput.go - -import ( - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/plugins/outputs" -) - -type Simple struct { - Ok bool -} - -func (s *Simple) Description() string { - return "a demo output" -} - -func (s *Simple) SampleConfig() string { - return ` - ok = true -` -} - -func (s *Simple) Connect() error { - // Make a connection to the URL here - return nil -} - -func (s *Simple) Close() error { - // Close connection to the URL here - return nil -} - -func (s *Simple) Write(metrics []telegraf.Metric) error { - for _, metric := range metrics { - // write `metric` to the output sink here - } - return nil -} - -func init() { - outputs.Add("simpleoutput", func() telegraf.Output { return &Simple{} }) -} - -``` - -## Output Plugins Writing Arbitrary Data Formats - -Some output plugins (such as -[file](https://github.com/influxdata/telegraf/tree/master/plugins/outputs/file)) -can write arbitrary output data formats. An overview of these data formats can -be found -[here](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md). - -In order to enable this, you must specify a -`SetSerializer(serializer serializers.Serializer)` -function on the plugin object (see the file plugin for an example), as well as -defining `serializer` as a field of the object. - -You can then utilize the serializer internally in your plugin, serializing data -before it's written. Telegraf's configuration layer will take care of -instantiating and creating the `Serializer` object. - -You should also add the following to your SampleConfig() return: - -```toml - ## Data format to output. 
- ## Each data format has its own unique set of configuration options, read - ## more about them here: - ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md - data_format = "influx" -``` - -## Service Output Plugins - -This section is for developers who want to create new "service" output. A -service output differs from a regular output in that it operates a background service -while Telegraf is running. One example would be the `prometheus_client` output, -which operates an HTTP server. - -Their interface is quite similar to a regular output, with the addition of `Start()` -and `Stop()` methods. - -### Service Output Guidelines - -* Same as the `Output` guidelines, except that they must conform to the -`output.ServiceOutput` interface. - -## Processor Plugins - -This section is for developers who want to create a new processor plugin. - -### Processor Plugin Guidelines - -* A processor must conform to the [`telegraf.Processor`](https://godoc.org/github.com/influxdata/telegraf#Processor) interface. -* Processors should call `processors.Add` in their `init` function to register themselves. -See below for a quick example. -* To be available within Telegraf itself, plugins must add themselves to the -`github.com/influxdata/telegraf/plugins/processors/all/all.go` file. -* The `SampleConfig` function should return valid toml that describes how the -processor can be configured. This is include in the output of `telegraf config`. -* The `Description` function should say in one line what this processor does. - -### Processor Example - -```go -package printer - -// printer.go - -import ( - "fmt" - - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/plugins/processors" -) - -type Printer struct { -} - -var sampleConfig = ` -` - -func (p *Printer) SampleConfig() string { - return sampleConfig -} - -func (p *Printer) Description() string { - return "Print all metrics that pass through this filter." 
-} - -func (p *Printer) Apply(in ...telegraf.Metric) []telegraf.Metric { - for _, metric := range in { - fmt.Println(metric.String()) - } - return in -} - -func init() { - processors.Add("printer", func() telegraf.Processor { - return &Printer{} - }) -} -``` - -## Aggregator Plugins - -This section is for developers who want to create a new aggregator plugin. - -### Aggregator Plugin Guidelines - -* A aggregator must conform to the [`telegraf.Aggregator`](https://godoc.org/github.com/influxdata/telegraf#Aggregator) interface. -* Aggregators should call `aggregators.Add` in their `init` function to register themselves. -See below for a quick example. -* To be available within Telegraf itself, plugins must add themselves to the -`github.com/influxdata/telegraf/plugins/aggregators/all/all.go` file. -* The `SampleConfig` function should return valid toml that describes how the -aggregator can be configured. This is include in `telegraf config`. -* The `Description` function should say in one line what this aggregator does. -* The Aggregator plugin will need to keep caches of metrics that have passed -through it. This should be done using the builtin `HashID()` function of each -metric. -* When the `Reset()` function is called, all caches should be cleared. - -### Aggregator Example - -```go -package min - -// min.go - -import ( - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/plugins/aggregators" -) - -type Min struct { - // caches for metric fields, names, and tags - fieldCache map[uint64]map[string]float64 - nameCache map[uint64]string - tagCache map[uint64]map[string]string -} - -func NewMin() telegraf.Aggregator { - m := &Min{} - m.Reset() - return m -} - -var sampleConfig = ` - ## period is the flush & clear interval of the aggregator. - period = "30s" - ## If true drop_original will drop the original metrics and - ## only send aggregates. 
- drop_original = false -` - -func (m *Min) SampleConfig() string { - return sampleConfig -} - -func (m *Min) Description() string { - return "Keep the aggregate min of each metric passing through." -} - -func (m *Min) Add(in telegraf.Metric) { - id := in.HashID() - if _, ok := m.nameCache[id]; !ok { - // hit an uncached metric, create caches for first time: - m.nameCache[id] = in.Name() - m.tagCache[id] = in.Tags() - m.fieldCache[id] = make(map[string]float64) - for k, v := range in.Fields() { - if fv, ok := convert(v); ok { - m.fieldCache[id][k] = fv - } - } - } else { - for k, v := range in.Fields() { - if fv, ok := convert(v); ok { - if _, ok := m.fieldCache[id][k]; !ok { - // hit an uncached field of a cached metric - m.fieldCache[id][k] = fv - continue - } - if fv < m.fieldCache[id][k] { - // set new minimum - m.fieldCache[id][k] = fv - } - } - } - } -} - -func (m *Min) Push(acc telegraf.Accumulator) { - for id, _ := range m.nameCache { - fields := map[string]interface{}{} - for k, v := range m.fieldCache[id] { - fields[k+"_min"] = v - } - acc.AddFields(m.nameCache[id], fields, m.tagCache[id]) - } -} - -func (m *Min) Reset() { - m.fieldCache = make(map[uint64]map[string]float64) - m.nameCache = make(map[uint64]string) - m.tagCache = make(map[uint64]map[string]string) -} - -func convert(in interface{}) (float64, bool) { - switch v := in.(type) { - case float64: - return v, true - case int64: - return float64(v), true - default: - return 0, false - } -} - -func init() { - aggregators.Add("min", func() telegraf.Aggregator { - return NewMin() - }) -} -``` - -## Unit Tests +**Unit Tests:** Before opening a pull request you should run the linter checks and the short tests. 
-### Execute linter +**Run static analysis:** -execute `make check` +``` +make check +``` -### Execute short tests +**Run short tests:** -execute `make test` +``` +make test +``` -### Execute integration tests +**Execute integration tests:** Running the integration tests requires several docker containers to be running. You can start the containers with: @@ -497,3 +60,12 @@ make test-all ``` Use `make docker-kill` to stop the containers. + + +[cla]: https://www.influxdata.com/legal/cla/ +[new issue]: https://github.com/influxdata/telegraf/issues/new/choose +[pull request]: https://github.com/influxdata/telegraf/compare +[inputs]: /docs/INPUTS.md +[processors]: /docs/PROCESSORS.md +[aggregators]: /docs/AGGREGATORS.md +[outputs]: /docs/OUTPUTS.md diff --git a/accumulator.go b/accumulator.go index 370f0c70c..825455c4c 100644 --- a/accumulator.go +++ b/accumulator.go @@ -1,16 +1,14 @@ package telegraf -import "time" +import ( + "time" +) -// Accumulator is an interface for "accumulating" metrics from plugin(s). -// The metrics are sent down a channel shared between all plugins. +// Accumulator allows adding metrics to the processing flow. type Accumulator interface { // AddFields adds a metric to the accumulator with the given measurement // name, fields, and tags (and timestamp). If a timestamp is not provided, // then the accumulator sets it to "now". - // Create a point with a value, decorating it with tags - // NOTE: tags is expected to be owned by the caller, don't mutate - // it after passing to Add. AddFields(measurement string, fields map[string]interface{}, tags map[string]string, @@ -40,7 +38,49 @@ type Accumulator interface { tags map[string]string, t ...time.Time) + // AddMetric adds a metric to the accumulator. + AddMetric(Metric) + + // SetPrecision takes two time.Duration objects. If the first is non-zero, + // it sets that as the precision. 
Otherwise, it takes the second argument + // as the order of time that the metrics should be rounded to, with the + // maximum being 1s. SetPrecision(precision, interval time.Duration) + // Report an error. AddError(err error) + + // Upgrade to a TrackingAccumulator with space for maxTracked + // metrics/batches. + WithTracking(maxTracked int) TrackingAccumulator +} + +// TrackingID uniquely identifies a tracked metric group. +type TrackingID uint64 + +// DeliveryInfo provides the results of a delivered metric group. +type DeliveryInfo interface { + // ID is the TrackingID + ID() TrackingID + + // Delivered returns true if the metric was processed successfully. + Delivered() bool +} + +// TrackingAccumulator is an Accumulator that provides a signal when the +// metric has been fully processed. Sending more metrics than the accumulator +// has been allocated for without reading status from the Delivered +// channel is an error. +type TrackingAccumulator interface { + Accumulator + + // Add the Metric and arrange for tracking feedback after processing. + AddTrackingMetric(m Metric) TrackingID + + // Add a group of Metrics and arrange for a signal when the group has been + // processed. + AddTrackingMetricGroup(group []Metric) TrackingID + + // Delivered returns a channel that will contain the tracking results. 
+ Delivered() <-chan DeliveryInfo } diff --git a/agent/accumulator.go b/agent/accumulator.go index 05e99350b..c29b521e9 100644 --- a/agent/accumulator.go +++ b/agent/accumulator.go @@ -20,13 +20,13 @@ type MetricMaker interface { type accumulator struct { maker MetricMaker - metrics chan telegraf.Metric + metrics chan<- telegraf.Metric precision time.Duration } func NewAccumulator( maker MetricMaker, - metrics chan telegraf.Metric, + metrics chan<- telegraf.Metric, ) telegraf.Accumulator { acc := accumulator{ maker: maker, @@ -42,7 +42,7 @@ func (ac *accumulator) AddFields( tags map[string]string, t ...time.Time, ) { - ac.addMetric(measurement, tags, fields, telegraf.Untyped, t...) + ac.addFields(measurement, tags, fields, telegraf.Untyped, t...) } func (ac *accumulator) AddGauge( @@ -51,7 +51,7 @@ func (ac *accumulator) AddGauge( tags map[string]string, t ...time.Time, ) { - ac.addMetric(measurement, tags, fields, telegraf.Gauge, t...) + ac.addFields(measurement, tags, fields, telegraf.Gauge, t...) } func (ac *accumulator) AddCounter( @@ -60,7 +60,7 @@ func (ac *accumulator) AddCounter( tags map[string]string, t ...time.Time, ) { - ac.addMetric(measurement, tags, fields, telegraf.Counter, t...) + ac.addFields(measurement, tags, fields, telegraf.Counter, t...) } func (ac *accumulator) AddSummary( @@ -69,7 +69,7 @@ func (ac *accumulator) AddSummary( tags map[string]string, t ...time.Time, ) { - ac.addMetric(measurement, tags, fields, telegraf.Summary, t...) + ac.addFields(measurement, tags, fields, telegraf.Summary, t...) } func (ac *accumulator) AddHistogram( @@ -78,10 +78,16 @@ func (ac *accumulator) AddHistogram( tags map[string]string, t ...time.Time, ) { - ac.addMetric(measurement, tags, fields, telegraf.Histogram, t...) + ac.addFields(measurement, tags, fields, telegraf.Histogram, t...) 
} -func (ac *accumulator) addMetric( +func (ac *accumulator) AddMetric(m telegraf.Metric) { + if m := ac.maker.MakeMetric(m); m != nil { + ac.metrics <- m + } +} + +func (ac *accumulator) addFields( measurement string, tags map[string]string, fields map[string]interface{}, @@ -104,13 +110,9 @@ func (ac *accumulator) AddError(err error) { return } NErrors.Incr(1) - log.Printf("E! Error in plugin [%s]: %s", ac.maker.Name(), err) + log.Printf("E! [%s]: Error in plugin: %v", ac.maker.Name(), err) } -// SetPrecision takes two time.Duration objects. If the first is non-zero, -// it sets that as the precision. Otherwise, it takes the second argument -// as the order of time that the metrics should be rounded to, with the -// maximum being 1s. func (ac *accumulator) SetPrecision(precision, interval time.Duration) { if precision > 0 { ac.precision = precision @@ -128,7 +130,7 @@ func (ac *accumulator) SetPrecision(precision, interval time.Duration) { } } -func (ac accumulator) getTime(t []time.Time) time.Time { +func (ac *accumulator) getTime(t []time.Time) time.Time { var timestamp time.Time if len(t) > 0 { timestamp = t[0] @@ -137,3 +139,43 @@ func (ac accumulator) getTime(t []time.Time) time.Time { } return timestamp.Round(ac.precision) } + +func (ac *accumulator) WithTracking(maxTracked int) telegraf.TrackingAccumulator { + return &trackingAccumulator{ + Accumulator: ac, + delivered: make(chan telegraf.DeliveryInfo, maxTracked), + } +} + +type trackingAccumulator struct { + telegraf.Accumulator + delivered chan telegraf.DeliveryInfo +} + +func (a *trackingAccumulator) AddTrackingMetric(m telegraf.Metric) telegraf.TrackingID { + dm, id := metric.WithTracking(m, a.onDelivery) + a.AddMetric(dm) + return id +} + +func (a *trackingAccumulator) AddTrackingMetricGroup(group []telegraf.Metric) telegraf.TrackingID { + db, id := metric.WithGroupTracking(group, a.onDelivery) + for _, m := range db { + a.AddMetric(m) + } + return id +} + +func (a *trackingAccumulator) Delivered() 
<-chan telegraf.DeliveryInfo { + return a.delivered +} + +func (a *trackingAccumulator) onDelivery(info telegraf.DeliveryInfo) { + select { + case a.delivered <- info: + default: + // This is a programming error in the input. More items were sent for + // tracking than space requested. + panic("channel is full") + } +} diff --git a/agent/agent.go b/agent/agent.go index 6f7b540f2..d8875e447 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1,9 +1,9 @@ package agent import ( + "context" "fmt" "log" - "os" "runtime" "sync" "time" @@ -12,187 +12,157 @@ import ( "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/internal/config" "github.com/influxdata/telegraf/internal/models" - "github.com/influxdata/telegraf/selfstat" + "github.com/influxdata/telegraf/plugins/serializers/influx" ) -// Agent runs telegraf and collects data based on the given config +// Agent runs a set of plugins. type Agent struct { Config *config.Config } -// NewAgent returns an Agent struct based off the given Config +// NewAgent returns an Agent for the given Config. func NewAgent(config *config.Config) (*Agent, error) { a := &Agent{ Config: config, } - - if !a.Config.Agent.OmitHostname { - if a.Config.Agent.Hostname == "" { - hostname, err := os.Hostname() - if err != nil { - return nil, err - } - - a.Config.Agent.Hostname = hostname - } - - config.Tags["host"] = a.Config.Agent.Hostname - } - return a, nil } -// Connect connects to all configured outputs -func (a *Agent) Connect() error { - for _, o := range a.Config.Outputs { - switch ot := o.Output.(type) { - case telegraf.ServiceOutput: - if err := ot.Start(); err != nil { - log.Printf("E! Service for output %s failed to start, exiting\n%s\n", - o.Name, err.Error()) - return err - } +// Run starts and runs the Agent until the context is done. +func (a *Agent) Run(ctx context.Context) error { + log.Printf("I! 
[agent] Config: Interval:%s, Quiet:%#v, Hostname:%#v, "+ + "Flush Interval:%s", + a.Config.Agent.Interval.Duration, a.Config.Agent.Quiet, + a.Config.Agent.Hostname, a.Config.Agent.FlushInterval.Duration) + + if ctx.Err() != nil { + return ctx.Err() + } + + log.Printf("D! [agent] Connecting outputs") + err := a.connectOutputs(ctx) + if err != nil { + return err + } + + inputC := make(chan telegraf.Metric, 100) + procC := make(chan telegraf.Metric, 100) + outputC := make(chan telegraf.Metric, 100) + + startTime := time.Now() + + log.Printf("D! [agent] Starting service inputs") + err = a.startServiceInputs(ctx, inputC) + if err != nil { + return err + } + + var wg sync.WaitGroup + + src := inputC + dst := inputC + + wg.Add(1) + go func(dst chan telegraf.Metric) { + defer wg.Done() + + err := a.runInputs(ctx, startTime, dst) + if err != nil { + log.Printf("E! [agent] Error running inputs: %v", err) } - log.Printf("D! Attempting connection to output: %s\n", o.Name) - err := o.Output.Connect() - if err != nil { - log.Printf("E! Failed to connect to output %s, retrying in 15s, "+ - "error was '%s' \n", o.Name, err) - time.Sleep(15 * time.Second) - err = o.Output.Connect() + log.Printf("D! [agent] Stopping service inputs") + a.stopServiceInputs() + + close(dst) + log.Printf("D! [agent] Input channel closed") + }(dst) + + src = dst + + if len(a.Config.Processors) > 0 { + dst = procC + + wg.Add(1) + go func(src, dst chan telegraf.Metric) { + defer wg.Done() + + err := a.runProcessors(src, dst) if err != nil { - return err + log.Printf("E! [agent] Error running processors: %v", err) } - } - log.Printf("D! Successfully connected to output: %s\n", o.Name) + close(dst) + log.Printf("D! [agent] Processor channel closed") + }(src, dst) + + src = dst } + + if len(a.Config.Aggregators) > 0 { + dst = outputC + + wg.Add(1) + go func(src, dst chan telegraf.Metric) { + defer wg.Done() + + err := a.runAggregators(startTime, src, dst) + if err != nil { + log.Printf("E! 
[agent] Error running aggregators: %v", err) + } + close(dst) + log.Printf("D! [agent] Output channel closed") + }(src, dst) + + src = dst + } + + wg.Add(1) + go func(src chan telegraf.Metric) { + defer wg.Done() + + err := a.runOutputs(startTime, src) + if err != nil { + log.Printf("E! [agent] Error running outputs: %v", err) + } + }(src) + + wg.Wait() + + log.Printf("D! [agent] Closing outputs") + err = a.closeOutputs() + if err != nil { + return err + } + return nil } -// Close closes the connection to all configured outputs -func (a *Agent) Close() error { - var err error - for _, o := range a.Config.Outputs { - err = o.Output.Close() - switch ot := o.Output.(type) { - case telegraf.ServiceOutput: - ot.Stop() - } - } - return err -} - -func panicRecover(input *models.RunningInput) { - if err := recover(); err != nil { - trace := make([]byte, 2048) - runtime.Stack(trace, true) - log.Printf("E! FATAL: Input [%s] panicked: %s, Stack:\n%s\n", - input.Name(), err, trace) - log.Println("E! PLEASE REPORT THIS PANIC ON GITHUB with " + - "stack trace, configuration, and OS information: " + - "https://github.com/influxdata/telegraf/issues/new") - } -} - -// gatherer runs the inputs that have been configured with their own -// reporting interval. 
-func (a *Agent) gatherer( - shutdown chan struct{}, - input *models.RunningInput, - interval time.Duration, - metricC chan telegraf.Metric, -) { - defer panicRecover(input) - - GatherTime := selfstat.RegisterTiming("gather", - "gather_time_ns", - map[string]string{"input": input.Config.Name}, - ) - - acc := NewAccumulator(input, metricC) - acc.SetPrecision(a.Config.Agent.Precision.Duration, - a.Config.Agent.Interval.Duration) - - ticker := time.NewTicker(interval) - defer ticker.Stop() - - for { - internal.RandomSleep(a.Config.Agent.CollectionJitter.Duration, shutdown) - - start := time.Now() - gatherWithTimeout(shutdown, input, acc, interval) - elapsed := time.Since(start) - - GatherTime.Incr(elapsed.Nanoseconds()) - - select { - case <-shutdown: - return - case <-ticker.C: - continue - } - } -} - -// gatherWithTimeout gathers from the given input, with the given timeout. -// when the given timeout is reached, gatherWithTimeout logs an error message -// but continues waiting for it to return. This is to avoid leaving behind -// hung processes, and to prevent re-calling the same hung process over and -// over. -func gatherWithTimeout( - shutdown chan struct{}, - input *models.RunningInput, - acc telegraf.Accumulator, - timeout time.Duration, -) { - ticker := time.NewTicker(timeout) - defer ticker.Stop() - done := make(chan error) - go func() { - done <- input.Input.Gather(acc) +// Test runs the inputs once and prints the output to stdout in line protocol. 
+func (a *Agent) Test() error { + var wg sync.WaitGroup + metricC := make(chan telegraf.Metric) + defer func() { + close(metricC) + wg.Wait() }() - for { - select { - case err := <-done: - if err != nil { - acc.AddError(err) - } - return - case <-ticker.C: - err := fmt.Errorf("took longer to collect than collection interval (%s)", - timeout) - acc.AddError(err) - continue - case <-shutdown: - return - } - } -} - -// Test verifies that we can 'Gather' from all inputs with their configured -// Config struct -func (a *Agent) Test() error { - shutdown := make(chan struct{}) - defer close(shutdown) - metricC := make(chan telegraf.Metric) - - // dummy receiver for the point channel + wg.Add(1) go func() { - for { - select { - case <-metricC: - // do nothing - case <-shutdown: - return + defer wg.Done() + + s := influx.NewSerializer() + s.SetFieldSortOrder(influx.SortFields) + for metric := range metricC { + octets, err := s.Serialize(metric) + if err == nil { + fmt.Print("> ", string(octets)) } } }() for _, input := range a.Config.Inputs { if _, ok := input.Input.(telegraf.ServiceInput); ok { - fmt.Printf("\nWARNING: skipping plugin [[%s]]: service inputs not supported in --test mode\n", + log.Printf("W!: [agent] skipping plugin [[%s]]: service inputs not supported in --test mode", input.Name()) continue } @@ -200,7 +170,6 @@ func (a *Agent) Test() error { acc := NewAccumulator(input, metricC) acc.SetPrecision(a.Config.Agent.Precision.Duration, a.Config.Agent.Interval.Duration) - input.SetTrace(true) input.SetDefaultTags(a.Config.Tags) if err := input.Input.Gather(acc); err != nil { @@ -218,216 +187,445 @@ func (a *Agent) Test() error { } } + return nil } -// flush writes a list of metrics to all configured outputs -func (a *Agent) flush() { - var wg sync.WaitGroup - - wg.Add(len(a.Config.Outputs)) - for _, o := range a.Config.Outputs { - go func(output *models.RunningOutput) { - defer wg.Done() - err := output.Write() - if err != nil { - log.Printf("E! 
Error writing to output [%s]: %s\n", - output.Name, err.Error()) - } - }(o) - } - - wg.Wait() -} - -// flusher monitors the metrics input channel and flushes on the minimum interval -func (a *Agent) flusher( - shutdown chan struct{}, - metricC chan telegraf.Metric, - aggMetricC chan telegraf.Metric, - outMetricC chan telegraf.Metric, +// runInputs starts and triggers the periodic gather for Inputs. +// +// When the context is done the timers are stopped and this function returns +// after all ongoing Gather calls complete. +func (a *Agent) runInputs( + ctx context.Context, + startTime time.Time, + dst chan<- telegraf.Metric, ) error { var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - for { - select { - case <-shutdown: - if len(outMetricC) > 0 { - // keep going until channel is empty - continue - } - return - case metric := <-outMetricC: - for i, o := range a.Config.Outputs { - if i == len(a.Config.Outputs)-1 { - o.AddMetric(metric) - } else { - o.AddMetric(metric.Copy()) - } - } - } - } - }() - - wg.Add(1) - go func() { - defer wg.Done() - for metric := range aggMetricC { - // Apply Processors - metrics := []telegraf.Metric{metric} - for _, processor := range a.Config.Processors { - metrics = processor.Apply(metrics...) - } - outMetricC <- metric - } - }() - - wg.Add(1) - go func() { - defer wg.Done() - for { - select { - case <-shutdown: - if len(metricC) > 0 { - // keep going until channel is empty - continue - } - close(aggMetricC) - return - case metric := <-metricC: - // Apply Processors - metrics := []telegraf.Metric{metric} - for _, processor := range a.Config.Processors { - metrics = processor.Apply(metrics...) 
- } - - for _, metric := range metrics { - // Apply Aggregators - var dropOriginal bool - for _, agg := range a.Config.Aggregators { - if ok := agg.Add(metric.Copy()); ok { - dropOriginal = true - } - } - - // Forward metric to Outputs - if !dropOriginal { - outMetricC <- metric - } - } - } - } - }() - - ticker := time.NewTicker(a.Config.Agent.FlushInterval.Duration) - semaphore := make(chan struct{}, 1) - for { - select { - case <-shutdown: - log.Println("I! Hang on, flushing any cached metrics before shutdown") - // wait for outMetricC to get flushed before flushing outputs - wg.Wait() - a.flush() - return nil - case <-ticker.C: - go func() { - select { - case semaphore <- struct{}{}: - internal.RandomSleep(a.Config.Agent.FlushJitter.Duration, shutdown) - a.flush() - <-semaphore - default: - // skipping this flush because one is already happening - log.Println("W! Skipping a scheduled flush because there is" + - " already a flush ongoing.") - } - }() - } - } -} - -// Run runs the agent daemon, gathering every Interval -func (a *Agent) Run(shutdown chan struct{}) error { - var wg sync.WaitGroup - - log.Printf("I! Agent Config: Interval:%s, Quiet:%#v, Hostname:%#v, "+ - "Flush Interval:%s \n", - a.Config.Agent.Interval.Duration, a.Config.Agent.Quiet, - a.Config.Agent.Hostname, a.Config.Agent.FlushInterval.Duration) - - // Channel shared between all input threads for accumulating metrics - metricC := make(chan telegraf.Metric, 100) - - // Channel for metrics ready to be output - outMetricC := make(chan telegraf.Metric, 100) - - // Channel for aggregated metrics - aggMetricC := make(chan telegraf.Metric, 100) - - // Round collection to nearest interval by sleeping - if a.Config.Agent.RoundInterval { - i := int64(a.Config.Agent.Interval.Duration) - time.Sleep(time.Duration(i - (time.Now().UnixNano() % i))) - } - - wg.Add(1) - go func() { - defer wg.Done() - if err := a.flusher(shutdown, metricC, aggMetricC, outMetricC); err != nil { - log.Printf("E! 
Flusher routine failed, exiting: %s\n", err.Error()) - close(shutdown) - } - }() - - wg.Add(len(a.Config.Aggregators)) - for _, aggregator := range a.Config.Aggregators { - go func(agg *models.RunningAggregator) { - defer wg.Done() - acc := NewAccumulator(agg, aggMetricC) - acc.SetPrecision(a.Config.Agent.Precision.Duration, - a.Config.Agent.Interval.Duration) - agg.Run(acc, shutdown) - }(aggregator) - } - - // Service inputs may immediately add metrics, if metrics are added before - // the aggregator starts they will be dropped. Generally this occurs - // only during testing but it is an outstanding issue. - // - // https://github.com/influxdata/telegraf/issues/4394 - for _, input := range a.Config.Inputs { - input.SetDefaultTags(a.Config.Tags) - switch p := input.Input.(type) { - case telegraf.ServiceInput: - acc := NewAccumulator(input, metricC) - // Service input plugins should set their own precision of their - // metrics. - acc.SetPrecision(time.Nanosecond, 0) - if err := p.Start(acc); err != nil { - log.Printf("E! Service for input %s failed to start, exiting\n%s\n", - input.Name(), err.Error()) - return err - } - defer p.Stop() - } - } - - wg.Add(len(a.Config.Inputs)) for _, input := range a.Config.Inputs { interval := a.Config.Agent.Interval.Duration - // overwrite global interval if this plugin has it's own. + precision := a.Config.Agent.Precision.Duration + jitter := a.Config.Agent.CollectionJitter.Duration + + // Overwrite agent interval if this plugin has its own. 
if input.Config.Interval != 0 { interval = input.Config.Interval } - go func(in *models.RunningInput, interv time.Duration) { + + acc := NewAccumulator(input, dst) + acc.SetPrecision(precision, interval) + + wg.Add(1) + go func(input *models.RunningInput) { defer wg.Done() - a.gatherer(shutdown, in, interv, metricC) - }(input, interval) + + if a.Config.Agent.RoundInterval { + err := internal.SleepContext( + ctx, internal.AlignDuration(startTime, interval)) + if err != nil { + return + } + } + + a.gatherOnInterval(ctx, acc, input, interval, jitter) + }(input) + } + wg.Wait() + + return nil +} + +// gather runs an input's gather function periodically until the context is +// done. +func (a *Agent) gatherOnInterval( + ctx context.Context, + acc telegraf.Accumulator, + input *models.RunningInput, + interval time.Duration, + jitter time.Duration, +) { + defer panicRecover(input) + + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + err := internal.SleepContext(ctx, internal.RandomDuration(jitter)) + if err != nil { + return + } + + err = a.gatherOnce(acc, input, interval) + if err != nil { + acc.AddError(err) + } + + select { + case <-ticker.C: + continue + case <-ctx.Done(): + return + } + } +} + +// gatherOnce runs the input's Gather function once, logging a warning each +// interval it fails to complete before. +func (a *Agent) gatherOnce( + acc telegraf.Accumulator, + input *models.RunningInput, + timeout time.Duration, +) error { + ticker := time.NewTicker(timeout) + defer ticker.Stop() + + done := make(chan error) + go func() { + done <- input.Gather(acc) + }() + + for { + select { + case err := <-done: + return err + case <-ticker.C: + log.Printf("W! [agent] input %q did not complete within its interval", + input.Name()) + } + } +} + +// runProcessors applies processors to metrics. 
+func (a *Agent) runProcessors( + src <-chan telegraf.Metric, + agg chan<- telegraf.Metric, +) error { + for metric := range src { + metrics := a.applyProcessors(metric) + + for _, metric := range metrics { + agg <- metric + } + } + + return nil +} + +// applyProcessors applies all processors to a metric. +func (a *Agent) applyProcessors(m telegraf.Metric) []telegraf.Metric { + metrics := []telegraf.Metric{m} + for _, processor := range a.Config.Processors { + metrics = processor.Apply(metrics...) + } + + return metrics +} + +// runAggregators triggers the periodic push for Aggregators. +// +// When the context is done a final push will occur and then this function +// will return. +func (a *Agent) runAggregators( + startTime time.Time, + src <-chan telegraf.Metric, + dst chan<- telegraf.Metric, +) error { + ctx, cancel := context.WithCancel(context.Background()) + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + for metric := range src { + var dropOriginal bool + for _, agg := range a.Config.Aggregators { + if ok := agg.Add(metric); ok { + dropOriginal = true + } + } + + if !dropOriginal { + dst <- metric + } + } + cancel() + }() + + precision := a.Config.Agent.Precision.Duration + interval := a.Config.Agent.Interval.Duration + aggregations := make(chan telegraf.Metric, 100) + for _, agg := range a.Config.Aggregators { + wg.Add(1) + go func(agg *models.RunningAggregator) { + defer wg.Done() + + if a.Config.Agent.RoundInterval { + // Aggregators are aligned to the agent interval regardless of + // their period. 
+ err := internal.SleepContext(ctx, internal.AlignDuration(startTime, interval)) + if err != nil { + return + } + } + + agg.SetPeriodStart(startTime) + + acc := NewAccumulator(agg, aggregations) + acc.SetPrecision(precision, interval) + a.push(ctx, agg, acc) + close(aggregations) + }(agg) + } + + for metric := range aggregations { + metrics := a.applyProcessors(metric) + for _, metric := range metrics { + dst <- metric + } } wg.Wait() - a.Close() return nil } + +// push runs the push for a single aggregator every period. More simple than +// the output/input version as timeout should be less likely.... not really +// because the output channel can block for now. +func (a *Agent) push( + ctx context.Context, + aggregator *models.RunningAggregator, + acc telegraf.Accumulator, +) { + ticker := time.NewTicker(aggregator.Period()) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + break + case <-ctx.Done(): + aggregator.Push(acc) + return + } + + aggregator.Push(acc) + } +} + +// runOutputs triggers the periodic write for Outputs. +// +// When the context is done, outputs continue to run until their buffer is +// closed, after which they run flush once more. +func (a *Agent) runOutputs( + startTime time.Time, + src <-chan telegraf.Metric, +) error { + interval := a.Config.Agent.FlushInterval.Duration + jitter := a.Config.Agent.FlushJitter.Duration + + ctx, cancel := context.WithCancel(context.Background()) + + var wg sync.WaitGroup + for _, output := range a.Config.Outputs { + interval := interval + // Overwrite agent flush_interval if this plugin has its own. 
+ if output.Config.FlushInterval != 0 { + interval = output.Config.FlushInterval + } + + wg.Add(1) + go func(output *models.RunningOutput) { + defer wg.Done() + + if a.Config.Agent.RoundInterval { + err := internal.SleepContext( + ctx, internal.AlignDuration(startTime, interval)) + if err != nil { + return + } + } + + a.flush(ctx, output, interval, jitter) + }(output) + } + + for metric := range src { + for i, output := range a.Config.Outputs { + if i == len(a.Config.Outputs)-1 { + output.AddMetric(metric) + } else { + output.AddMetric(metric.Copy()) + } + } + } + + log.Println("I! [agent] Hang on, flushing any cached metrics before shutdown") + cancel() + wg.Wait() + + return nil +} + +// flush runs an output's flush function periodically until the context is +// done. +func (a *Agent) flush( + ctx context.Context, + output *models.RunningOutput, + interval time.Duration, + jitter time.Duration, +) { + // since we are watching two channels we need a ticker with the jitter + // integrated. + ticker := NewTicker(interval, jitter) + defer ticker.Stop() + + logError := func(err error) { + if err != nil { + log.Printf("E! [agent] Error writing to output [%s]: %v", output.Name, err) + } + } + + for { + // Favor shutdown over other methods. + select { + case <-ctx.Done(): + logError(a.flushOnce(output, interval, output.Write)) + return + default: + } + + select { + case <-ticker.C: + logError(a.flushOnce(output, interval, output.Write)) + case <-output.BatchReady: + // Favor the ticker over batch ready + select { + case <-ticker.C: + logError(a.flushOnce(output, interval, output.Write)) + default: + logError(a.flushOnce(output, interval, output.WriteBatch)) + } + case <-ctx.Done(): + logError(a.flushOnce(output, interval, output.Write)) + return + } + } +} + +// flushOnce runs the output's Write function once, logging a warning each +// interval it fails to complete before. 
+func (a *Agent) flushOnce( + output *models.RunningOutput, + timeout time.Duration, + writeFunc func() error, +) error { + ticker := time.NewTicker(timeout) + defer ticker.Stop() + + done := make(chan error) + go func() { + done <- writeFunc() + }() + + for { + select { + case err := <-done: + output.LogBufferStatus() + return err + case <-ticker.C: + log.Printf("W! [agent] output %q did not complete within its flush interval", + output.Name) + output.LogBufferStatus() + } + } + +} + +// connectOutputs connects to all outputs. +func (a *Agent) connectOutputs(ctx context.Context) error { + for _, output := range a.Config.Outputs { + log.Printf("D! [agent] Attempting connection to output: %s\n", output.Name) + err := output.Output.Connect() + if err != nil { + log.Printf("E! [agent] Failed to connect to output %s, retrying in 15s, "+ + "error was '%s' \n", output.Name, err) + + err := internal.SleepContext(ctx, 15*time.Second) + if err != nil { + return err + } + + err = output.Output.Connect() + if err != nil { + return err + } + } + log.Printf("D! [agent] Successfully connected to output: %s\n", output.Name) + } + return nil +} + +// closeOutputs closes all outputs. +func (a *Agent) closeOutputs() error { + var err error + for _, output := range a.Config.Outputs { + err = output.Output.Close() + } + return err +} + +// startServiceInputs starts all service inputs. +func (a *Agent) startServiceInputs( + ctx context.Context, + dst chan<- telegraf.Metric, +) error { + started := []telegraf.ServiceInput{} + + for _, input := range a.Config.Inputs { + if si, ok := input.Input.(telegraf.ServiceInput); ok { + // Service input plugins are not subject to timestamp rounding. + // This only applies to the accumulator passed to Start(), the + // Gather() accumulator does apply rounding according to the + // precision agent setting. + acc := NewAccumulator(input, dst) + acc.SetPrecision(time.Nanosecond, 0) + + err := si.Start(acc) + if err != nil { + log.Printf("E! 
[agent] Service for input %s failed to start: %v", + input.Name(), err) + + for _, si := range started { + si.Stop() + } + + return err + } + + started = append(started, si) + } + } + + return nil +} + +// stopServiceInputs stops all service inputs. +func (a *Agent) stopServiceInputs() { + for _, input := range a.Config.Inputs { + if si, ok := input.Input.(telegraf.ServiceInput); ok { + si.Stop() + } + } +} + +// panicRecover displays an error if an input panics. +func panicRecover(input *models.RunningInput) { + if err := recover(); err != nil { + trace := make([]byte, 2048) + runtime.Stack(trace, true) + log.Printf("E! FATAL: Input [%s] panicked: %s, Stack:\n%s\n", + input.Name(), err, trace) + log.Println("E! PLEASE REPORT THIS PANIC ON GITHUB with " + + "stack trace, configuration, and OS information: " + + "https://github.com/influxdata/telegraf/issues/new/choose") + } +} diff --git a/agent/tick.go b/agent/tick.go new file mode 100644 index 000000000..64dbff50b --- /dev/null +++ b/agent/tick.go @@ -0,0 +1,57 @@ +package agent + +import ( + "context" + "sync" + "time" + + "github.com/influxdata/telegraf/internal" +) + +type Ticker struct { + C chan time.Time + ticker *time.Ticker + jitter time.Duration + wg sync.WaitGroup + cancelFunc context.CancelFunc +} + +func NewTicker( + interval time.Duration, + jitter time.Duration, +) *Ticker { + ctx, cancel := context.WithCancel(context.Background()) + + t := &Ticker{ + C: make(chan time.Time, 1), + ticker: time.NewTicker(interval), + jitter: jitter, + cancelFunc: cancel, + } + + t.wg.Add(1) + go t.relayTime(ctx) + + return t +} + +func (t *Ticker) Stop() { + t.cancelFunc() + t.wg.Wait() +} + +func (t *Ticker) relayTime(ctx context.Context) { + defer t.wg.Done() + for { + select { + case tm := <-t.ticker.C: + internal.SleepContext(ctx, internal.RandomDuration(t.jitter)) + select { + case t.C <- tm: + default: + } + case <-ctx.Done(): + return + } + } +} diff --git a/cmd/telegraf/telegraf.go b/cmd/telegraf/telegraf.go 
index 7c451c2db..0ad6fe717 100644 --- a/cmd/telegraf/telegraf.go +++ b/cmd/telegraf/telegraf.go @@ -1,6 +1,8 @@ package main import ( + "context" + "errors" "flag" "fmt" "log" @@ -78,112 +80,111 @@ func reloadLoop( for <-reload { reload <- false - // If no other options are specified, load the config file and run. - c := config.NewConfig() - c.OutputFilters = outputFilters - c.InputFilters = inputFilters - err := c.LoadConfig(*fConfig) - if err != nil { - log.Fatal("E! " + err.Error()) - } + ctx, cancel := context.WithCancel(context.Background()) - if *fConfigDirectory != "" { - err = c.LoadDirectory(*fConfigDirectory) - if err != nil { - log.Fatal("E! " + err.Error()) - } - } - if !*fTest && len(c.Outputs) == 0 { - log.Fatalf("E! Error: no outputs found, did you provide a valid config file?") - } - if len(c.Inputs) == 0 { - log.Fatalf("E! Error: no inputs found, did you provide a valid config file?") - } - - if int64(c.Agent.Interval.Duration) <= 0 { - log.Fatalf("E! Agent interval must be positive, found %s", - c.Agent.Interval.Duration) - } - - if int64(c.Agent.FlushInterval.Duration) <= 0 { - log.Fatalf("E! Agent flush_interval must be positive; found %s", - c.Agent.Interval.Duration) - } - - ag, err := agent.NewAgent(c) - if err != nil { - log.Fatal("E! " + err.Error()) - } - - // Setup logging - logger.SetupLogging( - ag.Config.Agent.Debug || *fDebug, - ag.Config.Agent.Quiet || *fQuiet, - ag.Config.Agent.Logfile, - ) - - if *fTest { - err = ag.Test() - if err != nil { - log.Fatal("E! " + err.Error()) - } - os.Exit(0) - } - - err = ag.Connect() - if err != nil { - log.Fatal("E! " + err.Error()) - } - - shutdown := make(chan struct{}) signals := make(chan os.Signal) signal.Notify(signals, os.Interrupt, syscall.SIGHUP, syscall.SIGTERM) go func() { select { case sig := <-signals: - if sig == os.Interrupt || sig == syscall.SIGTERM { - close(shutdown) - } if sig == syscall.SIGHUP { - log.Printf("I! Reloading Telegraf config\n") + log.Printf("I! 
Reloading Telegraf config") <-reload reload <- true - close(shutdown) } + cancel() case <-stop: - close(shutdown) + cancel() } }() - log.Printf("I! Starting Telegraf %s\n", version) - log.Printf("I! Loaded inputs: %s", strings.Join(c.InputNames(), " ")) - log.Printf("I! Loaded aggregators: %s", strings.Join(c.AggregatorNames(), " ")) - log.Printf("I! Loaded processors: %s", strings.Join(c.ProcessorNames(), " ")) - log.Printf("I! Loaded outputs: %s", strings.Join(c.OutputNames(), " ")) - log.Printf("I! Tags enabled: %s", c.ListTags()) - - if *fPidfile != "" { - f, err := os.OpenFile(*fPidfile, os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - log.Printf("E! Unable to create pidfile: %s", err) - } else { - fmt.Fprintf(f, "%d\n", os.Getpid()) - - f.Close() - - defer func() { - err := os.Remove(*fPidfile) - if err != nil { - log.Printf("E! Unable to remove pidfile: %s", err) - } - }() - } + err := runAgent(ctx, inputFilters, outputFilters) + if err != nil { + log.Fatalf("E! [telegraf] Error running agent: %v", err) } - - ag.Run(shutdown) } } +func runAgent(ctx context.Context, + inputFilters []string, + outputFilters []string, +) error { + // If no other options are specified, load the config file and run. 
+ c := config.NewConfig() + c.OutputFilters = outputFilters + c.InputFilters = inputFilters + err := c.LoadConfig(*fConfig) + if err != nil { + return err + } + + if *fConfigDirectory != "" { + err = c.LoadDirectory(*fConfigDirectory) + if err != nil { + return err + } + } + if !*fTest && len(c.Outputs) == 0 { + return errors.New("Error: no outputs found, did you provide a valid config file?") + } + if len(c.Inputs) == 0 { + return errors.New("Error: no inputs found, did you provide a valid config file?") + } + + if int64(c.Agent.Interval.Duration) <= 0 { + return fmt.Errorf("Agent interval must be positive, found %s", + c.Agent.Interval.Duration) + } + + if int64(c.Agent.FlushInterval.Duration) <= 0 { + return fmt.Errorf("Agent flush_interval must be positive; found %s", + c.Agent.Interval.Duration) + } + + ag, err := agent.NewAgent(c) + if err != nil { + return err + } + + // Setup logging + logger.SetupLogging( + ag.Config.Agent.Debug || *fDebug, + ag.Config.Agent.Quiet || *fQuiet, + ag.Config.Agent.Logfile, + ) + + if *fTest { + return ag.Test() + } + + log.Printf("I! Starting Telegraf %s\n", version) + log.Printf("I! Loaded inputs: %s", strings.Join(c.InputNames(), " ")) + log.Printf("I! Loaded aggregators: %s", strings.Join(c.AggregatorNames(), " ")) + log.Printf("I! Loaded processors: %s", strings.Join(c.ProcessorNames(), " ")) + log.Printf("I! Loaded outputs: %s", strings.Join(c.OutputNames(), " ")) + log.Printf("I! Tags enabled: %s", c.ListTags()) + + if *fPidfile != "" { + f, err := os.OpenFile(*fPidfile, os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + log.Printf("E! Unable to create pidfile: %s", err) + } else { + fmt.Fprintf(f, "%d\n", os.Getpid()) + + f.Close() + + defer func() { + err := os.Remove(*fPidfile) + if err != nil { + log.Printf("E! 
Unable to remove pidfile: %s", err) + } + }() + } + } + + return ag.Run(ctx) +} + func usageExit(rc int) { fmt.Println(internal.Usage) os.Exit(rc) diff --git a/docs/AGGREGATORS.md b/docs/AGGREGATORS.md new file mode 100644 index 000000000..d0e926718 --- /dev/null +++ b/docs/AGGREGATORS.md @@ -0,0 +1,126 @@ +### Aggregator Plugins + +This section is for developers who want to create a new aggregator plugin. + +### Aggregator Plugin Guidelines + +* An aggregator must conform to the [telegraf.Aggregator][] interface. +* Aggregators should call `aggregators.Add` in their `init` function to + register themselves. See below for a quick example. +* To be available within Telegraf itself, plugins must add themselves to the + `github.com/influxdata/telegraf/plugins/aggregators/all/all.go` file. +- The `SampleConfig` function should return valid toml that describes how the + plugin can be configured. This is included in `telegraf config`. Please + consult the [SampleConfig][] page for the latest style guidelines. +* The `Description` function should say in one line what this aggregator does. +* The Aggregator plugin will need to keep caches of metrics that have passed + through it. This should be done using the builtin `HashID()` function of + each metric. +* When the `Reset()` function is called, all caches should be cleared. + +### Aggregator Plugin Example + +```go +package min + +// min.go + +import ( + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/plugins/aggregators" +) + +type Min struct { + // caches for metric fields, names, and tags + fieldCache map[uint64]map[string]float64 + nameCache map[uint64]string + tagCache map[uint64]map[string]string +} + +func NewMin() telegraf.Aggregator { + m := &Min{} + m.Reset() + return m +} + +var sampleConfig = ` + ## period is the flush & clear interval of the aggregator. + period = "30s" + ## If true drop_original will drop the original metrics and + ## only send aggregates. 
+ drop_original = false +` + +func (m *Min) SampleConfig() string { + return sampleConfig +} + +func (m *Min) Description() string { + return "Keep the aggregate min of each metric passing through." +} + +func (m *Min) Add(in telegraf.Metric) { + id := in.HashID() + if _, ok := m.nameCache[id]; !ok { + // hit an uncached metric, create caches for first time: + m.nameCache[id] = in.Name() + m.tagCache[id] = in.Tags() + m.fieldCache[id] = make(map[string]float64) + for k, v := range in.Fields() { + if fv, ok := convert(v); ok { + m.fieldCache[id][k] = fv + } + } + } else { + for k, v := range in.Fields() { + if fv, ok := convert(v); ok { + if _, ok := m.fieldCache[id][k]; !ok { + // hit an uncached field of a cached metric + m.fieldCache[id][k] = fv + continue + } + if fv < m.fieldCache[id][k] { + // set new minimum + m.fieldCache[id][k] = fv + } + } + } + } +} + +func (m *Min) Push(acc telegraf.Accumulator) { + for id, _ := range m.nameCache { + fields := map[string]interface{}{} + for k, v := range m.fieldCache[id] { + fields[k+"_min"] = v + } + acc.AddFields(m.nameCache[id], fields, m.tagCache[id]) + } +} + +func (m *Min) Reset() { + m.fieldCache = make(map[uint64]map[string]float64) + m.nameCache = make(map[uint64]string) + m.tagCache = make(map[uint64]map[string]string) +} + +func convert(in interface{}) (float64, bool) { + switch v := in.(type) { + case float64: + return v, true + case int64: + return float64(v), true + default: + return 0, false + } +} + +func init() { + aggregators.Add("min", func() telegraf.Aggregator { + return NewMin() + }) +} +``` + +[telegraf.Aggregator]: https://godoc.org/github.com/influxdata/telegraf#Aggregator +[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 27002be0d..4677e54f2 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -106,6 +106,14 @@ emitted from the input plugin. 
### Output Configuration +- **flush_interval**: The maximum time between flushes. Use this setting to + override the agent `flush_interval` on a per plugin basis. +- **metric_batch_size**: The maximum number of metrics to send at once. Use + this setting to override the agent `metric_batch_size` on a per plugin basis. +- **metric_buffer_limit**: The maximum number of unsent metrics to buffer. + Use this setting to override the agent `metric_buffer_limit` on a per plugin + basis. + The [metric filtering](#metric-filtering) parameters can be used to limit what metrics are emitted from the output plugin. diff --git a/docs/INPUTS.md b/docs/INPUTS.md new file mode 100644 index 000000000..b1b196398 --- /dev/null +++ b/docs/INPUTS.md @@ -0,0 +1,143 @@ +### Input Plugins + +This section is for developers who want to create new collection inputs. +Telegraf is entirely plugin driven. This interface allows for operators to +pick and choose what is gathered and makes it easy for developers +to create new ways of generating metrics. + +Plugin authorship is kept as simple as possible to promote people to develop +and submit new inputs. + +### Input Plugin Guidelines + +- A plugin must conform to the [telegraf.Input][] interface. +- Input Plugins should call `inputs.Add` in their `init` function to register + themselves. See below for a quick example. +- Input Plugins must be added to the + `github.com/influxdata/telegraf/plugins/inputs/all/all.go` file. +- The `SampleConfig` function should return valid toml that describes how the + plugin can be configured. This is included in `telegraf config`. Please + consult the [SampleConfig][] page for the latest style + guidelines. +- The `Description` function should say in one line what this plugin does. + +Let's say you've written a plugin that emits metrics about processes on the +current host. 
+ +### Input Plugin Example + +```go +package simple + +// simple.go + +import ( + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/plugins/inputs" +) + +type Simple struct { + Ok bool +} + +func (s *Simple) Description() string { + return "a demo plugin" +} + +func (s *Simple) SampleConfig() string { + return ` + ## Indicate if everything is fine + ok = true +` +} + +func (s *Simple) Gather(acc telegraf.Accumulator) error { + if s.Ok { + acc.AddFields("state", map[string]interface{}{"value": "pretty good"}, nil) + } else { + acc.AddFields("state", map[string]interface{}{"value": "not great"}, nil) + } + + return nil +} + +func init() { + inputs.Add("simple", func() telegraf.Input { return &Simple{} }) +} +``` + +### Development + +* Run `make static` followed by `make plugin-[pluginName]` to spin up a docker + dev environment using docker-compose. +* ***[Optional]*** When developing a plugin, add a `dev` directory with a + `docker-compose.yml` and `telegraf.conf` as well as any other supporting + files, where sensible. + +### Typed Metrics + +In addition to the `AddFields` function, the accumulator also supports +functions to add typed metrics: `AddGauge`, `AddCounter`, etc. Metric types +are ignored by the InfluxDB output, but can be used for other outputs, such as +[prometheus][prom metric types]. + +### Data Formats + +Some input plugins, such as the [exec][] plugin, can accept any supported +[input data formats][]. + +In order to enable this, you must specify a `SetParser(parser parsers.Parser)` +function on the plugin object (see the exec plugin for an example), as well as +defining `parser` as a field of the object. + +You can then utilize the parser internally in your plugin, parsing data as you +see fit. Telegraf's configuration layer will take care of instantiating and +creating the `Parser` object. + +Add the following to the `SampleConfig()`: + +```toml + ## Data format to consume. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" +``` + +### Service Input Plugins + +This section is for developers who want to create new "service" collection +inputs. A service plugin differs from a regular plugin in that it operates a +background service while Telegraf is running. One example would be the +`statsd` plugin, which operates a statsd server. + +Service Input Plugins are substantially more complicated than a regular +plugin, as they will require threads and locks to verify data integrity. +Service Input Plugins should be avoided unless there is no way to create their +behavior with a regular plugin. + +To create a Service Input implement the [telegraf.ServiceInput][] interface. + +### Metric Tracking + +Metric Tracking provides a system to be notified when metrics have been +successfully written to their outputs or otherwise discarded. This allows +inputs to be created that function as reliable queue consumers. + +To get started with metric tracking begin by calling `WithTracking` on the +[telegraf.Accumulator][]. Add metrics using the `AddTrackingMetricGroup` +function on the returned [telegraf.TrackingAccumulator][] and store the +`TrackingID`. The `Delivered()` channel will return a type with information +about the final delivery status of the metric group. + +Check the [amqp_consumer][] for an example implementation. 
+ +[exec]: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/exec +[amqp_consumer]: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/amqp_consumer +[prom metric types]: https://prometheus.io/docs/concepts/metric_types/ +[input data formats]: https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig +[telegraf.Input]: https://godoc.org/github.com/influxdata/telegraf#Input +[telegraf.ServiceInput]: https://godoc.org/github.com/influxdata/telegraf#ServiceInput +[telegraf.Accumulator]: https://godoc.org/github.com/influxdata/telegraf#Accumulator +[telegraf.TrackingAccumulator]: https://godoc.org/github.com/influxdata/telegraf#TrackingAccumulator diff --git a/docs/OUTPUTS.md b/docs/OUTPUTS.md new file mode 100644 index 000000000..cfa8083b4 --- /dev/null +++ b/docs/OUTPUTS.md @@ -0,0 +1,95 @@ +### Output Plugins + +This section is for developers who want to create a new output sink. Outputs +are created in a similar manner as collection plugins, and their interface has +similar constructs. + +### Output Plugin Guidelines + +- An output must conform to the [telegraf.Output][] interface. +- Outputs should call `outputs.Add` in their `init` function to register + themselves. See below for a quick example. +- To be available within Telegraf itself, plugins must add themselves to the + `github.com/influxdata/telegraf/plugins/outputs/all/all.go` file. +- The `SampleConfig` function should return valid toml that describes how the + plugin can be configured. This is included in `telegraf config`. Please + consult the [SampleConfig][] page for the latest style guidelines. +- The `Description` function should say in one line what this output does. 
+ +### Output Plugin Example + +```go +package simpleoutput + +// simpleoutput.go + +import ( + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/plugins/outputs" +) + +type Simple struct { + Ok bool +} + +func (s *Simple) Description() string { + return "a demo output" +} + +func (s *Simple) SampleConfig() string { + return ` + ok = true +` +} + +func (s *Simple) Connect() error { + // Make a connection to the URL here + return nil +} + +func (s *Simple) Close() error { + // Close connection to the URL here + return nil +} + +func (s *Simple) Write(metrics []telegraf.Metric) error { + for _, metric := range metrics { + // write `metric` to the output sink here + } + return nil +} + +func init() { + outputs.Add("simpleoutput", func() telegraf.Output { return &Simple{} }) +} + +``` + +## Data Formats + +Some output plugins, such as the [file][] plugin, can write in any supported +[output data formats][]. + +In order to enable this, you must specify a +`SetSerializer(serializer serializers.Serializer)` +function on the plugin object (see the file plugin for an example), as well as +defining `serializer` as a field of the object. + +You can then utilize the serializer internally in your plugin, serializing data +before it's written. Telegraf's configuration layer will take care of +instantiating and creating the `Serializer` object. + +You should also add the following to your `SampleConfig()`: + +```toml + ## Data format to output. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md + data_format = "influx" +``` + +[file]: https://github.com/influxdata/telegraf/tree/master/plugins/outputs/file +[output data formats]: https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig +[telegraf.Output]: https://godoc.org/github.com/influxdata/telegraf#Output diff --git a/docs/PROCESSORS.md b/docs/PROCESSORS.md new file mode 100644 index 000000000..e1fa182ca --- /dev/null +++ b/docs/PROCESSORS.md @@ -0,0 +1,63 @@ +### Processor Plugins + +This section is for developers who want to create a new processor plugin. + +### Processor Plugin Guidelines + +* A processor must conform to the [telegraf.Processor][] interface. +* Processors should call `processors.Add` in their `init` function to register + themselves. See below for a quick example. +* To be available within Telegraf itself, plugins must add themselves to the + `github.com/influxdata/telegraf/plugins/processors/all/all.go` file. +* The `SampleConfig` function should return valid toml that describes how the + processor can be configured. This is included in the output of `telegraf + config`. +- The `SampleConfig` function should return valid toml that describes how the + plugin can be configured. This is included in `telegraf config`. Please + consult the [SampleConfig][] page for the latest style guidelines. +* The `Description` function should say in one line what this processor does. 
+ +### Processor Plugin Example + +```go +package printer + +// printer.go + +import ( + "fmt" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/plugins/processors" +) + +type Printer struct { +} + +var sampleConfig = ` +` + +func (p *Printer) SampleConfig() string { + return sampleConfig +} + +func (p *Printer) Description() string { + return "Print all metrics that pass through this filter." +} + +func (p *Printer) Apply(in ...telegraf.Metric) []telegraf.Metric { + for _, metric := range in { + fmt.Println(metric.String()) + } + return in +} + +func init() { + processors.Add("printer", func() telegraf.Processor { + return &Printer{} + }) +} +``` + +[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig +[telegraf.Processor]: https://godoc.org/github.com/influxdata/telegraf#Processor diff --git a/input.go b/input.go index f7e1493e2..071ab7d9d 100644 --- a/input.go +++ b/input.go @@ -13,17 +13,10 @@ type Input interface { } type ServiceInput interface { - // SampleConfig returns the default configuration of the Input - SampleConfig() string + Input - // Description returns a one-sentence description on the Input - Description() string - - // Gather takes in an accumulator and adds the metrics that the Input - // gathers. This is called every "interval" - Gather(Accumulator) error - - // Start starts the ServiceInput's service, whatever that may be + // Start the ServiceInput. The Accumulator may be retained and used until + // Stop returns. 
Start(Accumulator) error // Stop stops the services and closes any necessary channels and connections diff --git a/internal/buffer/buffer.go b/internal/buffer/buffer.go deleted file mode 100644 index 6a460eccb..000000000 --- a/internal/buffer/buffer.go +++ /dev/null @@ -1,130 +0,0 @@ -package buffer - -import ( - "sync" - - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/selfstat" -) - -var ( - MetricsWritten = selfstat.Register("agent", "metrics_written", map[string]string{}) - MetricsDropped = selfstat.Register("agent", "metrics_dropped", map[string]string{}) -) - -// Buffer is an object for storing metrics in a circular buffer. -type Buffer struct { - sync.Mutex - buf []telegraf.Metric - first int - last int - size int - empty bool -} - -// NewBuffer returns a Buffer -// size is the maximum number of metrics that Buffer will cache. If Add is -// called when the buffer is full, then the oldest metric(s) will be dropped. -func NewBuffer(size int) *Buffer { - return &Buffer{ - buf: make([]telegraf.Metric, size), - first: 0, - last: 0, - size: size, - empty: true, - } -} - -// IsEmpty returns true if Buffer is empty. -func (b *Buffer) IsEmpty() bool { - return b.empty -} - -// Len returns the current length of the buffer. -func (b *Buffer) Len() int { - if b.empty { - return 0 - } else if b.first <= b.last { - return b.last - b.first + 1 - } - // Spans the end of array. - // size - gap in the middle - return b.size - (b.first - b.last - 1) // size - gap -} - -func (b *Buffer) push(m telegraf.Metric) { - // Empty - if b.empty { - b.last = b.first // Reset - b.buf[b.last] = m - b.empty = false - return - } - - b.last++ - b.last %= b.size - - // Full - if b.first == b.last { - MetricsDropped.Incr(1) - b.first = (b.first + 1) % b.size - } - b.buf[b.last] = m -} - -// Add adds metrics to the buffer. 
-func (b *Buffer) Add(metrics ...telegraf.Metric) { - b.Lock() - defer b.Unlock() - for i := range metrics { - MetricsWritten.Incr(1) - b.push(metrics[i]) - } -} - -// Batch returns a batch of metrics of size batchSize. -// the batch will be of maximum length batchSize. It can be less than batchSize, -// if the length of Buffer is less than batchSize. -func (b *Buffer) Batch(batchSize int) []telegraf.Metric { - b.Lock() - defer b.Unlock() - outLen := min(b.Len(), batchSize) - out := make([]telegraf.Metric, outLen) - if outLen == 0 { - return out - } - - // We copy everything right of first up to last, count or end - // b.last >= rightInd || b.last < b.first - // therefore wont copy past b.last - rightInd := min(b.size, b.first+outLen) - 1 - - copyCount := copy(out, b.buf[b.first:rightInd+1]) - - // We've emptied the ring - if rightInd == b.last { - b.empty = true - } - b.first = rightInd + 1 - b.first %= b.size - - // We circle back for the rest - if copyCount < outLen { - right := min(b.last, outLen-copyCount) - copy(out[copyCount:], b.buf[b.first:right+1]) - // We've emptied the ring - if right == b.last { - b.empty = true - } - b.first = right + 1 - b.first %= b.size - } - return out -} - -func min(a, b int) int { - if b < a { - return b - } - return a -} diff --git a/internal/buffer/buffer_test.go b/internal/buffer/buffer_test.go deleted file mode 100644 index b3f666fd0..000000000 --- a/internal/buffer/buffer_test.go +++ /dev/null @@ -1,203 +0,0 @@ -package buffer - -import ( - "sync" - "sync/atomic" - "testing" - - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/testutil" - - "github.com/stretchr/testify/assert" -) - -var metricList = []telegraf.Metric{ - testutil.TestMetric(2, "mymetric1"), - testutil.TestMetric(1, "mymetric2"), - testutil.TestMetric(11, "mymetric3"), - testutil.TestMetric(15, "mymetric4"), - testutil.TestMetric(8, "mymetric5"), -} - -func makeBench5(b *testing.B, freq, batchSize int) { - const k = 1000 - var wg 
sync.WaitGroup - buf := NewBuffer(10000) - m := testutil.TestMetric(1, "mymetric") - - for i := 0; i < b.N; i++ { - buf.Add(m, m, m, m, m) - if i%(freq*k) == 0 { - wg.Add(1) - go func() { - buf.Batch(batchSize * k) - wg.Done() - }() - } - } - // Flush - buf.Batch(b.N) - wg.Wait() - -} -func makeBenchStrict(b *testing.B, freq, batchSize int) { - const k = 1000 - var count uint64 - var wg sync.WaitGroup - buf := NewBuffer(10000) - m := testutil.TestMetric(1, "mymetric") - - for i := 0; i < b.N; i++ { - buf.Add(m) - if i%(freq*k) == 0 { - wg.Add(1) - go func() { - defer wg.Done() - l := len(buf.Batch(batchSize * k)) - atomic.AddUint64(&count, uint64(l)) - }() - } - } - // Flush - wg.Add(1) - go func() { - l := len(buf.Batch(b.N)) - atomic.AddUint64(&count, uint64(l)) - wg.Done() - }() - - wg.Wait() - if count != uint64(b.N) { - b.Errorf("not all metrics came out. %d of %d", count, b.N) - } -} -func makeBench(b *testing.B, freq, batchSize int) { - const k = 1000 - var wg sync.WaitGroup - buf := NewBuffer(10000) - m := testutil.TestMetric(1, "mymetric") - - for i := 0; i < b.N; i++ { - buf.Add(m) - if i%(freq*k) == 0 { - wg.Add(1) - go func() { - buf.Batch(batchSize * k) - wg.Done() - }() - } - } - wg.Wait() - // Flush - buf.Batch(b.N) -} - -func BenchmarkBufferBatch5Add(b *testing.B) { - makeBench5(b, 100, 101) -} -func BenchmarkBufferBigInfrequentBatchCatchup(b *testing.B) { - makeBench(b, 100, 101) -} -func BenchmarkBufferOftenBatch(b *testing.B) { - makeBench(b, 1, 1) -} -func BenchmarkBufferAlmostBatch(b *testing.B) { - makeBench(b, 10, 9) -} -func BenchmarkBufferSlowBatch(b *testing.B) { - makeBench(b, 10, 1) -} -func BenchmarkBufferBatchNoDrop(b *testing.B) { - makeBenchStrict(b, 1, 4) -} -func BenchmarkBufferCatchup(b *testing.B) { - buf := NewBuffer(10000) - m := testutil.TestMetric(1, "mymetric") - - for i := 0; i < b.N; i++ { - buf.Add(m) - } - buf.Batch(b.N) -} - -func BenchmarkAddMetrics(b *testing.B) { - buf := NewBuffer(10000) - m := 
testutil.TestMetric(1, "mymetric") - for n := 0; n < b.N; n++ { - buf.Add(m) - } -} - -func TestNewBufferBasicFuncs(t *testing.T) { - b := NewBuffer(10) - MetricsDropped.Set(0) - MetricsWritten.Set(0) - - assert.True(t, b.IsEmpty()) - assert.Zero(t, b.Len()) - assert.Zero(t, MetricsDropped.Get()) - assert.Zero(t, MetricsWritten.Get()) - - m := testutil.TestMetric(1, "mymetric") - b.Add(m) - assert.False(t, b.IsEmpty()) - assert.Equal(t, b.Len(), 1) - assert.Equal(t, int64(0), MetricsDropped.Get()) - assert.Equal(t, int64(1), MetricsWritten.Get()) - - b.Add(metricList...) - assert.False(t, b.IsEmpty()) - assert.Equal(t, b.Len(), 6) - assert.Equal(t, int64(0), MetricsDropped.Get()) - assert.Equal(t, int64(6), MetricsWritten.Get()) -} - -func TestDroppingMetrics(t *testing.T) { - b := NewBuffer(10) - MetricsDropped.Set(0) - MetricsWritten.Set(0) - - // Add up to the size of the buffer - b.Add(metricList...) - b.Add(metricList...) - assert.False(t, b.IsEmpty()) - assert.Equal(t, b.Len(), 10) - assert.Equal(t, int64(0), MetricsDropped.Get()) - assert.Equal(t, int64(10), MetricsWritten.Get()) - - // Add 5 more and verify they were dropped - b.Add(metricList...) - assert.False(t, b.IsEmpty()) - assert.Equal(t, b.Len(), 10) - assert.Equal(t, int64(5), MetricsDropped.Get()) - assert.Equal(t, int64(15), MetricsWritten.Get()) -} - -func TestGettingBatches(t *testing.T) { - b := NewBuffer(20) - MetricsDropped.Set(0) - MetricsWritten.Set(0) - - // Verify that the buffer returned is smaller than requested when there are - // not as many items as requested. - b.Add(metricList...) - batch := b.Batch(10) - assert.Len(t, batch, 5) - - // Verify that the buffer is now empty - assert.True(t, b.IsEmpty()) - assert.Zero(t, b.Len()) - assert.Zero(t, MetricsDropped.Get()) - assert.Equal(t, int64(5), MetricsWritten.Get()) - - // Verify that the buffer returned is not more than the size requested - b.Add(metricList...) 
- batch = b.Batch(3) - assert.Len(t, batch, 3) - - // Verify that buffer is not empty - assert.False(t, b.IsEmpty()) - assert.Equal(t, b.Len(), 2) - assert.Equal(t, int64(0), MetricsDropped.Get()) - assert.Equal(t, int64(10), MetricsWritten.Get()) -} diff --git a/internal/config/config.go b/internal/config/config.go index 36027834b..7d266852a 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -9,7 +9,6 @@ import ( "math" "os" "path/filepath" - "regexp" "runtime" "sort" @@ -26,7 +25,6 @@ import ( "github.com/influxdata/telegraf/plugins/parsers" "github.com/influxdata/telegraf/plugins/processors" "github.com/influxdata/telegraf/plugins/serializers" - "github.com/influxdata/toml" "github.com/influxdata/toml/ast" ) @@ -622,6 +620,19 @@ func (c *Config) LoadConfig(path string) error { } } + if !c.Agent.OmitHostname { + if c.Agent.Hostname == "" { + hostname, err := os.Hostname() + if err != nil { + return err + } + + c.Agent.Hostname = hostname + } + + c.Tags["host"] = c.Agent.Hostname + } + // Parse all the rest of the plugins: for name, val := range tbl.Fields { subTable, ok := val.(*ast.Table) @@ -709,6 +720,7 @@ func (c *Config) LoadConfig(path string) error { if len(c.Processors) > 1 { sort.Sort(c.Processors) } + return nil } @@ -876,6 +888,7 @@ func (c *Config) addInput(name string, table *ast.Table) error { } rp := models.NewRunningInput(input, pluginConfig) + rp.SetDefaultTags(c.Tags) c.Inputs = append(c.Inputs, rp) return nil } @@ -1751,6 +1764,8 @@ func buildOutput(name string, tbl *ast.Table) (*models.OutputConfig, error) { Name: name, Filter: filter, } + + // TODO // Outputs don't support FieldDrop/FieldPass, so set to NameDrop/NamePass if len(oc.Filter.FieldDrop) > 0 { oc.Filter.NameDrop = oc.Filter.FieldDrop @@ -1758,5 +1773,47 @@ func buildOutput(name string, tbl *ast.Table) (*models.OutputConfig, error) { if len(oc.Filter.FieldPass) > 0 { oc.Filter.NamePass = oc.Filter.FieldPass } + + if node, ok := tbl.Fields["flush_interval"]; ok 
{ + if kv, ok := node.(*ast.KeyValue); ok { + if str, ok := kv.Value.(*ast.String); ok { + dur, err := time.ParseDuration(str.Value) + if err != nil { + return nil, err + } + + oc.FlushInterval = dur + } + } + } + + if node, ok := tbl.Fields["metric_buffer_limit"]; ok { + if kv, ok := node.(*ast.KeyValue); ok { + if integer, ok := kv.Value.(*ast.Integer); ok { + v, err := integer.Int() + if err != nil { + return nil, err + } + oc.MetricBufferLimit = int(v) + } + } + } + + if node, ok := tbl.Fields["metric_batch_size"]; ok { + if kv, ok := node.(*ast.KeyValue); ok { + if integer, ok := kv.Value.(*ast.Integer); ok { + v, err := integer.Int() + if err != nil { + return nil, err + } + oc.MetricBatchSize = int(v) + } + } + } + + delete(tbl.Fields, "flush_interval") + delete(tbl.Fields, "metric_buffer_limit") + delete(tbl.Fields, "metric_batch_size") + return oc, nil } diff --git a/internal/internal.go b/internal/internal.go index 567b0f773..8acf63e96 100644 --- a/internal/internal.go +++ b/internal/internal.go @@ -4,6 +4,7 @@ import ( "bufio" "bytes" "compress/gzip" + "context" "crypto/rand" "errors" "io" @@ -246,6 +247,51 @@ func RandomSleep(max time.Duration, shutdown chan struct{}) { } } +// RandomDuration returns a random duration between 0 and max. +func RandomDuration(max time.Duration) time.Duration { + if max == 0 { + return 0 + } + + var sleepns int64 + maxSleep := big.NewInt(max.Nanoseconds()) + if j, err := rand.Int(rand.Reader, maxSleep); err == nil { + sleepns = j.Int64() + } + + return time.Duration(sleepns) +} + +// SleepContext sleeps until the context is closed or the duration is reached. +func SleepContext(ctx context.Context, duration time.Duration) error { + if duration == 0 { + return nil + } + + t := time.NewTimer(duration) + select { + case <-t.C: + return nil + case <-ctx.Done(): + t.Stop() + return ctx.Err() + } +} + +// AlignDuration returns the duration until next aligned interval. 
+func AlignDuration(tm time.Time, interval time.Duration) time.Duration { + return AlignTime(tm, interval).Sub(tm) +} + +// AlignTime returns the time of the next aligned interval. +func AlignTime(tm time.Time, interval time.Duration) time.Time { + truncated := tm.Truncate(interval) + if truncated == tm { + return tm + } + return truncated.Add(interval) +} + // Exit status takes the error from exec.Command // and returns the exit status and true // if error is not exit status, will return 0 and false diff --git a/internal/internal_test.go b/internal/internal_test.go index 89ee06903..46b1b5962 100644 --- a/internal/internal_test.go +++ b/internal/internal_test.go @@ -9,6 +9,7 @@ import ( "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) type SnakeTest struct { @@ -217,3 +218,55 @@ func TestVersionAlreadySet(t *testing.T) { assert.Equal(t, "foo", Version()) } + +func TestAlignDuration(t *testing.T) { + tests := []struct { + name string + now time.Time + interval time.Duration + expected time.Duration + }{ + { + name: "aligned", + now: time.Date(2018, 1, 1, 1, 1, 0, 0, time.UTC), + interval: 10 * time.Second, + expected: 0 * time.Second, + }, + { + name: "standard interval", + now: time.Date(2018, 1, 1, 1, 1, 1, 0, time.UTC), + interval: 10 * time.Second, + expected: 9 * time.Second, + }, + { + name: "odd interval", + now: time.Date(2018, 1, 1, 1, 1, 1, 0, time.UTC), + interval: 3 * time.Second, + expected: 2 * time.Second, + }, + { + name: "sub second interval", + now: time.Date(2018, 1, 1, 1, 1, 0, 5e8, time.UTC), + interval: 1 * time.Second, + expected: 500 * time.Millisecond, + }, + { + name: "non divisible not aligned on minutes", + now: time.Date(2018, 1, 1, 1, 0, 0, 0, time.UTC), + interval: 1*time.Second + 100*time.Millisecond, + expected: 400 * time.Millisecond, + }, + { + name: "long interval", + now: time.Date(2018, 1, 1, 1, 1, 0, 0, time.UTC), + interval: 1 * time.Hour, + expected: 59 * time.Minute, + }, + } + for _, tt 
:= range tests { + t.Run(tt.name, func(t *testing.T) { + actual := AlignDuration(tt.now, tt.interval) + require.Equal(t, tt.expected, actual) + }) + } +} diff --git a/internal/models/buffer.go b/internal/models/buffer.go new file mode 100644 index 000000000..6848c26fa --- /dev/null +++ b/internal/models/buffer.go @@ -0,0 +1,214 @@ +package models + +import ( + "sync" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/selfstat" +) + +var ( + AgentMetricsWritten = selfstat.Register("agent", "metrics_written", map[string]string{}) + AgentMetricsDropped = selfstat.Register("agent", "metrics_dropped", map[string]string{}) +) + +// Buffer stores metrics in a circular buffer. +type Buffer struct { + sync.Mutex + buf []telegraf.Metric + first int // index of the first/oldest metric + last int // one after the index of the last/newest metric + size int // number of metrics currently in the buffer + cap int // the capacity of the buffer + + batchFirst int // index of the first metric in the batch + batchLast int // one after the index of the last metric in the batch + batchSize int // number of metrics currently in the batch + + MetricsAdded selfstat.Stat + MetricsWritten selfstat.Stat + MetricsDropped selfstat.Stat +} + +// NewBuffer returns a new empty Buffer with the given capacity. +func NewBuffer(name string, capacity int) *Buffer { + b := &Buffer{ + buf: make([]telegraf.Metric, capacity), + first: 0, + last: 0, + size: 0, + cap: capacity, + + MetricsAdded: selfstat.Register( + "write", + "metrics_added", + map[string]string{"output": name}, + ), + MetricsWritten: selfstat.Register( + "write", + "metrics_written", + map[string]string{"output": name}, + ), + MetricsDropped: selfstat.Register( + "write", + "metrics_dropped", + map[string]string{"output": name}, + ), + } + return b +} + +// Len returns the number of metrics currently in the buffer. 
+func (b *Buffer) Len() int { + b.Lock() + defer b.Unlock() + + return b.size +} + +func (b *Buffer) metricAdded() { + b.MetricsAdded.Incr(1) +} + +func (b *Buffer) metricWritten(metric telegraf.Metric) { + AgentMetricsWritten.Incr(1) + b.MetricsWritten.Incr(1) + metric.Accept() +} + +func (b *Buffer) metricDropped(metric telegraf.Metric) { + AgentMetricsDropped.Incr(1) + b.MetricsDropped.Incr(1) + metric.Reject() +} + +func (b *Buffer) inBatch() bool { + if b.batchSize == 0 { + return false + } + + if b.batchFirst < b.batchLast { + return b.last >= b.batchFirst && b.last < b.batchLast + } else { + return b.last >= b.batchFirst || b.last < b.batchLast + } +} + +func (b *Buffer) add(m telegraf.Metric) { + // Check if Buffer is full + if b.size == b.cap { + if b.batchSize == 0 { + // No batch taken by the output, we can drop the metric now. + b.metricDropped(b.buf[b.last]) + } else if b.inBatch() { + // There is an outstanding batch and this will overwrite a metric + // in it, delay the dropping only in case the batch gets rejected. + b.batchSize-- + b.batchFirst++ + b.batchFirst %= b.cap + } else { + // There is an outstanding batch, but this overwrites a metric + // outside of it. + b.metricDropped(b.buf[b.last]) + } + } + + b.metricAdded() + + b.buf[b.last] = m + b.last++ + b.last %= b.cap + + if b.size == b.cap { + b.first++ + b.first %= b.cap + } + + b.size = min(b.size+1, b.cap) +} + +// Add adds metrics to the buffer. +func (b *Buffer) Add(metrics ...telegraf.Metric) { + b.Lock() + defer b.Unlock() + + for i := range metrics { + b.add(metrics[i]) + } +} + +// Batch returns a slice containing up to batchSize of the oldest metrics in +// the buffer. +// +// The metrics contained in the batch are not removed from the buffer; instead +// the last batch is recorded and removed only if Accept is called. 
+func (b *Buffer) Batch(batchSize int) []telegraf.Metric { + b.Lock() + defer b.Unlock() + + outLen := min(b.size, batchSize) + out := make([]telegraf.Metric, outLen) + if outLen == 0 { + return out + } + + b.batchFirst = b.first + b.batchLast = b.first + outLen + b.batchLast %= b.cap + b.batchSize = outLen + + until := min(b.cap, b.first+outLen) + + n := copy(out, b.buf[b.first:until]) + if n < outLen { + copy(out[n:], b.buf[:outLen-n]) + } + return out +} + +// Accept removes the metrics contained in the last batch. +func (b *Buffer) Accept(batch []telegraf.Metric) { + b.Lock() + defer b.Unlock() + + for _, m := range batch { + b.metricWritten(m) + } + + if b.batchSize > 0 { + b.size -= b.batchSize + b.first += b.batchSize + b.first %= b.cap + } + + b.resetBatch() +} + +// Reject clears the current batch record so that calls to Accept will have no +// effect. +func (b *Buffer) Reject(batch []telegraf.Metric) { + b.Lock() + defer b.Unlock() + + if len(batch) > b.batchSize { + // Part or all of the batch was dropped before reject was called. 
+ for _, m := range batch[b.batchSize:] { + b.metricDropped(m) + } + } + + b.resetBatch() +} + +func (b *Buffer) resetBatch() { + b.batchFirst = 0 + b.batchLast = 0 + b.batchSize = 0 +} + +func min(a, b int) int { + if b < a { + return b + } + return a +} diff --git a/internal/models/buffer_test.go b/internal/models/buffer_test.go new file mode 100644 index 000000000..246aaf6ea --- /dev/null +++ b/internal/models/buffer_test.go @@ -0,0 +1,385 @@ +package models + +import ( + "testing" + "time" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/metric" + "github.com/stretchr/testify/require" +) + +type MockMetric struct { + telegraf.Metric + AcceptF func() + RejectF func() + DropF func() +} + +func (m *MockMetric) Accept() { + m.AcceptF() +} + +func (m *MockMetric) Reject() { + m.RejectF() +} + +func (m *MockMetric) Drop() { + m.DropF() +} + +func Metric() telegraf.Metric { + m, err := metric.New( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42.0, + }, + time.Unix(0, 0), + ) + if err != nil { + panic(err) + } + return m +} + +func BenchmarkAddMetrics(b *testing.B) { + buf := NewBuffer("test", 10000) + m := Metric() + for n := 0; n < b.N; n++ { + buf.Add(m) + } +} + +func setup(b *Buffer) *Buffer { + b.MetricsAdded.Set(0) + b.MetricsWritten.Set(0) + b.MetricsDropped.Set(0) + return b +} + +func TestBuffer_LenEmpty(t *testing.T) { + b := setup(NewBuffer("test", 5)) + + require.Equal(t, 0, b.Len()) +} + +func TestBuffer_LenOne(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m) + + require.Equal(t, 1, b.Len()) +} + +func TestBuffer_LenFull(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m, m, m, m) + + require.Equal(t, 5, b.Len()) +} + +func TestBuffer_LenOverfill(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + setup(b) + b.Add(m, m, m, m, m, m) + + require.Equal(t, 5, b.Len()) +} + +func TestBuffer_BatchLenZero(t *testing.T) { + b := 
setup(NewBuffer("test", 5)) + batch := b.Batch(0) + + require.Len(t, batch, 0) +} + +func TestBuffer_BatchLenBufferEmpty(t *testing.T) { + b := setup(NewBuffer("test", 5)) + batch := b.Batch(2) + + require.Len(t, batch, 0) +} + +func TestBuffer_BatchLenUnderfill(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m) + batch := b.Batch(2) + + require.Len(t, batch, 1) +} + +func TestBuffer_BatchLenFill(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m, m) + batch := b.Batch(2) + require.Len(t, batch, 2) +} + +func TestBuffer_BatchLenExact(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m) + batch := b.Batch(2) + require.Len(t, batch, 2) +} + +func TestBuffer_BatchLenLargerThanBuffer(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m, m, m, m) + batch := b.Batch(6) + require.Len(t, batch, 5) +} + +func TestBuffer_BatchWrap(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m, m, m, m) + batch := b.Batch(2) + b.Accept(batch) + b.Add(m, m) + batch = b.Batch(5) + require.Len(t, batch, 5) +} + +func TestBuffer_AddDropsOverwrittenMetrics(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + + b.Add(m, m, m, m, m) + b.Add(m, m, m, m, m) + + require.Equal(t, int64(5), b.MetricsDropped.Get()) + require.Equal(t, int64(0), b.MetricsWritten.Get()) +} + +func TestBuffer_AcceptRemovesBatch(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m, m) + batch := b.Batch(2) + b.Accept(batch) + require.Equal(t, 1, b.Len()) +} + +func TestBuffer_RejectLeavesBatch(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m, m) + batch := b.Batch(2) + b.Reject(batch) + require.Equal(t, 3, b.Len()) +} + +func TestBuffer_AcceptWritesOverwrittenBatch(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + + b.Add(m, m, m, m, m) + batch := b.Batch(5) + b.Add(m, m, m, m, m) + 
b.Accept(batch) + + require.Equal(t, int64(0), b.MetricsDropped.Get()) + require.Equal(t, int64(5), b.MetricsWritten.Get()) +} + +func TestBuffer_BatchRejectDropsOverwrittenBatch(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + + b.Add(m, m, m, m, m) + batch := b.Batch(5) + b.Add(m, m, m, m, m) + b.Reject(batch) + + require.Equal(t, int64(5), b.MetricsDropped.Get()) + require.Equal(t, int64(0), b.MetricsWritten.Get()) +} + +func TestBuffer_MetricsOverwriteBatchAccept(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + + b.Add(m, m, m, m, m) + batch := b.Batch(3) + b.Add(m, m, m) + b.Accept(batch) + require.Equal(t, int64(0), b.MetricsDropped.Get()) + require.Equal(t, int64(3), b.MetricsWritten.Get()) +} + +func TestBuffer_MetricsOverwriteBatchReject(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + + b.Add(m, m, m, m, m) + batch := b.Batch(3) + b.Add(m, m, m) + b.Reject(batch) + require.Equal(t, int64(3), b.MetricsDropped.Get()) + require.Equal(t, int64(0), b.MetricsWritten.Get()) +} + +func TestBuffer_MetricsBatchAcceptRemoved(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + + b.Add(m, m, m, m, m) + batch := b.Batch(3) + b.Add(m, m, m, m, m) + b.Accept(batch) + require.Equal(t, int64(2), b.MetricsDropped.Get()) + require.Equal(t, int64(3), b.MetricsWritten.Get()) +} + +func TestBuffer_WrapWithBatch(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + + b.Add(m, m, m) + b.Batch(3) + b.Add(m, m, m, m, m, m) + + require.Equal(t, int64(1), b.MetricsDropped.Get()) +} + +func TestBuffer_BatchNotRemoved(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m, m, m, m) + b.Batch(2) + require.Equal(t, 5, b.Len()) +} + +func TestBuffer_BatchRejectAcceptNoop(t *testing.T) { + m := Metric() + b := setup(NewBuffer("test", 5)) + b.Add(m, m, m, m, m) + batch := b.Batch(2) + b.Reject(batch) + b.Accept(batch) + require.Equal(t, 5, b.Len()) +} + +func 
TestBuffer_AcceptCallsMetricAccept(t *testing.T) { + var accept int + mm := &MockMetric{ + Metric: Metric(), + AcceptF: func() { + accept++ + }, + } + b := setup(NewBuffer("test", 5)) + b.Add(mm, mm, mm) + batch := b.Batch(2) + b.Accept(batch) + require.Equal(t, 2, accept) +} + +func TestBuffer_AddCallsMetricRejectWhenNoBatch(t *testing.T) { + var reject int + mm := &MockMetric{ + Metric: Metric(), + RejectF: func() { + reject++ + }, + } + b := setup(NewBuffer("test", 5)) + setup(b) + b.Add(mm, mm, mm, mm, mm) + b.Add(mm, mm) + require.Equal(t, 2, reject) +} + +func TestBuffer_AddCallsMetricRejectWhenNotInBatch(t *testing.T) { + var reject int + mm := &MockMetric{ + Metric: Metric(), + RejectF: func() { + reject++ + }, + } + b := setup(NewBuffer("test", 5)) + setup(b) + b.Add(mm, mm, mm, mm, mm) + batch := b.Batch(2) + b.Add(mm, mm, mm, mm) + // metric[2] and metric[3] rejected + require.Equal(t, 2, reject) + b.Reject(batch) + // metric[1] and metric[2] now rejected + require.Equal(t, 4, reject) +} + +func TestBuffer_RejectCallsMetricRejectWithOverwritten(t *testing.T) { + var reject int + mm := &MockMetric{ + Metric: Metric(), + RejectF: func() { + reject++ + }, + } + b := setup(NewBuffer("test", 5)) + b.Add(mm, mm, mm, mm, mm) + batch := b.Batch(5) + b.Add(mm, mm) + require.Equal(t, 0, reject) + b.Reject(batch) + require.Equal(t, 2, reject) +} + +func TestBuffer_AddOverwriteAndReject(t *testing.T) { + var reject int + mm := &MockMetric{ + Metric: Metric(), + RejectF: func() { + reject++ + }, + } + b := setup(NewBuffer("test", 5)) + b.Add(mm, mm, mm, mm, mm) + batch := b.Batch(5) + b.Add(mm, mm, mm, mm, mm) + b.Add(mm, mm, mm, mm, mm) + b.Add(mm, mm, mm, mm, mm) + b.Add(mm, mm, mm, mm, mm) + require.Equal(t, 15, reject) + b.Reject(batch) + require.Equal(t, 20, reject) +} + +func TestBuffer_AddOverwriteAndRejectOffset(t *testing.T) { + var reject int + var accept int + mm := &MockMetric{ + Metric: Metric(), + RejectF: func() { + reject++ + }, + AcceptF: func() { + 
accept++ + }, + } + b := setup(NewBuffer("test", 5)) + b.Add(mm, mm, mm) + b.Add(mm, mm, mm, mm) + require.Equal(t, 2, reject) + batch := b.Batch(5) + b.Add(mm, mm, mm, mm) + require.Equal(t, 2, reject) + b.Add(mm, mm, mm, mm) + require.Equal(t, 5, reject) + b.Add(mm, mm, mm, mm) + require.Equal(t, 9, reject) + b.Add(mm, mm, mm, mm) + require.Equal(t, 13, reject) + b.Accept(batch) + require.Equal(t, 13, reject) + require.Equal(t, 5, accept) +} diff --git a/internal/models/filter_test.go b/internal/models/filter_test.go index eb208f7c3..84cd1d397 100644 --- a/internal/models/filter_test.go +++ b/internal/models/filter_test.go @@ -6,6 +6,7 @@ import ( "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/metric" + "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/require" ) @@ -480,3 +481,45 @@ func TestFilter_FilterTagsPassAndDrop(t *testing.T) { } } + +func BenchmarkFilter(b *testing.B) { + tests := []struct { + name string + filter Filter + metric telegraf.Metric + }{ + { + name: "empty filter", + filter: Filter{}, + metric: testutil.MustMetric("cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + }, + { + name: "namepass", + filter: Filter{ + NamePass: []string{"cpu"}, + }, + metric: testutil.MustMetric("cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + }, + } + + for _, tt := range tests { + b.Run(tt.name, func(b *testing.B) { + require.NoError(b, tt.filter.Compile()) + for n := 0; n < b.N; n++ { + tt.filter.Select(tt.metric) + } + }) + } +} diff --git a/internal/models/running_aggregator.go b/internal/models/running_aggregator.go index 960fd3131..0315aa671 100644 --- a/internal/models/running_aggregator.go +++ b/internal/models/running_aggregator.go @@ -1,30 +1,53 @@ package models import ( - "log" + "sync" "time" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/selfstat" ) type RunningAggregator struct { - a 
telegraf.Aggregator - Config *AggregatorConfig - - metrics chan telegraf.Metric - + sync.Mutex + Aggregator telegraf.Aggregator + Config *AggregatorConfig periodStart time.Time periodEnd time.Time + + MetricsPushed selfstat.Stat + MetricsFiltered selfstat.Stat + MetricsDropped selfstat.Stat + PushTime selfstat.Stat } func NewRunningAggregator( - a telegraf.Aggregator, - conf *AggregatorConfig, + aggregator telegraf.Aggregator, + config *AggregatorConfig, ) *RunningAggregator { return &RunningAggregator{ - a: a, - Config: conf, - metrics: make(chan telegraf.Metric, 100), + Aggregator: aggregator, + Config: config, + MetricsPushed: selfstat.Register( + "aggregate", + "metrics_pushed", + map[string]string{"aggregator": config.Name}, + ), + MetricsFiltered: selfstat.Register( + "aggregate", + "metrics_filtered", + map[string]string{"aggregator": config.Name}, + ), + MetricsDropped: selfstat.Register( + "aggregate", + "metrics_dropped", + map[string]string{"aggregator": config.Name}, + ), + PushTime: selfstat.Register( + "aggregate", + "push_time_ns", + map[string]string{"aggregator": config.Name}, + ), } } @@ -46,6 +69,15 @@ func (r *RunningAggregator) Name() string { return "aggregators." 
+ r.Config.Name } +func (r *RunningAggregator) Period() time.Duration { + return r.Config.Period +} + +func (r *RunningAggregator) SetPeriodStart(start time.Time) { + r.periodStart = start + r.periodEnd = r.periodStart.Add(r.Config.Period).Add(r.Config.Delay) +} + func (r *RunningAggregator) MakeMetric(metric telegraf.Metric) telegraf.Metric { m := makemetric( metric, @@ -59,9 +91,21 @@ func (r *RunningAggregator) MakeMetric(metric telegraf.Metric) telegraf.Metric { m.SetAggregate(true) } + r.MetricsPushed.Incr(1) + return m } +func (r *RunningAggregator) metricFiltered(metric telegraf.Metric) { + r.MetricsFiltered.Incr(1) + metric.Accept() +} + +func (r *RunningAggregator) metricDropped(metric telegraf.Metric) { + r.MetricsDropped.Incr(1) + metric.Accept() +} + // Add a metric to the aggregator and return true if the original metric // should be dropped. func (r *RunningAggregator) Add(metric telegraf.Metric) bool { @@ -74,75 +118,31 @@ func (r *RunningAggregator) Add(metric telegraf.Metric) bool { return r.Config.DropOriginal } - r.metrics <- metric + r.Lock() + defer r.Unlock() + if r.periodStart.IsZero() || metric.Time().Before(r.periodStart) || metric.Time().After(r.periodEnd) { + r.metricDropped(metric) + return false + } + + r.Aggregator.Add(metric) return r.Config.DropOriginal } -func (r *RunningAggregator) add(in telegraf.Metric) { - r.a.Add(in) +func (r *RunningAggregator) Push(acc telegraf.Accumulator) { + r.Lock() + defer r.Unlock() + + r.periodStart = r.periodEnd + r.periodEnd = r.periodStart.Add(r.Config.Period).Add(r.Config.Delay) + r.push(acc) + r.Aggregator.Reset() } func (r *RunningAggregator) push(acc telegraf.Accumulator) { - r.a.Push(acc) -} - -func (r *RunningAggregator) reset() { - r.a.Reset() -} - -// Run runs the running aggregator, listens for incoming metrics, and waits -// for period ticks to tell it when to push and reset the aggregator. 
-func (r *RunningAggregator) Run( - acc telegraf.Accumulator, - shutdown chan struct{}, -) { - // The start of the period is truncated to the nearest second. - // - // Every metric then gets it's timestamp checked and is dropped if it - // is not within: - // - // start < t < end + truncation + delay - // - // So if we start at now = 00:00.2 with a 10s period and 0.3s delay: - // now = 00:00.2 - // start = 00:00 - // truncation = 00:00.2 - // end = 00:10 - // 1st interval: 00:00 - 00:10.5 - // 2nd interval: 00:10 - 00:20.5 - // etc. - // - now := time.Now() - r.periodStart = now.Truncate(time.Second) - truncation := now.Sub(r.periodStart) - r.periodEnd = r.periodStart.Add(r.Config.Period) - time.Sleep(r.Config.Delay) - periodT := time.NewTicker(r.Config.Period) - defer periodT.Stop() - - for { - select { - case <-shutdown: - if len(r.metrics) > 0 { - // wait until metrics are flushed before exiting - continue - } - return - case m := <-r.metrics: - if m.Time().Before(r.periodStart) || - m.Time().After(r.periodEnd.Add(truncation).Add(r.Config.Delay)) { - // the metric is outside the current aggregation period, so - // skip it. - log.Printf("D! 
aggregator: metric \"%s\" is not in the current timewindow, skipping", m.Name()) - continue - } - r.add(m) - case <-periodT.C: - r.periodStart = r.periodEnd - r.periodEnd = r.periodStart.Add(r.Config.Period) - r.push(acc) - r.reset() - } - } + start := time.Now() + r.Aggregator.Push(acc) + elapsed := time.Since(start) + r.PushTime.Incr(elapsed.Nanoseconds()) } diff --git a/internal/models/running_aggregator_test.go b/internal/models/running_aggregator_test.go index 34d513646..2212829f9 100644 --- a/internal/models/running_aggregator_test.go +++ b/internal/models/running_aggregator_test.go @@ -1,16 +1,13 @@ package models import ( - "sync" "sync/atomic" "testing" "time" "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/metric" "github.com/influxdata/telegraf/testutil" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -23,28 +20,24 @@ func TestAdd(t *testing.T) { }, Period: time.Millisecond * 500, }) - assert.NoError(t, ra.Config.Filter.Compile()) + require.NoError(t, ra.Config.Filter.Compile()) acc := testutil.Accumulator{} - go ra.Run(&acc, make(chan struct{})) - m, err := metric.New("RITest", + now := time.Now() + ra.SetPeriodStart(now) + + m := testutil.MustMetric("RITest", map[string]string{}, map[string]interface{}{ "value": int64(101), }, time.Now().Add(time.Millisecond*150), telegraf.Untyped) - require.NoError(t, err) + require.False(t, ra.Add(m)) + ra.Push(&acc) - assert.False(t, ra.Add(m)) - - for { - time.Sleep(time.Millisecond) - if atomic.LoadInt64(&a.sum) > 0 { - break - } - } - assert.Equal(t, int64(101), atomic.LoadInt64(&a.sum)) + require.Equal(t, 1, len(acc.Metrics)) + require.Equal(t, int64(101), acc.Metrics[0].Fields["sum"]) } func TestAddMetricsOutsideCurrentPeriod(t *testing.T) { @@ -56,50 +49,45 @@ func TestAddMetricsOutsideCurrentPeriod(t *testing.T) { }, Period: time.Millisecond * 500, }) - assert.NoError(t, ra.Config.Filter.Compile()) + require.NoError(t, ra.Config.Filter.Compile()) acc := 
testutil.Accumulator{} - go ra.Run(&acc, make(chan struct{})) + now := time.Now() + ra.SetPeriodStart(now) - m, err := metric.New("RITest", + m := testutil.MustMetric("RITest", map[string]string{}, map[string]interface{}{ "value": int64(101), }, - time.Now().Add(-time.Hour), - telegraf.Untyped) - require.NoError(t, err) - - assert.False(t, ra.Add(m)) + now.Add(-time.Hour), + telegraf.Untyped, + ) + require.False(t, ra.Add(m)) // metric after current period - m, err = metric.New("RITest", + m = testutil.MustMetric("RITest", map[string]string{}, map[string]interface{}{ "value": int64(101), }, - time.Now().Add(time.Hour), - telegraf.Untyped) - require.NoError(t, err) - assert.False(t, ra.Add(m)) + now.Add(time.Hour), + telegraf.Untyped, + ) + require.False(t, ra.Add(m)) // "now" metric - m, err = metric.New("RITest", + m = testutil.MustMetric("RITest", map[string]string{}, map[string]interface{}{ "value": int64(101), }, time.Now().Add(time.Millisecond*50), telegraf.Untyped) - require.NoError(t, err) - assert.False(t, ra.Add(m)) + require.False(t, ra.Add(m)) - for { - time.Sleep(time.Millisecond) - if atomic.LoadInt64(&a.sum) > 0 { - break - } - } - assert.Equal(t, int64(101), atomic.LoadInt64(&a.sum)) + ra.Push(&acc) + require.Equal(t, 1, len(acc.Metrics)) + require.Equal(t, int64(101), acc.Metrics[0].Fields["sum"]) } func TestAddAndPushOnePeriod(t *testing.T) { @@ -111,37 +99,24 @@ func TestAddAndPushOnePeriod(t *testing.T) { }, Period: time.Millisecond * 500, }) - assert.NoError(t, ra.Config.Filter.Compile()) + require.NoError(t, ra.Config.Filter.Compile()) acc := testutil.Accumulator{} - shutdown := make(chan struct{}) - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - ra.Run(&acc, shutdown) - }() + now := time.Now() + ra.SetPeriodStart(now) - m, err := metric.New("RITest", + m := testutil.MustMetric("RITest", map[string]string{}, map[string]interface{}{ "value": int64(101), }, time.Now().Add(time.Millisecond*100), telegraf.Untyped) - 
require.NoError(t, err) - assert.False(t, ra.Add(m)) + require.False(t, ra.Add(m)) + + ra.Push(&acc) - for { - time.Sleep(time.Millisecond) - if acc.NMetrics() > 0 { - break - } - } acc.AssertContainsFields(t, "TestMetric", map[string]interface{}{"sum": int64(101)}) - - close(shutdown) - wg.Wait() } func TestAddDropOriginal(t *testing.T) { @@ -152,28 +127,29 @@ func TestAddDropOriginal(t *testing.T) { }, DropOriginal: true, }) - assert.NoError(t, ra.Config.Filter.Compile()) + require.NoError(t, ra.Config.Filter.Compile()) - m, err := metric.New("RITest", + now := time.Now() + ra.SetPeriodStart(now) + + m := testutil.MustMetric("RITest", map[string]string{}, map[string]interface{}{ "value": int64(101), }, - time.Now(), + now, telegraf.Untyped) - require.NoError(t, err) - assert.True(t, ra.Add(m)) + require.True(t, ra.Add(m)) // this metric name doesn't match the filter, so Add will return false - m2, err := metric.New("foobar", + m2 := testutil.MustMetric("foobar", map[string]string{}, map[string]interface{}{ "value": int64(101), }, - time.Now(), + now, telegraf.Untyped) - require.NoError(t, err) - assert.False(t, ra.Add(m2)) + require.False(t, ra.Add(m2)) } type TestAggregator struct { diff --git a/internal/models/running_input.go b/internal/models/running_input.go index fce2437ca..0775d5c5d 100644 --- a/internal/models/running_input.go +++ b/internal/models/running_input.go @@ -1,11 +1,9 @@ package models import ( - "fmt" "time" "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/plugins/serializers/influx" "github.com/influxdata/telegraf/selfstat" ) @@ -15,16 +13,13 @@ type RunningInput struct { Input telegraf.Input Config *InputConfig - trace bool defaultTags map[string]string MetricsGathered selfstat.Stat + GatherTime selfstat.Stat } -func NewRunningInput( - input telegraf.Input, - config *InputConfig, -) *RunningInput { +func NewRunningInput(input telegraf.Input, config *InputConfig) *RunningInput { return &RunningInput{ Input: input, Config: 
config, @@ -33,6 +28,11 @@ func NewRunningInput( "metrics_gathered", map[string]string{"input": config.Name}, ), + GatherTime: selfstat.RegisterTiming( + "gather", + "gather_time_ns", + map[string]string{"input": config.Name}, + ), } } @@ -52,13 +52,19 @@ func (r *RunningInput) Name() string { return "inputs." + r.Config.Name } +func (r *RunningInput) metricFiltered(metric telegraf.Metric) { + metric.Drop() +} + func (r *RunningInput) MakeMetric(metric telegraf.Metric) telegraf.Metric { if ok := r.Config.Filter.Select(metric); !ok { + r.metricFiltered(metric) return nil } r.Config.Filter.Modify(metric) if len(metric.FieldList()) == 0 { + r.metricFiltered(metric) return nil } @@ -70,26 +76,17 @@ func (r *RunningInput) MakeMetric(metric telegraf.Metric) telegraf.Metric { r.Config.Tags, r.defaultTags) - if r.trace && m != nil { - s := influx.NewSerializer() - s.SetFieldSortOrder(influx.SortFields) - octets, err := s.Serialize(m) - if err == nil { - fmt.Print("> " + string(octets)) - } - } - r.MetricsGathered.Incr(1) GlobalMetricsGathered.Incr(1) return m } -func (r *RunningInput) Trace() bool { - return r.trace -} - -func (r *RunningInput) SetTrace(trace bool) { - r.trace = trace +func (r *RunningInput) Gather(acc telegraf.Accumulator) error { + start := time.Now() + err := r.Input.Gather(acc) + elapsed := time.Since(start) + r.GatherTime.Incr(elapsed.Nanoseconds()) + return err } func (r *RunningInput) SetDefaultTags(tags map[string]string) { diff --git a/internal/models/running_input_test.go b/internal/models/running_input_test.go index b83f75ea9..898007e61 100644 --- a/internal/models/running_input_test.go +++ b/internal/models/running_input_test.go @@ -6,6 +6,7 @@ import ( "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/metric" + "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -66,17 +67,13 @@ func TestMakeMetricWithPluginTags(t *testing.T) { }, }) - ri.SetTrace(true) - 
assert.Equal(t, true, ri.Trace()) - - m, err := metric.New("RITest", + m := testutil.MustMetric("RITest", map[string]string{}, map[string]interface{}{ "value": int64(101), }, now, telegraf.Untyped) - require.NoError(t, err) m = ri.MakeMetric(m) expected, err := metric.New("RITest", @@ -102,8 +99,6 @@ func TestMakeMetricFilteredOut(t *testing.T) { Filter: Filter{NamePass: []string{"foobar"}}, }) - ri.SetTrace(true) - assert.Equal(t, true, ri.Trace()) assert.NoError(t, ri.Config.Filter.Compile()) m, err := metric.New("RITest", @@ -127,17 +122,13 @@ func TestMakeMetricWithDaemonTags(t *testing.T) { "foo": "bar", }) - ri.SetTrace(true) - assert.Equal(t, true, ri.Trace()) - - m, err := metric.New("RITest", + m := testutil.MustMetric("RITest", map[string]string{}, map[string]interface{}{ "value": int64(101), }, now, telegraf.Untyped) - require.NoError(t, err) m = ri.MakeMetric(m) expected, err := metric.New("RITest", map[string]string{ diff --git a/internal/models/running_output.go b/internal/models/running_output.go index 0f2c138a6..8d7d9854b 100644 --- a/internal/models/running_output.go +++ b/internal/models/running_output.go @@ -6,7 +6,6 @@ import ( "time" "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/internal/buffer" "github.com/influxdata/telegraf/selfstat" ) @@ -18,6 +17,16 @@ const ( DEFAULT_METRIC_BUFFER_LIMIT = 10000 ) +// OutputConfig containing name and filter +type OutputConfig struct { + Name string + Filter Filter + + FlushInterval time.Duration + MetricBufferLimit int + MetricBatchSize int +} + // RunningOutput contains the output configuration type RunningOutput struct { Name string @@ -27,24 +36,16 @@ type RunningOutput struct { MetricBatchSize int MetricsFiltered selfstat.Stat - MetricsWritten selfstat.Stat BufferSize selfstat.Stat BufferLimit selfstat.Stat WriteTime selfstat.Stat - metrics *buffer.Buffer - failMetrics *buffer.Buffer + batch []telegraf.Metric + buffer *Buffer + BatchReady chan time.Time - // Guards against 
concurrent calls to Add, Push, Reset - aggMutex sync.Mutex - // Guards against concurrent calls to the Output as described in #3009 - writeMutex sync.Mutex -} - -// OutputConfig containing name and filter -type OutputConfig struct { - Name string - Filter Filter + aggMutex sync.Mutex + batchMutex sync.Mutex } func NewRunningOutput( @@ -54,25 +55,27 @@ func NewRunningOutput( batchSize int, bufferLimit int, ) *RunningOutput { + if conf.MetricBufferLimit > 0 { + bufferLimit = conf.MetricBufferLimit + } if bufferLimit == 0 { bufferLimit = DEFAULT_METRIC_BUFFER_LIMIT } + if conf.MetricBatchSize > 0 { + batchSize = conf.MetricBatchSize + } if batchSize == 0 { batchSize = DEFAULT_METRIC_BATCH_SIZE } ro := &RunningOutput{ Name: name, - metrics: buffer.NewBuffer(batchSize), - failMetrics: buffer.NewBuffer(bufferLimit), + batch: make([]telegraf.Metric, 0, batchSize), + buffer: NewBuffer(name, bufferLimit), + BatchReady: make(chan time.Time, 1), Output: output, Config: conf, MetricBufferLimit: bufferLimit, MetricBatchSize: batchSize, - MetricsWritten: selfstat.Register( - "write", - "metrics_written", - map[string]string{"output": name}, - ), MetricsFiltered: selfstat.Register( "write", "metrics_filtered", @@ -94,20 +97,28 @@ func NewRunningOutput( map[string]string{"output": name}, ), } + ro.BufferLimit.Set(int64(ro.MetricBufferLimit)) return ro } -// AddMetric adds a metric to the output. This function can also write cached -// points if FlushBufferWhenFull is true. +func (ro *RunningOutput) metricFiltered(metric telegraf.Metric) { + ro.MetricsFiltered.Incr(1) + metric.Drop() +} + +// AddMetric adds a metric to the output. 
+// +// Takes ownership of metric func (ro *RunningOutput) AddMetric(metric telegraf.Metric) { if ok := ro.Config.Filter.Select(metric); !ok { - ro.MetricsFiltered.Incr(1) + ro.metricFiltered(metric) return } ro.Config.Filter.Modify(metric) if len(metric.FieldList()) == 0 { + ro.metricFiltered(metric) return } @@ -118,85 +129,98 @@ func (ro *RunningOutput) AddMetric(metric telegraf.Metric) { return } - ro.metrics.Add(metric) - if ro.metrics.Len() == ro.MetricBatchSize { - batch := ro.metrics.Batch(ro.MetricBatchSize) - err := ro.write(batch) - if err != nil { - ro.failMetrics.Add(batch...) - log.Printf("E! Error writing to output [%s]: %v", ro.Name, err) + ro.batchMutex.Lock() + + ro.batch = append(ro.batch, metric) + if len(ro.batch) == ro.MetricBatchSize { + ro.addBatchToBuffer() + + nBuffer := ro.buffer.Len() + ro.BufferSize.Set(int64(nBuffer)) + + select { + case ro.BatchReady <- time.Now(): + default: } } + + ro.batchMutex.Unlock() } -// Write writes all cached points to this output. +// addBatchToBuffer moves the metrics from the batch into the metric buffer. +func (ro *RunningOutput) addBatchToBuffer() { + ro.buffer.Add(ro.batch...) + ro.batch = ro.batch[:0] +} + +// Write writes all metrics to the output, stopping when all have been sent or +// on error. func (ro *RunningOutput) Write() error { if output, ok := ro.Output.(telegraf.AggregatingOutput); ok { ro.aggMutex.Lock() metrics := output.Push() - ro.metrics.Add(metrics...) + ro.buffer.Add(metrics...) output.Reset() ro.aggMutex.Unlock() } + // add and write can be called concurrently + ro.batchMutex.Lock() + ro.addBatchToBuffer() + ro.batchMutex.Unlock() - nFails, nMetrics := ro.failMetrics.Len(), ro.metrics.Len() - ro.BufferSize.Set(int64(nFails + nMetrics)) - log.Printf("D! Output [%s] buffer fullness: %d / %d metrics. ", - ro.Name, nFails+nMetrics, ro.MetricBufferLimit) - var err error - if !ro.failMetrics.IsEmpty() { - // how many batches of failed writes we need to write.
- nBatches := nFails/ro.MetricBatchSize + 1 - batchSize := ro.MetricBatchSize + nBuffer := ro.buffer.Len() - for i := 0; i < nBatches; i++ { - // If it's the last batch, only grab the metrics that have not had - // a write attempt already (this is primarily to preserve order). - if i == nBatches-1 { - batchSize = nFails % ro.MetricBatchSize - } - batch := ro.failMetrics.Batch(batchSize) - // If we've already failed previous writes, don't bother trying to - // write to this output again. We are not exiting the loop just so - // that we can rotate the metrics to preserve order. - if err == nil { - err = ro.write(batch) - } - if err != nil { - ro.failMetrics.Add(batch...) - } + // Only process the metrics in the buffer now. Metrics added while we are + // writing will be sent on the next call. + nBatches := nBuffer/ro.MetricBatchSize + 1 + for i := 0; i < nBatches; i++ { + batch := ro.buffer.Batch(ro.MetricBatchSize) + if len(batch) == 0 { + break } - } - batch := ro.metrics.Batch(ro.MetricBatchSize) - // see comment above about not trying to write to an already failed output. - // if ro.failMetrics is empty then err will always be nil at this point. - if err == nil { - err = ro.write(batch) - } - - if err != nil { - ro.failMetrics.Add(batch...) - return err + err := ro.write(batch) + if err != nil { + ro.buffer.Reject(batch) + return err + } + ro.buffer.Accept(batch) } return nil } -func (ro *RunningOutput) write(metrics []telegraf.Metric) error { - nMetrics := len(metrics) - if nMetrics == 0 { +// WriteBatch writes only the batch metrics to the output. 
+func (ro *RunningOutput) WriteBatch() error { + batch := ro.buffer.Batch(ro.MetricBatchSize) + if len(batch) == 0 { return nil } - ro.writeMutex.Lock() - defer ro.writeMutex.Unlock() + + err := ro.write(batch) + if err != nil { + ro.buffer.Reject(batch) + return err + } + ro.buffer.Accept(batch) + + return nil +} + +func (ro *RunningOutput) write(metrics []telegraf.Metric) error { start := time.Now() err := ro.Output.Write(metrics) elapsed := time.Since(start) + ro.WriteTime.Incr(elapsed.Nanoseconds()) + if err == nil { - log.Printf("D! Output [%s] wrote batch of %d metrics in %s\n", - ro.Name, nMetrics, elapsed) - ro.MetricsWritten.Incr(int64(nMetrics)) - ro.WriteTime.Incr(elapsed.Nanoseconds()) + log.Printf("D! [outputs.%s] wrote batch of %d metrics in %s\n", + ro.Name, len(metrics), elapsed) } return err } + +func (ro *RunningOutput) LogBufferStatus() { + nBuffer := ro.buffer.Len() + log.Printf("D! [outputs.%s] buffer fullness: %d / %d metrics. ", + ro.Name, nBuffer, ro.MetricBufferLimit) +} diff --git a/internal/models/running_output_test.go b/internal/models/running_output_test.go index c55334218..fe8755395 100644 --- a/internal/models/running_output_test.go +++ b/internal/models/running_output_test.go @@ -231,56 +231,6 @@ func TestRunningOutputDefault(t *testing.T) { assert.Len(t, m.Metrics(), 10) } -// Test that running output doesn't flush until it's full when -// FlushBufferWhenFull is set. 
-func TestRunningOutputFlushWhenFull(t *testing.T) { - conf := &OutputConfig{ - Filter: Filter{}, - } - - m := &mockOutput{} - ro := NewRunningOutput("test", m, conf, 6, 10) - - // Fill buffer to 1 under limit - for _, metric := range first5 { - ro.AddMetric(metric) - } - // no flush yet - assert.Len(t, m.Metrics(), 0) - - // add one more metric - ro.AddMetric(next5[0]) - // now it flushed - assert.Len(t, m.Metrics(), 6) - - // add one more metric and write it manually - ro.AddMetric(next5[1]) - err := ro.Write() - assert.NoError(t, err) - assert.Len(t, m.Metrics(), 7) -} - -// Test that running output doesn't flush until it's full when -// FlushBufferWhenFull is set, twice. -func TestRunningOutputMultiFlushWhenFull(t *testing.T) { - conf := &OutputConfig{ - Filter: Filter{}, - } - - m := &mockOutput{} - ro := NewRunningOutput("test", m, conf, 4, 12) - - // Fill buffer past limit twive - for _, metric := range first5 { - ro.AddMetric(metric) - } - for _, metric := range next5 { - ro.AddMetric(metric) - } - // flushed twice - assert.Len(t, m.Metrics(), 8) -} - func TestRunningOutputWriteFail(t *testing.T) { conf := &OutputConfig{ Filter: Filter{}, diff --git a/internal/models/running_processor.go b/internal/models/running_processor.go index a210d9799..38369d03b 100644 --- a/internal/models/running_processor.go +++ b/internal/models/running_processor.go @@ -27,6 +27,19 @@ type ProcessorConfig struct { Filter Filter } +func (rp *RunningProcessor) metricFiltered(metric telegraf.Metric) { + metric.Drop() +} + +func containsMetric(item telegraf.Metric, metrics []telegraf.Metric) bool { + for _, m := range metrics { + if item == m { + return true + } + } + return false +} + func (rp *RunningProcessor) Apply(in ...telegraf.Metric) []telegraf.Metric { rp.Lock() defer rp.Unlock() @@ -43,6 +56,7 @@ func (rp *RunningProcessor) Apply(in ...telegraf.Metric) []telegraf.Metric { rp.Config.Filter.Modify(metric) if len(metric.FieldList()) == 0 { + rp.metricFiltered(metric) continue 
} diff --git a/internal/models/running_processor_test.go b/internal/models/running_processor_test.go index 02db40460..c24347b8e 100644 --- a/internal/models/running_processor_test.go +++ b/internal/models/running_processor_test.go @@ -6,7 +6,7 @@ import ( "time" "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/metric" + "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/require" ) @@ -41,20 +41,6 @@ func TagProcessor(key, value string) *MockProcessor { } } -func Metric( - name string, - tags map[string]string, - fields map[string]interface{}, - tm time.Time, - tp ...telegraf.ValueType, -) telegraf.Metric { - m, err := metric.New(name, tags, fields, tm, tp...) - if err != nil { - panic(err) - } - return m -} - func TestRunningProcessor_Apply(t *testing.T) { type args struct { Processor telegraf.Processor @@ -76,7 +62,7 @@ func TestRunningProcessor_Apply(t *testing.T) { }, }, input: []telegraf.Metric{ - Metric( + testutil.MustMetric( "cpu", map[string]string{}, map[string]interface{}{ @@ -86,7 +72,7 @@ func TestRunningProcessor_Apply(t *testing.T) { ), }, expected: []telegraf.Metric{ - Metric( + testutil.MustMetric( "cpu", map[string]string{ "apply": "true", @@ -109,7 +95,7 @@ func TestRunningProcessor_Apply(t *testing.T) { }, }, input: []telegraf.Metric{ - Metric( + testutil.MustMetric( "cpu", map[string]string{}, map[string]interface{}{ @@ -119,7 +105,7 @@ func TestRunningProcessor_Apply(t *testing.T) { ), }, expected: []telegraf.Metric{ - Metric( + testutil.MustMetric( "cpu", map[string]string{ "apply": "true", @@ -142,7 +128,7 @@ func TestRunningProcessor_Apply(t *testing.T) { }, }, input: []telegraf.Metric{ - Metric( + testutil.MustMetric( "cpu", map[string]string{}, map[string]interface{}{ @@ -152,7 +138,7 @@ func TestRunningProcessor_Apply(t *testing.T) { ), }, expected: []telegraf.Metric{ - Metric( + testutil.MustMetric( "cpu", map[string]string{}, map[string]interface{}{ diff --git a/metric.go b/metric.go index 
b8da02931..396321e6e 100644 --- a/metric.go +++ b/metric.go @@ -62,6 +62,17 @@ type Metric interface { // Copy returns a deep copy of the Metric. Copy() Metric + // Accept marks the metric as processed successfully and written to an + // output. + Accept() + + // Reject marks the metric as processed unsuccessfully. + Reject() + + // Drop marks the metric as processed successfully without being written + // to any output. + Drop() + // Mark Metric as an aggregate SetAggregate(bool) IsAggregate() bool diff --git a/metric/metric.go b/metric/metric.go index 9f1a42ccb..f2a49957e 100644 --- a/metric/metric.go +++ b/metric/metric.go @@ -248,6 +248,15 @@ func (m *metric) HashID() uint64 { return h.Sum64() } +func (m *metric) Accept() { +} + +func (m *metric) Reject() { +} + +func (m *metric) Drop() { +} + // Convert field to a supported type or nil if unconvertible func convertField(v interface{}) interface{} { switch v := v.(type) { diff --git a/metric/tracking.go b/metric/tracking.go new file mode 100644 index 000000000..83c3c7aec --- /dev/null +++ b/metric/tracking.go @@ -0,0 +1,171 @@ +package metric + +import ( + "log" + "runtime" + "sync/atomic" + + "github.com/influxdata/telegraf" +) + +// NotifyFunc is called when a tracking metric is done being processed with +// the tracking information. +type NotifyFunc = func(track telegraf.DeliveryInfo) + +// WithTracking adds tracking to the metric and registers the notify function +// to be called when processing is complete. +func WithTracking(metric telegraf.Metric, fn NotifyFunc) (telegraf.Metric, telegraf.TrackingID) { + return newTrackingMetric(metric, fn) +} + +// WithGroupTracking adds tracking to the metrics and registers the notify +// function to be called when processing is complete.
+func WithGroupTracking(metric []telegraf.Metric, fn NotifyFunc) ([]telegraf.Metric, telegraf.TrackingID) { + return newTrackingMetricGroup(metric, fn) +} + +func EnableDebugFinalizer() { + finalizer = debugFinalizer +} + +var ( + lastID uint64 + finalizer func(*trackingData) +) + +func newTrackingID() telegraf.TrackingID { + atomic.AddUint64(&lastID, 1) + return telegraf.TrackingID(lastID) +} + +func debugFinalizer(d *trackingData) { + rc := atomic.LoadInt32(&d.rc) + if rc != 0 { + log.Fatalf("E! [agent] metric collected with non-zero reference count rc: %d", rc) + } +} + +type trackingData struct { + id telegraf.TrackingID + rc int32 + acceptCount int32 + rejectCount int32 + notify NotifyFunc +} + +func (d *trackingData) incr() { + atomic.AddInt32(&d.rc, 1) +} + +func (d *trackingData) decr() int32 { + return atomic.AddInt32(&d.rc, -1) +} + +func (d *trackingData) accept() { + atomic.AddInt32(&d.acceptCount, 1) +} + +func (d *trackingData) reject() { + atomic.AddInt32(&d.rejectCount, 1) +} + +type trackingMetric struct { + telegraf.Metric + d *trackingData +} + +func newTrackingMetric(metric telegraf.Metric, fn NotifyFunc) (telegraf.Metric, telegraf.TrackingID) { + m := &trackingMetric{ + Metric: metric, + d: &trackingData{ + id: newTrackingID(), + rc: 1, + acceptCount: 0, + rejectCount: 0, + notify: fn, + }, + } + + if finalizer != nil { + runtime.SetFinalizer(m.d, finalizer) + } + return m, m.d.id +} + +func newTrackingMetricGroup(group []telegraf.Metric, fn NotifyFunc) ([]telegraf.Metric, telegraf.TrackingID) { + d := &trackingData{ + id: newTrackingID(), + rc: 0, + acceptCount: 0, + rejectCount: 0, + notify: fn, + } + + for i, m := range group { + d.incr() + dm := &trackingMetric{ + Metric: m, + d: d, + } + group[i] = dm + + } + if finalizer != nil { + runtime.SetFinalizer(d, finalizer) + } + + return group, d.id +} + +func (m *trackingMetric) Copy() telegraf.Metric { + m.d.incr() + return &trackingMetric{ + Metric: m.Metric.Copy(), + d: m.d, + } +} + +func 
(m *trackingMetric) Accept() { + m.d.accept() + m.decr() +} + +func (m *trackingMetric) Reject() { + m.d.reject() + m.decr() +} + +func (m *trackingMetric) Drop() { + m.decr() +} + +func (m *trackingMetric) decr() { + v := m.d.decr() + if v < 0 { + panic("negative refcount") + } + + if v == 0 { + m.d.notify( + &deliveryInfo{ + id: m.d.id, + accepted: int(m.d.acceptCount), + rejected: int(m.d.rejectCount), + }, + ) + } +} + +type deliveryInfo struct { + id telegraf.TrackingID + accepted int + rejected int +} + +func (r *deliveryInfo) ID() telegraf.TrackingID { + return r.id +} + +func (r *deliveryInfo) Delivered() bool { + return r.rejected == 0 +} diff --git a/metric/tracking_test.go b/metric/tracking_test.go new file mode 100644 index 000000000..f950cfcd1 --- /dev/null +++ b/metric/tracking_test.go @@ -0,0 +1,260 @@ +package metric + +import ( + "testing" + "time" + + "github.com/influxdata/telegraf" + "github.com/stretchr/testify/require" +) + +func mustMetric( + name string, + tags map[string]string, + fields map[string]interface{}, + tm time.Time, + tp ...telegraf.ValueType, +) telegraf.Metric { + m, err := New(name, tags, fields, tm, tp...) 
+ if err != nil { + panic("mustMetric") + } + return m +} + +type deliveries struct { + Info map[telegraf.TrackingID]telegraf.DeliveryInfo +} + +func (d *deliveries) onDelivery(info telegraf.DeliveryInfo) { + d.Info[info.ID()] = info +} + +func TestTracking(t *testing.T) { + tests := []struct { + name string + metric telegraf.Metric + actions func(metric telegraf.Metric) + delivered bool + }{ + { + name: "accept", + metric: mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + actions: func(m telegraf.Metric) { + m.Accept() + }, + delivered: true, + }, + { + name: "reject", + metric: mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + actions: func(m telegraf.Metric) { + m.Reject() + }, + delivered: false, + }, + { + name: "accept copy", + metric: mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + actions: func(m telegraf.Metric) { + m2 := m.Copy() + m.Accept() + m2.Accept() + }, + delivered: true, + }, + { + name: "copy with accept and done", + metric: mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + actions: func(m telegraf.Metric) { + m2 := m.Copy() + m.Accept() + m2.Drop() + }, + delivered: true, + }, + { + name: "copy with mixed delivery", + metric: mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + actions: func(m telegraf.Metric) { + m2 := m.Copy() + m.Accept() + m2.Reject() + }, + delivered: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + d := &deliveries{ + Info: make(map[telegraf.TrackingID]telegraf.DeliveryInfo), + } + metric, id := WithTracking(tt.metric, d.onDelivery) + tt.actions(metric) + + info := d.Info[id] + require.Equal(t, tt.delivered, info.Delivered()) + }) + } +} + +func 
TestGroupTracking(t *testing.T) { + tests := []struct { + name string + metrics []telegraf.Metric + actions func(metrics []telegraf.Metric) + delivered bool + }{ + { + name: "accept", + metrics: []telegraf.Metric{ + mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + }, + actions: func(metrics []telegraf.Metric) { + metrics[0].Accept() + metrics[1].Accept() + }, + delivered: true, + }, + { + name: "reject", + metrics: []telegraf.Metric{ + mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + }, + actions: func(metrics []telegraf.Metric) { + metrics[0].Reject() + metrics[1].Reject() + }, + delivered: false, + }, + { + name: "remove", + metrics: []telegraf.Metric{ + mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + }, + actions: func(metrics []telegraf.Metric) { + metrics[0].Drop() + metrics[1].Drop() + }, + delivered: true, + }, + { + name: "mixed", + metrics: []telegraf.Metric{ + mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + mustMetric( + "cpu", + map[string]string{}, + map[string]interface{}{ + "value": 42, + }, + time.Unix(0, 0), + ), + }, + actions: func(metrics []telegraf.Metric) { + metrics[0].Accept() + metrics[1].Reject() + }, + delivered: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + d := &deliveries{ + Info: make(map[telegraf.TrackingID]telegraf.DeliveryInfo), + } + metrics, id := WithGroupTracking(tt.metrics, 
d.onDelivery) + tt.actions(metrics) + + info := d.Info[id] + require.Equal(t, tt.delivered, info.Delivered()) + }) + } +} diff --git a/output.go b/output.go index 2421048f0..3c4a85ddb 100644 --- a/output.go +++ b/output.go @@ -17,16 +17,7 @@ type Output interface { // if the Output only accepts a fixed set of aggregations over a time period. // These functions may be called concurrently to the Write function. type AggregatingOutput interface { - // Connect to the Output - Connect() error - // Close any connections to the Output - Close() error - // Description returns a one-sentence description on the Output - Description() string - // SampleConfig returns the default configuration of the Output - SampleConfig() string - // Write takes in group of points to be written to the Output - Write(metrics []Metric) error + Output // Add the metric to the aggregator Add(in Metric) @@ -35,21 +26,3 @@ type AggregatingOutput interface { // Reset signals the the aggregator period is completed. Reset() } - -type ServiceOutput interface { - // Connect to the Output - Connect() error - // Close any connections to the Output - Close() error - // Description returns a one-sentence description on the Output - Description() string - // SampleConfig returns the default configuration of the Output - SampleConfig() string - // Write takes in group of points to be written to the Output - Write(metrics []Metric) error - - // Start the "service" that will provide an Output - Start() error - // Stop the "service" that will provide an Output - Stop() -} diff --git a/plugins/aggregators/basicstats/basicstats.go b/plugins/aggregators/basicstats/basicstats.go index 701cd8a85..c5c7e5d3f 100644 --- a/plugins/aggregators/basicstats/basicstats.go +++ b/plugins/aggregators/basicstats/basicstats.go @@ -133,7 +133,6 @@ func (m *BasicStats) Add(in telegraf.Metric) { } func (m *BasicStats) Push(acc telegraf.Accumulator) { - config := getConfiguredStats(m) for _, aggregate := range m.cache { diff --git 
a/plugins/inputs/amqp_consumer/README.md b/plugins/inputs/amqp_consumer/README.md index 133531421..ca1af800c 100644 --- a/plugins/inputs/amqp_consumer/README.md +++ b/plugins/inputs/amqp_consumer/README.md @@ -13,7 +13,6 @@ For an introduction to AMQP see: The following defaults are known to work with RabbitMQ: ```toml -# AMQP consumer plugin [[inputs.amqp_consumer]] ## Broker to consume from. ## deprecated in 1.7; use the brokers option @@ -46,16 +45,26 @@ The following defaults are known to work with RabbitMQ: ## AMQP queue name queue = "telegraf" - + ## AMQP queue durability can be "transient" or "durable". queue_durability = "durable" - + ## Binding Key binding_key = "#" ## Maximum number of messages server should give to the worker. # prefetch_count = 50 + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. + ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 + ## Auth method. 
PLAIN and EXTERNAL are supported ## Using EXTERNAL requires enabling the rabbitmq_auth_mechanism_ssl plugin as ## described here: https://www.rabbitmq.com/plugins.html diff --git a/plugins/inputs/amqp_consumer/amqp_consumer.go b/plugins/inputs/amqp_consumer/amqp_consumer.go index 33cd9971b..568ee6f38 100644 --- a/plugins/inputs/amqp_consumer/amqp_consumer.go +++ b/plugins/inputs/amqp_consumer/amqp_consumer.go @@ -1,6 +1,7 @@ package amqp_consumer import ( + "context" "errors" "fmt" "log" @@ -9,25 +10,32 @@ import ( "sync" "time" - "github.com/streadway/amqp" - "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/internal/tls" "github.com/influxdata/telegraf/plugins/inputs" "github.com/influxdata/telegraf/plugins/parsers" + "github.com/streadway/amqp" ) +const ( + defaultMaxUndeliveredMessages = 1000 +) + +type empty struct{} +type semaphore chan empty + // AMQPConsumer is the top level struct for this plugin type AMQPConsumer struct { - URL string `toml:"url"` // deprecated in 1.7; use brokers - Brokers []string `toml:"brokers"` - Username string `toml:"username"` - Password string `toml:"password"` - Exchange string `toml:"exchange"` - ExchangeType string `toml:"exchange_type"` - ExchangeDurability string `toml:"exchange_durability"` - ExchangePassive bool `toml:"exchange_passive"` - ExchangeArguments map[string]string `toml:"exchange_arguments"` + URL string `toml:"url"` // deprecated in 1.7; use brokers + Brokers []string `toml:"brokers"` + Username string `toml:"username"` + Password string `toml:"password"` + Exchange string `toml:"exchange"` + ExchangeType string `toml:"exchange_type"` + ExchangeDurability string `toml:"exchange_durability"` + ExchangePassive bool `toml:"exchange_passive"` + ExchangeArguments map[string]string `toml:"exchange_arguments"` + MaxUndeliveredMessages int `toml:"max_undelivered_messages"` // Queue Name Queue string `toml:"queue"` @@ -44,9 +52,12 @@ type AMQPConsumer struct { AuthMethod string tls.ClientConfig + 
deliveries map[telegraf.TrackingID]amqp.Delivery + parser parsers.Parser conn *amqp.Connection wg *sync.WaitGroup + cancel context.CancelFunc } type externalAuth struct{} @@ -114,6 +125,16 @@ func (a *AMQPConsumer) SampleConfig() string { ## Maximum number of messages server should give to the worker. # prefetch_count = 50 + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. + ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 + ## Auth method. PLAIN and EXTERNAL are supported ## Using EXTERNAL requires enabling the rabbitmq_auth_mechanism_ssl plugin as ## described here: https://www.rabbitmq.com/plugins.html @@ -185,9 +206,15 @@ func (a *AMQPConsumer) Start(acc telegraf.Accumulator) error { return err } + ctx, cancel := context.WithCancel(context.Background()) + a.cancel = cancel + a.wg = &sync.WaitGroup{} a.wg.Add(1) - go a.process(msgs, acc) + go func() { + defer a.wg.Done() + a.process(ctx, msgs, acc) + }() go func() { for { @@ -196,7 +223,7 @@ func (a *AMQPConsumer) Start(acc telegraf.Accumulator) error { break } - log.Printf("I! AMQP consumer connection closed: %s; trying to reconnect", err) + log.Printf("I! 
[inputs.amqp_consumer] connection closed: %s; trying to reconnect", err) for { msgs, err := a.connect(amqpConf) if err != nil { @@ -206,7 +233,10 @@ func (a *AMQPConsumer) Start(acc telegraf.Accumulator) error { } a.wg.Add(1) - go a.process(msgs, acc) + go func() { + defer a.wg.Done() + a.process(ctx, msgs, acc) + }() break } } @@ -224,14 +254,14 @@ func (a *AMQPConsumer) connect(amqpConf *amqp.Config) (<-chan amqp.Delivery, err p := rand.Perm(len(brokers)) for _, n := range p { broker := brokers[n] - log.Printf("D! [amqp_consumer] connecting to %q", broker) + log.Printf("D! [inputs.amqp_consumer] connecting to %q", broker) conn, err := amqp.DialConfig(broker, *amqpConf) if err == nil { a.conn = conn - log.Printf("D! [amqp_consumer] connected to %q", broker) + log.Printf("D! [inputs.amqp_consumer] connected to %q", broker) break } - log.Printf("D! [amqp_consumer] error connecting to %q", broker) + log.Printf("D! [inputs.amqp_consumer] error connecting to %q", broker) } if a.conn == nil { @@ -320,7 +350,6 @@ func (a *AMQPConsumer) connect(amqpConf *amqp.Config) (<-chan amqp.Delivery, err return nil, fmt.Errorf("Failed establishing connection to queue: %s", err) } - log.Println("I! Started AMQP consumer") return msgs, err } @@ -361,42 +390,101 @@ func declareExchange( } // Read messages from queue and add them to the Accumulator -func (a *AMQPConsumer) process(msgs <-chan amqp.Delivery, acc telegraf.Accumulator) { - defer a.wg.Done() - for d := range msgs { - metrics, err := a.parser.Parse(d.Body) - if err != nil { - log.Printf("E! 
%v: error parsing metric - %v", err, string(d.Body)) - } else { - for _, m := range metrics { - acc.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time()) +func (a *AMQPConsumer) process(ctx context.Context, msgs <-chan amqp.Delivery, ac telegraf.Accumulator) { + a.deliveries = make(map[telegraf.TrackingID]amqp.Delivery) + + acc := ac.WithTracking(a.MaxUndeliveredMessages) + sem := make(semaphore, a.MaxUndeliveredMessages) + + for { + select { + case <-ctx.Done(): + return + case track := <-acc.Delivered(): + if a.onDelivery(track) { + <-sem + } + case sem <- empty{}: + select { + case <-ctx.Done(): + return + case track := <-acc.Delivered(): + if a.onDelivery(track) { + <-sem + <-sem + } + case d, ok := <-msgs: + if !ok { + return + } + err := a.onMessage(acc, d) + if err != nil { + acc.AddError(err) + <-sem + } } } - - d.Ack(false) } - log.Printf("I! AMQP consumer queue closed") +} + +func (a *AMQPConsumer) onMessage(acc telegraf.TrackingAccumulator, d amqp.Delivery) error { + metrics, err := a.parser.Parse(d.Body) + if err != nil { + return err + } + + id := acc.AddTrackingMetricGroup(metrics) + a.deliveries[id] = d + return nil +} + +func (a *AMQPConsumer) onDelivery(track telegraf.DeliveryInfo) bool { + delivery, ok := a.deliveries[track.ID()] + if !ok { + // Added by a previous connection + return false + } + + if track.Delivered() { + err := delivery.Ack(false) + if err != nil { + log.Printf("E! [inputs.amqp_consumer] Unable to ack written delivery: %d: %v", + delivery.DeliveryTag, err) + a.conn.Close() + } + } else { + err := delivery.Reject(false) + if err != nil { + log.Printf("E! [inputs.amqp_consumer] Unable to reject failed delivery: %d: %v", + delivery.DeliveryTag, err) + a.conn.Close() + } + } + + delete(a.deliveries, track.ID()) + return true } func (a *AMQPConsumer) Stop() { + a.cancel() + a.wg.Wait() err := a.conn.Close() if err != nil && err != amqp.ErrClosed { - log.Printf("E! Error closing AMQP connection: %s", err) + log.Printf("E! 
[inputs.amqp_consumer] Error closing AMQP connection: %s", err) return } - a.wg.Wait() - log.Println("I! Stopped AMQP service") } func init() { inputs.Add("amqp_consumer", func() telegraf.Input { return &AMQPConsumer{ - URL: DefaultBroker, - AuthMethod: DefaultAuthMethod, - ExchangeType: DefaultExchangeType, - ExchangeDurability: DefaultExchangeDurability, - QueueDurability: DefaultQueueDurability, - PrefetchCount: DefaultPrefetchCount, + URL: DefaultBroker, + AuthMethod: DefaultAuthMethod, + ExchangeType: DefaultExchangeType, + ExchangeDurability: DefaultExchangeDurability, + QueueDurability: DefaultQueueDurability, + PrefetchCount: DefaultPrefetchCount, + MaxUndeliveredMessages: defaultMaxUndeliveredMessages, } }) } diff --git a/plugins/inputs/internal/README.md b/plugins/inputs/internal/README.md index fbec4d86f..73f0b018e 100644 --- a/plugins/inputs/internal/README.md +++ b/plugins/inputs/internal/README.md @@ -18,52 +18,54 @@ plugin. memstats are taken from the Go runtime: https://golang.org/pkg/runtime/#MemStats -- internal\_memstats - - alloc\_bytes +- internal_memstats + - alloc_bytes - frees - - heap\_alloc\_bytes - - heap\_idle\_bytes - - heap\_in\_use\_bytes - - heap\_objects\_bytes - - heap\_released\_bytes - - heap\_sys\_bytes + - heap_alloc_bytes + - heap_idle_bytes + - heap_in_use_bytes + - heap_objects_bytes + - heap_released_bytes + - heap_sys_bytes - mallocs - - num\_gc - - pointer\_lookups - - sys\_bytes - - total\_alloc\_bytes + - num_gc + - pointer_lookups + - sys_bytes + - total_alloc_bytes agent stats collect aggregate stats on all telegraf plugins. -- internal\_agent - - gather\_errors - - metrics\_dropped - - metrics\_gathered - - metrics\_written +- internal_agent + - gather_errors + - metrics_dropped + - metrics_gathered + - metrics_written -internal\_gather stats collect aggregate stats on all input plugins +internal_gather stats collect aggregate stats on all input plugins that are of the same input type. They are tagged with `input=`. 
-- internal\_gather - gather\_time\_ns - metrics\_gathered +- internal_gather + - gather_time_ns + - metrics_gathered -internal\_write stats collect aggregate stats on all output plugins +internal_write stats collect aggregate stats on all output plugins that are of the same input type. They are tagged with `output=`. -- internal\_write - buffer\_limit - buffer\_size - metrics\_written - metrics\_filtered - write\_time\_ns +- internal_write + - buffer_limit + - buffer_size + - metrics_added + - metrics_written + - metrics_dropped + - metrics_filtered + - write_time_ns -internal\_\<plugin\_name\> are metrics which are defined on a per-plugin basis, and +internal_<plugin_name> are metrics which are defined on a per-plugin basis, and usually contain tags which differentiate each instance of a particular type of plugin. -- internal\_\<plugin\_name\> +- internal_<plugin_name> - individual plugin-specific fields, such as requests counts. ### Tags: @@ -76,7 +78,7 @@ to each particular plugin. ``` internal_memstats,host=tyrion alloc_bytes=4457408i,sys_bytes=10590456i,pointer_lookups=7i,mallocs=17642i,frees=7473i,heap_sys_bytes=6848512i,heap_idle_bytes=1368064i,heap_in_use_bytes=5480448i,heap_released_bytes=0i,total_alloc_bytes=6875560i,heap_alloc_bytes=4457408i,heap_objects_bytes=10169i,num_gc=2i 1480682800000000000 internal_agent,host=tyrion metrics_written=18i,metrics_dropped=0i,metrics_gathered=19i,gather_errors=0i 1480682800000000000 -internal_write,output=file,host=tyrion buffer_limit=10000i,write_time_ns=636609i,metrics_written=18i,buffer_size=0i 1480682800000000000 +internal_write,output=file,host=tyrion buffer_limit=10000i,write_time_ns=636609i,metrics_added=18i,metrics_written=18i,buffer_size=0i 1480682800000000000 internal_gather,input=internal,host=tyrion metrics_gathered=19i,gather_time_ns=442114i 1480682800000000000 internal_gather,input=http_listener,host=tyrion metrics_gathered=0i,gather_time_ns=167285i 1480682800000000000 internal_http_listener,address=:8186,host=tyrion 
queries_received=0i,writes_received=0i,requests_received=0i,buffers_created=0i,requests_served=0i,pings_received=0i,bytes_received=0i,not_founds_served=0i,pings_served=0i,queries_served=0i,writes_served=0i 1480682800000000000 diff --git a/plugins/inputs/kafka_consumer/README.md b/plugins/inputs/kafka_consumer/README.md index 2bc290c6b..8922f5071 100644 --- a/plugins/inputs/kafka_consumer/README.md +++ b/plugins/inputs/kafka_consumer/README.md @@ -1,18 +1,14 @@ # Kafka Consumer Input Plugin -The [Kafka](http://kafka.apache.org/) consumer plugin polls a specified Kafka -topic and adds messages to InfluxDB. The plugin assumes messages follow the -line protocol. [Consumer Group](http://godoc.org/github.com/wvanbergen/kafka/consumergroup) -is used to talk to the Kafka cluster so multiple instances of telegraf can read -from the same topic in parallel. +The [Kafka][kafka] consumer plugin reads from Kafka +and creates metrics using one of the supported [input data formats][]. -For old kafka version (< 0.8), please use the kafka_consumer_legacy input plugin +For older Kafka versions (< 0.8), please use the [kafka_consumer_legacy][] input plugin and use the old zookeeper connection method. -## Configuration +### Configuration ```toml -# Read metrics from Kafka topic(s) [[inputs.kafka_consumer]] ## kafka servers brokers = ["localhost:9092"] @@ -44,18 +40,27 @@ and use the old zookeeper connection method. ## Offset (must be either "oldest" or "newest") offset = "oldest" + ## Maximum length of a message to consume, in bytes (default 0/unlimited); + ## larger messages are dropped + max_message_len = 1000000 + + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. 
+ ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 + ## Data format to consume. ## Each data format has its own unique set of configuration options, read ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "influx" - - ## Maximum length of a message to consume, in bytes (default 0/unlimited); - ## larger messages are dropped - max_message_len = 1000000 ``` -## Testing - -Running integration tests requires running Zookeeper & Kafka. See Makefile -for kafka container command. +[kafka]: https://kafka.apache.org +[kafka_consumer_legacy]: /plugins/inputs/kafka_consumer_legacy/README.md +[input data formats]: /docs/DATA_FORMATS_INPUT.md diff --git a/plugins/inputs/kafka_consumer/kafka_consumer.go b/plugins/inputs/kafka_consumer/kafka_consumer.go index eba9b68ac..31159def3 100644 --- a/plugins/inputs/kafka_consumer/kafka_consumer.go +++ b/plugins/inputs/kafka_consumer/kafka_consumer.go @@ -1,55 +1,54 @@ package kafka_consumer import ( + "context" "fmt" "log" "strings" "sync" + "github.com/Shopify/sarama" + cluster "github.com/bsm/sarama-cluster" "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/internal/tls" "github.com/influxdata/telegraf/plugins/inputs" "github.com/influxdata/telegraf/plugins/parsers" - - "github.com/Shopify/sarama" - cluster "github.com/bsm/sarama-cluster" ) +const ( + defaultMaxUndeliveredMessages = 1000 +) + +type empty struct{} +type semaphore chan empty + +type Consumer interface { + Errors() <-chan error + Messages() <-chan *sarama.ConsumerMessage + MarkOffset(msg *sarama.ConsumerMessage, metadata string) + Close() error +} + type Kafka struct { - ConsumerGroup string - ClientID string `toml:"client_id"` - 
Topics []string - Brokers []string - MaxMessageLen int - Version string `toml:"version"` - - Cluster *cluster.Consumer - + ConsumerGroup string `toml:"consumer_group"` + ClientID string `toml:"client_id"` + Topics []string `toml:"topics"` + Brokers []string `toml:"brokers"` + MaxMessageLen int `toml:"max_message_len"` + Version string `toml:"version"` + MaxUndeliveredMessages int `toml:"max_undelivered_messages"` + Offset string `toml:"offset"` + SASLUsername string `toml:"sasl_username"` + SASLPassword string `toml:"sasl_password"` tls.ClientConfig - // SASL Username - SASLUsername string `toml:"sasl_username"` - // SASL Password - SASLPassword string `toml:"sasl_password"` + cluster Consumer + parser parsers.Parser + wg *sync.WaitGroup + cancel context.CancelFunc - // Legacy metric buffer support - MetricBuffer int - // TODO remove PointBuffer, legacy support - PointBuffer int - - Offset string - parser parsers.Parser - - sync.Mutex - - // channel for all incoming kafka messages - in <-chan *sarama.ConsumerMessage - // channel for all kafka consumer errors - errs <-chan error - done chan struct{} - - // keep the accumulator internally: - acc telegraf.Accumulator + // Unconfirmed messages + messages map[telegraf.TrackingID]*sarama.ConsumerMessage // doNotCommitMsgs tells the parser not to call CommitUpTo on the consumer // this is mostly for test purposes, but there may be a use-case for it later. @@ -86,16 +85,25 @@ var sampleConfig = ` consumer_group = "telegraf_metrics_consumers" ## Offset (must be either "oldest" or "newest") offset = "oldest" + ## Maximum length of a message to consume, in bytes (default 0/unlimited); + ## larger messages are dropped + max_message_len = 1000000 + + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. 
+ ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 ## Data format to consume. ## Each data format has its own unique set of configuration options, read ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "influx" - - ## Maximum length of a message to consume, in bytes (default 0/unlimited); - ## larger messages are dropped - max_message_len = 1000000 ` func (k *Kafka) SampleConfig() string { @@ -111,12 +119,8 @@ func (k *Kafka) SetParser(parser parsers.Parser) { } func (k *Kafka) Start(acc telegraf.Accumulator) error { - k.Lock() - defer k.Unlock() var clusterErr error - k.acc = acc - config := cluster.NewConfig() if k.Version != "" { @@ -159,13 +163,13 @@ func (k *Kafka) Start(acc telegraf.Accumulator) error { case "newest": config.Consumer.Offsets.Initial = sarama.OffsetNewest default: - log.Printf("I! WARNING: Kafka consumer invalid offset '%s', using 'oldest'\n", + log.Printf("I! WARNING: Kafka consumer invalid offset '%s', using 'oldest'", k.Offset) config.Consumer.Offsets.Initial = sarama.OffsetOldest } - if k.Cluster == nil { - k.Cluster, clusterErr = cluster.NewConsumer( + if k.cluster == nil { + k.cluster, clusterErr = cluster.NewConsumer( k.Brokers, k.ConsumerGroup, k.Topics, @@ -173,67 +177,110 @@ func (k *Kafka) Start(acc telegraf.Accumulator) error { ) if clusterErr != nil { - log.Printf("E! Error when creating Kafka Consumer, brokers: %v, topics: %v\n", + log.Printf("E! 
Error when creating Kafka Consumer, brokers: %v, topics: %v", k.Brokers, k.Topics) return clusterErr } - - // Setup message and error channels - k.in = k.Cluster.Messages() - k.errs = k.Cluster.Errors() } - k.done = make(chan struct{}) - // Start the kafka message reader - go k.receiver() - log.Printf("I! Started the kafka consumer service, brokers: %v, topics: %v\n", + ctx, cancel := context.WithCancel(context.Background()) + k.cancel = cancel + + // Start consumer goroutine + k.wg = &sync.WaitGroup{} + k.wg.Add(1) + go func() { + defer k.wg.Done() + k.receiver(ctx, acc) + }() + + log.Printf("I! Started the kafka consumer service, brokers: %v, topics: %v", k.Brokers, k.Topics) return nil } // receiver() reads all incoming messages from the consumer, and parses them into // influxdb metric points. -func (k *Kafka) receiver() { +func (k *Kafka) receiver(ctx context.Context, ac telegraf.Accumulator) { + k.messages = make(map[telegraf.TrackingID]*sarama.ConsumerMessage) + + acc := ac.WithTracking(k.MaxUndeliveredMessages) + sem := make(semaphore, k.MaxUndeliveredMessages) + for { select { - case <-k.done: + case <-ctx.Done(): return - case err := <-k.errs: - if err != nil { - k.acc.AddError(fmt.Errorf("Consumer Error: %s\n", err)) - } - case msg := <-k.in: - if k.MaxMessageLen != 0 && len(msg.Value) > k.MaxMessageLen { - k.acc.AddError(fmt.Errorf("Message longer than max_message_len (%d > %d)", - len(msg.Value), k.MaxMessageLen)) - } else { - metrics, err := k.parser.Parse(msg.Value) + case track := <-acc.Delivered(): + <-sem + k.onDelivery(track) + case err := <-k.cluster.Errors(): + acc.AddError(err) + case sem <- empty{}: + select { + case <-ctx.Done(): + return + case track := <-acc.Delivered(): + // Once for the delivered message, once to leave the case + <-sem + <-sem + k.onDelivery(track) + case err := <-k.cluster.Errors(): + <-sem + acc.AddError(err) + case msg := <-k.cluster.Messages(): + err := k.onMessage(acc, msg) if err != nil { - 
k.acc.AddError(fmt.Errorf("Message Parse Error\nmessage: %s\nerror: %s", - string(msg.Value), err.Error())) + acc.AddError(err) + <-sem } - for _, metric := range metrics { - k.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time()) - } - } - - if !k.doNotCommitMsgs { - // TODO(cam) this locking can be removed if this PR gets merged: - // https://github.com/wvanbergen/kafka/pull/84 - k.Lock() - k.Cluster.MarkOffset(msg, "") - k.Unlock() } } } } +func (k *Kafka) markOffset(msg *sarama.ConsumerMessage) { + if !k.doNotCommitMsgs { + k.cluster.MarkOffset(msg, "") + } +} + +func (k *Kafka) onMessage(acc telegraf.TrackingAccumulator, msg *sarama.ConsumerMessage) error { + if k.MaxMessageLen != 0 && len(msg.Value) > k.MaxMessageLen { + k.markOffset(msg) + return fmt.Errorf("Message longer than max_message_len (%d > %d)", + len(msg.Value), k.MaxMessageLen) + } + + metrics, err := k.parser.Parse(msg.Value) + if err != nil { + return err + } + + id := acc.AddTrackingMetricGroup(metrics) + k.messages[id] = msg + + return nil +} + +func (k *Kafka) onDelivery(track telegraf.DeliveryInfo) { + msg, ok := k.messages[track.ID()] + if !ok { + log.Printf("E! [inputs.kafka_consumer] Could not mark message delivered: %d", track.ID()) + } + + if track.Delivered() { + k.markOffset(msg) + } + delete(k.messages, track.ID()) +} + func (k *Kafka) Stop() { - k.Lock() - defer k.Unlock() - close(k.done) - if err := k.Cluster.Close(); err != nil { - k.acc.AddError(fmt.Errorf("Error closing consumer: %s\n", err.Error())) + k.cancel() + k.wg.Wait() + + if err := k.cluster.Close(); err != nil { + log.Printf("E! 
[inputs.kafka_consumer] Error closing consumer: %v", err) } } @@ -243,6 +290,8 @@ func (k *Kafka) Gather(acc telegraf.Accumulator) error { func init() { inputs.Add("kafka_consumer", func() telegraf.Input { - return &Kafka{} + return &Kafka{ + MaxUndeliveredMessages: defaultMaxUndeliveredMessages, + } }) } diff --git a/plugins/inputs/kafka_consumer/kafka_consumer_integration_test.go b/plugins/inputs/kafka_consumer/kafka_consumer_integration_test.go index a145a938a..23f9e0f92 100644 --- a/plugins/inputs/kafka_consumer/kafka_consumer_integration_test.go +++ b/plugins/inputs/kafka_consumer/kafka_consumer_integration_test.go @@ -38,7 +38,6 @@ func TestReadsMetricsFromKafka(t *testing.T) { ConsumerGroup: "telegraf_test_consumers", Topics: []string{testTopic}, Brokers: brokerPeers, - PointBuffer: 100000, Offset: "oldest", } p, _ := parsers.NewInfluxParser() diff --git a/plugins/inputs/kafka_consumer/kafka_consumer_test.go b/plugins/inputs/kafka_consumer/kafka_consumer_test.go index 18f7f80be..5bb7740a5 100644 --- a/plugins/inputs/kafka_consumer/kafka_consumer_test.go +++ b/plugins/inputs/kafka_consumer/kafka_consumer_test.go @@ -1,13 +1,14 @@ package kafka_consumer import ( + "context" "strings" "testing" + "github.com/Shopify/sarama" + "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/plugins/parsers" "github.com/influxdata/telegraf/testutil" - - "github.com/Shopify/sarama" "github.com/stretchr/testify/assert" ) @@ -18,31 +19,57 @@ const ( invalidMsg = "cpu_load_short,host=server01 1422568543702900257\n" ) -func newTestKafka() (*Kafka, chan *sarama.ConsumerMessage) { - in := make(chan *sarama.ConsumerMessage, 1000) - k := Kafka{ - ConsumerGroup: "test", - Topics: []string{"telegraf"}, - Brokers: []string{"localhost:9092"}, - Offset: "oldest", - in: in, - doNotCommitMsgs: true, - errs: make(chan error, 1000), - done: make(chan struct{}), +type TestConsumer struct { + errors chan error + messages chan *sarama.ConsumerMessage +} + +func (c *TestConsumer) 
Errors() <-chan error { + return c.errors +} + +func (c *TestConsumer) Messages() <-chan *sarama.ConsumerMessage { + return c.messages +} + +func (c *TestConsumer) MarkOffset(msg *sarama.ConsumerMessage, metadata string) { +} + +func (c *TestConsumer) Close() error { + return nil +} + +func (c *TestConsumer) Inject(msg *sarama.ConsumerMessage) { + c.messages <- msg +} + +func newTestKafka() (*Kafka, *TestConsumer) { + consumer := &TestConsumer{ + errors: make(chan error), + messages: make(chan *sarama.ConsumerMessage, 1000), } - return &k, in + k := Kafka{ + cluster: consumer, + ConsumerGroup: "test", + Topics: []string{"telegraf"}, + Brokers: []string{"localhost:9092"}, + Offset: "oldest", + MaxUndeliveredMessages: defaultMaxUndeliveredMessages, + doNotCommitMsgs: true, + messages: make(map[telegraf.TrackingID]*sarama.ConsumerMessage), + } + return &k, consumer } // Test that the parser parses kafka messages into points func TestRunParser(t *testing.T) { - k, in := newTestKafka() + k, consumer := newTestKafka() acc := testutil.Accumulator{} - k.acc = &acc - defer close(k.done) + ctx := context.Background() k.parser, _ = parsers.NewInfluxParser() - go k.receiver() - in <- saramaMsg(testMsg) + go k.receiver(ctx, &acc) + consumer.Inject(saramaMsg(testMsg)) acc.Wait(1) assert.Equal(t, acc.NFields(), 1) @@ -50,14 +77,13 @@ func TestRunParser(t *testing.T) { // Test that the parser ignores invalid messages func TestRunParserInvalidMsg(t *testing.T) { - k, in := newTestKafka() + k, consumer := newTestKafka() acc := testutil.Accumulator{} - k.acc = &acc - defer close(k.done) + ctx := context.Background() k.parser, _ = parsers.NewInfluxParser() - go k.receiver() - in <- saramaMsg(invalidMsg) + go k.receiver(ctx, &acc) + consumer.Inject(saramaMsg(invalidMsg)) acc.WaitError(1) assert.Equal(t, acc.NFields(), 0) @@ -66,15 +92,14 @@ func TestRunParserInvalidMsg(t *testing.T) { // Test that overlong messages are dropped func TestDropOverlongMsg(t *testing.T) { const 
maxMessageLen = 64 * 1024 - k, in := newTestKafka() + k, consumer := newTestKafka() k.MaxMessageLen = maxMessageLen acc := testutil.Accumulator{} - k.acc = &acc - defer close(k.done) + ctx := context.Background() overlongMsg := strings.Repeat("v", maxMessageLen+1) - go k.receiver() - in <- saramaMsg(overlongMsg) + go k.receiver(ctx, &acc) + consumer.Inject(saramaMsg(overlongMsg)) acc.WaitError(1) assert.Equal(t, acc.NFields(), 0) @@ -82,14 +107,13 @@ func TestDropOverlongMsg(t *testing.T) { // Test that the parser parses kafka messages into points func TestRunParserAndGather(t *testing.T) { - k, in := newTestKafka() + k, consumer := newTestKafka() acc := testutil.Accumulator{} - k.acc = &acc - defer close(k.done) + ctx := context.Background() k.parser, _ = parsers.NewInfluxParser() - go k.receiver() - in <- saramaMsg(testMsg) + go k.receiver(ctx, &acc) + consumer.Inject(saramaMsg(testMsg)) acc.Wait(1) acc.GatherError(k.Gather) @@ -101,14 +125,13 @@ func TestRunParserAndGather(t *testing.T) { // Test that the parser parses kafka messages into points func TestRunParserAndGatherGraphite(t *testing.T) { - k, in := newTestKafka() + k, consumer := newTestKafka() acc := testutil.Accumulator{} - k.acc = &acc - defer close(k.done) + ctx := context.Background() k.parser, _ = parsers.NewGraphiteParser("_", []string{}, nil) - go k.receiver() - in <- saramaMsg(testMsgGraphite) + go k.receiver(ctx, &acc) + consumer.Inject(saramaMsg(testMsgGraphite)) acc.Wait(1) acc.GatherError(k.Gather) @@ -120,17 +143,16 @@ func TestRunParserAndGatherGraphite(t *testing.T) { // Test that the parser parses kafka messages into points func TestRunParserAndGatherJSON(t *testing.T) { - k, in := newTestKafka() + k, consumer := newTestKafka() acc := testutil.Accumulator{} - k.acc = &acc - defer close(k.done) + ctx := context.Background() k.parser, _ = parsers.NewParser(&parsers.Config{ DataFormat: "json", MetricName: "kafka_json_test", }) - go k.receiver() - in <- saramaMsg(testMsgJSON) + go 
k.receiver(ctx, &acc) + consumer.Inject(saramaMsg(testMsgJSON)) acc.Wait(1) acc.GatherError(k.Gather) diff --git a/plugins/inputs/mqtt_consumer/README.md b/plugins/inputs/mqtt_consumer/README.md index 0ec668c40..da3ce43f5 100644 --- a/plugins/inputs/mqtt_consumer/README.md +++ b/plugins/inputs/mqtt_consumer/README.md @@ -1,14 +1,11 @@ # MQTT Consumer Input Plugin -The [MQTT](http://mqtt.org/) consumer plugin reads from -specified MQTT topics and adds messages to InfluxDB. -The plugin expects messages in the -[Telegraf Input Data Formats](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md). +The [MQTT][mqtt] consumer plugin reads from the specified MQTT topics +and creates metrics using one of the supported [input data formats][]. ### Configuration: ```toml -# Read metrics from MQTT topic(s) [[inputs.mqtt_consumer]] ## MQTT broker URLs to be used. The format should be scheme://host:port, ## schema can be tcp, ssl, or ws. @@ -26,6 +23,16 @@ The plugin expects messages in the ## Connection timeout for initial connection in seconds connection_timeout = "30s" + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. + ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. 
+ # max_undelivered_messages = 1000 + ## Topics to subscribe to topics = [ "telegraf/host01/cpu", @@ -62,3 +69,6 @@ The plugin expects messages in the - All measurements are tagged with the incoming topic, ie `topic=telegraf/host01/cpu` + +[mqtt]: https://mqtt.org +[input data formats]: /docs/DATA_FORMATS_INPUT.md diff --git a/plugins/inputs/mqtt_consumer/mqtt_consumer.go b/plugins/inputs/mqtt_consumer/mqtt_consumer.go index 6d1e2cf58..03c3696f0 100644 --- a/plugins/inputs/mqtt_consumer/mqtt_consumer.go +++ b/plugins/inputs/mqtt_consumer/mqtt_consumer.go @@ -1,25 +1,31 @@ package mqtt_consumer import ( + "context" "errors" "fmt" "log" "strings" "time" + "github.com/eclipse/paho.mqtt.golang" "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/internal/tls" "github.com/influxdata/telegraf/plugins/inputs" "github.com/influxdata/telegraf/plugins/parsers" - - "github.com/eclipse/paho.mqtt.golang" ) -// 30 Seconds is the default used by paho.mqtt.golang -var defaultConnectionTimeout = internal.Duration{Duration: 30 * time.Second} +var ( + // 30 Seconds is the default used by paho.mqtt.golang + defaultConnectionTimeout = internal.Duration{Duration: 30 * time.Second} + + defaultMaxUndeliveredMessages = 1000 +) type ConnectionState int +type empty struct{} +type semaphore chan empty const ( Disconnected ConnectionState = iota @@ -28,12 +34,13 @@ const ( ) type MQTTConsumer struct { - Servers []string - Topics []string - Username string - Password string - QoS int `toml:"qos"` - ConnectionTimeout internal.Duration `toml:"connection_timeout"` + Servers []string + Topics []string + Username string + Password string + QoS int `toml:"qos"` + ConnectionTimeout internal.Duration `toml:"connection_timeout"` + MaxUndeliveredMessages int `toml:"max_undelivered_messages"` parser parsers.Parser @@ -45,9 +52,14 @@ type MQTTConsumer struct { tls.ClientConfig client mqtt.Client - acc telegraf.Accumulator + acc 
telegraf.TrackingAccumulator state ConnectionState subscribed bool + sem semaphore + messages map[telegraf.TrackingID]bool + + ctx context.Context + cancel context.CancelFunc } var sampleConfig = ` @@ -67,6 +79,16 @@ var sampleConfig = ` ## Connection timeout for initial connection in seconds connection_timeout = "30s" + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. + ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 + ## Topics to subscribe to topics = [ "telegraf/host01/cpu", @@ -118,7 +140,6 @@ func (m *MQTTConsumer) Start(acc telegraf.Accumulator) error { return errors.New("persistent_session requires client_id") } - m.acc = acc if m.QoS > 2 || m.QoS < 0 { return fmt.Errorf("qos value must be 0, 1, or 2: %d", m.QoS) } @@ -127,6 +148,9 @@ func (m *MQTTConsumer) Start(acc telegraf.Accumulator) error { return fmt.Errorf("connection_timeout must be greater than 1s: %s", m.ConnectionTimeout.Duration) } + m.acc = acc.WithTracking(m.MaxUndeliveredMessages) + m.ctx, m.cancel = context.WithCancel(context.Background()) + opts, err := m.createOpts() if err != nil { return err @@ -146,8 +170,10 @@ func (m *MQTTConsumer) connect() error { return err } - log.Printf("I! [inputs.mqtt_consumer]: connected %v", m.Servers) + log.Printf("I! [inputs.mqtt_consumer] Connected %v", m.Servers) m.state = Connected + m.sem = make(semaphore, m.MaxUndeliveredMessages) + m.messages = make(map[telegraf.TrackingID]bool) // Only subscribe on first connection when using persistent sessions. 
On // subsequent connections the subscriptions should be stored in the @@ -172,38 +198,64 @@ func (m *MQTTConsumer) connect() error { func (m *MQTTConsumer) onConnectionLost(c mqtt.Client, err error) { m.acc.AddError(fmt.Errorf("connection lost: %v", err)) - log.Printf("D! [inputs.mqtt_consumer]: disconnected %v", m.Servers) + log.Printf("D! [inputs.mqtt_consumer] Disconnected %v", m.Servers) m.state = Disconnected return } func (m *MQTTConsumer) recvMessage(c mqtt.Client, msg mqtt.Message) { - topic := msg.Topic() + for { + select { + case track := <-m.acc.Delivered(): + _, ok := m.messages[track.ID()] + if !ok { + // Added by a previous connection + continue + } + <-m.sem + // No ack, MQTT does not support durable handling + delete(m.messages, track.ID()) + case m.sem <- empty{}: + err := m.onMessage(m.acc, msg) + if err != nil { + m.acc.AddError(err) + <-m.sem + } + return + } + } +} + +func (m *MQTTConsumer) onMessage(acc telegraf.TrackingAccumulator, msg mqtt.Message) error { metrics, err := m.parser.Parse(msg.Payload()) if err != nil { - m.acc.AddError(err) + return err } + topic := msg.Topic() for _, metric := range metrics { - tags := metric.Tags() - tags["topic"] = topic - m.acc.AddFields(metric.Name(), metric.Fields(), tags, metric.Time()) + metric.AddTag("topic", topic) } + + id := acc.AddTrackingMetricGroup(metrics) + m.messages[id] = true + return nil } func (m *MQTTConsumer) Stop() { if m.state == Connected { - log.Printf("D! [inputs.mqtt_consumer]: disconnecting %v", m.Servers) + log.Printf("D! [inputs.mqtt_consumer] Disconnecting %v", m.Servers) m.client.Disconnect(200) - log.Printf("D! [inputs.mqtt_consumer]: disconnected %v", m.Servers) + log.Printf("D! [inputs.mqtt_consumer] Disconnected %v", m.Servers) m.state = Disconnected } + m.cancel() } func (m *MQTTConsumer) Gather(acc telegraf.Accumulator) error { if m.state == Disconnected { m.state = Connecting - log.Printf("D! [inputs.mqtt_consumer]: connecting %v", m.Servers) + log.Printf("D! 
[inputs.mqtt_consumer] Connecting %v", m.Servers) m.connect() } @@ -246,7 +298,7 @@ func (m *MQTTConsumer) createOpts() (*mqtt.ClientOptions, error) { for _, server := range m.Servers { // Preserve support for host:port style servers; deprecated in Telegraf 1.4.4 if !strings.Contains(server, "://") { - log.Printf("W! [inputs.mqtt_consumer] server %q should be updated to use `scheme://host:port` format", server) + log.Printf("W! [inputs.mqtt_consumer] Server %q should be updated to use `scheme://host:port` format", server) if tlsCfg == nil { server = "tcp://" + server } else { @@ -267,8 +319,9 @@ func (m *MQTTConsumer) createOpts() (*mqtt.ClientOptions, error) { func init() { inputs.Add("mqtt_consumer", func() telegraf.Input { return &MQTTConsumer{ - ConnectionTimeout: defaultConnectionTimeout, - state: Disconnected, + ConnectionTimeout: defaultConnectionTimeout, + MaxUndeliveredMessages: defaultMaxUndeliveredMessages, + state: Disconnected, } }) } diff --git a/plugins/inputs/mqtt_consumer/mqtt_consumer_test.go b/plugins/inputs/mqtt_consumer/mqtt_consumer_test.go index c04bd18a7..4209963bb 100644 --- a/plugins/inputs/mqtt_consumer/mqtt_consumer_test.go +++ b/plugins/inputs/mqtt_consumer/mqtt_consumer_test.go @@ -3,12 +3,9 @@ package mqtt_consumer import ( "testing" - "github.com/influxdata/telegraf/plugins/parsers" - "github.com/influxdata/telegraf/testutil" - - "github.com/stretchr/testify/assert" - "github.com/eclipse/paho.mqtt.golang" + "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/assert" ) const ( @@ -71,47 +68,6 @@ func TestPersistentClientIDFail(t *testing.T) { assert.Error(t, err) } -func TestRunParser(t *testing.T) { - n := newTestMQTTConsumer() - acc := testutil.Accumulator{} - n.acc = &acc - n.parser, _ = parsers.NewInfluxParser() - - n.recvMessage(nil, mqttMsg(testMsg)) - - if a := acc.NFields(); a != 1 { - t.Errorf("got %v, expected %v", a, 1) - } -} - -// Test that the parser ignores invalid messages -func 
TestRunParserInvalidMsg(t *testing.T) { - n := newTestMQTTConsumer() - acc := testutil.Accumulator{} - n.acc = &acc - n.parser, _ = parsers.NewInfluxParser() - - n.recvMessage(nil, mqttMsg(invalidMsg)) - - if a := acc.NFields(); a != 0 { - t.Errorf("got %v, expected %v", a, 0) - } - assert.Len(t, acc.Errors, 1) -} - -// Test that the parser parses line format messages into metrics -func TestRunParserAndGather(t *testing.T) { - n := newTestMQTTConsumer() - acc := testutil.Accumulator{} - n.acc = &acc - n.parser, _ = parsers.NewInfluxParser() - - n.recvMessage(nil, mqttMsg(testMsg)) - - acc.AssertContainsFields(t, "cpu_load_short", - map[string]interface{}{"value": float64(23422)}) -} - func mqttMsg(val string) mqtt.Message { return &message{ topic: "telegraf/unit_test", diff --git a/plugins/inputs/nats_consumer/README.md b/plugins/inputs/nats_consumer/README.md index 18dd57f07..8a89d90c5 100644 --- a/plugins/inputs/nats_consumer/README.md +++ b/plugins/inputs/nats_consumer/README.md @@ -1,16 +1,14 @@ # NATS Consumer Input Plugin -The [NATS](http://www.nats.io/about/) consumer plugin reads from -specified NATS subjects and adds messages to InfluxDB. The plugin expects messages -in the [Telegraf Input Data Formats](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md). -A [Queue Group](http://www.nats.io/documentation/concepts/nats-queueing/) -is used when subscribing to subjects so multiple instances of telegraf can read -from a NATS cluster in parallel. +The [NATS][nats] consumer plugin reads from the specified NATS subjects and +creates metrics using one of the supported [input data formats][]. -## Configuration +A [Queue Group][queue group] is used when subscribing to subjects so multiple +instances of telegraf can read from a NATS cluster in parallel. 
+ +### Configuration: ```toml -# Read metrics from NATS subject(s) [[inputs.nats_consumer]] ## urls of NATS servers servers = ["nats://localhost:4222"] @@ -20,13 +18,29 @@ from a NATS cluster in parallel. subjects = ["telegraf"] ## name a queue group queue_group = "telegraf_consumers" - ## Maximum number of metrics to buffer between collection intervals - metric_buffer = 100000 - ## Data format to consume. + ## Sets the limits for pending msgs and bytes for each subscription + ## These shouldn't need to be adjusted except in very high throughput scenarios + # pending_message_limit = 65536 + # pending_bytes_limit = 67108864 + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. + ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 + + ## Data format to consume. 
## Each data format has its own unique set of configuration options, read ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md data_format = "influx" ``` + +[nats]: https://www.nats.io/about/ +[input data formats]: /docs/DATA_FORMATS_INPUT.md +[queue group]: https://www.nats.io/documentation/concepts/nats-queueing/ diff --git a/plugins/inputs/nats_consumer/nats_consumer.go b/plugins/inputs/nats_consumer/nats_consumer.go index dac80476d..4411d8c3e 100644 --- a/plugins/inputs/nats_consumer/nats_consumer.go +++ b/plugins/inputs/nats_consumer/nats_consumer.go @@ -1,6 +1,7 @@ package natsconsumer import ( + "context" "fmt" "log" "sync" @@ -11,6 +12,13 @@ import ( nats "github.com/nats-io/go-nats" ) +var ( + defaultMaxUndeliveredMessages = 1000 +) + +type empty struct{} +type semaphore chan empty + type natsError struct { conn *nats.Conn sub *nats.Subscription @@ -23,48 +31,58 @@ func (e natsError) Error() string { } type natsConsumer struct { - QueueGroup string - Subjects []string - Servers []string - Secure bool + QueueGroup string `toml:"queue_group"` + Subjects []string `toml:"subjects"` + Servers []string `toml:"servers"` + Secure bool `toml:"secure"` // Client pending limits: - PendingMessageLimit int - PendingBytesLimit int + PendingMessageLimit int `toml:"pending_message_limit"` + PendingBytesLimit int `toml:"pending_bytes_limit"` + + MaxUndeliveredMessages int `toml:"max_undelivered_messages"` // Legacy metric buffer support; deprecated in v0.10.3 MetricBuffer int + conn *nats.Conn + subs []*nats.Subscription + parser parsers.Parser - - sync.Mutex - wg sync.WaitGroup - Conn *nats.Conn - Subs []*nats.Subscription - // channel for all incoming NATS messages in chan *nats.Msg // channel for all NATS read errors - errs chan error - done chan struct{} - acc telegraf.Accumulator + errs chan error + acc telegraf.TrackingAccumulator + wg sync.WaitGroup + cancel context.CancelFunc } var sampleConfig = ` ## urls of 
NATS servers - # servers = ["nats://localhost:4222"] + servers = ["nats://localhost:4222"] ## Use Transport Layer Security - # secure = false + secure = false ## subject(s) to consume - # subjects = ["telegraf"] + subjects = ["telegraf"] ## name a queue group - # queue_group = "telegraf_consumers" + queue_group = "telegraf_consumers" ## Sets the limits for pending msgs and bytes for each subscription ## These shouldn't need to be adjusted except in very high throughput scenarios # pending_message_limit = 65536 # pending_bytes_limit = 67108864 + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. + ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 + ## Data format to consume. ## Each data format has its own unique set of configuration options, read ## more about them here: @@ -94,10 +112,7 @@ func (n *natsConsumer) natsErrHandler(c *nats.Conn, s *nats.Subscription, e erro // Start the nats consumer. Caller must call *natsConsumer.Stop() to clean up. 
func (n *natsConsumer) Start(acc telegraf.Accumulator) error { - n.Lock() - defer n.Unlock() - - n.acc = acc + n.acc = acc.WithTracking(n.MaxUndeliveredMessages) var connectErr error @@ -112,89 +127,106 @@ func (n *natsConsumer) Start(acc telegraf.Accumulator) error { opts.Secure = n.Secure - if n.Conn == nil || n.Conn.IsClosed() { - n.Conn, connectErr = opts.Connect() + if n.conn == nil || n.conn.IsClosed() { + n.conn, connectErr = opts.Connect() if connectErr != nil { return connectErr } // Setup message and error channels n.errs = make(chan error) - n.Conn.SetErrorHandler(n.natsErrHandler) + n.conn.SetErrorHandler(n.natsErrHandler) n.in = make(chan *nats.Msg, 1000) for _, subj := range n.Subjects { - sub, err := n.Conn.QueueSubscribe(subj, n.QueueGroup, func(m *nats.Msg) { + sub, err := n.conn.QueueSubscribe(subj, n.QueueGroup, func(m *nats.Msg) { n.in <- m }) if err != nil { return err } // ensure that the subscription has been processed by the server - if err = n.Conn.Flush(); err != nil { + if err = n.conn.Flush(); err != nil { return err } // set the subscription pending limits if err = sub.SetPendingLimits(n.PendingMessageLimit, n.PendingBytesLimit); err != nil { return err } - n.Subs = append(n.Subs, sub) + n.subs = append(n.subs, sub) } } - n.done = make(chan struct{}) + ctx, cancel := context.WithCancel(context.Background()) + n.cancel = cancel // Start the message reader n.wg.Add(1) - go n.receiver() + go func() { + defer n.wg.Done() + go n.receiver(ctx) + }() + log.Printf("I! Started the NATS consumer service, nats: %v, subjects: %v, queue: %v\n", - n.Conn.ConnectedUrl(), n.Subjects, n.QueueGroup) + n.conn.ConnectedUrl(), n.Subjects, n.QueueGroup) return nil } // receiver() reads all incoming messages from NATS, and parses them into // telegraf metrics. 
-func (n *natsConsumer) receiver() { - defer n.wg.Done() +func (n *natsConsumer) receiver(ctx context.Context) { + sem := make(semaphore, n.MaxUndeliveredMessages) + for { select { - case <-n.done: + case <-ctx.Done(): return + case <-n.acc.Delivered(): + <-sem case err := <-n.errs: - n.acc.AddError(fmt.Errorf("E! error reading from %s\n", err.Error())) - case msg := <-n.in: - metrics, err := n.parser.Parse(msg.Data) - if err != nil { - n.acc.AddError(fmt.Errorf("E! subject: %s, error: %s", msg.Subject, err.Error())) - } + n.acc.AddError(err) + case sem <- empty{}: + select { + case <-ctx.Done(): + return + case err := <-n.errs: + <-sem + n.acc.AddError(err) + case <-n.acc.Delivered(): + <-sem + <-sem + case msg := <-n.in: + metrics, err := n.parser.Parse(msg.Data) + if err != nil { + n.acc.AddError(fmt.Errorf("subject: %s, error: %s", msg.Subject, err.Error())) + <-sem + continue + } - for _, metric := range metrics { - n.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time()) + n.acc.AddTrackingMetricGroup(metrics) } } } } func (n *natsConsumer) clean() { - for _, sub := range n.Subs { + for _, sub := range n.subs { if err := sub.Unsubscribe(); err != nil { - n.acc.AddError(fmt.Errorf("E! 
Error unsubscribing from subject %s in queue %s: %s\n", + n.acc.AddError(fmt.Errorf("Error unsubscribing from subject %s in queue %s: %s\n", sub.Subject, sub.Queue, err.Error())) } } - if n.Conn != nil && !n.Conn.IsClosed() { - n.Conn.Close() + if n.conn != nil && !n.conn.IsClosed() { + n.conn.Close() } } func (n *natsConsumer) Stop() { - n.Lock() - close(n.done) + n.cancel() n.wg.Wait() n.clean() - n.Unlock() } func (n *natsConsumer) Gather(acc telegraf.Accumulator) error { @@ -204,12 +236,13 @@ func (n *natsConsumer) Gather(acc telegraf.Accumulator) error { func init() { inputs.Add("nats_consumer", func() telegraf.Input { return &natsConsumer{ - Servers: []string{"nats://localhost:4222"}, - Secure: false, - Subjects: []string{"telegraf"}, - QueueGroup: "telegraf_consumers", - PendingBytesLimit: nats.DefaultSubPendingBytesLimit, - PendingMessageLimit: nats.DefaultSubPendingMsgsLimit, + Servers: []string{"nats://localhost:4222"}, + Secure: false, + Subjects: []string{"telegraf"}, + QueueGroup: "telegraf_consumers", + PendingBytesLimit: nats.DefaultSubPendingBytesLimit, + PendingMessageLimit: nats.DefaultSubPendingMsgsLimit, + MaxUndeliveredMessages: defaultMaxUndeliveredMessages, } }) } diff --git a/plugins/inputs/nats_consumer/nats_consumer_test.go b/plugins/inputs/nats_consumer/nats_consumer_test.go deleted file mode 100644 index a1f499554..000000000 --- a/plugins/inputs/nats_consumer/nats_consumer_test.go +++ /dev/null @@ -1,134 +0,0 @@ -package natsconsumer - -import ( - "testing" - - "github.com/influxdata/telegraf/plugins/parsers" - "github.com/influxdata/telegraf/testutil" - nats "github.com/nats-io/go-nats" - "github.com/stretchr/testify/assert" -) - -const ( - testMsg = "cpu_load_short,host=server01 value=23422.0 1422568543702900257\n" - testMsgGraphite = "cpu.load.short.graphite 23422 1454780029" - testMsgJSON = "{\"a\": 5, \"b\": {\"c\": 6}}\n" - invalidMsg = "cpu_load_short,host=server01 1422568543702900257\n" - metricBuffer = 5 -) - -func 
newTestNatsConsumer() (*natsConsumer, chan *nats.Msg) { - in := make(chan *nats.Msg, metricBuffer) - n := &natsConsumer{ - QueueGroup: "test", - Subjects: []string{"telegraf"}, - Servers: []string{"nats://localhost:4222"}, - Secure: false, - in: in, - errs: make(chan error, metricBuffer), - done: make(chan struct{}), - } - return n, in -} - -// Test that the parser parses NATS messages into metrics -func TestRunParser(t *testing.T) { - n, in := newTestNatsConsumer() - acc := testutil.Accumulator{} - n.acc = &acc - defer close(n.done) - - n.parser, _ = parsers.NewInfluxParser() - n.wg.Add(1) - go n.receiver() - in <- natsMsg(testMsg) - - acc.Wait(1) -} - -// Test that the parser ignores invalid messages -func TestRunParserInvalidMsg(t *testing.T) { - n, in := newTestNatsConsumer() - acc := testutil.Accumulator{} - n.acc = &acc - defer close(n.done) - - n.parser, _ = parsers.NewInfluxParser() - n.wg.Add(1) - go n.receiver() - in <- natsMsg(invalidMsg) - - acc.WaitError(1) - assert.Contains(t, acc.Errors[0].Error(), "E! 
subject: telegraf, error: metric parse error") - assert.EqualValues(t, 0, acc.NMetrics()) -} - -// Test that the parser parses line format messages into metrics -func TestRunParserAndGather(t *testing.T) { - n, in := newTestNatsConsumer() - acc := testutil.Accumulator{} - n.acc = &acc - defer close(n.done) - - n.parser, _ = parsers.NewInfluxParser() - n.wg.Add(1) - go n.receiver() - in <- natsMsg(testMsg) - - n.Gather(&acc) - - acc.Wait(1) - acc.AssertContainsFields(t, "cpu_load_short", - map[string]interface{}{"value": float64(23422)}) -} - -// Test that the parser parses graphite format messages into metrics -func TestRunParserAndGatherGraphite(t *testing.T) { - n, in := newTestNatsConsumer() - acc := testutil.Accumulator{} - n.acc = &acc - defer close(n.done) - - n.parser, _ = parsers.NewGraphiteParser("_", []string{}, nil) - n.wg.Add(1) - go n.receiver() - in <- natsMsg(testMsgGraphite) - - n.Gather(&acc) - - acc.Wait(1) - acc.AssertContainsFields(t, "cpu_load_short_graphite", - map[string]interface{}{"value": float64(23422)}) -} - -// Test that the parser parses json format messages into metrics -func TestRunParserAndGatherJSON(t *testing.T) { - n, in := newTestNatsConsumer() - acc := testutil.Accumulator{} - n.acc = &acc - defer close(n.done) - - n.parser, _ = parsers.NewParser(&parsers.Config{ - DataFormat: "json", - MetricName: "nats_json_test", - }) - n.wg.Add(1) - go n.receiver() - in <- natsMsg(testMsgJSON) - - n.Gather(&acc) - - acc.Wait(1) - acc.AssertContainsFields(t, "nats_json_test", - map[string]interface{}{ - "a": float64(5), - "b_c": float64(6), - }) -} - -func natsMsg(val string) *nats.Msg { - return &nats.Msg{ - Subject: "telegraf", - Data: []byte(val), - } -} diff --git a/plugins/inputs/nsq_consumer/README.md b/plugins/inputs/nsq_consumer/README.md index 5ac156eec..0dae26e8c 100644 --- a/plugins/inputs/nsq_consumer/README.md +++ b/plugins/inputs/nsq_consumer/README.md @@ -1,9 +1,9 @@ # NSQ Consumer Input Plugin -The [NSQ](http://nsq.io/) 
consumer plugin polls a specified NSQD -topic and adds messages to InfluxDB. This plugin allows a message to be in any of the supported `data_format` types. +The [NSQ][nsq] consumer plugin reads from NSQD and creates metrics using one +of the supported [input data formats][]. -## Configuration +### Configuration: ```toml # Read metrics from NSQD topic(s) @@ -18,6 +18,16 @@ topic and adds messages to InfluxDB. This plugin allows a message to be in any o channel = "consumer" max_in_flight = 100 + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. + ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 + ## Data format to consume. ## Each data format has its own unique set of configuration options, read ## more about them here: @@ -25,5 +35,5 @@ topic and adds messages to InfluxDB. This plugin allows a message to be in any o data_format = "influx" ``` -## Testing -The `nsq_consumer_test` mocks out the interaction with `NSQD`. It requires no outside dependencies. 
+[nsq]: https://nsq.io +[input data formats]: /docs/DATA_FORMATS_INPUT.md diff --git a/plugins/inputs/nsq_consumer/nsq_consumer.go b/plugins/inputs/nsq_consumer/nsq_consumer.go index 0823b3ac9..de7572316 100644 --- a/plugins/inputs/nsq_consumer/nsq_consumer.go +++ b/plugins/inputs/nsq_consumer/nsq_consumer.go @@ -1,7 +1,9 @@ package nsq_consumer import ( - "fmt" + "context" + "log" + "sync" "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/plugins/inputs" @@ -9,17 +11,38 @@ import ( nsq "github.com/nsqio/go-nsq" ) +const ( + defaultMaxUndeliveredMessages = 1000 +) + +type empty struct{} +type semaphore chan empty + +type logger struct{} + +func (l *logger) Output(calldepth int, s string) error { + log.Println("D! [inputs.nsq_consumer] " + s) + return nil +} + //NSQConsumer represents the configuration of the plugin type NSQConsumer struct { - Server string - Nsqd []string - Nsqlookupd []string - Topic string - Channel string - MaxInFlight int - parser parsers.Parser - consumer *nsq.Consumer - acc telegraf.Accumulator + Server string `toml:"server"` + Nsqd []string `toml:"nsqd"` + Nsqlookupd []string `toml:"nsqlookupd"` + Topic string `toml:"topic"` + Channel string `toml:"channel"` + MaxInFlight int `toml:"max_in_flight"` + + MaxUndeliveredMessages int `toml:"max_undelivered_messages"` + + parser parsers.Parser + consumer *nsq.Consumer + + mu sync.Mutex + messages map[telegraf.TrackingID]*nsq.Message + wg sync.WaitGroup + cancel context.CancelFunc } var sampleConfig = ` @@ -33,6 +56,16 @@ var sampleConfig = ` channel = "consumer" max_in_flight = 100 + ## Maximum messages to read from the broker that have not been written by an + ## output. For best throughput set based on the number of metrics within + ## each message and the size of the output's metric_batch_size. 
+ ## + ## For example, if each message from the queue contains 10 metrics and the + ## output metric_batch_size is 1000, setting this to 100 will ensure that a + ## full batch is collected and the write is triggered immediately without + ## waiting until the next flush_interval. + # max_undelivered_messages = 1000 + ## Data format to consume. ## Each data format has its own unique set of configuration options, read ## more about them here: @@ -40,12 +73,6 @@ var sampleConfig = ` data_format = "influx" ` -func init() { - inputs.Add("nsq_consumer", func() telegraf.Input { - return &NSQConsumer{} - }) -} - // SetParser takes the data_format from the config and finds the right parser for that format func (n *NSQConsumer) SetParser(parser parsers.Parser) { n.parser = parser @@ -62,32 +89,88 @@ func (n *NSQConsumer) Description() string { } // Start pulls data from nsq -func (n *NSQConsumer) Start(acc telegraf.Accumulator) error { - n.acc = acc +func (n *NSQConsumer) Start(ac telegraf.Accumulator) error { + acc := ac.WithTracking(n.MaxUndeliveredMessages) + sem := make(semaphore, n.MaxUndeliveredMessages) + n.messages = make(map[telegraf.TrackingID]*nsq.Message, n.MaxUndeliveredMessages) + + ctx, cancel := context.WithCancel(context.Background()) + n.cancel = cancel + n.connect() - n.consumer.AddConcurrentHandlers(nsq.HandlerFunc(func(message *nsq.Message) error { + n.consumer.SetLogger(&logger{}, nsq.LogLevelInfo) + n.consumer.AddHandler(nsq.HandlerFunc(func(message *nsq.Message) error { metrics, err := n.parser.Parse(message.Body) if err != nil { - acc.AddError(fmt.Errorf("E! 
NSQConsumer Parse Error\nmessage:%s\nerror:%s", string(message.Body), err.Error())) + acc.AddError(err) + // Remove the message from the queue + message.Finish() return nil } - for _, metric := range metrics { - n.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time()) + if len(metrics) == 0 { + message.Finish() + return nil } - message.Finish() + + select { + case <-ctx.Done(): + return ctx.Err() + case sem <- empty{}: + break + } + + n.mu.Lock() + id := acc.AddTrackingMetricGroup(metrics) + n.messages[id] = message + n.mu.Unlock() + message.DisableAutoResponse() return nil - }), n.MaxInFlight) + })) if len(n.Nsqlookupd) > 0 { n.consumer.ConnectToNSQLookupds(n.Nsqlookupd) } n.consumer.ConnectToNSQDs(append(n.Nsqd, n.Server)) + + n.wg.Add(1) + go func() { + defer n.wg.Done() + n.onDelivery(ctx, acc, sem) + }() return nil } +func (n *NSQConsumer) onDelivery(ctx context.Context, acc telegraf.TrackingAccumulator, sem semaphore) { + for { + select { + case <-ctx.Done(): + return + case info := <-acc.Delivered(): + n.mu.Lock() + msg, ok := n.messages[info.ID()] + if !ok { + n.mu.Unlock() + continue + } + <-sem + delete(n.messages, info.ID()) + n.mu.Unlock() + + if info.Delivered() { + msg.Finish() + } else { + msg.Requeue(-1) + } + } + } +} + // Stop processing messages func (n *NSQConsumer) Stop() { + n.cancel() + n.wg.Wait() n.consumer.Stop() + <-n.consumer.StopChan } // Gather is a noop @@ -107,3 +190,11 @@ func (n *NSQConsumer) connect() error { } return nil } + +func init() { + inputs.Add("nsq_consumer", func() telegraf.Input { + return &NSQConsumer{ + MaxUndeliveredMessages: defaultMaxUndeliveredMessages, + } + }) +} diff --git a/plugins/inputs/nsq_consumer/nsq_consumer_test.go b/plugins/inputs/nsq_consumer/nsq_consumer_test.go index a8e743c12..8376f7bb1 100644 --- a/plugins/inputs/nsq_consumer/nsq_consumer_test.go +++ b/plugins/inputs/nsq_consumer/nsq_consumer_test.go @@ -36,11 +36,12 @@ func TestReadsMetricsFromNSQ(t *testing.T) { 
newMockNSQD(script, addr.String()) consumer := &NSQConsumer{ - Server: "127.0.0.1:4155", - Topic: "telegraf", - Channel: "consume", - MaxInFlight: 1, - Nsqd: []string{"127.0.0.1:4155"}, + Server: "127.0.0.1:4155", + Topic: "telegraf", + Channel: "consume", + MaxInFlight: 1, + MaxUndeliveredMessages: defaultMaxUndeliveredMessages, + Nsqd: []string{"127.0.0.1:4155"}, } p, _ := parsers.NewInfluxParser() diff --git a/plugins/inputs/socket_listener/socket_listener.go b/plugins/inputs/socket_listener/socket_listener.go index 73c321f81..c83f3eb68 100644 --- a/plugins/inputs/socket_listener/socket_listener.go +++ b/plugins/inputs/socket_listener/socket_listener.go @@ -2,6 +2,7 @@ package socket_listener import ( "bufio" + "crypto/tls" "fmt" "io" "log" @@ -9,11 +10,8 @@ import ( "os" "strings" "sync" - "time" - "crypto/tls" - "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/internal" tlsint "github.com/influxdata/telegraf/internal/tls" @@ -120,7 +118,7 @@ func (ssl *streamSocketListener) read(c net.Conn) { continue } for _, m := range metrics { - ssl.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time()) + ssl.AddMetric(m) } } @@ -156,7 +154,7 @@ func (psl *packetSocketListener) listen() { continue } for _, m := range metrics { - psl.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time()) + psl.AddMetric(m) } } } diff --git a/plugins/outputs/discard/discard.go b/plugins/outputs/discard/discard.go index 4a6d634b7..919f74b47 100644 --- a/plugins/outputs/discard/discard.go +++ b/plugins/outputs/discard/discard.go @@ -7,11 +7,13 @@ import ( type Discard struct{} -func (d *Discard) Connect() error { return nil } -func (d *Discard) Close() error { return nil } -func (d *Discard) SampleConfig() string { return "" } -func (d *Discard) Description() string { return "Send metrics to nowhere at all" } -func (d *Discard) Write(metrics []telegraf.Metric) error { return nil } +func (d *Discard) Connect() error { return nil } +func (d *Discard) Close() error { return nil } +func 
(d *Discard) SampleConfig() string { return "" } +func (d *Discard) Description() string { return "Send metrics to nowhere at all" } +func (d *Discard) Write(metrics []telegraf.Metric) error { + return nil +} func init() { outputs.Add("discard", func() telegraf.Output { return &Discard{} }) diff --git a/plugins/outputs/prometheus_client/prometheus_client.go b/plugins/outputs/prometheus_client/prometheus_client.go index 1b8e06a49..0192d935f 100644 --- a/plugins/outputs/prometheus_client/prometheus_client.go +++ b/plugins/outputs/prometheus_client/prometheus_client.go @@ -144,7 +144,7 @@ func (p *PrometheusClient) auth(h http.Handler) http.Handler { }) } -func (p *PrometheusClient) Start() error { +func (p *PrometheusClient) Connect() error { defaultCollectors := map[string]bool{ "gocollector": true, "process": true, @@ -200,15 +200,6 @@ func (p *PrometheusClient) Start() error { return nil } -func (p *PrometheusClient) Stop() { - // plugin gets cleaned up in Close() already. -} - -func (p *PrometheusClient) Connect() error { - // This service output does not need to make any further connections - return nil -} - func (p *PrometheusClient) Close() error { ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) defer cancel() diff --git a/plugins/outputs/prometheus_client/prometheus_client_test.go b/plugins/outputs/prometheus_client/prometheus_client_test.go index bd2398a23..b6bbe35fd 100644 --- a/plugins/outputs/prometheus_client/prometheus_client_test.go +++ b/plugins/outputs/prometheus_client/prometheus_client_test.go @@ -600,7 +600,7 @@ func TestPrometheusWritePointEmptyTag(t *testing.T) { pClient, p, err := setupPrometheus() require.NoError(t, err) - defer pClient.Stop() + defer pClient.Close() now := time.Now() tags := make(map[string]string) @@ -675,7 +675,7 @@ func setupPrometheus() (*PrometheusClient, *prometheus_input.Prometheus, error) pTesting = NewClient() pTesting.Listen = "localhost:9127" pTesting.Path = "/metrics" - err := 
pTesting.Start() + err := pTesting.Connect() if err != nil { return nil, nil, err } diff --git a/plugins/processors/topk/topk.go b/plugins/processors/topk/topk.go index 36283482b..df5d542e3 100644 --- a/plugins/processors/topk/topk.go +++ b/plugins/processors/topk/topk.go @@ -10,6 +10,7 @@ import ( "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/filter" "github.com/influxdata/telegraf/internal" + "github.com/influxdata/telegraf/metric" "github.com/influxdata/telegraf/plugins/processors" ) @@ -76,12 +77,12 @@ var sampleConfig = ` ## tags. If this setting is different than "" the plugin will add a ## tag (which name will be the value of this setting) to each metric with ## the value of the calculated GroupBy tag. Useful for debugging - # add_groupby_tag = "" + # add_groupby_tag = "" ## These settings provide a way to know the position of each metric in ## the top k. The 'add_rank_field' setting allows to specify for which ## fields the position is required. If the list is non empty, then a field - ## will be added to each and every metric for each string present in this + ## will be added to each and every metric for each string present in this ## setting. This field will contain the ranking of the group that ## the metric belonged to when aggregated over that field. ## The name of the field will be set to the name of the aggregation field, @@ -208,6 +209,11 @@ func (t *TopK) Apply(in ...telegraf.Metric) []telegraf.Metric { // Add the metrics received to our internal cache for _, m := range in { + // When tracking metrics this plugin could deadlock the input by + // holding undelivered metrics while the input waits for metrics to be + // delivered. Instead, treat all handled metrics as delivered and + // produced metrics as untracked in a similar way to aggregators. 
+ m.Drop() // Check if the metric has any of the fields over which we are aggregating hasField := false @@ -281,7 +287,6 @@ func (t *TopK) push() []telegraf.Metric { // Create a one dimensional list with the top K metrics of each key for i, ag := range aggregations[0:min(t.K, len(aggregations))] { - // Check whether of not we need to add fields of tags to the selected metrics if len(t.aggFieldSet) != 0 || len(t.rankFieldSet) != 0 || groupTag != "" { for _, m := range t.cache[ag.groupbykey] { @@ -311,7 +316,16 @@ func (t *TopK) push() []telegraf.Metric { t.Reset() - return ret + result := make([]telegraf.Metric, 0, len(ret)) + for _, m := range ret { + dup, err := metric.New(m.Name(), m.Tags(), m.Fields(), m.Time(), m.Type()) + if err != nil { + continue + } + result = append(result, dup) + } + + return result } // Function that generates the aggregation functions diff --git a/plugins/processors/topk/topk_test.go b/plugins/processors/topk/topk_test.go index 67d80cbf9..ff0eb4d8b 100644 --- a/plugins/processors/topk/topk_test.go +++ b/plugins/processors/topk/topk_test.go @@ -1,12 +1,12 @@ package topk import ( - "reflect" "testing" "time" "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/internal" + "github.com/influxdata/telegraf/testutil" ) // Key, value pair that represents a telegraf.Metric Field @@ -95,7 +95,7 @@ func deepCopy(a []telegraf.Metric) []telegraf.Metric { func belongs(m telegraf.Metric, ms []telegraf.Metric) bool { for _, i := range ms { - if reflect.DeepEqual(i, m) { + if testutil.MetricEqual(i, m) { return true } } diff --git a/processor.go b/processor.go index f2b5133a5..e084adab7 100644 --- a/processor.go +++ b/processor.go @@ -7,6 +7,6 @@ type Processor interface { // Description returns a one-sentence description on the Input Description() string - // Apply the filter to the given metric + // Apply the filter to the given metric.
Apply(in ...Metric) []Metric } diff --git a/testutil/accumulator.go b/testutil/accumulator.go index d4a4bebd8..c13f02ab3 100644 --- a/testutil/accumulator.go +++ b/testutil/accumulator.go @@ -14,6 +14,15 @@ import ( "github.com/stretchr/testify/assert" ) +var ( + lastID uint64 +) + +func newTrackingID() telegraf.TrackingID { + id := atomic.AddUint64(&lastID, 1) + return telegraf.TrackingID(id) +} + // Metric defines a single point measurement type Metric struct { Measurement string @@ -23,7 +32,7 @@ type Metric struct { } func (p *Metric) String() string { - return fmt.Sprintf("%s %v", p.Measurement, p.Fields) + return fmt.Sprintf("%s %v %v", p.Measurement, p.Tags, p.Fields) } // Accumulator defines a mocked out accumulator @@ -31,11 +40,12 @@ type Accumulator struct { sync.Mutex *sync.Cond - Metrics []*Metric - nMetrics uint64 - Discard bool - Errors []error - debug bool + Metrics []*Metric + nMetrics uint64 + Discard bool + Errors []error + debug bool + delivered chan telegraf.DeliveryInfo } func (a *Accumulator) NMetrics() uint64 { @@ -154,6 +164,33 @@ func (a *Accumulator) AddHistogram( a.AddFields(measurement, fields, tags, timestamp...) } +func (a *Accumulator) AddMetric(m telegraf.Metric) { + a.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time()) +} + +func (a *Accumulator) WithTracking(maxTracked int) telegraf.TrackingAccumulator { + return a +} + +func (a *Accumulator) AddTrackingMetric(m telegraf.Metric) telegraf.TrackingID { + a.AddMetric(m) + return newTrackingID() +} + +func (a *Accumulator) AddTrackingMetricGroup(group []telegraf.Metric) telegraf.TrackingID { + for _, m := range group { + a.AddMetric(m) + } + return newTrackingID() +} + +func (a *Accumulator) Delivered() <-chan telegraf.DeliveryInfo { + if a.delivered == nil { + a.delivered = make(chan telegraf.DeliveryInfo) + } + return a.delivered +} + // AddError appends the given error to Accumulator.Errors.
func (a *Accumulator) AddError(err error) { if err == nil { diff --git a/testutil/metric.go b/testutil/metric.go index 56debd093..6d0db4e17 100644 --- a/testutil/metric.go +++ b/testutil/metric.go @@ -41,6 +41,18 @@ func newMetricDiff(metric telegraf.Metric) *metricDiff { return m } +func MetricEqual(expected, actual telegraf.Metric) bool { + var lhs, rhs *metricDiff + if expected != nil { + lhs = newMetricDiff(expected) + } + if actual != nil { + rhs = newMetricDiff(actual) + } + + return cmp.Equal(lhs, rhs) +} + func RequireMetricEqual(t *testing.T, expected, actual telegraf.Metric) { t.Helper() @@ -60,11 +72,11 @@ func RequireMetricEqual(t *testing.T, expected, actual telegraf.Metric) { func RequireMetricsEqual(t *testing.T, expected, actual []telegraf.Metric) { t.Helper() - lhs := make([]*metricDiff, len(expected)) + lhs := make([]*metricDiff, 0, len(expected)) for _, m := range expected { lhs = append(lhs, newMetricDiff(m)) } - rhs := make([]*metricDiff, len(actual)) + rhs := make([]*metricDiff, 0, len(actual)) for _, m := range actual { rhs = append(rhs, newMetricDiff(m)) }