Remove outputs blocking inputs when output is slow (#4938)

Daniel Nelson 2018-11-05 13:34:28 -08:00 committed by GitHub
parent 74667cd681
commit 6e5c2f8bb6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
59 changed files with 3615 additions and 2189 deletions

View File

@ -1,489 +1,52 @@
## Steps for Contributing:
### Contributing
1. [Sign the CLA](http://influxdb.com/community/cla.html)
1. Make changes or write plugin (see below for details)
1. Add your plugin to one of: `plugins/{inputs,outputs,aggregators,processors}/all/all.go`
1. If your plugin requires a new Go package,
[add it](https://github.com/influxdata/telegraf/blob/master/CONTRIBUTING.md#adding-a-dependency)
1. Write a README for your plugin. If it's an input plugin, it should be structured
like the [input example here](https://github.com/influxdata/telegraf/blob/master/plugins/inputs/EXAMPLE_README.md).
Output plugins READMEs are less structured,
but any information you can provide on how the data will look is appreciated.
See the [OpenTSDB output](https://github.com/influxdata/telegraf/tree/master/plugins/outputs/opentsdb)
for a good example.
1. **Optional:** Help users of your plugin by including example queries for populating dashboards. Include these sample queries in the `README.md` for the plugin.
1. **Optional:** Write a [tickscript](https://docs.influxdata.com/kapacitor/v1.0/tick/syntax/) for your plugin and add it to [Kapacitor](https://github.com/influxdata/kapacitor/tree/master/examples/telegraf).
1. [Sign the CLA][cla].
1. Open a [new issue][] to discuss the changes you would like to make. This is
not strictly required but it may help reduce the amount of rework you need
to do later.
1. Make changes or write plugin using the guidelines in the following
documents:
- [Input Plugins][inputs]
- [Processor Plugins][processors]
- [Aggregator Plugins][aggregators]
- [Output Plugins][outputs]
1. Ensure you have added proper unit tests and documentation.
1. Open a new [pull request][].
## GoDoc
### GoDoc
Public interfaces for inputs, outputs, processors, aggregators, metrics,
and the accumulator can be found on the GoDoc
and the accumulator can be found in the GoDoc:
[![GoDoc](https://godoc.org/github.com/influxdata/telegraf?status.svg)](https://godoc.org/github.com/influxdata/telegraf)
## Sign the CLA
### Common development tasks
Before we can merge a pull request, you will need to sign the CLA,
which can be found [on our website](http://influxdb.com/community/cla.html)
## Adding a dependency
**Adding a dependency:**
Assuming you can already build the project, run these in the telegraf directory:
1. `dep ensure -vendor-only`
2. `dep ensure -add github.com/[dependency]/[new-package]`
## Input Plugins
This section is for developers who want to create new collection inputs.
Telegraf is entirely plugin driven. This interface allows operators to
pick and choose what is gathered and makes it easy for developers
to create new ways of generating metrics.
Plugin authorship is kept as simple as possible to encourage people to develop
and submit new inputs.
### Input Plugin Guidelines
* A plugin must conform to the [`telegraf.Input`](https://godoc.org/github.com/influxdata/telegraf#Input) interface.
* Input Plugins should call `inputs.Add` in their `init` function to register themselves.
See below for a quick example.
* Input Plugins must be added to the
`github.com/influxdata/telegraf/plugins/inputs/all/all.go` file.
* The `SampleConfig` function should return valid toml that describes how the
plugin can be configured. This is included in `telegraf config`. Please
consult the [SampleConfig](https://github.com/influxdata/telegraf/wiki/SampleConfig)
page for the latest style guidelines.
* The `Description` function should say in one line what this plugin does.
Let's say you've written a plugin that emits metrics about processes on the
current host.
### Input Plugin Example
```go
package simple
// simple.go
import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs"
)
type Simple struct {
Ok bool
}
func (s *Simple) Description() string {
return "a demo plugin"
}
func (s *Simple) SampleConfig() string {
return `
## Indicate if everything is fine
ok = true
`
}
func (s *Simple) Gather(acc telegraf.Accumulator) error {
if s.Ok {
acc.AddFields("state", map[string]interface{}{"value": "pretty good"}, nil)
} else {
acc.AddFields("state", map[string]interface{}{"value": "not great"}, nil)
}
return nil
}
func init() {
inputs.Add("simple", func() telegraf.Input { return &Simple{} })
}
```
### Input Plugin Development
* Run `make static` followed by `make plugin-[pluginName]` to spin up a docker dev environment
using docker-compose.
* ***[Optional]*** When developing a plugin, add a `dev` directory with a `docker-compose.yml` and `telegraf.conf`
as well as any other supporting files, where sensible.
## Adding Typed Metrics
In addition to the `AddFields` function, the accumulator also supports the
`AddGauge` and `AddCounter` functions. These functions are for adding _typed_
metrics. Metric types are ignored for the InfluxDB output, but can be used
for other outputs, such as [prometheus](https://prometheus.io/docs/concepts/metric_types/).
## Input Plugins Accepting Arbitrary Data Formats
Some input plugins (such as
[exec](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/exec))
accept arbitrary input data formats. An overview of these data formats can
be found
[here](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md).
In order to enable this, you must specify a `SetParser(parser parsers.Parser)`
function on the plugin object (see the exec plugin for an example), as well as
defining `parser` as a field of the object.
You can then utilize the parser internally in your plugin, parsing data as you
see fit. Telegraf's configuration layer will take care of instantiating and
creating the `Parser` object.
You should also add the following to your SampleConfig() return:
```toml
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
```
Below is the `Parser` interface.
```go
// Parser is an interface defining functions that a parser plugin must satisfy.
type Parser interface {
// Parse takes a byte buffer separated by newlines
// ie, `cpu.usage.idle 90\ncpu.usage.busy 10`
// and parses it into telegraf metrics
Parse(buf []byte) ([]telegraf.Metric, error)
// ParseLine takes a single string metric
// ie, "cpu.usage.idle 90"
// and parses it into a telegraf metric.
ParseLine(line string) (telegraf.Metric, error)
}
```
And you can view the code
[here](https://github.com/influxdata/telegraf/blob/master/plugins/parsers/registry.go).
## Service Input Plugins
This section is for developers who want to create new "service" collection
inputs. A service plugin differs from a regular plugin in that it operates
a background service while Telegraf is running. One example would be the `statsd`
plugin, which operates a statsd server.
Service Input Plugins are substantially more complicated than a regular plugin, as they
will require threads and locks to verify data integrity. Service Input Plugins should
be avoided unless there is no way to create their behavior with a regular plugin.
Their interface is quite similar to a regular plugin, with the addition of `Start()`
and `Stop()` methods.
### Service Plugin Guidelines
* Same as the `Plugin` guidelines, except that they must conform to the
[`telegraf.ServiceInput`](https://godoc.org/github.com/influxdata/telegraf#ServiceInput) interface.
## Output Plugins
This section is for developers who want to create a new output sink. Outputs
are created in a similar manner as collection plugins, and their interface has
similar constructs.
### Output Plugin Guidelines
* An output must conform to the [`telegraf.Output`](https://godoc.org/github.com/influxdata/telegraf#Output) interface.
* Outputs should call `outputs.Add` in their `init` function to register themselves.
See below for a quick example.
* To be available within Telegraf itself, plugins must add themselves to the
`github.com/influxdata/telegraf/plugins/outputs/all/all.go` file.
* The `SampleConfig` function should return valid toml that describes how the
plugin can be configured. This is included in `telegraf config`. Please
consult the [SampleConfig](https://github.com/influxdata/telegraf/wiki/SampleConfig)
page for the latest style guidelines.
* The `Description` function should say in one line what this output does.
### Output Example
```go
package simpleoutput
// simpleoutput.go
import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/outputs"
)
type Simple struct {
Ok bool
}
func (s *Simple) Description() string {
return "a demo output"
}
func (s *Simple) SampleConfig() string {
return `
ok = true
`
}
func (s *Simple) Connect() error {
// Make a connection to the URL here
return nil
}
func (s *Simple) Close() error {
// Close connection to the URL here
return nil
}
func (s *Simple) Write(metrics []telegraf.Metric) error {
for _, metric := range metrics {
// write `metric` to the output sink here
}
return nil
}
func init() {
outputs.Add("simpleoutput", func() telegraf.Output { return &Simple{} })
}
```
## Output Plugins Writing Arbitrary Data Formats
Some output plugins (such as
[file](https://github.com/influxdata/telegraf/tree/master/plugins/outputs/file))
can write arbitrary output data formats. An overview of these data formats can
be found
[here](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md).
In order to enable this, you must specify a
`SetSerializer(serializer serializers.Serializer)`
function on the plugin object (see the file plugin for an example), as well as
defining `serializer` as a field of the object.
You can then utilize the serializer internally in your plugin, serializing data
before it's written. Telegraf's configuration layer will take care of
instantiating and creating the `Serializer` object.
You should also add the following to your SampleConfig() return:
```toml
## Data format to output.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md
data_format = "influx"
```
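Putting this together, here is a rough sketch of how an output might hold and use
the serializer. The type and field names below are illustrative only and are not
taken from an existing plugin; the remaining output methods are the same as in the
example above and are omitted:

```go
package simpleoutput

import (
	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/plugins/serializers"
)

type SimpleSerializing struct {
	serializer serializers.Serializer
}

// SetSerializer is called by Telegraf's configuration layer with the
// serializer built from the user's `data_format` setting.
func (s *SimpleSerializing) SetSerializer(serializer serializers.Serializer) {
	s.serializer = serializer
}

func (s *SimpleSerializing) Write(metrics []telegraf.Metric) error {
	for _, metric := range metrics {
		octets, err := s.serializer.Serialize(metric)
		if err != nil {
			return err
		}
		// write `octets` to the output sink here
		_ = octets
	}
	return nil
}
```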
## Service Output Plugins
This section is for developers who want to create a new "service" output. A
service output differs from a regular output in that it operates a background service
while Telegraf is running. One example would be the `prometheus_client` output,
which operates an HTTP server.
Their interface is quite similar to a regular output, with the addition of `Start()`
and `Stop()` methods.
### Service Output Guidelines
* Same as the `Output` guidelines, except that they must conform to the
[`telegraf.ServiceOutput`](https://godoc.org/github.com/influxdata/telegraf#ServiceOutput) interface. A minimal sketch is shown below.
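The following compile-level sketch is not an actual plugin in the repository; the
plugin name is made up and the background server itself is left out. It only shows
where `Start` and `Stop` fit alongside the regular output methods:

```go
package simpleserviceoutput

import (
	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/plugins/outputs"
)

type SimpleService struct {
	// e.g. an HTTP server or listener would live here
}

func (s *SimpleService) Description() string  { return "a demo service output" }
func (s *SimpleService) SampleConfig() string { return "" }
func (s *SimpleService) Connect() error       { return nil }
func (s *SimpleService) Close() error         { return nil }

// Start launches the background service (for example, an HTTP server).
func (s *SimpleService) Start() error {
	return nil
}

// Stop shuts the background service down.
func (s *SimpleService) Stop() {
}

func (s *SimpleService) Write(metrics []telegraf.Metric) error {
	for _, metric := range metrics {
		// expose or forward `metric` from the running service here
		_ = metric
	}
	return nil
}

func init() {
	outputs.Add("simpleserviceoutput", func() telegraf.Output { return &SimpleService{} })
}
```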
## Processor Plugins
This section is for developers who want to create a new processor plugin.
### Processor Plugin Guidelines
* A processor must conform to the [`telegraf.Processor`](https://godoc.org/github.com/influxdata/telegraf#Processor) interface.
* Processors should call `processors.Add` in their `init` function to register themselves.
See below for a quick example.
* To be available within Telegraf itself, plugins must add themselves to the
`github.com/influxdata/telegraf/plugins/processors/all/all.go` file.
* The `SampleConfig` function should return valid toml that describes how the
processor can be configured. This is included in the output of `telegraf config`.
* The `Description` function should say in one line what this processor does.
### Processor Example
```go
package printer
// printer.go
import (
"fmt"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/processors"
)
type Printer struct {
}
var sampleConfig = `
`
func (p *Printer) SampleConfig() string {
return sampleConfig
}
func (p *Printer) Description() string {
return "Print all metrics that pass through this filter."
}
func (p *Printer) Apply(in ...telegraf.Metric) []telegraf.Metric {
for _, metric := range in {
fmt.Println(metric.String())
}
return in
}
func init() {
processors.Add("printer", func() telegraf.Processor {
return &Printer{}
})
}
```
## Aggregator Plugins
This section is for developers who want to create a new aggregator plugin.
### Aggregator Plugin Guidelines
* An aggregator must conform to the [`telegraf.Aggregator`](https://godoc.org/github.com/influxdata/telegraf#Aggregator) interface.
* Aggregators should call `aggregators.Add` in their `init` function to register themselves.
See below for a quick example.
* To be available within Telegraf itself, plugins must add themselves to the
`github.com/influxdata/telegraf/plugins/aggregators/all/all.go` file.
* The `SampleConfig` function should return valid toml that describes how the
aggregator can be configured. This is included in `telegraf config`.
* The `Description` function should say in one line what this aggregator does.
* The Aggregator plugin will need to keep caches of metrics that have passed
through it. This should be done using the builtin `HashID()` function of each
metric.
* When the `Reset()` function is called, all caches should be cleared.
### Aggregator Example
```go
package min
// min.go
import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/aggregators"
)
type Min struct {
// caches for metric fields, names, and tags
fieldCache map[uint64]map[string]float64
nameCache map[uint64]string
tagCache map[uint64]map[string]string
}
func NewMin() telegraf.Aggregator {
m := &Min{}
m.Reset()
return m
}
var sampleConfig = `
## period is the flush & clear interval of the aggregator.
period = "30s"
## If true drop_original will drop the original metrics and
## only send aggregates.
drop_original = false
`
func (m *Min) SampleConfig() string {
return sampleConfig
}
func (m *Min) Description() string {
return "Keep the aggregate min of each metric passing through."
}
func (m *Min) Add(in telegraf.Metric) {
id := in.HashID()
if _, ok := m.nameCache[id]; !ok {
// hit an uncached metric, create caches for first time:
m.nameCache[id] = in.Name()
m.tagCache[id] = in.Tags()
m.fieldCache[id] = make(map[string]float64)
for k, v := range in.Fields() {
if fv, ok := convert(v); ok {
m.fieldCache[id][k] = fv
}
}
} else {
for k, v := range in.Fields() {
if fv, ok := convert(v); ok {
if _, ok := m.fieldCache[id][k]; !ok {
// hit an uncached field of a cached metric
m.fieldCache[id][k] = fv
continue
}
if fv < m.fieldCache[id][k] {
// set new minimum
m.fieldCache[id][k] = fv
}
}
}
}
}
func (m *Min) Push(acc telegraf.Accumulator) {
for id := range m.nameCache {
fields := map[string]interface{}{}
for k, v := range m.fieldCache[id] {
fields[k+"_min"] = v
}
acc.AddFields(m.nameCache[id], fields, m.tagCache[id])
}
}
func (m *Min) Reset() {
m.fieldCache = make(map[uint64]map[string]float64)
m.nameCache = make(map[uint64]string)
m.tagCache = make(map[uint64]map[string]string)
}
func convert(in interface{}) (float64, bool) {
switch v := in.(type) {
case float64:
return v, true
case int64:
return float64(v), true
default:
return 0, false
}
}
func init() {
aggregators.Add("min", func() telegraf.Aggregator {
return NewMin()
})
}
```
## Unit Tests
**Unit Tests:**
Before opening a pull request you should run the linter checks and
the short tests.
### Execute linter
**Run static analysis:**
execute `make check`
```
make check
```
### Execute short tests
**Run short tests:**
execute `make test`
```
make test
```
### Execute integration tests
**Execute integration tests:**
Running the integration tests requires several docker containers to be
running. You can start the containers with:
@ -497,3 +60,12 @@ make test-all
```
Use `make docker-kill` to stop the containers.
[cla]: https://www.influxdata.com/legal/cla/
[new issue]: https://github.com/influxdata/telegraf/issues/new/choose
[pull request]: https://github.com/influxdata/telegraf/compare
[inputs]: /docs/INPUTS.md
[processors]: /docs/PROCESSORS.md
[aggregators]: /docs/AGGREGATORS.md
[outputs]: /docs/OUTPUTS.md

View File

@ -1,16 +1,14 @@
package telegraf
import "time"
import (
"time"
)
// Accumulator is an interface for "accumulating" metrics from plugin(s).
// The metrics are sent down a channel shared between all plugins.
// Accumulator allows adding metrics to the processing flow.
type Accumulator interface {
// AddFields adds a metric to the accumulator with the given measurement
// name, fields, and tags (and timestamp). If a timestamp is not provided,
// then the accumulator sets it to "now".
// Create a point with a value, decorating it with tags
// NOTE: tags is expected to be owned by the caller, don't mutate
// it after passing to Add.
AddFields(measurement string,
fields map[string]interface{},
tags map[string]string,
@ -40,7 +38,49 @@ type Accumulator interface {
tags map[string]string,
t ...time.Time)
// AddMetric adds a metric to the accumulator.
AddMetric(Metric)
// SetPrecision takes two time.Duration objects. If the first is non-zero,
// it sets that as the precision. Otherwise, it takes the second argument
// as the order of time that the metrics should be rounded to, with the
// maximum being 1s.
SetPrecision(precision, interval time.Duration)
// Report an error.
AddError(err error)
// Upgrade to a TrackingAccumulator with space for maxTracked
// metrics/batches.
WithTracking(maxTracked int) TrackingAccumulator
}
// TrackingID uniquely identifies a tracked metric group
type TrackingID uint64
// DeliveryInfo provides the results of a delivered metric group.
type DeliveryInfo interface {
// ID is the TrackingID
ID() TrackingID
// Delivered returns true if the metric was processed successfully.
Delivered() bool
}
// TrackingAccumulator is an Accumulator that provides a signal when the
// metric has been fully processed. Sending more metrics than the accumulator
// has been allocated for without reading status from the Delivered
// channel is an error.
type TrackingAccumulator interface {
Accumulator
// Add the Metric and arrange for tracking feedback after processing.
AddTrackingMetric(m Metric) TrackingID
// Add a group of Metrics and arrange for a signal when the group has been
// processed.
AddTrackingMetricGroup(group []Metric) TrackingID
// Delivered returns a channel that will contain the tracking results.
Delivered() <-chan DeliveryInfo
}

View File

@ -20,13 +20,13 @@ type MetricMaker interface {
type accumulator struct {
maker MetricMaker
metrics chan telegraf.Metric
metrics chan<- telegraf.Metric
precision time.Duration
}
func NewAccumulator(
maker MetricMaker,
metrics chan telegraf.Metric,
metrics chan<- telegraf.Metric,
) telegraf.Accumulator {
acc := accumulator{
maker: maker,
@ -42,7 +42,7 @@ func (ac *accumulator) AddFields(
tags map[string]string,
t ...time.Time,
) {
ac.addMetric(measurement, tags, fields, telegraf.Untyped, t...)
ac.addFields(measurement, tags, fields, telegraf.Untyped, t...)
}
func (ac *accumulator) AddGauge(
@ -51,7 +51,7 @@ func (ac *accumulator) AddGauge(
tags map[string]string,
t ...time.Time,
) {
ac.addMetric(measurement, tags, fields, telegraf.Gauge, t...)
ac.addFields(measurement, tags, fields, telegraf.Gauge, t...)
}
func (ac *accumulator) AddCounter(
@ -60,7 +60,7 @@ func (ac *accumulator) AddCounter(
tags map[string]string,
t ...time.Time,
) {
ac.addMetric(measurement, tags, fields, telegraf.Counter, t...)
ac.addFields(measurement, tags, fields, telegraf.Counter, t...)
}
func (ac *accumulator) AddSummary(
@ -69,7 +69,7 @@ func (ac *accumulator) AddSummary(
tags map[string]string,
t ...time.Time,
) {
ac.addMetric(measurement, tags, fields, telegraf.Summary, t...)
ac.addFields(measurement, tags, fields, telegraf.Summary, t...)
}
func (ac *accumulator) AddHistogram(
@ -78,10 +78,16 @@ func (ac *accumulator) AddHistogram(
tags map[string]string,
t ...time.Time,
) {
ac.addMetric(measurement, tags, fields, telegraf.Histogram, t...)
ac.addFields(measurement, tags, fields, telegraf.Histogram, t...)
}
func (ac *accumulator) addMetric(
func (ac *accumulator) AddMetric(m telegraf.Metric) {
if m := ac.maker.MakeMetric(m); m != nil {
ac.metrics <- m
}
}
func (ac *accumulator) addFields(
measurement string,
tags map[string]string,
fields map[string]interface{},
@ -104,13 +110,9 @@ func (ac *accumulator) AddError(err error) {
return
}
NErrors.Incr(1)
log.Printf("E! Error in plugin [%s]: %s", ac.maker.Name(), err)
log.Printf("E! [%s]: Error in plugin: %v", ac.maker.Name(), err)
}
// SetPrecision takes two time.Duration objects. If the first is non-zero,
// it sets that as the precision. Otherwise, it takes the second argument
// as the order of time that the metrics should be rounded to, with the
// maximum being 1s.
func (ac *accumulator) SetPrecision(precision, interval time.Duration) {
if precision > 0 {
ac.precision = precision
@ -128,7 +130,7 @@ func (ac *accumulator) SetPrecision(precision, interval time.Duration) {
}
}
func (ac accumulator) getTime(t []time.Time) time.Time {
func (ac *accumulator) getTime(t []time.Time) time.Time {
var timestamp time.Time
if len(t) > 0 {
timestamp = t[0]
@ -137,3 +139,43 @@ func (ac accumulator) getTime(t []time.Time) time.Time {
}
return timestamp.Round(ac.precision)
}
func (ac *accumulator) WithTracking(maxTracked int) telegraf.TrackingAccumulator {
return &trackingAccumulator{
Accumulator: ac,
delivered: make(chan telegraf.DeliveryInfo, maxTracked),
}
}
type trackingAccumulator struct {
telegraf.Accumulator
delivered chan telegraf.DeliveryInfo
}
func (a *trackingAccumulator) AddTrackingMetric(m telegraf.Metric) telegraf.TrackingID {
dm, id := metric.WithTracking(m, a.onDelivery)
a.AddMetric(dm)
return id
}
func (a *trackingAccumulator) AddTrackingMetricGroup(group []telegraf.Metric) telegraf.TrackingID {
db, id := metric.WithGroupTracking(group, a.onDelivery)
for _, m := range db {
a.AddMetric(m)
}
return id
}
func (a *trackingAccumulator) Delivered() <-chan telegraf.DeliveryInfo {
return a.delivered
}
func (a *trackingAccumulator) onDelivery(info telegraf.DeliveryInfo) {
select {
case a.delivered <- info:
default:
// This is a programming error in the input. More items were sent for
// tracking than space requested.
panic("channel is full")
}
}

View File

@ -1,9 +1,9 @@
package agent
import (
"context"
"fmt"
"log"
"os"
"runtime"
"sync"
"time"
@ -12,187 +12,157 @@ import (
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/internal/config"
"github.com/influxdata/telegraf/internal/models"
"github.com/influxdata/telegraf/selfstat"
"github.com/influxdata/telegraf/plugins/serializers/influx"
)
// Agent runs telegraf and collects data based on the given config
// Agent runs a set of plugins.
type Agent struct {
Config *config.Config
}
// NewAgent returns an Agent struct based off the given Config
// NewAgent returns an Agent for the given Config.
func NewAgent(config *config.Config) (*Agent, error) {
a := &Agent{
Config: config,
}
if !a.Config.Agent.OmitHostname {
if a.Config.Agent.Hostname == "" {
hostname, err := os.Hostname()
if err != nil {
return nil, err
}
a.Config.Agent.Hostname = hostname
}
config.Tags["host"] = a.Config.Agent.Hostname
}
return a, nil
}
// Connect connects to all configured outputs
func (a *Agent) Connect() error {
for _, o := range a.Config.Outputs {
switch ot := o.Output.(type) {
case telegraf.ServiceOutput:
if err := ot.Start(); err != nil {
log.Printf("E! Service for output %s failed to start, exiting\n%s\n",
o.Name, err.Error())
return err
}
// Run starts and runs the Agent until the context is done.
func (a *Agent) Run(ctx context.Context) error {
log.Printf("I! [agent] Config: Interval:%s, Quiet:%#v, Hostname:%#v, "+
"Flush Interval:%s",
a.Config.Agent.Interval.Duration, a.Config.Agent.Quiet,
a.Config.Agent.Hostname, a.Config.Agent.FlushInterval.Duration)
if ctx.Err() != nil {
return ctx.Err()
}
log.Printf("D! [agent] Connecting outputs")
err := a.connectOutputs(ctx)
if err != nil {
return err
}
inputC := make(chan telegraf.Metric, 100)
procC := make(chan telegraf.Metric, 100)
outputC := make(chan telegraf.Metric, 100)
startTime := time.Now()
log.Printf("D! [agent] Starting service inputs")
err = a.startServiceInputs(ctx, inputC)
if err != nil {
return err
}
var wg sync.WaitGroup
src := inputC
dst := inputC
wg.Add(1)
go func(dst chan telegraf.Metric) {
defer wg.Done()
err := a.runInputs(ctx, startTime, dst)
if err != nil {
log.Printf("E! [agent] Error running inputs: %v", err)
}
log.Printf("D! Attempting connection to output: %s\n", o.Name)
err := o.Output.Connect()
if err != nil {
log.Printf("E! Failed to connect to output %s, retrying in 15s, "+
"error was '%s' \n", o.Name, err)
time.Sleep(15 * time.Second)
err = o.Output.Connect()
log.Printf("D! [agent] Stopping service inputs")
a.stopServiceInputs()
close(dst)
log.Printf("D! [agent] Input channel closed")
}(dst)
src = dst
if len(a.Config.Processors) > 0 {
dst = procC
wg.Add(1)
go func(src, dst chan telegraf.Metric) {
defer wg.Done()
err := a.runProcessors(src, dst)
if err != nil {
return err
log.Printf("E! [agent] Error running processors: %v", err)
}
}
log.Printf("D! Successfully connected to output: %s\n", o.Name)
close(dst)
log.Printf("D! [agent] Processor channel closed")
}(src, dst)
src = dst
}
if len(a.Config.Aggregators) > 0 {
dst = outputC
wg.Add(1)
go func(src, dst chan telegraf.Metric) {
defer wg.Done()
err := a.runAggregators(startTime, src, dst)
if err != nil {
log.Printf("E! [agent] Error running aggregators: %v", err)
}
close(dst)
log.Printf("D! [agent] Output channel closed")
}(src, dst)
src = dst
}
wg.Add(1)
go func(src chan telegraf.Metric) {
defer wg.Done()
err := a.runOutputs(startTime, src)
if err != nil {
log.Printf("E! [agent] Error running outputs: %v", err)
}
}(src)
wg.Wait()
log.Printf("D! [agent] Closing outputs")
err = a.closeOutputs()
if err != nil {
return err
}
return nil
}
// Close closes the connection to all configured outputs
func (a *Agent) Close() error {
var err error
for _, o := range a.Config.Outputs {
err = o.Output.Close()
switch ot := o.Output.(type) {
case telegraf.ServiceOutput:
ot.Stop()
}
}
return err
}
func panicRecover(input *models.RunningInput) {
if err := recover(); err != nil {
trace := make([]byte, 2048)
runtime.Stack(trace, true)
log.Printf("E! FATAL: Input [%s] panicked: %s, Stack:\n%s\n",
input.Name(), err, trace)
log.Println("E! PLEASE REPORT THIS PANIC ON GITHUB with " +
"stack trace, configuration, and OS information: " +
"https://github.com/influxdata/telegraf/issues/new")
}
}
// gatherer runs the inputs that have been configured with their own
// reporting interval.
func (a *Agent) gatherer(
shutdown chan struct{},
input *models.RunningInput,
interval time.Duration,
metricC chan telegraf.Metric,
) {
defer panicRecover(input)
GatherTime := selfstat.RegisterTiming("gather",
"gather_time_ns",
map[string]string{"input": input.Config.Name},
)
acc := NewAccumulator(input, metricC)
acc.SetPrecision(a.Config.Agent.Precision.Duration,
a.Config.Agent.Interval.Duration)
ticker := time.NewTicker(interval)
defer ticker.Stop()
for {
internal.RandomSleep(a.Config.Agent.CollectionJitter.Duration, shutdown)
start := time.Now()
gatherWithTimeout(shutdown, input, acc, interval)
elapsed := time.Since(start)
GatherTime.Incr(elapsed.Nanoseconds())
select {
case <-shutdown:
return
case <-ticker.C:
continue
}
}
}
// gatherWithTimeout gathers from the given input, with the given timeout.
// when the given timeout is reached, gatherWithTimeout logs an error message
// but continues waiting for it to return. This is to avoid leaving behind
// hung processes, and to prevent re-calling the same hung process over and
// over.
func gatherWithTimeout(
shutdown chan struct{},
input *models.RunningInput,
acc telegraf.Accumulator,
timeout time.Duration,
) {
ticker := time.NewTicker(timeout)
defer ticker.Stop()
done := make(chan error)
go func() {
done <- input.Input.Gather(acc)
// Test runs the inputs once and prints the output to stdout in line protocol.
func (a *Agent) Test() error {
var wg sync.WaitGroup
metricC := make(chan telegraf.Metric)
defer func() {
close(metricC)
wg.Wait()
}()
for {
select {
case err := <-done:
if err != nil {
acc.AddError(err)
}
return
case <-ticker.C:
err := fmt.Errorf("took longer to collect than collection interval (%s)",
timeout)
acc.AddError(err)
continue
case <-shutdown:
return
}
}
}
// Test verifies that we can 'Gather' from all inputs with their configured
// Config struct
func (a *Agent) Test() error {
shutdown := make(chan struct{})
defer close(shutdown)
metricC := make(chan telegraf.Metric)
// dummy receiver for the point channel
wg.Add(1)
go func() {
for {
select {
case <-metricC:
// do nothing
case <-shutdown:
return
defer wg.Done()
s := influx.NewSerializer()
s.SetFieldSortOrder(influx.SortFields)
for metric := range metricC {
octets, err := s.Serialize(metric)
if err == nil {
fmt.Print("> ", string(octets))
}
}
}()
for _, input := range a.Config.Inputs {
if _, ok := input.Input.(telegraf.ServiceInput); ok {
fmt.Printf("\nWARNING: skipping plugin [[%s]]: service inputs not supported in --test mode\n",
log.Printf("W!: [agent] skipping plugin [[%s]]: service inputs not supported in --test mode",
input.Name())
continue
}
@ -200,7 +170,6 @@ func (a *Agent) Test() error {
acc := NewAccumulator(input, metricC)
acc.SetPrecision(a.Config.Agent.Precision.Duration,
a.Config.Agent.Interval.Duration)
input.SetTrace(true)
input.SetDefaultTags(a.Config.Tags)
if err := input.Input.Gather(acc); err != nil {
@ -218,216 +187,445 @@ func (a *Agent) Test() error {
}
}
return nil
}
// flush writes a list of metrics to all configured outputs
func (a *Agent) flush() {
var wg sync.WaitGroup
wg.Add(len(a.Config.Outputs))
for _, o := range a.Config.Outputs {
go func(output *models.RunningOutput) {
defer wg.Done()
err := output.Write()
if err != nil {
log.Printf("E! Error writing to output [%s]: %s\n",
output.Name, err.Error())
}
}(o)
}
wg.Wait()
}
// flusher monitors the metrics input channel and flushes on the minimum interval
func (a *Agent) flusher(
shutdown chan struct{},
metricC chan telegraf.Metric,
aggMetricC chan telegraf.Metric,
outMetricC chan telegraf.Metric,
// runInputs starts and triggers the periodic gather for Inputs.
//
// When the context is done the timers are stopped and this function returns
// after all ongoing Gather calls complete.
func (a *Agent) runInputs(
ctx context.Context,
startTime time.Time,
dst chan<- telegraf.Metric,
) error {
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
for {
select {
case <-shutdown:
if len(outMetricC) > 0 {
// keep going until channel is empty
continue
}
return
case metric := <-outMetricC:
for i, o := range a.Config.Outputs {
if i == len(a.Config.Outputs)-1 {
o.AddMetric(metric)
} else {
o.AddMetric(metric.Copy())
}
}
}
}
}()
wg.Add(1)
go func() {
defer wg.Done()
for metric := range aggMetricC {
// Apply Processors
metrics := []telegraf.Metric{metric}
for _, processor := range a.Config.Processors {
metrics = processor.Apply(metrics...)
}
outMetricC <- metric
}
}()
wg.Add(1)
go func() {
defer wg.Done()
for {
select {
case <-shutdown:
if len(metricC) > 0 {
// keep going until channel is empty
continue
}
close(aggMetricC)
return
case metric := <-metricC:
// Apply Processors
metrics := []telegraf.Metric{metric}
for _, processor := range a.Config.Processors {
metrics = processor.Apply(metrics...)
}
for _, metric := range metrics {
// Apply Aggregators
var dropOriginal bool
for _, agg := range a.Config.Aggregators {
if ok := agg.Add(metric.Copy()); ok {
dropOriginal = true
}
}
// Forward metric to Outputs
if !dropOriginal {
outMetricC <- metric
}
}
}
}
}()
ticker := time.NewTicker(a.Config.Agent.FlushInterval.Duration)
semaphore := make(chan struct{}, 1)
for {
select {
case <-shutdown:
log.Println("I! Hang on, flushing any cached metrics before shutdown")
// wait for outMetricC to get flushed before flushing outputs
wg.Wait()
a.flush()
return nil
case <-ticker.C:
go func() {
select {
case semaphore <- struct{}{}:
internal.RandomSleep(a.Config.Agent.FlushJitter.Duration, shutdown)
a.flush()
<-semaphore
default:
// skipping this flush because one is already happening
log.Println("W! Skipping a scheduled flush because there is" +
" already a flush ongoing.")
}
}()
}
}
}
// Run runs the agent daemon, gathering every Interval
func (a *Agent) Run(shutdown chan struct{}) error {
var wg sync.WaitGroup
log.Printf("I! Agent Config: Interval:%s, Quiet:%#v, Hostname:%#v, "+
"Flush Interval:%s \n",
a.Config.Agent.Interval.Duration, a.Config.Agent.Quiet,
a.Config.Agent.Hostname, a.Config.Agent.FlushInterval.Duration)
// Channel shared between all input threads for accumulating metrics
metricC := make(chan telegraf.Metric, 100)
// Channel for metrics ready to be output
outMetricC := make(chan telegraf.Metric, 100)
// Channel for aggregated metrics
aggMetricC := make(chan telegraf.Metric, 100)
// Round collection to nearest interval by sleeping
if a.Config.Agent.RoundInterval {
i := int64(a.Config.Agent.Interval.Duration)
time.Sleep(time.Duration(i - (time.Now().UnixNano() % i)))
}
wg.Add(1)
go func() {
defer wg.Done()
if err := a.flusher(shutdown, metricC, aggMetricC, outMetricC); err != nil {
log.Printf("E! Flusher routine failed, exiting: %s\n", err.Error())
close(shutdown)
}
}()
wg.Add(len(a.Config.Aggregators))
for _, aggregator := range a.Config.Aggregators {
go func(agg *models.RunningAggregator) {
defer wg.Done()
acc := NewAccumulator(agg, aggMetricC)
acc.SetPrecision(a.Config.Agent.Precision.Duration,
a.Config.Agent.Interval.Duration)
agg.Run(acc, shutdown)
}(aggregator)
}
// Service inputs may immediately add metrics, if metrics are added before
// the aggregator starts they will be dropped. Generally this occurs
// only during testing but it is an outstanding issue.
//
// https://github.com/influxdata/telegraf/issues/4394
for _, input := range a.Config.Inputs {
input.SetDefaultTags(a.Config.Tags)
switch p := input.Input.(type) {
case telegraf.ServiceInput:
acc := NewAccumulator(input, metricC)
// Service input plugins should set their own precision of their
// metrics.
acc.SetPrecision(time.Nanosecond, 0)
if err := p.Start(acc); err != nil {
log.Printf("E! Service for input %s failed to start, exiting\n%s\n",
input.Name(), err.Error())
return err
}
defer p.Stop()
}
}
wg.Add(len(a.Config.Inputs))
for _, input := range a.Config.Inputs {
interval := a.Config.Agent.Interval.Duration
// overwrite global interval if this plugin has it's own.
precision := a.Config.Agent.Precision.Duration
jitter := a.Config.Agent.CollectionJitter.Duration
// Overwrite agent interval if this plugin has its own.
if input.Config.Interval != 0 {
interval = input.Config.Interval
}
go func(in *models.RunningInput, interv time.Duration) {
acc := NewAccumulator(input, dst)
acc.SetPrecision(precision, interval)
wg.Add(1)
go func(input *models.RunningInput) {
defer wg.Done()
a.gatherer(shutdown, in, interv, metricC)
}(input, interval)
if a.Config.Agent.RoundInterval {
err := internal.SleepContext(
ctx, internal.AlignDuration(startTime, interval))
if err != nil {
return
}
}
a.gatherOnInterval(ctx, acc, input, interval, jitter)
}(input)
}
wg.Wait()
return nil
}
// gather runs an input's gather function periodically until the context is
// done.
func (a *Agent) gatherOnInterval(
ctx context.Context,
acc telegraf.Accumulator,
input *models.RunningInput,
interval time.Duration,
jitter time.Duration,
) {
defer panicRecover(input)
ticker := time.NewTicker(interval)
defer ticker.Stop()
for {
err := internal.SleepContext(ctx, internal.RandomDuration(jitter))
if err != nil {
return
}
err = a.gatherOnce(acc, input, interval)
if err != nil {
acc.AddError(err)
}
select {
case <-ticker.C:
continue
case <-ctx.Done():
return
}
}
}
// gatherOnce runs the input's Gather function once, logging a warning each
// interval it fails to complete before.
func (a *Agent) gatherOnce(
acc telegraf.Accumulator,
input *models.RunningInput,
timeout time.Duration,
) error {
ticker := time.NewTicker(timeout)
defer ticker.Stop()
done := make(chan error)
go func() {
done <- input.Gather(acc)
}()
for {
select {
case err := <-done:
return err
case <-ticker.C:
log.Printf("W! [agent] input %q did not complete within its interval",
input.Name())
}
}
}
// runProcessors applies processors to metrics.
func (a *Agent) runProcessors(
src <-chan telegraf.Metric,
agg chan<- telegraf.Metric,
) error {
for metric := range src {
metrics := a.applyProcessors(metric)
for _, metric := range metrics {
agg <- metric
}
}
return nil
}
// applyProcessors applies all processors to a metric.
func (a *Agent) applyProcessors(m telegraf.Metric) []telegraf.Metric {
metrics := []telegraf.Metric{m}
for _, processor := range a.Config.Processors {
metrics = processor.Apply(metrics...)
}
return metrics
}
// runAggregators triggers the periodic push for Aggregators.
//
// When the context is done a final push will occur and then this function
// will return.
func (a *Agent) runAggregators(
startTime time.Time,
src <-chan telegraf.Metric,
dst chan<- telegraf.Metric,
) error {
ctx, cancel := context.WithCancel(context.Background())
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
for metric := range src {
var dropOriginal bool
for _, agg := range a.Config.Aggregators {
if ok := agg.Add(metric); ok {
dropOriginal = true
}
}
if !dropOriginal {
dst <- metric
}
}
cancel()
}()
precision := a.Config.Agent.Precision.Duration
interval := a.Config.Agent.Interval.Duration
aggregations := make(chan telegraf.Metric, 100)
for _, agg := range a.Config.Aggregators {
wg.Add(1)
go func(agg *models.RunningAggregator) {
defer wg.Done()
if a.Config.Agent.RoundInterval {
// Aggregators are aligned to the agent interval regardless of
// their period.
err := internal.SleepContext(ctx, internal.AlignDuration(startTime, interval))
if err != nil {
return
}
}
agg.SetPeriodStart(startTime)
acc := NewAccumulator(agg, aggregations)
acc.SetPrecision(precision, interval)
a.push(ctx, agg, acc)
close(aggregations)
}(agg)
}
for metric := range aggregations {
metrics := a.applyProcessors(metric)
for _, metric := range metrics {
dst <- metric
}
}
wg.Wait()
a.Close()
return nil
}
// push runs the push for a single aggregator every period. More simple than
// the output/input version as timeout should be less likely.... not really
// because the output channel can block for now.
func (a *Agent) push(
ctx context.Context,
aggregator *models.RunningAggregator,
acc telegraf.Accumulator,
) {
ticker := time.NewTicker(aggregator.Period())
defer ticker.Stop()
for {
select {
case <-ticker.C:
break
case <-ctx.Done():
aggregator.Push(acc)
return
}
aggregator.Push(acc)
}
}
// runOutputs triggers the periodic write for Outputs.
//
// When the context is done, outputs continue to run until their buffer is
// closed, after which they run flush once more.
func (a *Agent) runOutputs(
startTime time.Time,
src <-chan telegraf.Metric,
) error {
interval := a.Config.Agent.FlushInterval.Duration
jitter := a.Config.Agent.FlushJitter.Duration
ctx, cancel := context.WithCancel(context.Background())
var wg sync.WaitGroup
for _, output := range a.Config.Outputs {
interval := interval
// Overwrite agent flush_interval if this plugin has its own.
if output.Config.FlushInterval != 0 {
interval = output.Config.FlushInterval
}
wg.Add(1)
go func(output *models.RunningOutput) {
defer wg.Done()
if a.Config.Agent.RoundInterval {
err := internal.SleepContext(
ctx, internal.AlignDuration(startTime, interval))
if err != nil {
return
}
}
a.flush(ctx, output, interval, jitter)
}(output)
}
for metric := range src {
for i, output := range a.Config.Outputs {
if i == len(a.Config.Outputs)-1 {
output.AddMetric(metric)
} else {
output.AddMetric(metric.Copy())
}
}
}
log.Println("I! [agent] Hang on, flushing any cached metrics before shutdown")
cancel()
wg.Wait()
return nil
}
// flush runs an output's flush function periodically until the context is
// done.
func (a *Agent) flush(
ctx context.Context,
output *models.RunningOutput,
interval time.Duration,
jitter time.Duration,
) {
// since we are watching two channels we need a ticker with the jitter
// integrated.
ticker := NewTicker(interval, jitter)
defer ticker.Stop()
logError := func(err error) {
if err != nil {
log.Printf("E! [agent] Error writing to output [%s]: %v", output.Name, err)
}
}
for {
// Favor shutdown over other methods.
select {
case <-ctx.Done():
logError(a.flushOnce(output, interval, output.Write))
return
default:
}
select {
case <-ticker.C:
logError(a.flushOnce(output, interval, output.Write))
case <-output.BatchReady:
// Favor the ticker over batch ready
select {
case <-ticker.C:
logError(a.flushOnce(output, interval, output.Write))
default:
logError(a.flushOnce(output, interval, output.WriteBatch))
}
case <-ctx.Done():
logError(a.flushOnce(output, interval, output.Write))
return
}
}
}
// flushOnce runs the output's Write function once, logging a warning each
// interval it fails to complete before.
func (a *Agent) flushOnce(
output *models.RunningOutput,
timeout time.Duration,
writeFunc func() error,
) error {
ticker := time.NewTicker(timeout)
defer ticker.Stop()
done := make(chan error)
go func() {
done <- writeFunc()
}()
for {
select {
case err := <-done:
output.LogBufferStatus()
return err
case <-ticker.C:
log.Printf("W! [agent] output %q did not complete within its flush interval",
output.Name)
output.LogBufferStatus()
}
}
}
// connectOutputs connects to all outputs.
func (a *Agent) connectOutputs(ctx context.Context) error {
for _, output := range a.Config.Outputs {
log.Printf("D! [agent] Attempting connection to output: %s\n", output.Name)
err := output.Output.Connect()
if err != nil {
log.Printf("E! [agent] Failed to connect to output %s, retrying in 15s, "+
"error was '%s' \n", output.Name, err)
err := internal.SleepContext(ctx, 15*time.Second)
if err != nil {
return err
}
err = output.Output.Connect()
if err != nil {
return err
}
}
log.Printf("D! [agent] Successfully connected to output: %s\n", output.Name)
}
return nil
}
// closeOutputs closes all outputs.
func (a *Agent) closeOutputs() error {
var err error
for _, output := range a.Config.Outputs {
err = output.Output.Close()
}
return err
}
// startServiceInputs starts all service inputs.
func (a *Agent) startServiceInputs(
ctx context.Context,
dst chan<- telegraf.Metric,
) error {
started := []telegraf.ServiceInput{}
for _, input := range a.Config.Inputs {
if si, ok := input.Input.(telegraf.ServiceInput); ok {
// Service input plugins are not subject to timestamp rounding.
// This only applies to the accumulator passed to Start(), the
// Gather() accumulator does apply rounding according to the
// precision agent setting.
acc := NewAccumulator(input, dst)
acc.SetPrecision(time.Nanosecond, 0)
err := si.Start(acc)
if err != nil {
log.Printf("E! [agent] Service for input %s failed to start: %v",
input.Name(), err)
for _, si := range started {
si.Stop()
}
return err
}
started = append(started, si)
}
}
return nil
}
// stopServiceInputs stops all service inputs.
func (a *Agent) stopServiceInputs() {
for _, input := range a.Config.Inputs {
if si, ok := input.Input.(telegraf.ServiceInput); ok {
si.Stop()
}
}
}
// panicRecover displays an error if an input panics.
func panicRecover(input *models.RunningInput) {
if err := recover(); err != nil {
trace := make([]byte, 2048)
runtime.Stack(trace, true)
log.Printf("E! FATAL: Input [%s] panicked: %s, Stack:\n%s\n",
input.Name(), err, trace)
log.Println("E! PLEASE REPORT THIS PANIC ON GITHUB with " +
"stack trace, configuration, and OS information: " +
"https://github.com/influxdata/telegraf/issues/new/choose")
}
}

agent/tick.go (new file, 57 lines)
View File

@ -0,0 +1,57 @@
package agent
import (
"context"
"sync"
"time"
"github.com/influxdata/telegraf/internal"
)
type Ticker struct {
C chan time.Time
ticker *time.Ticker
jitter time.Duration
wg sync.WaitGroup
cancelFunc context.CancelFunc
}
func NewTicker(
interval time.Duration,
jitter time.Duration,
) *Ticker {
ctx, cancel := context.WithCancel(context.Background())
t := &Ticker{
C: make(chan time.Time, 1),
ticker: time.NewTicker(interval),
jitter: jitter,
cancelFunc: cancel,
}
t.wg.Add(1)
go t.relayTime(ctx)
return t
}
func (t *Ticker) Stop() {
t.cancelFunc()
t.wg.Wait()
}
func (t *Ticker) relayTime(ctx context.Context) {
defer t.wg.Done()
for {
select {
case tm := <-t.ticker.C:
internal.SleepContext(ctx, internal.RandomDuration(t.jitter))
select {
case t.C <- tm:
default:
}
case <-ctx.Done():
return
}
}
}

View File

@ -1,6 +1,8 @@
package main
import (
"context"
"errors"
"flag"
"fmt"
"log"
@ -78,112 +80,111 @@ func reloadLoop(
for <-reload {
reload <- false
// If no other options are specified, load the config file and run.
c := config.NewConfig()
c.OutputFilters = outputFilters
c.InputFilters = inputFilters
err := c.LoadConfig(*fConfig)
if err != nil {
log.Fatal("E! " + err.Error())
}
ctx, cancel := context.WithCancel(context.Background())
if *fConfigDirectory != "" {
err = c.LoadDirectory(*fConfigDirectory)
if err != nil {
log.Fatal("E! " + err.Error())
}
}
if !*fTest && len(c.Outputs) == 0 {
log.Fatalf("E! Error: no outputs found, did you provide a valid config file?")
}
if len(c.Inputs) == 0 {
log.Fatalf("E! Error: no inputs found, did you provide a valid config file?")
}
if int64(c.Agent.Interval.Duration) <= 0 {
log.Fatalf("E! Agent interval must be positive, found %s",
c.Agent.Interval.Duration)
}
if int64(c.Agent.FlushInterval.Duration) <= 0 {
log.Fatalf("E! Agent flush_interval must be positive; found %s",
c.Agent.Interval.Duration)
}
ag, err := agent.NewAgent(c)
if err != nil {
log.Fatal("E! " + err.Error())
}
// Setup logging
logger.SetupLogging(
ag.Config.Agent.Debug || *fDebug,
ag.Config.Agent.Quiet || *fQuiet,
ag.Config.Agent.Logfile,
)
if *fTest {
err = ag.Test()
if err != nil {
log.Fatal("E! " + err.Error())
}
os.Exit(0)
}
err = ag.Connect()
if err != nil {
log.Fatal("E! " + err.Error())
}
shutdown := make(chan struct{})
signals := make(chan os.Signal)
signal.Notify(signals, os.Interrupt, syscall.SIGHUP, syscall.SIGTERM)
go func() {
select {
case sig := <-signals:
if sig == os.Interrupt || sig == syscall.SIGTERM {
close(shutdown)
}
if sig == syscall.SIGHUP {
log.Printf("I! Reloading Telegraf config\n")
log.Printf("I! Reloading Telegraf config")
<-reload
reload <- true
close(shutdown)
}
cancel()
case <-stop:
close(shutdown)
cancel()
}
}()
log.Printf("I! Starting Telegraf %s\n", version)
log.Printf("I! Loaded inputs: %s", strings.Join(c.InputNames(), " "))
log.Printf("I! Loaded aggregators: %s", strings.Join(c.AggregatorNames(), " "))
log.Printf("I! Loaded processors: %s", strings.Join(c.ProcessorNames(), " "))
log.Printf("I! Loaded outputs: %s", strings.Join(c.OutputNames(), " "))
log.Printf("I! Tags enabled: %s", c.ListTags())
if *fPidfile != "" {
f, err := os.OpenFile(*fPidfile, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Printf("E! Unable to create pidfile: %s", err)
} else {
fmt.Fprintf(f, "%d\n", os.Getpid())
f.Close()
defer func() {
err := os.Remove(*fPidfile)
if err != nil {
log.Printf("E! Unable to remove pidfile: %s", err)
}
}()
}
err := runAgent(ctx, inputFilters, outputFilters)
if err != nil {
log.Fatalf("E! [telegraf] Error running agent: %v", err)
}
ag.Run(shutdown)
}
}
func runAgent(ctx context.Context,
inputFilters []string,
outputFilters []string,
) error {
// If no other options are specified, load the config file and run.
c := config.NewConfig()
c.OutputFilters = outputFilters
c.InputFilters = inputFilters
err := c.LoadConfig(*fConfig)
if err != nil {
return err
}
if *fConfigDirectory != "" {
err = c.LoadDirectory(*fConfigDirectory)
if err != nil {
return err
}
}
if !*fTest && len(c.Outputs) == 0 {
return errors.New("Error: no outputs found, did you provide a valid config file?")
}
if len(c.Inputs) == 0 {
return errors.New("Error: no inputs found, did you provide a valid config file?")
}
if int64(c.Agent.Interval.Duration) <= 0 {
return fmt.Errorf("Agent interval must be positive, found %s",
c.Agent.Interval.Duration)
}
if int64(c.Agent.FlushInterval.Duration) <= 0 {
return fmt.Errorf("Agent flush_interval must be positive; found %s",
c.Agent.Interval.Duration)
}
ag, err := agent.NewAgent(c)
if err != nil {
return err
}
// Setup logging
logger.SetupLogging(
ag.Config.Agent.Debug || *fDebug,
ag.Config.Agent.Quiet || *fQuiet,
ag.Config.Agent.Logfile,
)
if *fTest {
return ag.Test()
}
log.Printf("I! Starting Telegraf %s\n", version)
log.Printf("I! Loaded inputs: %s", strings.Join(c.InputNames(), " "))
log.Printf("I! Loaded aggregators: %s", strings.Join(c.AggregatorNames(), " "))
log.Printf("I! Loaded processors: %s", strings.Join(c.ProcessorNames(), " "))
log.Printf("I! Loaded outputs: %s", strings.Join(c.OutputNames(), " "))
log.Printf("I! Tags enabled: %s", c.ListTags())
if *fPidfile != "" {
f, err := os.OpenFile(*fPidfile, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Printf("E! Unable to create pidfile: %s", err)
} else {
fmt.Fprintf(f, "%d\n", os.Getpid())
f.Close()
defer func() {
err := os.Remove(*fPidfile)
if err != nil {
log.Printf("E! Unable to remove pidfile: %s", err)
}
}()
}
}
return ag.Run(ctx)
}
func usageExit(rc int) {
fmt.Println(internal.Usage)
os.Exit(rc)

docs/AGGREGATORS.md (new file, 126 lines)
View File

@ -0,0 +1,126 @@
### Aggregator Plugins
This section is for developers who want to create a new aggregator plugin.
### Aggregator Plugin Guidelines
* An aggregator must conform to the [telegraf.Aggregator][] interface.
* Aggregators should call `aggregators.Add` in their `init` function to
register themselves. See below for a quick example.
* To be available within Telegraf itself, plugins must add themselves to the
`github.com/influxdata/telegraf/plugins/aggregators/all/all.go` file.
* The `SampleConfig` function should return valid toml that describes how the
plugin can be configured. This is included in `telegraf config`. Please
consult the [SampleConfig][] page for the latest style guidelines.
* The `Description` function should say in one line what this aggregator does.
* The Aggregator plugin will need to keep caches of metrics that have passed
through it. This should be done using the builtin `HashID()` function of
each metric.
* When the `Reset()` function is called, all caches should be cleared.
### Aggregator Plugin Example
```go
package min
// min.go
import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/aggregators"
)
type Min struct {
// caches for metric fields, names, and tags
fieldCache map[uint64]map[string]float64
nameCache map[uint64]string
tagCache map[uint64]map[string]string
}
func NewMin() telegraf.Aggregator {
m := &Min{}
m.Reset()
return m
}
var sampleConfig = `
## period is the flush & clear interval of the aggregator.
period = "30s"
## If true drop_original will drop the original metrics and
## only send aggregates.
drop_original = false
`
func (m *Min) SampleConfig() string {
return sampleConfig
}
func (m *Min) Description() string {
return "Keep the aggregate min of each metric passing through."
}
func (m *Min) Add(in telegraf.Metric) {
id := in.HashID()
if _, ok := m.nameCache[id]; !ok {
// hit an uncached metric, create caches for first time:
m.nameCache[id] = in.Name()
m.tagCache[id] = in.Tags()
m.fieldCache[id] = make(map[string]float64)
for k, v := range in.Fields() {
if fv, ok := convert(v); ok {
m.fieldCache[id][k] = fv
}
}
} else {
for k, v := range in.Fields() {
if fv, ok := convert(v); ok {
if _, ok := m.fieldCache[id][k]; !ok {
// hit an uncached field of a cached metric
m.fieldCache[id][k] = fv
continue
}
if fv < m.fieldCache[id][k] {
// set new minimum
m.fieldCache[id][k] = fv
}
}
}
}
}
func (m *Min) Push(acc telegraf.Accumulator) {
for id := range m.nameCache {
fields := map[string]interface{}{}
for k, v := range m.fieldCache[id] {
fields[k+"_min"] = v
}
acc.AddFields(m.nameCache[id], fields, m.tagCache[id])
}
}
func (m *Min) Reset() {
m.fieldCache = make(map[uint64]map[string]float64)
m.nameCache = make(map[uint64]string)
m.tagCache = make(map[uint64]map[string]string)
}
func convert(in interface{}) (float64, bool) {
switch v := in.(type) {
case float64:
return v, true
case int64:
return float64(v), true
default:
return 0, false
}
}
func init() {
aggregators.Add("min", func() telegraf.Aggregator {
return NewMin()
})
}
```
[telegraf.Aggregator]: https://godoc.org/github.com/influxdata/telegraf#Aggregator
[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig

View File

@ -106,6 +106,14 @@ emitted from the input plugin.
### Output Configuration
- **flush_interval**: The maximum time between flushes. Use this setting to
override the agent `flush_interval` on a per plugin basis.
- **metric_batch_size**: The maximum number of metrics to send at once. Use
this setting to override the agent `metric_batch_size` on a per plugin basis.
- **metric_buffer_limit**: The maximum number of unsent metrics to buffer.
Use this setting to override the agent `metric_buffer_limit` on a per plugin
basis.
The [metric filtering](#metric-filtering) parameters can be used to limit what metrics are
emitted from the output plugin.
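For example, a configuration that overrides these values for a single output might
look like the following; the plugin choice and the values shown are only
illustrative:

```toml
[[outputs.file]]
  files = ["stdout"]

  ## Per-plugin overrides of the agent-level settings.
  flush_interval = "30s"
  metric_batch_size = 500
  metric_buffer_limit = 50000
```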

docs/INPUTS.md (new file, 143 lines)
View File

@ -0,0 +1,143 @@
### Input Plugins
This section is for developers who want to create new collection inputs.
Telegraf is entirely plugin driven. This interface allows operators to
pick and choose what is gathered and makes it easy for developers
to create new ways of generating metrics.
Plugin authorship is kept as simple as possible to encourage people to develop
and submit new inputs.
### Input Plugin Guidelines
- A plugin must conform to the [telegraf.Input][] interface.
- Input Plugins should call `inputs.Add` in their `init` function to register
themselves. See below for a quick example.
- Input Plugins must be added to the
`github.com/influxdata/telegraf/plugins/inputs/all/all.go` file.
- The `SampleConfig` function should return valid toml that describes how the
plugin can be configured. This is included in `telegraf config`. Please
consult the [SampleConfig][] page for the latest style
guidelines.
- The `Description` function should say in one line what this plugin does.
Let's say you've written a plugin that emits metrics about processes on the
current host.
### Input Plugin Example
```go
package simple
// simple.go
import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs"
)
type Simple struct {
Ok bool
}
func (s *Simple) Description() string {
return "a demo plugin"
}
func (s *Simple) SampleConfig() string {
return `
## Indicate if everything is fine
ok = true
`
}
func (s *Simple) Gather(acc telegraf.Accumulator) error {
if s.Ok {
acc.AddFields("state", map[string]interface{}{"value": "pretty good"}, nil)
} else {
acc.AddFields("state", map[string]interface{}{"value": "not great"}, nil)
}
return nil
}
func init() {
inputs.Add("simple", func() telegraf.Input { return &Simple{} })
}
```
### Development
* Run `make static` followed by `make plugin-[pluginName]` to spin up a docker
dev environment using docker-compose.
* ***[Optional]*** When developing a plugin, add a `dev` directory with a
`docker-compose.yml` and `telegraf.conf` as well as any other supporting
files, where sensible.
### Typed Metrics
In addition to the `AddFields` function, the accumulator also supports
functions to add typed metrics: `AddGauge`, `AddCounter`, etc. Metric types
are ignored by the InfluxDB output, but can be used for other outputs, such as
[prometheus][prom metric types].
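For example, a variant of the `Simple` plugin's `Gather` method above could emit a
gauge and a counter; the measurement, field, and tag names here are made up purely
for illustration:

```go
func (s *Simple) Gather(acc telegraf.Accumulator) error {
	tags := map[string]string{"host": "example"}

	// A gauge is a value sampled at a point in time.
	acc.AddGauge("queue", map[string]interface{}{"depth": 42}, tags)

	// A counter is a monotonically increasing value.
	acc.AddCounter("queue", map[string]interface{}{"messages_total": 1024}, tags)

	return nil
}
```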
### Data Formats
Some input plugins, such as the [exec][] plugin, can accept any supported
[input data formats][].
In order to enable this, you must specify a `SetParser(parser parsers.Parser)`
function on the plugin object (see the exec plugin for an example), as well as
defining `parser` as a field of the object.
You can then utilize the parser internally in your plugin, parsing data as you
see fit. Telegraf's configuration layer will take care of instantiating and
creating the `Parser` object.
Add the following to the `SampleConfig()`:
```toml
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
```
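A rough sketch of the wiring described above follows; the plugin name and its
fields are hypothetical, not an existing plugin:

```go
package simpleparser

import (
	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/plugins/inputs"
	"github.com/influxdata/telegraf/plugins/parsers"
)

type SimpleParser struct {
	parser parsers.Parser
}

func (s *SimpleParser) Description() string  { return "a demo parsing plugin" }
func (s *SimpleParser) SampleConfig() string { return "" }

// SetParser is called by Telegraf's configuration layer with the parser
// built from the user's `data_format` setting.
func (s *SimpleParser) SetParser(parser parsers.Parser) {
	s.parser = parser
}

func (s *SimpleParser) Gather(acc telegraf.Accumulator) error {
	// Imagine `raw` was read from a command, file, or socket.
	raw := []byte("example value=42")

	metrics, err := s.parser.Parse(raw)
	if err != nil {
		return err
	}
	for _, m := range metrics {
		acc.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
	}
	return nil
}

func init() {
	inputs.Add("simpleparser", func() telegraf.Input { return &SimpleParser{} })
}
```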
### Service Input Plugins
This section is for developers who want to create new "service" collection
inputs. A service plugin differs from a regular plugin in that it operates a
background service while Telegraf is running. One example would be the
`statsd` plugin, which operates a statsd server.
Service Input Plugins are substantially more complicated than regular
plugins, as they typically require goroutines and synchronization to ensure
data integrity. Service Input Plugins should be avoided unless there is no way
to implement their behavior with a regular plugin.
To create a Service Input, implement the [telegraf.ServiceInput][] interface.
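As a rough sketch (the plugin name, measurement, and single emitted metric
are invented for illustration), a service input might look like:
```go
package example

import (
    "github.com/influxdata/telegraf"
    "github.com/influxdata/telegraf/plugins/inputs"
)

type Example struct {
    acc  telegraf.Accumulator
    done chan struct{}
}

func (e *Example) Description() string { return "a demo service input" }

func (e *Example) SampleConfig() string { return "" }

// Gather is still called every interval; many service inputs add their
// metrics from a background goroutine instead and leave this empty.
func (e *Example) Gather(acc telegraf.Accumulator) error { return nil }

// Start may retain the Accumulator and use it until Stop returns.
func (e *Example) Start(acc telegraf.Accumulator) error {
    e.acc = acc
    e.done = make(chan struct{})
    go e.run()
    return nil
}

func (e *Example) run() {
    // A real plugin would accept connections or poll a queue here; this
    // sketch emits a single metric and then waits for Stop.
    e.acc.AddFields("example", map[string]interface{}{"started": int64(1)}, nil)
    <-e.done
}

// Stop shuts down the background goroutine; the Accumulator must not be used
// after Stop returns.
func (e *Example) Stop() {
    close(e.done)
}

func init() {
    inputs.Add("example", func() telegraf.Input { return &Example{} })
}
```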
### Metric Tracking
Metric Tracking provides a system to be notified when metrics have been
successfully written to their outputs or otherwise discarded. This allows
inputs to be created that function as reliable queue consumers.
To get started with metric tracking begin by calling `WithTracking` on the
[telegraf.Accumulator][]. Add metrics using the `AddTrackingMetricGroup`
function on the returned [telegraf.TrackingAccumulator][] and store the
`TrackingID`. The `Delivered()` channel will return a type with information
about the final delivery status of the metric group.
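The overall shape of a tracking-based queue consumer might look roughly like
the sketch below. The `message` type, its channel, and the ack/nack hooks are
placeholders for broker-specific code, and the remaining input methods are
omitted:
```go
package example

import "github.com/influxdata/telegraf"

// message stands in for whatever a real broker delivers.
type message struct {
    metrics []telegraf.Metric
    ack     func()
    nack    func()
}

type Consumer struct {
    MaxUndelivered int

    acc      telegraf.TrackingAccumulator
    pending  map[telegraf.TrackingID]message
    messages chan message
    done     chan struct{}
}

func (c *Consumer) Start(acc telegraf.Accumulator) error {
    // Limit how many metric groups may be outstanding at once.
    c.acc = acc.WithTracking(c.MaxUndelivered)
    c.pending = make(map[telegraf.TrackingID]message)
    c.messages = make(chan message)
    c.done = make(chan struct{})
    go c.run()
    return nil
}

func (c *Consumer) run() {
    for {
        select {
        case msg := <-c.messages:
            // Remember which message each tracking ID belongs to.
            id := c.acc.AddTrackingMetricGroup(msg.metrics)
            c.pending[id] = msg
        case info := <-c.acc.Delivered():
            msg := c.pending[info.ID()]
            delete(c.pending, info.ID())
            if info.Delivered() {
                msg.ack()
            } else {
                msg.nack()
            }
        case <-c.done:
            return
        }
    }
}

func (c *Consumer) Stop() {
    close(c.done)
}
```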
Check the [amqp_consumer][] for a complete, real-world implementation.
[exec]: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/exec
[amqp_consumer]: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/amqp_consumer
[prom metric types]: https://prometheus.io/docs/concepts/metric_types/
[input data formats]: https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig
[telegraf.Input]: https://godoc.org/github.com/influxdata/telegraf#Input
[telegraf.ServiceInput]: https://godoc.org/github.com/influxdata/telegraf#ServiceInput
[telegraf.Accumulator]: https://godoc.org/github.com/influxdata/telegraf#Accumulator
[telegraf.TrackingAccumulator]: https://godoc.org/github.com/influxdata/telegraf#TrackingAccumulator

docs/OUTPUTS.md Normal file
@ -0,0 +1,95 @@
### Output Plugins
This section is for developers who want to create a new output sink. Outputs
are created in a similar manner as collection plugins, and their interface has
similar constructs.
### Output Plugin Guidelines
- An output must conform to the [telegraf.Output][] interface.
- Outputs should call `outputs.Add` in their `init` function to register
themselves. See below for a quick example.
- To be available within Telegraf itself, plugins must add themselves to the
`github.com/influxdata/telegraf/plugins/outputs/all/all.go` file.
- The `SampleConfig` function should return valid toml that describes how the
plugin can be configured. This is included in `telegraf config`. Please
consult the [SampleConfig][] page for the latest style guidelines.
- The `Description` function should say in one line what this output does.
### Output Plugin Example
```go
package simpleoutput

// simpleoutput.go

import (
    "github.com/influxdata/telegraf"
    "github.com/influxdata/telegraf/plugins/outputs"
)

type Simple struct {
    Ok bool
}

func (s *Simple) Description() string {
    return "a demo output"
}

func (s *Simple) SampleConfig() string {
    return `
  ok = true
`
}

func (s *Simple) Connect() error {
    // Make a connection to the URL here
    return nil
}

func (s *Simple) Close() error {
    // Close connection to the URL here
    return nil
}

func (s *Simple) Write(metrics []telegraf.Metric) error {
    for _, metric := range metrics {
        // write `metric` to the output sink here
        _ = metric
    }
    return nil
}

func init() {
    outputs.Add("simpleoutput", func() telegraf.Output { return &Simple{} })
}
```
### Data Formats
Some output plugins, such as the [file][] plugin, can write in any of the
supported [output data formats][].
In order to enable this, you must define a
`SetSerializer(serializer serializers.Serializer)`
method on the plugin object (see the file plugin for an example) and add a
`serializer` field to the object to hold it.
You can then use the serializer internally in your plugin, serializing data
before it is written. Telegraf's configuration layer takes care of
instantiating and configuring the `Serializer` object.
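A minimal sketch of the serializer plumbing might look like this; the
`Example` type and the placeholder sink are illustrative, and the other
`telegraf.Output` methods are omitted:
```go
package example

import (
    "github.com/influxdata/telegraf"
    "github.com/influxdata/telegraf/plugins/serializers"
)

// Example shows only the serializer plumbing.
type Example struct {
    serializer serializers.Serializer
}

// SetSerializer is called by Telegraf's configuration layer with a serializer
// built from the `data_format` option.
func (e *Example) SetSerializer(serializer serializers.Serializer) {
    e.serializer = serializer
}

func (e *Example) Write(metrics []telegraf.Metric) error {
    for _, m := range metrics {
        octets, err := e.serializer.Serialize(m)
        if err != nil {
            return err
        }
        // Send the serialized bytes to the sink here.
        _ = octets
    }
    return nil
}
```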
You should also add the following to your `SampleConfig()`:
```toml
## Data format to output.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md
data_format = "influx"
```
[file]: https://github.com/influxdata/telegraf/tree/master/plugins/outputs/file
[output data formats]: https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md
[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig
[telegraf.Output]: https://godoc.org/github.com/influxdata/telegraf#Output

docs/PROCESSORS.md Normal file
@ -0,0 +1,63 @@
### Processor Plugins
This section is for developers who want to create a new processor plugin.
### Processor Plugin Guidelines
* A processor must conform to the [telegraf.Processor][] interface.
* Processors should call `processors.Add` in their `init` function to register
themselves. See below for a quick example.
* To be available within Telegraf itself, plugins must add themselves to the
`github.com/influxdata/telegraf/plugins/processors/all/all.go` file.
* The `SampleConfig` function should return valid toml that describes how the
processor can be configured. This is included in `telegraf config`. Please
consult the [SampleConfig][] page for the latest style guidelines.
* The `Description` function should say in one line what this processor does.
### Processor Plugin Example
```go
package printer

// printer.go

import (
    "fmt"

    "github.com/influxdata/telegraf"
    "github.com/influxdata/telegraf/plugins/processors"
)

type Printer struct {
}

var sampleConfig = `
`

func (p *Printer) SampleConfig() string {
    return sampleConfig
}

func (p *Printer) Description() string {
    return "Print all metrics that pass through this filter."
}

func (p *Printer) Apply(in ...telegraf.Metric) []telegraf.Metric {
    for _, metric := range in {
        fmt.Println(metric.String())
    }
    return in
}

func init() {
    processors.Add("printer", func() telegraf.Processor {
        return &Printer{}
    })
}
```
[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig
[telegraf.Processor]: https://godoc.org/github.com/influxdata/telegraf#Processor

@ -13,17 +13,10 @@ type Input interface {
}
type ServiceInput interface {
// SampleConfig returns the default configuration of the Input
SampleConfig() string
Input
// Description returns a one-sentence description on the Input
Description() string
// Gather takes in an accumulator and adds the metrics that the Input
// gathers. This is called every "interval"
Gather(Accumulator) error
// Start starts the ServiceInput's service, whatever that may be
// Start the ServiceInput. The Accumulator may be retained and used until
// Stop returns.
Start(Accumulator) error
// Stop stops the services and closes any necessary channels and connections

@ -1,130 +0,0 @@
package buffer
import (
"sync"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/selfstat"
)
var (
MetricsWritten = selfstat.Register("agent", "metrics_written", map[string]string{})
MetricsDropped = selfstat.Register("agent", "metrics_dropped", map[string]string{})
)
// Buffer is an object for storing metrics in a circular buffer.
type Buffer struct {
sync.Mutex
buf []telegraf.Metric
first int
last int
size int
empty bool
}
// NewBuffer returns a Buffer
// size is the maximum number of metrics that Buffer will cache. If Add is
// called when the buffer is full, then the oldest metric(s) will be dropped.
func NewBuffer(size int) *Buffer {
return &Buffer{
buf: make([]telegraf.Metric, size),
first: 0,
last: 0,
size: size,
empty: true,
}
}
// IsEmpty returns true if Buffer is empty.
func (b *Buffer) IsEmpty() bool {
return b.empty
}
// Len returns the current length of the buffer.
func (b *Buffer) Len() int {
if b.empty {
return 0
} else if b.first <= b.last {
return b.last - b.first + 1
}
// Spans the end of array.
// size - gap in the middle
return b.size - (b.first - b.last - 1) // size - gap
}
func (b *Buffer) push(m telegraf.Metric) {
// Empty
if b.empty {
b.last = b.first // Reset
b.buf[b.last] = m
b.empty = false
return
}
b.last++
b.last %= b.size
// Full
if b.first == b.last {
MetricsDropped.Incr(1)
b.first = (b.first + 1) % b.size
}
b.buf[b.last] = m
}
// Add adds metrics to the buffer.
func (b *Buffer) Add(metrics ...telegraf.Metric) {
b.Lock()
defer b.Unlock()
for i := range metrics {
MetricsWritten.Incr(1)
b.push(metrics[i])
}
}
// Batch returns a batch of metrics of size batchSize.
// the batch will be of maximum length batchSize. It can be less than batchSize,
// if the length of Buffer is less than batchSize.
func (b *Buffer) Batch(batchSize int) []telegraf.Metric {
b.Lock()
defer b.Unlock()
outLen := min(b.Len(), batchSize)
out := make([]telegraf.Metric, outLen)
if outLen == 0 {
return out
}
// We copy everything right of first up to last, count or end
// b.last >= rightInd || b.last < b.first
// therefore won't copy past b.last
rightInd := min(b.size, b.first+outLen) - 1
copyCount := copy(out, b.buf[b.first:rightInd+1])
// We've emptied the ring
if rightInd == b.last {
b.empty = true
}
b.first = rightInd + 1
b.first %= b.size
// We circle back for the rest
if copyCount < outLen {
right := min(b.last, outLen-copyCount)
copy(out[copyCount:], b.buf[b.first:right+1])
// We've emptied the ring
if right == b.last {
b.empty = true
}
b.first = right + 1
b.first %= b.size
}
return out
}
func min(a, b int) int {
if b < a {
return b
}
return a
}

@ -1,203 +0,0 @@
package buffer
import (
"sync"
"sync/atomic"
"testing"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/assert"
)
var metricList = []telegraf.Metric{
testutil.TestMetric(2, "mymetric1"),
testutil.TestMetric(1, "mymetric2"),
testutil.TestMetric(11, "mymetric3"),
testutil.TestMetric(15, "mymetric4"),
testutil.TestMetric(8, "mymetric5"),
}
func makeBench5(b *testing.B, freq, batchSize int) {
const k = 1000
var wg sync.WaitGroup
buf := NewBuffer(10000)
m := testutil.TestMetric(1, "mymetric")
for i := 0; i < b.N; i++ {
buf.Add(m, m, m, m, m)
if i%(freq*k) == 0 {
wg.Add(1)
go func() {
buf.Batch(batchSize * k)
wg.Done()
}()
}
}
// Flush
buf.Batch(b.N)
wg.Wait()
}
func makeBenchStrict(b *testing.B, freq, batchSize int) {
const k = 1000
var count uint64
var wg sync.WaitGroup
buf := NewBuffer(10000)
m := testutil.TestMetric(1, "mymetric")
for i := 0; i < b.N; i++ {
buf.Add(m)
if i%(freq*k) == 0 {
wg.Add(1)
go func() {
defer wg.Done()
l := len(buf.Batch(batchSize * k))
atomic.AddUint64(&count, uint64(l))
}()
}
}
// Flush
wg.Add(1)
go func() {
l := len(buf.Batch(b.N))
atomic.AddUint64(&count, uint64(l))
wg.Done()
}()
wg.Wait()
if count != uint64(b.N) {
b.Errorf("not all metrics came out. %d of %d", count, b.N)
}
}
func makeBench(b *testing.B, freq, batchSize int) {
const k = 1000
var wg sync.WaitGroup
buf := NewBuffer(10000)
m := testutil.TestMetric(1, "mymetric")
for i := 0; i < b.N; i++ {
buf.Add(m)
if i%(freq*k) == 0 {
wg.Add(1)
go func() {
buf.Batch(batchSize * k)
wg.Done()
}()
}
}
wg.Wait()
// Flush
buf.Batch(b.N)
}
func BenchmarkBufferBatch5Add(b *testing.B) {
makeBench5(b, 100, 101)
}
func BenchmarkBufferBigInfrequentBatchCatchup(b *testing.B) {
makeBench(b, 100, 101)
}
func BenchmarkBufferOftenBatch(b *testing.B) {
makeBench(b, 1, 1)
}
func BenchmarkBufferAlmostBatch(b *testing.B) {
makeBench(b, 10, 9)
}
func BenchmarkBufferSlowBatch(b *testing.B) {
makeBench(b, 10, 1)
}
func BenchmarkBufferBatchNoDrop(b *testing.B) {
makeBenchStrict(b, 1, 4)
}
func BenchmarkBufferCatchup(b *testing.B) {
buf := NewBuffer(10000)
m := testutil.TestMetric(1, "mymetric")
for i := 0; i < b.N; i++ {
buf.Add(m)
}
buf.Batch(b.N)
}
func BenchmarkAddMetrics(b *testing.B) {
buf := NewBuffer(10000)
m := testutil.TestMetric(1, "mymetric")
for n := 0; n < b.N; n++ {
buf.Add(m)
}
}
func TestNewBufferBasicFuncs(t *testing.T) {
b := NewBuffer(10)
MetricsDropped.Set(0)
MetricsWritten.Set(0)
assert.True(t, b.IsEmpty())
assert.Zero(t, b.Len())
assert.Zero(t, MetricsDropped.Get())
assert.Zero(t, MetricsWritten.Get())
m := testutil.TestMetric(1, "mymetric")
b.Add(m)
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 1)
assert.Equal(t, int64(0), MetricsDropped.Get())
assert.Equal(t, int64(1), MetricsWritten.Get())
b.Add(metricList...)
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 6)
assert.Equal(t, int64(0), MetricsDropped.Get())
assert.Equal(t, int64(6), MetricsWritten.Get())
}
func TestDroppingMetrics(t *testing.T) {
b := NewBuffer(10)
MetricsDropped.Set(0)
MetricsWritten.Set(0)
// Add up to the size of the buffer
b.Add(metricList...)
b.Add(metricList...)
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 10)
assert.Equal(t, int64(0), MetricsDropped.Get())
assert.Equal(t, int64(10), MetricsWritten.Get())
// Add 5 more and verify they were dropped
b.Add(metricList...)
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 10)
assert.Equal(t, int64(5), MetricsDropped.Get())
assert.Equal(t, int64(15), MetricsWritten.Get())
}
func TestGettingBatches(t *testing.T) {
b := NewBuffer(20)
MetricsDropped.Set(0)
MetricsWritten.Set(0)
// Verify that the buffer returned is smaller than requested when there are
// not as many items as requested.
b.Add(metricList...)
batch := b.Batch(10)
assert.Len(t, batch, 5)
// Verify that the buffer is now empty
assert.True(t, b.IsEmpty())
assert.Zero(t, b.Len())
assert.Zero(t, MetricsDropped.Get())
assert.Equal(t, int64(5), MetricsWritten.Get())
// Verify that the buffer returned is not more than the size requested
b.Add(metricList...)
batch = b.Batch(3)
assert.Len(t, batch, 3)
// Verify that buffer is not empty
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 2)
assert.Equal(t, int64(0), MetricsDropped.Get())
assert.Equal(t, int64(10), MetricsWritten.Get())
}

@ -9,7 +9,6 @@ import (
"math"
"os"
"path/filepath"
"regexp"
"runtime"
"sort"
@ -26,7 +25,6 @@ import (
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/influxdata/telegraf/plugins/processors"
"github.com/influxdata/telegraf/plugins/serializers"
"github.com/influxdata/toml"
"github.com/influxdata/toml/ast"
)
@ -622,6 +620,19 @@ func (c *Config) LoadConfig(path string) error {
}
}
if !c.Agent.OmitHostname {
if c.Agent.Hostname == "" {
hostname, err := os.Hostname()
if err != nil {
return err
}
c.Agent.Hostname = hostname
}
c.Tags["host"] = c.Agent.Hostname
}
// Parse all the rest of the plugins:
for name, val := range tbl.Fields {
subTable, ok := val.(*ast.Table)
@ -709,6 +720,7 @@ func (c *Config) LoadConfig(path string) error {
if len(c.Processors) > 1 {
sort.Sort(c.Processors)
}
return nil
}
@ -876,6 +888,7 @@ func (c *Config) addInput(name string, table *ast.Table) error {
}
rp := models.NewRunningInput(input, pluginConfig)
rp.SetDefaultTags(c.Tags)
c.Inputs = append(c.Inputs, rp)
return nil
}
@ -1751,6 +1764,8 @@ func buildOutput(name string, tbl *ast.Table) (*models.OutputConfig, error) {
Name: name,
Filter: filter,
}
// TODO
// Outputs don't support FieldDrop/FieldPass, so set to NameDrop/NamePass
if len(oc.Filter.FieldDrop) > 0 {
oc.Filter.NameDrop = oc.Filter.FieldDrop
@ -1758,5 +1773,47 @@ func buildOutput(name string, tbl *ast.Table) (*models.OutputConfig, error) {
if len(oc.Filter.FieldPass) > 0 {
oc.Filter.NamePass = oc.Filter.FieldPass
}
if node, ok := tbl.Fields["flush_interval"]; ok {
if kv, ok := node.(*ast.KeyValue); ok {
if str, ok := kv.Value.(*ast.String); ok {
dur, err := time.ParseDuration(str.Value)
if err != nil {
return nil, err
}
oc.FlushInterval = dur
}
}
}
if node, ok := tbl.Fields["metric_buffer_limit"]; ok {
if kv, ok := node.(*ast.KeyValue); ok {
if integer, ok := kv.Value.(*ast.Integer); ok {
v, err := integer.Int()
if err != nil {
return nil, err
}
oc.MetricBufferLimit = int(v)
}
}
}
if node, ok := tbl.Fields["metric_batch_size"]; ok {
if kv, ok := node.(*ast.KeyValue); ok {
if integer, ok := kv.Value.(*ast.Integer); ok {
v, err := integer.Int()
if err != nil {
return nil, err
}
oc.MetricBatchSize = int(v)
}
}
}
delete(tbl.Fields, "flush_interval")
delete(tbl.Fields, "metric_buffer_limit")
delete(tbl.Fields, "metric_batch_size")
return oc, nil
}

@ -4,6 +4,7 @@ import (
"bufio"
"bytes"
"compress/gzip"
"context"
"crypto/rand"
"errors"
"io"
@ -246,6 +247,51 @@ func RandomSleep(max time.Duration, shutdown chan struct{}) {
}
}
// RandomDuration returns a random duration between 0 and max.
func RandomDuration(max time.Duration) time.Duration {
if max == 0 {
return 0
}
var sleepns int64
maxSleep := big.NewInt(max.Nanoseconds())
if j, err := rand.Int(rand.Reader, maxSleep); err == nil {
sleepns = j.Int64()
}
return time.Duration(sleepns)
}
// SleepContext sleeps until the context is closed or the duration is reached.
func SleepContext(ctx context.Context, duration time.Duration) error {
if duration == 0 {
return nil
}
t := time.NewTimer(duration)
select {
case <-t.C:
return nil
case <-ctx.Done():
t.Stop()
return ctx.Err()
}
}
// AlignDuration returns the duration until next aligned interval.
func AlignDuration(tm time.Time, interval time.Duration) time.Duration {
return AlignTime(tm, interval).Sub(tm)
}
// AlignTime returns the time of the next aligned interval.
func AlignTime(tm time.Time, interval time.Duration) time.Time {
truncated := tm.Truncate(interval)
if truncated == tm {
return tm
}
return truncated.Add(interval)
}
// Exit status takes the error from exec.Command
// and returns the exit status and true
// if error is not exit status, will return 0 and false

@ -9,6 +9,7 @@ import (
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type SnakeTest struct {
@ -217,3 +218,55 @@ func TestVersionAlreadySet(t *testing.T) {
assert.Equal(t, "foo", Version())
}
func TestAlignDuration(t *testing.T) {
tests := []struct {
name string
now time.Time
interval time.Duration
expected time.Duration
}{
{
name: "aligned",
now: time.Date(2018, 1, 1, 1, 1, 0, 0, time.UTC),
interval: 10 * time.Second,
expected: 0 * time.Second,
},
{
name: "standard interval",
now: time.Date(2018, 1, 1, 1, 1, 1, 0, time.UTC),
interval: 10 * time.Second,
expected: 9 * time.Second,
},
{
name: "odd interval",
now: time.Date(2018, 1, 1, 1, 1, 1, 0, time.UTC),
interval: 3 * time.Second,
expected: 2 * time.Second,
},
{
name: "sub second interval",
now: time.Date(2018, 1, 1, 1, 1, 0, 5e8, time.UTC),
interval: 1 * time.Second,
expected: 500 * time.Millisecond,
},
{
name: "non divisible not aligned on minutes",
now: time.Date(2018, 1, 1, 1, 0, 0, 0, time.UTC),
interval: 1*time.Second + 100*time.Millisecond,
expected: 400 * time.Millisecond,
},
{
name: "long interval",
now: time.Date(2018, 1, 1, 1, 1, 0, 0, time.UTC),
interval: 1 * time.Hour,
expected: 59 * time.Minute,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actual := AlignDuration(tt.now, tt.interval)
require.Equal(t, tt.expected, actual)
})
}
}

internal/models/buffer.go Normal file
@ -0,0 +1,214 @@
package models
import (
"sync"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/selfstat"
)
var (
AgentMetricsWritten = selfstat.Register("agent", "metrics_written", map[string]string{})
AgentMetricsDropped = selfstat.Register("agent", "metrics_dropped", map[string]string{})
)
// Buffer stores metrics in a circular buffer.
type Buffer struct {
sync.Mutex
buf []telegraf.Metric
first int // index of the first/oldest metric
last int // one after the index of the last/newest metric
size int // number of metrics currently in the buffer
cap int // the capacity of the buffer
batchFirst int // index of the first metric in the batch
batchLast int // one after the index of the last metric in the batch
batchSize int // number of metrics current in the batch
MetricsAdded selfstat.Stat
MetricsWritten selfstat.Stat
MetricsDropped selfstat.Stat
}
// NewBuffer returns a new empty Buffer with the given capacity.
func NewBuffer(name string, capacity int) *Buffer {
b := &Buffer{
buf: make([]telegraf.Metric, capacity),
first: 0,
last: 0,
size: 0,
cap: capacity,
MetricsAdded: selfstat.Register(
"write",
"metrics_added",
map[string]string{"output": name},
),
MetricsWritten: selfstat.Register(
"write",
"metrics_written",
map[string]string{"output": name},
),
MetricsDropped: selfstat.Register(
"write",
"metrics_dropped",
map[string]string{"output": name},
),
}
return b
}
// Len returns the number of metrics currently in the buffer.
func (b *Buffer) Len() int {
b.Lock()
defer b.Unlock()
return b.size
}
func (b *Buffer) metricAdded() {
b.MetricsAdded.Incr(1)
}
func (b *Buffer) metricWritten(metric telegraf.Metric) {
AgentMetricsWritten.Incr(1)
b.MetricsWritten.Incr(1)
metric.Accept()
}
func (b *Buffer) metricDropped(metric telegraf.Metric) {
AgentMetricsDropped.Incr(1)
b.MetricsDropped.Incr(1)
metric.Reject()
}
func (b *Buffer) inBatch() bool {
if b.batchSize == 0 {
return false
}
if b.batchFirst < b.batchLast {
return b.last >= b.batchFirst && b.last < b.batchLast
} else {
return b.last >= b.batchFirst || b.last < b.batchLast
}
}
func (b *Buffer) add(m telegraf.Metric) {
// Check if Buffer is full
if b.size == b.cap {
if b.batchSize == 0 {
// No batch taken by the output, we can drop the metric now.
b.metricDropped(b.buf[b.last])
} else if b.inBatch() {
// There is an outstanding batch and this will overwrite a metric
// in it, delay the dropping only in case the batch gets rejected.
b.batchSize--
b.batchFirst++
b.batchFirst %= b.cap
} else {
// There is an outstanding batch, but this overwrites a metric
// outside of it.
b.metricDropped(b.buf[b.last])
}
}
b.metricAdded()
b.buf[b.last] = m
b.last++
b.last %= b.cap
if b.size == b.cap {
b.first++
b.first %= b.cap
}
b.size = min(b.size+1, b.cap)
}
// Add adds metrics to the buffer
func (b *Buffer) Add(metrics ...telegraf.Metric) {
b.Lock()
defer b.Unlock()
for i := range metrics {
b.add(metrics[i])
}
}
// Batch returns a slice containing up to batchSize of the most recently added
// metrics.
//
// The metrics contained in the batch are not removed from the buffer, instead
// the last batch is recorded and removed only if Accept is called.
func (b *Buffer) Batch(batchSize int) []telegraf.Metric {
b.Lock()
defer b.Unlock()
outLen := min(b.size, batchSize)
out := make([]telegraf.Metric, outLen)
if outLen == 0 {
return out
}
b.batchFirst = b.first
b.batchLast = b.first + outLen
b.batchLast %= b.cap
b.batchSize = outLen
until := min(b.cap, b.first+outLen)
n := copy(out, b.buf[b.first:until])
if n < outLen {
copy(out[n:], b.buf[:outLen-n])
}
return out
}
// Accept removes the metrics contained in the last batch.
func (b *Buffer) Accept(batch []telegraf.Metric) {
b.Lock()
defer b.Unlock()
for _, m := range batch {
b.metricWritten(m)
}
if b.batchSize > 0 {
b.size -= b.batchSize
b.first += b.batchSize
b.first %= b.cap
}
b.resetBatch()
}
// Reject clears the current batch record so that calls to Accept will have no
// effect.
func (b *Buffer) Reject(batch []telegraf.Metric) {
b.Lock()
defer b.Unlock()
if len(batch) > b.batchSize {
// Part or all of the batch was dropped before reject was called.
for _, m := range batch[b.batchSize:] {
b.metricDropped(m)
}
}
b.resetBatch()
}
func (b *Buffer) resetBatch() {
b.batchFirst = 0
b.batchLast = 0
b.batchSize = 0
}
func min(a, b int) int {
if b < a {
return b
}
return a
}

@ -0,0 +1,385 @@
package models
import (
"testing"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
"github.com/stretchr/testify/require"
)
type MockMetric struct {
telegraf.Metric
AcceptF func()
RejectF func()
DropF func()
}
func (m *MockMetric) Accept() {
m.AcceptF()
}
func (m *MockMetric) Reject() {
m.RejectF()
}
func (m *MockMetric) Drop() {
m.DropF()
}
func Metric() telegraf.Metric {
m, err := metric.New(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42.0,
},
time.Unix(0, 0),
)
if err != nil {
panic(err)
}
return m
}
func BenchmarkAddMetrics(b *testing.B) {
buf := NewBuffer("test", 10000)
m := Metric()
for n := 0; n < b.N; n++ {
buf.Add(m)
}
}
func setup(b *Buffer) *Buffer {
b.MetricsAdded.Set(0)
b.MetricsWritten.Set(0)
b.MetricsDropped.Set(0)
return b
}
func TestBuffer_LenEmpty(t *testing.T) {
b := setup(NewBuffer("test", 5))
require.Equal(t, 0, b.Len())
}
func TestBuffer_LenOne(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m)
require.Equal(t, 1, b.Len())
}
func TestBuffer_LenFull(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
require.Equal(t, 5, b.Len())
}
func TestBuffer_LenOverfill(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
setup(b)
b.Add(m, m, m, m, m, m)
require.Equal(t, 5, b.Len())
}
func TestBuffer_BatchLenZero(t *testing.T) {
b := setup(NewBuffer("test", 5))
batch := b.Batch(0)
require.Len(t, batch, 0)
}
func TestBuffer_BatchLenBufferEmpty(t *testing.T) {
b := setup(NewBuffer("test", 5))
batch := b.Batch(2)
require.Len(t, batch, 0)
}
func TestBuffer_BatchLenUnderfill(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m)
batch := b.Batch(2)
require.Len(t, batch, 1)
}
func TestBuffer_BatchLenFill(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m)
batch := b.Batch(2)
require.Len(t, batch, 2)
}
func TestBuffer_BatchLenExact(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m)
batch := b.Batch(2)
require.Len(t, batch, 2)
}
func TestBuffer_BatchLenLargerThanBuffer(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
batch := b.Batch(6)
require.Len(t, batch, 5)
}
func TestBuffer_BatchWrap(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
batch := b.Batch(2)
b.Accept(batch)
b.Add(m, m)
batch = b.Batch(5)
require.Len(t, batch, 5)
}
func TestBuffer_AddDropsOverwrittenMetrics(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
b.Add(m, m, m, m, m)
require.Equal(t, int64(5), b.MetricsDropped.Get())
require.Equal(t, int64(0), b.MetricsWritten.Get())
}
func TestBuffer_AcceptRemovesBatch(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m)
batch := b.Batch(2)
b.Accept(batch)
require.Equal(t, 1, b.Len())
}
func TestBuffer_RejectLeavesBatch(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m)
batch := b.Batch(2)
b.Reject(batch)
require.Equal(t, 3, b.Len())
}
func TestBuffer_AcceptWritesOverwrittenBatch(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
batch := b.Batch(5)
b.Add(m, m, m, m, m)
b.Accept(batch)
require.Equal(t, int64(0), b.MetricsDropped.Get())
require.Equal(t, int64(5), b.MetricsWritten.Get())
}
func TestBuffer_BatchRejectDropsOverwrittenBatch(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
batch := b.Batch(5)
b.Add(m, m, m, m, m)
b.Reject(batch)
require.Equal(t, int64(5), b.MetricsDropped.Get())
require.Equal(t, int64(0), b.MetricsWritten.Get())
}
func TestBuffer_MetricsOverwriteBatchAccept(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
batch := b.Batch(3)
b.Add(m, m, m)
b.Accept(batch)
require.Equal(t, int64(0), b.MetricsDropped.Get())
require.Equal(t, int64(3), b.MetricsWritten.Get())
}
func TestBuffer_MetricsOverwriteBatchReject(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
batch := b.Batch(3)
b.Add(m, m, m)
b.Reject(batch)
require.Equal(t, int64(3), b.MetricsDropped.Get())
require.Equal(t, int64(0), b.MetricsWritten.Get())
}
func TestBuffer_MetricsBatchAcceptRemoved(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
batch := b.Batch(3)
b.Add(m, m, m, m, m)
b.Accept(batch)
require.Equal(t, int64(2), b.MetricsDropped.Get())
require.Equal(t, int64(3), b.MetricsWritten.Get())
}
func TestBuffer_WrapWithBatch(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m)
b.Batch(3)
b.Add(m, m, m, m, m, m)
require.Equal(t, int64(1), b.MetricsDropped.Get())
}
func TestBuffer_BatchNotRemoved(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
b.Batch(2)
require.Equal(t, 5, b.Len())
}
func TestBuffer_BatchRejectAcceptNoop(t *testing.T) {
m := Metric()
b := setup(NewBuffer("test", 5))
b.Add(m, m, m, m, m)
batch := b.Batch(2)
b.Reject(batch)
b.Accept(batch)
require.Equal(t, 5, b.Len())
}
func TestBuffer_AcceptCallsMetricAccept(t *testing.T) {
var accept int
mm := &MockMetric{
Metric: Metric(),
AcceptF: func() {
accept++
},
}
b := setup(NewBuffer("test", 5))
b.Add(mm, mm, mm)
batch := b.Batch(2)
b.Accept(batch)
require.Equal(t, 2, accept)
}
func TestBuffer_AddCallsMetricRejectWhenNoBatch(t *testing.T) {
var reject int
mm := &MockMetric{
Metric: Metric(),
RejectF: func() {
reject++
},
}
b := setup(NewBuffer("test", 5))
setup(b)
b.Add(mm, mm, mm, mm, mm)
b.Add(mm, mm)
require.Equal(t, 2, reject)
}
func TestBuffer_AddCallsMetricRejectWhenNotInBatch(t *testing.T) {
var reject int
mm := &MockMetric{
Metric: Metric(),
RejectF: func() {
reject++
},
}
b := setup(NewBuffer("test", 5))
setup(b)
b.Add(mm, mm, mm, mm, mm)
batch := b.Batch(2)
b.Add(mm, mm, mm, mm)
// metric[2] and metric[3] rejected
require.Equal(t, 2, reject)
b.Reject(batch)
// metric[1] and metric[2] now rejected
require.Equal(t, 4, reject)
}
func TestBuffer_RejectCallsMetricRejectWithOverwritten(t *testing.T) {
var reject int
mm := &MockMetric{
Metric: Metric(),
RejectF: func() {
reject++
},
}
b := setup(NewBuffer("test", 5))
b.Add(mm, mm, mm, mm, mm)
batch := b.Batch(5)
b.Add(mm, mm)
require.Equal(t, 0, reject)
b.Reject(batch)
require.Equal(t, 2, reject)
}
func TestBuffer_AddOverwriteAndReject(t *testing.T) {
var reject int
mm := &MockMetric{
Metric: Metric(),
RejectF: func() {
reject++
},
}
b := setup(NewBuffer("test", 5))
b.Add(mm, mm, mm, mm, mm)
batch := b.Batch(5)
b.Add(mm, mm, mm, mm, mm)
b.Add(mm, mm, mm, mm, mm)
b.Add(mm, mm, mm, mm, mm)
b.Add(mm, mm, mm, mm, mm)
require.Equal(t, 15, reject)
b.Reject(batch)
require.Equal(t, 20, reject)
}
func TestBuffer_AddOverwriteAndRejectOffset(t *testing.T) {
var reject int
var accept int
mm := &MockMetric{
Metric: Metric(),
RejectF: func() {
reject++
},
AcceptF: func() {
accept++
},
}
b := setup(NewBuffer("test", 5))
b.Add(mm, mm, mm)
b.Add(mm, mm, mm, mm)
require.Equal(t, 2, reject)
batch := b.Batch(5)
b.Add(mm, mm, mm, mm)
require.Equal(t, 2, reject)
b.Add(mm, mm, mm, mm)
require.Equal(t, 5, reject)
b.Add(mm, mm, mm, mm)
require.Equal(t, 9, reject)
b.Add(mm, mm, mm, mm)
require.Equal(t, 13, reject)
b.Accept(batch)
require.Equal(t, 13, reject)
require.Equal(t, 5, accept)
}

@ -6,6 +6,7 @@ import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)
@ -480,3 +481,45 @@ func TestFilter_FilterTagsPassAndDrop(t *testing.T) {
}
}
func BenchmarkFilter(b *testing.B) {
tests := []struct {
name string
filter Filter
metric telegraf.Metric
}{
{
name: "empty filter",
filter: Filter{},
metric: testutil.MustMetric("cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
},
{
name: "namepass",
filter: Filter{
NamePass: []string{"cpu"},
},
metric: testutil.MustMetric("cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
},
}
for _, tt := range tests {
b.Run(tt.name, func(b *testing.B) {
require.NoError(b, tt.filter.Compile())
for n := 0; n < b.N; n++ {
tt.filter.Select(tt.metric)
}
})
}
}

@ -1,30 +1,53 @@
package models
import (
"log"
"sync"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/selfstat"
)
type RunningAggregator struct {
a telegraf.Aggregator
Config *AggregatorConfig
metrics chan telegraf.Metric
sync.Mutex
Aggregator telegraf.Aggregator
Config *AggregatorConfig
periodStart time.Time
periodEnd time.Time
MetricsPushed selfstat.Stat
MetricsFiltered selfstat.Stat
MetricsDropped selfstat.Stat
PushTime selfstat.Stat
}
func NewRunningAggregator(
a telegraf.Aggregator,
conf *AggregatorConfig,
aggregator telegraf.Aggregator,
config *AggregatorConfig,
) *RunningAggregator {
return &RunningAggregator{
a: a,
Config: conf,
metrics: make(chan telegraf.Metric, 100),
Aggregator: aggregator,
Config: config,
MetricsPushed: selfstat.Register(
"aggregate",
"metrics_pushed",
map[string]string{"aggregator": config.Name},
),
MetricsFiltered: selfstat.Register(
"aggregate",
"metrics_filtered",
map[string]string{"aggregator": config.Name},
),
MetricsDropped: selfstat.Register(
"aggregate",
"metrics_dropped",
map[string]string{"aggregator": config.Name},
),
PushTime: selfstat.Register(
"aggregate",
"push_time_ns",
map[string]string{"aggregator": config.Name},
),
}
}
@ -46,6 +69,15 @@ func (r *RunningAggregator) Name() string {
return "aggregators." + r.Config.Name
}
func (r *RunningAggregator) Period() time.Duration {
return r.Config.Period
}
func (r *RunningAggregator) SetPeriodStart(start time.Time) {
r.periodStart = start
r.periodEnd = r.periodStart.Add(r.Config.Period).Add(r.Config.Delay)
}
func (r *RunningAggregator) MakeMetric(metric telegraf.Metric) telegraf.Metric {
m := makemetric(
metric,
@ -59,9 +91,21 @@ func (r *RunningAggregator) MakeMetric(metric telegraf.Metric) telegraf.Metric {
m.SetAggregate(true)
}
r.MetricsPushed.Incr(1)
return m
}
func (r *RunningAggregator) metricFiltered(metric telegraf.Metric) {
r.MetricsFiltered.Incr(1)
metric.Accept()
}
func (r *RunningAggregator) metricDropped(metric telegraf.Metric) {
r.MetricsDropped.Incr(1)
metric.Accept()
}
// Add a metric to the aggregator and return true if the original metric
// should be dropped.
func (r *RunningAggregator) Add(metric telegraf.Metric) bool {
@ -74,75 +118,31 @@ func (r *RunningAggregator) Add(metric telegraf.Metric) bool {
return r.Config.DropOriginal
}
r.metrics <- metric
r.Lock()
defer r.Unlock()
if r.periodStart.IsZero() || metric.Time().Before(r.periodStart) || metric.Time().After(r.periodEnd) {
r.metricDropped(metric)
return false
}
r.Aggregator.Add(metric)
return r.Config.DropOriginal
}
func (r *RunningAggregator) add(in telegraf.Metric) {
r.a.Add(in)
func (r *RunningAggregator) Push(acc telegraf.Accumulator) {
r.Lock()
defer r.Unlock()
r.periodStart = r.periodEnd
r.periodEnd = r.periodStart.Add(r.Config.Period).Add(r.Config.Delay)
r.push(acc)
r.Aggregator.Reset()
}
func (r *RunningAggregator) push(acc telegraf.Accumulator) {
r.a.Push(acc)
}
func (r *RunningAggregator) reset() {
r.a.Reset()
}
// Run runs the running aggregator, listens for incoming metrics, and waits
// for period ticks to tell it when to push and reset the aggregator.
func (r *RunningAggregator) Run(
acc telegraf.Accumulator,
shutdown chan struct{},
) {
// The start of the period is truncated to the nearest second.
//
// Every metric then gets it's timestamp checked and is dropped if it
// is not within:
//
// start < t < end + truncation + delay
//
// So if we start at now = 00:00.2 with a 10s period and 0.3s delay:
// now = 00:00.2
// start = 00:00
// truncation = 00:00.2
// end = 00:10
// 1st interval: 00:00 - 00:10.5
// 2nd interval: 00:10 - 00:20.5
// etc.
//
now := time.Now()
r.periodStart = now.Truncate(time.Second)
truncation := now.Sub(r.periodStart)
r.periodEnd = r.periodStart.Add(r.Config.Period)
time.Sleep(r.Config.Delay)
periodT := time.NewTicker(r.Config.Period)
defer periodT.Stop()
for {
select {
case <-shutdown:
if len(r.metrics) > 0 {
// wait until metrics are flushed before exiting
continue
}
return
case m := <-r.metrics:
if m.Time().Before(r.periodStart) ||
m.Time().After(r.periodEnd.Add(truncation).Add(r.Config.Delay)) {
// the metric is outside the current aggregation period, so
// skip it.
log.Printf("D! aggregator: metric \"%s\" is not in the current timewindow, skipping", m.Name())
continue
}
r.add(m)
case <-periodT.C:
r.periodStart = r.periodEnd
r.periodEnd = r.periodStart.Add(r.Config.Period)
r.push(acc)
r.reset()
}
}
start := time.Now()
r.Aggregator.Push(acc)
elapsed := time.Since(start)
r.PushTime.Incr(elapsed.Nanoseconds())
}

@ -1,16 +1,13 @@
package models
import (
"sync"
"sync/atomic"
"testing"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@ -23,28 +20,24 @@ func TestAdd(t *testing.T) {
},
Period: time.Millisecond * 500,
})
assert.NoError(t, ra.Config.Filter.Compile())
require.NoError(t, ra.Config.Filter.Compile())
acc := testutil.Accumulator{}
go ra.Run(&acc, make(chan struct{}))
m, err := metric.New("RITest",
now := time.Now()
ra.SetPeriodStart(now)
m := testutil.MustMetric("RITest",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
time.Now().Add(time.Millisecond*150),
telegraf.Untyped)
require.NoError(t, err)
require.False(t, ra.Add(m))
ra.Push(&acc)
assert.False(t, ra.Add(m))
for {
time.Sleep(time.Millisecond)
if atomic.LoadInt64(&a.sum) > 0 {
break
}
}
assert.Equal(t, int64(101), atomic.LoadInt64(&a.sum))
require.Equal(t, 1, len(acc.Metrics))
require.Equal(t, int64(101), acc.Metrics[0].Fields["sum"])
}
func TestAddMetricsOutsideCurrentPeriod(t *testing.T) {
@ -56,50 +49,45 @@ func TestAddMetricsOutsideCurrentPeriod(t *testing.T) {
},
Period: time.Millisecond * 500,
})
assert.NoError(t, ra.Config.Filter.Compile())
require.NoError(t, ra.Config.Filter.Compile())
acc := testutil.Accumulator{}
go ra.Run(&acc, make(chan struct{}))
now := time.Now()
ra.SetPeriodStart(now)
m, err := metric.New("RITest",
m := testutil.MustMetric("RITest",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
time.Now().Add(-time.Hour),
telegraf.Untyped)
require.NoError(t, err)
assert.False(t, ra.Add(m))
now.Add(-time.Hour),
telegraf.Untyped,
)
require.False(t, ra.Add(m))
// metric after current period
m, err = metric.New("RITest",
m = testutil.MustMetric("RITest",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
time.Now().Add(time.Hour),
telegraf.Untyped)
require.NoError(t, err)
assert.False(t, ra.Add(m))
now.Add(time.Hour),
telegraf.Untyped,
)
require.False(t, ra.Add(m))
// "now" metric
m, err = metric.New("RITest",
m = testutil.MustMetric("RITest",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
time.Now().Add(time.Millisecond*50),
telegraf.Untyped)
require.NoError(t, err)
assert.False(t, ra.Add(m))
require.False(t, ra.Add(m))
for {
time.Sleep(time.Millisecond)
if atomic.LoadInt64(&a.sum) > 0 {
break
}
}
assert.Equal(t, int64(101), atomic.LoadInt64(&a.sum))
ra.Push(&acc)
require.Equal(t, 1, len(acc.Metrics))
require.Equal(t, int64(101), acc.Metrics[0].Fields["sum"])
}
func TestAddAndPushOnePeriod(t *testing.T) {
@ -111,37 +99,24 @@ func TestAddAndPushOnePeriod(t *testing.T) {
},
Period: time.Millisecond * 500,
})
assert.NoError(t, ra.Config.Filter.Compile())
require.NoError(t, ra.Config.Filter.Compile())
acc := testutil.Accumulator{}
shutdown := make(chan struct{})
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
ra.Run(&acc, shutdown)
}()
now := time.Now()
ra.SetPeriodStart(now)
m, err := metric.New("RITest",
m := testutil.MustMetric("RITest",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
time.Now().Add(time.Millisecond*100),
telegraf.Untyped)
require.NoError(t, err)
assert.False(t, ra.Add(m))
require.False(t, ra.Add(m))
ra.Push(&acc)
for {
time.Sleep(time.Millisecond)
if acc.NMetrics() > 0 {
break
}
}
acc.AssertContainsFields(t, "TestMetric", map[string]interface{}{"sum": int64(101)})
close(shutdown)
wg.Wait()
}
func TestAddDropOriginal(t *testing.T) {
@ -152,28 +127,29 @@ func TestAddDropOriginal(t *testing.T) {
},
DropOriginal: true,
})
assert.NoError(t, ra.Config.Filter.Compile())
require.NoError(t, ra.Config.Filter.Compile())
m, err := metric.New("RITest",
now := time.Now()
ra.SetPeriodStart(now)
m := testutil.MustMetric("RITest",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
time.Now(),
now,
telegraf.Untyped)
require.NoError(t, err)
assert.True(t, ra.Add(m))
require.True(t, ra.Add(m))
// this metric name doesn't match the filter, so Add will return false
m2, err := metric.New("foobar",
m2 := testutil.MustMetric("foobar",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
time.Now(),
now,
telegraf.Untyped)
require.NoError(t, err)
assert.False(t, ra.Add(m2))
require.False(t, ra.Add(m2))
}
type TestAggregator struct {

@ -1,11 +1,9 @@
package models
import (
"fmt"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/serializers/influx"
"github.com/influxdata/telegraf/selfstat"
)
@ -15,16 +13,13 @@ type RunningInput struct {
Input telegraf.Input
Config *InputConfig
trace bool
defaultTags map[string]string
MetricsGathered selfstat.Stat
GatherTime selfstat.Stat
}
func NewRunningInput(
input telegraf.Input,
config *InputConfig,
) *RunningInput {
func NewRunningInput(input telegraf.Input, config *InputConfig) *RunningInput {
return &RunningInput{
Input: input,
Config: config,
@ -33,6 +28,11 @@ func NewRunningInput(
"metrics_gathered",
map[string]string{"input": config.Name},
),
GatherTime: selfstat.RegisterTiming(
"gather",
"gather_time_ns",
map[string]string{"input": config.Name},
),
}
}
@ -52,13 +52,19 @@ func (r *RunningInput) Name() string {
return "inputs." + r.Config.Name
}
func (r *RunningInput) metricFiltered(metric telegraf.Metric) {
metric.Drop()
}
func (r *RunningInput) MakeMetric(metric telegraf.Metric) telegraf.Metric {
if ok := r.Config.Filter.Select(metric); !ok {
r.metricFiltered(metric)
return nil
}
r.Config.Filter.Modify(metric)
if len(metric.FieldList()) == 0 {
r.metricFiltered(metric)
return nil
}
@ -70,26 +76,17 @@ func (r *RunningInput) MakeMetric(metric telegraf.Metric) telegraf.Metric {
r.Config.Tags,
r.defaultTags)
if r.trace && m != nil {
s := influx.NewSerializer()
s.SetFieldSortOrder(influx.SortFields)
octets, err := s.Serialize(m)
if err == nil {
fmt.Print("> " + string(octets))
}
}
r.MetricsGathered.Incr(1)
GlobalMetricsGathered.Incr(1)
return m
}
func (r *RunningInput) Trace() bool {
return r.trace
}
func (r *RunningInput) SetTrace(trace bool) {
r.trace = trace
func (r *RunningInput) Gather(acc telegraf.Accumulator) error {
start := time.Now()
err := r.Input.Gather(acc)
elapsed := time.Since(start)
r.GatherTime.Incr(elapsed.Nanoseconds())
return err
}
func (r *RunningInput) SetDefaultTags(tags map[string]string) {

@ -6,6 +6,7 @@ import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@ -66,17 +67,13 @@ func TestMakeMetricWithPluginTags(t *testing.T) {
},
})
ri.SetTrace(true)
assert.Equal(t, true, ri.Trace())
m, err := metric.New("RITest",
m := testutil.MustMetric("RITest",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
now,
telegraf.Untyped)
require.NoError(t, err)
m = ri.MakeMetric(m)
expected, err := metric.New("RITest",
@ -102,8 +99,6 @@ func TestMakeMetricFilteredOut(t *testing.T) {
Filter: Filter{NamePass: []string{"foobar"}},
})
ri.SetTrace(true)
assert.Equal(t, true, ri.Trace())
assert.NoError(t, ri.Config.Filter.Compile())
m, err := metric.New("RITest",
@ -127,17 +122,13 @@ func TestMakeMetricWithDaemonTags(t *testing.T) {
"foo": "bar",
})
ri.SetTrace(true)
assert.Equal(t, true, ri.Trace())
m, err := metric.New("RITest",
m := testutil.MustMetric("RITest",
map[string]string{},
map[string]interface{}{
"value": int64(101),
},
now,
telegraf.Untyped)
require.NoError(t, err)
m = ri.MakeMetric(m)
expected, err := metric.New("RITest",
map[string]string{

@ -6,7 +6,6 @@ import (
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal/buffer"
"github.com/influxdata/telegraf/selfstat"
)
@ -18,6 +17,16 @@ const (
DEFAULT_METRIC_BUFFER_LIMIT = 10000
)
// OutputConfig containing name and filter
type OutputConfig struct {
Name string
Filter Filter
FlushInterval time.Duration
MetricBufferLimit int
MetricBatchSize int
}
// RunningOutput contains the output configuration
type RunningOutput struct {
Name string
@ -27,24 +36,16 @@ type RunningOutput struct {
MetricBatchSize int
MetricsFiltered selfstat.Stat
MetricsWritten selfstat.Stat
BufferSize selfstat.Stat
BufferLimit selfstat.Stat
WriteTime selfstat.Stat
metrics *buffer.Buffer
failMetrics *buffer.Buffer
batch []telegraf.Metric
buffer *Buffer
BatchReady chan time.Time
// Guards against concurrent calls to Add, Push, Reset
aggMutex sync.Mutex
// Guards against concurrent calls to the Output as described in #3009
writeMutex sync.Mutex
}
// OutputConfig containing name and filter
type OutputConfig struct {
Name string
Filter Filter
aggMutex sync.Mutex
batchMutex sync.Mutex
}
func NewRunningOutput(
@ -54,25 +55,27 @@ func NewRunningOutput(
batchSize int,
bufferLimit int,
) *RunningOutput {
if conf.MetricBufferLimit > 0 {
bufferLimit = conf.MetricBufferLimit
}
if bufferLimit == 0 {
bufferLimit = DEFAULT_METRIC_BUFFER_LIMIT
}
if conf.MetricBatchSize > 0 {
batchSize = conf.MetricBatchSize
}
if batchSize == 0 {
batchSize = DEFAULT_METRIC_BATCH_SIZE
}
ro := &RunningOutput{
Name: name,
metrics: buffer.NewBuffer(batchSize),
failMetrics: buffer.NewBuffer(bufferLimit),
batch: make([]telegraf.Metric, 0, batchSize),
buffer: NewBuffer(name, bufferLimit),
BatchReady: make(chan time.Time, 1),
Output: output,
Config: conf,
MetricBufferLimit: bufferLimit,
MetricBatchSize: batchSize,
MetricsWritten: selfstat.Register(
"write",
"metrics_written",
map[string]string{"output": name},
),
MetricsFiltered: selfstat.Register(
"write",
"metrics_filtered",
@ -94,20 +97,28 @@ func NewRunningOutput(
map[string]string{"output": name},
),
}
ro.BufferLimit.Set(int64(ro.MetricBufferLimit))
return ro
}
// AddMetric adds a metric to the output. This function can also write cached
// points if FlushBufferWhenFull is true.
func (ro *RunningOutput) metricFiltered(metric telegraf.Metric) {
ro.MetricsFiltered.Incr(1)
metric.Drop()
}
// AddMetric adds a metric to the output.
//
// Takes ownership of metric
func (ro *RunningOutput) AddMetric(metric telegraf.Metric) {
if ok := ro.Config.Filter.Select(metric); !ok {
ro.MetricsFiltered.Incr(1)
ro.metricFiltered(metric)
return
}
ro.Config.Filter.Modify(metric)
if len(metric.FieldList()) == 0 {
ro.metricFiltered(metric)
return
}
@ -118,85 +129,98 @@ func (ro *RunningOutput) AddMetric(metric telegraf.Metric) {
return
}
ro.metrics.Add(metric)
if ro.metrics.Len() == ro.MetricBatchSize {
batch := ro.metrics.Batch(ro.MetricBatchSize)
err := ro.write(batch)
if err != nil {
ro.failMetrics.Add(batch...)
log.Printf("E! Error writing to output [%s]: %v", ro.Name, err)
ro.batchMutex.Lock()
ro.batch = append(ro.batch, metric)
if len(ro.batch) == ro.MetricBatchSize {
ro.addBatchToBuffer()
nBuffer := ro.buffer.Len()
ro.BufferSize.Set(int64(nBuffer))
select {
case ro.BatchReady <- time.Now():
default:
}
}
ro.batchMutex.Unlock()
}
// Write writes all cached points to this output.
// AddBatchToBuffer moves the metrics from the batch into the metric buffer.
func (ro *RunningOutput) addBatchToBuffer() {
ro.buffer.Add(ro.batch...)
ro.batch = ro.batch[:0]
}
// Write writes all metrics to the output, stopping when all have been sent on
// or error.
func (ro *RunningOutput) Write() error {
if output, ok := ro.Output.(telegraf.AggregatingOutput); ok {
ro.aggMutex.Lock()
metrics := output.Push()
ro.metrics.Add(metrics...)
ro.buffer.Add(metrics...)
output.Reset()
ro.aggMutex.Unlock()
}
// add and write can be called concurrently
ro.batchMutex.Lock()
ro.addBatchToBuffer()
ro.batchMutex.Unlock()
nFails, nMetrics := ro.failMetrics.Len(), ro.metrics.Len()
ro.BufferSize.Set(int64(nFails + nMetrics))
log.Printf("D! Output [%s] buffer fullness: %d / %d metrics. ",
ro.Name, nFails+nMetrics, ro.MetricBufferLimit)
var err error
if !ro.failMetrics.IsEmpty() {
// how many batches of failed writes we need to write.
nBatches := nFails/ro.MetricBatchSize + 1
batchSize := ro.MetricBatchSize
nBuffer := ro.buffer.Len()
for i := 0; i < nBatches; i++ {
// If it's the last batch, only grab the metrics that have not had
// a write attempt already (this is primarily to preserve order).
if i == nBatches-1 {
batchSize = nFails % ro.MetricBatchSize
}
batch := ro.failMetrics.Batch(batchSize)
// If we've already failed previous writes, don't bother trying to
// write to this output again. We are not exiting the loop just so
// that we can rotate the metrics to preserve order.
if err == nil {
err = ro.write(batch)
}
if err != nil {
ro.failMetrics.Add(batch...)
}
// Only process the metrics in the buffer now. Metrics added while we are
// writing will be sent on the next call.
nBatches := nBuffer/ro.MetricBatchSize + 1
for i := 0; i < nBatches; i++ {
batch := ro.buffer.Batch(ro.MetricBatchSize)
if len(batch) == 0 {
break
}
}
batch := ro.metrics.Batch(ro.MetricBatchSize)
// see comment above about not trying to write to an already failed output.
// if ro.failMetrics is empty then err will always be nil at this point.
if err == nil {
err = ro.write(batch)
}
if err != nil {
ro.failMetrics.Add(batch...)
return err
err := ro.write(batch)
if err != nil {
ro.buffer.Reject(batch)
return err
}
ro.buffer.Accept(batch)
}
return nil
}
func (ro *RunningOutput) write(metrics []telegraf.Metric) error {
nMetrics := len(metrics)
if nMetrics == 0 {
// WriteBatch writes only the batch metrics to the output.
func (ro *RunningOutput) WriteBatch() error {
batch := ro.buffer.Batch(ro.MetricBatchSize)
if len(batch) == 0 {
return nil
}
ro.writeMutex.Lock()
defer ro.writeMutex.Unlock()
err := ro.write(batch)
if err != nil {
ro.buffer.Reject(batch)
return err
}
ro.buffer.Accept(batch)
return nil
}
func (ro *RunningOutput) write(metrics []telegraf.Metric) error {
start := time.Now()
err := ro.Output.Write(metrics)
elapsed := time.Since(start)
ro.WriteTime.Incr(elapsed.Nanoseconds())
if err == nil {
log.Printf("D! Output [%s] wrote batch of %d metrics in %s\n",
ro.Name, nMetrics, elapsed)
ro.MetricsWritten.Incr(int64(nMetrics))
ro.WriteTime.Incr(elapsed.Nanoseconds())
log.Printf("D! [outputs.%s] wrote batch of %d metrics in %s\n",
ro.Name, len(metrics), elapsed)
}
return err
}
func (ro *RunningOutput) LogBufferStatus() {
nBuffer := ro.buffer.Len()
log.Printf("D! [outputs.%s] buffer fullness: %d / %d metrics. ",
ro.Name, nBuffer, ro.MetricBufferLimit)
}

@ -231,56 +231,6 @@ func TestRunningOutputDefault(t *testing.T) {
assert.Len(t, m.Metrics(), 10)
}
// Test that running output doesn't flush until it's full when
// FlushBufferWhenFull is set.
func TestRunningOutputFlushWhenFull(t *testing.T) {
conf := &OutputConfig{
Filter: Filter{},
}
m := &mockOutput{}
ro := NewRunningOutput("test", m, conf, 6, 10)
// Fill buffer to 1 under limit
for _, metric := range first5 {
ro.AddMetric(metric)
}
// no flush yet
assert.Len(t, m.Metrics(), 0)
// add one more metric
ro.AddMetric(next5[0])
// now it flushed
assert.Len(t, m.Metrics(), 6)
// add one more metric and write it manually
ro.AddMetric(next5[1])
err := ro.Write()
assert.NoError(t, err)
assert.Len(t, m.Metrics(), 7)
}
// Test that running output doesn't flush until it's full when
// FlushBufferWhenFull is set, twice.
func TestRunningOutputMultiFlushWhenFull(t *testing.T) {
conf := &OutputConfig{
Filter: Filter{},
}
m := &mockOutput{}
ro := NewRunningOutput("test", m, conf, 4, 12)
// Fill buffer past limit twice
for _, metric := range first5 {
ro.AddMetric(metric)
}
for _, metric := range next5 {
ro.AddMetric(metric)
}
// flushed twice
assert.Len(t, m.Metrics(), 8)
}
func TestRunningOutputWriteFail(t *testing.T) {
conf := &OutputConfig{
Filter: Filter{},

@ -27,6 +27,19 @@ type ProcessorConfig struct {
Filter Filter
}
func (rp *RunningProcessor) metricFiltered(metric telegraf.Metric) {
metric.Drop()
}
func containsMetric(item telegraf.Metric, metrics []telegraf.Metric) bool {
for _, m := range metrics {
if item == m {
return true
}
}
return false
}
func (rp *RunningProcessor) Apply(in ...telegraf.Metric) []telegraf.Metric {
rp.Lock()
defer rp.Unlock()
@ -43,6 +56,7 @@ func (rp *RunningProcessor) Apply(in ...telegraf.Metric) []telegraf.Metric {
rp.Config.Filter.Modify(metric)
if len(metric.FieldList()) == 0 {
rp.metricFiltered(metric)
continue
}

@ -6,7 +6,7 @@ import (
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)
@ -41,20 +41,6 @@ func TagProcessor(key, value string) *MockProcessor {
}
}
func Metric(
name string,
tags map[string]string,
fields map[string]interface{},
tm time.Time,
tp ...telegraf.ValueType,
) telegraf.Metric {
m, err := metric.New(name, tags, fields, tm, tp...)
if err != nil {
panic(err)
}
return m
}
func TestRunningProcessor_Apply(t *testing.T) {
type args struct {
Processor telegraf.Processor
@ -76,7 +62,7 @@ func TestRunningProcessor_Apply(t *testing.T) {
},
},
input: []telegraf.Metric{
Metric(
testutil.MustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
@ -86,7 +72,7 @@ func TestRunningProcessor_Apply(t *testing.T) {
),
},
expected: []telegraf.Metric{
Metric(
testutil.MustMetric(
"cpu",
map[string]string{
"apply": "true",
@ -109,7 +95,7 @@ func TestRunningProcessor_Apply(t *testing.T) {
},
},
input: []telegraf.Metric{
Metric(
testutil.MustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
@ -119,7 +105,7 @@ func TestRunningProcessor_Apply(t *testing.T) {
),
},
expected: []telegraf.Metric{
Metric(
testutil.MustMetric(
"cpu",
map[string]string{
"apply": "true",
@ -142,7 +128,7 @@ func TestRunningProcessor_Apply(t *testing.T) {
},
},
input: []telegraf.Metric{
Metric(
testutil.MustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
@ -152,7 +138,7 @@ func TestRunningProcessor_Apply(t *testing.T) {
),
},
expected: []telegraf.Metric{
Metric(
testutil.MustMetric(
"cpu",
map[string]string{},
map[string]interface{}{

@ -62,6 +62,17 @@ type Metric interface {
// Copy returns a deep copy of the Metric.
Copy() Metric
// Accept marks the metric as processed successfully and written to an
// output.
Accept()
// Reject marks the metric as processed unsuccessfully.
Reject()
// Drop marks the metric as processed successfully without being written
// to any output.
Drop()
// Mark Metric as an aggregate
SetAggregate(bool)
IsAggregate() bool

@ -248,6 +248,15 @@ func (m *metric) HashID() uint64 {
return h.Sum64()
}
func (m *metric) Accept() {
}
func (m *metric) Reject() {
}
func (m *metric) Drop() {
}
// Convert field to a supported type or nil if unconvertible
func convertField(v interface{}) interface{} {
switch v := v.(type) {

metric/tracking.go Normal file
@ -0,0 +1,171 @@
package metric
import (
"log"
"runtime"
"sync/atomic"
"github.com/influxdata/telegraf"
)
// NotifyFunc is called when a tracking metric is done being processed with
// the tracking information.
type NotifyFunc = func(track telegraf.DeliveryInfo)
// WithTracking adds tracking to the metric and registers the notify function
// to be called when processing is complete.
func WithTracking(metric telegraf.Metric, fn NotifyFunc) (telegraf.Metric, telegraf.TrackingID) {
return newTrackingMetric(metric, fn)
}
// WithBatchTracking adds tracking to the metrics and registers the notify
// function to be called when processing is complete.
func WithGroupTracking(metric []telegraf.Metric, fn NotifyFunc) ([]telegraf.Metric, telegraf.TrackingID) {
return newTrackingMetricGroup(metric, fn)
}
func EnableDebugFinalizer() {
finalizer = debugFinalizer
}
var (
lastID uint64
finalizer func(*trackingData)
)
func newTrackingID() telegraf.TrackingID {
atomic.AddUint64(&lastID, 1)
return telegraf.TrackingID(lastID)
}
func debugFinalizer(d *trackingData) {
rc := atomic.LoadInt32(&d.rc)
if rc != 0 {
log.Fatalf("E! [agent] metric collected with non-zero reference count rc: %d", rc)
}
}
type trackingData struct {
id telegraf.TrackingID
rc int32
acceptCount int32
rejectCount int32
notify NotifyFunc
}
func (d *trackingData) incr() {
atomic.AddInt32(&d.rc, 1)
}
func (d *trackingData) decr() int32 {
return atomic.AddInt32(&d.rc, -1)
}
func (d *trackingData) accept() {
atomic.AddInt32(&d.acceptCount, 1)
}
func (d *trackingData) reject() {
atomic.AddInt32(&d.rejectCount, 1)
}
type trackingMetric struct {
telegraf.Metric
d *trackingData
}
func newTrackingMetric(metric telegraf.Metric, fn NotifyFunc) (telegraf.Metric, telegraf.TrackingID) {
m := &trackingMetric{
Metric: metric,
d: &trackingData{
id: newTrackingID(),
rc: 1,
acceptCount: 0,
rejectCount: 0,
notify: fn,
},
}
if finalizer != nil {
runtime.SetFinalizer(m.d, finalizer)
}
return m, m.d.id
}
func newTrackingMetricGroup(group []telegraf.Metric, fn NotifyFunc) ([]telegraf.Metric, telegraf.TrackingID) {
d := &trackingData{
id: newTrackingID(),
rc: 0,
acceptCount: 0,
rejectCount: 0,
notify: fn,
}
for i, m := range group {
d.incr()
dm := &trackingMetric{
Metric: m,
d: d,
}
group[i] = dm
}
if finalizer != nil {
runtime.SetFinalizer(d, finalizer)
}
return group, d.id
}
func (m *trackingMetric) Copy() telegraf.Metric {
m.d.incr()
return &trackingMetric{
Metric: m.Metric.Copy(),
d: m.d,
}
}
func (m *trackingMetric) Accept() {
m.d.accept()
m.decr()
}
func (m *trackingMetric) Reject() {
m.d.reject()
m.decr()
}
func (m *trackingMetric) Drop() {
m.decr()
}
func (m *trackingMetric) decr() {
v := m.d.decr()
if v < 0 {
panic("negative refcount")
}
if v == 0 {
m.d.notify(
&deliveryInfo{
id: m.d.id,
accepted: int(m.d.acceptCount),
rejected: int(m.d.rejectCount),
},
)
}
}
type deliveryInfo struct {
id telegraf.TrackingID
accepted int
rejected int
}
func (r *deliveryInfo) ID() telegraf.TrackingID {
return r.id
}
func (r *deliveryInfo) Delivered() bool {
return r.rejected == 0
}
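As a minimal sketch of how these helpers fit together (assuming the `metric.New` constructor used by the tests below and the repository's `metric` import path), a caller wraps a group of metrics, hands the wrapped metrics onward, and gets a single notification once every copy has been accepted, rejected, or dropped:

```go
package main

import (
	"fmt"
	"time"

	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/metric"
)

func main() {
	// Build a plain metric; same signature as mustMetric in the tests below.
	m, err := metric.New(
		"cpu",
		map[string]string{},
		map[string]interface{}{"value": 42},
		time.Unix(0, 0),
	)
	if err != nil {
		panic(err)
	}

	// Wrap the group; the notify function runs once the reference count
	// reaches zero, i.e. every wrapped metric (and every Copy of it) has
	// been accepted, rejected, or dropped.
	tracked, id := metric.WithGroupTracking([]telegraf.Metric{m}, func(d telegraf.DeliveryInfo) {
		fmt.Printf("group %d delivered=%v\n", d.ID(), d.Delivered())
	})

	// An output would normally call Accept after a successful write.
	for _, tm := range tracked {
		tm.Accept()
	}
	_ = id
}
```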

metric/tracking_test.go (new file, 260 lines)

@ -0,0 +1,260 @@
package metric
import (
"testing"
"time"
"github.com/influxdata/telegraf"
"github.com/stretchr/testify/require"
)
func mustMetric(
name string,
tags map[string]string,
fields map[string]interface{},
tm time.Time,
tp ...telegraf.ValueType,
) telegraf.Metric {
m, err := New(name, tags, fields, tm, tp...)
if err != nil {
panic("mustMetric")
}
return m
}
type deliveries struct {
Info map[telegraf.TrackingID]telegraf.DeliveryInfo
}
func (d *deliveries) onDelivery(info telegraf.DeliveryInfo) {
d.Info[info.ID()] = info
}
func TestTracking(t *testing.T) {
tests := []struct {
name string
metric telegraf.Metric
actions func(metric telegraf.Metric)
delivered bool
}{
{
name: "accept",
metric: mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
actions: func(m telegraf.Metric) {
m.Accept()
},
delivered: true,
},
{
name: "reject",
metric: mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
actions: func(m telegraf.Metric) {
m.Reject()
},
delivered: false,
},
{
name: "accept copy",
metric: mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
actions: func(m telegraf.Metric) {
m2 := m.Copy()
m.Accept()
m2.Accept()
},
delivered: true,
},
{
name: "copy with accept and done",
metric: mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
actions: func(m telegraf.Metric) {
m2 := m.Copy()
m.Accept()
m2.Drop()
},
delivered: true,
},
{
name: "copy with mixed delivery",
metric: mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
actions: func(m telegraf.Metric) {
m2 := m.Copy()
m.Accept()
m2.Reject()
},
delivered: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &deliveries{
Info: make(map[telegraf.TrackingID]telegraf.DeliveryInfo),
}
metric, id := WithTracking(tt.metric, d.onDelivery)
tt.actions(metric)
info := d.Info[id]
require.Equal(t, tt.delivered, info.Delivered())
})
}
}
func TestGroupTracking(t *testing.T) {
tests := []struct {
name string
metrics []telegraf.Metric
actions func(metrics []telegraf.Metric)
delivered bool
}{
{
name: "accept",
metrics: []telegraf.Metric{
mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
},
actions: func(metrics []telegraf.Metric) {
metrics[0].Accept()
metrics[1].Accept()
},
delivered: true,
},
{
name: "reject",
metrics: []telegraf.Metric{
mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
},
actions: func(metrics []telegraf.Metric) {
metrics[0].Reject()
metrics[1].Reject()
},
delivered: false,
},
{
name: "remove",
metrics: []telegraf.Metric{
mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
},
actions: func(metrics []telegraf.Metric) {
metrics[0].Drop()
metrics[1].Drop()
},
delivered: true,
},
{
name: "mixed",
metrics: []telegraf.Metric{
mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
mustMetric(
"cpu",
map[string]string{},
map[string]interface{}{
"value": 42,
},
time.Unix(0, 0),
),
},
actions: func(metrics []telegraf.Metric) {
metrics[0].Accept()
metrics[1].Reject()
},
delivered: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &deliveries{
Info: make(map[telegraf.TrackingID]telegraf.DeliveryInfo),
}
metrics, id := WithGroupTracking(tt.metrics, d.onDelivery)
tt.actions(metrics)
info := d.Info[id]
require.Equal(t, tt.delivered, info.Delivered())
})
}
}


@ -17,16 +17,7 @@ type Output interface {
// if the Output only accepts a fixed set of aggregations over a time period.
// These functions may be called concurrently to the Write function.
type AggregatingOutput interface {
// Connect to the Output
Connect() error
// Close any connections to the Output
Close() error
// Description returns a one-sentence description on the Output
Description() string
// SampleConfig returns the default configuration of the Output
SampleConfig() string
// Write takes in group of points to be written to the Output
Write(metrics []Metric) error
Output
// Add the metric to the aggregator
Add(in Metric)
@ -35,21 +26,3 @@ type AggregatingOutput interface {
// Reset signals that the aggregator period is completed.
Reset()
}
type ServiceOutput interface {
// Connect to the Output
Connect() error
// Close any connections to the Output
Close() error
// Description returns a one-sentence description on the Output
Description() string
// SampleConfig returns the default configuration of the Output
SampleConfig() string
// Write takes in group of points to be written to the Output
Write(metrics []Metric) error
// Start the "service" that will provide an Output
Start() error
// Stop the "service" that will provide an Output
Stop()
}


@ -133,7 +133,6 @@ func (m *BasicStats) Add(in telegraf.Metric) {
}
func (m *BasicStats) Push(acc telegraf.Accumulator) {
config := getConfiguredStats(m)
for _, aggregate := range m.cache {


@ -13,7 +13,6 @@ For an introduction to AMQP see:
The following defaults are known to work with RabbitMQ:
```toml
# AMQP consumer plugin
[[inputs.amqp_consumer]]
## Broker to consume from.
## deprecated in 1.7; use the brokers option
@ -46,16 +45,26 @@ The following defaults are known to work with RabbitMQ:
## AMQP queue name
queue = "telegraf"
## AMQP queue durability can be "transient" or "durable".
queue_durability = "durable"
## Binding Key
binding_key = "#"
## Maximum number of messages server should give to the worker.
# prefetch_count = 50
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Auth method. PLAIN and EXTERNAL are supported
## Using EXTERNAL requires enabling the rabbitmq_auth_mechanism_ssl plugin as
## described here: https://www.rabbitmq.com/plugins.html


@ -1,6 +1,7 @@
package amqp_consumer
import (
"context"
"errors"
"fmt"
"log"
@ -9,25 +10,32 @@ import (
"sync"
"time"
"github.com/streadway/amqp"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal/tls"
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/streadway/amqp"
)
const (
defaultMaxUndeliveredMessages = 1000
)
type empty struct{}
type semaphore chan empty
// AMQPConsumer is the top level struct for this plugin
type AMQPConsumer struct {
URL string `toml:"url"` // deprecated in 1.7; use brokers
Brokers []string `toml:"brokers"`
Username string `toml:"username"`
Password string `toml:"password"`
Exchange string `toml:"exchange"`
ExchangeType string `toml:"exchange_type"`
ExchangeDurability string `toml:"exchange_durability"`
ExchangePassive bool `toml:"exchange_passive"`
ExchangeArguments map[string]string `toml:"exchange_arguments"`
URL string `toml:"url"` // deprecated in 1.7; use brokers
Brokers []string `toml:"brokers"`
Username string `toml:"username"`
Password string `toml:"password"`
Exchange string `toml:"exchange"`
ExchangeType string `toml:"exchange_type"`
ExchangeDurability string `toml:"exchange_durability"`
ExchangePassive bool `toml:"exchange_passive"`
ExchangeArguments map[string]string `toml:"exchange_arguments"`
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
// Queue Name
Queue string `toml:"queue"`
@ -44,9 +52,12 @@ type AMQPConsumer struct {
AuthMethod string
tls.ClientConfig
deliveries map[telegraf.TrackingID]amqp.Delivery
parser parsers.Parser
conn *amqp.Connection
wg *sync.WaitGroup
cancel context.CancelFunc
}
type externalAuth struct{}
@ -114,6 +125,16 @@ func (a *AMQPConsumer) SampleConfig() string {
## Maximum number of messages server should give to the worker.
# prefetch_count = 50
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Auth method. PLAIN and EXTERNAL are supported
## Using EXTERNAL requires enabling the rabbitmq_auth_mechanism_ssl plugin as
## described here: https://www.rabbitmq.com/plugins.html
@ -185,9 +206,15 @@ func (a *AMQPConsumer) Start(acc telegraf.Accumulator) error {
return err
}
ctx, cancel := context.WithCancel(context.Background())
a.cancel = cancel
a.wg = &sync.WaitGroup{}
a.wg.Add(1)
go a.process(msgs, acc)
go func() {
defer a.wg.Done()
a.process(ctx, msgs, acc)
}()
go func() {
for {
@ -196,7 +223,7 @@ func (a *AMQPConsumer) Start(acc telegraf.Accumulator) error {
break
}
log.Printf("I! AMQP consumer connection closed: %s; trying to reconnect", err)
log.Printf("I! [inputs.amqp_consumer] connection closed: %s; trying to reconnect", err)
for {
msgs, err := a.connect(amqpConf)
if err != nil {
@ -206,7 +233,10 @@ func (a *AMQPConsumer) Start(acc telegraf.Accumulator) error {
}
a.wg.Add(1)
go a.process(msgs, acc)
go func() {
defer a.wg.Done()
a.process(ctx, msgs, acc)
}()
break
}
}
@ -224,14 +254,14 @@ func (a *AMQPConsumer) connect(amqpConf *amqp.Config) (<-chan amqp.Delivery, err
p := rand.Perm(len(brokers))
for _, n := range p {
broker := brokers[n]
log.Printf("D! [amqp_consumer] connecting to %q", broker)
log.Printf("D! [inputs.amqp_consumer] connecting to %q", broker)
conn, err := amqp.DialConfig(broker, *amqpConf)
if err == nil {
a.conn = conn
log.Printf("D! [amqp_consumer] connected to %q", broker)
log.Printf("D! [inputs.amqp_consumer] connected to %q", broker)
break
}
log.Printf("D! [amqp_consumer] error connecting to %q", broker)
log.Printf("D! [inputs.amqp_consumer] error connecting to %q", broker)
}
if a.conn == nil {
@ -320,7 +350,6 @@ func (a *AMQPConsumer) connect(amqpConf *amqp.Config) (<-chan amqp.Delivery, err
return nil, fmt.Errorf("Failed establishing connection to queue: %s", err)
}
log.Println("I! Started AMQP consumer")
return msgs, err
}
@ -361,42 +390,101 @@ func declareExchange(
}
// Read messages from queue and add them to the Accumulator
func (a *AMQPConsumer) process(msgs <-chan amqp.Delivery, acc telegraf.Accumulator) {
defer a.wg.Done()
for d := range msgs {
metrics, err := a.parser.Parse(d.Body)
if err != nil {
log.Printf("E! %v: error parsing metric - %v", err, string(d.Body))
} else {
for _, m := range metrics {
acc.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
func (a *AMQPConsumer) process(ctx context.Context, msgs <-chan amqp.Delivery, ac telegraf.Accumulator) {
a.deliveries = make(map[telegraf.TrackingID]amqp.Delivery)
acc := ac.WithTracking(a.MaxUndeliveredMessages)
sem := make(semaphore, a.MaxUndeliveredMessages)
for {
select {
case <-ctx.Done():
return
case track := <-acc.Delivered():
if a.onDelivery(track) {
<-sem
}
case sem <- empty{}:
select {
case <-ctx.Done():
return
case track := <-acc.Delivered():
if a.onDelivery(track) {
<-sem
<-sem
}
case d, ok := <-msgs:
if !ok {
return
}
err := a.onMessage(acc, d)
if err != nil {
acc.AddError(err)
<-sem
}
}
}
d.Ack(false)
}
log.Printf("I! AMQP consumer queue closed")
}
func (a *AMQPConsumer) onMessage(acc telegraf.TrackingAccumulator, d amqp.Delivery) error {
metrics, err := a.parser.Parse(d.Body)
if err != nil {
return err
}
id := acc.AddTrackingMetricGroup(metrics)
a.deliveries[id] = d
return nil
}
func (a *AMQPConsumer) onDelivery(track telegraf.DeliveryInfo) bool {
delivery, ok := a.deliveries[track.ID()]
if !ok {
// Added by a previous connection
return false
}
if track.Delivered() {
err := delivery.Ack(false)
if err != nil {
log.Printf("E! [inputs.amqp_consumer] Unable to ack written delivery: %d: %v",
delivery.DeliveryTag, err)
a.conn.Close()
}
} else {
err := delivery.Reject(false)
if err != nil {
log.Printf("E! [inputs.amqp_consumer] Unable to reject failed delivery: %d: %v",
delivery.DeliveryTag, err)
a.conn.Close()
}
}
delete(a.deliveries, track.ID())
return true
}
func (a *AMQPConsumer) Stop() {
a.cancel()
a.wg.Wait()
err := a.conn.Close()
if err != nil && err != amqp.ErrClosed {
log.Printf("E! Error closing AMQP connection: %s", err)
log.Printf("E! [inputs.amqp_consumer] Error closing AMQP connection: %s", err)
return
}
a.wg.Wait()
log.Println("I! Stopped AMQP service")
}
func init() {
inputs.Add("amqp_consumer", func() telegraf.Input {
return &AMQPConsumer{
URL: DefaultBroker,
AuthMethod: DefaultAuthMethod,
ExchangeType: DefaultExchangeType,
ExchangeDurability: DefaultExchangeDurability,
QueueDurability: DefaultQueueDurability,
PrefetchCount: DefaultPrefetchCount,
URL: DefaultBroker,
AuthMethod: DefaultAuthMethod,
ExchangeType: DefaultExchangeType,
ExchangeDurability: DefaultExchangeDurability,
QueueDurability: DefaultQueueDurability,
PrefetchCount: DefaultPrefetchCount,
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
}
})
}
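The nested select in `process` is the core back-pressure mechanism, and it reappears with small variations in the kafka_consumer, mqtt_consumer, nats_consumer, and nsq_consumer changes below. The stand-alone sketch that follows strips away AMQP entirely; the channel types and names are illustrative only, but the select structure mirrors the plugin code: a semaphore slot is reserved before a message is read, and a slot is released for every delivery notification.

```go
package main

import (
	"context"
	"fmt"
)

type empty struct{}
type semaphore chan empty

// process is an illustrative stand-in for the consumer loops above: at most
// cap(sem) messages may be read but not yet delivered to an output.
func process(ctx context.Context, msgs <-chan string, delivered <-chan int) {
	sem := make(semaphore, 4)
	for {
		select {
		case <-ctx.Done():
			return
		case id := <-delivered:
			fmt.Println("delivered", id)
			<-sem // release the slot held by the delivered message
		case sem <- empty{}: // reserve a slot before reading a new message
			select {
			case <-ctx.Done():
				return
			case id := <-delivered:
				fmt.Println("delivered", id)
				<-sem // once for the delivered message
				<-sem // once for the slot reserved to enter this case
			case m, ok := <-msgs:
				if !ok {
					return
				}
				// In the real plugins the message is parsed and handed to
				// acc.AddTrackingMetricGroup here; the slot stays reserved
				// until the matching delivery notification arrives.
				fmt.Println("read", m)
			}
		}
	}
}

func main() {}
```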


@ -18,52 +18,54 @@ plugin.
memstats are taken from the Go runtime: https://golang.org/pkg/runtime/#MemStats
- internal\_memstats
- alloc\_bytes
- internal_memstats
- alloc_bytes
- frees
- heap\_alloc\_bytes
- heap\_idle\_bytes
- heap\_in\_use\_bytes
- heap\_objects\_bytes
- heap\_released\_bytes
- heap\_sys\_bytes
- heap_alloc_bytes
- heap_idle_bytes
- heap_in_use_bytes
- heap_objects_bytes
- heap_released_bytes
- heap_sys_bytes
- mallocs
- num\_gc
- pointer\_lookups
- sys\_bytes
- total\_alloc\_bytes
- num_gc
- pointer_lookups
- sys_bytes
- total_alloc_bytes
agent stats collect aggregate stats on all telegraf plugins.
- internal\_agent
- gather\_errors
- metrics\_dropped
- metrics\_gathered
- metrics\_written
- internal_agent
- gather_errors
- metrics_dropped
- metrics_gathered
- metrics_written
internal\_gather stats collect aggregate stats on all input plugins
internal_gather stats collect aggregate stats on all input plugins
that are of the same input type. They are tagged with `input=<plugin_name>`.
- internal\_gather
- gather\_time\_ns
- metrics\_gathered
- internal_gather
- gather_time_ns
- metrics_gathered
internal\_write stats collect aggregate stats on all output plugins
internal_write stats collect aggregate stats on all output plugins
that are of the same output type. They are tagged with `output=<plugin_name>`.
- internal\_write
- buffer\_limit
- buffer\_size
- metrics\_written
- metrics\_filtered
- write\_time\_ns
- internal_write
- buffer_limit
- buffer_size
- metrics_added
- metrics_written
- metrics_dropped
- metrics_filtered
- write_time_ns
internal\_\<plugin\_name\> are metrics which are defined on a per-plugin basis, and
internal_<plugin_name> are metrics which are defined on a per-plugin basis, and
usually contain tags which differentiate each instance of a particular type of
plugin.
- internal\_\<plugin\_name\>
- internal_<plugin_name>
- individual plugin-specific fields, such as requests counts.
### Tags:
@ -76,7 +78,7 @@ to each particular plugin.
```
internal_memstats,host=tyrion alloc_bytes=4457408i,sys_bytes=10590456i,pointer_lookups=7i,mallocs=17642i,frees=7473i,heap_sys_bytes=6848512i,heap_idle_bytes=1368064i,heap_in_use_bytes=5480448i,heap_released_bytes=0i,total_alloc_bytes=6875560i,heap_alloc_bytes=4457408i,heap_objects_bytes=10169i,num_gc=2i 1480682800000000000
internal_agent,host=tyrion metrics_written=18i,metrics_dropped=0i,metrics_gathered=19i,gather_errors=0i 1480682800000000000
internal_write,output=file,host=tyrion buffer_limit=10000i,write_time_ns=636609i,metrics_written=18i,buffer_size=0i 1480682800000000000
internal_write,output=file,host=tyrion buffer_limit=10000i,write_time_ns=636609i,metrics_added=18i,metrics_written=18i,buffer_size=0i 1480682800000000000
internal_gather,input=internal,host=tyrion metrics_gathered=19i,gather_time_ns=442114i 1480682800000000000
internal_gather,input=http_listener,host=tyrion metrics_gathered=0i,gather_time_ns=167285i 1480682800000000000
internal_http_listener,address=:8186,host=tyrion queries_received=0i,writes_received=0i,requests_received=0i,buffers_created=0i,requests_served=0i,pings_received=0i,bytes_received=0i,not_founds_served=0i,pings_served=0i,queries_served=0i,writes_served=0i 1480682800000000000


@ -1,18 +1,14 @@
# Kafka Consumer Input Plugin
The [Kafka](http://kafka.apache.org/) consumer plugin polls a specified Kafka
topic and adds messages to InfluxDB. The plugin assumes messages follow the
line protocol. [Consumer Group](http://godoc.org/github.com/wvanbergen/kafka/consumergroup)
is used to talk to the Kafka cluster so multiple instances of telegraf can read
from the same topic in parallel.
The [Kafka][kafka] consumer plugin reads from Kafka
and creates metrics using one of the supported [input data formats][].
For old kafka version (< 0.8), please use the kafka_consumer_legacy input plugin
For old kafka version (< 0.8), please use the [kafka_consumer_legacy][] input plugin
and use the old zookeeper connection method.
## Configuration
### Configuration
```toml
# Read metrics from Kafka topic(s)
[[inputs.kafka_consumer]]
## kafka servers
brokers = ["localhost:9092"]
@ -44,18 +40,27 @@ and use the old zookeeper connection method.
## Offset (must be either "oldest" or "newest")
offset = "oldest"
## Maximum length of a message to consume, in bytes (default 0/unlimited);
## larger messages are dropped
max_message_len = 1000000
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
## Maximum length of a message to consume, in bytes (default 0/unlimited);
## larger messages are dropped
max_message_len = 1000000
```
## Testing
Running integration tests requires running Zookeeper & Kafka. See Makefile
for kafka container command.
[kafka]: https://kafka.apache.org
[kafka_consumer_legacy]: /plugins/inputs/kafka_consumer_legacy/README.md
[input data formats]: /docs/DATA_FORMATS_INPUT.md


@ -1,55 +1,54 @@
package kafka_consumer
import (
"context"
"fmt"
"log"
"strings"
"sync"
"github.com/Shopify/sarama"
cluster "github.com/bsm/sarama-cluster"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal/tls"
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/Shopify/sarama"
cluster "github.com/bsm/sarama-cluster"
)
const (
defaultMaxUndeliveredMessages = 1000
)
type empty struct{}
type semaphore chan empty
type Consumer interface {
Errors() <-chan error
Messages() <-chan *sarama.ConsumerMessage
MarkOffset(msg *sarama.ConsumerMessage, metadata string)
Close() error
}
type Kafka struct {
ConsumerGroup string
ClientID string `toml:"client_id"`
Topics []string
Brokers []string
MaxMessageLen int
Version string `toml:"version"`
Cluster *cluster.Consumer
ConsumerGroup string `toml:"consumer_group"`
ClientID string `toml:"client_id"`
Topics []string `toml:"topics"`
Brokers []string `toml:"brokers"`
MaxMessageLen int `toml:"max_message_len"`
Version string `toml:"version"`
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
Offset string `toml:"offset"`
SASLUsername string `toml:"sasl_username"`
SASLPassword string `toml:"sasl_password"`
tls.ClientConfig
// SASL Username
SASLUsername string `toml:"sasl_username"`
// SASL Password
SASLPassword string `toml:"sasl_password"`
cluster Consumer
parser parsers.Parser
wg *sync.WaitGroup
cancel context.CancelFunc
// Legacy metric buffer support
MetricBuffer int
// TODO remove PointBuffer, legacy support
PointBuffer int
Offset string
parser parsers.Parser
sync.Mutex
// channel for all incoming kafka messages
in <-chan *sarama.ConsumerMessage
// channel for all kafka consumer errors
errs <-chan error
done chan struct{}
// keep the accumulator internally:
acc telegraf.Accumulator
// Unconfirmed messages
messages map[telegraf.TrackingID]*sarama.ConsumerMessage
// doNotCommitMsgs tells the parser not to call CommitUpTo on the consumer
// this is mostly for test purposes, but there may be a use-case for it later.
@ -86,16 +85,25 @@ var sampleConfig = `
consumer_group = "telegraf_metrics_consumers"
## Offset (must be either "oldest" or "newest")
offset = "oldest"
## Maximum length of a message to consume, in bytes (default 0/unlimited);
## larger messages are dropped
max_message_len = 1000000
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
## Maximum length of a message to consume, in bytes (default 0/unlimited);
## larger messages are dropped
max_message_len = 1000000
`
func (k *Kafka) SampleConfig() string {
@ -111,12 +119,8 @@ func (k *Kafka) SetParser(parser parsers.Parser) {
}
func (k *Kafka) Start(acc telegraf.Accumulator) error {
k.Lock()
defer k.Unlock()
var clusterErr error
k.acc = acc
config := cluster.NewConfig()
if k.Version != "" {
@ -159,13 +163,13 @@ func (k *Kafka) Start(acc telegraf.Accumulator) error {
case "newest":
config.Consumer.Offsets.Initial = sarama.OffsetNewest
default:
log.Printf("I! WARNING: Kafka consumer invalid offset '%s', using 'oldest'\n",
log.Printf("I! WARNING: Kafka consumer invalid offset '%s', using 'oldest'",
k.Offset)
config.Consumer.Offsets.Initial = sarama.OffsetOldest
}
if k.Cluster == nil {
k.Cluster, clusterErr = cluster.NewConsumer(
if k.cluster == nil {
k.cluster, clusterErr = cluster.NewConsumer(
k.Brokers,
k.ConsumerGroup,
k.Topics,
@ -173,67 +177,110 @@ func (k *Kafka) Start(acc telegraf.Accumulator) error {
)
if clusterErr != nil {
log.Printf("E! Error when creating Kafka Consumer, brokers: %v, topics: %v\n",
log.Printf("E! Error when creating Kafka Consumer, brokers: %v, topics: %v",
k.Brokers, k.Topics)
return clusterErr
}
// Setup message and error channels
k.in = k.Cluster.Messages()
k.errs = k.Cluster.Errors()
}
k.done = make(chan struct{})
// Start the kafka message reader
go k.receiver()
log.Printf("I! Started the kafka consumer service, brokers: %v, topics: %v\n",
ctx, cancel := context.WithCancel(context.Background())
k.cancel = cancel
// Start consumer goroutine
k.wg = &sync.WaitGroup{}
k.wg.Add(1)
go func() {
defer k.wg.Done()
k.receiver(ctx, acc)
}()
log.Printf("I! Started the kafka consumer service, brokers: %v, topics: %v",
k.Brokers, k.Topics)
return nil
}
// receiver() reads all incoming messages from the consumer, and parses them into
// influxdb metric points.
func (k *Kafka) receiver() {
func (k *Kafka) receiver(ctx context.Context, ac telegraf.Accumulator) {
k.messages = make(map[telegraf.TrackingID]*sarama.ConsumerMessage)
acc := ac.WithTracking(k.MaxUndeliveredMessages)
sem := make(semaphore, k.MaxUndeliveredMessages)
for {
select {
case <-k.done:
case <-ctx.Done():
return
case err := <-k.errs:
if err != nil {
k.acc.AddError(fmt.Errorf("Consumer Error: %s\n", err))
}
case msg := <-k.in:
if k.MaxMessageLen != 0 && len(msg.Value) > k.MaxMessageLen {
k.acc.AddError(fmt.Errorf("Message longer than max_message_len (%d > %d)",
len(msg.Value), k.MaxMessageLen))
} else {
metrics, err := k.parser.Parse(msg.Value)
case track := <-acc.Delivered():
<-sem
k.onDelivery(track)
case err := <-k.cluster.Errors():
acc.AddError(err)
case sem <- empty{}:
select {
case <-ctx.Done():
return
case track := <-acc.Delivered():
// Once for the delivered message, once to leave the case
<-sem
<-sem
k.onDelivery(track)
case err := <-k.cluster.Errors():
<-sem
acc.AddError(err)
case msg := <-k.cluster.Messages():
err := k.onMessage(acc, msg)
if err != nil {
k.acc.AddError(fmt.Errorf("Message Parse Error\nmessage: %s\nerror: %s",
string(msg.Value), err.Error()))
acc.AddError(err)
<-sem
}
for _, metric := range metrics {
k.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time())
}
}
if !k.doNotCommitMsgs {
// TODO(cam) this locking can be removed if this PR gets merged:
// https://github.com/wvanbergen/kafka/pull/84
k.Lock()
k.Cluster.MarkOffset(msg, "")
k.Unlock()
}
}
}
}
func (k *Kafka) markOffset(msg *sarama.ConsumerMessage) {
if !k.doNotCommitMsgs {
k.cluster.MarkOffset(msg, "")
}
}
func (k *Kafka) onMessage(acc telegraf.TrackingAccumulator, msg *sarama.ConsumerMessage) error {
if k.MaxMessageLen != 0 && len(msg.Value) > k.MaxMessageLen {
k.markOffset(msg)
return fmt.Errorf("Message longer than max_message_len (%d > %d)",
len(msg.Value), k.MaxMessageLen)
}
metrics, err := k.parser.Parse(msg.Value)
if err != nil {
return err
}
id := acc.AddTrackingMetricGroup(metrics)
k.messages[id] = msg
return nil
}
func (k *Kafka) onDelivery(track telegraf.DeliveryInfo) {
msg, ok := k.messages[track.ID()]
if !ok {
log.Printf("E! [inputs.kafka_consumer] Could not mark message delivered: %d", track.ID())
}
if track.Delivered() {
k.markOffset(msg)
}
delete(k.messages, track.ID())
}
func (k *Kafka) Stop() {
k.Lock()
defer k.Unlock()
close(k.done)
if err := k.Cluster.Close(); err != nil {
k.acc.AddError(fmt.Errorf("Error closing consumer: %s\n", err.Error()))
k.cancel()
k.wg.Wait()
if err := k.cluster.Close(); err != nil {
log.Printf("E! [inputs.kafka_consumer] Error closing consumer: %v", err)
}
}
@ -243,6 +290,8 @@ func (k *Kafka) Gather(acc telegraf.Accumulator) error {
func init() {
inputs.Add("kafka_consumer", func() telegraf.Input {
return &Kafka{}
return &Kafka{
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
}
})
}


@ -38,7 +38,6 @@ func TestReadsMetricsFromKafka(t *testing.T) {
ConsumerGroup: "telegraf_test_consumers",
Topics: []string{testTopic},
Brokers: brokerPeers,
PointBuffer: 100000,
Offset: "oldest",
}
p, _ := parsers.NewInfluxParser()


@ -1,13 +1,14 @@
package kafka_consumer
import (
"context"
"strings"
"testing"
"github.com/Shopify/sarama"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/influxdata/telegraf/testutil"
"github.com/Shopify/sarama"
"github.com/stretchr/testify/assert"
)
@ -18,31 +19,57 @@ const (
invalidMsg = "cpu_load_short,host=server01 1422568543702900257\n"
)
func newTestKafka() (*Kafka, chan *sarama.ConsumerMessage) {
in := make(chan *sarama.ConsumerMessage, 1000)
k := Kafka{
ConsumerGroup: "test",
Topics: []string{"telegraf"},
Brokers: []string{"localhost:9092"},
Offset: "oldest",
in: in,
doNotCommitMsgs: true,
errs: make(chan error, 1000),
done: make(chan struct{}),
type TestConsumer struct {
errors chan error
messages chan *sarama.ConsumerMessage
}
func (c *TestConsumer) Errors() <-chan error {
return c.errors
}
func (c *TestConsumer) Messages() <-chan *sarama.ConsumerMessage {
return c.messages
}
func (c *TestConsumer) MarkOffset(msg *sarama.ConsumerMessage, metadata string) {
}
func (c *TestConsumer) Close() error {
return nil
}
func (c *TestConsumer) Inject(msg *sarama.ConsumerMessage) {
c.messages <- msg
}
func newTestKafka() (*Kafka, *TestConsumer) {
consumer := &TestConsumer{
errors: make(chan error),
messages: make(chan *sarama.ConsumerMessage, 1000),
}
return &k, in
k := Kafka{
cluster: consumer,
ConsumerGroup: "test",
Topics: []string{"telegraf"},
Brokers: []string{"localhost:9092"},
Offset: "oldest",
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
doNotCommitMsgs: true,
messages: make(map[telegraf.TrackingID]*sarama.ConsumerMessage),
}
return &k, consumer
}
// Test that the parser parses kafka messages into points
func TestRunParser(t *testing.T) {
k, in := newTestKafka()
k, consumer := newTestKafka()
acc := testutil.Accumulator{}
k.acc = &acc
defer close(k.done)
ctx := context.Background()
k.parser, _ = parsers.NewInfluxParser()
go k.receiver()
in <- saramaMsg(testMsg)
go k.receiver(ctx, &acc)
consumer.Inject(saramaMsg(testMsg))
acc.Wait(1)
assert.Equal(t, acc.NFields(), 1)
@ -50,14 +77,13 @@ func TestRunParser(t *testing.T) {
// Test that the parser ignores invalid messages
func TestRunParserInvalidMsg(t *testing.T) {
k, in := newTestKafka()
k, consumer := newTestKafka()
acc := testutil.Accumulator{}
k.acc = &acc
defer close(k.done)
ctx := context.Background()
k.parser, _ = parsers.NewInfluxParser()
go k.receiver()
in <- saramaMsg(invalidMsg)
go k.receiver(ctx, &acc)
consumer.Inject(saramaMsg(invalidMsg))
acc.WaitError(1)
assert.Equal(t, acc.NFields(), 0)
@ -66,15 +92,14 @@ func TestRunParserInvalidMsg(t *testing.T) {
// Test that overlong messages are dropped
func TestDropOverlongMsg(t *testing.T) {
const maxMessageLen = 64 * 1024
k, in := newTestKafka()
k, consumer := newTestKafka()
k.MaxMessageLen = maxMessageLen
acc := testutil.Accumulator{}
k.acc = &acc
defer close(k.done)
ctx := context.Background()
overlongMsg := strings.Repeat("v", maxMessageLen+1)
go k.receiver()
in <- saramaMsg(overlongMsg)
go k.receiver(ctx, &acc)
consumer.Inject(saramaMsg(overlongMsg))
acc.WaitError(1)
assert.Equal(t, acc.NFields(), 0)
@ -82,14 +107,13 @@ func TestDropOverlongMsg(t *testing.T) {
// Test that the parser parses kafka messages into points
func TestRunParserAndGather(t *testing.T) {
k, in := newTestKafka()
k, consumer := newTestKafka()
acc := testutil.Accumulator{}
k.acc = &acc
defer close(k.done)
ctx := context.Background()
k.parser, _ = parsers.NewInfluxParser()
go k.receiver()
in <- saramaMsg(testMsg)
go k.receiver(ctx, &acc)
consumer.Inject(saramaMsg(testMsg))
acc.Wait(1)
acc.GatherError(k.Gather)
@ -101,14 +125,13 @@ func TestRunParserAndGather(t *testing.T) {
// Test that the parser parses kafka messages into points
func TestRunParserAndGatherGraphite(t *testing.T) {
k, in := newTestKafka()
k, consumer := newTestKafka()
acc := testutil.Accumulator{}
k.acc = &acc
defer close(k.done)
ctx := context.Background()
k.parser, _ = parsers.NewGraphiteParser("_", []string{}, nil)
go k.receiver()
in <- saramaMsg(testMsgGraphite)
go k.receiver(ctx, &acc)
consumer.Inject(saramaMsg(testMsgGraphite))
acc.Wait(1)
acc.GatherError(k.Gather)
@ -120,17 +143,16 @@ func TestRunParserAndGatherGraphite(t *testing.T) {
// Test that the parser parses kafka messages into points
func TestRunParserAndGatherJSON(t *testing.T) {
k, in := newTestKafka()
k, consumer := newTestKafka()
acc := testutil.Accumulator{}
k.acc = &acc
defer close(k.done)
ctx := context.Background()
k.parser, _ = parsers.NewParser(&parsers.Config{
DataFormat: "json",
MetricName: "kafka_json_test",
})
go k.receiver()
in <- saramaMsg(testMsgJSON)
go k.receiver(ctx, &acc)
consumer.Inject(saramaMsg(testMsgJSON))
acc.Wait(1)
acc.GatherError(k.Gather)


@ -1,14 +1,11 @@
# MQTT Consumer Input Plugin
The [MQTT](http://mqtt.org/) consumer plugin reads from
specified MQTT topics and adds messages to InfluxDB.
The plugin expects messages in the
[Telegraf Input Data Formats](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md).
The [MQTT][mqtt] consumer plugin reads from the specified MQTT topics
and creates metrics using one of the supported [input data formats][].
### Configuration:
```toml
# Read metrics from MQTT topic(s)
[[inputs.mqtt_consumer]]
## MQTT broker URLs to be used. The format should be scheme://host:port,
## schema can be tcp, ssl, or ws.
@ -26,6 +23,16 @@ The plugin expects messages in the
## Connection timeout for initial connection in seconds
connection_timeout = "30s"
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Topics to subscribe to
topics = [
"telegraf/host01/cpu",
@ -62,3 +69,6 @@ The plugin expects messages in the
- All measurements are tagged with the incoming topic, ie
`topic=telegraf/host01/cpu`
[mqtt]: https://mqtt.org
[input data formats]: /docs/DATA_FORMATS_INPUT.md


@ -1,25 +1,31 @@
package mqtt_consumer
import (
"context"
"errors"
"fmt"
"log"
"strings"
"time"
"github.com/eclipse/paho.mqtt.golang"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/internal/tls"
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/eclipse/paho.mqtt.golang"
)
// 30 Seconds is the default used by paho.mqtt.golang
var defaultConnectionTimeout = internal.Duration{Duration: 30 * time.Second}
var (
// 30 Seconds is the default used by paho.mqtt.golang
defaultConnectionTimeout = internal.Duration{Duration: 30 * time.Second}
defaultMaxUndeliveredMessages = 1000
)
type ConnectionState int
type empty struct{}
type semaphore chan empty
const (
Disconnected ConnectionState = iota
@ -28,12 +34,13 @@ const (
)
type MQTTConsumer struct {
Servers []string
Topics []string
Username string
Password string
QoS int `toml:"qos"`
ConnectionTimeout internal.Duration `toml:"connection_timeout"`
Servers []string
Topics []string
Username string
Password string
QoS int `toml:"qos"`
ConnectionTimeout internal.Duration `toml:"connection_timeout"`
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
parser parsers.Parser
@ -45,9 +52,14 @@ type MQTTConsumer struct {
tls.ClientConfig
client mqtt.Client
acc telegraf.Accumulator
acc telegraf.TrackingAccumulator
state ConnectionState
subscribed bool
sem semaphore
messages map[telegraf.TrackingID]bool
ctx context.Context
cancel context.CancelFunc
}
var sampleConfig = `
@ -67,6 +79,16 @@ var sampleConfig = `
## Connection timeout for initial connection in seconds
connection_timeout = "30s"
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Topics to subscribe to
topics = [
"telegraf/host01/cpu",
@ -118,7 +140,6 @@ func (m *MQTTConsumer) Start(acc telegraf.Accumulator) error {
return errors.New("persistent_session requires client_id")
}
m.acc = acc
if m.QoS > 2 || m.QoS < 0 {
return fmt.Errorf("qos value must be 0, 1, or 2: %d", m.QoS)
}
@ -127,6 +148,9 @@ func (m *MQTTConsumer) Start(acc telegraf.Accumulator) error {
return fmt.Errorf("connection_timeout must be greater than 1s: %s", m.ConnectionTimeout.Duration)
}
m.acc = acc.WithTracking(m.MaxUndeliveredMessages)
m.ctx, m.cancel = context.WithCancel(context.Background())
opts, err := m.createOpts()
if err != nil {
return err
@ -146,8 +170,10 @@ func (m *MQTTConsumer) connect() error {
return err
}
log.Printf("I! [inputs.mqtt_consumer]: connected %v", m.Servers)
log.Printf("I! [inputs.mqtt_consumer] Connected %v", m.Servers)
m.state = Connected
m.sem = make(semaphore, m.MaxUndeliveredMessages)
m.messages = make(map[telegraf.TrackingID]bool)
// Only subscribe on first connection when using persistent sessions. On
// subsequent connections the subscriptions should be stored in the
@ -172,38 +198,64 @@ func (m *MQTTConsumer) connect() error {
func (m *MQTTConsumer) onConnectionLost(c mqtt.Client, err error) {
m.acc.AddError(fmt.Errorf("connection lost: %v", err))
log.Printf("D! [inputs.mqtt_consumer]: disconnected %v", m.Servers)
log.Printf("D! [inputs.mqtt_consumer] Disconnected %v", m.Servers)
m.state = Disconnected
return
}
func (m *MQTTConsumer) recvMessage(c mqtt.Client, msg mqtt.Message) {
topic := msg.Topic()
for {
select {
case track := <-m.acc.Delivered():
_, ok := m.messages[track.ID()]
if !ok {
// Added by a previous connection
continue
}
<-m.sem
// No ack, MQTT does not support durable handling
delete(m.messages, track.ID())
case m.sem <- empty{}:
err := m.onMessage(m.acc, msg)
if err != nil {
m.acc.AddError(err)
<-m.sem
}
return
}
}
}
func (m *MQTTConsumer) onMessage(acc telegraf.TrackingAccumulator, msg mqtt.Message) error {
metrics, err := m.parser.Parse(msg.Payload())
if err != nil {
m.acc.AddError(err)
return err
}
topic := msg.Topic()
for _, metric := range metrics {
tags := metric.Tags()
tags["topic"] = topic
m.acc.AddFields(metric.Name(), metric.Fields(), tags, metric.Time())
metric.AddTag("topic", topic)
}
id := acc.AddTrackingMetricGroup(metrics)
m.messages[id] = true
return nil
}
func (m *MQTTConsumer) Stop() {
if m.state == Connected {
log.Printf("D! [inputs.mqtt_consumer]: disconnecting %v", m.Servers)
log.Printf("D! [inputs.mqtt_consumer] Disconnecting %v", m.Servers)
m.client.Disconnect(200)
log.Printf("D! [inputs.mqtt_consumer]: disconnected %v", m.Servers)
log.Printf("D! [inputs.mqtt_consumer] Disconnected %v", m.Servers)
m.state = Disconnected
}
m.cancel()
}
func (m *MQTTConsumer) Gather(acc telegraf.Accumulator) error {
if m.state == Disconnected {
m.state = Connecting
log.Printf("D! [inputs.mqtt_consumer]: connecting %v", m.Servers)
log.Printf("D! [inputs.mqtt_consumer] Connecting %v", m.Servers)
m.connect()
}
@ -246,7 +298,7 @@ func (m *MQTTConsumer) createOpts() (*mqtt.ClientOptions, error) {
for _, server := range m.Servers {
// Preserve support for host:port style servers; deprecated in Telegraf 1.4.4
if !strings.Contains(server, "://") {
log.Printf("W! [inputs.mqtt_consumer] server %q should be updated to use `scheme://host:port` format", server)
log.Printf("W! [inputs.mqtt_consumer] Server %q should be updated to use `scheme://host:port` format", server)
if tlsCfg == nil {
server = "tcp://" + server
} else {
@ -267,8 +319,9 @@ func (m *MQTTConsumer) createOpts() (*mqtt.ClientOptions, error) {
func init() {
inputs.Add("mqtt_consumer", func() telegraf.Input {
return &MQTTConsumer{
ConnectionTimeout: defaultConnectionTimeout,
state: Disconnected,
ConnectionTimeout: defaultConnectionTimeout,
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
state: Disconnected,
}
})
}


@ -3,12 +3,9 @@ package mqtt_consumer
import (
"testing"
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/assert"
"github.com/eclipse/paho.mqtt.golang"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/assert"
)
const (
@ -71,47 +68,6 @@ func TestPersistentClientIDFail(t *testing.T) {
assert.Error(t, err)
}
func TestRunParser(t *testing.T) {
n := newTestMQTTConsumer()
acc := testutil.Accumulator{}
n.acc = &acc
n.parser, _ = parsers.NewInfluxParser()
n.recvMessage(nil, mqttMsg(testMsg))
if a := acc.NFields(); a != 1 {
t.Errorf("got %v, expected %v", a, 1)
}
}
// Test that the parser ignores invalid messages
func TestRunParserInvalidMsg(t *testing.T) {
n := newTestMQTTConsumer()
acc := testutil.Accumulator{}
n.acc = &acc
n.parser, _ = parsers.NewInfluxParser()
n.recvMessage(nil, mqttMsg(invalidMsg))
if a := acc.NFields(); a != 0 {
t.Errorf("got %v, expected %v", a, 0)
}
assert.Len(t, acc.Errors, 1)
}
// Test that the parser parses line format messages into metrics
func TestRunParserAndGather(t *testing.T) {
n := newTestMQTTConsumer()
acc := testutil.Accumulator{}
n.acc = &acc
n.parser, _ = parsers.NewInfluxParser()
n.recvMessage(nil, mqttMsg(testMsg))
acc.AssertContainsFields(t, "cpu_load_short",
map[string]interface{}{"value": float64(23422)})
}
func mqttMsg(val string) mqtt.Message {
return &message{
topic: "telegraf/unit_test",


@ -1,16 +1,14 @@
# NATS Consumer Input Plugin
The [NATS](http://www.nats.io/about/) consumer plugin reads from
specified NATS subjects and adds messages to InfluxDB. The plugin expects messages
in the [Telegraf Input Data Formats](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md).
A [Queue Group](http://www.nats.io/documentation/concepts/nats-queueing/)
is used when subscribing to subjects so multiple instances of telegraf can read
from a NATS cluster in parallel.
The [NATS][nats] consumer plugin reads from the specified NATS subjects and
creates metrics using one of the supported [input data formats][].
## Configuration
A [Queue Group][queue group] is used when subscribing to subjects so multiple
instances of telegraf can read from a NATS cluster in parallel.
### Configuration:
```toml
# Read metrics from NATS subject(s)
[[inputs.nats_consumer]]
## urls of NATS servers
servers = ["nats://localhost:4222"]
@ -20,13 +18,29 @@ from a NATS cluster in parallel.
subjects = ["telegraf"]
## name a queue group
queue_group = "telegraf_consumers"
## Maximum number of metrics to buffer between collection intervals
metric_buffer = 100000
## Data format to consume.
## Sets the limits for pending msgs and bytes for each subscription
## These shouldn't need to be adjusted except in very high throughput scenarios
# pending_message_limit = 65536
# pending_bytes_limit = 67108864
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
```
[nats]: https://www.nats.io/about/
[input data formats]: /docs/DATA_FORMATS_INPUT.md
[queue group]: https://www.nats.io/documentation/concepts/nats-queueing/


@ -1,6 +1,7 @@
package natsconsumer
import (
"context"
"fmt"
"log"
"sync"
@ -11,6 +12,13 @@ import (
nats "github.com/nats-io/go-nats"
)
var (
defaultMaxUndeliveredMessages = 1000
)
type empty struct{}
type semaphore chan empty
type natsError struct {
conn *nats.Conn
sub *nats.Subscription
@ -23,48 +31,58 @@ func (e natsError) Error() string {
}
type natsConsumer struct {
QueueGroup string
Subjects []string
Servers []string
Secure bool
QueueGroup string `toml:"queue_group"`
Subjects []string `toml:"subjects"`
Servers []string `toml:"servers"`
Secure bool `toml:"secure"`
// Client pending limits:
PendingMessageLimit int
PendingBytesLimit int
PendingMessageLimit int `toml:"pending_message_limit"`
PendingBytesLimit int `toml:"pending_bytes_limit"`
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
// Legacy metric buffer support; deprecated in v0.10.3
MetricBuffer int
conn *nats.Conn
subs []*nats.Subscription
parser parsers.Parser
sync.Mutex
wg sync.WaitGroup
Conn *nats.Conn
Subs []*nats.Subscription
// channel for all incoming NATS messages
in chan *nats.Msg
// channel for all NATS read errors
errs chan error
done chan struct{}
acc telegraf.Accumulator
errs chan error
acc telegraf.TrackingAccumulator
wg sync.WaitGroup
cancel context.CancelFunc
}
var sampleConfig = `
## urls of NATS servers
# servers = ["nats://localhost:4222"]
servers = ["nats://localhost:4222"]
## Use Transport Layer Security
# secure = false
secure = false
## subject(s) to consume
# subjects = ["telegraf"]
subjects = ["telegraf"]
## name a queue group
# queue_group = "telegraf_consumers"
queue_group = "telegraf_consumers"
## Sets the limits for pending msgs and bytes for each subscription
## These shouldn't need to be adjusted except in very high throughput scenarios
# pending_message_limit = 65536
# pending_bytes_limit = 67108864
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
@ -94,10 +112,7 @@ func (n *natsConsumer) natsErrHandler(c *nats.Conn, s *nats.Subscription, e erro
// Start the nats consumer. Caller must call *natsConsumer.Stop() to clean up.
func (n *natsConsumer) Start(acc telegraf.Accumulator) error {
n.Lock()
defer n.Unlock()
n.acc = acc
n.acc = acc.WithTracking(n.MaxUndeliveredMessages)
var connectErr error
@ -112,89 +127,106 @@ func (n *natsConsumer) Start(acc telegraf.Accumulator) error {
opts.Secure = n.Secure
if n.Conn == nil || n.Conn.IsClosed() {
n.Conn, connectErr = opts.Connect()
if n.conn == nil || n.conn.IsClosed() {
n.conn, connectErr = opts.Connect()
if connectErr != nil {
return connectErr
}
// Setup message and error channels
n.errs = make(chan error)
n.Conn.SetErrorHandler(n.natsErrHandler)
n.conn.SetErrorHandler(n.natsErrHandler)
n.in = make(chan *nats.Msg, 1000)
for _, subj := range n.Subjects {
sub, err := n.Conn.QueueSubscribe(subj, n.QueueGroup, func(m *nats.Msg) {
sub, err := n.conn.QueueSubscribe(subj, n.QueueGroup, func(m *nats.Msg) {
n.in <- m
})
if err != nil {
return err
}
// ensure that the subscription has been processed by the server
if err = n.Conn.Flush(); err != nil {
if err = n.conn.Flush(); err != nil {
return err
}
// set the subscription pending limits
if err = sub.SetPendingLimits(n.PendingMessageLimit, n.PendingBytesLimit); err != nil {
return err
}
n.Subs = append(n.Subs, sub)
n.subs = append(n.subs, sub)
}
}
n.done = make(chan struct{})
ctx, cancel := context.WithCancel(context.Background())
n.cancel = cancel
// Start the message reader
n.wg.Add(1)
go n.receiver()
go func() {
defer n.wg.Done()
go n.receiver(ctx)
}()
log.Printf("I! Started the NATS consumer service, nats: %v, subjects: %v, queue: %v\n",
n.Conn.ConnectedUrl(), n.Subjects, n.QueueGroup)
n.conn.ConnectedUrl(), n.Subjects, n.QueueGroup)
return nil
}
// receiver() reads all incoming messages from NATS, and parses them into
// telegraf metrics.
func (n *natsConsumer) receiver() {
defer n.wg.Done()
func (n *natsConsumer) receiver(ctx context.Context) {
sem := make(semaphore, n.MaxUndeliveredMessages)
for {
select {
case <-n.done:
case <-ctx.Done():
return
case <-n.acc.Delivered():
<-sem
case err := <-n.errs:
n.acc.AddError(fmt.Errorf("E! error reading from %s\n", err.Error()))
case msg := <-n.in:
metrics, err := n.parser.Parse(msg.Data)
if err != nil {
n.acc.AddError(fmt.Errorf("E! subject: %s, error: %s", msg.Subject, err.Error()))
}
n.acc.AddError(err)
case sem <- empty{}:
select {
case <-ctx.Done():
return
case err := <-n.errs:
<-sem
n.acc.AddError(err)
case <-n.acc.Delivered():
<-sem
<-sem
case msg := <-n.in:
metrics, err := n.parser.Parse(msg.Data)
if err != nil {
n.acc.AddError(fmt.Errorf("subject: %s, error: %s", msg.Subject, err.Error()))
<-sem
continue
}
for _, metric := range metrics {
n.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time())
n.acc.AddTrackingMetricGroup(metrics)
}
}
}
}
func (n *natsConsumer) clean() {
for _, sub := range n.Subs {
for _, sub := range n.subs {
if err := sub.Unsubscribe(); err != nil {
n.acc.AddError(fmt.Errorf("E! Error unsubscribing from subject %s in queue %s: %s\n",
n.acc.AddError(fmt.Errorf("Error unsubscribing from subject %s in queue %s: %s\n",
sub.Subject, sub.Queue, err.Error()))
}
}
if n.Conn != nil && !n.Conn.IsClosed() {
n.Conn.Close()
if n.conn != nil && !n.conn.IsClosed() {
n.conn.Close()
}
}
func (n *natsConsumer) Stop() {
n.Lock()
close(n.done)
n.cancel()
n.wg.Wait()
n.clean()
n.Unlock()
}
func (n *natsConsumer) Gather(acc telegraf.Accumulator) error {
@ -204,12 +236,13 @@ func (n *natsConsumer) Gather(acc telegraf.Accumulator) error {
func init() {
inputs.Add("nats_consumer", func() telegraf.Input {
return &natsConsumer{
Servers: []string{"nats://localhost:4222"},
Secure: false,
Subjects: []string{"telegraf"},
QueueGroup: "telegraf_consumers",
PendingBytesLimit: nats.DefaultSubPendingBytesLimit,
PendingMessageLimit: nats.DefaultSubPendingMsgsLimit,
Servers: []string{"nats://localhost:4222"},
Secure: false,
Subjects: []string{"telegraf"},
QueueGroup: "telegraf_consumers",
PendingBytesLimit: nats.DefaultSubPendingBytesLimit,
PendingMessageLimit: nats.DefaultSubPendingMsgsLimit,
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
}
})
}


@ -1,134 +0,0 @@
package natsconsumer
import (
"testing"
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/influxdata/telegraf/testutil"
nats "github.com/nats-io/go-nats"
"github.com/stretchr/testify/assert"
)
const (
testMsg = "cpu_load_short,host=server01 value=23422.0 1422568543702900257\n"
testMsgGraphite = "cpu.load.short.graphite 23422 1454780029"
testMsgJSON = "{\"a\": 5, \"b\": {\"c\": 6}}\n"
invalidMsg = "cpu_load_short,host=server01 1422568543702900257\n"
metricBuffer = 5
)
func newTestNatsConsumer() (*natsConsumer, chan *nats.Msg) {
in := make(chan *nats.Msg, metricBuffer)
n := &natsConsumer{
QueueGroup: "test",
Subjects: []string{"telegraf"},
Servers: []string{"nats://localhost:4222"},
Secure: false,
in: in,
errs: make(chan error, metricBuffer),
done: make(chan struct{}),
}
return n, in
}
// Test that the parser parses NATS messages into metrics
func TestRunParser(t *testing.T) {
n, in := newTestNatsConsumer()
acc := testutil.Accumulator{}
n.acc = &acc
defer close(n.done)
n.parser, _ = parsers.NewInfluxParser()
n.wg.Add(1)
go n.receiver()
in <- natsMsg(testMsg)
acc.Wait(1)
}
// Test that the parser ignores invalid messages
func TestRunParserInvalidMsg(t *testing.T) {
n, in := newTestNatsConsumer()
acc := testutil.Accumulator{}
n.acc = &acc
defer close(n.done)
n.parser, _ = parsers.NewInfluxParser()
n.wg.Add(1)
go n.receiver()
in <- natsMsg(invalidMsg)
acc.WaitError(1)
assert.Contains(t, acc.Errors[0].Error(), "E! subject: telegraf, error: metric parse error")
assert.EqualValues(t, 0, acc.NMetrics())
}
// Test that the parser parses line format messages into metrics
func TestRunParserAndGather(t *testing.T) {
n, in := newTestNatsConsumer()
acc := testutil.Accumulator{}
n.acc = &acc
defer close(n.done)
n.parser, _ = parsers.NewInfluxParser()
n.wg.Add(1)
go n.receiver()
in <- natsMsg(testMsg)
n.Gather(&acc)
acc.Wait(1)
acc.AssertContainsFields(t, "cpu_load_short",
map[string]interface{}{"value": float64(23422)})
}
// Test that the parser parses graphite format messages into metrics
func TestRunParserAndGatherGraphite(t *testing.T) {
n, in := newTestNatsConsumer()
acc := testutil.Accumulator{}
n.acc = &acc
defer close(n.done)
n.parser, _ = parsers.NewGraphiteParser("_", []string{}, nil)
n.wg.Add(1)
go n.receiver()
in <- natsMsg(testMsgGraphite)
n.Gather(&acc)
acc.Wait(1)
acc.AssertContainsFields(t, "cpu_load_short_graphite",
map[string]interface{}{"value": float64(23422)})
}
// Test that the parser parses json format messages into metrics
func TestRunParserAndGatherJSON(t *testing.T) {
n, in := newTestNatsConsumer()
acc := testutil.Accumulator{}
n.acc = &acc
defer close(n.done)
n.parser, _ = parsers.NewParser(&parsers.Config{
DataFormat: "json",
MetricName: "nats_json_test",
})
n.wg.Add(1)
go n.receiver()
in <- natsMsg(testMsgJSON)
n.Gather(&acc)
acc.Wait(1)
acc.AssertContainsFields(t, "nats_json_test",
map[string]interface{}{
"a": float64(5),
"b_c": float64(6),
})
}
func natsMsg(val string) *nats.Msg {
return &nats.Msg{
Subject: "telegraf",
Data: []byte(val),
}
}


@ -1,9 +1,9 @@
# NSQ Consumer Input Plugin
The [NSQ](http://nsq.io/) consumer plugin polls a specified NSQD
topic and adds messages to InfluxDB. This plugin allows a message to be in any of the supported `data_format` types.
The [NSQ][nsq] consumer plugin reads from NSQD and creates metrics using one
of the supported [input data formats][].
## Configuration
### Configuration:
```toml
# Read metrics from NSQD topic(s)
@ -18,6 +18,16 @@ topic and adds messages to InfluxDB. This plugin allows a message to be in any o
channel = "consumer"
max_in_flight = 100
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
@ -25,5 +35,5 @@ topic and adds messages to InfluxDB. This plugin allows a message to be in any o
data_format = "influx"
```
## Testing
The `nsq_consumer_test` mocks out the interaction with `NSQD`. It requires no outside dependencies.
[nsq]: https://nsq.io
[input data formats]: /docs/DATA_FORMATS_INPUT.md


@ -1,7 +1,9 @@
package nsq_consumer
import (
"fmt"
"context"
"log"
"sync"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs"
@ -9,17 +11,38 @@ import (
nsq "github.com/nsqio/go-nsq"
)
const (
defaultMaxUndeliveredMessages = 1000
)
type empty struct{}
type semaphore chan empty
type logger struct{}
func (l *logger) Output(calldepth int, s string) error {
log.Println("D! [inputs.nsq_consumer] " + s)
return nil
}
//NSQConsumer represents the configuration of the plugin
type NSQConsumer struct {
Server string
Nsqd []string
Nsqlookupd []string
Topic string
Channel string
MaxInFlight int
parser parsers.Parser
consumer *nsq.Consumer
acc telegraf.Accumulator
Server string `toml:"server"`
Nsqd []string `toml:"nsqd"`
Nsqlookupd []string `toml:"nsqlookupd"`
Topic string `toml:"topic"`
Channel string `toml:"channel"`
MaxInFlight int `toml:"max_in_flight"`
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
parser parsers.Parser
consumer *nsq.Consumer
mu sync.Mutex
messages map[telegraf.TrackingID]*nsq.Message
wg sync.WaitGroup
cancel context.CancelFunc
}
var sampleConfig = `
@ -33,6 +56,16 @@ var sampleConfig = `
channel = "consumer"
max_in_flight = 100
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
@ -40,12 +73,6 @@ var sampleConfig = `
data_format = "influx"
`
func init() {
inputs.Add("nsq_consumer", func() telegraf.Input {
return &NSQConsumer{}
})
}
// SetParser takes the data_format from the config and finds the right parser for that format
func (n *NSQConsumer) SetParser(parser parsers.Parser) {
n.parser = parser
@ -62,32 +89,88 @@ func (n *NSQConsumer) Description() string {
}
// Start pulls data from nsq
func (n *NSQConsumer) Start(acc telegraf.Accumulator) error {
n.acc = acc
func (n *NSQConsumer) Start(ac telegraf.Accumulator) error {
acc := ac.WithTracking(n.MaxUndeliveredMessages)
sem := make(semaphore, n.MaxUndeliveredMessages)
n.messages = make(map[telegraf.TrackingID]*nsq.Message, n.MaxUndeliveredMessages)
ctx, cancel := context.WithCancel(context.Background())
n.cancel = cancel
n.connect()
n.consumer.AddConcurrentHandlers(nsq.HandlerFunc(func(message *nsq.Message) error {
n.consumer.SetLogger(&logger{}, nsq.LogLevelInfo)
n.consumer.AddHandler(nsq.HandlerFunc(func(message *nsq.Message) error {
metrics, err := n.parser.Parse(message.Body)
if err != nil {
acc.AddError(fmt.Errorf("E! NSQConsumer Parse Error\nmessage:%s\nerror:%s", string(message.Body), err.Error()))
acc.AddError(err)
// Remove the message from the queue
message.Finish()
return nil
}
for _, metric := range metrics {
n.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time())
if len(metrics) == 0 {
message.Finish()
return nil
}
message.Finish()
select {
case <-ctx.Done():
return ctx.Err()
case sem <- empty{}:
break
}
n.mu.Lock()
id := acc.AddTrackingMetricGroup(metrics)
n.messages[id] = message
n.mu.Unlock()
message.DisableAutoResponse()
return nil
}), n.MaxInFlight)
}))
if len(n.Nsqlookupd) > 0 {
n.consumer.ConnectToNSQLookupds(n.Nsqlookupd)
}
n.consumer.ConnectToNSQDs(append(n.Nsqd, n.Server))
n.wg.Add(1)
go func() {
defer n.wg.Done()
n.onDelivery(ctx, acc, sem)
}()
return nil
}
func (n *NSQConsumer) onDelivery(ctx context.Context, acc telegraf.TrackingAccumulator, sem semaphore) {
for {
select {
case <-ctx.Done():
return
case info := <-acc.Delivered():
n.mu.Lock()
msg, ok := n.messages[info.ID()]
if !ok {
n.mu.Unlock()
continue
}
<-sem
delete(n.messages, info.ID())
n.mu.Unlock()
if info.Delivered() {
msg.Finish()
} else {
msg.Requeue(-1)
}
}
}
}
// Stop processing messages
func (n *NSQConsumer) Stop() {
n.cancel()
n.wg.Wait()
n.consumer.Stop()
<-n.consumer.StopChan
}
// Gather is a noop
@ -107,3 +190,11 @@ func (n *NSQConsumer) connect() error {
}
return nil
}
func init() {
inputs.Add("nsq_consumer", func() telegraf.Input {
return &NSQConsumer{
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
}
})
}
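
Several consumer inputs in this change follow the same delivery-tracking pattern shown in `Start` and `onDelivery` above, so a condensed sketch may help. The `WithTracking`, `AddTrackingMetricGroup`, `Delivered`, `ID`, and `Delivered()` calls are the ones used by the plugin; the `ackable` interface and the `handle`/`watch` helpers are hypothetical stand-ins for the broker-specific message handling. The semaphore is what keeps a slow output from being outrun: once `max` messages are awaiting delivery, the handler blocks and the broker stops being drained, instead of buffering without bound.

```go
package example

import (
	"context"
	"sync"

	"github.com/influxdata/telegraf"
)

// ackable stands in for a broker message that must be acked or requeued.
type ackable interface {
	Ack()
	Requeue()
}

type tracker struct {
	mu      sync.Mutex
	sem     chan struct{}
	pending map[telegraf.TrackingID]ackable
}

func newTracker(ac telegraf.Accumulator, max int) (telegraf.TrackingAccumulator, *tracker) {
	return ac.WithTracking(max), &tracker{
		sem:     make(chan struct{}, max), // bounds undelivered messages
		pending: make(map[telegraf.TrackingID]ackable, max),
	}
}

// handle is called once per broker message after parsing it into metrics.
func (t *tracker) handle(ctx context.Context, acc telegraf.TrackingAccumulator, msg ackable, metrics []telegraf.Metric) error {
	select {
	case <-ctx.Done(): // shutting down
		return ctx.Err()
	case t.sem <- struct{}{}: // block the broker when too much is in flight
	}
	t.mu.Lock()
	t.pending[acc.AddTrackingMetricGroup(metrics)] = msg
	t.mu.Unlock()
	return nil
}

// watch acks or requeues each message once the outputs report on its metrics.
func (t *tracker) watch(ctx context.Context, acc telegraf.TrackingAccumulator) {
	for {
		select {
		case <-ctx.Done():
			return
		case info := <-acc.Delivered():
			t.mu.Lock()
			msg, ok := t.pending[info.ID()]
			delete(t.pending, info.ID())
			t.mu.Unlock()
			if !ok {
				continue
			}
			<-t.sem // free a slot for the next message
			if info.Delivered() {
				msg.Ack()
			} else {
				msg.Requeue()
			}
		}
	}
}
```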

View File

@ -36,11 +36,12 @@ func TestReadsMetricsFromNSQ(t *testing.T) {
newMockNSQD(script, addr.String())
consumer := &NSQConsumer{
Server: "127.0.0.1:4155",
Topic: "telegraf",
Channel: "consume",
MaxInFlight: 1,
Nsqd: []string{"127.0.0.1:4155"},
Server: "127.0.0.1:4155",
Topic: "telegraf",
Channel: "consume",
MaxInFlight: 1,
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
Nsqd: []string{"127.0.0.1:4155"},
}
p, _ := parsers.NewInfluxParser()

View File

@ -2,6 +2,7 @@ package socket_listener
import (
"bufio"
"crypto/tls"
"fmt"
"io"
"log"
@ -9,11 +10,8 @@ import (
"os"
"strings"
"sync"
"time"
"crypto/tls"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
tlsint "github.com/influxdata/telegraf/internal/tls"
@ -120,7 +118,7 @@ func (ssl *streamSocketListener) read(c net.Conn) {
continue
}
for _, m := range metrics {
ssl.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
ssl.AddMetric(m)
}
}
@ -156,7 +154,7 @@ func (psl *packetSocketListener) listen() {
continue
}
for _, m := range metrics {
psl.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
psl.AddMetric(m)
}
}
}

View File

@ -7,11 +7,13 @@ import (
type Discard struct{}
func (d *Discard) Connect() error { return nil }
func (d *Discard) Close() error { return nil }
func (d *Discard) SampleConfig() string { return "" }
func (d *Discard) Description() string { return "Send metrics to nowhere at all" }
func (d *Discard) Write(metrics []telegraf.Metric) error { return nil }
func (d *Discard) Connect() error { return nil }
func (d *Discard) Close() error { return nil }
func (d *Discard) SampleConfig() string { return "" }
func (d *Discard) Description() string { return "Send metrics to nowhere at all" }
func (d *Discard) Write(metrics []telegraf.Metric) error {
return nil
}
func init() {
outputs.Add("discard", func() telegraf.Output { return &Discard{} })

View File

@ -144,7 +144,7 @@ func (p *PrometheusClient) auth(h http.Handler) http.Handler {
})
}
func (p *PrometheusClient) Start() error {
func (p *PrometheusClient) Connect() error {
defaultCollectors := map[string]bool{
"gocollector": true,
"process": true,
@ -200,15 +200,6 @@ func (p *PrometheusClient) Start() error {
return nil
}
func (p *PrometheusClient) Stop() {
// plugin gets cleaned up in Close() already.
}
func (p *PrometheusClient) Connect() error {
// This service output does not need to make any further connections
return nil
}
func (p *PrometheusClient) Close() error {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
defer cancel()

View File

@ -600,7 +600,7 @@ func TestPrometheusWritePointEmptyTag(t *testing.T) {
pClient, p, err := setupPrometheus()
require.NoError(t, err)
defer pClient.Stop()
defer pClient.Close()
now := time.Now()
tags := make(map[string]string)
@ -675,7 +675,7 @@ func setupPrometheus() (*PrometheusClient, *prometheus_input.Prometheus, error)
pTesting = NewClient()
pTesting.Listen = "localhost:9127"
pTesting.Path = "/metrics"
err := pTesting.Start()
err := pTesting.Connect()
if err != nil {
return nil, nil, err
}

View File

@ -10,6 +10,7 @@ import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/filter"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/plugins/processors"
)
@ -76,12 +77,12 @@ var sampleConfig = `
## tags. If this setting is set to anything other than "" the plugin will add
## a tag (whose name will be the value of this setting) to each metric with
## the value of the calculated GroupBy tag. Useful for debugging.
# add_groupby_tag = ""
# add_groupby_tag = ""
## These settings provide a way to know the position of each metric in
## the top k. The 'add_rank_field' setting allows you to specify for which
## fields the position is required. If the list is non-empty, then a field
## will be added to each and every metric for each string present in this
## will be added to each and every metric for each string present in this
## setting. This field will contain the ranking of the group that
## the metric belonged to when aggregated over that field.
## The name of the field will be set to the name of the aggregation field,
@ -208,6 +209,11 @@ func (t *TopK) Apply(in ...telegraf.Metric) []telegraf.Metric {
// Add the metrics received to our internal cache
for _, m := range in {
// When tracking metrics this plugin could deadlock the input by
// holding undelivered metrics while the input waits for metrics to be
// delivered. Instead, treat all handled metrics as delivered and
// produced metrics as untracked in a similar way to aggregators.
m.Drop()
// Check if the metric has any of the fields over which we are aggregating
hasField := false
@ -281,7 +287,6 @@ func (t *TopK) push() []telegraf.Metric {
// Create a one dimensional list with the top K metrics of each key
for i, ag := range aggregations[0:min(t.K, len(aggregations))] {
// Check whether or not we need to add fields or tags to the selected metrics
if len(t.aggFieldSet) != 0 || len(t.rankFieldSet) != 0 || groupTag != "" {
for _, m := range t.cache[ag.groupbykey] {
@ -311,7 +316,16 @@ func (t *TopK) push() []telegraf.Metric {
t.Reset()
return ret
result := make([]telegraf.Metric, 0, len(ret))
for _, m := range ret {
copy, err := metric.New(m.Name(), m.Tags(), m.Fields(), m.Time(), m.Type())
if err != nil {
continue
}
result = append(result, copy)
}
return result
}
// Function that generates the aggregation functions
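
The comment added to `Apply` above describes a general rule for processors that buffer metrics while tracking is in use. A minimal sketch of that drop-and-copy idea follows; the `buffer` helper and the cache slice are hypothetical, while `Drop` and `metric.New` are the calls used by the plugin:

```go
package example

import (
	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/metric"
)

// buffer stores incoming metrics for later emission without holding up the
// inputs that produced them.
func buffer(cache []telegraf.Metric, in ...telegraf.Metric) []telegraf.Metric {
	for _, m := range in {
		// Report the tracked original as handled so the input can ack it...
		m.Drop()
		// ...and keep an untracked copy for the processor's own bookkeeping.
		c, err := metric.New(m.Name(), m.Tags(), m.Fields(), m.Time(), m.Type())
		if err != nil {
			continue
		}
		cache = append(cache, c)
	}
	return cache
}
```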

View File

@ -1,12 +1,12 @@
package topk
import (
"reflect"
"testing"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/testutil"
)
// Key, value pair that represents a telegraf.Metric Field
@ -95,7 +95,7 @@ func deepCopy(a []telegraf.Metric) []telegraf.Metric {
func belongs(m telegraf.Metric, ms []telegraf.Metric) bool {
for _, i := range ms {
if reflect.DeepEqual(i, m) {
if testutil.MetricEqual(i, m) {
return true
}
}

View File

@ -7,6 +7,6 @@ type Processor interface {
// Description returns a one-sentence description of the Processor
Description() string
// Apply the filter to the given metric
// Apply the filter to the given metric.
Apply(in ...Metric) []Metric
}
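
For reference, a toy processor that satisfies this interface; the plugin name, the tag it adds, and its behavior are made up purely to illustrate the Apply contract:

```go
package example

import (
	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/plugins/processors"
)

type Example struct{}

func (*Example) SampleConfig() string { return "" }

func (*Example) Description() string { return "Tag every metric that passes through" }

// Apply receives a batch of metrics and returns the (possibly modified) batch.
func (e *Example) Apply(in ...telegraf.Metric) []telegraf.Metric {
	for _, m := range in {
		m.AddTag("processed_by", "example")
	}
	return in
}

func init() {
	processors.Add("example", func() telegraf.Processor { return &Example{} })
}
```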

View File

@ -14,6 +14,15 @@ import (
"github.com/stretchr/testify/assert"
)
var (
lastID uint64
)
func newTrackingID() telegraf.TrackingID {
atomic.AddUint64(&lastID, 1)
return telegraf.TrackingID(lastID)
}
// Metric defines a single point measurement
type Metric struct {
Measurement string
@ -23,7 +32,7 @@ type Metric struct {
}
func (p *Metric) String() string {
return fmt.Sprintf("%s %v", p.Measurement, p.Fields)
return fmt.Sprintf("%s %v %v", p.Measurement, p.Tags, p.Fields)
}
// Accumulator defines a mocked out accumulator
@ -31,11 +40,12 @@ type Accumulator struct {
sync.Mutex
*sync.Cond
Metrics []*Metric
nMetrics uint64
Discard bool
Errors []error
debug bool
Metrics []*Metric
nMetrics uint64
Discard bool
Errors []error
debug bool
delivered chan telegraf.DeliveryInfo
}
func (a *Accumulator) NMetrics() uint64 {
@ -154,6 +164,33 @@ func (a *Accumulator) AddHistogram(
a.AddFields(measurement, fields, tags, timestamp...)
}
func (a *Accumulator) AddMetric(m telegraf.Metric) {
a.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
}
func (a *Accumulator) WithTracking(maxTracked int) telegraf.TrackingAccumulator {
return a
}
func (a *Accumulator) AddTrackingMetric(m telegraf.Metric) telegraf.TrackingID {
a.AddMetric(m)
return newTrackingID()
}
func (a *Accumulator) AddTrackingMetricGroup(group []telegraf.Metric) telegraf.TrackingID {
for _, m := range group {
a.AddMetric(m)
}
return newTrackingID()
}
func (a *Accumulator) Delivered() <-chan telegraf.DeliveryInfo {
if a.delivered == nil {
a.delivered = make(chan telegraf.DeliveryInfo)
}
return a.delivered
}
// AddError appends the given error to Accumulator.Errors.
func (a *Accumulator) AddError(err error) {
if err == nil {

View File

@ -41,6 +41,18 @@ func newMetricDiff(metric telegraf.Metric) *metricDiff {
return m
}
func MetricEqual(expected, actual telegraf.Metric) bool {
var lhs, rhs *metricDiff
if expected != nil {
lhs = newMetricDiff(expected)
}
if actual != nil {
rhs = newMetricDiff(actual)
}
return cmp.Equal(lhs, rhs)
}
func RequireMetricEqual(t *testing.T, expected, actual telegraf.Metric) {
t.Helper()
@ -60,11 +72,11 @@ func RequireMetricEqual(t *testing.T, expected, actual telegraf.Metric) {
func RequireMetricsEqual(t *testing.T, expected, actual []telegraf.Metric) {
t.Helper()
lhs := make([]*metricDiff, len(expected))
lhs := make([]*metricDiff, 0, len(expected))
for _, m := range expected {
lhs = append(lhs, newMetricDiff(m))
}
rhs := make([]*metricDiff, len(actual))
rhs := make([]*metricDiff, 0, len(actual))
for _, m := range actual {
rhs = append(rhs, newMetricDiff(m))
}
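
A hypothetical test showing how the new `MetricEqual` helper might be used; the metric name, tags, and field values are invented for illustration:

```go
package example

import (
	"testing"
	"time"

	"github.com/influxdata/telegraf/metric"
	"github.com/influxdata/telegraf/testutil"
)

func TestMetricEqualExample(t *testing.T) {
	ts := time.Unix(0, 0)
	expected, _ := metric.New("cpu",
		map[string]string{"host": "localhost"},
		map[string]interface{}{"usage_idle": 99.0}, ts)
	actual, _ := metric.New("cpu",
		map[string]string{"host": "localhost"},
		map[string]interface{}{"usage_idle": 99.0}, ts)

	if !testutil.MetricEqual(expected, actual) {
		t.Fatal("metrics should compare equal by name, tags, fields, and type")
	}
}
```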