Performance refactor of running_output buffers

closes #914
closes #967
Cameron Sparr 2016-04-25 17:49:06 -06:00
parent 1c4043ab39
commit 4de75ce621
7 changed files with 587 additions and 206 deletions

@@ -2,9 +2,18 @@
### Release Notes
- New [agent] configuration option: `metric_batch_size`. This option sets the
maximum number of metrics that telegraf will allow to accumulate before flushing
a batch to the configured outputs. `metric_buffer_limit` now refers to the
absolute maximum number of metrics that will accumulate before the oldest
metrics are dropped.
- The `flush_buffer_when_full` option has been removed; flushing the buffer when
it fills is now the default and only behavior of telegraf.
- **Breaking Change**: docker plugin tags. The cont_id tag no longer exists, it
will now be a field, and be called container_id. Additionally, cont_image and
cont_name are being renamed to container_image and container_name.
- **Breaking Change**: docker plugin measurements. The `docker_cpu`, `docker_mem`,
`docker_blkio` and `docker_net` measurements are being renamed to
`docker_container_cpu`, `docker_container_mem`, `docker_container_blkio` and
@@ -16,15 +25,19 @@ So adding "container" to each metric will:
(1) make it more clear that these metrics are per-container, and
(2) allow users to easily drop per-container metrics if cardinality is an
issue (`namedrop = ["docker_container_*"]`)
- `tagexclude` and `taginclude` are now available, which can be used to remove
tags from measurements on inputs and outputs. See
[the configuration doc](https://github.com/influxdata/telegraf/blob/master/docs/CONFIGURATION.md)
for more details.
- **Measurement filtering:** All measurement filters now match based on glob
only. Previously there was an undocumented behavior where filters would match
based on _prefix_ in addition to globs. This means that a filter like
`fielddrop = ["time_"]` will need to be changed to `fielddrop = ["time_*"]`
- **datadog**: measurement and field names will no longer have `_` replaced by `.`
- The following plugins have changed their tags to _not_ overwrite the host tag:
- cassandra: `host -> cassandra_host`
- disque: `host -> disque_host`
@@ -42,6 +55,8 @@ based on _prefix_ in addition to globs. This means that a filter like
- [#1072](https://github.com/influxdata/telegraf/pull/1072): New Input Plugin: filestat.
- [#1066](https://github.com/influxdata/telegraf/pull/1066): Replication lag metrics for MongoDB input plugin
- [#1086](https://github.com/influxdata/telegraf/pull/1086): Ability to specify AWS keys in config file. Thanks @johnrengleman!
- [#1096](https://github.com/influxdata/telegraf/pull/1096): Performance refactor of running output buffers.
- [#967](https://github.com/influxdata/telegraf/issues/967): Buffer logging improvements.
### Bugfixes
@@ -55,6 +70,7 @@ based on _prefix_ in addition to globs. This means that a filter like
- [#1078](https://github.com/influxdata/telegraf/issues/1078): Use default AWS credential chain.
- [#1070](https://github.com/influxdata/telegraf/issues/1070): SQL Server input. Fix datatype conversion.
- [#1089](https://github.com/influxdata/telegraf/issues/1089): Fix leaky TCP connections in phpfpm plugin.
- [#914](https://github.com/influxdata/telegraf/issues/914): Telegraf can drop metrics on full buffers.
## v0.12.1 [2016-04-14]


@@ -30,15 +30,13 @@
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
-## Telegraf will send metrics to output in batch of at
## Telegraf will send metrics to outputs in batches of at
## most metric_batch_size metrics.
metric_batch_size = 1000
-## Telegraf will cache metric_buffer_limit metrics for each output, and will
-## flush this buffer on a successful write. This should be a multiple of
-## metric_batch_size and could not be less than 2 times metric_batch_size
## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
metric_buffer_limit = 10000
-## Flush the buffer whenever full, regardless of flush_interval.
-flush_buffer_when_full = true
## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
@@ -151,6 +149,15 @@
# ## Amazon REGION
# region = 'us-east-1'
#
# ## Amazon Credentials
# ## Credentials are loaded in the following order
# ## 1) explicit credentials from 'access_key' and 'secret_key'
# ## 2) environment variables
# ## 3) shared credentials file
# ## 4) EC2 Instance Profile
# #access_key = ""
# #secret_key = ""
#
# ## Namespace for the CloudWatch MetricDatums
# namespace = 'InfluxData/Telegraf'
@@ -243,6 +250,16 @@
# [[outputs.kinesis]]
# ## Amazon REGION of kinesis endpoint.
# region = "ap-southeast-2"
#
# ## Amazon Credentials
# ## Credentials are loaded in the following order
# ## 1) explicit credentials from 'access_key' and 'secret_key'
# ## 2) environment variables
# ## 3) shared credentials file
# ## 4) EC2 Instance Profile
# #access_key = ""
# #secret_key = ""
#
# ## Kinesis StreamName must exist prior to starting telegraf.
# streamname = "StreamName"
# ## PartitionKey as used for sharding data.
@@ -457,6 +474,15 @@
# ## Amazon Region
# region = 'us-east-1'
#
# ## Amazon Credentials
# ## Credentials are loaded in the following order
# ## 1) explicit credentials from 'access_key' and 'secret_key'
# ## 2) environment variables
# ## 3) shared credentials file
# ## 4) EC2 Instance Profile
# #access_key = ""
# #secret_key = ""
#
# ## Requested CloudWatch aggregation Period (required - must be a multiple of 60s)
# period = '1m'
#
@@ -588,8 +614,14 @@
# [[inputs.filestat]]
# ## Files to gather stats about.
# ## These accept standard unix glob matching rules, but with the addition of
-# ## ** as a "super asterisk". See https://github.com/gobwas/glob.
-# files = ["/etc/telegraf/telegraf.conf", "/var/log/**.log"]
# ## ** as a "super asterisk". ie:
# ## "/var/log/**.log" -> recursively find all .log files in /var/log
# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log
# ## "/var/log/apache.log" -> just tail the apache log file
# ##
# ## See https://github.com/gobwas/glob for more examples
# ##
# files = ["/var/log/**.log"]
# ## If true, read the entire file and calculate an md5 checksum.
# md5 = false
@@ -980,6 +1012,11 @@
# ## databases are gathered.
# ## databases = ["app_production", "testing"]
# #
# # outputaddress = "db01"
# ## A custom name for the database that will be used as the "server" tag in the
# ## measurement output. If not specified, a default one generated from
# ## the connection address is used.
# #
# ## Define the toml config where the sql queries are stored
# ## New queries can be added, if the withdbname is set to true and there is no
# ## databases defined in the 'databases field', the sql query is ended by a
@@ -990,24 +1027,28 @@
# ## because the databases variable was set to ['postgres', 'pgbench' ] and the
# ## withdbname was true. Be careful that if the withdbname is set to false you
# ## don't have to define the where clause (aka with the dbname) the tagvalue
-# ## field is used to define custom tags (separated by comas)
# ## field is used to define custom tags (separated by commas)
# ## The optional "measurement" value can be used to override the default
# ## output measurement name ("postgresql").
# #
# ## Structure :
# ## [[inputs.postgresql_extensible.query]]
# ## sqlquery string
# ## version string
# ## withdbname boolean
-# ## tagvalue string (coma separated)
# ## tagvalue string (comma separated)
# ## measurement string
# [[inputs.postgresql_extensible.query]]
# sqlquery="SELECT * FROM pg_stat_database"
# version=901
# withdbname=false
# tagvalue=""
# measurement=""
# [[inputs.postgresql_extensible.query]]
# sqlquery="SELECT * FROM pg_stat_bgwriter"
# version=901
# withdbname=false
-# tagvalue=""
# tagvalue="postgresql.stats"
# # Read metrics from one or many PowerDNS servers
@@ -1379,6 +1420,28 @@
# percentile_limit = 1000
# # Stream a log file, like the tail -f command
# [[inputs.tail]]
# ## files to tail.
# ## These accept standard unix glob matching rules, but with the addition of
# ## ** as a "super asterisk". ie:
# ## "/var/log/**.log" -> recursively find all .log files in /var/log
# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log
# ## "/var/log/apache.log" -> just tail the apache log file
# ##
# ## See https://github.com/gobwas/glob for more examples
# ##
# files = ["/var/mymetrics.out"]
# ## Read file from beginning.
# from_beginning = false
#
# ## Data format to consume.
# ## Data format to consume. Each data format has its own unique set of
# ## more about them here:
# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
# data_format = "influx"
# # Generic TCP listener
# [[inputs.tcp_listener]]
# ## Address and port to host TCP listener on

internal/buffer/buffer.go (new file, 77 lines)

@@ -0,0 +1,77 @@
package buffer
import (
"github.com/influxdata/telegraf"
)
// Buffer is an object for storing metrics in a circular buffer.
type Buffer struct {
buf chan telegraf.Metric
// total dropped metrics
drops int
// total metrics added
total int
}
// NewBuffer returns a Buffer
// size is the maximum number of metrics that Buffer will cache. If Add is
// called when the buffer is full, then the oldest metric(s) will be dropped.
func NewBuffer(size int) *Buffer {
return &Buffer{
buf: make(chan telegraf.Metric, size),
}
}
// IsEmpty returns true if Buffer is empty.
func (b *Buffer) IsEmpty() bool {
return len(b.buf) == 0
}
// Len returns the current length of the buffer.
func (b *Buffer) Len() int {
return len(b.buf)
}
// Drops returns the total number of dropped metrics that have occurred in this
// buffer since instantiation.
func (b *Buffer) Drops() int {
return b.drops
}
// Total returns the total number of metrics that have been added to this buffer.
func (b *Buffer) Total() int {
return b.total
}
// Add adds metrics to the buffer.
func (b *Buffer) Add(metrics ...telegraf.Metric) {
for i, _ := range metrics {
b.total++
select {
case b.buf <- metrics[i]:
default:
b.drops++
<-b.buf
b.buf <- metrics[i]
}
}
}
// Batch returns a batch of metrics of size batchSize.
// the batch will be of maximum length batchSize. It can be less than batchSize,
// if the length of Buffer is less than batchSize.
func (b *Buffer) Batch(batchSize int) []telegraf.Metric {
n := min(len(b.buf), batchSize)
out := make([]telegraf.Metric, n)
for i := 0; i < n; i++ {
out[i] = <-b.buf
}
return out
}
func min(a, b int) int {
if b < a {
return b
}
return a
}
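For reference, a minimal usage sketch of the Buffer API above (not part of the commit; the sizes and values are illustrative, testutil.TestMetric is the same helper the tests below use, and the file must sit inside the telegraf source tree for the internal import to resolve):

package main

import (
    "fmt"

    "github.com/influxdata/telegraf/internal/buffer"
    "github.com/influxdata/telegraf/testutil"
)

func main() {
    // A buffer that holds at most 3 metrics; once full, Add drops the oldest
    // metric to make room for the newest one.
    b := buffer.NewBuffer(3)
    for i := 0; i < 5; i++ {
        b.Add(testutil.TestMetric(i, "mymetric"))
    }
    fmt.Println(b.Len(), b.Total(), b.Drops()) // 3 5 2

    // Batch drains up to the requested number of metrics, oldest first.
    batch := b.Batch(2)
    fmt.Println(len(batch), b.Len()) // 2 1
}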


@@ -0,0 +1,94 @@
package buffer
import (
"testing"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/assert"
)
var metricList = []telegraf.Metric{
testutil.TestMetric(2, "mymetric1"),
testutil.TestMetric(1, "mymetric2"),
testutil.TestMetric(11, "mymetric3"),
testutil.TestMetric(15, "mymetric4"),
testutil.TestMetric(8, "mymetric5"),
}
func BenchmarkAddMetrics(b *testing.B) {
buf := NewBuffer(10000)
m := testutil.TestMetric(1, "mymetric")
for n := 0; n < b.N; n++ {
buf.Add(m)
}
}
func TestNewBufferBasicFuncs(t *testing.T) {
b := NewBuffer(10)
assert.True(t, b.IsEmpty())
assert.Zero(t, b.Len())
assert.Zero(t, b.Drops())
assert.Zero(t, b.Total())
m := testutil.TestMetric(1, "mymetric")
b.Add(m)
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 1)
assert.Equal(t, b.Drops(), 0)
assert.Equal(t, b.Total(), 1)
b.Add(metricList...)
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 6)
assert.Equal(t, b.Drops(), 0)
assert.Equal(t, b.Total(), 6)
}
func TestDroppingMetrics(t *testing.T) {
b := NewBuffer(10)
// Add up to the size of the buffer
b.Add(metricList...)
b.Add(metricList...)
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 10)
assert.Equal(t, b.Drops(), 0)
assert.Equal(t, b.Total(), 10)
// Add 5 more and verify they were dropped
b.Add(metricList...)
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 10)
assert.Equal(t, b.Drops(), 5)
assert.Equal(t, b.Total(), 15)
}
func TestGettingBatches(t *testing.T) {
b := NewBuffer(20)
// Verify that the buffer returned is smaller than requested when there are
// not as many items as requested.
b.Add(metricList...)
batch := b.Batch(10)
assert.Len(t, batch, 5)
// Verify that the buffer is now empty
assert.True(t, b.IsEmpty())
assert.Zero(t, b.Len())
assert.Zero(t, b.Drops())
assert.Equal(t, b.Total(), 5)
// Verify that the buffer returned is not more than the size requested
b.Add(metricList...)
batch = b.Batch(3)
assert.Len(t, batch, 3)
// Verify that buffer is not empty
assert.False(t, b.IsEmpty())
assert.Equal(t, b.Len(), 2)
assert.Equal(t, b.Drops(), 0)
assert.Equal(t, b.Total(), 10)
}


@@ -188,15 +188,13 @@ var header = `# Telegraf Configuration
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true
-## Telegraf will send metrics to output in batch of at
## Telegraf will send metrics to outputs in batches of at
## most metric_batch_size metrics.
metric_batch_size = 1000
-## Telegraf will cache metric_buffer_limit metrics for each output, and will
-## flush this buffer on a successful write. This should be a multiple of
-## metric_batch_size and could not be less than 2 times metric_batch_size
## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
metric_buffer_limit = 10000
-## Flush the buffer whenever full, regardless of flush_interval.
-flush_buffer_when_full = true
## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
@@ -535,14 +533,8 @@ func (c *Config) addOutput(name string, table *ast.Table) error {
return err
}
-ro := internal_models.NewRunningOutput(name, output, outputConfig)
-if c.Agent.MetricBatchSize > 0 {
-ro.MetricBatchSize = c.Agent.MetricBatchSize
-}
-if c.Agent.MetricBufferLimit > 0 {
-ro.MetricBufferLimit = c.Agent.MetricBufferLimit
-}
-ro.FlushBufferWhenFull = c.Agent.FlushBufferWhenFull
ro := internal_models.NewRunningOutput(name, output, outputConfig,
c.Agent.MetricBatchSize, c.Agent.MetricBufferLimit)
c.Outputs = append(c.Outputs, ro)
return nil
}


@@ -2,14 +2,13 @@ package internal_models
import (
"log"
-"sync"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal/buffer"
)
const (
// Default size of metrics batch size.
DEFAULT_METRIC_BATCH_SIZE = 1000
@@ -17,40 +16,40 @@
DEFAULT_METRIC_BUFFER_LIMIT = 10000
)
-// tmpmetrics point to batch of metrics ready to be wrote to output.
-// readI point to the oldest batch of metrics (the first to sent to output). It
-// may point to nil value if tmpmetrics is empty.
-// writeI point to the next slot to buffer a batch of metrics is output fail to
-// write.
// RunningOutput contains the output configuration
type RunningOutput struct {
Name string
Output telegraf.Output
Config *OutputConfig
Quiet bool
MetricBufferLimit int
MetricBatchSize int
-FlushBufferWhenFull bool
-metrics []telegraf.Metric
-tmpmetrics []([]telegraf.Metric)
-writeI int
-readI int
-sync.Mutex
metrics *buffer.Buffer
failMetrics *buffer.Buffer
}
func NewRunningOutput(
name string,
output telegraf.Output,
conf *OutputConfig,
batchSize int,
bufferLimit int,
) *RunningOutput {
if bufferLimit == 0 {
bufferLimit = DEFAULT_METRIC_BUFFER_LIMIT
}
if batchSize == 0 {
batchSize = DEFAULT_METRIC_BATCH_SIZE
}
ro := &RunningOutput{
Name: name,
-metrics: make([]telegraf.Metric, 0),
metrics: buffer.NewBuffer(batchSize),
failMetrics: buffer.NewBuffer(bufferLimit),
Output: output,
Config: conf,
-MetricBufferLimit: DEFAULT_METRIC_BUFFER_LIMIT,
MetricBufferLimit: bufferLimit,
-MetricBatchSize: DEFAULT_METRIC_BATCH_SIZE,
MetricBatchSize: batchSize,
}
return ro
}
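For context, a minimal sketch of calling the new constructor (not part of the commit; nullOutput is a hypothetical stand-in output, the import path assumes the package lives at github.com/influxdata/telegraf/internal/models as the internal_models package name suggests, and the file must sit inside the telegraf source tree because it is an internal package):

package main

import (
    "fmt"

    "github.com/influxdata/telegraf"
    "github.com/influxdata/telegraf/internal/models"
)

// nullOutput is a stand-in telegraf.Output that accepts every write.
type nullOutput struct{}

func (n *nullOutput) Connect() error                        { return nil }
func (n *nullOutput) Close() error                          { return nil }
func (n *nullOutput) Description() string                   { return "" }
func (n *nullOutput) SampleConfig() string                  { return "" }
func (n *nullOutput) Write(metrics []telegraf.Metric) error { return nil }

func main() {
    conf := &internal_models.OutputConfig{}

    // Explicit sizes: flush in batches of 1000, keep at most 10000 metrics
    // from failed writes.
    ro := internal_models.NewRunningOutput("null", &nullOutput{}, conf, 1000, 10000)

    // Zero values fall back to DEFAULT_METRIC_BATCH_SIZE and
    // DEFAULT_METRIC_BUFFER_LIMIT.
    roDefault := internal_models.NewRunningOutput("null", &nullOutput{}, conf, 0, 0)

    fmt.Println(ro.MetricBatchSize, roDefault.MetricBatchSize) // 1000 1000
}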
@@ -63,19 +62,6 @@ func (ro *RunningOutput) AddMetric(metric telegraf.Metric) {
return
}
}
-ro.Lock()
-defer ro.Unlock()
-if ro.tmpmetrics == nil {
-size := ro.MetricBufferLimit / ro.MetricBatchSize
-// ro.metrics already contains one batch
-size = size - 1
-if size < 1 {
-size = 1
-}
-ro.tmpmetrics = make([]([]telegraf.Metric), size)
-}
// Filter any tagexclude/taginclude parameters before adding metric
if len(ro.Config.Filter.TagExclude) != 0 || len(ro.Config.Filter.TagInclude) != 0 {
@@ -90,69 +76,64 @@ func (ro *RunningOutput) AddMetric(metric telegraf.Metric) {
metric, _ = telegraf.NewMetric(name, tags, fields, t)
}
-if len(ro.metrics) < ro.MetricBatchSize {
-ro.metrics = append(ro.metrics, metric)
-} else {
-flushSuccess := true
-if ro.FlushBufferWhenFull {
-err := ro.write(ro.metrics)
-if err != nil {
-log.Printf("ERROR writing full metric buffer to output %s, %s",
-ro.Name, err)
-flushSuccess = false
-}
-} else {
-flushSuccess = false
-}
-if !flushSuccess {
-if ro.tmpmetrics[ro.writeI] != nil && ro.writeI == ro.readI {
-log.Printf("WARNING: overwriting cached metrics, you may want to " +
-"increase the metric_buffer_limit setting in your [agent] " +
-"config if you do not wish to overwrite metrics.\n")
-ro.readI = (ro.readI + 1) % cap(ro.tmpmetrics)
-}
-ro.tmpmetrics[ro.writeI] = ro.metrics
-ro.writeI = (ro.writeI + 1) % cap(ro.tmpmetrics)
-}
-ro.metrics = make([]telegraf.Metric, 0)
-ro.metrics = append(ro.metrics, metric)
-}
ro.metrics.Add(metric)
if ro.metrics.Len() == ro.MetricBatchSize {
batch := ro.metrics.Batch(ro.MetricBatchSize)
err := ro.write(batch)
if err != nil {
ro.failMetrics.Add(batch...)
}
}
}
// Write writes all cached points to this output.
func (ro *RunningOutput) Write() error {
-ro.Lock()
-defer ro.Unlock()
-if ro.tmpmetrics == nil {
-size := ro.MetricBufferLimit / ro.MetricBatchSize
-// ro.metrics already contains one batch
-size = size - 1
-if size < 1 {
-size = 1
-}
-ro.tmpmetrics = make([]([]telegraf.Metric), size)
-}
-// Write any cached metric buffers before, as those metrics are the
-// oldest
-for ro.tmpmetrics[ro.readI] != nil {
-if err := ro.write(ro.tmpmetrics[ro.readI]); err != nil {
-return err
-} else {
-ro.tmpmetrics[ro.readI] = nil
-ro.readI = (ro.readI + 1) % cap(ro.tmpmetrics)
-}
-}
-err := ro.write(ro.metrics)
-if err != nil {
-return err
-} else {
-ro.metrics = make([]telegraf.Metric, 0)
-}
if !ro.Quiet {
log.Printf("Output [%s] buffer fullness: %d / %d metrics. "+
"Total gathered metrics: %d. Total dropped metrics: %d.",
ro.Name,
ro.failMetrics.Len()+ro.metrics.Len(),
ro.MetricBufferLimit,
ro.metrics.Total(),
ro.metrics.Drops()+ro.failMetrics.Drops())
}
var err error
if !ro.failMetrics.IsEmpty() {
bufLen := ro.failMetrics.Len()
// how many batches of failed writes we need to write.
nBatches := bufLen/ro.MetricBatchSize + 1
batchSize := ro.MetricBatchSize
for i := 0; i < nBatches; i++ {
// If it's the last batch, only grab the metrics that have not had
// a write attempt already (this is primarily to preserve order).
if i == nBatches-1 {
batchSize = bufLen % ro.MetricBatchSize
}
batch := ro.failMetrics.Batch(batchSize)
// If we've already failed previous writes, don't bother trying to
// write to this output again. We are not exiting the loop just so
// that we can rotate the metrics to preserve order.
if err == nil {
err = ro.write(batch)
}
if err != nil {
ro.failMetrics.Add(batch...)
}
}
}
batch := ro.metrics.Batch(ro.MetricBatchSize)
// see comment above about not trying to write to an already failed output.
// if ro.failMetrics is empty then err will always be nil at this point.
if err == nil {
err = ro.write(batch)
}
if err != nil {
ro.failMetrics.Add(batch...)
return err
}
return nil
}
@@ -165,8 +146,8 @@ func (ro *RunningOutput) write(metrics []telegraf.Metric) error {
elapsed := time.Since(start)
if err == nil {
if !ro.Quiet {
-log.Printf("Wrote %d metrics to output %s in %s\n",
-len(metrics), ro.Name, elapsed)
log.Printf("Output [%s] wrote batch of %d metrics in %s\n",
ro.Name, len(metrics), elapsed)
}
}
return err
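The failed-write drain in Write() above retries the backlog in full batches plus one final remainder batch, so that metrics which have not yet had a write attempt stay last and ordering is preserved. A standalone sketch of that batching arithmetic (not part of the commit; the helper name is illustrative):

package main

import "fmt"

// batchSizes mirrors the loop bounds used in RunningOutput.Write():
// bufLen/batchSize full batches, then one final batch holding the remainder.
func batchSizes(bufLen, batchSize int) []int {
    nBatches := bufLen/batchSize + 1
    sizes := make([]int, 0, nBatches)
    for i := 0; i < nBatches; i++ {
        size := batchSize
        if i == nBatches-1 {
            size = bufLen % batchSize
        }
        sizes = append(sizes, size)
    }
    return sizes
}

func main() {
    // 12 failed metrics with a batch size of 5 are retried as 5, 5, then 2.
    fmt.Println(batchSizes(12, 5)) // [5 5 2]
    // An exact multiple leaves an empty final batch; Buffer.Batch(0) simply
    // returns a zero-length slice.
    fmt.Println(batchSizes(10, 5)) // [5 5 0]
}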


@@ -2,7 +2,6 @@ package internal_models
import (
"fmt"
-"sort"
"sync"
"testing"
@@ -29,6 +28,62 @@ var next5 = []telegraf.Metric{
testutil.TestMetric(101, "metric10"),
}
// Benchmark adding metrics.
func BenchmarkRunningOutputAddWrite(b *testing.B) {
conf := &OutputConfig{
Filter: Filter{
IsActive: false,
},
}
m := &perfOutput{}
ro := NewRunningOutput("test", m, conf, 1000, 10000)
ro.Quiet = true
for n := 0; n < b.N; n++ {
ro.AddMetric(first5[0])
ro.Write()
}
}
// Benchmark adding metrics.
func BenchmarkRunningOutputAddWriteEvery100(b *testing.B) {
conf := &OutputConfig{
Filter: Filter{
IsActive: false,
},
}
m := &perfOutput{}
ro := NewRunningOutput("test", m, conf, 1000, 10000)
ro.Quiet = true
for n := 0; n < b.N; n++ {
ro.AddMetric(first5[0])
if n%100 == 0 {
ro.Write()
}
}
}
// Benchmark adding metrics.
func BenchmarkRunningOutputAddFailWrites(b *testing.B) {
conf := &OutputConfig{
Filter: Filter{
IsActive: false,
},
}
m := &perfOutput{}
m.failWrite = true
ro := NewRunningOutput("test", m, conf, 1000, 10000)
ro.Quiet = true
for n := 0; n < b.N; n++ {
ro.AddMetric(first5[0])
}
}
// Test that NameDrop filters get properly applied.
func TestRunningOutput_DropFilter(t *testing.T) {
conf := &OutputConfig{
@@ -40,7 +95,7 @@ func TestRunningOutput_DropFilter(t *testing.T) {
assert.NoError(t, conf.Filter.CompileFilter())
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
ro := NewRunningOutput("test", m, conf, 1000, 10000)
for _, metric := range first5 {
ro.AddMetric(metric)
@@ -66,7 +121,7 @@ func TestRunningOutput_PassFilter(t *testing.T) {
assert.NoError(t, conf.Filter.CompileFilter())
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
ro := NewRunningOutput("test", m, conf, 1000, 10000)
for _, metric := range first5 {
ro.AddMetric(metric)
@@ -92,7 +147,7 @@ func TestRunningOutput_TagIncludeNoMatch(t *testing.T) {
assert.NoError(t, conf.Filter.CompileFilter())
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
ro := NewRunningOutput("test", m, conf, 1000, 10000)
ro.AddMetric(first5[0])
assert.Len(t, m.Metrics(), 0)
@@ -114,7 +169,7 @@ func TestRunningOutput_TagExcludeMatch(t *testing.T) {
assert.NoError(t, conf.Filter.CompileFilter())
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
ro := NewRunningOutput("test", m, conf, 1000, 10000)
ro.AddMetric(first5[0])
assert.Len(t, m.Metrics(), 0)
@@ -136,7 +191,7 @@ func TestRunningOutput_TagExcludeNoMatch(t *testing.T) {
assert.NoError(t, conf.Filter.CompileFilter())
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
ro := NewRunningOutput("test", m, conf, 1000, 10000)
ro.AddMetric(first5[0])
assert.Len(t, m.Metrics(), 0)
@@ -158,7 +213,7 @@ func TestRunningOutput_TagIncludeMatch(t *testing.T) {
assert.NoError(t, conf.Filter.CompileFilter())
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
ro := NewRunningOutput("test", m, conf, 1000, 10000)
ro.AddMetric(first5[0])
assert.Len(t, m.Metrics(), 0)
@@ -178,7 +233,7 @@ func TestRunningOutputDefault(t *testing.T) {
}
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
ro := NewRunningOutput("test", m, conf, 1000, 10000)
for _, metric := range first5 {
ro.AddMetric(metric)
@@ -193,77 +248,6 @@ func TestRunningOutputDefault(t *testing.T) {
assert.Len(t, m.Metrics(), 10)
}
-// Test that the first metrics batch gets overwritten if there is a buffer overflow.
-func TestRunningOutputOverwrite(t *testing.T) {
-conf := &OutputConfig{
-Filter: Filter{
-IsActive: false,
-},
-}
-m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
-ro.MetricBatchSize = 1
-ro.MetricBufferLimit = 4
-for _, metric := range first5 {
-ro.AddMetric(metric)
-}
-require.Len(t, m.Metrics(), 0)
-err := ro.Write()
-require.NoError(t, err)
-require.Len(t, m.Metrics(), 4)
-var expected, actual []string
-for i, exp := range first5[1:] {
-expected = append(expected, exp.String())
-actual = append(actual, m.Metrics()[i].String())
-}
-sort.Strings(expected)
-sort.Strings(actual)
-assert.Equal(t, expected, actual)
-}
-// Test that multiple buffer overflows are handled properly.
-func TestRunningOutputMultiOverwrite(t *testing.T) {
-conf := &OutputConfig{
-Filter: Filter{
-IsActive: false,
-},
-}
-m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
-ro.MetricBatchSize = 1
-ro.MetricBufferLimit = 3
-for _, metric := range first5 {
-ro.AddMetric(metric)
-}
-for _, metric := range next5 {
-ro.AddMetric(metric)
-}
-require.Len(t, m.Metrics(), 0)
-err := ro.Write()
-require.NoError(t, err)
-require.Len(t, m.Metrics(), 3)
-var expected, actual []string
-for i, exp := range next5[2:] {
-expected = append(expected, exp.String())
-actual = append(actual, m.Metrics()[i].String())
-}
-sort.Strings(expected)
-sort.Strings(actual)
-assert.Equal(t, expected, actual)
-}
// Test that running output doesn't flush until it's full when
// FlushBufferWhenFull is set.
func TestRunningOutputFlushWhenFull(t *testing.T) {
@@ -274,12 +258,9 @@ func TestRunningOutputFlushWhenFull(t *testing.T) {
}
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
-ro.FlushBufferWhenFull = true
-ro.MetricBatchSize = 5
-ro.MetricBufferLimit = 10
ro := NewRunningOutput("test", m, conf, 6, 10)
-// Fill buffer to limit
// Fill buffer to 1 under limit
for _, metric := range first5 {
ro.AddMetric(metric)
}
@@ -289,7 +270,7 @@ func TestRunningOutputFlushWhenFull(t *testing.T) {
// add one more metric
ro.AddMetric(next5[0])
// now it flushed
-assert.Len(t, m.Metrics(), 5)
assert.Len(t, m.Metrics(), 6)
// add one more metric and write it manually
ro.AddMetric(next5[1])
@@ -308,10 +289,7 @@ func TestRunningOutputMultiFlushWhenFull(t *testing.T) {
}
m := &mockOutput{}
-ro := NewRunningOutput("test", m, conf)
-ro.FlushBufferWhenFull = true
-ro.MetricBatchSize = 4
-ro.MetricBufferLimit = 12
ro := NewRunningOutput("test", m, conf, 4, 12)
// Fill buffer past limit twice
for _, metric := range first5 {
@@ -333,12 +311,9 @@ func TestRunningOutputWriteFail(t *testing.T) {
m := &mockOutput{}
m.failWrite = true
-ro := NewRunningOutput("test", m, conf)
-ro.FlushBufferWhenFull = true
-ro.MetricBatchSize = 4
-ro.MetricBufferLimit = 12
ro := NewRunningOutput("test", m, conf, 4, 12)
-// Fill buffer past limit twice
// Fill buffer to limit twice
for _, metric := range first5 {
ro.AddMetric(metric)
}
@@ -361,6 +336,161 @@ func TestRunningOutputWriteFail(t *testing.T) {
assert.Len(t, m.Metrics(), 10)
}
// Verify that the order of points is preserved during a write failure.
func TestRunningOutputWriteFailOrder(t *testing.T) {
conf := &OutputConfig{
Filter: Filter{
IsActive: false,
},
}
m := &mockOutput{}
m.failWrite = true
ro := NewRunningOutput("test", m, conf, 100, 1000)
// add 5 metrics
for _, metric := range first5 {
ro.AddMetric(metric)
}
// no successful flush yet
assert.Len(t, m.Metrics(), 0)
// Write fails
err := ro.Write()
require.Error(t, err)
// no successful flush yet
assert.Len(t, m.Metrics(), 0)
m.failWrite = false
// add 5 more metrics
for _, metric := range next5 {
ro.AddMetric(metric)
}
err = ro.Write()
require.NoError(t, err)
// Verify that 10 metrics were written
assert.Len(t, m.Metrics(), 10)
// Verify that they are in order
expected := append(first5, next5...)
assert.Equal(t, expected, m.Metrics())
}
// Verify that the order of points is preserved during many write failures.
func TestRunningOutputWriteFailOrder2(t *testing.T) {
conf := &OutputConfig{
Filter: Filter{
IsActive: false,
},
}
m := &mockOutput{}
m.failWrite = true
ro := NewRunningOutput("test", m, conf, 5, 100)
// add 5 metrics
for _, metric := range first5 {
ro.AddMetric(metric)
}
// Write fails
err := ro.Write()
require.Error(t, err)
// no successful flush yet
assert.Len(t, m.Metrics(), 0)
// add 5 metrics
for _, metric := range next5 {
ro.AddMetric(metric)
}
// Write fails
err = ro.Write()
require.Error(t, err)
// no successful flush yet
assert.Len(t, m.Metrics(), 0)
// add 5 metrics
for _, metric := range first5 {
ro.AddMetric(metric)
}
// Write fails
err = ro.Write()
require.Error(t, err)
// no successful flush yet
assert.Len(t, m.Metrics(), 0)
// add 5 metrics
for _, metric := range next5 {
ro.AddMetric(metric)
}
// Write fails
err = ro.Write()
require.Error(t, err)
// no successful flush yet
assert.Len(t, m.Metrics(), 0)
m.failWrite = false
err = ro.Write()
require.NoError(t, err)
// Verify that 20 metrics were written
assert.Len(t, m.Metrics(), 20)
// Verify that they are in order
expected := append(first5, next5...)
expected = append(expected, first5...)
expected = append(expected, next5...)
assert.Equal(t, expected, m.Metrics())
}
// Verify that the order of points is preserved when there is a remainder
// of points for the batch.
//
// ie, with a batch size of 5:
//
// 1 2 3 4 5 6 <-- order, failed points
// 6 1 2 3 4 5 <-- order, after 1st write failure (1 2 3 4 5 was batch)
// 1 2 3 4 5 6 <-- order, after 2nd write failure, (6 was batch)
//
func TestRunningOutputWriteFailOrder3(t *testing.T) {
conf := &OutputConfig{
Filter: Filter{
IsActive: false,
},
}
m := &mockOutput{}
m.failWrite = true
ro := NewRunningOutput("test", m, conf, 5, 1000)
// add 5 metrics
for _, metric := range first5 {
ro.AddMetric(metric)
}
// no successful flush yet
assert.Len(t, m.Metrics(), 0)
// Write fails
err := ro.Write()
require.Error(t, err)
// no successful flush yet
assert.Len(t, m.Metrics(), 0)
// add and attempt to write a single metric:
ro.AddMetric(next5[0])
err = ro.Write()
require.Error(t, err)
// unset fail and write metrics
m.failWrite = false
err = ro.Write()
require.NoError(t, err)
// Verify that 6 metrics were written
assert.Len(t, m.Metrics(), 6)
// Verify that they are in order
expected := append(first5, next5[0])
assert.Equal(t, expected, m.Metrics())
}
type mockOutput struct {
sync.Mutex
@@ -408,3 +538,31 @@ func (m *mockOutput) Metrics() []telegraf.Metric {
defer m.Unlock()
return m.metrics
}
type perfOutput struct {
// if true, mock a write failure
failWrite bool
}
func (m *perfOutput) Connect() error {
return nil
}
func (m *perfOutput) Close() error {
return nil
}
func (m *perfOutput) Description() string {
return ""
}
func (m *perfOutput) SampleConfig() string {
return ""
}
func (m *perfOutput) Write(metrics []telegraf.Metric) error {
if m.failWrite {
return fmt.Errorf("Failed Write!")
}
return nil
}