Performance refactor of running_output buffers

closes #914
closes #967
This commit is contained in:
Cameron Sparr
2016-04-25 17:49:06 -06:00
parent 1c4043ab39
commit 4de75ce621
7 changed files with 587 additions and 206 deletions

View File

@@ -2,14 +2,13 @@ package internal_models
import (
"log"
"sync"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal/buffer"
)
const (
// Default number of metrics in a batch.
DEFAULT_METRIC_BATCH_SIZE = 1000
@@ -17,40 +16,40 @@ const (
DEFAULT_METRIC_BUFFER_LIMIT = 10000
)
// tmpmetrics points to the batches of metrics ready to be written to the output.
// readI points to the oldest batch of metrics (the first to be sent to the
// output). It may point to a nil value if tmpmetrics is empty.
// writeI points to the next slot used to buffer a batch of metrics if the
// output fails to write.
// RunningOutput contains the output configuration along with the metrics that
// are waiting to be written to that output.
//
// NOTE(review): this listing appears to carry both the pre-refactor and the
// post-refactor field sets back to back, which is why several fields are
// declared twice — confirm which set is current before relying on it.
type RunningOutput struct {
Name string
Output telegraf.Output
Config *OutputConfig
Quiet bool
MetricBufferLimit int
MetricBatchSize int
// FlushBufferWhenFull makes AddMetric attempt a write as soon as a batch
// fills up, instead of caching the full batch straight away.
FlushBufferWhenFull bool
Name string
Output telegraf.Output
Config *OutputConfig
Quiet bool
MetricBufferLimit int
MetricBatchSize int
// metrics is the batch currently being filled by AddMetric.
metrics []telegraf.Metric
// tmpmetrics caches full batches that have not been written yet.
tmpmetrics []([]telegraf.Metric)
// writeI is the tmpmetrics slot that the next cached batch goes into.
writeI int
// readI is the tmpmetrics slot holding the oldest cached batch.
readI int
// Embedded mutex; AddMetric and Write lock it around their buffer handling.
sync.Mutex
// metrics collects incoming metrics until a batch can be handed to write.
metrics *buffer.Buffer
// failMetrics keeps batches whose write returned an error so that Write can
// retry them later.
failMetrics *buffer.Buffer
}
// NewRunningOutput builds a RunningOutput for the given name, output and
// config. A batchSize or bufferLimit of 0 is replaced by
// DEFAULT_METRIC_BATCH_SIZE or DEFAULT_METRIC_BUFFER_LIMIT respectively.
//
// NOTE(review): the literal below lists metrics, MetricBufferLimit and
// MetricBatchSize twice; this looks like pre- and post-refactor lines shown
// together — confirm which initializers are current.
func NewRunningOutput(
name string,
output telegraf.Output,
conf *OutputConfig,
batchSize int,
bufferLimit int,
) *RunningOutput {
// Zero means "not configured": fall back to the package defaults.
if bufferLimit == 0 {
bufferLimit = DEFAULT_METRIC_BUFFER_LIMIT
}
if batchSize == 0 {
batchSize = DEFAULT_METRIC_BATCH_SIZE
}
ro := &RunningOutput{
Name: name,
metrics: make([]telegraf.Metric, 0),
// metrics is created with batchSize and failMetrics with bufferLimit;
// presumably these are buffer capacities — verify against buffer.NewBuffer.
metrics: buffer.NewBuffer(batchSize),
failMetrics: buffer.NewBuffer(bufferLimit),
Output: output,
Config: conf,
MetricBufferLimit: DEFAULT_METRIC_BUFFER_LIMIT,
MetricBatchSize: DEFAULT_METRIC_BATCH_SIZE,
MetricBufferLimit: bufferLimit,
MetricBatchSize: batchSize,
}
return ro
}
@@ -63,19 +62,6 @@ func (ro *RunningOutput) AddMetric(metric telegraf.Metric) {
return
}
}
ro.Lock()
defer ro.Unlock()
if ro.tmpmetrics == nil {
size := ro.MetricBufferLimit / ro.MetricBatchSize
// ro.metrics already contains one batch
size = size - 1
if size < 1 {
size = 1
}
ro.tmpmetrics = make([]([]telegraf.Metric), size)
}
// Filter any tagexclude/taginclude parameters before adding metric
if len(ro.Config.Filter.TagExclude) != 0 || len(ro.Config.Filter.TagInclude) != 0 {
@@ -90,69 +76,64 @@ func (ro *RunningOutput) AddMetric(metric telegraf.Metric) {
metric, _ = telegraf.NewMetric(name, tags, fields, t)
}
if len(ro.metrics) < ro.MetricBatchSize {
ro.metrics = append(ro.metrics, metric)
} else {
flushSuccess := true
if ro.FlushBufferWhenFull {
err := ro.write(ro.metrics)
if err != nil {
log.Printf("ERROR writing full metric buffer to output %s, %s",
ro.Name, err)
flushSuccess = false
}
} else {
flushSuccess = false
ro.metrics.Add(metric)
if ro.metrics.Len() == ro.MetricBatchSize {
batch := ro.metrics.Batch(ro.MetricBatchSize)
err := ro.write(batch)
if err != nil {
ro.failMetrics.Add(batch...)
}
if !flushSuccess {
if ro.tmpmetrics[ro.writeI] != nil && ro.writeI == ro.readI {
log.Printf("WARNING: overwriting cached metrics, you may want to " +
"increase the metric_buffer_limit setting in your [agent] " +
"config if you do not wish to overwrite metrics.\n")
ro.readI = (ro.readI + 1) % cap(ro.tmpmetrics)
}
ro.tmpmetrics[ro.writeI] = ro.metrics
ro.writeI = (ro.writeI + 1) % cap(ro.tmpmetrics)
}
ro.metrics = make([]telegraf.Metric, 0)
ro.metrics = append(ro.metrics, metric)
}
}
// Write writes all cached points to this output: previously failed batches
// first, then the batch that is currently being collected. It returns the
// error reported by the underlying write, or nil when everything was written.
//
// NOTE(review): this listing appears to interleave the pre-refactor
// (tmpmetrics ring) and post-refactor (buffer.Buffer) statements — confirm
// which lines are current before changing the logic.
func (ro *RunningOutput) Write() error {
ro.Lock()
defer ro.Unlock()
// Lazily size the ring of cached batches from the configured limits.
if ro.tmpmetrics == nil {
size := ro.MetricBufferLimit / ro.MetricBatchSize
// ro.metrics already contains one batch
size = size - 1
// Always keep at least one slot for a cached batch.
if size < 1 {
size = 1
}
ro.tmpmetrics = make([]([]telegraf.Metric), size)
// Report buffer usage and running totals unless logging is silenced.
if !ro.Quiet {
log.Printf("Output [%s] buffer fullness: %d / %d metrics. "+
"Total gathered metrics: %d. Total dropped metrics: %d.",
ro.Name,
ro.failMetrics.Len()+ro.metrics.Len(),
ro.MetricBufferLimit,
ro.metrics.Total(),
ro.metrics.Drops()+ro.failMetrics.Drops())
}
// Write any cached metric buffers first, as those metrics are the
// oldest.
for ro.tmpmetrics[ro.readI] != nil {
if err := ro.write(ro.tmpmetrics[ro.readI]); err != nil {
return err
} else {
// The batch was written: free its slot and advance to the next one,
// wrapping around at the end of the ring.
ro.tmpmetrics[ro.readI] = nil
ro.readI = (ro.readI + 1) % cap(ro.tmpmetrics)
var err error
if !ro.failMetrics.IsEmpty() {
bufLen := ro.failMetrics.Len()
// how many batches of failed writes we need to write.
// NOTE(review): when bufLen is an exact multiple of MetricBatchSize the
// final iteration asks for a batch of size 0 — confirm that
// failMetrics.Batch(0) is harmless.
nBatches := bufLen/ro.MetricBatchSize + 1
batchSize := ro.MetricBatchSize
for i := 0; i < nBatches; i++ {
// If it's the last batch, only grab the metrics that have not had
// a write attempt already (this is primarily to preserve order).
if i == nBatches-1 {
batchSize = bufLen % ro.MetricBatchSize
}
batch := ro.failMetrics.Batch(batchSize)
// If we've already failed previous writes, don't bother trying to
// write to this output again. We are not exiting the loop just so
// that we can rotate the metrics to preserve order.
if err == nil {
err = ro.write(batch)
}
// Anything that was not written goes back into the failed buffer.
if err != nil {
ro.failMetrics.Add(batch...)
}
}
}
err := ro.write(ro.metrics)
// Take the metrics collected so far as one batch.
batch := ro.metrics.Batch(ro.MetricBatchSize)
// see comment above about not trying to write to an already failed output.
// if ro.failMetrics is empty then err will always be nil at this point.
if err == nil {
err = ro.write(batch)
}
if err != nil {
// Keep the unwritten batch for a later retry and report the failure.
ro.failMetrics.Add(batch...)
return err
} else {
ro.metrics = make([]telegraf.Metric, 0)
}
return nil
}
@@ -165,8 +146,8 @@ func (ro *RunningOutput) write(metrics []telegraf.Metric) error {
elapsed := time.Since(start)
if err == nil {
if !ro.Quiet {
log.Printf("Wrote %d metrics to output %s in %s\n",
len(metrics), ro.Name, elapsed)
log.Printf("Output [%s] wrote batch of %d metrics in %s\n",
ro.Name, len(metrics), elapsed)
}
}
return err