Flush based on buffer size rather than time

This includes:
- Add Accumulator to the Start() function of service inputs
- For message consumer plugins, use the Accumulator to constantly add metrics and make Gather a dummy function
- Rework unit tests to match this new behavior
- Make "flush_buffer_when_full" a config option that defaults to true

closes #666
2016-02-15 17:21:38 -07:00
parent 7f539c951a
commit ee468be696
15 changed files with 271 additions and 285 deletions
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -68,7 +68,7 @@ type AgentConfig struct {
 	// same time, which can have a measurable effect on the system.
 	CollectionJitter internal.Duration

-	// Interval at which to flush data
+	// FlushInterval is the Interval at which to flush data
 	FlushInterval internal.Duration

 	// FlushJitter Jitters the flush interval by a random amount.
@@ -82,6 +82,11 @@ type AgentConfig struct {
 	// full, the oldest metrics will be overwritten.
 	MetricBufferLimit int

+	// FlushBufferWhenFull tells Telegraf to flush the metric buffer whenever
+	// it fills up, regardless of FlushInterval. Setting this option to true
+	// does _not_ deactivate FlushInterval.
+	FlushBufferWhenFull bool
+
 	// TODO(cam): Remove UTC and Precision parameters, they are no longer
 	// valid for the agent config. Leaving them here for now for backwards-
 	// compatability
@@ -157,6 +162,8 @@ var header = `##################################################################
  ### Telegraf will cache metric_buffer_limit metrics for each output, and will
  ### flush this buffer on a successful write.
  metric_buffer_limit = 10000
+  ### Flush the buffer whenever full, regardless of flush_interval.
+  flush_buffer_when_full = true

  ### Collection jitter is used to jitter the collection by a random amount.
  ### Each plugin will sleep for a random time within jitter before collecting.
@@ -421,8 +428,9 @@ func (c *Config) addOutput(name string, table *ast.Table) error {

 	ro := internal_models.NewRunningOutput(name, output, outputConfig)
 	if c.Agent.MetricBufferLimit > 0 {
-		ro.PointBufferLimit = c.Agent.MetricBufferLimit
+		ro.MetricBufferLimit = c.Agent.MetricBufferLimit
 	}
+	ro.FlushBufferWhenFull = c.Agent.FlushBufferWhenFull
 	ro.Quiet = c.Agent.Quiet
 	c.Outputs = append(c.Outputs, ro)
 	return nil
--- a/internal/models/running_output.go
+++ b/internal/models/running_output.go
@@ -2,22 +2,34 @@ package internal_models

 import (
 	"log"
+	"sync"
 	"time"

 	"github.com/influxdata/telegraf"
 )

-const DEFAULT_POINT_BUFFER_LIMIT = 10000
+const (
+	// Default number of metrics kept between flushes.
+	DEFAULT_METRIC_BUFFER_LIMIT = 10000
+
+	// Limit how many full metric buffers are kept due to failed writes.
+	FULL_METRIC_BUFFERS_LIMIT = 100
+)

 type RunningOutput struct {
-	Name             string
-	Output           telegraf.Output
-	Config           *OutputConfig
-	Quiet            bool
-	PointBufferLimit int
+	Name                string
+	Output              telegraf.Output
+	Config              *OutputConfig
+	Quiet               bool
+	MetricBufferLimit   int
+	FlushBufferWhenFull bool

-	metrics          []telegraf.Metric
-	overwriteCounter int
+	metrics    []telegraf.Metric
+	tmpmetrics map[int][]telegraf.Metric
+	overwriteI int
+	mapI       int
+
+	sync.Mutex
 }

 func NewRunningOutput(
@@ -26,47 +38,94 @@ func NewRunningOutput(
 	conf *OutputConfig,
 ) *RunningOutput {
 	ro := &RunningOutput{
-		Name:             name,
-		metrics:          make([]telegraf.Metric, 0),
-		Output:           output,
-		Config:           conf,
-		PointBufferLimit: DEFAULT_POINT_BUFFER_LIMIT,
+		Name:              name,
+		metrics:           make([]telegraf.Metric, 0),
+		tmpmetrics:        make(map[int][]telegraf.Metric),
+		Output:            output,
+		Config:            conf,
+		MetricBufferLimit: DEFAULT_METRIC_BUFFER_LIMIT,
 	}
 	return ro
 }

-func (ro *RunningOutput) AddPoint(point telegraf.Metric) {
+// AddMetric adds a metric to the output. This function can also write cached
+// points if FlushBufferWhenFull is true.
+func (ro *RunningOutput) AddMetric(metric telegraf.Metric) {
 	if ro.Config.Filter.IsActive {
-		if !ro.Config.Filter.ShouldMetricPass(point) {
+		if !ro.Config.Filter.ShouldMetricPass(metric) {
 			return
 		}
 	}
+	ro.Lock()
+	defer ro.Unlock()

-	if len(ro.metrics) < ro.PointBufferLimit {
-		ro.metrics = append(ro.metrics, point)
+	if len(ro.metrics) < ro.MetricBufferLimit {
+		ro.metrics = append(ro.metrics, metric)
 	} else {
-		log.Printf("WARNING: overwriting cached metrics, you may want to " +
-			"increase the metric_buffer_limit setting in your [agent] config " +
-			"if you do not wish to overwrite metrics.\n")
-		if ro.overwriteCounter == len(ro.metrics) {
-			ro.overwriteCounter = 0
+		if ro.FlushBufferWhenFull {
+			tmpmetrics := make([]telegraf.Metric, len(ro.metrics))
+			copy(tmpmetrics, ro.metrics)
+			ro.metrics = make([]telegraf.Metric, 0)
+			err := ro.write(tmpmetrics)
+			if err != nil {
+				log.Printf("ERROR writing full metric buffer to output %s, %s",
+					ro.Name, err)
+				if len(ro.tmpmetrics) == FULL_METRIC_BUFFERS_LIMIT {
+					ro.mapI = 0
+					// overwrite one
+					ro.tmpmetrics[ro.mapI] = tmpmetrics
+					ro.mapI++
+				} else {
+					ro.tmpmetrics[ro.mapI] = tmpmetrics
+					ro.mapI++
+				}
+			}
+		} else {
+			log.Printf("WARNING: overwriting cached metrics, you may want to " +
+				"increase the metric_buffer_limit setting in your [agent] " +
+				"config if you do not wish to overwrite metrics.\n")
+			if ro.overwriteI == len(ro.metrics) {
+				ro.overwriteI = 0
+			}
+			ro.metrics[ro.overwriteI] = metric
+			ro.overwriteI++
 		}
-		ro.metrics[ro.overwriteCounter] = point
-		ro.overwriteCounter++
 	}
 }

+// Write writes all cached points to this output.
 func (ro *RunningOutput) Write() error {
+	ro.Lock()
+	defer ro.Unlock()
+	err := ro.write(ro.metrics)
+	if err != nil {
+		return err
+	} else {
+		ro.metrics = make([]telegraf.Metric, 0)
+		ro.overwriteI = 0
+	}
+
+	// Write any cached metric buffers that failed previously
+	for i, tmpmetrics := range ro.tmpmetrics {
+		if err := ro.write(tmpmetrics); err != nil {
+			return err
+		} else {
+			delete(ro.tmpmetrics, i)
+		}
+	}
+
+	return nil
+}
+
+func (ro *RunningOutput) write(metrics []telegraf.Metric) error {
 	start := time.Now()
-	err := ro.Output.Write(ro.metrics)
+	err := ro.Output.Write(metrics)
 	elapsed := time.Since(start)
 	if err == nil {
 		if !ro.Quiet {
 			log.Printf("Wrote %d metrics to output %s in %s\n",
-				len(ro.metrics), ro.Name, elapsed)
+				len(metrics), ro.Name, elapsed)
 		}
-		ro.metrics = make([]telegraf.Metric, 0)
-		ro.overwriteCounter = 0
 	}
 	return err
 }