Prevent possible deadlock when using aggregators (#3016)

Looping the metrics back through the same channel could result in a deadlock, by using a new channel and locking the processor we can ensure that all stages can make continual progress.
2017-07-13 15:34:21 -07:00 · 2017-07-13 15:34:21 -07:00 · 8567dfe7b1
parent 88037c8a2c
commit 8567dfe7b1
2 changed files with 35 additions and 4 deletions
--- a/agent/agent.go
+++ b/agent/agent.go
@ -247,7 +247,7 @@ func (a *Agent) flush() {
 }
 // flusher monitors the metrics input channel and flushes on the minimum interval
-func (a *Agent) flusher(shutdown chan struct{}, metricC chan telegraf.Metric) error {
+func (a *Agent) flusher(shutdown chan struct{}, metricC chan telegraf.Metric, aggC chan telegraf.Metric) error {
 	// Inelegant, but this sleep is to allow the Gather threads to run, so that
 	// the flusher will flush after metrics are collected.
 	time.Sleep(time.Millisecond * 300)
@ -291,6 +291,29 @@ func (a *Agent) flusher(shutdown chan struct{}, metricC chan telegraf.Metric) er
 		}
 	}()
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
 		for {
 			select {
 			case <-shutdown:
 				if len(aggC) > 0 {
 					// keep going until aggC is flushed
 					continue
 				}
 				return
 			case metric := <-aggC:
 				metrics := []telegraf.Metric{metric}
 				for _, processor := range a.Config.Processors {
 					metrics = processor.Apply(metrics...)
 				}
 				for _, m := range metrics {
 					outMetricC <- m
 				}
 			}
 		}
 	}()
 	ticker := time.NewTicker(a.Config.Agent.FlushInterval.Duration)
 	semaphore := make(chan struct{}, 1)
 	for {
@ -339,6 +362,7 @@ func (a *Agent) Run(shutdown chan struct{}) error {
 	// channel shared between all input threads for accumulating metrics
 	metricC := make(chan telegraf.Metric, 100)
 	aggC := make(chan telegraf.Metric, 100)
 	// Start all ServicePlugins
 	for _, input := range a.Config.Inputs {
@ -367,7 +391,7 @@ func (a *Agent) Run(shutdown chan struct{}) error {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		if err := a.flusher(shutdown, metricC); err != nil {
+		if err := a.flusher(shutdown, metricC, aggC); err != nil {
 			log.Printf("E! Flusher routine failed, exiting: %s\n", err.Error())
 			close(shutdown)
 		}
@ -377,7 +401,7 @@ func (a *Agent) Run(shutdown chan struct{}) error {
 	for _, aggregator := range a.Config.Aggregators {
 		go func(agg *models.RunningAggregator) {
 			defer wg.Done()
-			acc := NewAccumulator(agg, metricC)
+			acc := NewAccumulator(agg, aggC)
 			acc.SetPrecision(a.Config.Agent.Precision.Duration,
 				a.Config.Agent.Interval.Duration)
 			agg.Run(acc, shutdown)
--- a/internal/models/running_processor.go
+++ b/internal/models/running_processor.go
@ -1,11 +1,15 @@
 package models
 import (
 	"sync"
 	"github.com/influxdata/telegraf"
 )
 type RunningProcessor struct {
-	Name      string
+	Name string
 	sync.Mutex
 	Processor telegraf.Processor
 	Config    *ProcessorConfig
 }
@ -24,6 +28,9 @@ type ProcessorConfig struct {
 }
 func (rp *RunningProcessor) Apply(in ...telegraf.Metric) []telegraf.Metric {
 	rp.Lock()
 	defer rp.Unlock()
 	ret := []telegraf.Metric{}
 	for _, metric := range in {