Add support for retrying output writes, using an independent goroutine per output

Fixes #285
2015-10-21 10:57:51 -06:00 · 2015-10-21 10:57:51 -06:00 · dfc59866e8
parent ac685d19f8
commit dfc59866e8
3 changed files with 61 additions and 29 deletions
--- a/agent.go
+++ b/agent.go
@ -34,6 +34,9 @@ type Agent struct {
 	// Interval at which to flush data
 	FlushInterval Duration
 	// FlushRetries is the number of times to retry each data flush
 	FlushRetries int
 	// TODO(cam): Remove UTC and Precision parameters, they are no longer
 	// valid for the agent config. Leaving them here for now for backwards-
 	// compatibility
@ -61,6 +64,7 @@ func NewAgent(config *Config) (*Agent, error) {
 		Config:        config,
 		Interval:      Duration{10 * time.Second},
 		FlushInterval: Duration{10 * time.Second},
 		FlushRetries:  2,
 		UTC:           true,
 		Precision:     "s",
 	}
@ -293,28 +297,56 @@ func (a *Agent) Test() error {
 	return nil
 }
-func (a *Agent) flush(points []*client.Point) {
+// writeOutput writes a list of points to a single output, with retries
-	var wg sync.WaitGroup
+func (a *Agent) writeOutput(
-
+	points []*client.Point,
 	ro *runningOutput,
 	shutdown chan struct{},
 ) {
 	retry := 0
 	retries := a.FlushRetries
 	start := time.Now()
 	counter := 0
 	for _, o := range a.outputs {
 		wg.Add(1)
 		counter++
-		go func(ro *runningOutput) {
+	for {
-			defer wg.Done()
+		err := ro.output.Write(points)
-			// Log all output errors:
+
-			if err := ro.output.Write(points); err != nil {
+		select {
-				log.Printf("Error in output [%s]: %s", ro.name, err.Error())
+		case <-shutdown:
 			return
 		default:
 			if err == nil {
 				// Write successful
 				elapsed := time.Since(start)
 				log.Printf("Flushed %d metrics to output %s in %s\n",
 					len(points), ro.name, elapsed)
 				return
 			} else if retry >= retries {
 				// No more retries
 				msg := "FATAL: Write to output [%s] failed %d times, dropping" +
 					" %d metrics\n"
 				log.Printf(msg, ro.name, retries+1, len(points))
 				return
 			} else if err != nil {
 				// Sleep for a retry
 				log.Printf("Error in output [%s]: %s, retrying in %s",
 					ro.name, err.Error(), a.FlushInterval.Duration)
 				time.Sleep(a.FlushInterval.Duration)
 			}
-		}(o)
+		}
 		retry++
 	}
 }
 // flush writes a list of points to all configured outputs
 func (a *Agent) flush(points []*client.Point, shutdown chan struct{}) {
 	if len(points) == 0 {
 		return
 	}
-	wg.Wait()
+	for _, o := range a.outputs {
-	elapsed := time.Since(start)
+		go a.writeOutput(points, o, shutdown)
-	log.Printf("Flushed %d metrics to %d output sinks in %s\n",
+	}
 		len(points), counter, elapsed)
 }
 // flusher monitors the points input channel and flushes on the minimum interval
@ -327,9 +359,11 @@ func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) er
 	for {
 		select {
 		case <-shutdown:
 			log.Println("Hang on, flushing any cached points before shutdown")
 			a.flush(points, shutdown)
 			return nil
 		case <-ticker.C:
-			a.flush(points)
+			a.flush(points, shutdown)
 			points = make([]*client.Point, 0)
 		case pt := <-pointChan:
 			points = append(points, pt)
--- a/config.go
+++ b/config.go
@ -357,6 +357,8 @@ var header = `# Telegraf configuration
  interval = "10s"
  # Default data flushing interval for all outputs
  flush_interval = "10s"
  # Number of times to retry each data flush
  flush_retries = 2
  # run telegraf in debug mode
  debug = false
  # Override default hostname, if empty use os.Hostname()
--- a/etc/config.sample.toml
+++ b/etc/config.sample.toml
@ -27,17 +27,12 @@
 [agent]
  # Default data collection interval for all plugins
  interval = "10s"
-
+  # Default data flushing interval for all outputs
-  # If utc = false, uses local time (utc is highly recommended)
+  flush_interval = "10s"
-  utc = true
+  # Number of times to retry each data flush
-
+  flush_retries = 2
  # Precision of writes, valid values are n, u, ms, s, m, and h
  # note: using second precision greatly helps InfluxDB compression
  precision = "s"
  # run telegraf in debug mode
  debug = false
  # Override default hostname, if empty use os.Hostname()
  hostname = ""
@ -54,15 +49,16 @@
  # Multiple URLs can be specified for InfluxDB cluster support. The server to
  # write to is chosen at random each interval.
  urls = ["http://localhost:8086"] # required.
  # The target database for metrics. This database must already exist
  database = "telegraf" # required.
  # Precision of writes, valid values are n, u, ms, s, m, and h
  # note: using second precision greatly helps InfluxDB compression
  precision = "s"
  # Connection timeout (for the connection with InfluxDB), formatted as a string.
  # Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".
  # If not provided, will default to 0 (no timeout)
  # timeout = "5s"
  # username = "telegraf"
  # password = "metricsmetricsmetricsmetrics"