Add support for retrying output writes, using independent threads

Fixes #285
Cameron Sparr 2015-10-21 10:57:51 -06:00
parent ac685d19f8
commit dfc59866e8
3 changed files with 61 additions and 29 deletions


@@ -34,6 +34,9 @@ type Agent struct {
 	// Interval at which to flush data
 	FlushInterval Duration
 
+	// FlushRetries is the number of times to retry each data flush
+	FlushRetries int
+
 	// TODO(cam): Remove UTC and Precision parameters, they are no longer
 	// valid for the agent config. Leaving them here for now for backwards-
 	// compatibility
@@ -61,6 +64,7 @@ func NewAgent(config *Config) (*Agent, error) {
 		Config:        config,
 		Interval:      Duration{10 * time.Second},
 		FlushInterval: Duration{10 * time.Second},
+		FlushRetries:  2,
 		UTC:           true,
 		Precision:     "s",
 	}
@@ -293,28 +297,56 @@ func (a *Agent) Test() error {
 	return nil
 }
 
-func (a *Agent) flush(points []*client.Point) {
-	var wg sync.WaitGroup
-
+// writeOutput writes a list of points to a single output, with retries
+func (a *Agent) writeOutput(
+	points []*client.Point,
+	ro *runningOutput,
+	shutdown chan struct{},
+) {
+	retry := 0
+	retries := a.FlushRetries
 	start := time.Now()
-	counter := 0
-	for _, o := range a.outputs {
-		wg.Add(1)
-		counter++
-		go func(ro *runningOutput) {
-			defer wg.Done()
-			// Log all output errors:
-			if err := ro.output.Write(points); err != nil {
-				log.Printf("Error in output [%s]: %s", ro.name, err.Error())
-			}
-		}(o)
-	}
 
-	wg.Wait()
-	elapsed := time.Since(start)
-	log.Printf("Flushed %d metrics to %d output sinks in %s\n",
-		len(points), counter, elapsed)
+	for {
+		err := ro.output.Write(points)
+
+		select {
+		case <-shutdown:
+			return
+		default:
+			if err == nil {
+				// Write successful
+				elapsed := time.Since(start)
+				log.Printf("Flushed %d metrics to output %s in %s\n",
+					len(points), ro.name, elapsed)
+				return
+			} else if retry >= retries {
+				// No more retries
+				msg := "FATAL: Write to output [%s] failed %d times, dropping" +
+					" %d metrics\n"
+				log.Printf(msg, ro.name, retries+1, len(points))
+				return
+			} else if err != nil {
+				// Sleep for a retry
+				log.Printf("Error in output [%s]: %s, retrying in %s",
+					ro.name, err.Error(), a.FlushInterval.Duration)
+				time.Sleep(a.FlushInterval.Duration)
+			}
+		}
+
+		retry++
+	}
+}
+
+// flush writes a list of points to all configured outputs
+func (a *Agent) flush(points []*client.Point, shutdown chan struct{}) {
+	if len(points) == 0 {
+		return
+	}
+
+	for _, o := range a.outputs {
+		go a.writeOutput(points, o, shutdown)
+	}
 }
 
 // flusher monitors the points input channel and flushes on the minimum interval
@@ -327,9 +359,11 @@ func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) error {
 	for {
 		select {
 		case <-shutdown:
+			log.Println("Hang on, flushing any cached points before shutdown")
+			a.flush(points, shutdown)
 			return nil
 		case <-ticker.C:
-			a.flush(points)
+			a.flush(points, shutdown)
 			points = make([]*client.Point, 0)
 		case pt := <-pointChan:
 			points = append(points, pt)
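
A standalone sketch of the pattern the new writeOutput/flush pair implements may help when reading the hunks above: each output gets its own goroutine, a failed write is retried a fixed number of times with a pause between attempts, and a shutdown channel cuts the loop short. The Output interface, the flaky type, and all names below are illustrative stand-ins rather than Telegraf's actual types, and the WaitGroup exists only so the demo exits (the agent's flush deliberately does not wait on its outputs).

package main

import (
	"errors"
	"log"
	"sync"
	"time"
)

// Output is a stand-in for the one method of an output that matters here:
// a single Write call that may fail.
type Output interface {
	Name() string
	Write(points []string) error
}

// writeWithRetries writes one batch to one output, retrying up to `retries`
// times and sleeping `interval` between attempts. Closing `shutdown` aborts
// the loop between attempts.
func writeWithRetries(o Output, points []string, retries int, interval time.Duration, shutdown chan struct{}) {
	for attempt := 0; ; attempt++ {
		err := o.Write(points)

		select {
		case <-shutdown:
			return
		default:
			if err == nil {
				log.Printf("flushed %d points to %s", len(points), o.Name())
				return
			}
			if attempt >= retries {
				log.Printf("dropping %d points for %s after %d attempts", len(points), o.Name(), attempt+1)
				return
			}
			log.Printf("error writing to %s: %v, retrying in %s", o.Name(), err, interval)
			time.Sleep(interval)
		}
	}
}

// flushAll fans the same batch out to every output in its own goroutine,
// so one slow or failing output cannot delay the others.
func flushAll(outputs []Output, points []string, shutdown chan struct{}) {
	var wg sync.WaitGroup
	for _, o := range outputs {
		wg.Add(1)
		go func(o Output) {
			defer wg.Done()
			writeWithRetries(o, points, 2, 100*time.Millisecond, shutdown)
		}(o)
	}
	wg.Wait() // demo only; the agent's flush does not block on its outputs
}

// flaky fails its first few writes, then succeeds.
type flaky struct {
	name     string
	failures int
}

func (f *flaky) Name() string { return f.name }

func (f *flaky) Write(points []string) error {
	if f.failures > 0 {
		f.failures--
		return errors.New("connection refused")
	}
	return nil
}

func main() {
	shutdown := make(chan struct{})
	outputs := []Output{
		&flaky{name: "influxdb", failures: 1}, // succeeds on the first retry
		&flaky{name: "datadog", failures: 5},  // exhausts its retries and drops the batch
	}
	flushAll(outputs, []string{"cpu usage_idle=99"}, shutdown)
}

With the new default of FlushRetries = 2, a persistently failing output is attempted three times in total (retries+1, matching the FATAL log message) before its batch is dropped, while the other outputs keep flushing independently.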


@@ -357,6 +357,8 @@ var header = `# Telegraf configuration
 interval = "10s"
 # Default data flushing interval for all outputs
 flush_interval = "10s"
+# Number of times to retry each data flush
+flush_retries = 2
 # run telegraf in debug mode
 debug = false
 # Override default hostname, if empty use os.Hostname()


@@ -27,17 +27,12 @@
 [agent]
   # Default data collection interval for all plugins
   interval = "10s"
-
-  # If utc = false, uses local time (utc is highly recommended)
-  utc = true
-
-  # Precision of writes, valid values are n, u, ms, s, m, and h
-  # note: using second precision greatly helps InfluxDB compression
-  precision = "s"
-
   # Default data flushing interval for all outputs
   flush_interval = "10s"
+  # Number of times to retry each data flush
+  flush_retries = 2
   # run telegraf in debug mode
   debug = false
   # Override default hostname, if empty use os.Hostname()
   hostname = ""
@@ -54,15 +49,16 @@
   # Multiple urls can be specified for InfluxDB cluster support. Server to
   # write to will be randomly chosen each interval.
   urls = ["http://localhost:8086"] # required.
-
   # The target database for metrics. This database must already exist
   database = "telegraf" # required.
-
+  # Precision of writes, valid values are n, u, ms, s, m, and h
+  # note: using second precision greatly helps InfluxDB compression
+  precision = "s"
 
   # Connection timeout (for the connection with InfluxDB), formatted as a string.
   # Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".
   # If not provided, will default to 0 (no timeout)
   # timeout = "5s"
   # username = "telegraf"
   # password = "metricsmetricsmetricsmetrics"
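
For completeness, here is a rough sketch of how the flush_retries and flush_interval keys shown in the sample [agent] section map onto struct fields like the new FlushRetries. It uses the github.com/BurntSushi/toml package and simplified stand-in types purely for illustration; it is not Telegraf's actual config loader, and durations are decoded as strings and parsed afterwards to stay independent of Telegraf's internal Duration type.

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/BurntSushi/toml"
)

// agentConfig loosely mirrors the agent fields this commit touches.
type agentConfig struct {
	Interval      string `toml:"interval"`
	FlushInterval string `toml:"flush_interval"`
	FlushRetries  int    `toml:"flush_retries"`
	Debug         bool   `toml:"debug"`
}

const sample = `
[agent]
  interval = "10s"
  flush_interval = "10s"
  flush_retries = 2
  debug = false
`

func main() {
	var conf struct {
		Agent agentConfig `toml:"agent"`
	}
	// Decode the TOML table into the struct via the field tags.
	if _, err := toml.Decode(sample, &conf); err != nil {
		log.Fatal(err)
	}
	flush, err := time.ParseDuration(conf.Agent.FlushInterval)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("flush every %s, retrying failed writes %d times\n", flush, conf.Agent.FlushRetries)
}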