Set a timeout for calls to input.Gather

This changes the internal behavior around running plugins. Each plugin
now has its own goroutine with its own ticker, so a hung plugin will not
block any other plugin. When a plugin is hung, we log an error message
every interval, letting users know which plugin is stuck.
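
For illustration, here is a minimal, self-contained sketch of that
pattern in Go. The `input` struct and `runInputs` helper are
hypothetical stand-ins for Telegraf's internal RunningInput and agent
plumbing; the point is that each input loops on its own ticker in its
own goroutine.

package main

import (
	"log"
	"sync"
	"time"
)

// input is a hypothetical stand-in for Telegraf's internal RunningInput:
// a name plus a Gather-like function.
type input struct {
	name   string
	gather func() error
}

// runInputs starts one goroutine per input, each with its own ticker, so
// a hung or slow gather in one plugin cannot delay any of the others.
func runInputs(shutdown chan struct{}, interval time.Duration, inputs []input) {
	var wg sync.WaitGroup
	for _, in := range inputs {
		wg.Add(1)
		go func(in input) {
			defer wg.Done()
			ticker := time.NewTicker(interval)
			defer ticker.Stop()
			for {
				if err := in.gather(); err != nil {
					log.Printf("ERROR in input [%s]: %s", in.name, err)
				}
				select {
				case <-shutdown:
					return
				case <-ticker.C:
					// fall through to the next collection
				}
			}
		}(in)
	}
	wg.Wait()
}

func main() {
	shutdown := make(chan struct{})
	go func() {
		time.Sleep(3 * time.Second)
		close(shutdown)
	}()
	runInputs(shutdown, 1*time.Second, []input{
		{name: "fast", gather: func() error { return nil }},
		// "slow" hangs well past its interval; "fast" keeps collecting anyway.
		{name: "slow", gather: func() error { time.Sleep(10 * time.Second); return nil }},
	})
}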

Currently the input interface has no method for killing a running
Gather call, so there is nothing we can do but log an "ERROR" and move
on. This at least gives some visibility into the plugin that is acting
up.
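
The timeout handling itself follows the gatherWithTimeout function added
in this commit (see the diff below). A simplified, runnable sketch of
the same pattern, with a plain `func() error` standing in for the
input's Gather: the call runs in its own goroutine, and a select loop
watches for completion, the timeout ticker, and shutdown. Each tick can
only log; the hung call is never killed or re-called.

package main

import (
	"errors"
	"log"
	"time"
)

// gatherWithTimeout runs gather in its own goroutine and waits for it.
// Each time the ticker fires before gather has returned, it logs an
// error and keeps waiting, so the hung call is visible but is never
// re-invoked on top of itself.
func gatherWithTimeout(shutdown chan struct{}, name string, gather func() error, timeout time.Duration) {
	ticker := time.NewTicker(timeout)
	defer ticker.Stop()

	done := make(chan error)
	go func() { done <- gather() }()

	for {
		select {
		case err := <-done:
			if err != nil {
				log.Printf("ERROR in input [%s]: %s", name, err)
			}
			return
		case <-ticker.C:
			log.Printf("ERROR: input [%s] took longer to collect than "+
				"collection interval (%s)", name, timeout)
		case <-shutdown:
			// The gather goroutine is left blocked on its send here,
			// mirroring the commit's behavior on shutdown.
			return
		}
	}
}

func main() {
	shutdown := make(chan struct{})
	slow := func() error {
		time.Sleep(2500 * time.Millisecond)
		return errors.New("gather eventually failed")
	}
	// Logs the "took longer" message twice, then the input's own error.
	gatherWithTimeout(shutdown, "example", slow, 1*time.Second)
}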

closes #1230
fixes #479
Author: Cameron Sparr, 2016-05-19 16:36:58 +01:00
parent 3c5c3b98df
commit 4f5d5926d9
3 changed files with 91 additions and 92 deletions


@@ -1,11 +1,20 @@
 ## v0.13.1 [unreleased]
 
+### Release Notes
+
+- Input plugin Gathers will no longer be logged by default, but a Gather for
+_each_ plugin will be logged in Debug mode.
+- Debug mode will no longer print every point added to the accumulator. This
+functionality can be duplicated using the `file` output plugin and printing
+to "stdout".
+
 ### Features
 
 - [#1138](https://github.com/influxdata/telegraf/pull/1138): nstat input plugin. Thanks @Maksadbek!
 - [#1139](https://github.com/influxdata/telegraf/pull/1139): instrumental output plugin. Thanks @jasonroelofs!
 - [#1172](https://github.com/influxdata/telegraf/pull/1172): Ceph storage stats. Thanks @robinpercy!
 - [#1233](https://github.com/influxdata/telegraf/pull/1233): Updated golint gopsutil dependency.
+- [#479](https://github.com/influxdata/telegraf/issues/479): per-plugin execution time added to debug output.
 
 ### Bugfixes
 
@@ -14,6 +23,7 @@
 - [#1215](https://github.com/influxdata/telegraf/pull/1215): Fix for possible gopsutil-dependent plugin hangs.
 - [#1228](https://github.com/influxdata/telegraf/pull/1228): Fix service plugin host tag overwrite.
 - [#1198](https://github.com/influxdata/telegraf/pull/1198): http_response: override request Host header properly
+- [#1230](https://github.com/influxdata/telegraf/issues/1230): Fix Telegraf process hangup due to a single plugin hanging.
 
 ## v0.13 [2016-05-11]


@@ -4,7 +4,6 @@ import (
 	"fmt"
 	"log"
 	"math"
-	"sync"
 	"time"
 
 	"github.com/influxdata/telegraf"
@@ -22,13 +21,13 @@ func NewAccumulator(
 }
 
 type accumulator struct {
-	sync.Mutex
-
 	metrics chan telegraf.Metric
 
 	defaultTags map[string]string
 
 	debug bool
+	// print every point added to the accumulator
+	trace bool
 
 	inputConfig *internal_models.InputConfig
@@ -152,7 +151,7 @@ func (ac *accumulator) AddFields(
 		log.Printf("Error adding point [%s]: %s\n", measurement, err.Error())
 		return
 	}
-	if ac.debug {
+	if ac.trace {
 		fmt.Println("> " + m.String())
 	}
 	ac.metrics <- m
@@ -166,6 +165,14 @@ func (ac *accumulator) SetDebug(debug bool) {
 	ac.debug = debug
 }
 
+func (ac *accumulator) Trace() bool {
+	return ac.trace
+}
+
+func (ac *accumulator) SetTrace(trace bool) {
+	ac.trace = trace
+}
+
 func (ac *accumulator) setDefaultTags(tags map[string]string) {
 	ac.defaultTags = tags
 }


@@ -102,24 +102,24 @@ func panicRecover(input *internal_models.RunningInput) {
 	}
 }
 
-// gatherParallel runs the inputs that are using the same reporting interval
-// as the telegraf agent.
-func (a *Agent) gatherParallel(metricC chan telegraf.Metric) error {
-	var wg sync.WaitGroup
-
-	start := time.Now()
-	counter := 0
-	jitter := a.Config.Agent.CollectionJitter.Duration.Nanoseconds()
-	for _, input := range a.Config.Inputs {
-		if input.Config.Interval != 0 {
-			continue
-		}
-
-		wg.Add(1)
-		counter++
-		go func(input *internal_models.RunningInput) {
-			defer panicRecover(input)
-			defer wg.Done()
+// gatherer runs the inputs that have been configured with their own
+// reporting interval.
+func (a *Agent) gatherer(
+	shutdown chan struct{},
+	input *internal_models.RunningInput,
+	interval time.Duration,
+	metricC chan telegraf.Metric,
+) error {
+	defer panicRecover(input)
+
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	jitter := a.Config.Agent.CollectionJitter.Duration.Nanoseconds()
+
+	for {
+		var outerr error
+		start := time.Now()
 
 		acc := NewAccumulator(input.Config, metricC)
 		acc.SetDebug(a.Config.Agent.Debug)
@@ -136,54 +136,12 @@ func (a *Agent) gatherParallel(metricC chan telegraf.Metric) error {
 			}
 		}
 
-			if err := input.Input.Gather(acc); err != nil {
-				log.Printf("Error in input [%s]: %s", input.Name, err)
-			}
-		}(input)
-	}
-
-	if counter == 0 {
-		return nil
-	}
-
-	wg.Wait()
-
-	elapsed := time.Since(start)
-	if !a.Config.Agent.Quiet {
-		log.Printf("Gathered metrics, (%s interval), from %d inputs in %s\n",
-			a.Config.Agent.Interval.Duration, counter, elapsed)
-	}
-
-	return nil
-}
-
-// gatherSeparate runs the inputs that have been configured with their own
-// reporting interval.
-func (a *Agent) gatherSeparate(
-	shutdown chan struct{},
-	input *internal_models.RunningInput,
-	metricC chan telegraf.Metric,
-) error {
-	defer panicRecover(input)
-
-	ticker := time.NewTicker(input.Config.Interval)
-
-	for {
-		var outerr error
-		start := time.Now()
-
-		acc := NewAccumulator(input.Config, metricC)
-		acc.SetDebug(a.Config.Agent.Debug)
-		acc.setDefaultTags(a.Config.Tags)
-
-		if err := input.Input.Gather(acc); err != nil {
-			log.Printf("Error in input [%s]: %s", input.Name, err)
-		}
-
-		elapsed := time.Since(start)
-		if !a.Config.Agent.Quiet {
-			log.Printf("Gathered metrics, (separate %s interval), from %s in %s\n",
-				input.Config.Interval, input.Name, elapsed)
-		}
+		gatherWithTimeout(shutdown, input, acc, interval)
+
+		elapsed := time.Since(start)
+		if a.Config.Agent.Debug {
+			log.Printf("Input [%s] gathered metrics, (%s interval) in %s\n",
+				input.Name, interval, elapsed)
+		}
 
 		if outerr != nil {
@@ -199,6 +157,42 @@
 	}
 }
 
+// gatherWithTimeout gathers from the given input, with the given timeout.
+// when the given timeout is reached, gatherWithTimeout logs an error message
+// but continues waiting for it to return. This is to avoid leaving behind
+// hung processes, and to prevent re-calling the same hung process over and
+// over.
+func gatherWithTimeout(
+	shutdown chan struct{},
+	input *internal_models.RunningInput,
+	acc *accumulator,
+	timeout time.Duration,
+) {
+	ticker := time.NewTicker(timeout)
+	defer ticker.Stop()
+	done := make(chan error)
+	go func() {
+		done <- input.Input.Gather(acc)
+	}()
+
+	for {
+		select {
+		case err := <-done:
+			if err != nil {
+				log.Printf("ERROR in input [%s]: %s", input.Name, err)
+			}
+			return
+		case <-ticker.C:
+			log.Printf("ERROR: input [%s] took longer to collect than "+
+				"collection interval (%s)",
+				input.Name, timeout)
+			continue
+		case <-shutdown:
+			return
+		}
+	}
+}
+
 // Test verifies that we can 'Gather' from all inputs with their configured
 // Config struct
 func (a *Agent) Test() error {
@@ -220,7 +214,7 @@ func (a *Agent) Test() error {
 	for _, input := range a.Config.Inputs {
 		acc := NewAccumulator(input.Config, metricC)
-		acc.SetDebug(true)
+		acc.SetTrace(true)
 		acc.setDefaultTags(a.Config.Tags)
 
 		fmt.Printf("* Plugin: %s, Collection 1\n", input.Name)
@@ -348,7 +342,6 @@ func (a *Agent) Run(shutdown chan struct{}) error {
 		i := int64(a.Config.Agent.Interval.Duration)
 		time.Sleep(time.Duration(i - (time.Now().UnixNano() % i)))
 	}
-	ticker := time.NewTicker(a.Config.Agent.Interval.Duration)
 
 	wg.Add(1)
 	go func() {
@@ -359,32 +352,21 @@
 		}
 	}()
 
+	wg.Add(len(a.Config.Inputs))
 	for _, input := range a.Config.Inputs {
-		// Special handling for inputs that have their own collection interval
-		// configured. Default intervals are handled below with gatherParallel
+		interval := a.Config.Agent.Interval.Duration
+		// overwrite global interval if this plugin has it's own.
 		if input.Config.Interval != 0 {
-			wg.Add(1)
-			go func(input *internal_models.RunningInput) {
-				defer wg.Done()
-				if err := a.gatherSeparate(shutdown, input, metricC); err != nil {
-					log.Printf(err.Error())
-				}
-			}(input)
+			interval = input.Config.Interval
 		}
+		go func(in *internal_models.RunningInput, interv time.Duration) {
+			defer wg.Done()
+			if err := a.gatherer(shutdown, in, interv, metricC); err != nil {
+				log.Printf(err.Error())
+			}
+		}(input, interval)
 	}
 
-	defer wg.Wait()
-
-	for {
-		if err := a.gatherParallel(metricC); err != nil {
-			log.Printf(err.Error())
-		}
-
-		select {
-		case <-shutdown:
-			return nil
-		case <-ticker.C:
-			continue
-		}
-	}
+	wg.Wait()
+	return nil
 }