Implement telegraf collecting stats on itself

closes #1348
This commit is contained in:
Cameron Sparr
2016-11-07 08:34:46 +00:00
parent d518d7d806
commit d71a42cd1b
26 changed files with 975 additions and 169 deletions

View File

@@ -2,10 +2,14 @@ package agent
import (
"log"
"sync/atomic"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/selfstat"
)
var (
NErrors = selfstat.Register("agent", "gather_errors", map[string]string{})
)
type MetricMaker interface {
@@ -37,8 +41,6 @@ type accumulator struct {
maker MetricMaker
precision time.Duration
errCount uint64
}
func (ac *accumulator) AddFields(
@@ -80,7 +82,7 @@ func (ac *accumulator) AddError(err error) {
if err == nil {
return
}
atomic.AddUint64(&ac.errCount, 1)
NErrors.Incr(1)
//TODO suppress/throttle consecutive duplicate errors?
log.Printf("E! Error in plugin [%s]: %s", ac.maker.Name(), err)
}

View File

@@ -88,7 +88,7 @@ func TestAccAddError(t *testing.T) {
a.AddError(fmt.Errorf("baz"))
errs := bytes.Split(errBuf.Bytes(), []byte{'\n'})
assert.EqualValues(t, 3, a.errCount)
assert.EqualValues(t, int64(3), NErrors.Get())
require.Len(t, errs, 4) // 4 because of trailing newline
assert.Contains(t, string(errs[0]), "TestPlugin")
assert.Contains(t, string(errs[0]), "foo")

View File

@@ -12,6 +12,7 @@ import (
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/internal/config"
"github.com/influxdata/telegraf/internal/models"
"github.com/influxdata/telegraf/selfstat"
)
// Agent runs telegraf and collects data based on the given config
@@ -44,8 +45,6 @@ func NewAgent(config *config.Config) (*Agent, error) {
// Connect connects to all configured outputs
func (a *Agent) Connect() error {
for _, o := range a.Config.Outputs {
o.Quiet = a.Config.Agent.Quiet
switch ot := o.Output.(type) {
case telegraf.ServiceOutput:
if err := ot.Start(); err != nil {
@@ -106,24 +105,26 @@ func (a *Agent) gatherer(
) {
defer panicRecover(input)
GatherTime := selfstat.RegisterTiming("gather",
"gather_time_ns",
map[string]string{"input": input.Config.Name},
)
acc := NewAccumulator(input, metricC)
acc.SetPrecision(a.Config.Agent.Precision.Duration,
a.Config.Agent.Interval.Duration)
ticker := time.NewTicker(interval)
defer ticker.Stop()
for {
acc := NewAccumulator(input, metricC)
acc.SetPrecision(a.Config.Agent.Precision.Duration,
a.Config.Agent.Interval.Duration)
input.SetDebug(a.Config.Agent.Debug)
input.SetDefaultTags(a.Config.Tags)
internal.RandomSleep(a.Config.Agent.CollectionJitter.Duration, shutdown)
start := time.Now()
gatherWithTimeout(shutdown, input, acc, interval)
elapsed := time.Since(start)
log.Printf("D! Input [%s] gathered metrics, (%s interval) in %s\n",
input.Name(), interval, elapsed)
GatherTime.Incr(elapsed.Nanoseconds())
select {
case <-shutdown:
@@ -204,9 +205,6 @@ func (a *Agent) Test() error {
if err := input.Input.Gather(acc); err != nil {
return err
}
if acc.errCount > 0 {
return fmt.Errorf("Errors encountered during processing")
}
// Special instructions for some inputs. cpu, for example, needs to be
// run twice in order to return cpu usage percentages.
@@ -327,13 +325,13 @@ func (a *Agent) Run(shutdown chan struct{}) error {
// Start all ServicePlugins
for _, input := range a.Config.Inputs {
input.SetDefaultTags(a.Config.Tags)
switch p := input.Input.(type) {
case telegraf.ServiceInput:
acc := NewAccumulator(input, metricC)
// Service input plugins should set their own precision of their
// metrics.
acc.SetPrecision(time.Nanosecond, 0)
input.SetDefaultTags(a.Config.Tags)
if err := p.Start(acc); err != nil {
log.Printf("E! Service for input %s failed to start, exiting\n%s\n",
input.Name(), err.Error())