Implement telegraf collecting stats on itself

closes #1348
2016-11-07 08:34:46 +00:00
parent d518d7d806
commit d71a42cd1b
26 changed files with 975 additions and 169 deletions
--- a/agent/accumulator.go
+++ b/agent/accumulator.go
@@ -2,10 +2,14 @@ package agent

 import (
 	"log"
-	"sync/atomic"
 	"time"

 	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/selfstat"
+)
+
+var (
+	NErrors = selfstat.Register("agent", "gather_errors", map[string]string{})
 )

 type MetricMaker interface {
@@ -37,8 +41,6 @@ type accumulator struct {
 	maker MetricMaker

 	precision time.Duration
-
-	errCount uint64
 }

 func (ac *accumulator) AddFields(
@@ -80,7 +82,7 @@ func (ac *accumulator) AddError(err error) {
 	if err == nil {
 		return
 	}
-	atomic.AddUint64(&ac.errCount, 1)
+	NErrors.Incr(1)
 	//TODO suppress/throttle consecutive duplicate errors?
 	log.Printf("E! Error in plugin [%s]: %s", ac.maker.Name(), err)
 }
--- a/agent/accumulator_test.go
+++ b/agent/accumulator_test.go
@@ -88,7 +88,7 @@ func TestAccAddError(t *testing.T) {
 	a.AddError(fmt.Errorf("baz"))

 	errs := bytes.Split(errBuf.Bytes(), []byte{'\n'})
-	assert.EqualValues(t, 3, a.errCount)
+	assert.EqualValues(t, int64(3), NErrors.Get())
 	require.Len(t, errs, 4) // 4 because of trailing newline
 	assert.Contains(t, string(errs[0]), "TestPlugin")
 	assert.Contains(t, string(errs[0]), "foo")
--- a/agent/agent.go
+++ b/agent/agent.go
@@ -12,6 +12,7 @@ import (
 	"github.com/influxdata/telegraf/internal"
 	"github.com/influxdata/telegraf/internal/config"
 	"github.com/influxdata/telegraf/internal/models"
+	"github.com/influxdata/telegraf/selfstat"
 )

 // Agent runs telegraf and collects data based on the given config
@@ -44,8 +45,6 @@ func NewAgent(config *config.Config) (*Agent, error) {
 // Connect connects to all configured outputs
 func (a *Agent) Connect() error {
 	for _, o := range a.Config.Outputs {
-		o.Quiet = a.Config.Agent.Quiet
-
 		switch ot := o.Output.(type) {
 		case telegraf.ServiceOutput:
 			if err := ot.Start(); err != nil {
@@ -106,24 +105,26 @@ func (a *Agent) gatherer(
 ) {
 	defer panicRecover(input)

+	GatherTime := selfstat.RegisterTiming("gather",
+		"gather_time_ns",
+		map[string]string{"input": input.Config.Name},
+	)
+
+	acc := NewAccumulator(input, metricC)
+	acc.SetPrecision(a.Config.Agent.Precision.Duration,
+		a.Config.Agent.Interval.Duration)
+
 	ticker := time.NewTicker(interval)
 	defer ticker.Stop()

 	for {
-		acc := NewAccumulator(input, metricC)
-		acc.SetPrecision(a.Config.Agent.Precision.Duration,
-			a.Config.Agent.Interval.Duration)
-		input.SetDebug(a.Config.Agent.Debug)
-		input.SetDefaultTags(a.Config.Tags)
-
 		internal.RandomSleep(a.Config.Agent.CollectionJitter.Duration, shutdown)

 		start := time.Now()
 		gatherWithTimeout(shutdown, input, acc, interval)
 		elapsed := time.Since(start)

-		log.Printf("D! Input [%s] gathered metrics, (%s interval) in %s\n",
-			input.Name(), interval, elapsed)
+		GatherTime.Incr(elapsed.Nanoseconds())

 		select {
 		case <-shutdown:
@@ -204,9 +205,6 @@ func (a *Agent) Test() error {
 		if err := input.Input.Gather(acc); err != nil {
 			return err
 		}
-		if acc.errCount > 0 {
-			return fmt.Errorf("Errors encountered during processing")
-		}

 		// Special instructions for some inputs. cpu, for example, needs to be
 		// run twice in order to return cpu usage percentages.
@@ -327,13 +325,13 @@ func (a *Agent) Run(shutdown chan struct{}) error {

 	// Start all ServicePlugins
 	for _, input := range a.Config.Inputs {
+		input.SetDefaultTags(a.Config.Tags)
 		switch p := input.Input.(type) {
 		case telegraf.ServiceInput:
 			acc := NewAccumulator(input, metricC)
 			// Service input plugins should set their own precision of their
 			// metrics.
 			acc.SetPrecision(time.Nanosecond, 0)
-			input.SetDefaultTags(a.Config.Tags)
 			if err := p.Start(acc); err != nil {
 				log.Printf("E! Service for input %s failed to start, exiting\n%s\n",
 					input.Name(), err.Error())