Create public models for telegraf metrics, accumulator, plugins

This will basically make the root directory a place for storing the major telegraf interfaces, which will make telegraf's godoc look quite a bit nicer and make it easier for contributors to look up the few data types that they actually care about. Closes #564
2016-01-27 14:21:36 -07:00
parent a822d942cd
commit 9c0d14bb60
83 changed files with 699 additions and 525 deletions
--- a/agent/accumulator.go
+++ b/agent/accumulator.go
@@ -0,0 +1,164 @@
+package agent
+
+import (
+	"fmt"
+	"log"
+	"math"
+	"sync"
+	"time"
+
+	"github.com/influxdata/telegraf/internal/models"
+
+	"github.com/influxdata/influxdb/client/v2"
+)
+
+// NewAccumulator returns an accumulator that applies inputConfig to the
+// metrics it receives and sends the resulting points on the points channel.
+func NewAccumulator(
+	inputConfig *internal_models.InputConfig,
+	points chan *client.Point,
+) *accumulator {
+	return &accumulator{
+		points:      points,
+		inputConfig: inputConfig,
+	}
+}
+
+// accumulator turns the fields and tags reported by one input into points
+// and sends them on a channel.
+type accumulator struct {
+	sync.Mutex
+
+	// points receives every point built by AddFields.
+	points chan *client.Point
+
+	// defaultTags are added to a point for keys it does not already set.
+	defaultTags map[string]string
+
+	// debug enables printing of created points and skipped-field notices.
+	debug bool
+
+	// inputConfig supplies the filters, name override, measurement
+	// prefix/suffix and tags that AddFields applies.
+	inputConfig *internal_models.InputConfig
+
+	// prefix is prepended to the measurement name when non-empty.
+	prefix string
+}
+
+// Add records a single value for measurement by storing it under the "value"
+// field and handing it to AddFields.
+func (ac *accumulator) Add(
+	measurement string,
+	value interface{},
+	tags map[string]string,
+	t ...time.Time,
+) {
+	ac.AddFields(measurement, map[string]interface{}{"value": value}, tags, t...)
+}
+
+// AddFields builds a point from measurement, fields and tags and sends it on
+// the accumulator's points channel.
+//
+// The call is a no-op when measurement or fields is empty, when the input's
+// tag filter rejects tags, or when no field survives filtering and
+// validation. The name override, measurement prefix/suffix, plugin-wide tags
+// and default tags are applied before the point is created. If t is given,
+// t[0] is used as the timestamp; otherwise the current time is used.
+//
+// A non-nil tags map is modified in place when plugin-wide or default tags
+// are added to it.
+func (ac *accumulator) AddFields(
+	measurement string,
+	fields map[string]interface{},
+	tags map[string]string,
+	t ...time.Time,
+) {
+	if len(fields) == 0 || len(measurement) == 0 {
+		return
+	}
+
+	if !ac.inputConfig.Filter.ShouldTagsPass(tags) {
+		return
+	}
+
+	// Override measurement name if set
+	if len(ac.inputConfig.NameOverride) != 0 {
+		measurement = ac.inputConfig.NameOverride
+	}
+	// Apply measurement prefix and suffix if set
+	if len(ac.inputConfig.MeasurementPrefix) != 0 {
+		measurement = ac.inputConfig.MeasurementPrefix + measurement
+	}
+	if len(ac.inputConfig.MeasurementSuffix) != 0 {
+		measurement = measurement + ac.inputConfig.MeasurementSuffix
+	}
+
+	if tags == nil {
+		tags = make(map[string]string)
+	}
+	// Apply plugin-wide tags if set
+	for k, v := range ac.inputConfig.Tags {
+		if _, ok := tags[k]; !ok {
+			tags[k] = v
+		}
+	}
+	// Apply daemon-wide tags if set
+	for k, v := range ac.defaultTags {
+		if _, ok := tags[k]; !ok {
+			tags[k] = v
+		}
+	}
+
+	result := make(map[string]interface{}, len(fields))
+	for k, v := range fields {
+		// Filter out any filtered fields
+		if !ac.inputConfig.Filter.ShouldPass(k) {
+			continue
+		}
+
+		// Validate uint64 and float64 fields before storing them, so that a
+		// rejected field never reaches the point.
+		switch val := v.(type) {
+		case uint64:
+			// InfluxDB does not support writing uint64: convert to int64 and
+			// clamp values that would overflow.
+			if val > math.MaxInt64 {
+				result[k] = int64(math.MaxInt64)
+			} else {
+				result[k] = int64(val)
+			}
+		case float64:
+			// NaN and Inf are invalid values in influxdb, skip the field
+			if math.IsNaN(val) || math.IsInf(val, 0) {
+				if ac.debug {
+					log.Printf("Measurement [%s] field [%s] has a NaN or Inf "+
+						"field, skipping",
+						measurement, k)
+				}
+				continue
+			}
+			result[k] = v
+		default:
+			result[k] = v
+		}
+	}
+	if len(result) == 0 {
+		return
+	}
+
+	var timestamp time.Time
+	if len(t) > 0 {
+		timestamp = t[0]
+	} else {
+		timestamp = time.Now()
+	}
+
+	if ac.prefix != "" {
+		measurement = ac.prefix + measurement
+	}
+
+	pt, err := client.NewPoint(measurement, tags, result, timestamp)
+	if err != nil {
+		log.Printf("Error adding point [%s]: %s\n", measurement, err.Error())
+		return
+	}
+	if ac.debug {
+		fmt.Println("> " + pt.String())
+	}
+	ac.points <- pt
+}
+
+// Debug reports whether debug output is enabled for this accumulator.
+func (ac *accumulator) Debug() bool {
+	return ac.debug
+}
+
+// SetDebug enables or disables debug output for this accumulator.
+func (ac *accumulator) SetDebug(debug bool) {
+	ac.debug = debug
+}
+
+// setDefaultTags replaces the accumulator's default tag map with tags. The
+// map is stored as given, not copied.
+func (ac *accumulator) setDefaultTags(tags map[string]string) {
+	ac.defaultTags = tags
+}
+
+// addDefaultTag sets a single default tag, allocating the default tag map
+// first when none has been set yet so the write cannot hit a nil map.
+func (ac *accumulator) addDefaultTag(key, value string) {
+	if ac.defaultTags == nil {
+		ac.defaultTags = make(map[string]string)
+	}
+	ac.defaultTags[key] = value
+}
--- a/agent/agent.go
+++ b/agent/agent.go
@@ -0,0 +1,382 @@
+package agent
+
+import (
+	cryptorand "crypto/rand"
+	"fmt"
+	"log"
+	"math/big"
+	"math/rand"
+	"os"
+	"runtime"
+	"sync"
+	"time"
+
+	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/internal/config"
+	"github.com/influxdata/telegraf/internal/models"
+
+	"github.com/influxdata/influxdb/client/v2"
+)
+
+// Agent runs telegraf and collects data based on the given config
+type Agent struct {
+	// Config holds the agent settings together with the configured inputs
+	// and outputs that the Agent methods iterate over.
+	Config *config.Config
+}
+
+// NewAgent wraps config in an Agent. When no hostname is configured, the OS
+// hostname is looked up and stored in the config; the resulting hostname is
+// also recorded as the "host" entry of config.Tags.
+func NewAgent(config *config.Config) (*Agent, error) {
+	if config.Agent.Hostname == "" {
+		name, err := os.Hostname()
+		if err != nil {
+			return nil, err
+		}
+		config.Agent.Hostname = name
+	}
+	config.Tags["host"] = config.Agent.Hostname
+
+	return &Agent{Config: config}, nil
+}
+
+// Connect connects to all configured outputs
+func (a *Agent) Connect() error {
+	for _, o := range a.Config.Outputs {
+		switch ot := o.Output.(type) {
+		case telegraf.ServiceOutput:
+			if err := ot.Start(); err != nil {
+				log.Printf("Service for output %s failed to start, exiting\n%s\n",
+					o.Name, err.Error())
+				return err
+			}
+		}
+
+		if a.Config.Agent.Debug {
+			log.Printf("Attempting connection to output: %s\n", o.Name)
+		}
+		err := o.Output.Connect()
+		if err != nil {
+			log.Printf("Failed to connect to output %s, retrying in 15s, error was '%s' \n", o.Name, err)
+			time.Sleep(15 * time.Second)
+			err = o.Output.Connect()
+			if err != nil {
+				return err
+			}
+		}
+		if a.Config.Agent.Debug {
+			log.Printf("Successfully connected to output: %s\n", o.Name)
+		}
+	}
+	return nil
+}
+
+// Close closes the connection to all configured outputs and stops every
+// output that is a telegraf.ServiceOutput. All outputs are processed even if
+// some fail to close; the first close error is returned and any later ones
+// are logged so that they are not silently lost.
+func (a *Agent) Close() error {
+	var firstErr error
+	for _, o := range a.Config.Outputs {
+		if err := o.Output.Close(); err != nil {
+			if firstErr == nil {
+				firstErr = err
+			} else {
+				log.Printf("Error closing output [%s]: %s\n", o.Name, err)
+			}
+		}
+		switch ot := o.Output.(type) {
+		case telegraf.ServiceOutput:
+			ot.Stop()
+		}
+	}
+	return firstErr
+}
+
+// panicRecover recovers from a panic raised while running input and logs the
+// panic value together with a stack dump of all goroutines, truncated to at
+// most 2048 bytes. It has to be invoked directly through defer for recover
+// to take effect.
+func panicRecover(input *internal_models.RunningInput) {
+	if err := recover(); err != nil {
+		trace := make([]byte, 2048)
+		// Stack reports how many bytes it wrote; slice to that length so the
+		// unused zero bytes of the buffer are not logged.
+		n := runtime.Stack(trace, true)
+		// %v formats any panic value, not only strings and errors.
+		log.Printf("FATAL: Input [%s] panicked: %v, Stack:\n%s\n",
+			input.Name, err, trace[:n])
+		log.Println("PLEASE REPORT THIS PANIC ON GITHUB with " +
+			"stack trace, configuration, and OS information: " +
+			"https://github.com/influxdata/telegraf/issues/new")
+	}
+}
+
+// gatherParallel runs the inputs that are using the same reporting interval
+// as the telegraf agent, i.e. those whose own Interval is zero.
+//
+// Each such input is gathered in its own goroutine, after a random sleep
+// shorter than CollectionJitter when that is positive, and the call blocks
+// until all of them have finished. Gather errors are logged; the function
+// always returns nil.
+func (a *Agent) gatherParallel(pointChan chan *client.Point) error {
+	var wg sync.WaitGroup
+
+	start := time.Now()
+	counter := 0
+	jitter := a.Config.Agent.CollectionJitter.Duration.Nanoseconds()
+	for _, input := range a.Config.Inputs {
+		if input.Config.Interval != 0 {
+			continue
+		}
+
+		wg.Add(1)
+		counter++
+		go func(input *internal_models.RunningInput) {
+			defer panicRecover(input)
+			defer wg.Done()
+
+			acc := NewAccumulator(input.Config, pointChan)
+			acc.SetDebug(a.Config.Agent.Debug)
+			acc.setDefaultTags(a.Config.Tags)
+
+			// rand.Int63n panics for n <= 0, so only sleep for a positive
+			// jitter.
+			if jitter > 0 {
+				time.Sleep(time.Duration(rand.Int63n(jitter)))
+			}
+
+			if err := input.Input.Gather(acc); err != nil {
+				log.Printf("Error in input [%s]: %s", input.Name, err)
+			}
+
+		}(input)
+	}
+
+	if counter == 0 {
+		return nil
+	}
+
+	wg.Wait()
+
+	elapsed := time.Since(start)
+	if !a.Config.Agent.Quiet {
+		log.Printf("Gathered metrics, (%s interval), from %d inputs in %s\n",
+			a.Config.Agent.Interval.Duration, counter, elapsed)
+	}
+	return nil
+}
+
+// gatherSeparate runs the inputs that have been configured with their own
+// reporting interval.
+func (a *Agent) gatherSeparate(
+	shutdown chan struct{},
+	input *internal_models.RunningInput,
+	pointChan chan *client.Point,
+) error {
+	defer panicRecover(input)
+
+	ticker := time.NewTicker(input.Config.Interval)
+
+	for {
+		var outerr error
+		start := time.Now()
+
+		acc := NewAccumulator(input.Config, pointChan)
+		acc.SetDebug(a.Config.Agent.Debug)
+		acc.setDefaultTags(a.Config.Tags)
+
+		if err := input.Input.Gather(acc); err != nil {
+			log.Printf("Error in input [%s]: %s", input.Name, err)
+		}
+
+		elapsed := time.Since(start)
+		if !a.Config.Agent.Quiet {
+			log.Printf("Gathered metrics, (separate %s interval), from %s in %s\n",
+				input.Config.Interval, input.Name, elapsed)
+		}
+
+		if outerr != nil {
+			return outerr
+		}
+
+		select {
+		case <-shutdown:
+			return nil
+		case <-ticker.C:
+			continue
+		}
+	}
+}
+
+// Test verifies that we can 'Gather' from all inputs with their configured
+// Config struct.
+//
+// Every input is gathered once with debug output enabled; the points are
+// drained by a throwaway receiver rather than written to any output. The
+// first Gather error stops the run and is returned.
+func (a *Agent) Test() error {
+	shutdown := make(chan struct{})
+	defer close(shutdown)
+	pointChan := make(chan *client.Point)
+
+	// dummy receiver for the point channel
+	go func() {
+		for {
+			select {
+			case <-pointChan:
+				// do nothing
+			case <-shutdown:
+				return
+			}
+		}
+	}()
+
+	for _, input := range a.Config.Inputs {
+		acc := NewAccumulator(input.Config, pointChan)
+		acc.SetDebug(true)
+
+		fmt.Printf("* Plugin: %s, Collection 1\n", input.Name)
+		if input.Config.Interval != 0 {
+			fmt.Printf("* Interval: %s\n", input.Config.Interval)
+		}
+
+		if err := input.Input.Gather(acc); err != nil {
+			return err
+		}
+
+		// Special instructions for some inputs. cpu, for example, needs to be
+		// run twice in order to return cpu usage percentages.
+		switch input.Name {
+		case "cpu", "mongodb", "procstat":
+			time.Sleep(500 * time.Millisecond)
+			fmt.Printf("* Plugin: %s, Collection 2\n", input.Name)
+			if err := input.Input.Gather(acc); err != nil {
+				return err
+			}
+		}
+
+	}
+	return nil
+}
+
+// flush calls Write on every configured output. The writes run concurrently,
+// one goroutine per output, and flush returns once all of them have
+// completed. Write errors are logged, not returned.
+func (a *Agent) flush() {
+	var pending sync.WaitGroup
+
+	for _, o := range a.Config.Outputs {
+		pending.Add(1)
+		go func(out *internal_models.RunningOutput) {
+			defer pending.Done()
+			if err := out.Write(); err != nil {
+				log.Printf("Error writing to output [%s]: %s\n",
+					out.Name, err.Error())
+			}
+		}(o)
+	}
+
+	pending.Wait()
+}
+
+// flusher monitors the points input channel and flushes on the flush
+// interval. Every point received on pointChan is added to all configured
+// outputs, and flush is called on each FlushInterval tick. When shutdown is
+// signalled it flushes one last time and returns nil.
+func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) error {
+	// Inelegant, but this sleep is to allow the Gather threads to run, so that
+	// the flusher will flush after metrics are collected.
+	time.Sleep(time.Millisecond * 200)
+
+	ticker := time.NewTicker(a.Config.Agent.FlushInterval.Duration)
+	// Release the ticker once the loop is left on shutdown.
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-shutdown:
+			log.Println("Hang on, flushing any cached points before shutdown")
+			a.flush()
+			return nil
+		case <-ticker.C:
+			a.flush()
+		case pt := <-pointChan:
+			for _, o := range a.Config.Outputs {
+				o.AddPoint(pt)
+			}
+		}
+	}
+}
+
+// jitterInterval applies the interval jitter to the flush interval using the
+// crypto/rand number generator.
+//
+// A random duration in [0, injitter) is added to ininterval. A zero or
+// negative injitter adds nothing, as does a failure to read random data.
+// Results below 500ms are raised to 500ms.
+func jitterInterval(ininterval, injitter time.Duration) time.Duration {
+	const minInterval = 500 * time.Millisecond
+
+	outinterval := ininterval
+	// cryptorand.Int panics when max <= 0, so only draw for a positive jitter.
+	if injitter > 0 {
+		maxjitter := big.NewInt(injitter.Nanoseconds())
+		if j, err := cryptorand.Int(cryptorand.Reader, maxjitter); err == nil {
+			outinterval += time.Duration(j.Int64())
+		}
+	}
+
+	if outinterval < minInterval {
+		log.Printf("Flush interval %s too low, setting to 500ms\n", outinterval)
+		outinterval = minInterval
+	}
+
+	return outinterval
+}
+
+// Run runs the agent daemon, gathering every Interval.
+//
+// It applies the flush jitter to the flush interval, starts the flusher
+// goroutine, starts every telegraf.ServiceInput (stopping each again when Run
+// returns), launches one goroutine per input that has its own interval, and
+// then gathers the remaining inputs on every tick until shutdown is
+// signalled. A non-nil error is returned only when a service input fails to
+// start.
+func (a *Agent) Run(shutdown chan struct{}) error {
+	var wg sync.WaitGroup
+
+	a.Config.Agent.FlushInterval.Duration = jitterInterval(
+		a.Config.Agent.FlushInterval.Duration,
+		a.Config.Agent.FlushJitter.Duration)
+
+	log.Printf("Agent Config: Interval:%s, Debug:%#v, Quiet:%#v, Hostname:%#v, "+
+		"Flush Interval:%s \n",
+		a.Config.Agent.Interval.Duration, a.Config.Agent.Debug, a.Config.Agent.Quiet,
+		a.Config.Agent.Hostname, a.Config.Agent.FlushInterval.Duration)
+
+	// channel shared between all input threads for accumulating points
+	pointChan := make(chan *client.Point, 1000)
+
+	// Round collection to nearest interval by sleeping
+	if a.Config.Agent.RoundInterval {
+		i := int64(a.Config.Agent.Interval.Duration)
+		time.Sleep(time.Duration(i - (time.Now().UnixNano() % i)))
+	}
+	ticker := time.NewTicker(a.Config.Agent.Interval.Duration)
+	// Release the ticker when Run returns.
+	defer ticker.Stop()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		if err := a.flusher(shutdown, pointChan); err != nil {
+			log.Printf("Flusher routine failed, exiting: %s\n", err.Error())
+			close(shutdown)
+		}
+	}()
+
+	for _, input := range a.Config.Inputs {
+
+		// Start service of any ServicePlugins
+		switch p := input.Input.(type) {
+		case telegraf.ServiceInput:
+			if err := p.Start(); err != nil {
+				log.Printf("Service for input %s failed to start, exiting\n%s\n",
+					input.Name, err.Error())
+				return err
+			}
+			// Deliberately deferred inside the loop: every started service
+			// is stopped when Run returns.
+			defer p.Stop()
+		}
+
+		// Special handling for inputs that have their own collection interval
+		// configured. Default intervals are handled below with gatherParallel
+		if input.Config.Interval != 0 {
+			wg.Add(1)
+			go func(input *internal_models.RunningInput) {
+				defer wg.Done()
+				if err := a.gatherSeparate(shutdown, input, pointChan); err != nil {
+					// Println, not Printf: the error text is not a format
+					// string.
+					log.Println(err)
+				}
+			}(input)
+		}
+	}
+
+	defer wg.Wait()
+
+	for {
+		if err := a.gatherParallel(pointChan); err != nil {
+			log.Println(err)
+		}
+
+		select {
+		case <-shutdown:
+			return nil
+		case <-ticker.C:
+			continue
+		}
+	}
+}
--- a/agent/agent_test.go
+++ b/agent/agent_test.go
@@ -0,0 +1,175 @@
+package agent
+
+import (
+	"github.com/stretchr/testify/assert"
+	"testing"
+	"time"
+
+	"github.com/influxdata/telegraf/internal/config"
+
+	// needing to load the plugins
+	_ "github.com/influxdata/telegraf/plugins/inputs/all"
+	// needing to load the outputs
+	_ "github.com/influxdata/telegraf/plugins/outputs/all"
+)
+
+// TestAgent_LoadPlugin checks how many inputs end up in the agent config
+// when the test configuration is loaded with different input filters.
+func TestAgent_LoadPlugin(t *testing.T) {
+	cases := []struct {
+		filters []string
+		want    int
+	}{
+		{filters: []string{"mysql"}, want: 1},
+		{filters: []string{"foo"}, want: 0},
+		{filters: []string{"mysql", "foo"}, want: 1},
+		{filters: []string{"mysql", "redis"}, want: 2},
+		{filters: []string{"mysql", "foo", "redis", "bar"}, want: 2},
+	}
+
+	for _, tc := range cases {
+		c := config.NewConfig()
+		c.InputFilters = tc.filters
+		err := c.LoadConfig("../internal/config/testdata/telegraf-agent.toml")
+		assert.NoError(t, err)
+		a, _ := NewAgent(c)
+		assert.Equal(t, tc.want, len(a.Config.Inputs))
+	}
+}
+
+// TestAgent_LoadOutput checks how many outputs end up in the agent config
+// when the test configuration is loaded with different output filters.
+func TestAgent_LoadOutput(t *testing.T) {
+	cases := []struct {
+		filters []string
+		want    int
+		// checkLoaded also asserts the count on the loaded config before the
+		// agent is created.
+		checkLoaded bool
+	}{
+		{filters: []string{"influxdb"}, want: 2},
+		{filters: []string{"kafka"}, want: 1},
+		{filters: []string{}, want: 3},
+		{filters: []string{"foo"}, want: 0},
+		{filters: []string{"influxdb", "foo"}, want: 2},
+		{filters: []string{"influxdb", "kafka"}, want: 3, checkLoaded: true},
+		{filters: []string{"influxdb", "foo", "kafka", "bar"}, want: 3},
+	}
+
+	for _, tc := range cases {
+		c := config.NewConfig()
+		c.OutputFilters = tc.filters
+		err := c.LoadConfig("../internal/config/testdata/telegraf-agent.toml")
+		assert.NoError(t, err)
+		if tc.checkLoaded {
+			assert.Equal(t, tc.want, len(c.Outputs))
+		}
+		a, _ := NewAgent(c)
+		assert.Equal(t, tc.want, len(a.Config.Outputs))
+	}
+}
+
+func TestAgent_ZeroJitter(t *testing.T) {
+	flushinterval := jitterInterval(time.Duration(10*time.Second),
+		time.Duration(0*time.Second))
+
+	actual := flushinterval.Nanoseconds()
+	exp := time.Duration(10 * time.Second).Nanoseconds()
+
+	if actual != exp {
+		t.Errorf("Actual %v, expected %v", actual, exp)
+	}
+}
+
+func TestAgent_ZeroInterval(t *testing.T) {
+	min := time.Duration(500 * time.Millisecond).Nanoseconds()
+	max := time.Duration(5 * time.Second).Nanoseconds()
+
+	for i := 0; i < 1000; i++ {
+		flushinterval := jitterInterval(time.Duration(0*time.Second),
+			time.Duration(5*time.Second))
+		actual := flushinterval.Nanoseconds()
+
+		if actual > max {
+			t.Errorf("Didn't expect interval %d to be > %d", actual, max)
+			break
+		}
+		if actual < min {
+			t.Errorf("Didn't expect interval %d to be < %d", actual, min)
+			break
+		}
+	}
+}
+
+func TestAgent_ZeroBoth(t *testing.T) {
+	flushinterval := jitterInterval(time.Duration(0*time.Second),
+		time.Duration(0*time.Second))
+
+	actual := flushinterval
+	exp := time.Duration(500 * time.Millisecond)
+
+	if actual != exp {
+		t.Errorf("Actual %v, expected %v", actual, exp)
+	}
+}
+
+func TestAgent_JitterMax(t *testing.T) {
+	max := time.Duration(32 * time.Second).Nanoseconds()
+
+	for i := 0; i < 1000; i++ {
+		flushinterval := jitterInterval(time.Duration(30*time.Second),
+			time.Duration(2*time.Second))
+		actual := flushinterval.Nanoseconds()
+		if actual > max {
+			t.Errorf("Didn't expect interval %d to be > %d", actual, max)
+			break
+		}
+	}
+}
+
+func TestAgent_JitterMin(t *testing.T) {
+	min := time.Duration(30 * time.Second).Nanoseconds()
+
+	for i := 0; i < 1000; i++ {
+		flushinterval := jitterInterval(time.Duration(30*time.Second),
+			time.Duration(2*time.Second))
+		actual := flushinterval.Nanoseconds()
+		if actual < min {
+			t.Errorf("Didn't expect interval %d to be < %d", actual, min)
+			break
+		}
+	}
+}