// Source: telegraf/agent/agent.go (385 lines, 9.2 KiB, Go)

package agent
import (
cryptorand "crypto/rand"
"fmt"
"log"
2015-10-23 17:23:08 +00:00
"math/big"
"math/rand"
2015-04-07 16:56:40 +00:00
"os"
"runtime"
"sync"
2015-04-07 16:23:35 +00:00
"time"
"github.com/influxdata/telegraf"
2016-01-20 18:57:35 +00:00
"github.com/influxdata/telegraf/internal/config"
"github.com/influxdata/telegraf/internal/models"
)
// Agent runs telegraf and collects data based on the given config.
//
// Its methods read the agent settings (Config.Agent), the global tags
// (Config.Tags) and the configured inputs and outputs (Config.Inputs,
// Config.Outputs) from the single Config pointer held here.
type Agent struct {
// Config is the configuration the agent was created with.
Config *config.Config
}
// NewAgent returns an Agent struct based off the given Config
func NewAgent(config *config.Config) (*Agent, error) {
a := &Agent{
Config: config,
}
if a.Config.Agent.Hostname == "" {
2015-04-07 16:56:40 +00:00
hostname, err := os.Hostname()
if err != nil {
return nil, err
}
a.Config.Agent.Hostname = hostname
}
config.Tags["host"] = a.Config.Agent.Hostname
2015-05-22 23:33:38 +00:00
return a, nil
}
// Connect connects to all configured outputs
func (a *Agent) Connect() error {
for _, o := range a.Config.Outputs {
switch ot := o.Output.(type) {
case telegraf.ServiceOutput:
if err := ot.Start(); err != nil {
log.Printf("Service for output %s failed to start, exiting\n%s\n",
o.Name, err.Error())
return err
}
}
if a.Config.Agent.Debug {
log.Printf("Attempting connection to output: %s\n", o.Name)
2015-09-09 21:56:10 +00:00
}
err := o.Output.Connect()
if err != nil {
log.Printf("Failed to connect to output %s, retrying in 15s, "+
"error was '%s' \n", o.Name, err)
time.Sleep(15 * time.Second)
err = o.Output.Connect()
if err != nil {
return err
}
}
if a.Config.Agent.Debug {
log.Printf("Successfully connected to output: %s\n", o.Name)
}
}
return nil
}
// Close closes the connection to all configured outputs and stops every
// output that also implements telegraf.ServiceOutput.
//
// All outputs are visited even if one of them fails to close. The first
// Close error encountered is returned; any further errors are logged so
// that a later successful Close cannot mask an earlier failure.
func (a *Agent) Close() error {
	var firstErr error
	for _, o := range a.Config.Outputs {
		if err := o.Output.Close(); err != nil {
			if firstErr == nil {
				firstErr = err
			} else {
				log.Printf("Error closing output [%s]: %s\n", o.Name, err)
			}
		}
		switch ot := o.Output.(type) {
		case telegraf.ServiceOutput:
			ot.Stop()
		}
	}
	return firstErr
}
// panicRecover logs a panic raised by an input together with a stack trace
// of all goroutines. It must be invoked directly via defer (as in
// `defer panicRecover(input)`) for recover to intercept the panic.
func panicRecover(input *internal_models.RunningInput) {
	if err := recover(); err != nil {
		trace := make([]byte, 2048)
		// runtime.Stack reports how many bytes it wrote; log only that
		// prefix so the unused tail of the buffer is not printed as NULs.
		n := runtime.Stack(trace, true)
		// %v rather than %s: a panic value can be of any type.
		log.Printf("FATAL: Input [%s] panicked: %v, Stack:\n%s\n",
			input.Name, err, trace[:n])
		log.Println("PLEASE REPORT THIS PANIC ON GITHUB with " +
			"stack trace, configuration, and OS information: " +
			"https://github.com/influxdata/telegraf/issues/new")
	}
}
2016-01-07 20:39:43 +00:00
// gatherParallel runs the inputs that are using the same reporting interval
// as the telegraf agent.
func (a *Agent) gatherParallel(metricC chan telegraf.Metric) error {
var wg sync.WaitGroup
2015-09-28 20:08:28 +00:00
start := time.Now()
counter := 0
jitter := a.Config.Agent.CollectionJitter.Duration.Nanoseconds()
2016-01-07 20:39:43 +00:00
for _, input := range a.Config.Inputs {
if input.Config.Interval != 0 {
continue
}
wg.Add(1)
2015-09-28 20:08:28 +00:00
counter++
go func(input *internal_models.RunningInput) {
defer panicRecover(input)
defer wg.Done()
acc := NewAccumulator(input.Config, metricC)
acc.SetDebug(a.Config.Agent.Debug)
acc.setDefaultTags(a.Config.Tags)
if jitter != 0 {
nanoSleep := rand.Int63n(jitter)
d, err := time.ParseDuration(fmt.Sprintf("%dns", nanoSleep))
if err != nil {
log.Printf("Jittering collection interval failed for plugin %s",
input.Name)
} else {
time.Sleep(d)
}
}
2016-01-07 20:39:43 +00:00
if err := input.Input.Gather(acc); err != nil {
log.Printf("Error in input [%s]: %s", input.Name, err)
}
2016-01-07 20:39:43 +00:00
}(input)
}
2015-12-01 21:05:24 +00:00
if counter == 0 {
return nil
}
wg.Wait()
2015-09-28 20:08:28 +00:00
elapsed := time.Since(start)
if !a.Config.Agent.Quiet {
log.Printf("Gathered metrics, (%s interval), from %d inputs in %s\n",
a.Config.Agent.Interval.Duration, counter, elapsed)
}
return nil
}
2016-01-07 20:39:43 +00:00
// gatherSeparate runs the inputs that have been configured with their own
// reporting interval.
func (a *Agent) gatherSeparate(
shutdown chan struct{},
input *internal_models.RunningInput,
metricC chan telegraf.Metric,
) error {
defer panicRecover(input)
2016-01-07 20:39:43 +00:00
ticker := time.NewTicker(input.Config.Interval)
for {
var outerr error
2015-09-28 20:08:28 +00:00
start := time.Now()
acc := NewAccumulator(input.Config, metricC)
acc.SetDebug(a.Config.Agent.Debug)
acc.setDefaultTags(a.Config.Tags)
2016-01-07 20:39:43 +00:00
if err := input.Input.Gather(acc); err != nil {
log.Printf("Error in input [%s]: %s", input.Name, err)
}
2015-09-28 20:08:28 +00:00
elapsed := time.Since(start)
if !a.Config.Agent.Quiet {
log.Printf("Gathered metrics, (separate %s interval), from %s in %s\n",
input.Config.Interval, input.Name, elapsed)
}
if outerr != nil {
return outerr
}
select {
case <-shutdown:
return nil
case <-ticker.C:
continue
}
}
}
2016-01-07 20:39:43 +00:00
// Test verifies that we can 'Gather' from all inputs with their configured
// Config struct
func (a *Agent) Test() error {
shutdown := make(chan struct{})
defer close(shutdown)
metricC := make(chan telegraf.Metric)
2015-10-22 00:32:43 +00:00
// dummy receiver for the point channel
go func() {
for {
select {
case <-metricC:
2015-10-22 00:32:43 +00:00
// do nothing
case <-shutdown:
return
}
}
}()
2016-01-07 20:39:43 +00:00
for _, input := range a.Config.Inputs {
acc := NewAccumulator(input.Config, metricC)
acc.SetDebug(true)
2016-01-07 20:39:43 +00:00
fmt.Printf("* Plugin: %s, Collection 1\n", input.Name)
if input.Config.Interval != 0 {
fmt.Printf("* Internal: %s\n", input.Config.Interval)
}
2016-01-07 20:39:43 +00:00
if err := input.Input.Gather(acc); err != nil {
return err
}
2016-01-07 20:39:43 +00:00
// Special instructions for some inputs. cpu, for example, needs to be
// run twice in order to return cpu usage percentages.
2016-01-07 20:39:43 +00:00
switch input.Name {
case "cpu", "mongodb", "procstat":
time.Sleep(500 * time.Millisecond)
2016-01-07 20:39:43 +00:00
fmt.Printf("* Plugin: %s, Collection 2\n", input.Name)
if err := input.Input.Gather(acc); err != nil {
return err
}
}
}
return nil
}
// flush asks every configured output to write its buffered metrics. The
// writes run concurrently, one goroutine per output, and flush returns only
// after all of them have completed. Write errors are logged, not returned.
func (a *Agent) flush() {
	outputs := a.Config.Outputs

	var pending sync.WaitGroup
	pending.Add(len(outputs))

	for _, out := range outputs {
		go func(ro *internal_models.RunningOutput) {
			defer pending.Done()
			if writeErr := ro.Write(); writeErr != nil {
				log.Printf("Error writing to output [%s]: %s\n",
					ro.Name, writeErr.Error())
			}
		}(out)
	}

	pending.Wait()
}
// flusher monitors the metrics input channel and flushes on the minimum interval
func (a *Agent) flusher(shutdown chan struct{}, metricC chan telegraf.Metric) error {
// Inelegant, but this sleep is to allow the Gather threads to run, so that
// the flusher will flush after metrics are collected.
time.Sleep(time.Millisecond * 200)
2015-10-23 17:23:08 +00:00
ticker := time.NewTicker(a.Config.Agent.FlushInterval.Duration)
2015-10-23 17:23:08 +00:00
for {
select {
case <-shutdown:
log.Println("Hang on, flushing any cached metrics before shutdown")
a.flush()
return nil
case <-ticker.C:
a.flush()
case m := <-metricC:
for _, o := range a.Config.Outputs {
o.AddMetric(m)
}
}
}
}
2015-10-23 17:23:08 +00:00
// jitterInterval applies the interval jitter to the flush interval using
// the crypto/rand number generator.
//
// When injitter is positive, a random duration in [0, injitter) is added to
// ininterval. A zero or negative injitter adds nothing. If the random source
// fails, no jitter is added. The result is never lower than 500ms: smaller
// values are logged and raised to that floor.
func jitterInterval(ininterval, injitter time.Duration) time.Duration {
	const minInterval = 500 * time.Millisecond

	outinterval := ininterval
	// crypto/rand.Int panics when max <= 0, so only positive jitter is used.
	if injitter > 0 {
		maxjitter := big.NewInt(injitter.Nanoseconds())
		if j, err := cryptorand.Int(cryptorand.Reader, maxjitter); err == nil {
			outinterval += time.Duration(j.Int64())
		}
	}

	if outinterval < minInterval {
		log.Printf("Flush interval %s too low, setting to 500ms\n", outinterval)
		outinterval = minInterval
	}

	return outinterval
}
// Run runs the agent daemon, gathering every Interval
2015-04-07 16:23:58 +00:00
func (a *Agent) Run(shutdown chan struct{}) error {
var wg sync.WaitGroup
a.Config.Agent.FlushInterval.Duration = jitterInterval(
a.Config.Agent.FlushInterval.Duration,
a.Config.Agent.FlushJitter.Duration)
2015-10-23 17:23:08 +00:00
log.Printf("Agent Config: Interval:%s, Debug:%#v, Quiet:%#v, Hostname:%#v, "+
"Flush Interval:%s \n",
a.Config.Agent.Interval.Duration, a.Config.Agent.Debug, a.Config.Agent.Quiet,
2016-01-07 20:39:43 +00:00
a.Config.Agent.Hostname, a.Config.Agent.FlushInterval.Duration)
2015-10-23 17:23:08 +00:00
// channel shared between all input threads for accumulating metrics
metricC := make(chan telegraf.Metric, 10000)
// Round collection to nearest interval by sleeping
if a.Config.Agent.RoundInterval {
i := int64(a.Config.Agent.Interval.Duration)
time.Sleep(time.Duration(i - (time.Now().UnixNano() % i)))
}
ticker := time.NewTicker(a.Config.Agent.Interval.Duration)
wg.Add(1)
go func() {
defer wg.Done()
if err := a.flusher(shutdown, metricC); err != nil {
log.Printf("Flusher routine failed, exiting: %s\n", err.Error())
close(shutdown)
}
}()
2016-01-07 20:39:43 +00:00
for _, input := range a.Config.Inputs {
// Start service of any ServicePlugins
2016-01-07 20:39:43 +00:00
switch p := input.Input.(type) {
case telegraf.ServiceInput:
acc := NewAccumulator(input.Config, metricC)
acc.SetDebug(a.Config.Agent.Debug)
acc.setDefaultTags(a.Config.Tags)
if err := p.Start(acc); err != nil {
2016-01-07 20:39:43 +00:00
log.Printf("Service for input %s failed to start, exiting\n%s\n",
input.Name, err.Error())
return err
}
defer p.Stop()
}
2016-01-07 20:39:43 +00:00
// Special handling for inputs that have their own collection interval
// configured. Default intervals are handled below with gatherParallel
2016-01-07 20:39:43 +00:00
if input.Config.Interval != 0 {
wg.Add(1)
go func(input *internal_models.RunningInput) {
defer wg.Done()
if err := a.gatherSeparate(shutdown, input, metricC); err != nil {
log.Printf(err.Error())
}
2016-01-07 20:39:43 +00:00
}(input)
}
}
defer wg.Wait()
for {
if err := a.gatherParallel(metricC); err != nil {
log.Printf(err.Error())
}
select {
case <-shutdown:
2015-04-07 16:23:58 +00:00
return nil
case <-ticker.C:
continue
}
}
}