telegraf/agent.go

495 lines
12 KiB
Go
Raw Normal View History

package telegraf
import (
2015-10-23 17:23:08 +00:00
"crypto/rand"
"fmt"
"log"
2015-10-23 17:23:08 +00:00
"math/big"
2015-04-07 16:56:40 +00:00
"os"
"sort"
"strings"
"sync"
2015-04-07 16:23:35 +00:00
"time"
"github.com/influxdb/telegraf/internal"
"github.com/influxdb/telegraf/outputs"
"github.com/influxdb/telegraf/plugins"
"github.com/influxdb/influxdb/client/v2"
)
type runningOutput struct {
name string
output outputs.Output
}
type runningPlugin struct {
name string
plugin plugins.Plugin
config *ConfiguredPlugin
}
// Agent runs telegraf and collects data based on the given config
type Agent struct {
// Interval at which to gather information
Interval internal.Duration
// RoundInterval rounds collection interval to 'interval'.
// ie, if Interval=10s then always collect on :00, :10, :20, etc.
RoundInterval bool
// Interval at which to flush data
FlushInterval internal.Duration
// FlushRetries is the number of times to retry each data flush
FlushRetries int
// FlushJitter tells
FlushJitter internal.Duration
// TODO(cam): Remove UTC and Precision parameters, they are no longer
// valid for the agent config. Leaving them here for now for backwards-
// compatability
// Option for outputting data in UTC
UTC bool `toml:"utc"`
// Precision to write data at
// Valid values for Precision are n, u, ms, s, m, and h
Precision string
// Option for running in debug mode
Debug bool
2015-04-07 16:56:40 +00:00
Hostname string
Tags map[string]string
outputs []*runningOutput
plugins []*runningPlugin
}
// NewAgent returns an Agent struct based off the given Config
func NewAgent(config *Config) (*Agent, error) {
agent := &Agent{
Tags: make(map[string]string),
Interval: internal.Duration{10 * time.Second},
RoundInterval: true,
FlushInterval: internal.Duration{10 * time.Second},
FlushRetries: 2,
FlushJitter: internal.Duration{5 * time.Second},
}
// Apply the toml table to the agent config, overriding defaults
err := config.ApplyAgent(agent)
if err != nil {
return nil, err
}
2015-04-07 16:56:40 +00:00
if agent.Hostname == "" {
hostname, err := os.Hostname()
if err != nil {
return nil, err
}
agent.Hostname = hostname
2015-05-22 23:33:38 +00:00
}
2015-04-07 16:56:40 +00:00
agent.Tags["host"] = agent.Hostname
2015-05-22 23:33:38 +00:00
return agent, nil
}
// Connect connects to all configured outputs
func (a *Agent) Connect() error {
for _, o := range a.outputs {
switch ot := o.output.(type) {
case outputs.ServiceOutput:
if err := ot.Start(); err != nil {
log.Printf("Service for output %s failed to start, exiting\n%s\n",
o.name, err.Error())
return err
}
}
2015-09-09 21:56:10 +00:00
if a.Debug {
log.Printf("Attempting connection to output: %s\n", o.name)
}
err := o.output.Connect()
if err != nil {
log.Printf("Failed to connect to output %s, retrying in 15s\n", o.name)
time.Sleep(15 * time.Second)
err = o.output.Connect()
if err != nil {
return err
}
}
if a.Debug {
log.Printf("Successfully connected to output: %s\n", o.name)
}
}
return nil
}
// Close closes the connection to all configured outputs
func (a *Agent) Close() error {
var err error
for _, o := range a.outputs {
err = o.output.Close()
switch ot := o.output.(type) {
case outputs.ServiceOutput:
ot.Stop()
}
}
return err
}
// LoadOutputs loads the agent's outputs
func (a *Agent) LoadOutputs(filters []string, config *Config) ([]string, error) {
var names []string
for name, output := range config.OutputsDeclared() {
// Trim the ID off the output name for filtering
filtername := strings.TrimRight(name, "-0123456789")
if sliceContains(filtername, filters) || len(filters) == 0 {
if a.Debug {
log.Println("Output Enabled: ", name)
}
err := config.ApplyOutput(name, output)
if err != nil {
return nil, err
}
a.outputs = append(a.outputs, &runningOutput{name, output})
names = append(names, name)
}
}
sort.Strings(names)
return names, nil
}
// LoadPlugins loads the agent's plugins
func (a *Agent) LoadPlugins(filters []string, config *Config) ([]string, error) {
var names []string
for name, plugin := range config.PluginsDeclared() {
if sliceContains(name, filters) || len(filters) == 0 {
2015-11-18 00:57:29 +00:00
config := config.GetPluginConfig(name)
a.plugins = append(a.plugins, &runningPlugin{name, plugin, config})
names = append(names, name)
}
}
sort.Strings(names)
return names, nil
}
// gatherParallel runs the plugins that are using the same reporting interval
// as the telegraf agent.
func (a *Agent) gatherParallel(pointChan chan *client.Point) error {
var wg sync.WaitGroup
2015-09-28 20:08:28 +00:00
start := time.Now()
counter := 0
for _, plugin := range a.plugins {
if plugin.config.Interval != 0 {
continue
}
wg.Add(1)
2015-09-28 20:08:28 +00:00
counter++
go func(plugin *runningPlugin) {
defer wg.Done()
acc := NewAccumulator(plugin.config, pointChan)
acc.SetDebug(a.Debug)
acc.SetPrefix(plugin.name + "_")
acc.SetDefaultTags(a.Tags)
if err := plugin.plugin.Gather(acc); err != nil {
2015-08-24 17:25:15 +00:00
log.Printf("Error in plugin [%s]: %s", plugin.name, err)
}
}(plugin)
}
wg.Wait()
2015-09-28 20:08:28 +00:00
elapsed := time.Since(start)
log.Printf("Gathered metrics, (%s interval), from %d plugins in %s\n",
a.Interval, counter, elapsed)
return nil
}
// gatherSeparate runs the plugins that have been configured with their own
// reporting interval.
func (a *Agent) gatherSeparate(
shutdown chan struct{},
plugin *runningPlugin,
pointChan chan *client.Point,
) error {
ticker := time.NewTicker(plugin.config.Interval)
for {
var outerr error
2015-09-28 20:08:28 +00:00
start := time.Now()
acc := NewAccumulator(plugin.config, pointChan)
acc.SetDebug(a.Debug)
acc.SetPrefix(plugin.name + "_")
acc.SetDefaultTags(a.Tags)
if err := plugin.plugin.Gather(acc); err != nil {
log.Printf("Error in plugin [%s]: %s", plugin.name, err)
}
2015-09-28 20:08:28 +00:00
elapsed := time.Since(start)
log.Printf("Gathered metrics, (separate %s interval), from %s in %s\n",
plugin.config.Interval, plugin.name, elapsed)
if outerr != nil {
return outerr
}
select {
case <-shutdown:
return nil
case <-ticker.C:
continue
}
}
}
// Test verifies that we can 'Gather' from all plugins with their configured
// Config struct
func (a *Agent) Test() error {
shutdown := make(chan struct{})
defer close(shutdown)
pointChan := make(chan *client.Point)
2015-10-22 00:32:43 +00:00
// dummy receiver for the point channel
go func() {
for {
select {
case <-pointChan:
// do nothing
case <-shutdown:
return
}
}
}()
for _, plugin := range a.plugins {
acc := NewAccumulator(plugin.config, pointChan)
acc.SetDebug(true)
acc.SetPrefix(plugin.name + "_")
fmt.Printf("* Plugin: %s, Collection 1\n", plugin.name)
if plugin.config.Interval != 0 {
fmt.Printf("* Internal: %s\n", plugin.config.Interval)
}
if err := plugin.plugin.Gather(acc); err != nil {
return err
}
// Special instructions for some plugins. cpu, for example, needs to be
// run twice in order to return cpu usage percentages.
switch plugin.name {
case "cpu", "mongodb":
time.Sleep(500 * time.Millisecond)
fmt.Printf("* Plugin: %s, Collection 2\n", plugin.name)
if err := plugin.plugin.Gather(acc); err != nil {
return err
}
}
}
return nil
}
// writeOutput writes a list of points to a single output, with retries.
// Optionally takes a `done` channel to indicate that it is done writing.
func (a *Agent) writeOutput(
points []*client.Point,
ro *runningOutput,
shutdown chan struct{},
wg *sync.WaitGroup,
) {
defer wg.Done()
if len(points) == 0 {
return
}
retry := 0
retries := a.FlushRetries
start := time.Now()
for {
err := ro.output.Write(points)
if err == nil {
// Write successful
elapsed := time.Since(start)
log.Printf("Flushed %d metrics to output %s in %s\n",
len(points), ro.name, elapsed)
return
}
select {
case <-shutdown:
return
default:
if retry >= retries {
// No more retries
msg := "FATAL: Write to output [%s] failed %d times, dropping" +
" %d metrics\n"
log.Printf(msg, ro.name, retries+1, len(points))
return
} else if err != nil {
// Sleep for a retry
log.Printf("Error in output [%s]: %s, retrying in %s",
ro.name, err.Error(), a.FlushInterval.Duration)
time.Sleep(a.FlushInterval.Duration)
}
}
retry++
}
}
// flush writes a list of points to all configured outputs
func (a *Agent) flush(
points []*client.Point,
shutdown chan struct{},
wait bool,
) {
var wg sync.WaitGroup
for _, o := range a.outputs {
wg.Add(1)
go a.writeOutput(points, o, shutdown, &wg)
}
if wait {
wg.Wait()
}
}
// flusher monitors the points input channel and flushes on the minimum interval
func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) error {
// Inelegant, but this sleep is to allow the Gather threads to run, so that
// the flusher will flush after metrics are collected.
time.Sleep(time.Millisecond * 100)
2015-10-23 17:23:08 +00:00
ticker := time.NewTicker(a.FlushInterval.Duration)
points := make([]*client.Point, 0)
2015-10-23 17:23:08 +00:00
for {
select {
case <-shutdown:
log.Println("Hang on, flushing any cached points before shutdown")
a.flush(points, shutdown, true)
return nil
case <-ticker.C:
2015-10-23 17:23:08 +00:00
a.flush(points, shutdown, false)
points = make([]*client.Point, 0)
case pt := <-pointChan:
points = append(points, pt)
}
}
}
2015-10-23 17:23:08 +00:00
// jitterInterval applies the the interval jitter to the flush interval using
// crypto/rand number generator
func jitterInterval(ininterval, injitter time.Duration) time.Duration {
var jitter int64
outinterval := ininterval
if injitter.Nanoseconds() != 0 {
maxjitter := big.NewInt(injitter.Nanoseconds())
if j, err := rand.Int(rand.Reader, maxjitter); err == nil {
jitter = j.Int64()
}
outinterval = time.Duration(jitter + ininterval.Nanoseconds())
}
if outinterval.Nanoseconds() < time.Duration(500*time.Millisecond).Nanoseconds() {
log.Printf("Flush interval %s too low, setting to 500ms\n", outinterval)
outinterval = time.Duration(500 * time.Millisecond)
}
return outinterval
}
// Run runs the agent daemon, gathering every Interval
2015-04-07 16:23:58 +00:00
func (a *Agent) Run(shutdown chan struct{}) error {
var wg sync.WaitGroup
2015-10-23 17:23:08 +00:00
a.FlushInterval.Duration = jitterInterval(a.FlushInterval.Duration,
a.FlushJitter.Duration)
log.Printf("Agent Config: Interval:%s, Debug:%#v, Hostname:%#v, "+
"Flush Interval:%s\n",
a.Interval, a.Debug, a.Hostname, a.FlushInterval)
// channel shared between all plugin threads for accumulating points
pointChan := make(chan *client.Point, 1000)
// Round collection to nearest interval by sleeping
if a.RoundInterval {
i := int64(a.Interval.Duration)
time.Sleep(time.Duration(i - (time.Now().UnixNano() % i)))
}
ticker := time.NewTicker(a.Interval.Duration)
wg.Add(1)
go func() {
defer wg.Done()
if err := a.flusher(shutdown, pointChan); err != nil {
log.Printf("Flusher routine failed, exiting: %s\n", err.Error())
close(shutdown)
}
}()
for _, plugin := range a.plugins {
// Start service of any ServicePlugins
switch p := plugin.plugin.(type) {
case plugins.ServicePlugin:
if err := p.Start(); err != nil {
log.Printf("Service for plugin %s failed to start, exiting\n%s\n",
plugin.name, err.Error())
return err
}
defer p.Stop()
}
// Special handling for plugins that have their own collection interval
// configured. Default intervals are handled below with gatherParallel
if plugin.config.Interval != 0 {
wg.Add(1)
go func(plugin *runningPlugin) {
defer wg.Done()
if err := a.gatherSeparate(shutdown, plugin, pointChan); err != nil {
log.Printf(err.Error())
}
}(plugin)
}
}
defer wg.Wait()
for {
if err := a.gatherParallel(pointChan); err != nil {
log.Printf(err.Error())
}
select {
case <-shutdown:
2015-04-07 16:23:58 +00:00
return nil
case <-ticker.C:
continue
}
}
}