// telegraf/poller/poller.go

package poller

import (
"errors"
"fmt"
"log"
"os"
"runtime"
"sync"
"time"

influxconfig "github.com/influxdata/config"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/agent"
"github.com/influxdata/telegraf/internal/config"
"github.com/influxdata/telegraf/internal/models"
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/influxdata/telegraf/plugins/parsers"
"github.com/influxdata/toml"
"github.com/influxdata/toml/ast"
"github.com/streadway/amqp"
)

// Poller runs telegraf and collects data based on the given config
type Poller struct {
Config *config.Config // parsed telegraf/poller configuration
AMQPconn *amqp.Connection // connection to the AMQP broker
AMQPchannel *amqp.Channel // channel used to consume the task queues
rawTasks chan []byte // raw task bodies received from the queues
}

// NewPoller returns a Poller struct based on the given Config
func NewPoller(config *config.Config) (*Poller, error) {
p := &Poller{
Config: config,
}
if p.Config.Poller.Hostname == "" {
hostname, err := os.Hostname()
if err != nil {
return nil, err
}
p.Config.Poller.Hostname = hostname
}
config.Tags["host"] = p.Config.Poller.Hostname
return p, nil
}
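
// Typical lifecycle (illustrative sketch; the variable names below are the
// caller's own, not part of this package):
//
//	p, err := poller.NewPoller(cfg)
//	if err != nil {
//		log.Fatal(err)
//	}
//	if err := p.Connect(); err != nil { // connect the configured outputs
//		log.Fatal(err)
//	}
//	shutdown := make(chan struct{})
//	p.Run(shutdown) // dials AMQP itself and blocks until shutdown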

// getTask consumes tasks from the named queue and forwards their bodies on
// taskC. It blocks for the lifetime of the consumer.
func (p *Poller) getTask(queueName string, consumerTag string, taskC chan []byte) error {
tasks, err := p.AMQPchannel.Consume(queueName, consumerTag+"_"+queueName, false, false, false, false, nil)
if err != nil {
// TODO better error handling
return fmt.Errorf("basic.consume: %v", err)
}
for task := range tasks {
taskC <- task.Body
// Reject without requeue: the body has already been handed off locally
if err := task.Nack(false, false); err != nil {
log.Printf("Failed to nack task: %s\n", err)
}
}
return nil
}

// AMQPConnect dials the configured AMQP broker and stores the connection.
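// The broker address in p.Config.Poller.AMQPUrl is a standard AMQP URI, for
// example (illustrative, not taken from this repository's config):
//
//	amqp://user:password@broker.example.com:5672/vhost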
func (p *Poller) AMQPConnect() error {
p.rawTasks = make(chan []byte)
var err error
// Prepare config
// TODO Handle vhost
conf := amqp.Config{
// Vhost: "/telegraf",
Heartbeat: 0, // no heartbeat requested
}
// Dial server
p.AMQPconn, err = amqp.DialConfig(p.Config.Poller.AMQPUrl, conf)
if err != nil {
return err
}
return nil
}

// AMQPCreateChannel opens a channel on the AMQP connection and starts one
// consumer goroutine per configured label (queue name)
func (p *Poller) AMQPCreateChannel() error {
var err error
// Create Channel
p.AMQPchannel, err = p.AMQPconn.Channel()
if err != nil {
return err
}
for _, AMQPlabel := range p.Config.Poller.AMQPlabels {
// Subscribe to the queue named after this label; log consume errors,
// which would otherwise be silently discarded by the goroutine
go func(queueName string) {
if err := p.getTask(queueName, p.Config.Poller.Hostname, p.rawTasks); err != nil {
log.Printf("Consumer error on queue %s: %s\n", queueName, err)
}
}(AMQPlabel)
}
return nil
}

// Connect connects to all configured outputs
func (p *Poller) Connect() error {
for _, o := range p.Config.Outputs {
o.Quiet = p.Config.Poller.Quiet
switch ot := o.Output.(type) {
case telegraf.ServiceOutput:
if err := ot.Start(); err != nil {
log.Printf("Service for output %s failed to start, exiting\n%s\n",
o.Name, err.Error())
return err
}
}
if p.Config.Poller.Debug {
log.Printf("Attempting connection to output: %s\n", o.Name)
}
err := o.Output.Connect()
if err != nil {
log.Printf("Failed to connect to output %s, retrying in 15s, "+
"error was '%s' \n", o.Name, err)
time.Sleep(15 * time.Second)
err = o.Output.Connect()
if err != nil {
return err
}
}
if p.Config.Poller.Debug {
log.Printf("Successfully connected to output: %s\n", o.Name)
}
}
return nil
}

// Close closes the connection to all configured outputs
func (p *Poller) Close() error {
var err error
for _, o := range p.Config.Outputs {
err = o.Output.Close()
switch ot := o.Output.(type) {
case telegraf.ServiceOutput:
ot.Stop()
}
}
// Close the AMQP channel and connection if they were opened
if p.AMQPchannel != nil {
p.AMQPchannel.Close()
}
if p.AMQPconn != nil {
p.AMQPconn.Close()
}
return err
}

// panicRecover logs a panic raised by an input plugin, with a stack trace,
// instead of letting it crash the poller
func panicRecover(input *internal_models.RunningInput) {
if err := recover(); err != nil {
trace := make([]byte, 2048)
runtime.Stack(trace, true)
log.Printf("FATAL: Input [%s] panicked: %s, Stack:\n%s\n",
input.Name, err, trace)
log.Println("PLEASE REPORT THIS PANIC ON GITHUB with " +
"stack trace, configuration, and OS information: " +
"https://github.com/influxdata/telegraf/issues/new")
}
}

// gather runs a single input and accumulates its metrics on metricC
func (p *Poller) gather(input *internal_models.RunningInput, metricC chan telegraf.Metric) error {
defer panicRecover(input)
start := time.Now()
acc := agent.NewAccumulator(input.Config, metricC)
acc.SetDebug(p.Config.Poller.Debug)
acc.SetDefaultTags(p.Config.Tags)
if err := input.Input.Gather(acc); err != nil {
log.Printf("Error in input [%s]: %s", input.Name, err)
}
elapsed := time.Since(start)
if !p.Config.Poller.Quiet {
log.Printf("Gathered metrics from polled input %s in %s\n",
input.Name, elapsed)
}
return nil
}
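
// getInput builds a RunningInput from a raw AMQP task body. The body is
// expected to be a TOML snippet declaring a single input plugin, for example
// (an illustrative message, not taken from this repository):
//
//	[[inputs.cpu]]
//	  percpu = true
//	  totalcpu = true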
func (p *Poller) getInput(rawInput []byte) (*internal_models.RunningInput, error) {
table, err := toml.Parse(rawInput)
if err != nil {
return nil, errors.New("invalid configuration")
}
for name, val := range table.Fields {
subTable, ok := val.(*ast.Table)
if !ok {
return nil, errors.New("invalid configuration")
}
switch name {
case "inputs", "plugins":
for pluginName, pluginVal := range subTable.Fields {
name := pluginName
var table *ast.Table
switch pluginSubTable := pluginVal.(type) {
case *ast.Table:
table = pluginSubTable
case []*ast.Table:
table = pluginSubTable[0]
// TODO handle this case
/*
for _, t := range pluginSubTable {
if err = c.addInput(pluginName, t); err != nil {
return err
}
}*/
default:
return nil, fmt.Errorf("Unsupported config format: %s",
pluginName)
}
// TODO factorize copy/paste from config/addInput
// Legacy support renaming io input to diskio
if name == "io" {
name = "diskio"
}
creator, ok := inputs.Inputs[name]
if !ok {
return nil, fmt.Errorf("Undefined but requested input: %s", name)
}
input := creator()
// If the input has a SetParser function, then this means it can accept
// arbitrary types of input, so build the parser and set it.
switch t := input.(type) {
case parsers.ParserInput:
parser, err := config.BuildParser(name, table)
if err != nil {
return nil, err
}
t.SetParser(parser)
}
pluginConfig, err := config.BuildInput(name, table)
if err != nil {
return nil, err
}
if err := influxconfig.UnmarshalTable(table, input); err != nil {
return nil, err
}
rp := &internal_models.RunningInput{
Name: name,
Input: input,
Config: pluginConfig,
}
return rp, nil
}
default:
// Skip unknown top-level sections
log.Printf("Ignoring unknown configuration section: %s\n", name)
continue
}
}
// Returning an error (rather than nil, nil) keeps the caller from passing
// a nil input to gather
return nil, errors.New("no input found in configuration")
}
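
// For reference, a producer could enqueue such a task with the same
// streadway/amqp client, e.g. (illustrative sketch assuming the default
// exchange and a queue named after one of the poller's AMQP labels):
//
//	err := ch.Publish("", "mylabel", false, false, amqp.Publishing{
//		ContentType: "text/plain",
//		Body:        []byte("[[inputs.cpu]]\n  percpu = true\n"),
//	})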

// Test verifies that we can 'Gather' from all inputs with their configured
// Config struct
func (p *Poller) Test() error {
// TODO: remove this?
shutdown := make(chan struct{})
defer close(shutdown)
metricC := make(chan telegraf.Metric)
// dummy receiver for the point channel
go func() {
for {
select {
case <-metricC:
// do nothing
case <-shutdown:
return
}
}
}()
for _, input := range p.Config.Inputs {
acc := agent.NewAccumulator(input.Config, metricC)
acc.SetDebug(true)
fmt.Printf("* Plugin: %s, Collection 1\n", input.Name)
if input.Config.Interval != 0 {
fmt.Printf("* Interval: %s\n", input.Config.Interval)
}
if err := input.Input.Gather(acc); err != nil {
return err
}
// Special instructions for some inputs. cpu, for example, needs to be
// run twice in order to return cpu usage percentages.
switch input.Name {
case "cpu", "mongodb", "procstat":
time.Sleep(500 * time.Millisecond)
fmt.Printf("* Plugin: %s, Collection 2\n", input.Name)
if err := input.Input.Gather(acc); err != nil {
return err
}
}
}
return nil
}

// flush writes a list of metrics to all configured outputs
func (p *Poller) flush() {
var wg sync.WaitGroup
wg.Add(len(p.Config.Outputs))
for _, o := range p.Config.Outputs {
go func(output *internal_models.RunningOutput) {
defer wg.Done()
err := output.Write()
if err != nil {
log.Printf("Error writing to output [%s]: %s\n",
output.Name, err.Error())
}
}(o)
}
wg.Wait()
}

// flusher monitors the metrics input channel and flushes on the minimum interval
func (p *Poller) flusher(shutdown chan struct{}, metricC chan telegraf.Metric) error {
// Inelegant, but this sleep is to allow the Gather threads to run, so that
// the flusher will flush after metrics are collected.
time.Sleep(time.Millisecond * 200)
ticker := time.NewTicker(p.Config.Poller.FlushInterval.Duration)
defer ticker.Stop()
for {
select {
case <-shutdown:
log.Println("Hang on, flushing any cached metrics before shutdown")
p.flush()
return nil
case <-ticker.C:
p.flush()
case m := <-metricC:
for _, o := range p.Config.Outputs {
o.AddMetric(m)
}
}
}
}

// Run runs the poller daemon: it gathers metrics for each task received from
// the message queue and flushes them every FlushInterval
func (p *Poller) Run(shutdown chan struct{}) error {
var wg sync.WaitGroup
// Jitter the poller's flush interval (the flusher reads the Poller
// setting, so that is the one that must be jittered)
p.Config.Poller.FlushInterval.Duration = agent.JitterInterval(
p.Config.Poller.FlushInterval.Duration,
p.Config.Agent.FlushJitter.Duration)
log.Printf("Poller Config: Debug:%#v, Quiet:%#v, Hostname:%#v, "+
"Flush Interval:%s \n",
p.Config.Poller.Debug, p.Config.Poller.Quiet,
p.Config.Poller.Hostname, p.Config.Poller.FlushInterval.Duration)
log.Print("Connecting to message queue\n")
err := p.AMQPConnect()
if err == nil {
log.Print("Channel creation\n")
err = p.AMQPCreateChannel()
}
if err == nil {
log.Print("Message queue connected\n")
} else {
log.Printf("Message queue connection error: %s\n", err)
}
// channel shared between all input threads for accumulating metrics
metricC := make(chan telegraf.Metric, 10000)
wg.Add(1)
go func() {
defer wg.Done()
if err := p.flusher(shutdown, metricC); err != nil {
log.Printf("Flusher routine failed, exiting: %s\n", err.Error())
close(shutdown)
}
}()
defer wg.Wait()
// Register close notifications once, on separate buffered channels, so
// connection and channel errors can be told apart and no new listener is
// registered on every loop iteration
var connClose, chanClose chan *amqp.Error
if p.AMQPconn != nil && p.AMQPchannel != nil {
connClose = p.AMQPconn.NotifyClose(make(chan *amqp.Error, 1))
chanClose = p.AMQPchannel.NotifyClose(make(chan *amqp.Error, 1))
}
reconnection:
for {
// We need to be sure that the connection and channel are open
// TODO handle multiple channels
if p.AMQPchannel == nil || p.AMQPconn == nil {
break
}
select {
case <-shutdown:
return nil
case rawTask := <-p.rawTasks:
go func(rawTask []byte) {
// Build the input object from the message body
input, err := p.getInput(rawTask)
if err != nil {
log.Print(err)
return
}
// Run the check and accumulate its metrics
if err := p.gather(input, metricC); err != nil {
log.Print(err)
}
}(rawTask)
case err := <-connClose:
// Connection errors fall through to reconnection
log.Printf("Connection error: %s\n", err)
break reconnection
case err := <-chanClose:
// Channel errors fall through to reconnection
log.Printf("Channel error: %s\n", err)
break reconnection
}
}
// Handle restart
log.Print("Message queue reconnection in 3 seconds\n")
select {
case <-shutdown:
return nil
case <-time.After(3 * time.Second):
}
// Signal the flusher goroutine to stop before restarting
log.Print("Shutdown signal sent to routines\n")
shutdown <- struct{}{}
return p.Run(shutdown)
}