Statsd listener plugin

implement gauges, sets, counters
Cameron Sparr 2015-09-24 11:06:11 -07:00
parent 316fa1cc01
commit eb2a4dc724
9 changed files with 583 additions and 29 deletions

@@ -16,8 +16,6 @@ and submit new plugins.
### Plugin Guidelines
* A plugin must conform to the `plugins.Plugin` interface.
* Telegraf promises to run each plugin's Gather function serially. This means
developers don't have to worry about thread safety within these functions.
* Each generated metric automatically has the name of the plugin that generated
it prepended. This is to keep plugins honest.
* Plugins should call `plugins.Add` in their `init` function to register themselves.
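For example, registration for a hypothetical plugin type `Mine` (mirroring the `init` of the statsd plugin added in this commit) looks like:

```go
// Hypothetical registration sketch; Mine is not a real Telegraf plugin.
func init() {
	plugins.Add("mine", func() plugins.Plugin {
		return &Mine{}
	})
}
```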

@@ -361,6 +361,20 @@ func (a *Agent) Run(shutdown chan struct{}) error {
var wg sync.WaitGroup
for _, plugin := range a.plugins {
// Start service of any ServicePlugins
switch p := plugin.plugin.(type) {
case plugins.ServicePlugin:
if err := p.Start(); err != nil {
log.Printf("Service for plugin %s failed to start, exiting\n%s\n",
plugin.name, err.Error())
return err
}
defer p.Stop()
}
// Special handling for plugins that have their own collection interval
// configured. Default intervals are handled below with crankParallel
if plugin.config.Interval != 0 {
wg.Add(1)
go func(plugin *runningPlugin) {

@@ -377,18 +377,25 @@ var header = `# Telegraf configuration
[outputs]
`
var header2 = `
var pluginHeader = `
###############################################################################
# PLUGINS #
###############################################################################
`
var servicePluginHeader = `
###############################################################################
# SERVICE PLUGINS #
###############################################################################
`
// PrintSampleConfig prints the sample config
func PrintSampleConfig(pluginFilters []string, outputFilters []string) {
fmt.Printf(header)
// Print Outputs
// Filter outputs
var onames []string
for oname := range outputs.Outputs {
if len(outputFilters) == 0 || sliceContains(oname, outputFilters) {
@@ -397,6 +404,7 @@ func PrintSampleConfig(pluginFilters []string, outputFilters []string) {
}
sort.Strings(onames)
// Print Outputs
for _, oname := range onames {
creator := outputs.Outputs[oname]
output := creator()
@@ -411,9 +419,7 @@ func PrintSampleConfig(pluginFilters []string, outputFilters []string) {
}
}
fmt.Printf(header2)
// Print Plugins
// Filter plugins
var pnames []string
for pname := range plugins.Plugins {
if len(pluginFilters) == 0 || sliceContains(pname, pluginFilters) {
@@ -422,12 +428,31 @@ func PrintSampleConfig(pluginFilters []string, outputFilters []string) {
}
sort.Strings(pnames)
// Print Plugins
fmt.Printf(pluginHeader)
servPlugins := make(map[string]plugins.ServicePlugin)
for _, pname := range pnames {
creator := plugins.Plugins[pname]
plugin := creator()
fmt.Printf("\n# %s\n[%s]", plugin.Description(), pname)
switch p := plugin.(type) {
case plugins.ServicePlugin:
servPlugins[pname] = p
continue
}
printConfig(pname, plugin)
}
// Print Service Plugins
fmt.Printf(servicePluginHeader)
for name, plugin := range servPlugins {
printConfig(name, plugin)
}
}
func printConfig(name string, plugin plugins.Plugin) {
fmt.Printf("\n# %s\n[%s]", plugin.Description(), name)
config := plugin.SampleConfig()
if config == "" {
fmt.Printf("\n # no configuration\n")
@@ -435,7 +460,6 @@ func PrintSampleConfig(pluginFilters []string, outputFilters []string) {
fmt.Printf(config)
}
}
}
func sliceContains(name string, list []string) bool {
for _, b := range list {
@@ -449,9 +473,7 @@ func sliceContains(name string, list []string) bool {
// PrintPluginConfig prints the config usage of a single plugin.
func PrintPluginConfig(name string) error {
if creator, ok := plugins.Plugins[name]; ok {
plugin := creator()
fmt.Printf("# %s\n[%s]", plugin.Description(), name)
fmt.Printf(plugin.SampleConfig())
printConfig(name, creator())
} else {
return errors.New(fmt.Sprintf("Plugin %s not found", name))
}

@@ -28,22 +28,18 @@ type InfluxDB struct {
var sampleConfig = `
# The full HTTP endpoint URL for your InfluxDB instance
# Multiple urls can be specified for InfluxDB cluster support. Server to
# write to will be randomly chosen each interval.
urls = ["http://localhost:8086"] # required.
# Multiple urls can be specified for InfluxDB cluster support.
urls = ["http://localhost:8086"] # required
# The target database for metrics (telegraf will create it if not exists)
database = "telegraf" # required
# The target database for metrics. This database must already exist
database = "telegraf" # required.
# Connection timeout (for the connection with InfluxDB), formatted as a string.
# Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".
# If not provided, will default to 0 (no timeout)
# # Connection timeout (for the connection with InfluxDB), formatted as a string.
# # Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".
# # If not provided, will default to 0 (no timeout)
# timeout = "5s"
# username = "telegraf"
# password = "metricsmetricsmetricsmetrics"
# Set the user agent for the POSTs (can be useful for log differentiation)
# # Set the user agent for the POSTs (can be useful for log differentiation)
# user_agent = "telegraf"
`

@@ -22,6 +22,7 @@ import (
_ "github.com/influxdb/telegraf/plugins/rabbitmq"
_ "github.com/influxdb/telegraf/plugins/redis"
_ "github.com/influxdb/telegraf/plugins/rethinkdb"
_ "github.com/influxdb/telegraf/plugins/statsd"
_ "github.com/influxdb/telegraf/plugins/system"
_ "github.com/influxdb/telegraf/plugins/zookeeper"
)

@@ -20,11 +20,35 @@
}
type Plugin interface {
// SampleConfig returns the default configuration of the Plugin
SampleConfig() string
// Description returns a one-sentence description of the Plugin
Description() string
// Gather takes in an accumulator and adds the metrics that the Plugin
// gathers. This is called every "interval"
Gather(Accumulator) error
}
type ServicePlugin interface {
// SampleConfig returns the default configuration of the Plugin
SampleConfig() string
// Description returns a one-sentence description of the Plugin
Description() string
// Gather takes in an accumulator and adds the metrics that the Plugin
// gathers. This is called every "interval"
Gather(Accumulator) error
// Start starts the ServicePlugin's service, whatever that may be
Start() error
// Stop stops the services and closes any necessary channels and connections
Stop()
}
type Creator func() Plugin
var Plugins = map[string]Creator{}
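To make the lifecycle concrete, here is a minimal sketch of a ServicePlugin: `Echo` is illustrative and not part of this commit, and it assumes `Accumulator.Add` accepts the value types the statsd plugin below passes it. The agent calls Start once at boot, defers Stop, and calls Gather every interval.

```go
package echo

import (
	"sync"
	"time"

	"github.com/influxdb/telegraf/plugins"
)

// Echo is a toy ServicePlugin: Start launches a background goroutine,
// Gather reports what it accumulated, Stop shuts it down.
type Echo struct {
	sync.Mutex
	count int64
	done  chan struct{}
}

func (e *Echo) SampleConfig() string { return "\n # no configuration\n" }

func (e *Echo) Description() string { return "A do-nothing example service" }

// Start is called once by the agent before the first Gather.
func (e *Echo) Start() error {
	e.done = make(chan struct{})
	ticker := time.NewTicker(time.Second)
	go func() {
		defer ticker.Stop()
		for {
			select {
			case <-e.done:
				return
			case <-ticker.C: // a real service would read a socket here
				e.Lock()
				e.count++
				e.Unlock()
			}
		}
	}()
	return nil
}

// Gather flushes the accumulated state every interval.
func (e *Echo) Gather(acc plugins.Accumulator) error {
	e.Lock()
	defer e.Unlock()
	acc.Add("echo_count", e.count, nil)
	return nil
}

// Stop is deferred by the agent at startup and runs at shutdown.
func (e *Echo) Stop() {
	close(e.done)
}
```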

plugins/statsd/README.md (new file, 79 lines)

@@ -0,0 +1,79 @@
# Telegraf Service Plugin: statsd
#### Plugin arguments:
- **service_address** string: Address to listen for statsd UDP packets on
- **delete_gauges** boolean: Delete gauges on every collection interval
- **delete_counters** boolean: Delete counters on every collection interval
- **delete_sets** boolean: Delete set counters on every collection interval
- **allowed_pending_messages** integer: Number of messages allowed to queue up
on the UDP listener before the next flush. NOTE: gauge, counter, and set
measurements are aggregated as they arrive, so this is not a straight counter of
the number of total messages that the listener can handle between flushes.
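For reference, a `[statsd]` section using the defaults from the plugin's sample config (see `statsd.go` below) would look like:

```
[statsd]
  # Address and port to host UDP listener on
  service_address = ":8125"
  # Delete each cache every collection interval
  delete_gauges = false
  delete_counters = false
  delete_sets = false
  # Number of messages allowed to queue up between flushes
  allowed_pending_messages = 10000
```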
#### Statsd bucket -> InfluxDB Mapping
By default, statsd buckets are converted to measurement names with the rules:
- "." -> "_"
- "-" -> "__"
This plugin also accepts a list of config tables to describe a mapping of a statsd
bucket to an InfluxDB measurement name and tags.
Each mapping must specify a match glob pattern. It can optionally take a name
for the measurement and a map of bucket indices to tag names.
For example, the following configuration:
```
[[statsd.mappings]]
match = "users.current.*.*"
name = "current_users"
[statsd.mappings.tagmap]
unit = 0
server = 2
service = 3
[[statsd.mappings]]
match = "deploys.*.*"
name = "service_deploys"
[statsd.mappings.tagmap]
service_type = 1
service_name = 2
```
will map statsd metrics to InfluxDB like so:
```
users.current.den001.myapp:32|g
=> [server="den001" service="myapp" unit="users"] statsd_current_users_gauge value=32
deploys.test.myservice:1|c
=> [service_name="myservice" service_type="test"] statsd_service_deploys_counter value=1
random.jumping-sheep:10|c
=> [] statsd_random_jumping__sheep_counter value=10
```
#### Description
The statsd plugin is a special type of plugin which runs a background statsd
listener service while telegraf is running.
The format of the statsd messages is based on the format described in the
original [etsy statsd](https://github.com/etsy/statsd/blob/master/docs/metric_types.md)
implementation. In short, the telegraf statsd listener will accept:
- Gauges
- `users.current.den001.myapp:32|g` <- standard
- `users.current.den001.myapp:+10|g` <- additive
- `users.current.den001.myapp:-10|g`
- Counters
- `deploys.test.myservice:1|c` <- increments by 1
- `deploys.test.myservice:101|c` <- increments by 101
- `deploys.test.myservice:1|c|@0.1` <- sample rate, increments by 10
- Sets
- `users.unique:101|s`
- `users.unique:101|s`
- `users.unique:102|s` <- would result in a count of 2 for `users.unique`
- Timers
- TODO
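No client is bundled with the plugin, but any UDP writer works; a minimal Go sketch (assuming the listener is on the default `:8125`):

```go
package main

import (
	"fmt"
	"log"
	"net"
)

func main() {
	conn, err := net.Dial("udp", "127.0.0.1:8125")
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
	// One metric per line: <bucket>:<value>|<type>[|@<samplerate>]
	fmt.Fprint(conn, "users.current.den001.myapp:32|g\n")
	fmt.Fprint(conn, "deploys.test.myservice:1|c|@0.1\n")
	fmt.Fprint(conn, "users.unique:101|s\n")
}
```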

plugins/statsd/statsd.go (new file, 409 lines)

@@ -0,0 +1,409 @@
package statsd
import (
"log"
"net"
"strconv"
"strings"
"sync"
"github.com/influxdb/telegraf/plugins"
)
var dropwarn = "ERROR: Message queue full. Discarding line [%s] " +
"You may want to increase allowed_pending_messages in the config\n"
type Statsd struct {
// Address & Port to serve from
ServiceAddress string
// Number of messages allowed to queue up in between calls to Gather. If this
// fills up, packets will get dropped until the next Gather interval is run.
AllowedPendingMessages int
DeleteGauges bool
DeleteCounters bool
DeleteSets bool
sync.Mutex
// Channel for all incoming statsd messages
in chan string
inmetrics chan metric
done chan struct{}
// Cache gauges, counters & sets so they can be aggregated as they arrive
gauges map[string]cachedmetric
counters map[string]cachedmetric
sets map[string]cachedmetric
Mappings []struct {
Match string
Name string
Tagmap map[string]int
}
}
// One statsd metric, form is <bucket>:<value>|<mtype>|@<samplerate>
type metric struct {
name string
bucket string
value int64
mtype string
additive bool
samplerate float64
tags map[string]string
}
// cachedmetric is a subset of metric used specifically for storing cached
// gauges and counters, ready for sending to InfluxDB.
type cachedmetric struct {
value int64
tags map[string]string
set map[int64]bool
}
func (_ *Statsd) Description() string {
return "Statsd listener"
}
const sampleConfig = `
# Address and port to host UDP listener on
service_address = ":8125"
# Delete gauges every interval
delete_gauges = false
# Delete counters every interval
delete_counters = false
# Delete sets every interval
delete_sets = false
# Number of messages allowed to queue up, once filled,
# the statsd server will start dropping packets
allowed_pending_messages = 10000
`
func (_ *Statsd) SampleConfig() string {
return sampleConfig
}
func (s *Statsd) Gather(acc plugins.Accumulator) error {
s.Lock()
defer s.Unlock()
values := make(map[string]int64)
items := len(s.inmetrics)
for i := 0; i < items; i++ {
m := <-s.inmetrics
switch m.mtype {
case "c", "g", "s":
log.Println("ERROR: Uh oh, this should not have happened")
case "ms", "h":
// TODO
}
}
for name, cmetric := range s.gauges {
acc.Add(name, cmetric.value, cmetric.tags)
}
if s.DeleteGauges {
s.gauges = make(map[string]cachedmetric)
}
for name, cmetric := range s.counters {
acc.Add(name, cmetric.value, cmetric.tags)
}
if s.DeleteCounters {
s.counters = make(map[string]cachedmetric)
}
for name, cmetric := range s.sets {
acc.Add(name, cmetric.value, cmetric.tags)
}
if s.DeleteSets {
s.sets = make(map[string]cachedmetric)
}
for name, value := range values {
acc.Add(name, value, nil)
}
return nil
}
func (s *Statsd) Start() error {
log.Println("Starting up the statsd service")
// Make data structures
s.done = make(chan struct{})
s.in = make(chan string, s.AllowedPendingMessages)
s.inmetrics = make(chan metric, s.AllowedPendingMessages)
s.gauges = make(map[string]cachedmetric)
s.counters = make(map[string]cachedmetric)
s.sets = make(map[string]cachedmetric)
// Start the UDP listener
go s.udpListen()
// Start the line parser
go s.parser()
return nil
}
// udpListen starts listening for udp packets on the configured port.
func (s *Statsd) udpListen() error {
address, _ := net.ResolveUDPAddr("udp", s.ServiceAddress)
listener, err := net.ListenUDP("udp", address)
if err != nil {
log.Fatalf("ERROR: ListenUDP - %s", err)
}
defer listener.Close()
log.Println("Statsd listener listening on: ", listener.LocalAddr().String())
for {
select {
case <-s.done:
return nil
default:
buf := make([]byte, 1024)
n, _, err := listener.ReadFromUDP(buf)
if err != nil {
log.Printf("ERROR: %s\n", err.Error())
}
lines := strings.Split(string(buf[:n]), "\n")
for _, line := range lines {
line = strings.TrimSpace(line)
if line != "" {
select {
case s.in <- line:
default:
log.Printf(dropwarn, line)
}
}
}
}
}
}
// parser monitors the s.in channel, if there is a line ready, it parses the
// statsd string into a usable metric struct and either aggregates the value
// or pushes it into the s.inmetrics channel.
func (s *Statsd) parser() error {
for {
select {
case <-s.done:
return nil
case line := <-s.in:
s.parseStatsdLine(line)
}
}
}
// parseStatsdLine will parse the given statsd line, validating it as it goes.
// If the line is valid, it will be cached for the next call to Gather()
func (s *Statsd) parseStatsdLine(line string) {
s.Lock()
defer s.Unlock()
// Validate splitting the line on "|"
m := metric{}
parts1 := strings.Split(line, "|")
if len(parts1) < 2 {
log.Printf("Error splitting '|', Unable to parse metric: %s\n", line)
return
} else if len(parts1) > 2 {
sr := parts1[2]
if strings.Contains(sr, "@") && len(sr) > 1 {
samplerate, err := strconv.ParseFloat(sr[1:], 64)
if err != nil {
log.Printf("Error parsing sample rate: %s\n", err.Error())
} else {
m.samplerate = samplerate
}
} else {
msg := "Error parsing sample rate, it must be in format like: " +
"@0.1, @0.5, etc. Ignoring sample rate for line: %s\n"
log.Printf(msg, line)
}
}
// Validate metric type
switch parts1[1] {
case "g", "c", "s", "ms", "h":
m.mtype = parts1[1]
default:
log.Printf("Statsd Metric type %s unsupported", parts1[1])
return
}
// Validate splitting the rest of the line on ":"
parts2 := strings.Split(parts1[0], ":")
if len(parts2) != 2 {
log.Printf("Error splitting ':', Unable to parse metric: %s\n", line)
return
}
m.bucket = parts2[0]
// Parse the value
if strings.ContainsAny(parts2[1], "-+") {
if m.mtype != "g" {
log.Printf("Error: +- values are only supported for gauges: %s\n", line)
return
}
m.additive = true
}
v, err := strconv.ParseInt(parts2[1], 10, 64)
if err != nil {
log.Printf("Error: parsing value to int64: %s\n", line)
return
}
// If a sample rate is given with a counter, divide value by the rate
if m.samplerate != 0 && m.mtype == "c" {
v = int64(float64(v) / m.samplerate)
}
m.value = v
// Parse the name
m.name, m.tags = s.parseName(m)
switch m.mtype {
// Aggregate gauges, counters and sets as we go
case "g", "c", "s":
s.aggregate(m)
// Timers get processed at flush time
default:
select {
case s.inmetrics <- m:
default:
log.Printf(dropwarn, line)
}
}
}
// parseName parses the given bucket name with the list of bucket maps in the
// config file. If there is a match, it will parse the name of the metric and
// map of tags.
// Return values are (<name>, <tags>)
func (s *Statsd) parseName(m metric) (string, map[string]string) {
var tags map[string]string
name := strings.Replace(m.bucket, ".", "_", -1)
name = strings.Replace(name, "-", "__", -1)
for _, bm := range s.Mappings {
if bucketglob(bm.Match, m.bucket) {
tags = make(map[string]string)
bparts := strings.Split(m.bucket, ".")
for name, index := range bm.Tagmap {
if index >= len(bparts) {
log.Printf("ERROR: Index %d out of range for bucket %s\n",
index, m.bucket)
continue
}
tags[name] = bparts[index]
}
if bm.Name != "" {
name = bm.Name
}
}
}
switch m.mtype {
case "c":
name = name + "_counter"
case "g":
name = name + "_gauge"
case "s":
name = name + "_set"
case "ms", "h":
name = name + "_timer"
}
return name, tags
}
func bucketglob(pattern, bucket string) bool {
pparts := strings.Split(pattern, ".")
bparts := strings.Split(bucket, ".")
if len(pparts) != len(bparts) {
return false
}
for i := range pparts {
if pparts[i] == "*" || pparts[i] == bparts[i] {
continue
} else {
return false
}
}
return true
}
// aggregate takes in a metric of type "counter", "gauge", or "set". It then
// aggregates and caches the current value. It does not deal with the
// DeleteCounters, DeleteGauges or DeleteSets options, because those are dealt
// with in the Gather function.
func (s *Statsd) aggregate(m metric) {
switch m.mtype {
case "c":
cached, ok := s.counters[m.name]
if !ok {
s.counters[m.name] = cachedmetric{
value: m.value,
tags: m.tags,
}
} else {
cached.value += m.value
cached.tags = m.tags
s.counters[m.name] = cached
}
case "g":
cached, ok := s.gauges[m.name]
if !ok {
s.gauges[m.name] = cachedmetric{
value: m.value,
tags: m.tags,
}
} else {
if m.additive {
cached.value = cached.value + m.value
} else {
cached.value = m.value
}
cached.tags = m.tags
s.gauges[m.name] = cached
}
case "s":
cached, ok := s.sets[m.name]
if !ok {
// Completely new metric (initialize with count of 1)
s.sets[m.name] = cachedmetric{
value: 1,
tags: m.tags,
set: map[int64]bool{m.value: true},
}
} else {
_, ok := s.sets[m.name].set[m.value]
if !ok {
// Metric exists, but value has not been counted
cached.value += 1
cached.set[m.value] = true
s.sets[m.name] = cached
}
}
}
}
func (s *Statsd) Stop() {
s.Lock()
defer s.Unlock()
log.Println("Stopping the statsd service")
close(s.done)
close(s.in)
close(s.inmetrics)
}
func init() {
plugins.Add("statsd", func() plugins.Plugin {
return &Statsd{}
})
}

plugins/statsd/statsd_test.go (new file, 11 lines)

@@ -0,0 +1,11 @@
package statsd
import (
"testing"
)
// TestListen is a placeholder so the package ships with a test file.
func TestListen(t *testing.T) {
if false {
t.Errorf("Test failed!")
}
}
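Since the test file shares the statsd package, a more meaningful test is easy to add; this sketch (not in the commit) drives parseStatsdLine directly and assumes the unexported fields above keep their names:

```go
func TestParseGauge(t *testing.T) {
	s := Statsd{}
	s.gauges = make(map[string]cachedmetric)
	s.counters = make(map[string]cachedmetric)
	s.sets = make(map[string]cachedmetric)
	s.inmetrics = make(chan metric, 10)

	// "." -> "_" plus the type suffix give the cached metric its name.
	s.parseStatsdLine("users.current.den001.myapp:32|g")

	cached, ok := s.gauges["users_current_den001_myapp_gauge"]
	if !ok {
		t.Fatal("expected gauge to be cached")
	}
	if cached.value != 32 {
		t.Errorf("expected value 32, got %d", cached.value)
	}
}
```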