Implement telegraf collecting stats on itself

closes #1348
2016-11-07 08:34:46 +00:00
parent d518d7d806
commit d71a42cd1b
26 changed files with 975 additions and 169 deletions
--- a/selfstat/selfstat.go
+++ b/selfstat/selfstat.go
@@ -0,0 +1,169 @@
+// Package selfstat tracks and collects internal statistics about telegraf.
+// Metrics can be registered using this package, and then incremented or set
+// within your code. If the inputs.internal plugin is enabled, then all
+// registered stats will be collected as they would be by any other input
+// plugin.
+package selfstat
+
+import (
+	"hash/fnv"
+	"log"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/metric"
+)
+
+var (
+	// registry is the package-level store of every registered stat. It is
+	// created in init() and used by Register, RegisterTiming and Metrics.
+	registry *rgstry
+)
+
+// Stat is an interface for dealing with telegraf statistics collected
+// on itself. Within this package it is implemented by the regular stat type
+// and by timingStat.
+type Stat interface {
+	// Name returns the name of the measurement.
+	Name() string
+
+	// FieldName returns the name of the measurement field.
+	FieldName() string
+
+	// Tags returns the stat's tags as a map. Each time this is called a new
+	// map is allocated.
+	Tags() map[string]string
+
+	// Key returns the key of the stat, a hash derived from its measurement
+	// name and tags (the field name is not part of it).
+	Key() uint64
+
+	// Incr increments a regular stat by 'v'.
+	// In the case of a timing stat, Incr adds the timing to the cache.
+	Incr(v int64)
+
+	// Set sets a regular stat to 'v'.
+	// In the case of a timing stat, Set adds the timing to the cache.
+	Set(v int64)
+
+	// Get gets the value of the stat. In the case of timings, this returns
+	// an average value of all timings received since the last call to Get().
+	// If no timings were received, it returns the previous value.
+	Get() int64
+}
+
+// Register adds a regular stat for the given measurement, field, and tags to
+// the selfstat registry and returns it. The measurement name is stored with an
+// "internal_" prefix. If a stat with the same measurement, tags, and field has
+// already been registered, that existing stat is returned instead.
+//
+// The returned Stat can be incremented or set by the caller, and its value is
+// reported as a telegraf metric whenever Metrics() is called.
+func Register(measurement, field string, tags map[string]string) Stat {
+	s := new(stat)
+	s.measurement = "internal_" + measurement
+	s.field = field
+	s.tags = tags
+	return registry.register(s)
+}
+
+// RegisterTiming adds a timing stat for the given measurement, field, and tags
+// to the selfstat registry and returns it. The measurement name is stored with
+// an "internal_" prefix. If a stat with the same measurement, tags, and field
+// has already been registered, that existing stat is returned instead.
+//
+// Timing stats differ from regular stats in that they accumulate every
+// "timing" added to them and report the average when Get() is called. Get()
+// then clears the accumulated timings, so the next average only reflects
+// timings added since the previous call to Get(). If Get() is called without
+// any new timings having been added, the previous average is returned again.
+//
+// In other words, timings are an averaged metric that gets cleared on each
+// call to Get().
+//
+// The returned Stat can be fed by the caller of RegisterTiming(), and its
+// value is reported as a telegraf metric whenever Metrics() is called.
+func RegisterTiming(measurement, field string, tags map[string]string) Stat {
+	ts := new(timingStat)
+	ts.measurement = "internal_" + measurement
+	ts.field = field
+	ts.tags = tags
+	return registry.register(ts)
+}
+
+// Metrics returns all registered stats as telegraf metrics.
+//
+// One metric is built per registry entry, i.e. per unique measurement+tags
+// key; every stat registered under that key becomes a field of the metric,
+// and all metrics share a single timestamp taken when Metrics is called. Note
+// that reading a stat calls its Get(), which for timing stats clears the
+// accumulated timings.
+//
+// A metric that cannot be created is logged and left out, so the returned
+// slice never contains nil entries.
+func Metrics() []telegraf.Metric {
+	registry.mu.Lock()
+	defer registry.mu.Unlock()
+
+	now := time.Now()
+	// Start empty and append: pre-filling the slice to len(registry.stats)
+	// would leave trailing nil metrics for every entry that gets skipped.
+	metrics := make([]telegraf.Metric, 0, len(registry.stats))
+	for _, stats := range registry.stats {
+		if len(stats) == 0 {
+			continue
+		}
+
+		var (
+			tags map[string]string
+			name string
+		)
+		fields := make(map[string]interface{}, len(stats))
+		first := true
+		for fieldname, stat := range stats {
+			if first {
+				// all stats under one key share measurement name and
+				// tags, so they are taken from whichever comes first
+				tags, name = stat.Tags(), stat.Name()
+				first = false
+			}
+			fields[fieldname] = stat.Get()
+		}
+
+		m, err := metric.New(name, tags, fields, now)
+		if err != nil {
+			log.Printf("E! Error creating selfstat metric: %s", err)
+			continue
+		}
+		metrics = append(metrics, m)
+	}
+	return metrics
+}
+
+// rgstry is the registry holding every registered stat.
+type rgstry struct {
+	// stats maps a stat's Key() (hash of measurement name and tags) to the
+	// stats sharing that key, indexed by field name.
+	stats map[uint64]map[string]Stat
+	// mu guards stats.
+	mu    sync.Mutex
+}
+
+// register stores s in the registry and returns the stat that callers should
+// use from now on: s itself when its key/field combination is new, or the
+// stat registered earlier when that combination already exists.
+func (r *rgstry) register(s Stat) Stat {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	k, field := s.Key(), s.FieldName()
+
+	fields, known := r.stats[k]
+	if !known {
+		// first stat for this measurement+tags combination
+		r.stats[k] = map[string]Stat{field: s}
+		return s
+	}
+
+	if existing, dup := fields[field]; dup {
+		// field already exists, so the new stat is dropped
+		return existing
+	}
+
+	fields[field] = s
+	return s
+}
+
+// key returns the 64-bit FNV-1a hash identifying a measurement name together
+// with its tag set. The result does not depend on map iteration order: tags
+// are hashed sorted by tag name.
+//
+// Every component (measurement, each tag name, each tag value) is followed by
+// a NUL byte before the next one is hashed. Without that terminator different
+// inputs such as {"ab": "c"} and {"a": "bc"} would feed the very same bytes
+// into the hash and end up sharing one registry entry.
+func key(measurement string, tags map[string]string) uint64 {
+	h := fnv.New64a()
+	terminator := []byte{0}
+
+	// hash.Hash documents that Write never returns an error, so the results
+	// are deliberately not checked.
+	write := func(s string) {
+		h.Write([]byte(s))
+		h.Write(terminator)
+	}
+
+	write(measurement)
+
+	names := make([]string, 0, len(tags))
+	for name := range tags {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+
+	for _, name := range names {
+		write(name)
+		write(tags[name])
+	}
+
+	return h.Sum64()
+}
+
+// init creates the package-level registry with an empty stats map, so that
+// stats can be registered as soon as the package has been initialized.
+func init() {
+	registry = new(rgstry)
+	registry.stats = map[uint64]map[string]Stat{}
+}
--- a/selfstat/selfstat_test.go
+++ b/selfstat/selfstat_test.go
@@ -0,0 +1,221 @@
+package selfstat
+
+import (
+	"sync"
+	"testing"
+
+	"github.com/influxdata/telegraf/testutil"
+
+	"github.com/stretchr/testify/assert"
+)
+
+var (
+	// testLock allows only one test to run at a time, because all of the
+	// tests operate on the single global registry.
+	testLock sync.Mutex
+	// a receives the value read by Get() in the benchmarks, so the result of
+	// the call is stored outside the benchmark loop.
+	a        int64
+)
+
+// testCleanup swaps in a brand-new, empty global registry and then releases
+// testLock. Tests defer it right after taking testLock.
+func testCleanup() {
+	fresh := new(rgstry)
+	fresh.stats = map[uint64]map[string]Stat{}
+	registry = fresh
+	testLock.Unlock()
+}
+
+// BenchmarkStats measures two increments followed by a read on a regular
+// stat. Each read is stored in the package-level variable a.
+func BenchmarkStats(b *testing.B) {
+	testLock.Lock()
+	defer testCleanup()
+
+	tags := map[string]string{"test": "foo"}
+	counter := Register("benchmark1", "test_field1", tags)
+	for remaining := b.N; remaining > 0; remaining-- {
+		counter.Incr(1)
+		counter.Incr(3)
+		a = counter.Get()
+	}
+}
+
+// BenchmarkTimingStats measures two added timings followed by a read on a
+// timing stat. Each read is stored in the package-level variable a.
+func BenchmarkTimingStats(b *testing.B) {
+	testLock.Lock()
+	defer testCleanup()
+
+	tags := map[string]string{"test": "foo"}
+	timing := RegisterTiming("benchmark2", "test_field1", tags)
+	for remaining := b.N; remaining > 0; remaining-- {
+		timing.Incr(1)
+		timing.Incr(3)
+		a = timing.Get()
+	}
+}
+
+// TestRegisterAndIncrAndSet registers two regular stats under the same
+// measurement and tags and checks Incr, Set and Get on them. It also checks
+// that registering an already-known field returns the existing stat.
+func TestRegisterAndIncrAndSet(t *testing.T) {
+	testLock.Lock()
+	defer testCleanup()
+	s1 := Register("test", "test_field1", map[string]string{"test": "foo"})
+	s2 := Register("test", "test_field2", map[string]string{"test": "foo"})
+	// a freshly registered stat starts at zero
+	assert.Equal(t, int64(0), s1.Get())
+
+	// increments accumulate
+	s1.Incr(10)
+	s1.Incr(5)
+	assert.Equal(t, int64(15), s1.Get())
+
+	// Set overwrites the accumulated value
+	s1.Set(12)
+	assert.Equal(t, int64(12), s1.Get())
+
+	// a negative increment decrements
+	s1.Incr(-2)
+	assert.Equal(t, int64(10), s1.Get())
+
+	s2.Set(101)
+	assert.Equal(t, int64(101), s2.Get())
+
+	// make sure that the same field returns the same metric
+	// this one should be the same as s2.
+	foo := Register("test", "test_field2", map[string]string{"test": "foo"})
+	assert.Equal(t, int64(101), foo.Get())
+
+	// check that tags are consistent
+	assert.Equal(t, map[string]string{"test": "foo"}, foo.Tags())
+	// the measurement name carries the "internal_" prefix added by Register
+	assert.Equal(t, "internal_test", foo.Name())
+}
+
+// TestRegisterTimingAndIncrAndSet registers two timing stats under the same
+// measurement and tags and checks the averaging behaviour of Incr, Set and
+// Get. It also checks that registering an already-known field returns the
+// existing stat.
+func TestRegisterTimingAndIncrAndSet(t *testing.T) {
+	testLock.Lock()
+	defer testCleanup()
+	s1 := RegisterTiming("test", "test_field1_ns", map[string]string{"test": "foo"})
+	s2 := RegisterTiming("test", "test_field2_ns", map[string]string{"test": "foo"})
+	// a freshly registered timing stat reports zero
+	assert.Equal(t, int64(0), s1.Get())
+
+	// the timings 10 and 5 average to 7 (integer division)
+	s1.Incr(10)
+	s1.Incr(5)
+	assert.Equal(t, int64(7), s1.Get())
+	// previous value is used on subsequent calls to Get()
+	assert.Equal(t, int64(7), s1.Get())
+
+	// Set adds a timing; it is the only one since the last Get()
+	s1.Set(12)
+	assert.Equal(t, int64(12), s1.Get())
+
+	// again a single timing since the last Get(), so it is the average
+	s1.Incr(-2)
+	assert.Equal(t, int64(-2), s1.Get())
+
+	s2.Set(101)
+	assert.Equal(t, int64(101), s2.Get())
+
+	// make sure that the same field returns the same metric
+	// this one should be the same as s2.
+	foo := RegisterTiming("test", "test_field2_ns", map[string]string{"test": "foo"})
+	assert.Equal(t, int64(101), foo.Get())
+
+	// check that tags are consistent
+	assert.Equal(t, map[string]string{"test": "foo"}, foo.Tags())
+	// the measurement name carries the "internal_" prefix added by RegisterTiming
+	assert.Equal(t, "internal_test", foo.Name())
+}
+
+// TestStatKeyConsistency checks that Key() is stable: repeated calls on one
+// stat keep returning the same value, and a separately built stat with the
+// same measurement, field and tags produces that value as well.
+func TestStatKeyConsistency(t *testing.T) {
+	build := func() *stat {
+		return &stat{
+			measurement: "internal_stat",
+			field:       "myfield",
+			tags: map[string]string{
+				"foo":   "bar",
+				"bar":   "baz",
+				"whose": "first",
+			},
+		}
+	}
+
+	base := build()
+	want := base.Key()
+	for n := 0; n < 5000; n++ {
+		// calling Key() again must not change its result
+		assert.Equal(t, want, base.Key())
+
+		// an identical, freshly built stat must hash to the same key
+		assert.Equal(t, want, build().Key())
+	}
+}
+
+// TestRegisterMetricsAndVerify registers timing and regular stats under
+// several measurement/tag combinations, checks how many metrics Metrics()
+// returns after each step, and finally verifies the fields and tags of every
+// metric through a testutil.Accumulator.
+func TestRegisterMetricsAndVerify(t *testing.T) {
+	testLock.Lock()
+	defer testCleanup()
+
+	// register two metrics with the same key
+	s1 := RegisterTiming("test_timing", "test_field1_ns", map[string]string{"test": "foo"})
+	s2 := RegisterTiming("test_timing", "test_field2_ns", map[string]string{"test": "foo"})
+	s1.Incr(10)
+	s2.Incr(15)
+	// same measurement and tags, so both fields end up in a single metric
+	assert.Len(t, Metrics(), 1)
+
+	// register two more metrics with different keys
+	s3 := RegisterTiming("test_timing", "test_field1_ns", map[string]string{"test": "bar"})
+	s4 := RegisterTiming("test_timing", "test_field2_ns", map[string]string{"test": "baz"})
+	s3.Incr(10)
+	s4.Incr(15)
+	assert.Len(t, Metrics(), 3)
+
+	// register some non-timing metrics
+	s5 := Register("test", "test_field1", map[string]string{"test": "bar"})
+	s6 := Register("test", "test_field2", map[string]string{"test": "baz"})
+	// test_field3 shares measurement and tags with s6 and is never changed
+	Register("test", "test_field3", map[string]string{"test": "baz"})
+	s5.Incr(10)
+	s5.Incr(18)
+	s6.Incr(15)
+	assert.Len(t, Metrics(), 5)
+
+	// Metrics() has already been called above; timing stats without new
+	// timings report their previous average, so the values below still hold.
+	acc := testutil.Accumulator{}
+	acc.AddMetrics(Metrics())
+
+	// verify s1 & s2
+	acc.AssertContainsTaggedFields(t, "internal_test_timing",
+		map[string]interface{}{
+			"test_field1_ns": int64(10),
+			"test_field2_ns": int64(15),
+		},
+		map[string]string{
+			"test": "foo",
+		},
+	)
+
+	// verify s3
+	acc.AssertContainsTaggedFields(t, "internal_test_timing",
+		map[string]interface{}{
+			"test_field1_ns": int64(10),
+		},
+		map[string]string{
+			"test": "bar",
+		},
+	)
+
+	// verify s4
+	acc.AssertContainsTaggedFields(t, "internal_test_timing",
+		map[string]interface{}{
+			"test_field2_ns": int64(15),
+		},
+		map[string]string{
+			"test": "baz",
+		},
+	)
+
+	// verify s5
+	acc.AssertContainsTaggedFields(t, "internal_test",
+		map[string]interface{}{
+			"test_field1": int64(28),
+		},
+		map[string]string{
+			"test": "bar",
+		},
+	)
+
+	// verify s6 and the unnamed test_field3 stat registered alongside it
+	acc.AssertContainsTaggedFields(t, "internal_test",
+		map[string]interface{}{
+			"test_field2": int64(15),
+			"test_field3": int64(0),
+		},
+		map[string]string{
+			"test": "baz",
+		},
+	)
+}
--- a/selfstat/stat.go
+++ b/selfstat/stat.go
@@ -0,0 +1,50 @@
+package selfstat
+
+import (
+	"sync/atomic"
+)
+
+// stat is the regular Stat implementation: a single int64 value that can be
+// incremented, overwritten and read.
+type stat struct {
+	// v is the current value; the methods below access it only through
+	// sync/atomic.
+	// NOTE(review): v is the first field, which sync/atomic documents as
+	// 64-bit aligned on 32-bit platforms - keep it first.
+	v           int64
+	measurement string
+	field       string
+	tags        map[string]string
+	// key caches the result of Key(); zero means "not computed yet".
+	key         uint64
+}
+
+// Name returns the measurement name of the stat.
+func (s *stat) Name() string {
+	return s.measurement
+}
+
+// FieldName returns the field name of the stat.
+func (s *stat) FieldName() string {
+	return s.field
+}
+
+// Tags hands out a copy of the stat's tags, so callers may modify the result
+// freely.
+// NOTE a new map is allocated on every call.
+func (s *stat) Tags() map[string]string {
+	out := make(map[string]string, len(s.tags))
+	for name := range s.tags {
+		out[name] = s.tags[name]
+	}
+	return out
+}
+
+// Get atomically reads the current value.
+func (s *stat) Get() int64 {
+	return atomic.LoadInt64(&s.v)
+}
+
+// Set atomically replaces the current value with v.
+func (s *stat) Set(v int64) {
+	atomic.StoreInt64(&s.v, v)
+}
+
+// Incr atomically adds v, which may be negative, to the current value.
+func (s *stat) Incr(v int64) {
+	atomic.AddInt64(&s.v, v)
+}
+
+// Key returns the hash of the stat's measurement name and tags. The hash is
+// computed on the first call and cached in s.key; a zero s.key is treated as
+// "nothing cached yet".
+func (s *stat) Key() uint64 {
+	if s.key != 0 {
+		return s.key
+	}
+	s.key = key(s.measurement, s.tags)
+	return s.key
+}
--- a/selfstat/timingStat.go
+++ b/selfstat/timingStat.go
@@ -0,0 +1,66 @@
+package selfstat
+
+import (
+	"sync"
+)
+
+// timingStat is the Stat implementation for timings: it sums up the timings
+// it receives and reports their average.
+type timingStat struct {
+	measurement string
+	field       string
+	tags        map[string]string
+	// key caches the result of Key(); zero means "not computed yet".
+	key         uint64
+	// v is the sum of the timings received since the last Get().
+	v           int64
+	// prev is the average returned by the most recent Get() that had timings.
+	prev        int64
+	// count is the number of timings received since the last Get().
+	count       int64
+	// mu guards v, prev and count.
+	mu          sync.Mutex
+}
+
+// Name returns the measurement name of the stat.
+func (s *timingStat) Name() string {
+	return s.measurement
+}
+
+// FieldName returns the field name of the stat.
+func (s *timingStat) FieldName() string {
+	return s.field
+}
+
+// Tags hands out a copy of the timingStat's tags, so callers may modify the
+// result freely.
+// NOTE a new map is allocated on every call.
+func (s *timingStat) Tags() map[string]string {
+	out := make(map[string]string, len(s.tags))
+	for name := range s.tags {
+		out[name] = s.tags[name]
+	}
+	return out
+}
+
+// Incr records one more timing of v.
+func (s *timingStat) Incr(v int64) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.count++
+	s.v += v
+}
+
+// Set behaves exactly like Incr: it records one more timing of v.
+func (s *timingStat) Set(v int64) {
+	s.Incr(v)
+}
+
+// Get returns the average (integer division) of the timings recorded since
+// the previous call to Get and clears them. When no timings were recorded in
+// the meantime, the previously returned average is returned again.
+func (s *timingStat) Get() int64 {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if s.count <= 0 {
+		return s.prev
+	}
+	s.prev = s.v / s.count
+	s.v, s.count = 0, 0
+	return s.prev
+}
+
+// Key returns the hash of the timingStat's measurement name and tags. The
+// hash is computed on the first call and cached in s.key; a zero s.key is
+// treated as "nothing cached yet".
+func (s *timingStat) Key() uint64 {
+	if s.key != 0 {
+		return s.key
+	}
+	s.key = key(s.measurement, s.tags)
+	return s.key
+}