Fixup random interval jittering

This commit is contained in:
Cameron Sparr 2015-10-23 11:23:08 -06:00
parent 7d15061984
commit 889c0a50a4
5 changed files with 137 additions and 22 deletions

View File

@ -14,8 +14,7 @@ even interval. This means that `interval="10s"` will collect every :00, :10, etc
To ease scale concerns, flushing will be "jittered" by a random amount so that To ease scale concerns, flushing will be "jittered" by a random amount so that
all Telegraf instances do not flush at the same time. Both of these options can all Telegraf instances do not flush at the same time. Both of these options can
be controlled via the `round_interval` and `flush_jitter` config options. be controlled via the `round_interval` and `flush_jitter` config options.
- Telegraf will now retry metric flushes, twice by default. This can be configured - Telegraf will now retry metric flushes twice
via the `flush_retries` agent config option.
### Features ### Features
- [#205](https://github.com/influxdb/telegraf/issues/205): Include per-db redis keyspace info - [#205](https://github.com/influxdb/telegraf/issues/205): Include per-db redis keyspace info

View File

@ -1,9 +1,10 @@
package telegraf package telegraf
import ( import (
"crypto/rand"
"fmt" "fmt"
"log" "log"
"math/rand" "math/big"
"os" "os"
"sort" "sort"
"sync" "sync"
@ -381,9 +382,10 @@ func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) er
// Inelegant, but this sleep is to allow the Gather threads to run, so that // Inelegant, but this sleep is to allow the Gather threads to run, so that
// the flusher will flush after metrics are collected. // the flusher will flush after metrics are collected.
time.Sleep(time.Millisecond * 100) time.Sleep(time.Millisecond * 100)
ticker := time.NewTicker(a.FlushInterval.Duration) ticker := time.NewTicker(a.FlushInterval.Duration)
points := make([]*client.Point, 0) points := make([]*client.Point, 0)
jitter := rand.Int63n(int64(a.FlushJitter.Duration))
for { for {
select { select {
case <-shutdown: case <-shutdown:
@ -391,15 +393,7 @@ func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) er
a.flush(points, shutdown, true) a.flush(points, shutdown, true)
return nil return nil
case <-ticker.C: case <-ticker.C:
timer := time.NewTimer(time.Duration(jitter))
select {
case <-timer.C:
a.flush(points, shutdown, false) a.flush(points, shutdown, false)
case <-shutdown:
log.Println("Hang on, flushing any cached points before shutdown")
a.flush(points, shutdown, true)
return nil
}
points = make([]*client.Point, 0) points = make([]*client.Point, 0)
case pt := <-pointChan: case pt := <-pointChan:
points = append(points, pt) points = append(points, pt)
@ -407,10 +401,38 @@ func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) er
} }
} }
// jitterInterval applies the interval jitter to the flush interval using the
// crypto/rand number generator.
//
// It returns ininterval lengthened by a random amount in [0, injitter). A zero
// or negative injitter adds nothing: crypto/rand.Int panics when its upper
// bound is <= 0, so it is only called for a strictly positive jitter. If the
// random source returns an error, the error is logged and no jitter is
// applied. A resulting interval below 500ms is logged and raised to 500ms.
func jitterInterval(ininterval, injitter time.Duration) time.Duration {
	const minInterval = 500 * time.Millisecond

	outinterval := ininterval
	if injitter > 0 {
		maxjitter := big.NewInt(injitter.Nanoseconds())
		j, err := rand.Int(rand.Reader, maxjitter)
		if err != nil {
			// Best effort: fall back to the un-jittered interval.
			log.Printf("Unable to generate flush jitter, using none: %s\n", err)
		} else {
			outinterval += time.Duration(j.Int64())
		}
	}

	if outinterval < minInterval {
		log.Printf("Flush interval %s too low, setting to 500ms\n", outinterval)
		outinterval = minInterval
	}
	return outinterval
}
// Run runs the agent daemon, gathering every Interval // Run runs the agent daemon, gathering every Interval
func (a *Agent) Run(shutdown chan struct{}) error { func (a *Agent) Run(shutdown chan struct{}) error {
var wg sync.WaitGroup var wg sync.WaitGroup
a.FlushInterval.Duration = jitterInterval(a.FlushInterval.Duration,
a.FlushJitter.Duration)
log.Printf("Agent Config: Interval:%s, Debug:%#v, Hostname:%#v, "+
"Flush Interval:%s\n",
a.Interval, a.Debug, a.Hostname, a.FlushInterval)
// channel shared between all plugin threads for accumulating points // channel shared between all plugin threads for accumulating points
pointChan := make(chan *client.Point, 1000) pointChan := make(chan *client.Point, 1000)

View File

@ -3,6 +3,9 @@ package telegraf
import ( import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"testing" "testing"
"time"
"github.com/influxdb/telegraf/duration"
// needing to load the plugins // needing to load the plugins
_ "github.com/influxdb/telegraf/plugins/all" _ "github.com/influxdb/telegraf/plugins/all"
@ -55,3 +58,97 @@ func TestAgent_LoadOutput(t *testing.T) {
outputsEnabled, _ = a.LoadOutputs([]string{"influxdb", "foo", "kafka", "bar"}, config) outputsEnabled, _ = a.LoadOutputs([]string{"influxdb", "foo", "kafka", "bar"}, config)
assert.Equal(t, 2, len(outputsEnabled)) assert.Equal(t, 2, len(outputsEnabled))
} }
// TestAgent_ZeroJitter checks that jitterInterval returns the configured 10s
// flush interval unchanged when the flush jitter is zero.
func TestAgent_ZeroJitter(t *testing.T) {
	agent := &Agent{
		FlushInterval: duration.Duration{Duration: 10 * time.Second},
		FlushJitter:   duration.Duration{Duration: 0},
	}

	want := int64(10 * time.Second)
	got := jitterInterval(
		agent.FlushInterval.Duration,
		agent.FlushJitter.Duration,
	).Nanoseconds()

	if got != want {
		t.Errorf("Actual %v, expected %v", got, want)
	}
}
// TestAgent_ZeroInterval calls jitterInterval 1000 times with a zero flush
// interval and a 5s jitter, and stops at the first result that falls outside
// the range [500ms, 5s].
func TestAgent_ZeroInterval(t *testing.T) {
	const runs = 1000
	lower := int64(500 * time.Millisecond)
	upper := int64(5 * time.Second)

	for n := 0; n < runs; n++ {
		agent := &Agent{
			FlushInterval: duration.Duration{Duration: 0},
			FlushJitter:   duration.Duration{Duration: 5 * time.Second},
		}
		got := jitterInterval(
			agent.FlushInterval.Duration,
			agent.FlushJitter.Duration,
		).Nanoseconds()

		// Report only the first violation; the upper bound is checked first.
		switch {
		case got > upper:
			t.Errorf("Didn't expect interval %d to be > %d", got, upper)
			return
		case got < lower:
			t.Errorf("Didn't expect interval %d to be < %d", got, lower)
			return
		}
	}
}
// TestAgent_ZeroBoth checks that jitterInterval returns 500ms when both the
// flush interval and the flush jitter are zero.
func TestAgent_ZeroBoth(t *testing.T) {
	agent := &Agent{
		FlushInterval: duration.Duration{Duration: 0},
		FlushJitter:   duration.Duration{Duration: 0},
	}

	const want = 500 * time.Millisecond
	got := jitterInterval(agent.FlushInterval.Duration, agent.FlushJitter.Duration)
	if got != want {
		t.Errorf("Actual %v, expected %v", got, want)
	}
}
// TestAgent_JitterMax calls jitterInterval 1000 times with a 30s flush
// interval and a 2s jitter, and stops at the first result above 32s.
func TestAgent_JitterMax(t *testing.T) {
	const runs = 1000
	upper := int64(32 * time.Second)

	for n := 0; n < runs; n++ {
		agent := &Agent{
			FlushInterval: duration.Duration{Duration: 30 * time.Second},
			FlushJitter:   duration.Duration{Duration: 2 * time.Second},
		}
		got := jitterInterval(
			agent.FlushInterval.Duration,
			agent.FlushJitter.Duration,
		).Nanoseconds()

		if got <= upper {
			continue
		}
		t.Errorf("Didn't expect interval %d to be > %d", got, upper)
		return
	}
}
// TestAgent_JitterMin calls jitterInterval 1000 times with a 30s flush
// interval and a 2s jitter, and stops at the first result below 30s.
func TestAgent_JitterMin(t *testing.T) {
	const runs = 1000
	lower := int64(30 * time.Second)

	for n := 0; n < runs; n++ {
		agent := &Agent{
			FlushInterval: duration.Duration{Duration: 30 * time.Second},
			FlushJitter:   duration.Duration{Duration: 2 * time.Second},
		}
		got := jitterInterval(
			agent.FlushInterval.Duration,
			agent.FlushJitter.Duration,
		).Nanoseconds()

		if got >= lower {
			continue
		}
		t.Errorf("Didn't expect interval %d to be < %d", got, lower)
		return
	}
}

View File

@ -143,9 +143,6 @@ func main() {
log.Printf("Starting Telegraf (version %s)\n", Version) log.Printf("Starting Telegraf (version %s)\n", Version)
log.Printf("Loaded outputs: %s", strings.Join(outputs, " ")) log.Printf("Loaded outputs: %s", strings.Join(outputs, " "))
log.Printf("Loaded plugins: %s", strings.Join(plugins, " ")) log.Printf("Loaded plugins: %s", strings.Join(plugins, " "))
log.Printf("Agent Config: Interval:%s, Debug:%#v, Hostname:%#v, "+
"Flush Interval:%s\n",
ag.Interval, ag.Debug, ag.Hostname, ag.FlushInterval)
log.Printf("Tags enabled: %s", config.ListTags()) log.Printf("Tags enabled: %s", config.ListTags())
if *fPidfile != "" { if *fPidfile != "" {

View File

@ -230,13 +230,13 @@ var header = `# Telegraf configuration
# ie, if interval="10s" then always collect on :00, :10, :20, etc. # ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true round_interval = true
# Default data flushing interval for all outputs # Default data flushing interval for all outputs. You should not set this below
# interval. Maximum flush_interval will be flush_interval + flush_jitter
flush_interval = "10s" flush_interval = "10s"
# Jitter the flush interval by a random range # Jitter the flush interval by a random amount. This is primarily to avoid
# ie, a jitter of 5s and interval 10s means flush will happen every 10-15s # large write spikes for users running a large number of telegraf instances.
flush_jitter = "5s" # ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
# Number of times to retry each data flush flush_jitter = "0s"
flush_retries = 2
# Run telegraf in debug mode # Run telegraf in debug mode
debug = false debug = false