Fixup random interval jittering
This commit is contained in:
parent
7d15061984
commit
889c0a50a4
|
@ -14,8 +14,7 @@ even interval. This means that `interval="10s"` will collect every :00, :10, etc
|
||||||
To ease scale concerns, flushing will be "jittered" by a random amount so that
|
To ease scale concerns, flushing will be "jittered" by a random amount so that
|
||||||
all Telegraf instances do not flush at the same time. Both of these options can
|
all Telegraf instances do not flush at the same time. Both of these options can
|
||||||
be controlled via the `round_interval` and `flush_jitter` config options.
|
be controlled via the `round_interval` and `flush_jitter` config options.
|
||||||
- Telegraf will now retry metric flushes, twice by default. This can be configured
|
- Telegraf will now retry metric flushes twice
|
||||||
via the `flush_retries` agent config option.
|
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
- [#205](https://github.com/influxdb/telegraf/issues/205): Include per-db redis keyspace info
|
- [#205](https://github.com/influxdb/telegraf/issues/205): Include per-db redis keyspace info
|
||||||
|
|
42
agent.go
42
agent.go
|
@ -1,9 +1,10 @@
|
||||||
package telegraf
|
package telegraf
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"crypto/rand"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"math/rand"
|
"math/big"
|
||||||
"os"
|
"os"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
|
@ -381,9 +382,10 @@ func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) er
|
||||||
// Inelegant, but this sleep is to allow the Gather threads to run, so that
|
// Inelegant, but this sleep is to allow the Gather threads to run, so that
|
||||||
// the flusher will flush after metrics are collected.
|
// the flusher will flush after metrics are collected.
|
||||||
time.Sleep(time.Millisecond * 100)
|
time.Sleep(time.Millisecond * 100)
|
||||||
|
|
||||||
ticker := time.NewTicker(a.FlushInterval.Duration)
|
ticker := time.NewTicker(a.FlushInterval.Duration)
|
||||||
points := make([]*client.Point, 0)
|
points := make([]*client.Point, 0)
|
||||||
jitter := rand.Int63n(int64(a.FlushJitter.Duration))
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-shutdown:
|
case <-shutdown:
|
||||||
|
@ -391,15 +393,7 @@ func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) er
|
||||||
a.flush(points, shutdown, true)
|
a.flush(points, shutdown, true)
|
||||||
return nil
|
return nil
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
timer := time.NewTimer(time.Duration(jitter))
|
|
||||||
select {
|
|
||||||
case <-timer.C:
|
|
||||||
a.flush(points, shutdown, false)
|
a.flush(points, shutdown, false)
|
||||||
case <-shutdown:
|
|
||||||
log.Println("Hang on, flushing any cached points before shutdown")
|
|
||||||
a.flush(points, shutdown, true)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
points = make([]*client.Point, 0)
|
points = make([]*client.Point, 0)
|
||||||
case pt := <-pointChan:
|
case pt := <-pointChan:
|
||||||
points = append(points, pt)
|
points = append(points, pt)
|
||||||
|
@ -407,10 +401,38 @@ func (a *Agent) flusher(shutdown chan struct{}, pointChan chan *client.Point) er
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// jitterInterval returns the flush interval to use: ininterval plus a random
// jitter drawn from [0, injitter) using the crypto/rand number generator.
//
// A zero or negative injitter adds no jitter; rand.Int panics when its upper
// bound is not positive, so such values must never reach it. If the random
// source returns an error, the error is logged and no jitter is applied.
// A resulting interval below 500ms is raised to 500ms.
func jitterInterval(ininterval, injitter time.Duration) time.Duration {
	const minInterval = 500 * time.Millisecond

	outinterval := ininterval
	if injitter > 0 {
		j, err := rand.Int(rand.Reader, big.NewInt(int64(injitter)))
		if err != nil {
			// Best effort: flushing without jitter beats not flushing at all.
			log.Printf("Unable to generate flush jitter, continuing without it: %s\n", err)
		} else {
			outinterval += time.Duration(j.Int64())
		}
	}

	if outinterval < minInterval {
		log.Printf("Flush interval %s too low, setting to 500ms\n", outinterval)
		outinterval = minInterval
	}

	return outinterval
}
|
||||||
|
|
||||||
// Run runs the agent daemon, gathering every Interval
|
// Run runs the agent daemon, gathering every Interval
|
||||||
func (a *Agent) Run(shutdown chan struct{}) error {
|
func (a *Agent) Run(shutdown chan struct{}) error {
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
a.FlushInterval.Duration = jitterInterval(a.FlushInterval.Duration,
|
||||||
|
a.FlushJitter.Duration)
|
||||||
|
|
||||||
|
log.Printf("Agent Config: Interval:%s, Debug:%#v, Hostname:%#v, "+
|
||||||
|
"Flush Interval:%s\n",
|
||||||
|
a.Interval, a.Debug, a.Hostname, a.FlushInterval)
|
||||||
|
|
||||||
// channel shared between all plugin threads for accumulating points
|
// channel shared between all plugin threads for accumulating points
|
||||||
pointChan := make(chan *client.Point, 1000)
|
pointChan := make(chan *client.Point, 1000)
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,9 @@ package telegraf
|
||||||
import (
|
import (
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/influxdb/telegraf/duration"
|
||||||
|
|
||||||
// needing to load the plugins
|
// needing to load the plugins
|
||||||
_ "github.com/influxdb/telegraf/plugins/all"
|
_ "github.com/influxdb/telegraf/plugins/all"
|
||||||
|
@ -55,3 +58,97 @@ func TestAgent_LoadOutput(t *testing.T) {
|
||||||
outputsEnabled, _ = a.LoadOutputs([]string{"influxdb", "foo", "kafka", "bar"}, config)
|
outputsEnabled, _ = a.LoadOutputs([]string{"influxdb", "foo", "kafka", "bar"}, config)
|
||||||
assert.Equal(t, 2, len(outputsEnabled))
|
assert.Equal(t, 2, len(outputsEnabled))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAgent_ZeroJitter(t *testing.T) {
|
||||||
|
a := &Agent{
|
||||||
|
FlushInterval: duration.Duration{10 * time.Second},
|
||||||
|
FlushJitter: duration.Duration{0 * time.Second},
|
||||||
|
}
|
||||||
|
flushinterval := jitterInterval(a.FlushInterval.Duration,
|
||||||
|
a.FlushJitter.Duration)
|
||||||
|
|
||||||
|
actual := flushinterval.Nanoseconds()
|
||||||
|
exp := time.Duration(10 * time.Second).Nanoseconds()
|
||||||
|
|
||||||
|
if actual != exp {
|
||||||
|
t.Errorf("Actual %v, expected %v", actual, exp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAgent_ZeroInterval(t *testing.T) {
|
||||||
|
min := time.Duration(500 * time.Millisecond).Nanoseconds()
|
||||||
|
max := time.Duration(5 * time.Second).Nanoseconds()
|
||||||
|
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
a := &Agent{
|
||||||
|
FlushInterval: duration.Duration{0 * time.Second},
|
||||||
|
FlushJitter: duration.Duration{5 * time.Second},
|
||||||
|
}
|
||||||
|
|
||||||
|
flushinterval := jitterInterval(a.FlushInterval.Duration,
|
||||||
|
a.FlushJitter.Duration)
|
||||||
|
actual := flushinterval.Nanoseconds()
|
||||||
|
|
||||||
|
if actual > max {
|
||||||
|
t.Errorf("Didn't expect interval %d to be > %d", actual, max)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if actual < min {
|
||||||
|
t.Errorf("Didn't expect interval %d to be < %d", actual, min)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAgent_ZeroBoth(t *testing.T) {
|
||||||
|
a := &Agent{
|
||||||
|
FlushInterval: duration.Duration{0 * time.Second},
|
||||||
|
FlushJitter: duration.Duration{0 * time.Second},
|
||||||
|
}
|
||||||
|
|
||||||
|
flushinterval := jitterInterval(a.FlushInterval.Duration,
|
||||||
|
a.FlushJitter.Duration)
|
||||||
|
|
||||||
|
actual := flushinterval
|
||||||
|
exp := time.Duration(500 * time.Millisecond)
|
||||||
|
|
||||||
|
if actual != exp {
|
||||||
|
t.Errorf("Actual %v, expected %v", actual, exp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAgent_JitterMax(t *testing.T) {
|
||||||
|
max := time.Duration(32 * time.Second).Nanoseconds()
|
||||||
|
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
a := &Agent{
|
||||||
|
FlushInterval: duration.Duration{30 * time.Second},
|
||||||
|
FlushJitter: duration.Duration{2 * time.Second},
|
||||||
|
}
|
||||||
|
flushinterval := jitterInterval(a.FlushInterval.Duration,
|
||||||
|
a.FlushJitter.Duration)
|
||||||
|
actual := flushinterval.Nanoseconds()
|
||||||
|
if actual > max {
|
||||||
|
t.Errorf("Didn't expect interval %d to be > %d", actual, max)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAgent_JitterMin(t *testing.T) {
|
||||||
|
min := time.Duration(30 * time.Second).Nanoseconds()
|
||||||
|
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
a := &Agent{
|
||||||
|
FlushInterval: duration.Duration{30 * time.Second},
|
||||||
|
FlushJitter: duration.Duration{2 * time.Second},
|
||||||
|
}
|
||||||
|
flushinterval := jitterInterval(a.FlushInterval.Duration,
|
||||||
|
a.FlushJitter.Duration)
|
||||||
|
actual := flushinterval.Nanoseconds()
|
||||||
|
if actual < min {
|
||||||
|
t.Errorf("Didn't expect interval %d to be < %d", actual, min)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -143,9 +143,6 @@ func main() {
|
||||||
log.Printf("Starting Telegraf (version %s)\n", Version)
|
log.Printf("Starting Telegraf (version %s)\n", Version)
|
||||||
log.Printf("Loaded outputs: %s", strings.Join(outputs, " "))
|
log.Printf("Loaded outputs: %s", strings.Join(outputs, " "))
|
||||||
log.Printf("Loaded plugins: %s", strings.Join(plugins, " "))
|
log.Printf("Loaded plugins: %s", strings.Join(plugins, " "))
|
||||||
log.Printf("Agent Config: Interval:%s, Debug:%#v, Hostname:%#v, "+
|
|
||||||
"Flush Interval:%s\n",
|
|
||||||
ag.Interval, ag.Debug, ag.Hostname, ag.FlushInterval)
|
|
||||||
log.Printf("Tags enabled: %s", config.ListTags())
|
log.Printf("Tags enabled: %s", config.ListTags())
|
||||||
|
|
||||||
if *fPidfile != "" {
|
if *fPidfile != "" {
|
||||||
|
|
12
config.go
12
config.go
|
@ -230,13 +230,13 @@ var header = `# Telegraf configuration
|
||||||
# ie, if interval="10s" then always collect on :00, :10, :20, etc.
|
# ie, if interval="10s" then always collect on :00, :10, :20, etc.
|
||||||
round_interval = true
|
round_interval = true
|
||||||
|
|
||||||
# Default data flushing interval for all outputs
|
# Default data flushing interval for all outputs. You should not set this below
|
||||||
|
# interval. Maximum flush_interval will be flush_interval + flush_jitter
|
||||||
flush_interval = "10s"
|
flush_interval = "10s"
|
||||||
# Jitter the flush interval by a random range
|
# Jitter the flush interval by a random amount. This is primarily to avoid
|
||||||
# ie, a jitter of 5s and interval 10s means flush will happen every 10-15s
|
# large write spikes for users running a large number of telegraf instances.
|
||||||
flush_jitter = "5s"
|
# ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
|
||||||
# Number of times to retry each data flush
|
flush_jitter = "0s"
|
||||||
flush_retries = 2
|
|
||||||
|
|
||||||
# Run telegraf in debug mode
|
# Run telegraf in debug mode
|
||||||
debug = false
|
debug = false
|
||||||
|
|
Loading…
Reference in New Issue