Copy metrics for each configured output

This improves thread-safety when running with multiple outputs: sharing a
single metric between outputs can cause very odd panics at very high loads.

Primarily, this is to address #1432.

closes #1432
This commit is contained in:
Cameron Sparr 2016-07-13 08:14:48 -06:00
parent 821d3fafa6
commit bfdd665435
6 changed files with 42 additions and 10 deletions

View File

@ -45,6 +45,7 @@ should now look like:
- [#1339](https://github.com/influxdata/telegraf/issues/1339): Prometheus client output panic on service reload. - [#1339](https://github.com/influxdata/telegraf/issues/1339): Prometheus client output panic on service reload.
- [#1461](https://github.com/influxdata/telegraf/pull/1461): Prometheus parser, protobuf format header fix. - [#1461](https://github.com/influxdata/telegraf/pull/1461): Prometheus parser, protobuf format header fix.
- [#1334](https://github.com/influxdata/telegraf/issues/1334): Prometheus output, metric refresh and caching fixes. - [#1334](https://github.com/influxdata/telegraf/issues/1334): Prometheus output, metric refresh and caching fixes.
- [#1432](https://github.com/influxdata/telegraf/issues/1432): Panic fix for multiple graphite outputs under very high load.
## v1.0 beta 2 [2016-06-21] ## v1.0 beta 2 [2016-06-21]

View File

@ -25,10 +25,6 @@ build-for-docker:
"-s -X main.version=$(VERSION)" \ "-s -X main.version=$(VERSION)" \
./cmd/telegraf/telegraf.go ./cmd/telegraf/telegraf.go
# Build with race detector
dev: prepare
go build -race -ldflags "-X main.version=$(VERSION)" ./...
# run package script # run package script
package: package:
./scripts/build.py --package --version="$(VERSION)" --platform=linux --arch=all --upload ./scripts/build.py --package --version="$(VERSION)" --platform=linux --arch=all --upload

View File

@ -268,13 +268,33 @@ func (a *Agent) flusher(shutdown chan struct{}, metricC chan telegraf.Metric) er
internal.RandomSleep(a.Config.Agent.FlushJitter.Duration, shutdown) internal.RandomSleep(a.Config.Agent.FlushJitter.Duration, shutdown)
a.flush() a.flush()
case m := <-metricC: case m := <-metricC:
for _, o := range a.Config.Outputs { for i, o := range a.Config.Outputs {
o.AddMetric(m) if i == len(a.Config.Outputs)-1 {
o.AddMetric(m)
} else {
o.AddMetric(copyMetric(m))
}
} }
} }
} }
} }
// copyMetric builds a new metric from the name, tags, fields and timestamp
// of m, so that a caller can hand separate outputs separate metric values
// instead of one shared instance.
//
// The tag and field maps are copied key by key. This is a shallow copy:
// the field values themselves are not cloned.
//
// If telegraf.NewMetric reports an error, the original metric m is returned
// rather than the result of the failed construction.
func copyMetric(m telegraf.Metric) telegraf.Metric {
	// Read each collection once and size the copies to match.
	srcTags := m.Tags()
	srcFields := m.Fields()

	tags := make(map[string]string, len(srcTags))
	for k, v := range srcTags {
		tags[k] = v
	}

	fields := make(map[string]interface{}, len(srcFields))
	for k, v := range srcFields {
		fields[k] = v
	}

	// NOTE(review): the explicit conversion is kept from the original; it is
	// likely redundant if Time() already returns a time.Time — confirm.
	t := time.Time(m.Time())

	out, err := telegraf.NewMetric(m.Name(), tags, fields, t)
	if err != nil {
		// NOTE(review): presumably NewMetric returns a nil metric on error,
		// which would be passed straight to an output's AddMetric. Falling
		// back to the source metric keeps the data flowing; verify that this
		// is the desired policy.
		return m
	}
	return out
}
// Run runs the agent daemon, gathering every Interval // Run runs the agent daemon, gathering every Interval
func (a *Agent) Run(shutdown chan struct{}) error { func (a *Agent) Run(shutdown chan struct{}) error {
var wg sync.WaitGroup var wg sync.WaitGroup

View File

@ -31,6 +31,8 @@ type TcpListener struct {
accept chan bool accept chan bool
// drops tracks the number of dropped metrics. // drops tracks the number of dropped metrics.
drops int drops int
// malformed tracks the number of malformed packets
malformed int
// track the listener here so we can close it in Stop() // track the listener here so we can close it in Stop()
listener *net.TCPListener listener *net.TCPListener
@ -45,6 +47,9 @@ var dropwarn = "ERROR: tcp_listener message queue full. " +
"We have dropped %d messages so far. " + "We have dropped %d messages so far. " +
"You may want to increase allowed_pending_messages in the config\n" "You may want to increase allowed_pending_messages in the config\n"
var malformedwarn = "WARNING: tcp_listener has received %d malformed packets" +
" thus far."
const sampleConfig = ` const sampleConfig = `
## Address and port to host TCP listener on ## Address and port to host TCP listener on
service_address = ":8094" service_address = ":8094"
@ -243,8 +248,10 @@ func (t *TcpListener) tcpParser() error {
if err == nil { if err == nil {
t.storeMetrics(metrics) t.storeMetrics(metrics)
} else { } else {
log.Printf("Malformed packet: [%s], Error: %s\n", t.malformed++
string(packet), err) if t.malformed == 1 || t.malformed%1000 == 0 {
log.Printf(malformedwarn, t.malformed)
}
} }
} }
} }

View File

@ -27,6 +27,8 @@ type UdpListener struct {
done chan struct{} done chan struct{}
// drops tracks the number of dropped metrics. // drops tracks the number of dropped metrics.
drops int drops int
// malformed tracks the number of malformed packets
malformed int
parser parsers.Parser parser parsers.Parser
@ -44,6 +46,9 @@ var dropwarn = "ERROR: udp_listener message queue full. " +
"We have dropped %d messages so far. " + "We have dropped %d messages so far. " +
"You may want to increase allowed_pending_messages in the config\n" "You may want to increase allowed_pending_messages in the config\n"
var malformedwarn = "WARNING: udp_listener has received %d malformed packets" +
" thus far."
const sampleConfig = ` const sampleConfig = `
## Address and port to host UDP listener on ## Address and port to host UDP listener on
service_address = ":8092" service_address = ":8092"
@ -152,7 +157,10 @@ func (u *UdpListener) udpParser() error {
if err == nil { if err == nil {
u.storeMetrics(metrics) u.storeMetrics(metrics)
} else { } else {
log.Printf("Malformed packet: [%s], Error: %s\n", packet, err) u.malformed++
if u.malformed == 1 || u.malformed%1000 == 0 {
log.Printf(malformedwarn, u.malformed)
}
} }
} }
} }

View File

@ -20,7 +20,7 @@ type GraphiteSerializer struct {
Template string Template string
} }
func (s GraphiteSerializer) Serialize(metric telegraf.Metric) ([]string, error) { func (s *GraphiteSerializer) Serialize(metric telegraf.Metric) ([]string, error) {
out := []string{} out := []string{}
// Convert UnixNano to Unix timestamps // Convert UnixNano to Unix timestamps