From c114849a314826ba074d843ec476f0b6c15cd048 Mon Sep 17 00:00:00 2001 From: Cameron Sparr Date: Sun, 1 May 2016 10:20:15 -0600 Subject: [PATCH] Use a timeout for docker list & stat cmds closes #1133 --- CHANGELOG.md | 2 ++ etc/telegraf.conf | 5 +++++ plugins/inputs/docker/docker.go | 17 +++++++++++++---- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad2207a43..5e89bbd72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -88,6 +88,8 @@ based on _prefix_ in addition to globs. This means that a filter like - [#1125](https://github.com/influxdata/telegraf/pull/1125): Wrap all exec command runners with a timeout, so hung os processes don't halt Telegraf. - [#1113](https://github.com/influxdata/telegraf/pull/1113): Set MaxRetry and RequiredAcks defaults in Kafka output. - [#1090](https://github.com/influxdata/telegraf/issues/1090): [agent] and [global_tags] config sometimes not getting applied. +- [#1133](https://github.com/influxdata/telegraf/issues/1133): Use a timeout for docker list & stat cmds. +- [#1052](https://github.com/influxdata/telegraf/issues/1052): Docker panic fix when decode fails. ## v0.12.1 [2016-04-14] diff --git a/etc/telegraf.conf b/etc/telegraf.conf index 40e126d94..f57bd1410 100644 --- a/etc/telegraf.conf +++ b/etc/telegraf.conf @@ -565,6 +565,8 @@ # endpoint = "unix:///var/run/docker.sock" # ## Only collect metrics for these containers, collect all if empty # container_names = [] +# ## Timeout for docker list, info, and stats commands +# timeout = "5s" # # Read statistics from one or many dovecot servers @@ -600,6 +602,9 @@ # ## Commands array # commands = ["/tmp/test.sh", "/usr/bin/mycollector --foo=bar"] # +# ## Timeout for each command to complete. +# timeout = "5s" +# # ## measurement name suffix (for separating different commands) # name_suffix = "_mycollector" # diff --git a/plugins/inputs/docker/docker.go b/plugins/inputs/docker/docker.go index 4241f6b5d..8a680a8e8 100644 --- a/plugins/inputs/docker/docker.go +++ b/plugins/inputs/docker/docker.go @@ -16,6 +16,7 @@ import ( "github.com/docker/engine-api/client" "github.com/docker/engine-api/types" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/plugins/inputs" ) @@ -23,6 +24,7 @@ import ( type Docker struct { Endpoint string ContainerNames []string + Timeout internal.Duration client DockerClient } @@ -54,6 +56,8 @@ var sampleConfig = ` endpoint = "unix:///var/run/docker.sock" ## Only collect metrics for these containers, collect all if empty container_names = [] + ## Timeout for docker list, info, and stats commands + timeout = "5s" ` // Description returns input description @@ -97,7 +101,9 @@ func (d *Docker) Gather(acc telegraf.Accumulator) error { // List containers opts := types.ContainerListOptions{} - containers, err := d.client.ContainerList(context.Background(), opts) + ctx, cancel := context.WithTimeout(context.Background(), d.Timeout.Duration) + defer cancel() + containers, err := d.client.ContainerList(ctx, opts) if err != nil { return err } @@ -106,7 +112,6 @@ func (d *Docker) Gather(acc telegraf.Accumulator) error { var wg sync.WaitGroup wg.Add(len(containers)) for _, container := range containers { - go func(c types.Container) { defer wg.Done() err := d.gatherContainer(c, acc) @@ -127,7 +132,9 @@ func (d *Docker) gatherInfo(acc telegraf.Accumulator) error { metadataFields := make(map[string]interface{}) now := time.Now() // Get info from docker daemon - info, err := d.client.Info(context.Background()) + ctx, cancel := context.WithTimeout(context.Background(), d.Timeout.Duration) + defer cancel() + info, err := d.client.Info(ctx) if err != nil { return err } @@ -210,7 +217,9 @@ func (d *Docker) gatherContainer( } } - r, err := d.client.ContainerStats(context.Background(), container.ID, false) + ctx, cancel := context.WithTimeout(context.Background(), d.Timeout.Duration) + defer cancel() + r, err := d.client.ContainerStats(ctx, container.ID, false) if err != nil { log.Printf("Error getting docker stats: %s\n", err.Error()) }