shim improvements for docs, clean quit, and slow readers (#7452)

This commit is contained in:
Steven Soroka 2020-05-05 10:14:57 -04:00 committed by GitHub
parent 8ee12d07a1
commit cc927357a4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 159 additions and 104 deletions

View File

@ -1,9 +1,13 @@
# Execd Input Plugin # Execd Input Plugin
The `execd` plugin runs an external program as a daemon. The programs must output metrics in any one of the accepted The `execd` plugin runs an external program as a long-running daemon.
[Input Data Formats](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md) on its standard output. The programs must output metrics in any one of the accepted
[Input Data Formats][input_formats] on the process's STDOUT, and are expected to
stay running. If you'd instead like the process to collect metrics and then exit,
check out the [inputs.exec][exec_plugin] plugin.
The `signal` can be configured to send a signal to the running daemon on each collection interval. The `signal` can be configured to send a signal to the running daemon on each
collection interval.
Program output on standard error is mirrored to the telegraf log. Program output on standard error is mirrored to the telegraf log.
@ -16,10 +20,10 @@ Program output on standard error is mirrored to the telegraf log.
## Define how the process is signaled on each collection interval. ## Define how the process is signaled on each collection interval.
## Valid values are: ## Valid values are:
## "none" : Do not signal anything. ## "none" : Do not signal anything. (Recommended for service inputs)
## The process must output metrics by itself. ## The process must output metrics by itself.
## "STDIN" : Send a newline on STDIN. ## "STDIN" : Send a newline on STDIN. (Recommended for gather inputs)
## "SIGHUP" : Send a HUP signal. Not available on Windows. ## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended)
## "SIGUSR1" : Send a USR1 signal. Not available on Windows. ## "SIGUSR1" : Send a USR1 signal. Not available on Windows.
## "SIGUSR2" : Send a USR2 signal. Not available on Windows. ## "SIGUSR2" : Send a USR2 signal. Not available on Windows.
signal = "none" signal = "none"
@ -110,3 +114,6 @@ end
command = ["plugins/inputs/execd/examples/count.rb"] command = ["plugins/inputs/execd/examples/count.rb"]
signal = "none" signal = "none"
``` ```
[input_formats]: https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
[exec_plugin]: https://github.com/influxdata/telegraf/blob/master/plugins/inputs/exec/README.md

View File

@ -75,7 +75,7 @@ func (e *Execd) Start(acc telegraf.Accumulator) error {
return fmt.Errorf("FATAL no command specified") return fmt.Errorf("FATAL no command specified")
} }
e.wg.Add(1) e.wg.Add(1) // for the main loop
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
e.cancel = cancel e.cancel = cancel

View File

@ -5,7 +5,9 @@ package execd
import ( import (
"fmt" "fmt"
"io" "io"
"os"
"syscall" "syscall"
"time"
"github.com/influxdata/telegraf" "github.com/influxdata/telegraf"
) )
@ -23,6 +25,9 @@ func (e *Execd) Gather(acc telegraf.Accumulator) error {
case "SIGUSR2": case "SIGUSR2":
e.cmd.Process.Signal(syscall.SIGUSR2) e.cmd.Process.Signal(syscall.SIGUSR2)
case "STDIN": case "STDIN":
if osStdin, ok := e.stdin.(*os.File); ok {
osStdin.SetWriteDeadline(time.Now().Add(1 * time.Second))
}
if _, err := io.WriteString(e.stdin, "\n"); err != nil { if _, err := io.WriteString(e.stdin, "\n"); err != nil {
return fmt.Errorf("Error writing to stdin: %s", err) return fmt.Errorf("Error writing to stdin: %s", err)
} }

View File

@ -33,11 +33,12 @@ func TestExternalInputWorks(t *testing.T) {
require.NoError(t, e.Start(acc)) require.NoError(t, e.Start(acc))
require.NoError(t, e.Gather(acc)) require.NoError(t, e.Gather(acc))
e.Stop()
// grab a metric and make sure it's a thing // grab a metric and make sure it's a thing
m := readChanWithTimeout(t, metrics, 10*time.Second) m := readChanWithTimeout(t, metrics, 10*time.Second)
e.Stop()
require.Equal(t, "counter_bash", m.Name()) require.Equal(t, "counter_bash", m.Name())
val, ok := m.GetField("count") val, ok := m.GetField("count")
require.True(t, ok) require.True(t, ok)

View File

@ -5,6 +5,8 @@ package execd
import ( import (
"fmt" "fmt"
"io" "io"
"os"
"time"
"github.com/influxdata/telegraf" "github.com/influxdata/telegraf"
) )
@ -16,6 +18,9 @@ func (e *Execd) Gather(acc telegraf.Accumulator) error {
switch e.Signal { switch e.Signal {
case "STDIN": case "STDIN":
if osStdin, ok := e.stdin.(*os.File); ok {
osStdin.SetWriteDeadline(time.Now().Add(1 * time.Second))
}
if _, err := io.WriteString(e.stdin, "\n"); err != nil { if _, err := io.WriteString(e.stdin, "\n"); err != nil {
return fmt.Errorf("Error writing to stdin: %s", err) return fmt.Errorf("Error writing to stdin: %s", err)
} }

View File

@ -23,9 +23,9 @@ import (
type empty struct{} type empty struct{}
var ( var (
gatherPromptChans []chan empty
stdout io.Writer = os.Stdout stdout io.Writer = os.Stdout
stdin io.Reader = os.Stdin stdin io.Reader = os.Stdin
forever = 100 * 365 * 24 * time.Hour
) )
const ( const (
@ -34,10 +34,15 @@ const (
PollIntervalDisabled = time.Duration(0) PollIntervalDisabled = time.Duration(0)
) )
// Shim allows you to wrap your inputs and run them as if they were part of Telegraf,
// except built externally.
type Shim struct { type Shim struct {
Inputs []telegraf.Input Inputs []telegraf.Input
gatherPromptChans []chan empty
metricCh chan telegraf.Metric
} }
// New creates a new shim interface
func New() *Shim { func New() *Shim {
return &Shim{} return &Shim{}
} }
@ -67,25 +72,26 @@ func (s *Shim) AddInputs(newInputs []telegraf.Input) error {
// Run the input plugins.. // Run the input plugins..
func (s *Shim) Run(pollInterval time.Duration) error { func (s *Shim) Run(pollInterval time.Duration) error {
// context is used only to close the stdin reader. everything else cascades
// from that point and closes cleanly when it's done.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
s.metricCh = make(chan telegraf.Metric, 1)
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
quit := make(chan os.Signal, 1) quit := make(chan os.Signal, 1)
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM) signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
collectMetricsPrompt := make(chan os.Signal, 1) collectMetricsPrompt := make(chan os.Signal, 1)
listenForCollectMetricsSignals(collectMetricsPrompt) listenForCollectMetricsSignals(ctx, collectMetricsPrompt)
wg.Add(1) // wait for the metric channel to close
metricCh := make(chan telegraf.Metric, 1)
serializer := influx.NewSerializer() serializer := influx.NewSerializer()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
for _, input := range s.Inputs { for _, input := range s.Inputs {
wrappedInput := inputShim{Input: input} wrappedInput := inputShim{Input: input}
acc := agent.NewAccumulator(wrappedInput, metricCh) acc := agent.NewAccumulator(wrappedInput, s.metricCh)
acc.SetPrecision(time.Nanosecond) acc.SetPrecision(time.Nanosecond)
if serviceInput, ok := input.(telegraf.ServiceInput); ok { if serviceInput, ok := input.(telegraf.ServiceInput); ok {
@ -94,30 +100,35 @@ func (s *Shim) Run(pollInterval time.Duration) error {
} }
} }
gatherPromptCh := make(chan empty, 1) gatherPromptCh := make(chan empty, 1)
gatherPromptChans = append(gatherPromptChans, gatherPromptCh) s.gatherPromptChans = append(s.gatherPromptChans, gatherPromptCh)
wg.Add(1) wg.Add(1)
go func(input telegraf.Input) { go func(input telegraf.Input) {
startGathering(ctx, input, acc, gatherPromptCh, pollInterval) startGathering(ctx, input, acc, gatherPromptCh, pollInterval)
if serviceInput, ok := input.(telegraf.ServiceInput); ok { if serviceInput, ok := input.(telegraf.ServiceInput); ok {
serviceInput.Stop() serviceInput.Stop()
} }
close(gatherPromptCh)
wg.Done() wg.Done()
}(input) }(input)
} }
go stdinCollectMetricsPrompt(ctx, collectMetricsPrompt) go s.stdinCollectMetricsPrompt(ctx, cancel, collectMetricsPrompt)
go s.closeMetricChannelWhenInputsFinish(&wg)
loop: loop:
for { for {
select { select {
case <-quit: case <-quit: // user-triggered quit
// cancel, but keep looping until the metric channel closes. // cancel, but keep looping until the metric channel closes.
cancel() cancel()
case <-collectMetricsPrompt: case _, open := <-collectMetricsPrompt:
collectMetrics(ctx) if !open { // stdin-close-triggered quit
case m, open := <-metricCh: cancel()
continue
}
s.collectMetrics(ctx)
case m, open := <-s.metricCh:
if !open { if !open {
wg.Done()
break loop break loop
} }
b, err := serializer.Serialize(m) b, err := serializer.Serialize(m)
@ -129,7 +140,6 @@ loop:
} }
} }
wg.Wait()
return nil return nil
} }
@ -142,11 +152,16 @@ func hasQuit(ctx context.Context) bool {
} }
} }
func stdinCollectMetricsPrompt(ctx context.Context, collectMetricsPrompt chan<- os.Signal) { func (s *Shim) stdinCollectMetricsPrompt(ctx context.Context, cancel context.CancelFunc, collectMetricsPrompt chan<- os.Signal) {
s := bufio.NewScanner(stdin) defer func() {
cancel()
close(collectMetricsPrompt)
}()
scanner := bufio.NewScanner(stdin)
// for every line read from stdin, make sure we're not supposed to quit, // for every line read from stdin, make sure we're not supposed to quit,
// then push a message on to the collectMetricsPrompt // then push a message on to the collectMetricsPrompt
for s.Scan() { for scanner.Scan() {
// first check if we should quit // first check if we should quit
if hasQuit(ctx) { if hasQuit(ctx) {
return return
@ -159,7 +174,7 @@ func stdinCollectMetricsPrompt(ctx context.Context, collectMetricsPrompt chan<-
// pushCollectMetricsRequest pushes a non-blocking (nil) message to the // pushCollectMetricsRequest pushes a non-blocking (nil) message to the
// collectMetricsPrompt channel to trigger metric collection. // collectMetricsPrompt channel to trigger metric collection.
// The channel is defined with a buffer of 1, so if it's full, duplicated // The channel is defined with a buffer of 1, so while it's full, subsequent
// requests are discarded. // requests are discarded.
func pushCollectMetricsRequest(collectMetricsPrompt chan<- os.Signal) { func pushCollectMetricsRequest(collectMetricsPrompt chan<- os.Signal) {
select { select {
@ -168,14 +183,14 @@ func pushCollectMetricsRequest(collectMetricsPrompt chan<- os.Signal) {
} }
} }
func collectMetrics(ctx context.Context) { func (s *Shim) collectMetrics(ctx context.Context) {
if hasQuit(ctx) { if hasQuit(ctx) {
return return
} }
for i := 0; i < len(gatherPromptChans); i++ { for i := 0; i < len(s.gatherPromptChans); i++ {
// push a message out to each channel to collect metrics. don't block. // push a message out to each channel to collect metrics. don't block.
select { select {
case gatherPromptChans[i] <- empty{}: case s.gatherPromptChans[i] <- empty{}:
default: default:
} }
} }
@ -196,7 +211,11 @@ func startGathering(ctx context.Context, input telegraf.Input, acc telegraf.Accu
select { select {
case <-ctx.Done(): case <-ctx.Done():
return return
case <-gatherPromptCh: case _, open := <-gatherPromptCh:
if !open {
// stdin has closed.
return
}
if err := input.Gather(acc); err != nil { if err := input.Gather(acc); err != nil {
fmt.Fprintf(os.Stderr, "failed to gather metrics: %s", err) fmt.Fprintf(os.Stderr, "failed to gather metrics: %s", err)
} }
@ -229,7 +248,7 @@ func DefaultImportedPlugins() (i []telegraf.Input, e error) {
// LoadConfig loads the config and returns inputs that later need to be loaded. // LoadConfig loads the config and returns inputs that later need to be loaded.
func LoadConfig(filePath *string) ([]telegraf.Input, error) { func LoadConfig(filePath *string) ([]telegraf.Input, error) {
if filePath == nil { if filePath == nil || *filePath == "" {
return DefaultImportedPlugins() return DefaultImportedPlugins()
} }
@ -276,3 +295,8 @@ func loadConfigIntoInputs(md toml.MetaData, inputConfigs map[string][]toml.Primi
} }
return renderedInputs, nil return renderedInputs, nil
} }
// closeMetricChannelWhenInputsFinish blocks until wg's counter reaches zero and
// then closes s.metricCh. Run's select loop breaks out once it observes that
// the metric channel has been closed, so this is what lets Run return.
func (s *Shim) closeMetricChannelWhenInputsFinish(wg *sync.WaitGroup) {
wg.Wait()
// Close only after the wait, so no sender is left holding the channel.
// NOTE(review): assumes every writer to s.metricCh is tracked by wg — confirm.
close(s.metricCh)
}

View File

@ -1,14 +0,0 @@
// +build !windows
package shim
import (
"os"
"os/signal"
"syscall"
)
// listenForCollectMetricsSignals subscribes collectMetricsPrompt to the
// signals used to prompt a metric collection: SIGHUP, SIGUSR1 and SIGUSR2.
func listenForCollectMetricsSignals(collectMetricsPrompt chan os.Signal) {
	promptSignals := []os.Signal{
		syscall.SIGHUP,
		syscall.SIGUSR1,
		syscall.SIGUSR2,
	}
	signal.Notify(collectMetricsPrompt, promptSignals...)
}

View File

@ -0,0 +1,23 @@
// +build !windows
package shim
import (
"context"
"os"
"os/signal"
"syscall"
)
// listenForCollectMetricsSignals relays SIGHUP, SIGUSR1 and SIGUSR2 to
// collectMetricsPrompt. Once ctx is done, a background goroutine unregisters
// the channel so that no further signals are delivered to it.
func listenForCollectMetricsSignals(ctx context.Context, collectMetricsPrompt chan os.Signal) {
	// just listen to all the signals.
	signal.Notify(collectMetricsPrompt, syscall.SIGHUP, syscall.SIGUSR1, syscall.SIGUSR2)

	go func() {
		// Block until the context is cancelled; a plain receive is the
		// idiomatic form of a select with a single case.
		<-ctx.Done()
		// Stop listening for signals to avoid pushing messages to a closed
		// channel.
		signal.Stop(collectMetricsPrompt)
	}()
}

View File

@ -3,11 +3,20 @@
package shim package shim
import ( import (
"context"
"os" "os"
"os/signal" "os/signal"
"syscall" "syscall"
) )
func listenForCollectMetricsSignals(collectMetricsPrompt chan os.Signal) { func listenForCollectMetricsSignals(ctx context.Context, collectMetricsPrompt chan os.Signal) {
signal.Notify(collectMetricsPrompt, syscall.SIGHUP) signal.Notify(collectMetricsPrompt, syscall.SIGHUP)
go func() {
select {
case <-ctx.Done():
// context done. stop to signals to avoid pushing messages to a closed channel
signal.Stop(collectMetricsPrompt)
}
}()
} }

View File

@ -3,11 +3,11 @@
package shim package shim
import ( import (
"bytes" "bufio"
"context" "context"
"io"
"os" "os"
"runtime" "runtime"
"strings"
"syscall" "syscall"
"testing" "testing"
"time" "time"
@ -20,15 +20,15 @@ func TestShimUSR1SignalingWorks(t *testing.T) {
t.Skip() t.Skip()
return return
} }
stdoutBytes := bytes.NewBufferString("") stdinReader, stdinWriter := io.Pipe()
stdout = stdoutBytes stdoutReader, stdoutWriter := io.Pipe()
stdin = stdinReader
stdout = stdoutWriter
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
defer cancel() defer cancel()
wait := runInputPlugin(t, 40*time.Second) metricProcessed, exited := runInputPlugin(t, 20*time.Minute)
// sleep a bit to avoid a race condition where the input hasn't loaded yet.
time.Sleep(10 * time.Millisecond)
// signal USR1 to yourself. // signal USR1 to yourself.
pid := os.Getpid() pid := os.Getpid()
@ -54,23 +54,17 @@ func TestShimUSR1SignalingWorks(t *testing.T) {
timeout := time.NewTimer(10 * time.Second) timeout := time.NewTimer(10 * time.Second)
select { select {
case <-wait: case <-metricProcessed:
case <-timeout.C: case <-timeout.C:
require.Fail(t, "Timeout waiting for metric to arrive") require.Fail(t, "Timeout waiting for metric to arrive")
} }
cancel()
for stdoutBytes.Len() == 0 { r := bufio.NewReader(stdoutReader)
select { out, err := r.ReadString('\n')
case <-timeout.C: require.NoError(t, err)
require.Fail(t, "Timeout waiting to read metric from stdout") require.Equal(t, "measurement,tag=tag field=1i 1234000005678\n", out)
return
default:
time.Sleep(10 * time.Millisecond)
}
}
out := string(stdoutBytes.Bytes()) stdinWriter.Close()
require.Contains(t, out, "\n") <-exited
metricLine := strings.Split(out, "\n")[0]
require.Equal(t, "measurement,tag=tag field=1i 1234000005678", metricLine)
} }

View File

@ -1,7 +1,9 @@
package shim package shim
import ( import (
"bufio"
"bytes" "bytes"
"io"
"strings" "strings"
"testing" "testing"
"time" "time"
@ -15,11 +17,13 @@ func TestShimWorks(t *testing.T) {
stdoutBytes := bytes.NewBufferString("") stdoutBytes := bytes.NewBufferString("")
stdout = stdoutBytes stdout = stdoutBytes
stdin, _ = io.Pipe() // hold the stdin pipe open
timeout := time.NewTimer(10 * time.Second) timeout := time.NewTimer(10 * time.Second)
wait := runInputPlugin(t, 10*time.Millisecond) metricProcessed, _ := runInputPlugin(t, 10*time.Millisecond)
select { select {
case <-wait: case <-metricProcessed:
case <-timeout.C: case <-timeout.C:
require.Fail(t, "Timeout waiting for metric to arrive") require.Fail(t, "Timeout waiting for metric to arrive")
} }
@ -40,55 +44,52 @@ func TestShimWorks(t *testing.T) {
} }
func TestShimStdinSignalingWorks(t *testing.T) { func TestShimStdinSignalingWorks(t *testing.T) {
stdoutBytes := bytes.NewBufferString("") stdinReader, stdinWriter := io.Pipe()
stdout = stdoutBytes stdoutReader, stdoutWriter := io.Pipe()
stdinBytes := bytes.NewBufferString("")
stdin = stdinBytes stdin = stdinReader
stdout = stdoutWriter
timeout := time.NewTimer(10 * time.Second) timeout := time.NewTimer(10 * time.Second)
wait := runInputPlugin(t, 40*time.Second) metricProcessed, exited := runInputPlugin(t, 40*time.Second)
stdinBytes.WriteString("\n") stdinWriter.Write([]byte("\n"))
select { select {
case <-wait: case <-metricProcessed:
case <-timeout.C: case <-timeout.C:
require.Fail(t, "Timeout waiting for metric to arrive") require.Fail(t, "Timeout waiting for metric to arrive")
} }
for stdoutBytes.Len() == 0 { r := bufio.NewReader(stdoutReader)
select { out, err := r.ReadString('\n')
case <-timeout.C: require.NoError(t, err)
require.Fail(t, "Timeout waiting to read metric from stdout") require.Equal(t, "measurement,tag=tag field=1i 1234000005678\n", out)
return
default:
time.Sleep(10 * time.Millisecond)
}
}
out := string(stdoutBytes.Bytes()) stdinWriter.Close()
require.Contains(t, out, "\n") // check that it exits cleanly
metricLine := strings.Split(out, "\n")[0] <-exited
require.Equal(t, "measurement,tag=tag field=1i 1234000005678", metricLine)
} }
func runInputPlugin(t *testing.T, timeout time.Duration) chan bool { func runInputPlugin(t *testing.T, interval time.Duration) (metricProcessed chan bool, exited chan bool) {
wait := make(chan bool) metricProcessed = make(chan bool)
exited = make(chan bool)
inp := &testInput{ inp := &testInput{
wait: wait, metricProcessed: metricProcessed,
} }
shim := New() shim := New()
shim.AddInput(inp) shim.AddInput(inp)
go func() { go func() {
err := shim.Run(timeout) // we aren't using the timer here err := shim.Run(interval)
require.NoError(t, err) require.NoError(t, err)
exited <- true
}() }()
return wait return metricProcessed, exited
} }
type testInput struct { type testInput struct {
wait chan bool metricProcessed chan bool
} }
func (i *testInput) SampleConfig() string { func (i *testInput) SampleConfig() string {
@ -107,7 +108,7 @@ func (i *testInput) Gather(acc telegraf.Accumulator) error {
map[string]string{ map[string]string{
"tag": "tag", "tag": "tag",
}, time.Unix(1234, 5678)) }, time.Unix(1234, 5678))
i.wait <- true i.metricProcessed <- true
return nil return nil
} }