add support for SIGUSR1 to trigger flush (#7366)

This commit is contained in:
Steven Soroka 2020-04-20 13:49:10 -04:00 committed by GitHub
parent 6c72c645a2
commit 819481b195
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 218 additions and 131 deletions

View File

@ -4,6 +4,7 @@ import (
"context"
"fmt"
"log"
"os"
"runtime"
"sync"
"time"
@ -516,16 +517,7 @@ func (a *Agent) runOutputs(
wg.Add(1)
go func(output *models.RunningOutput) {
defer wg.Done()
if a.Config.Agent.RoundInterval {
err := internal.SleepContext(
ctx, internal.AlignDuration(startTime, interval))
if err != nil {
return
}
}
a.flush(ctx, output, interval, jitter)
a.flushLoop(ctx, startTime, output, interval, jitter)
}(output)
}
@ -546,25 +538,39 @@ func (a *Agent) runOutputs(
return nil
}
// flush runs an output's flush function periodically until the context is
// flushLoop runs an output's flush function periodically until the context is
// done.
func (a *Agent) flush(
func (a *Agent) flushLoop(
ctx context.Context,
startTime time.Time,
output *models.RunningOutput,
interval time.Duration,
jitter time.Duration,
) {
// since we are watching two channels we need a ticker with the jitter
// integrated.
ticker := NewTicker(interval, jitter)
defer ticker.Stop()
logError := func(err error) {
if err != nil {
log.Printf("E! [agent] Error writing to %s: %v", output.LogName(), err)
}
}
// watch for flush requests
flushRequested := make(chan os.Signal, 1)
watchForFlushSignal(flushRequested)
// align to round interval
if a.Config.Agent.RoundInterval {
err := internal.SleepContext(
ctx, internal.AlignDuration(startTime, interval))
if err != nil {
return
}
}
// since we are watching two channels we need a ticker with the jitter
// integrated.
ticker := NewTicker(interval, jitter)
defer ticker.Stop()
for {
// Favor shutdown over other methods.
select {
@ -575,8 +581,13 @@ func (a *Agent) flush(
}
select {
case <-ctx.Done():
logError(a.flushOnce(output, interval, output.Write))
return
case <-ticker.C:
logError(a.flushOnce(output, interval, output.Write))
case <-flushRequested:
logError(a.flushOnce(output, interval, output.Write))
case <-output.BatchReady:
// Favor the ticker over batch ready
select {
@ -585,9 +596,6 @@ func (a *Agent) flush(
default:
logError(a.flushOnce(output, interval, output.WriteBatch))
}
case <-ctx.Done():
logError(a.flushOnce(output, interval, output.Write))
return
}
}
}

16
agent/agent_notwindows.go Normal file
View File

@ -0,0 +1,16 @@
// +build !windows
package agent
import (
"os"
"os/signal"
"syscall"
)
const flushSignal = syscall.SIGUSR1
func watchForFlushSignal(flushRequested chan os.Signal) {
signal.Notify(flushRequested, flushSignal)
defer signal.Stop(flushRequested)
}

9
agent/agent_windows.go Normal file
View File

@ -0,0 +1,9 @@
// +build windows
package agent
import "os"
func watchForFlushSignal(flushRequested chan os.Signal) {
// not implemented
}

View File

@ -10,7 +10,6 @@ import (
_ "net/http/pprof" // Comment this line to disable pprof endpoint.
"os"
"os/signal"
"runtime"
"sort"
"strings"
"syscall"
@ -27,16 +26,16 @@ import (
"github.com/influxdata/telegraf/plugins/outputs"
_ "github.com/influxdata/telegraf/plugins/outputs/all"
_ "github.com/influxdata/telegraf/plugins/processors/all"
"github.com/kardianos/service"
)
// If you update these, update usage.go and usage_windows.go
var fDebug = flag.Bool("debug", false,
"turn on debug logging")
var pprofAddr = flag.String("pprof-addr", "",
"pprof address to listen on, not activate pprof if empty")
var fQuiet = flag.Bool("quiet", false,
"run in quiet mode")
var fTest = flag.Bool("test", false, "enable test mode: gather metrics, print them out, and exit")
var fTest = flag.Bool("test", false, "enable test mode: gather metrics, print them out, and exit. Note: Test mode only runs inputs, not processors, aggregators, or outputs")
var fTestWait = flag.Int("test-wait", 0, "wait up to this many seconds for service inputs to complete in test mode")
var fConfig = flag.String("config", "", "configuration file to load")
var fConfigDirectory = flag.String("config-directory", "",
@ -78,7 +77,6 @@ var (
var stop chan struct{}
func reloadLoop(
stop chan struct{},
inputFilters []string,
outputFilters []string,
aggregatorFilters []string,
@ -91,7 +89,7 @@ func reloadLoop(
ctx, cancel := context.WithCancel(context.Background())
signals := make(chan os.Signal)
signals := make(chan os.Signal, 1)
signal.Notify(signals, os.Interrupt, syscall.SIGHUP,
syscall.SIGTERM, syscall.SIGINT)
go func() {
@ -208,32 +206,6 @@ func usageExit(rc int) {
os.Exit(rc)
}
type program struct {
inputFilters []string
outputFilters []string
aggregatorFilters []string
processorFilters []string
}
func (p *program) Start(s service.Service) error {
go p.run()
return nil
}
func (p *program) run() {
stop = make(chan struct{})
reloadLoop(
stop,
p.inputFilters,
p.outputFilters,
p.aggregatorFilters,
p.processorFilters,
)
}
func (p *program) Stop(s service.Service) error {
close(stop)
return nil
}
func formatFullVersion() string {
var parts = []string{"Telegraf"}
@ -380,80 +352,10 @@ func main() {
log.Println("Telegraf version already configured to: " + internal.Version())
}
if runtime.GOOS == "windows" && windowsRunAsService() {
programFiles := os.Getenv("ProgramFiles")
if programFiles == "" { // Should never happen
programFiles = "C:\\Program Files"
}
svcConfig := &service.Config{
Name: *fServiceName,
DisplayName: *fServiceDisplayName,
Description: "Collects data using a series of plugins and publishes it to" +
"another series of plugins.",
Arguments: []string{"--config", programFiles + "\\Telegraf\\telegraf.conf"},
}
prg := &program{
inputFilters: inputFilters,
outputFilters: outputFilters,
aggregatorFilters: aggregatorFilters,
processorFilters: processorFilters,
}
s, err := service.New(prg, svcConfig)
if err != nil {
log.Fatal("E! " + err.Error())
}
// Handle the --service flag here to prevent any issues with tooling that
// may not have an interactive session, e.g. installing from Ansible.
if *fService != "" {
if *fConfig != "" {
svcConfig.Arguments = []string{"--config", *fConfig}
}
if *fConfigDirectory != "" {
svcConfig.Arguments = append(svcConfig.Arguments, "--config-directory", *fConfigDirectory)
}
//set servicename to service cmd line, to have a custom name after relaunch as a service
svcConfig.Arguments = append(svcConfig.Arguments, "--service-name", *fServiceName)
err := service.Control(s, *fService)
if err != nil {
log.Fatal("E! " + err.Error())
}
os.Exit(0)
} else {
winlogger, err := s.Logger(nil)
if err == nil {
//When in service mode, register eventlog target andd setup default logging to eventlog
logger.RegisterEventLogger(winlogger)
logger.SetupLogging(logger.LogConfig{LogTarget: logger.LogTargetEventlog})
}
err = s.Run()
if err != nil {
log.Println("E! " + err.Error())
}
}
} else {
stop = make(chan struct{})
reloadLoop(
stop,
inputFilters,
outputFilters,
aggregatorFilters,
processorFilters,
)
}
}
// Return true if Telegraf should create a Windows service.
func windowsRunAsService() bool {
if *fService != "" {
return true
}
if *fRunAsConsole {
return false
}
return !service.Interactive()
run(
inputFilters,
outputFilters,
aggregatorFilters,
processorFilters,
)
}

View File

@ -0,0 +1,13 @@
// +build !windows
package main
func run(inputFilters, outputFilters, aggregatorFilters, processorFilters []string) {
stop = make(chan struct{})
reloadLoop(
inputFilters,
outputFilters,
aggregatorFilters,
processorFilters,
)
}

View File

@ -0,0 +1,124 @@
// +build windows
package main
import (
"log"
"os"
"runtime"
"github.com/influxdata/telegraf/logger"
"github.com/kardianos/service"
)
func run(inputFilters, outputFilters, aggregatorFilters, processorFilters []string) {
if runtime.GOOS == "windows" && windowsRunAsService() {
runAsWindowsService(
inputFilters,
outputFilters,
aggregatorFilters,
processorFilters,
)
} else {
stop = make(chan struct{})
reloadLoop(
inputFilters,
outputFilters,
aggregatorFilters,
processorFilters,
)
}
}
type program struct {
inputFilters []string
outputFilters []string
aggregatorFilters []string
processorFilters []string
}
func (p *program) Start(s service.Service) error {
go p.run()
return nil
}
func (p *program) run() {
stop = make(chan struct{})
reloadLoop(
p.inputFilters,
p.outputFilters,
p.aggregatorFilters,
p.processorFilters,
)
}
func (p *program) Stop(s service.Service) error {
close(stop)
return nil
}
func runAsWindowsService(inputFilters, outputFilters, aggregatorFilters, processorFilters []string) {
programFiles := os.Getenv("ProgramFiles")
if programFiles == "" { // Should never happen
programFiles = "C:\\Program Files"
}
svcConfig := &service.Config{
Name: *fServiceName,
DisplayName: *fServiceDisplayName,
Description: "Collects data using a series of plugins and publishes it to" +
"another series of plugins.",
Arguments: []string{"--config", programFiles + "\\Telegraf\\telegraf.conf"},
}
prg := &program{
inputFilters: inputFilters,
outputFilters: outputFilters,
aggregatorFilters: aggregatorFilters,
processorFilters: processorFilters,
}
s, err := service.New(prg, svcConfig)
if err != nil {
log.Fatal("E! " + err.Error())
}
// Handle the --service flag here to prevent any issues with tooling that
// may not have an interactive session, e.g. installing from Ansible.
if *fService != "" {
if *fConfig != "" {
svcConfig.Arguments = []string{"--config", *fConfig}
}
if *fConfigDirectory != "" {
svcConfig.Arguments = append(svcConfig.Arguments, "--config-directory", *fConfigDirectory)
}
//set servicename to service cmd line, to have a custom name after relaunch as a service
svcConfig.Arguments = append(svcConfig.Arguments, "--service-name", *fServiceName)
err := service.Control(s, *fService)
if err != nil {
log.Fatal("E! " + err.Error())
}
os.Exit(0)
} else {
winlogger, err := s.Logger(nil)
if err == nil {
//When in service mode, register eventlog target andd setup default logging to eventlog
logger.RegisterEventLogger(winlogger)
logger.SetupLogging(logger.LogConfig{LogTarget: logger.LogTargetEventlog})
}
err = s.Run()
if err != nil {
log.Println("E! " + err.Error())
}
}
}
// Return true if Telegraf should create a Windows service.
func windowsRunAsService() bool {
if *fService != "" {
return true
}
if *fRunAsConsole {
return false
}
return !service.Interactive()
}

View File

@ -94,6 +94,19 @@ You should also add the following to your `SampleConfig()`:
data_format = "influx"
```
## Flushing Metrics to Outputs
Metrics are flushed to outputs when any of the following events happen:
- `flush_interval + rand(flush_jitter)` has elapsed since start or the last flush interval
- At least `metric_batch_size` count of metrics are waiting in the buffer
- The telegraf process has received a SIGUSR1 signal
Note that if the flush takes longer than the `agent.interval` to write the metrics
to the output, you'll see a message saying the output `did not complete within its
flush interval`. This may mean your output is not keeping up with the flow of metrics,
and you may want to look into enabling compression, reducing the size of your metrics,
or investigate other reasons why the writes might be taking longer than expected.
[file]: https://github.com/influxdata/telegraf/tree/master/plugins/inputs/file
[output data formats]: https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md
[SampleConfig]: https://github.com/influxdata/telegraf/wiki/SampleConfig

View File

@ -32,8 +32,9 @@ The commands & flags are:
Valid values are 'agent', 'global_tags', 'outputs',
'processors', 'aggregators' and 'inputs'
--sample-config print out full sample configuration
--test gather metrics, print them out, and exit;
processors, aggregators, and outputs are not run
--test enable test mode: gather metrics, print them out,
and exit. Note: Test mode only runs inputs, not
processors, aggregators, or outputs
--test-wait wait up to this many seconds for service
inputs to complete in test mode
--usage <plugin> print usage for a plugin, ie, 'telegraf --usage mysql'

View File

@ -29,8 +29,9 @@ The commands & flags are:
--section-filter filter config sections to output, separator is :
Valid values are 'agent', 'global_tags', 'outputs',
'processors', 'aggregators' and 'inputs'
--test gather metrics, print them out, and exit;
processors, aggregators, and outputs are not run
--test enable test mode: gather metrics, print them out,
and exit. Note: Test mode only runs inputs, not
processors, aggregators, or outputs
--test-wait wait up to this many seconds for service
inputs to complete in test mode
--usage <plugin> print usage for a plugin, ie, 'telegraf --usage mysql'