telegraf/plugins/inputs/request_aggregates/request_aggregates.go

357 lines
11 KiB
Go

package request_aggregates
import (
"fmt"
"github.com/hpcloud/tail"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs"
"log"
"regexp"
"sync"
"time"
)
// RequestAggregates is an input plugin that tails a file of request records,
// groups the parsed requests into fixed-size time windows and flushes the
// aggregated metrics of each window to the accumulator.
//
// Exported fields are the user-facing configuration; unexported fields hold
// runtime state that is populated by validateConfig and Start.
type RequestAggregates struct {
// File is the path of the file that is tailed.
File string
// TimestampPosition is the position of the request timestamp in every line.
TimestampPosition int
// TimestampFormat is either one of "s", "ms", "us", "ns" (epoch time) or a
// layout that is handed to the request parser.
TimestampFormat string
// TimePosition is the position of the time value in every line.
TimePosition int
// TimePercentiles lists the percentiles to calculate; validateConfig
// rejects values that are <= 0 or >= 100.
TimePercentiles []float32
// TimeWindowSize is the duration of each time window; must be positive.
TimeWindowSize internal.Duration
// TimeWindows is the number of time windows kept in memory before the
// oldest one is flushed; must be at least 1.
TimeWindows int
// ResultPosition is the position of the result column in every line.
ResultPosition int
// ResultSuccessRegex is an optional regular expression identifying a
// successful result. When empty, no success regexp is compiled.
ResultSuccessRegex string
// ThroughputWindowSize is the duration of each throughput window; must be positive.
ThroughputWindowSize internal.Duration
// ThroughputWindows is the number of throughput windows kept in memory
// before the oldest one is flushed; must be at least 1.
ThroughputWindows int
// isTimestampEpoch is set by validateConfig when TimestampFormat is one of
// the epoch units.
isTimestampEpoch bool
// successRegexp is the compiled form of ResultSuccessRegex (nil when empty).
successRegexp *regexp.Regexp
// tailer follows File and delivers its lines.
tailer *tail.Tail
// timeWindowSlice holds the live time windows, oldest first.
timeWindowSlice []Window
// throughputWindowSlice holds the live throughput windows, oldest first.
throughputWindowSlice []Window
// timeTimer fires when a new time window has to be created.
timeTimer *time.Timer
// throughputTimer fires when a new throughput window has to be created.
throughputTimer *time.Timer
// stopTimeChan signals manageTimeWindows to flush everything and exit.
stopTimeChan chan bool
// stopThroughputChan signals manageThroughputWindows to flush everything and exit.
stopThroughputChan chan bool
// timeMutex is held while timeWindowSlice is modified.
timeMutex sync.Mutex
// throughputMutex is held while throughputWindowSlice is modified.
throughputMutex sync.Mutex
// wg tracks the goroutines launched by Start so that Stop can wait for them.
wg sync.WaitGroup
// The embedded mutex serializes Start and Stop.
sync.Mutex
}
// NewRequestAggregates returns a plugin instance pre-populated with the
// default number of windows: 2 time windows and 10 throughput windows.
// Every other field is left at its zero value.
func NewRequestAggregates() *RequestAggregates {
	plugin := new(RequestAggregates)
	plugin.TimeWindows = 2
	plugin.ThroughputWindows = 10
	return plugin
}
// sampleConfig is the example configuration returned by SampleConfig.
// The tags table is keyed by the name the plugin is registered under
// ("request_aggregates"), so the snippet is valid when copied verbatim.
const sampleConfig = `
# File to monitor.
file = "/var/server/access.csv"
# Position of the timestamp of the request in every line
timestamp_position = 0
# Format of the timestamp (any layout accepted by Go Time.Parse or s/ms/us/ns for epoch time)
timestamp_format = "ms"
# Position of the time value to calculate in the log file (starting from 0)
time_position = 1
# Window to consider for time percentiles
time_window_size = "60s"
# Windows to keep in memory before flushing in order to avoid requests coming in after a window is shut.
# If the CSV file is sorted by timestamp, this can be set to 1
time_windows = 5
# List of percentiles to calculate
time_percentiles = [90.0, 95.0, 99.0, 99.99]
# Position of the result column (success or failure)
result_position = 3
# Regular expression used to determine if the result is successful or not (if empty only request_aggregates_all
# time series) will be generated
result_success_regex = ".*true.*"
# Time window to calculate throughput counters
throughput_window_size = "1s"
# Number of windows to keep in memory for throughput calculation
throughput_windows = 300
# List of tags and their values to add to every data point
[inputs.request_aggregates.tags]
name = "myserver"
`
// SampleConfig returns the example configuration snippet for this plugin.
func (ra *RequestAggregates) SampleConfig() string {
return sampleConfig
}
// Description returns a one-line summary of what this plugin does.
func (ra *RequestAggregates) Description() string {
return "Generates a set of aggregate values for a requests and their response times."
}
// Gather does nothing and always returns nil.
// NOTE(review): metrics are added to the accumulator by the window-managing
// goroutines launched in Start (via flushWindow), not by this method.
func (ra *RequestAggregates) Gather(acc telegraf.Accumulator) error {
return nil
}
// Start validates the configuration, opens a tailer on the configured file
// and launches three goroutines: one managing time windows, one managing
// throughput windows and one feeding parsed requests into both.
//
// It returns the validation error, or an error describing why the file could
// not be tailed; in both cases no goroutine has been started.
func (ra *RequestAggregates) Start(acc telegraf.Accumulator) error {
	ra.Lock()
	defer ra.Unlock()

	var err error
	if err = ra.validateConfig(); err != nil {
		return err
	}

	// Tail the file starting at its end (Whence 2 seeks relative to the end)
	// with both Follow and ReOpen enabled.
	tailConfig := tail.Config{
		Follow:   true,
		ReOpen:   true,
		Location: &tail.SeekInfo{Whence: 2, Offset: 0},
	}
	ra.tailer, err = tail.TailFile(ra.File, tailConfig)
	if err != nil {
		return fmt.Errorf("ERROR tailing file %s, Error: %s", ra.File, err)
	}

	// Both kinds of windows begin at the same instant.
	begin := time.Now()

	// Seed the first time window, arm its timer and hand over to the manager.
	firstTimeWindow := &TimeWindow{
		StartTime:   begin,
		EndTime:     begin.Add(ra.TimeWindowSize.Duration),
		OnlyTotal:   ra.successRegexp == nil,
		Percentiles: ra.TimePercentiles,
	}
	ra.timeWindowSlice = append(ra.timeWindowSlice, firstTimeWindow)
	ra.timeTimer = time.NewTimer(ra.TimeWindowSize.Duration)
	ra.stopTimeChan = make(chan bool, 1)
	ra.wg.Add(1)
	go ra.manageTimeWindows(acc)

	// Same again for the throughput windows.
	firstThroughputWindow := &ThroughputWindow{
		StartTime: begin,
		EndTime:   begin.Add(ra.ThroughputWindowSize.Duration),
	}
	ra.throughputWindowSlice = append(ra.throughputWindowSlice, firstThroughputWindow)
	ra.throughputTimer = time.NewTimer(ra.ThroughputWindowSize.Duration)
	ra.stopThroughputChan = make(chan bool, 1)
	ra.wg.Add(1)
	go ra.manageThroughputWindows(acc)

	// Finally start consuming lines from the tailer.
	ra.wg.Add(1)
	go ra.gatherFromFile(ra.tailer, acc)

	return nil
}
// Stop shuts the plugin down: it stops the tailer, stops both window timers,
// signals both window managers to flush their remaining windows and waits for
// every goroutine launched by Start to finish.
//
// Stop is a no-op when Start never got as far as creating the tailer (it was
// not called, or it returned an error), since in that case no timer, channel
// or goroutine exists.
func (ra *RequestAggregates) Stop() {
	ra.Lock()
	defer ra.Unlock()

	// Without this guard the calls below would dereference nil pointers and
	// the sends would block forever on nil channels.
	// NOTE(review): relies on tail.TailFile returning a nil tailer on error.
	if ra.tailer == nil {
		return
	}

	if err := ra.tailer.Stop(); err != nil {
		// Include the cause; a bare "could not stop" is not actionable.
		log.Printf("ERROR: could not stop tail on file %s, Error: %s\n", ra.File, err)
	}
	ra.tailer.Cleanup()

	// The stop channels are buffered (size 1), so these sends do not wait for
	// the managers to be ready to receive.
	ra.timeTimer.Stop()
	ra.stopTimeChan <- true
	ra.throughputTimer.Stop()
	ra.stopThroughputChan <- true

	ra.wg.Wait()
}
// validateConfig checks the user-supplied settings and derives the runtime
// fields that depend on them (successRegexp and isTimestampEpoch).
// It returns an error describing the first invalid setting found, or nil.
func (ra *RequestAggregates) validateConfig() error {
	// The success regex is optional; compile it only when one was given.
	if ra.ResultSuccessRegex != "" {
		var err error
		ra.successRegexp, err = regexp.Compile(ra.ResultSuccessRegex)
		if err != nil {
			return fmt.Errorf("ERROR: success regexp is not valid, Error: %s", err)
		}
	}

	// The timestamp format is either an epoch unit or a time layout. A layout
	// that formats to itself contains no layout elements at all, so it is
	// rejected.
	format := ra.TimestampFormat
	if format == "s" || format == "ms" || format == "us" || format == "ns" {
		ra.isTimestampEpoch = true
	} else if time.Now().Format(format) == format {
		return fmt.Errorf("ERROR: incorrect timestamp format")
	}

	// Every percentile must lie strictly between 0 and 100.
	for i := range ra.TimePercentiles {
		p := ra.TimePercentiles[i]
		if p <= 0 || p >= 100 {
			return fmt.Errorf("ERROR: percentiles must be numbers between 0 and 100 (not inclusive)")
		}
	}

	// Both window sizes must be positive durations.
	timeSize := ra.TimeWindowSize.Duration
	throughputSize := ra.ThroughputWindowSize.Duration
	if timeSize <= 0 || throughputSize <= 0 {
		return fmt.Errorf("ERROR: windows need to be a positive duration")
	}

	// At least one window of each kind has to be kept.
	if ra.TimeWindows <= 0 || ra.ThroughputWindows <= 0 {
		return fmt.Errorf("ERROR: at least one window is required")
	}

	return nil
}
// gatherFromFile runs as a goroutine. It reads lines from the tailer until its
// Lines channel is closed, parses each line into a request and adds the
// request to the matching time window and throughput window.
//
// Lines that carry a tail error or cannot be parsed are logged and skipped.
// A request that no window accepts is logged and dropped.
func (ra *RequestAggregates) gatherFromFile(tailer *tail.Tail, acc telegraf.Accumulator) {
	defer ra.wg.Done()

	requestParser := &RequestParser{
		TimestampPosition: ra.TimestampPosition,
		TimestampFormat:   ra.TimestampFormat,
		IsTimeEpoch:       ra.isTimestampEpoch,
		TimePosition:      ra.TimePosition,
		ResultPosition:    ra.ResultPosition,
		SuccessRegexp:     ra.successRegexp,
	}

	// waitForWindow blocks until the newest window ends strictly after the
	// request's timestamp (a line can be read before the timer has created
	// the window it belongs to). Windows are half-open, [Start, End), so a
	// request stamped exactly at the newest End still has to wait.
	//
	// The slice is inspected under its mutex because the manager goroutine
	// replaces it concurrently. An empty slice means the windows have already
	// been flushed on shutdown; waiting is pointless then, and addToWindow
	// reports the error.
	waitForWindow := func(mu *sync.Mutex, windows *[]Window, request *Request) {
		for {
			mu.Lock()
			n := len(*windows)
			ready := n == 0 || (*windows)[n-1].End().After(request.Timestamp)
			mu.Unlock()
			if ready {
				return
			}
			time.Sleep(time.Millisecond * 10)
		}
	}

	for line := range tailer.Lines {
		// Report the error carried by the line itself.
		if line.Err != nil {
			log.Printf("ERROR: could not tail file %s, Error: %s\n", tailer.Filename, line.Err)
			continue
		}

		request, err := requestParser.ParseLine(line.Text)
		if err != nil {
			log.Printf("ERROR: malformed line in %s: [%s], Error: %s\n", tailer.Filename, line.Text, err)
			continue
		}

		// Add request to time window
		waitForWindow(&ra.timeMutex, &ra.timeWindowSlice, request)
		ra.timeMutex.Lock()
		err = addToWindow(ra.timeWindowSlice, request)
		ra.timeMutex.Unlock()
		if err != nil {
			log.Printf("ERROR: could not find a time window, Request: %v, Error %s\n", request, err)
		}

		// Add request to throughput window
		waitForWindow(&ra.throughputMutex, &ra.throughputWindowSlice, request)
		ra.throughputMutex.Lock()
		err = addToWindow(ra.throughputWindowSlice, request)
		ra.throughputMutex.Unlock()
		if err != nil {
			log.Printf("ERROR: could not find a throughput window, Request: %v, Error %s\n", request, err)
		}
	}
}
// manageTimeWindows runs as a goroutine and owns the lifecycle of the time
// windows. Each time the timer fires it appends a new window that starts where
// the newest one ends and, if more than TimeWindows are then held, flushes the
// oldest. On the stop signal it flushes every remaining window and returns.
func (ra *RequestAggregates) manageTimeWindows(acc telegraf.Accumulator) {
	defer ra.wg.Done()

	// Without a success regexp only the totals can be computed.
	totalsOnly := ra.successRegexp == nil

	for {
		select {
		case <-ra.stopTimeChan:
			// Shutting down: emit whatever is still pending.
			ra.timeMutex.Lock()
			ra.timeWindowSlice = flushAllWindows(ra.timeWindowSlice, acc)
			ra.timeMutex.Unlock()
			return

		case <-ra.timeTimer.C:
			ra.timeMutex.Lock()

			// Chain the new window onto the end of the newest one so that
			// consecutive windows never overlap or leave gaps.
			newest := ra.timeWindowSlice[len(ra.timeWindowSlice)-1]
			from := newest.End()
			until := from.Add(ra.TimeWindowSize.Duration)
			next := &TimeWindow{
				StartTime:   from,
				EndTime:     until,
				OnlyTotal:   totalsOnly,
				Percentiles: ra.TimePercentiles,
			}
			ra.timeWindowSlice = append(ra.timeWindowSlice, next)

			// Keep at most TimeWindows windows in memory.
			if len(ra.timeWindowSlice) > ra.TimeWindows {
				ra.timeWindowSlice = flushWindow(ra.timeWindowSlice, acc)
			}

			ra.timeMutex.Unlock()

			// Fire again when the window just created is due to end.
			ra.timeTimer.Reset(until.Sub(time.Now()))
		}
	}
}
// manageThroughputWindows runs as a goroutine and owns the lifecycle of the
// throughput windows. Each time the timer fires it appends a new window that
// starts where the newest one ends and, if more than ThroughputWindows are
// then held, flushes the oldest. On the stop signal it flushes every remaining
// window and returns.
func (ra *RequestAggregates) manageThroughputWindows(acc telegraf.Accumulator) {
	defer ra.wg.Done()

	for {
		select {
		case <-ra.stopThroughputChan:
			// Shutting down: emit whatever is still pending.
			ra.throughputMutex.Lock()
			ra.throughputWindowSlice = flushAllWindows(ra.throughputWindowSlice, acc)
			ra.throughputMutex.Unlock()
			return

		case <-ra.throughputTimer.C:
			ra.throughputMutex.Lock()

			// Chain the new window onto the end of the newest one.
			newest := ra.throughputWindowSlice[len(ra.throughputWindowSlice)-1]
			from := newest.End()
			until := from.Add(ra.ThroughputWindowSize.Duration)
			next := &ThroughputWindow{StartTime: from, EndTime: until}
			ra.throughputWindowSlice = append(ra.throughputWindowSlice, next)

			// Keep at most ThroughputWindows windows in memory.
			if len(ra.throughputWindowSlice) > ra.ThroughputWindows {
				ra.throughputWindowSlice = flushWindow(ra.throughputWindowSlice, acc)
			}

			ra.throughputMutex.Unlock()

			// Fire again when the window just created is due to end.
			ra.throughputTimer.Reset(until.Sub(time.Now()))
		}
	}
}
// flushWindow pops the window at the front of the slice, aggregates it and
// adds every resulting metric to the accumulator. It returns the slice without
// the flushed window; an empty slice is returned unchanged.
//
// An aggregation error is logged; whatever metrics were returned alongside it
// are still emitted.
func flushWindow(windows []Window, acc telegraf.Accumulator) []Window {
	if len(windows) == 0 {
		return windows
	}

	oldest := windows[0]
	remaining := windows[1:]

	metrics, err := oldest.Aggregate()
	if err != nil {
		log.Printf("ERROR: could not flush window, Error: %s\n", err)
	}
	for _, m := range metrics {
		acc.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time())
	}

	return remaining
}
// flushAllWindows flushes every window to the accumulator, oldest first, and
// returns the resulting empty slice.
func flushAllWindows(windows []Window, acc telegraf.Accumulator) []Window {
	remaining := windows
	for {
		if len(remaining) == 0 {
			return remaining
		}
		remaining = flushWindow(remaining, acc)
	}
}
// addToWindow adds the request to the window whose half-open interval
// [Start, End) contains the request's timestamp. The slice is expected to be
// ordered oldest first.
//
// It returns an error when the slice is empty, when the request is at or past
// the end of the newest window, when it is before the start of the oldest
// window, or when no window contains it; otherwise it returns whatever the
// window's Add returns.
func addToWindow(windows []Window, request *Request) error {
	if len(windows) == 0 {
		return fmt.Errorf("ERROR: no windows found")
	}

	// Window ends are exclusive, so a timestamp equal to the newest End is
	// already beyond every window and must be reported as such.
	newest := windows[len(windows)-1]
	if !newest.End().After(request.Timestamp) {
		return fmt.Errorf("ERROR: request is newer than any window")
	}

	oldest := windows[0]
	if oldest.Start().After(request.Timestamp) {
		return fmt.Errorf("ERROR: request is older than any window, try adding more windows")
	}

	for _, window := range windows {
		// Start <= timestamp < End
		if !window.Start().After(request.Timestamp) && window.End().After(request.Timestamp) {
			return window.Add(request)
		}
	}

	return fmt.Errorf("ERROR: no window could be found")
}
func init() {
inputs.Add("request_aggregates", func() telegraf.Input {
return NewRequestAggregates()
})
}