357 lines
11 KiB
Go
357 lines
11 KiB
Go
|
package request_aggregates
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"github.com/hpcloud/tail"
|
||
|
"github.com/influxdata/telegraf"
|
||
|
"github.com/influxdata/telegraf/internal"
|
||
|
"github.com/influxdata/telegraf/plugins/inputs"
|
||
|
"log"
|
||
|
"regexp"
|
||
|
"sync"
|
||
|
"time"
|
||
|
)
|
||
|
|
||
|
type RequestAggregates struct {
|
||
|
File string
|
||
|
TimestampPosition int
|
||
|
TimestampFormat string
|
||
|
TimePosition int
|
||
|
TimePercentiles []float32
|
||
|
TimeWindowSize internal.Duration
|
||
|
TimeWindows int
|
||
|
ResultPosition int
|
||
|
ResultSuccessRegex string
|
||
|
ThroughputWindowSize internal.Duration
|
||
|
ThroughputWindows int
|
||
|
|
||
|
isTimestampEpoch bool
|
||
|
successRegexp *regexp.Regexp
|
||
|
tailer *tail.Tail
|
||
|
timeWindowSlice []Window
|
||
|
throughputWindowSlice []Window
|
||
|
timeTimer *time.Timer
|
||
|
throughputTimer *time.Timer
|
||
|
stopTimeChan chan bool
|
||
|
stopThroughputChan chan bool
|
||
|
timeMutex sync.Mutex
|
||
|
throughputMutex sync.Mutex
|
||
|
wg sync.WaitGroup
|
||
|
sync.Mutex
|
||
|
}
|
||
|
|
||
|
func NewRequestAggregates() *RequestAggregates {
|
||
|
return &RequestAggregates{
|
||
|
TimeWindows: 2,
|
||
|
ThroughputWindows: 10}
|
||
|
}
|
||
|
|
||
|
// sampleConfig is the example configuration returned by SampleConfig. The tags table is named after the
// plugin as it is registered ("request_aggregates") so that the sample can be pasted into a config file.
const sampleConfig = `
  # File to monitor.
  file = "/var/server/access.csv"
  # Position of the timestamp of the request in every line
  timestamp_position = 0
  # Format of the timestamp (any layout accepted by Go Time.Parse or s/ms/us/ns for epoch time)
  timestamp_format = "ms"
  # Position of the time value to calculate in the log file (starting from 0)
  time_position = 1
  # Window to consider for time percentiles
  time_window_size = "60s"
  # Windows to keep in memory before flushing in order to avoid requests coming in after a window is shut.
  # If the CSV file is sorted by timestamp, this can be set to 1
  time_windows = 5
  # List of percentiles to calculate
  time_percentiles = [90.0, 95.0, 99.0, 99.99]
  # Position of the result column (success or failure)
  result_position = 3
  # Regular expression used to determine if the result is successful or not (if empty only request_aggregates_all
  # time series) will be generated
  result_success_regex = ".*true.*"
  # Time window to calculate throughput counters
  throughput_window_size = "1s"
  # Number of windows to keep in memory for throughput calculation
  throughput_windows = 300
  # List of tags and their values to add to every data point
  [inputs.request_aggregates.tags]
    name = "myserver"
`
|
||
|
|
||
|
func (ra *RequestAggregates) SampleConfig() string {
|
||
|
return sampleConfig
|
||
|
}
|
||
|
|
||
|
func (ra *RequestAggregates) Description() string {
|
||
|
return "Generates a set of aggregate values for a requests and their response times."
|
||
|
}
|
||
|
|
||
|
func (ra *RequestAggregates) Gather(acc telegraf.Accumulator) error {
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (ra *RequestAggregates) Start(acc telegraf.Accumulator) error {
|
||
|
ra.Lock()
|
||
|
defer ra.Unlock()
|
||
|
|
||
|
err := ra.validateConfig()
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
// Create tailer
|
||
|
ra.tailer, err = tail.TailFile(ra.File, tail.Config{
|
||
|
Follow: true,
|
||
|
ReOpen: true,
|
||
|
Location: &tail.SeekInfo{Whence: 2, Offset: 0}})
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("ERROR tailing file %s, Error: %s", ra.File, err)
|
||
|
}
|
||
|
|
||
|
// Create first time window and start go routine to manage them
|
||
|
now := time.Now()
|
||
|
ra.timeWindowSlice = append(ra.timeWindowSlice, &TimeWindow{
|
||
|
StartTime: now, EndTime: now.Add(ra.TimeWindowSize.Duration),
|
||
|
OnlyTotal: ra.successRegexp == nil, Percentiles: ra.TimePercentiles})
|
||
|
ra.timeTimer = time.NewTimer(ra.TimeWindowSize.Duration)
|
||
|
ra.stopTimeChan = make(chan bool, 1)
|
||
|
ra.wg.Add(1)
|
||
|
go ra.manageTimeWindows(acc)
|
||
|
|
||
|
// Create first throughput window and start go routine to manage them
|
||
|
ra.throughputWindowSlice = append(ra.throughputWindowSlice, &ThroughputWindow{
|
||
|
StartTime: now, EndTime: now.Add(ra.ThroughputWindowSize.Duration)})
|
||
|
ra.throughputTimer = time.NewTimer(ra.ThroughputWindowSize.Duration)
|
||
|
ra.stopThroughputChan = make(chan bool, 1)
|
||
|
ra.wg.Add(1)
|
||
|
go ra.manageThroughputWindows(acc)
|
||
|
|
||
|
// Start go routine to tail the file and put requests in windows
|
||
|
ra.wg.Add(1)
|
||
|
go ra.gatherFromFile(ra.tailer, acc)
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (ra *RequestAggregates) Stop() {
|
||
|
ra.Lock()
|
||
|
defer ra.Unlock()
|
||
|
|
||
|
err := ra.tailer.Stop()
|
||
|
if err != nil {
|
||
|
log.Printf("ERROR: could not stop tail on file %s\n", ra.File)
|
||
|
}
|
||
|
ra.tailer.Cleanup()
|
||
|
|
||
|
ra.timeTimer.Stop()
|
||
|
ra.stopTimeChan <- true
|
||
|
ra.throughputTimer.Stop()
|
||
|
ra.stopThroughputChan <- true
|
||
|
|
||
|
ra.wg.Wait()
|
||
|
}
|
||
|
|
||
|
// Validates the configuration in the struct
|
||
|
func (ra *RequestAggregates) validateConfig() error {
|
||
|
var err error
|
||
|
|
||
|
// Compile regex to identify success
|
||
|
if ra.ResultSuccessRegex != "" {
|
||
|
ra.successRegexp, err = regexp.Compile(ra.ResultSuccessRegex)
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("ERROR: success regexp is not valid, Error: %s", err)
|
||
|
}
|
||
|
}
|
||
|
// Check if timestamp format is valid
|
||
|
switch ra.TimestampFormat {
|
||
|
case "s", "ms", "us", "ns":
|
||
|
ra.isTimestampEpoch = true
|
||
|
break
|
||
|
default:
|
||
|
if time.Now().Format(ra.TimestampFormat) == ra.TimestampFormat {
|
||
|
return fmt.Errorf("ERROR: incorrect timestamp format")
|
||
|
}
|
||
|
}
|
||
|
// Check percentiles are valid
|
||
|
for _, percentile := range ra.TimePercentiles {
|
||
|
if percentile <= 0 || percentile >= 100 {
|
||
|
return fmt.Errorf("ERROR: percentiles must be numbers between 0 and 100 (not inclusive)")
|
||
|
}
|
||
|
}
|
||
|
//Check duration of windows
|
||
|
if ra.TimeWindowSize.Duration <= time.Duration(0) || ra.ThroughputWindowSize.Duration <= time.Duration(0) {
|
||
|
return fmt.Errorf("ERROR: windows need to be a positive duration")
|
||
|
}
|
||
|
// Check number of windows
|
||
|
if ra.TimeWindows <= 0 || ra.ThroughputWindows <= 0 {
|
||
|
return fmt.Errorf("ERROR: at least one window is required")
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// Executed as a go routine, tails a given file and puts the parsed requests into their respective windows.
|
||
|
func (ra *RequestAggregates) gatherFromFile(tailer *tail.Tail, acc telegraf.Accumulator) {
|
||
|
defer ra.wg.Done()
|
||
|
|
||
|
requestParser := &RequestParser{
|
||
|
TimestampPosition: ra.TimestampPosition,
|
||
|
TimestampFormat: ra.TimestampFormat,
|
||
|
IsTimeEpoch: ra.isTimestampEpoch,
|
||
|
TimePosition: ra.TimePosition,
|
||
|
ResultPosition: ra.ResultPosition,
|
||
|
SuccessRegexp: ra.successRegexp}
|
||
|
|
||
|
var err error
|
||
|
var line *tail.Line
|
||
|
var request *Request
|
||
|
for line = range tailer.Lines {
|
||
|
// Parse and validate line
|
||
|
if line.Err != nil {
|
||
|
log.Printf("ERROR: could not tail file %s, Error: %s\n", tailer.Filename, err)
|
||
|
continue
|
||
|
}
|
||
|
request, err = requestParser.ParseLine(line.Text)
|
||
|
if err != nil {
|
||
|
log.Printf("ERROR: malformed line in %s: [%s], Error: %s\n", tailer.Filename, line.Text, err)
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
// Wait until the window is created (it is possible that the line is read before the time ticks)
|
||
|
for ra.timeWindowSlice[len(ra.timeWindowSlice)-1].End().Before(request.Timestamp) {
|
||
|
time.Sleep(time.Millisecond * 10)
|
||
|
}
|
||
|
// Add request to time window
|
||
|
ra.timeMutex.Lock()
|
||
|
err = addToWindow(ra.timeWindowSlice, request)
|
||
|
if err != nil {
|
||
|
log.Printf("ERROR: could not find a time window, Request: %v, Error %s\n", request, err)
|
||
|
}
|
||
|
ra.timeMutex.Unlock()
|
||
|
|
||
|
// Wait until the window is created (it is possible that the line is read before the time ticks)
|
||
|
for ra.throughputWindowSlice[len(ra.throughputWindowSlice)-1].End().Before(request.Timestamp) {
|
||
|
time.Sleep(time.Millisecond * 10)
|
||
|
}
|
||
|
// Add request to throughput window
|
||
|
ra.throughputMutex.Lock()
|
||
|
err = addToWindow(ra.throughputWindowSlice, request)
|
||
|
if err != nil {
|
||
|
log.Printf("ERROR: could not find a throughput window, Request: %v, Error %s\n", request, err)
|
||
|
}
|
||
|
ra.throughputMutex.Unlock()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Executed as a go routine, manages the windows related to time measures, creating new ones and flushing old ones
|
||
|
func (ra *RequestAggregates) manageTimeWindows(acc telegraf.Accumulator) {
|
||
|
defer ra.wg.Done()
|
||
|
onlyTotal := ra.successRegexp == nil
|
||
|
for {
|
||
|
select {
|
||
|
// If the timer is triggered
|
||
|
case <-ra.timeTimer.C:
|
||
|
ra.timeMutex.Lock()
|
||
|
// Create new window with the start time of the last one's end time
|
||
|
startTime := ra.timeWindowSlice[len(ra.timeWindowSlice)-1].End()
|
||
|
endTime := startTime.Add(ra.TimeWindowSize.Duration)
|
||
|
ra.timeWindowSlice = append(ra.timeWindowSlice, &TimeWindow{
|
||
|
StartTime: startTime, EndTime: endTime,
|
||
|
OnlyTotal: onlyTotal, Percentiles: ra.TimePercentiles})
|
||
|
// Flush oldest one if necessary
|
||
|
if len(ra.timeWindowSlice) > ra.TimeWindows {
|
||
|
ra.timeWindowSlice = flushWindow(ra.timeWindowSlice, acc)
|
||
|
}
|
||
|
ra.timeMutex.Unlock()
|
||
|
// Reset time till the end of the window
|
||
|
ra.timeTimer.Reset(endTime.Sub(time.Now()))
|
||
|
// If the stop signal is received
|
||
|
case <-ra.stopTimeChan:
|
||
|
ra.timeMutex.Lock()
|
||
|
ra.timeWindowSlice = flushAllWindows(ra.timeWindowSlice, acc)
|
||
|
ra.timeMutex.Unlock()
|
||
|
return
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Executed as a go routine, manages the windows related to throughput measures, creating new ones and flushing old ones
|
||
|
func (ra *RequestAggregates) manageThroughputWindows(acc telegraf.Accumulator) {
|
||
|
defer ra.wg.Done()
|
||
|
for {
|
||
|
select {
|
||
|
// If the timer is triggered
|
||
|
case <-ra.throughputTimer.C:
|
||
|
ra.throughputMutex.Lock()
|
||
|
// Create new window with the start time of the last one's end time
|
||
|
startTime := ra.throughputWindowSlice[len(ra.throughputWindowSlice)-1].End()
|
||
|
endTime := startTime.Add(ra.ThroughputWindowSize.Duration)
|
||
|
ra.throughputWindowSlice = append(ra.throughputWindowSlice, &ThroughputWindow{
|
||
|
StartTime: startTime, EndTime: endTime})
|
||
|
// Flush oldest one if necessary
|
||
|
if len(ra.throughputWindowSlice) > ra.ThroughputWindows {
|
||
|
ra.throughputWindowSlice = flushWindow(ra.throughputWindowSlice, acc)
|
||
|
}
|
||
|
ra.throughputMutex.Unlock()
|
||
|
ra.throughputTimer.Reset(endTime.Sub(time.Now()))
|
||
|
// If the stop signal is received
|
||
|
case <-ra.stopThroughputChan:
|
||
|
ra.throughputMutex.Lock()
|
||
|
ra.throughputWindowSlice = flushAllWindows(ra.throughputWindowSlice, acc)
|
||
|
ra.throughputMutex.Unlock()
|
||
|
return
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Removes the window at the front of the slice of windows and flushes its aggregated metrics to the accumulator
|
||
|
func flushWindow(windows []Window, acc telegraf.Accumulator) []Window {
|
||
|
if len(windows) > 0 {
|
||
|
var window Window
|
||
|
window, windows = windows[0], windows[1:]
|
||
|
metrics, err := window.Aggregate()
|
||
|
if err != nil {
|
||
|
log.Printf("ERROR: could not flush window, Error: %s\n", err)
|
||
|
}
|
||
|
for _, metric := range metrics {
|
||
|
acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time())
|
||
|
}
|
||
|
}
|
||
|
return windows
|
||
|
}
|
||
|
|
||
|
// Flushes all windows ot the accumulator
|
||
|
func flushAllWindows(windows []Window, acc telegraf.Accumulator) []Window {
|
||
|
for len(windows) > 0 {
|
||
|
windows = flushWindow(windows, acc)
|
||
|
}
|
||
|
return windows
|
||
|
}
|
||
|
|
||
|
// Adds a request to a window, returns and error if it could not be added
|
||
|
func addToWindow(windows []Window, request *Request) error {
|
||
|
if len(windows) == 0 {
|
||
|
return fmt.Errorf("ERROR: no windows found")
|
||
|
}
|
||
|
first := windows[len(windows)-1]
|
||
|
if first.End().Before(request.Timestamp) {
|
||
|
return fmt.Errorf("ERROR: request is newer than any window")
|
||
|
}
|
||
|
last := windows[0]
|
||
|
if last.Start().After(request.Timestamp) {
|
||
|
return fmt.Errorf("ERROR: request is older than any window, try adding more windows")
|
||
|
}
|
||
|
for i := range windows {
|
||
|
window := windows[i]
|
||
|
if (window.Start().Before(request.Timestamp) || window.Start().Equal(request.Timestamp)) &&
|
||
|
window.End().After(request.Timestamp) {
|
||
|
return window.Add(request)
|
||
|
}
|
||
|
}
|
||
|
return fmt.Errorf("ERROR: no window could be found")
|
||
|
}
|
||
|
|
||
|
func init() {
|
||
|
inputs.Add("request_aggregates", func() telegraf.Input {
|
||
|
return NewRequestAggregates()
|
||
|
})
|
||
|
}
|