telegraf/plugins/inputs/request_aggregates/request_aggregates.go

package request_aggregates

import (
	"fmt"
	"github.com/hpcloud/tail"
	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/internal"
	"github.com/influxdata/telegraf/plugins/inputs"
	"log"
	"regexp"
	"sync"
	"time"
)

type RequestAggregates struct {
	File                 string
	TimestampPosition    int
	TimestampFormat      string
	TimePosition         int
	TimePercentiles      []float32
	TimeWindowSize       internal.Duration
	TimeWindows          int
	ResultPosition       int
	ResultSuccessRegex   string
	ThroughputWindowSize internal.Duration
	ThroughputWindows    int

	isTimestampEpoch      bool
	successRegexp         *regexp.Regexp
	tailer                *tail.Tail
	timeWindowSlice       []Window
	throughputWindowSlice []Window
	timeTimer             *time.Timer
	throughputTimer       *time.Timer
	stopTimeChan          chan bool
	stopThroughputChan    chan bool
	timeMutex             sync.Mutex
	throughputMutex       sync.Mutex
	wg                    sync.WaitGroup
	sync.Mutex
}

func NewRequestAggregates() *RequestAggregates {
	return &RequestAggregates{
		TimeWindows:       2,
		ThroughputWindows: 10}
}

const sampleConfig = `
  # File to monitor.
  file = "/var/server/access.csv"
  # Position of the timestamp of the request in every line
  timestamp_position = 0
  # Format of the timestamp (any layout accepted by Go Time.Parse or s/ms/us/ns for epoch time)
  timestamp_format = "ms"
  # Position of the time value to calculate in the log file (starting from 0)
  time_position = 1
  # Window to consider for time percentiles
  time_window_size = "60s"
  # Windows to keep in memory before flushing in order to avoid requests coming in after a window is shut.
  # If the CSV file is sorted by timestamp, this can be set to 1
  time_windows = 5
  # List of percentiles to calculate
  time_percentiles = [90.0, 95.0, 99.0, 99.99]
  # Position of the result column (success or failure)
  result_position = 3
  # Regular expression used to determine if the result is successful or not (if empty only request_aggregates_all
  # time series) will be generated
  result_success_regex = ".*true.*"
  # Time window to calculate throughput counters
  throughput_window_size = "1s"
  # Number of windows to keep in memory for throughput calculation
  throughput_windows = 300
  # List of tags and their values to add to every data point
  [inputs.aggregates.tags]
    name = "myserver"
`

func (ra *RequestAggregates) SampleConfig() string {
	return sampleConfig
}

func (ra *RequestAggregates) Description() string {
	return "Generates a set of aggregate values for a requests and their response times."
}

func (ra *RequestAggregates) Gather(acc telegraf.Accumulator) error {
	return nil
}

func (ra *RequestAggregates) Start(acc telegraf.Accumulator) error {
	ra.Lock()
	defer ra.Unlock()

	err := ra.validateConfig()
	if err != nil {
		return err
	}

	// Create tailer
	ra.tailer, err = tail.TailFile(ra.File, tail.Config{
		Follow:   true,
		ReOpen:   true,
		Location: &tail.SeekInfo{Whence: 2, Offset: 0}})
	if err != nil {
		return fmt.Errorf("ERROR tailing file %s, Error: %s", ra.File, err)
	}

	// Create first time window and start go routine to manage them
	now := time.Now()
	ra.timeWindowSlice = append(ra.timeWindowSlice, &TimeWindow{
		StartTime: now, EndTime: now.Add(ra.TimeWindowSize.Duration),
		OnlyTotal: ra.successRegexp == nil, Percentiles: ra.TimePercentiles})
	ra.timeTimer = time.NewTimer(ra.TimeWindowSize.Duration)
	ra.stopTimeChan = make(chan bool, 1)
	ra.wg.Add(1)
	go ra.manageTimeWindows(acc)

	// Create first throughput window and start go routine to manage them
	ra.throughputWindowSlice = append(ra.throughputWindowSlice, &ThroughputWindow{
		StartTime: now, EndTime: now.Add(ra.ThroughputWindowSize.Duration)})
	ra.throughputTimer = time.NewTimer(ra.ThroughputWindowSize.Duration)
	ra.stopThroughputChan = make(chan bool, 1)
	ra.wg.Add(1)
	go ra.manageThroughputWindows(acc)

	// Start go routine to tail the file and put requests in windows
	ra.wg.Add(1)
	go ra.gatherFromFile(ra.tailer, acc)

	return nil
}

func (ra *RequestAggregates) Stop() {
	ra.Lock()
	defer ra.Unlock()

	err := ra.tailer.Stop()
	if err != nil {
		log.Printf("ERROR: could not stop tail on file %s\n", ra.File)
	}
	ra.tailer.Cleanup()

	ra.timeTimer.Stop()
	ra.stopTimeChan <- true
	ra.throughputTimer.Stop()
	ra.stopThroughputChan <- true

	ra.wg.Wait()
}

// Validates the configuration in the struct
func (ra *RequestAggregates) validateConfig() error {
	var err error

	// Compile regex to identify success
	if ra.ResultSuccessRegex != "" {
		ra.successRegexp, err = regexp.Compile(ra.ResultSuccessRegex)
		if err != nil {
			return fmt.Errorf("ERROR: success regexp is not valid, Error: %s", err)
		}
	}
	// Check if timestamp format is valid
	switch ra.TimestampFormat {
	case "s", "ms", "us", "ns":
		ra.isTimestampEpoch = true
		break
	default:
		if time.Now().Format(ra.TimestampFormat) == ra.TimestampFormat {
			return fmt.Errorf("ERROR: incorrect timestamp format")
		}
	}
	// Check percentiles are valid
	for _, percentile := range ra.TimePercentiles {
		if percentile <= 0 || percentile >= 100 {
			return fmt.Errorf("ERROR: percentiles must be numbers between 0 and 100 (not inclusive)")
		}
	}
	//Check duration of windows
	if ra.TimeWindowSize.Duration <= time.Duration(0) || ra.ThroughputWindowSize.Duration <= time.Duration(0) {
		return fmt.Errorf("ERROR: windows need to be a positive duration")
	}
	// Check number of windows
	if ra.TimeWindows <= 0 || ra.ThroughputWindows <= 0 {
		return fmt.Errorf("ERROR: at least one window is required")
	}

	return nil
}

// Executed as a go routine, tails a given file and puts the parsed requests into their respective windows.
func (ra *RequestAggregates) gatherFromFile(tailer *tail.Tail, acc telegraf.Accumulator) {
	defer ra.wg.Done()

	requestParser := &RequestParser{
		TimestampPosition: ra.TimestampPosition,
		TimestampFormat:   ra.TimestampFormat,
		IsTimeEpoch:       ra.isTimestampEpoch,
		TimePosition:      ra.TimePosition,
		ResultPosition:    ra.ResultPosition,
		SuccessRegexp:     ra.successRegexp}

	var err error
	var line *tail.Line
	var request *Request
	for line = range tailer.Lines {
		// Parse and validate line
		if line.Err != nil {
			log.Printf("ERROR: could not tail file %s, Error: %s\n", tailer.Filename, err)
			continue
		}
		request, err = requestParser.ParseLine(line.Text)
		if err != nil {
			log.Printf("ERROR: malformed line in %s: [%s], Error: %s\n", tailer.Filename, line.Text, err)
			continue
		}

		// Wait until the window is created (it is possible that the line is read before the time ticks)
		for ra.timeWindowSlice[len(ra.timeWindowSlice)-1].End().Before(request.Timestamp) {
			time.Sleep(time.Millisecond * 10)
		}
		// Add request to time window
		ra.timeMutex.Lock()
		err = addToWindow(ra.timeWindowSlice, request)
		if err != nil {
			log.Printf("ERROR: could not find a time window, Request: %v, Error %s\n", request, err)
		}
		ra.timeMutex.Unlock()

		// Wait until the window is created (it is possible that the line is read before the time ticks)
		for ra.throughputWindowSlice[len(ra.throughputWindowSlice)-1].End().Before(request.Timestamp) {
			time.Sleep(time.Millisecond * 10)
		}
		// Add request to throughput window
		ra.throughputMutex.Lock()
		err = addToWindow(ra.throughputWindowSlice, request)
		if err != nil {
			log.Printf("ERROR: could not find a throughput window, Request: %v, Error %s\n", request, err)
		}
		ra.throughputMutex.Unlock()
	}
}

// Executed as a go routine, manages the windows related to time measures, creating new ones and flushing old ones
func (ra *RequestAggregates) manageTimeWindows(acc telegraf.Accumulator) {
	defer ra.wg.Done()
	onlyTotal := ra.successRegexp == nil
	for {
		select {
		// If the timer is triggered
		case <-ra.timeTimer.C:
			ra.timeMutex.Lock()
			// Create new window with the start time of the last one's end time
			startTime := ra.timeWindowSlice[len(ra.timeWindowSlice)-1].End()
			endTime := startTime.Add(ra.TimeWindowSize.Duration)
			ra.timeWindowSlice = append(ra.timeWindowSlice, &TimeWindow{
				StartTime: startTime, EndTime: endTime,
				OnlyTotal: onlyTotal, Percentiles: ra.TimePercentiles})
			// Flush oldest one if necessary
			if len(ra.timeWindowSlice) > ra.TimeWindows {
				ra.timeWindowSlice = flushWindow(ra.timeWindowSlice, acc)
			}
			ra.timeMutex.Unlock()
			// Reset time till the end of the window
			ra.timeTimer.Reset(endTime.Sub(time.Now()))
		// If the stop signal is received
		case <-ra.stopTimeChan:
			ra.timeMutex.Lock()
			ra.timeWindowSlice = flushAllWindows(ra.timeWindowSlice, acc)
			ra.timeMutex.Unlock()
			return
		}
	}
}

// Executed as a go routine, manages the windows related to throughput measures, creating new ones and flushing old ones
func (ra *RequestAggregates) manageThroughputWindows(acc telegraf.Accumulator) {
	defer ra.wg.Done()
	for {
		select {
		// If the timer is triggered
		case <-ra.throughputTimer.C:
			ra.throughputMutex.Lock()
			// Create new window with the start time of the last one's end time
			startTime := ra.throughputWindowSlice[len(ra.throughputWindowSlice)-1].End()
			endTime := startTime.Add(ra.ThroughputWindowSize.Duration)
			ra.throughputWindowSlice = append(ra.throughputWindowSlice, &ThroughputWindow{
				StartTime: startTime, EndTime: endTime})
			// Flush oldest one if necessary
			if len(ra.throughputWindowSlice) > ra.ThroughputWindows {
				ra.throughputWindowSlice = flushWindow(ra.throughputWindowSlice, acc)
			}
			ra.throughputMutex.Unlock()
			ra.throughputTimer.Reset(endTime.Sub(time.Now()))
		// If the stop signal is received
		case <-ra.stopThroughputChan:
			ra.throughputMutex.Lock()
			ra.throughputWindowSlice = flushAllWindows(ra.throughputWindowSlice, acc)
			ra.throughputMutex.Unlock()
			return
		}
	}
}

// Removes the window at the front of the slice of windows and flushes its aggregated metrics to the accumulator
func flushWindow(windows []Window, acc telegraf.Accumulator) []Window {
	if len(windows) > 0 {
		var window Window
		window, windows = windows[0], windows[1:]
		metrics, err := window.Aggregate()
		if err != nil {
			log.Printf("ERROR: could not flush window, Error: %s\n", err)
		}
		for _, metric := range metrics {
			acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time())
		}
	}
	return windows
}

// Flushes all windows ot the accumulator
func flushAllWindows(windows []Window, acc telegraf.Accumulator) []Window {
	for len(windows) > 0 {
		windows = flushWindow(windows, acc)
	}
	return windows
}

// Adds a request to a window, returns and error if it could not be added
func addToWindow(windows []Window, request *Request) error {
	if len(windows) == 0 {
		return fmt.Errorf("ERROR: no windows found")
	}
	first := windows[len(windows)-1]
	if first.End().Before(request.Timestamp) {
		return fmt.Errorf("ERROR: request is newer than any window")
	}
	last := windows[0]
	if last.Start().After(request.Timestamp) {
		return fmt.Errorf("ERROR: request is older than any window, try adding more windows")
	}
	for i := range windows {
		window := windows[i]
		if (window.Start().Before(request.Timestamp) || window.Start().Equal(request.Timestamp)) &&
			window.End().After(request.Timestamp) {
			return window.Add(request)
		}
	}
	return fmt.Errorf("ERROR: no window could be found")
}

func init() {
	inputs.Add("request_aggregates", func() telegraf.Input {
		return NewRequestAggregates()
	})
}