Collect cloudwatch stats using GetMetricData (#5544)
This commit is contained in:
parent
58a6209a76
commit
e334830458
|
@ -17,7 +17,7 @@ API endpoint. In the following order the plugin will attempt to authenticate.
|
|||
|
||||
```toml
|
||||
[[inputs.cloudwatch]]
|
||||
## Amazon Region (required)
|
||||
## Amazon Region
|
||||
region = "us-east-1"
|
||||
|
||||
## Amazon Credentials
|
||||
|
@ -54,32 +54,43 @@ API endpoint. In the following order the plugin will attempt to authenticate.
|
|||
## Collection Delay (required - must account for metrics availability via CloudWatch API)
|
||||
delay = "5m"
|
||||
|
||||
## Override global run interval (optional - defaults to global interval)
|
||||
## Recomended: use metric 'interval' that is a multiple of 'period' to avoid
|
||||
## Recommended: use metric 'interval' that is a multiple of 'period' to avoid
|
||||
## gaps or overlap in pulled data
|
||||
interval = "5m"
|
||||
|
||||
## Configure the TTL for the internal cache of metrics.
|
||||
# cache_ttl = "1h"
|
||||
|
||||
## Metric Statistic Namespace (required)
|
||||
namespace = "AWS/ELB"
|
||||
|
||||
## Maximum requests per second. Note that the global default AWS rate limit is
|
||||
## 400 reqs/sec, so if you define multiple namespaces, these should add up to a
|
||||
## maximum of 400. Optional - default value is 200.
|
||||
## 50 reqs/sec, so if you define multiple namespaces, these should add up to a
|
||||
## maximum of 50.
|
||||
## See http://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_limits.html
|
||||
ratelimit = 200
|
||||
# ratelimit = 25
|
||||
|
||||
## Metrics to Pull (optional)
|
||||
## Namespace-wide statistic filters. These allow fewer queries to be made to
|
||||
## cloudwatch.
|
||||
# statistic_include = [ "average", "sum", minimum", "maximum", sample_count" ]
|
||||
# statistic_exclude = []
|
||||
|
||||
## Metrics to Pull
|
||||
## Defaults to all Metrics in Namespace if nothing is provided
|
||||
## Refreshes Namespace available metrics every 1h
|
||||
[[inputs.cloudwatch.metrics]]
|
||||
names = ["Latency", "RequestCount"]
|
||||
|
||||
## Dimension filters for Metric. These are optional however all dimensions
|
||||
## defined for the metric names must be specified in order to retrieve
|
||||
## the metric statistics.
|
||||
[[inputs.cloudwatch.metrics.dimensions]]
|
||||
name = "LoadBalancerName"
|
||||
value = "p-example"
|
||||
#[[inputs.cloudwatch.metrics]]
|
||||
# names = ["Latency", "RequestCount"]
|
||||
#
|
||||
# ## Statistic filters for Metric. These allow for retrieving specific
|
||||
# ## statistics for an individual metric.
|
||||
# # statistic_include = [ "average", "sum", minimum", "maximum", sample_count" ]
|
||||
# # statistic_exclude = []
|
||||
#
|
||||
# ## Dimension filters for Metric. All dimensions defined for the metric names
|
||||
# ## must be specified in order to retrieve the metric statistics.
|
||||
# [[inputs.cloudwatch.metrics.dimensions]]
|
||||
# name = "LoadBalancerName"
|
||||
# value = "p-example"
|
||||
```
|
||||
#### Requirements and Terminology
|
||||
|
||||
|
@ -97,6 +108,10 @@ wildcard dimension is ignored.
|
|||
|
||||
Example:
|
||||
```
|
||||
[[inputs.cloudwatch]]
|
||||
period = "1m"
|
||||
interval = "5m"
|
||||
|
||||
[[inputs.cloudwatch.metrics]]
|
||||
names = ["Latency"]
|
||||
|
||||
|
@ -124,9 +139,11 @@ Then 2 metrics will be output:
|
|||
If the `AvailabilityZone` wildcard dimension was omitted, then a single metric (name: `p-example`)
|
||||
would be exported containing the aggregate values of the ELB across availability zones.
|
||||
|
||||
To maximize efficiency and savings, consider making fewer requests by increasing `interval` but keeping `period` at the duration you would like metrics to be reported. The above example will request metrics from Cloudwatch every 5 minutes but will output five metrics timestamped one minute apart.
|
||||
|
||||
#### Restrictions and Limitations
|
||||
- CloudWatch metrics are not available instantly via the CloudWatch API. You should adjust your collection `delay` to account for this lag in metrics availability based on your [monitoring subscription level](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html)
|
||||
- CloudWatch API usage incurs cost - see [GetMetricStatistics Pricing](https://aws.amazon.com/cloudwatch/pricing/)
|
||||
- CloudWatch API usage incurs cost - see [GetMetricData Pricing](https://aws.amazon.com/cloudwatch/pricing/)
|
||||
|
||||
### Measurements & Fields:
|
||||
|
||||
|
@ -147,7 +164,6 @@ Tag Dimension names are represented in [snake case](https://en.wikipedia.org/wik
|
|||
|
||||
- All measurements have the following tags:
|
||||
- region (CloudWatch Region)
|
||||
- unit (CloudWatch Metric Unit)
|
||||
- {dimension-name} (Cloudwatch Dimension value - one for each metric dimension)
|
||||
|
||||
### Troubleshooting:
|
||||
|
@ -168,5 +184,5 @@ aws cloudwatch get-metric-statistics --namespace AWS/EC2 --region us-east-1 --pe
|
|||
|
||||
```
|
||||
$ ./telegraf --config telegraf.conf --input-filter cloudwatch --test
|
||||
> cloudwatch_aws_elb,load_balancer_name=p-example,region=us-east-1,unit=seconds latency_average=0.004810798017284538,latency_maximum=0.1100282669067383,latency_minimum=0.0006084442138671875,latency_sample_count=4029,latency_sum=19.382705211639404 1459542420000000000
|
||||
> cloudwatch_aws_elb,load_balancer_name=p-example,region=us-east-1 latency_average=0.004810798017284538,latency_maximum=0.1100282669067383,latency_minimum=0.0006084442138671875,latency_sample_count=4029,latency_sum=19.382705211639404 1459542420000000000
|
||||
```
|
||||
|
|
|
@ -1,32 +1,38 @@
|
|||
package cloudwatch
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/aws/aws-sdk-go/aws"
|
||||
|
||||
"github.com/aws/aws-sdk-go/service/cloudwatch"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/filter"
|
||||
"github.com/influxdata/telegraf/internal"
|
||||
internalaws "github.com/influxdata/telegraf/internal/config/aws"
|
||||
"github.com/influxdata/telegraf/internal/limiter"
|
||||
"github.com/influxdata/telegraf/metric"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
||||
type (
|
||||
// CloudWatch contains the configuration and cache for the cloudwatch plugin.
|
||||
CloudWatch struct {
|
||||
Region string `toml:"region"`
|
||||
AccessKey string `toml:"access_key"`
|
||||
SecretKey string `toml:"secret_key"`
|
||||
RoleARN string `toml:"role_arn"`
|
||||
Profile string `toml:"profile"`
|
||||
Filename string `toml:"shared_credential_file"`
|
||||
CredentialPath string `toml:"shared_credential_file"`
|
||||
Token string `toml:"token"`
|
||||
EndpointURL string `toml:"endpoint_url"`
|
||||
StatisticExclude []string `toml:"statistic_exclude"`
|
||||
StatisticInclude []string `toml:"statistic_include"`
|
||||
|
||||
Period internal.Duration `toml:"period"`
|
||||
Delay internal.Duration `toml:"delay"`
|
||||
|
@ -34,34 +40,44 @@ type (
|
|||
Metrics []*Metric `toml:"metrics"`
|
||||
CacheTTL internal.Duration `toml:"cache_ttl"`
|
||||
RateLimit int `toml:"ratelimit"`
|
||||
|
||||
client cloudwatchClient
|
||||
metricCache *MetricCache
|
||||
statFilter filter.Filter
|
||||
metricCache *metricCache
|
||||
queryDimensions map[string]*map[string]string
|
||||
windowStart time.Time
|
||||
windowEnd time.Time
|
||||
}
|
||||
|
||||
// Metric defines a simplified Cloudwatch metric.
|
||||
Metric struct {
|
||||
StatisticExclude *[]string `toml:"statistic_exclude"`
|
||||
StatisticInclude *[]string `toml:"statistic_include"`
|
||||
MetricNames []string `toml:"names"`
|
||||
Dimensions []*Dimension `toml:"dimensions"`
|
||||
}
|
||||
|
||||
// Dimension defines a simplified Cloudwatch dimension (provides metric filtering).
|
||||
Dimension struct {
|
||||
Name string `toml:"name"`
|
||||
Value string `toml:"value"`
|
||||
}
|
||||
|
||||
MetricCache struct {
|
||||
TTL time.Duration
|
||||
Fetched time.Time
|
||||
Metrics []*cloudwatch.Metric
|
||||
// metricCache caches metrics, their filters, and generated queries.
|
||||
metricCache struct {
|
||||
ttl time.Duration
|
||||
built time.Time
|
||||
metrics []filteredMetric
|
||||
queries []*cloudwatch.MetricDataQuery
|
||||
}
|
||||
|
||||
cloudwatchClient interface {
|
||||
ListMetrics(*cloudwatch.ListMetricsInput) (*cloudwatch.ListMetricsOutput, error)
|
||||
GetMetricStatistics(*cloudwatch.GetMetricStatisticsInput) (*cloudwatch.GetMetricStatisticsOutput, error)
|
||||
GetMetricData(*cloudwatch.GetMetricDataInput) (*cloudwatch.GetMetricDataOutput, error)
|
||||
}
|
||||
)
|
||||
|
||||
// SampleConfig returns the default configuration of the Cloudwatch input plugin.
|
||||
func (c *CloudWatch) SampleConfig() string {
|
||||
return `
|
||||
## Amazon Region
|
||||
|
@ -106,44 +122,155 @@ func (c *CloudWatch) SampleConfig() string {
|
|||
interval = "5m"
|
||||
|
||||
## Configure the TTL for the internal cache of metrics.
|
||||
## Defaults to 1 hr if not specified
|
||||
#cache_ttl = "10m"
|
||||
# cache_ttl = "1h"
|
||||
|
||||
## Metric Statistic Namespace (required)
|
||||
namespace = "AWS/ELB"
|
||||
|
||||
## Maximum requests per second. Note that the global default AWS rate limit is
|
||||
## 400 reqs/sec, so if you define multiple namespaces, these should add up to a
|
||||
## maximum of 400. Optional - default value is 200.
|
||||
## 50 reqs/sec, so if you define multiple namespaces, these should add up to a
|
||||
## maximum of 50.
|
||||
## See http://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/cloudwatch_limits.html
|
||||
ratelimit = 200
|
||||
# ratelimit = 25
|
||||
|
||||
## Metrics to Pull (optional)
|
||||
## Namespace-wide statistic filters. These allow fewer queries to be made to
|
||||
## cloudwatch.
|
||||
# statistic_include = [ "average", "sum", minimum", "maximum", sample_count" ]
|
||||
# statistic_exclude = []
|
||||
|
||||
## Metrics to Pull
|
||||
## Defaults to all Metrics in Namespace if nothing is provided
|
||||
## Refreshes Namespace available metrics every 1h
|
||||
#[[inputs.cloudwatch.metrics]]
|
||||
# names = ["Latency", "RequestCount"]
|
||||
#
|
||||
# ## Dimension filters for Metric. These are optional however all dimensions
|
||||
# ## defined for the metric names must be specified in order to retrieve
|
||||
# ## the metric statistics.
|
||||
# ## Statistic filters for Metric. These allow for retrieving specific
|
||||
# ## statistics for an individual metric.
|
||||
# # statistic_include = [ "average", "sum", minimum", "maximum", sample_count" ]
|
||||
# # statistic_exclude = []
|
||||
#
|
||||
# ## Dimension filters for Metric. All dimensions defined for the metric names
|
||||
# ## must be specified in order to retrieve the metric statistics.
|
||||
# [[inputs.cloudwatch.metrics.dimensions]]
|
||||
# name = "LoadBalancerName"
|
||||
# value = "p-example"
|
||||
`
|
||||
}
|
||||
|
||||
// Description returns a one-sentence description on the Cloudwatch input plugin.
|
||||
func (c *CloudWatch) Description() string {
|
||||
return "Pull Metric Statistics from Amazon CloudWatch"
|
||||
}
|
||||
|
||||
func SelectMetrics(c *CloudWatch) ([]*cloudwatch.Metric, error) {
|
||||
var metrics []*cloudwatch.Metric
|
||||
// Gather takes in an accumulator and adds the metrics that the Input
|
||||
// gathers. This is called every "interval".
|
||||
func (c *CloudWatch) Gather(acc telegraf.Accumulator) error {
|
||||
if c.statFilter == nil {
|
||||
var err error
|
||||
// Set config level filter (won't change throughout life of plugin).
|
||||
c.statFilter, err = filter.NewIncludeExcludeFilter(c.StatisticInclude, c.StatisticExclude)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if c.client == nil {
|
||||
c.initializeCloudWatch()
|
||||
}
|
||||
|
||||
filteredMetrics, err := getFilteredMetrics(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = c.updateWindow(time.Now())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Get all of the possible queries so we can send groups of 100.
|
||||
queries, err := c.getDataQueries(filteredMetrics)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Limit concurrency or we can easily exhaust user connection limit.
|
||||
// See cloudwatch API request limits:
|
||||
// http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/cloudwatch_limits.html
|
||||
lmtr := limiter.NewRateLimiter(c.RateLimit, time.Second)
|
||||
defer lmtr.Stop()
|
||||
wg := sync.WaitGroup{}
|
||||
rLock := sync.Mutex{}
|
||||
|
||||
results := []*cloudwatch.MetricDataResult{}
|
||||
|
||||
// 100 is the maximum number of metric data queries a `GetMetricData` request can contain.
|
||||
batchSize := 100
|
||||
var batches [][]*cloudwatch.MetricDataQuery
|
||||
|
||||
for batchSize < len(queries) {
|
||||
queries, batches = queries[batchSize:], append(batches, queries[0:batchSize:batchSize])
|
||||
}
|
||||
batches = append(batches, queries)
|
||||
|
||||
for i := range batches {
|
||||
wg.Add(1)
|
||||
<-lmtr.C
|
||||
go func(inm []*cloudwatch.MetricDataQuery) {
|
||||
defer wg.Done()
|
||||
result, err := c.gatherMetrics(c.getDataInputs(inm))
|
||||
if err != nil {
|
||||
acc.AddError(err)
|
||||
return
|
||||
}
|
||||
|
||||
rLock.Lock()
|
||||
results = append(results, result...)
|
||||
rLock.Unlock()
|
||||
}(batches[i])
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
return c.aggregateMetrics(acc, results)
|
||||
}
|
||||
|
||||
func (c *CloudWatch) initializeCloudWatch() error {
|
||||
credentialConfig := &internalaws.CredentialConfig{
|
||||
Region: c.Region,
|
||||
AccessKey: c.AccessKey,
|
||||
SecretKey: c.SecretKey,
|
||||
RoleARN: c.RoleARN,
|
||||
Profile: c.Profile,
|
||||
Filename: c.CredentialPath,
|
||||
Token: c.Token,
|
||||
EndpointURL: c.EndpointURL,
|
||||
}
|
||||
configProvider := credentialConfig.Credentials()
|
||||
|
||||
cfg := &aws.Config{}
|
||||
loglevel := aws.LogOff
|
||||
c.client = cloudwatch.New(configProvider, cfg.WithLogLevel(loglevel))
|
||||
return nil
|
||||
}
|
||||
|
||||
type filteredMetric struct {
|
||||
metrics []*cloudwatch.Metric
|
||||
statFilter filter.Filter
|
||||
}
|
||||
|
||||
// getFilteredMetrics returns metrics specified in the config file or metrics listed from Cloudwatch.
|
||||
func getFilteredMetrics(c *CloudWatch) ([]filteredMetric, error) {
|
||||
if c.metricCache != nil && c.metricCache.isValid() {
|
||||
return c.metricCache.metrics, nil
|
||||
}
|
||||
|
||||
fMetrics := []filteredMetric{}
|
||||
|
||||
// check for provided metric filter
|
||||
if c.Metrics != nil {
|
||||
metrics = []*cloudwatch.Metric{}
|
||||
for _, m := range c.Metrics {
|
||||
metrics := []*cloudwatch.Metric{}
|
||||
if !hasWilcard(m.Dimensions) {
|
||||
dimensions := make([]*cloudwatch.Dimension, len(m.Dimensions))
|
||||
for k, d := range m.Dimensions {
|
||||
|
@ -176,53 +303,73 @@ func SelectMetrics(c *CloudWatch) ([]*cloudwatch.Metric, error) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if m.StatisticExclude == nil {
|
||||
m.StatisticExclude = &c.StatisticExclude
|
||||
}
|
||||
} else {
|
||||
var err error
|
||||
metrics, err = c.fetchNamespaceMetrics()
|
||||
if m.StatisticInclude == nil {
|
||||
m.StatisticInclude = &c.StatisticInclude
|
||||
}
|
||||
statFilter, err := filter.NewIncludeExcludeFilter(*m.StatisticInclude, *m.StatisticExclude)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fMetrics = append(fMetrics, filteredMetric{
|
||||
metrics: metrics,
|
||||
statFilter: statFilter,
|
||||
})
|
||||
}
|
||||
} else {
|
||||
metrics, err := c.fetchNamespaceMetrics()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fMetrics = []filteredMetric{{
|
||||
metrics: metrics,
|
||||
statFilter: c.statFilter,
|
||||
}}
|
||||
}
|
||||
|
||||
c.metricCache = &metricCache{
|
||||
metrics: fMetrics,
|
||||
built: time.Now(),
|
||||
ttl: c.CacheTTL.Duration,
|
||||
}
|
||||
|
||||
return fMetrics, nil
|
||||
}
|
||||
|
||||
// fetchNamespaceMetrics retrieves available metrics for a given CloudWatch namespace.
|
||||
func (c *CloudWatch) fetchNamespaceMetrics() ([]*cloudwatch.Metric, error) {
|
||||
metrics := []*cloudwatch.Metric{}
|
||||
|
||||
var token *string
|
||||
params := &cloudwatch.ListMetricsInput{
|
||||
Namespace: aws.String(c.Namespace),
|
||||
Dimensions: []*cloudwatch.DimensionFilter{},
|
||||
NextToken: token,
|
||||
MetricName: nil,
|
||||
}
|
||||
|
||||
for {
|
||||
resp, err := c.client.ListMetrics(params)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
metrics = append(metrics, resp.Metrics...)
|
||||
if resp.NextToken == nil {
|
||||
break
|
||||
}
|
||||
|
||||
params.NextToken = resp.NextToken
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
func (c *CloudWatch) Gather(acc telegraf.Accumulator) error {
|
||||
if c.client == nil {
|
||||
c.initializeCloudWatch()
|
||||
}
|
||||
|
||||
metrics, err := SelectMetrics(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
|
||||
err = c.updateWindow(now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// limit concurrency or we can easily exhaust user connection limit
|
||||
// see cloudwatch API request limits:
|
||||
// http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/cloudwatch_limits.html
|
||||
lmtr := limiter.NewRateLimiter(c.RateLimit, time.Second)
|
||||
defer lmtr.Stop()
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(metrics))
|
||||
for _, m := range metrics {
|
||||
<-lmtr.C
|
||||
go func(inm *cloudwatch.Metric) {
|
||||
defer wg.Done()
|
||||
acc.AddError(c.gatherMetric(acc, inm))
|
||||
}(m)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *CloudWatch) updateWindow(relativeTo time.Time) error {
|
||||
windowEnd := relativeTo.Add(-c.Delay.Duration)
|
||||
|
||||
|
@ -239,168 +386,197 @@ func (c *CloudWatch) updateWindow(relativeTo time.Time) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// getDataQueries gets all of the possible queries so we can maximize the request payload.
|
||||
func (c *CloudWatch) getDataQueries(filteredMetrics []filteredMetric) ([]*cloudwatch.MetricDataQuery, error) {
|
||||
if c.metricCache != nil && c.metricCache.queries != nil && c.metricCache.isValid() {
|
||||
return c.metricCache.queries, nil
|
||||
}
|
||||
|
||||
c.queryDimensions = map[string]*map[string]string{}
|
||||
|
||||
dataQueries := []*cloudwatch.MetricDataQuery{}
|
||||
for i, filtered := range filteredMetrics {
|
||||
for j, metric := range filtered.metrics {
|
||||
id := strconv.Itoa(j) + "_" + strconv.Itoa(i)
|
||||
dimension := ctod(metric.Dimensions)
|
||||
if filtered.statFilter.Match("average") {
|
||||
c.queryDimensions["average_"+id] = dimension
|
||||
dataQueries = append(dataQueries, &cloudwatch.MetricDataQuery{
|
||||
Id: aws.String("average_" + id),
|
||||
Label: aws.String(snakeCase(*metric.MetricName + "_average")),
|
||||
MetricStat: &cloudwatch.MetricStat{
|
||||
Metric: metric,
|
||||
Period: aws.Int64(int64(c.Period.Duration.Seconds())),
|
||||
Stat: aws.String(cloudwatch.StatisticAverage),
|
||||
},
|
||||
})
|
||||
}
|
||||
if filtered.statFilter.Match("maximum") {
|
||||
c.queryDimensions["maximum_"+id] = dimension
|
||||
dataQueries = append(dataQueries, &cloudwatch.MetricDataQuery{
|
||||
Id: aws.String("maximum_" + id),
|
||||
Label: aws.String(snakeCase(*metric.MetricName + "_maximum")),
|
||||
MetricStat: &cloudwatch.MetricStat{
|
||||
Metric: metric,
|
||||
Period: aws.Int64(int64(c.Period.Duration.Seconds())),
|
||||
Stat: aws.String(cloudwatch.StatisticMaximum),
|
||||
},
|
||||
})
|
||||
}
|
||||
if filtered.statFilter.Match("minimum") {
|
||||
c.queryDimensions["minimum_"+id] = dimension
|
||||
dataQueries = append(dataQueries, &cloudwatch.MetricDataQuery{
|
||||
Id: aws.String("minimum_" + id),
|
||||
Label: aws.String(snakeCase(*metric.MetricName + "_minimum")),
|
||||
MetricStat: &cloudwatch.MetricStat{
|
||||
Metric: metric,
|
||||
Period: aws.Int64(int64(c.Period.Duration.Seconds())),
|
||||
Stat: aws.String(cloudwatch.StatisticMinimum),
|
||||
},
|
||||
})
|
||||
}
|
||||
if filtered.statFilter.Match("sum") {
|
||||
c.queryDimensions["sum_"+id] = dimension
|
||||
dataQueries = append(dataQueries, &cloudwatch.MetricDataQuery{
|
||||
Id: aws.String("sum_" + id),
|
||||
Label: aws.String(snakeCase(*metric.MetricName + "_sum")),
|
||||
MetricStat: &cloudwatch.MetricStat{
|
||||
Metric: metric,
|
||||
Period: aws.Int64(int64(c.Period.Duration.Seconds())),
|
||||
Stat: aws.String(cloudwatch.StatisticSum),
|
||||
},
|
||||
})
|
||||
}
|
||||
if filtered.statFilter.Match("sample_count") {
|
||||
c.queryDimensions["sample_count_"+id] = dimension
|
||||
dataQueries = append(dataQueries, &cloudwatch.MetricDataQuery{
|
||||
Id: aws.String("sample_count_" + id),
|
||||
Label: aws.String(snakeCase(*metric.MetricName + "_sample_count")),
|
||||
MetricStat: &cloudwatch.MetricStat{
|
||||
Metric: metric,
|
||||
Period: aws.Int64(int64(c.Period.Duration.Seconds())),
|
||||
Stat: aws.String(cloudwatch.StatisticSampleCount),
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(dataQueries) == 0 {
|
||||
return nil, errors.New("no metrics found to collect")
|
||||
}
|
||||
|
||||
if c.metricCache == nil {
|
||||
c.metricCache = &metricCache{
|
||||
queries: dataQueries,
|
||||
built: time.Now(),
|
||||
ttl: c.CacheTTL.Duration,
|
||||
}
|
||||
} else {
|
||||
c.metricCache.queries = dataQueries
|
||||
}
|
||||
|
||||
return dataQueries, nil
|
||||
}
|
||||
|
||||
// gatherMetrics gets metric data from Cloudwatch.
|
||||
func (c *CloudWatch) gatherMetrics(
|
||||
params *cloudwatch.GetMetricDataInput,
|
||||
) ([]*cloudwatch.MetricDataResult, error) {
|
||||
results := []*cloudwatch.MetricDataResult{}
|
||||
|
||||
for {
|
||||
resp, err := c.client.GetMetricData(params)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get metric data: %v", err)
|
||||
}
|
||||
|
||||
results = append(results, resp.MetricDataResults...)
|
||||
if resp.NextToken == nil {
|
||||
break
|
||||
}
|
||||
params.NextToken = resp.NextToken
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func (c *CloudWatch) aggregateMetrics(
|
||||
acc telegraf.Accumulator,
|
||||
metricDataResults []*cloudwatch.MetricDataResult,
|
||||
) error {
|
||||
var (
|
||||
grouper = metric.NewSeriesGrouper()
|
||||
namespace = sanitizeMeasurement(c.Namespace)
|
||||
)
|
||||
|
||||
for _, result := range metricDataResults {
|
||||
tags := map[string]string{}
|
||||
|
||||
if dimensions, ok := c.queryDimensions[*result.Id]; ok {
|
||||
tags = *dimensions
|
||||
}
|
||||
tags["region"] = c.Region
|
||||
|
||||
for i := range result.Values {
|
||||
grouper.Add(namespace, tags, *result.Timestamps[i], *result.Label, *result.Values[i])
|
||||
}
|
||||
}
|
||||
|
||||
for _, metric := range grouper.Metrics() {
|
||||
acc.AddMetric(metric)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
inputs.Add("cloudwatch", func() telegraf.Input {
|
||||
ttl, _ := time.ParseDuration("1hr")
|
||||
return &CloudWatch{
|
||||
CacheTTL: internal.Duration{Duration: ttl},
|
||||
RateLimit: 200,
|
||||
CacheTTL: internal.Duration{Duration: time.Hour},
|
||||
RateLimit: 25,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize CloudWatch client
|
||||
*/
|
||||
func (c *CloudWatch) initializeCloudWatch() error {
|
||||
credentialConfig := &internalaws.CredentialConfig{
|
||||
Region: c.Region,
|
||||
AccessKey: c.AccessKey,
|
||||
SecretKey: c.SecretKey,
|
||||
RoleARN: c.RoleARN,
|
||||
Profile: c.Profile,
|
||||
Filename: c.Filename,
|
||||
Token: c.Token,
|
||||
EndpointURL: c.EndpointURL,
|
||||
}
|
||||
configProvider := credentialConfig.Credentials()
|
||||
|
||||
c.client = cloudwatch.New(configProvider)
|
||||
return nil
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch available metrics for given CloudWatch Namespace
|
||||
*/
|
||||
func (c *CloudWatch) fetchNamespaceMetrics() ([]*cloudwatch.Metric, error) {
|
||||
if c.metricCache != nil && c.metricCache.IsValid() {
|
||||
return c.metricCache.Metrics, nil
|
||||
}
|
||||
|
||||
metrics := []*cloudwatch.Metric{}
|
||||
|
||||
var token *string
|
||||
for more := true; more; {
|
||||
params := &cloudwatch.ListMetricsInput{
|
||||
Namespace: aws.String(c.Namespace),
|
||||
Dimensions: []*cloudwatch.DimensionFilter{},
|
||||
NextToken: token,
|
||||
MetricName: nil,
|
||||
}
|
||||
|
||||
resp, err := c.client.ListMetrics(params)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
metrics = append(metrics, resp.Metrics...)
|
||||
|
||||
token = resp.NextToken
|
||||
more = token != nil
|
||||
}
|
||||
|
||||
c.metricCache = &MetricCache{
|
||||
Metrics: metrics,
|
||||
Fetched: time.Now(),
|
||||
TTL: c.CacheTTL.Duration,
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
/*
|
||||
* Gather given Metric and emit any error
|
||||
*/
|
||||
func (c *CloudWatch) gatherMetric(
|
||||
acc telegraf.Accumulator,
|
||||
metric *cloudwatch.Metric,
|
||||
) error {
|
||||
params := c.getStatisticsInput(metric)
|
||||
resp, err := c.client.GetMetricStatistics(params)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, point := range resp.Datapoints {
|
||||
tags := map[string]string{
|
||||
"region": c.Region,
|
||||
"unit": snakeCase(*point.Unit),
|
||||
}
|
||||
|
||||
for _, d := range metric.Dimensions {
|
||||
tags[snakeCase(*d.Name)] = *d.Value
|
||||
}
|
||||
|
||||
// record field for each statistic
|
||||
fields := map[string]interface{}{}
|
||||
|
||||
if point.Average != nil {
|
||||
fields[formatField(*metric.MetricName, cloudwatch.StatisticAverage)] = *point.Average
|
||||
}
|
||||
if point.Maximum != nil {
|
||||
fields[formatField(*metric.MetricName, cloudwatch.StatisticMaximum)] = *point.Maximum
|
||||
}
|
||||
if point.Minimum != nil {
|
||||
fields[formatField(*metric.MetricName, cloudwatch.StatisticMinimum)] = *point.Minimum
|
||||
}
|
||||
if point.SampleCount != nil {
|
||||
fields[formatField(*metric.MetricName, cloudwatch.StatisticSampleCount)] = *point.SampleCount
|
||||
}
|
||||
if point.Sum != nil {
|
||||
fields[formatField(*metric.MetricName, cloudwatch.StatisticSum)] = *point.Sum
|
||||
}
|
||||
|
||||
acc.AddFields(formatMeasurement(c.Namespace), fields, tags, *point.Timestamp)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
/*
|
||||
* Formatting helpers
|
||||
*/
|
||||
func formatField(metricName string, statistic string) string {
|
||||
return fmt.Sprintf("%s_%s", snakeCase(metricName), snakeCase(statistic))
|
||||
}
|
||||
|
||||
func formatMeasurement(namespace string) string {
|
||||
func sanitizeMeasurement(namespace string) string {
|
||||
namespace = strings.Replace(namespace, "/", "_", -1)
|
||||
namespace = snakeCase(namespace)
|
||||
return fmt.Sprintf("cloudwatch_%s", namespace)
|
||||
return "cloudwatch_" + namespace
|
||||
}
|
||||
|
||||
func snakeCase(s string) string {
|
||||
s = internal.SnakeCase(s)
|
||||
s = strings.Replace(s, " ", "_", -1)
|
||||
s = strings.Replace(s, "__", "_", -1)
|
||||
return s
|
||||
}
|
||||
|
||||
/*
|
||||
* Map Metric to *cloudwatch.GetMetricStatisticsInput for given timeframe
|
||||
*/
|
||||
func (c *CloudWatch) getStatisticsInput(metric *cloudwatch.Metric) *cloudwatch.GetMetricStatisticsInput {
|
||||
input := &cloudwatch.GetMetricStatisticsInput{
|
||||
StartTime: aws.Time(c.windowStart),
|
||||
EndTime: aws.Time(c.windowEnd),
|
||||
MetricName: metric.MetricName,
|
||||
Namespace: metric.Namespace,
|
||||
Period: aws.Int64(int64(c.Period.Duration.Seconds())),
|
||||
Dimensions: metric.Dimensions,
|
||||
Statistics: []*string{
|
||||
aws.String(cloudwatch.StatisticAverage),
|
||||
aws.String(cloudwatch.StatisticMaximum),
|
||||
aws.String(cloudwatch.StatisticMinimum),
|
||||
aws.String(cloudwatch.StatisticSum),
|
||||
aws.String(cloudwatch.StatisticSampleCount)},
|
||||
}
|
||||
return input
|
||||
type dimension struct {
|
||||
name string
|
||||
value string
|
||||
}
|
||||
|
||||
/*
|
||||
* Check Metric Cache validity
|
||||
*/
|
||||
func (c *MetricCache) IsValid() bool {
|
||||
return c.Metrics != nil && time.Since(c.Fetched) < c.TTL
|
||||
// ctod converts cloudwatch dimensions to regular dimensions.
|
||||
func ctod(cDimensions []*cloudwatch.Dimension) *map[string]string {
|
||||
dimensions := map[string]string{}
|
||||
for i := range cDimensions {
|
||||
dimensions[snakeCase(*cDimensions[i].Name)] = *cDimensions[i].Value
|
||||
}
|
||||
return &dimensions
|
||||
}
|
||||
|
||||
func (c *CloudWatch) getDataInputs(dataQueries []*cloudwatch.MetricDataQuery) *cloudwatch.GetMetricDataInput {
|
||||
return &cloudwatch.GetMetricDataInput{
|
||||
StartTime: aws.Time(c.windowStart),
|
||||
EndTime: aws.Time(c.windowEnd),
|
||||
MetricDataQueries: dataQueries,
|
||||
}
|
||||
}
|
||||
|
||||
// isValid checks the validity of the metric cache.
|
||||
func (f *metricCache) isValid() bool {
|
||||
return f.metrics != nil && time.Since(f.built) < f.ttl
|
||||
}
|
||||
|
||||
func hasWilcard(dimensions []*Dimension) bool {
|
||||
|
|
|
@ -6,15 +6,20 @@ import (
|
|||
|
||||
"github.com/aws/aws-sdk-go/aws"
|
||||
"github.com/aws/aws-sdk-go/service/cloudwatch"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/influxdata/telegraf/filter"
|
||||
"github.com/influxdata/telegraf/internal"
|
||||
"github.com/influxdata/telegraf/testutil"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
type mockGatherCloudWatchClient struct{}
|
||||
|
||||
func (m *mockGatherCloudWatchClient) ListMetrics(params *cloudwatch.ListMetricsInput) (*cloudwatch.ListMetricsOutput, error) {
|
||||
metric := &cloudwatch.Metric{
|
||||
return &cloudwatch.ListMetricsOutput{
|
||||
Metrics: []*cloudwatch.Metric{
|
||||
{
|
||||
Namespace: params.Namespace,
|
||||
MetricName: aws.String("Latency"),
|
||||
Dimensions: []*cloudwatch.Dimension{
|
||||
|
@ -23,29 +28,76 @@ func (m *mockGatherCloudWatchClient) ListMetrics(params *cloudwatch.ListMetricsI
|
|||
Value: aws.String("p-example"),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
result := &cloudwatch.ListMetricsOutput{
|
||||
Metrics: []*cloudwatch.Metric{metric},
|
||||
}
|
||||
return result, nil
|
||||
func (m *mockGatherCloudWatchClient) GetMetricData(params *cloudwatch.GetMetricDataInput) (*cloudwatch.GetMetricDataOutput, error) {
|
||||
return &cloudwatch.GetMetricDataOutput{
|
||||
MetricDataResults: []*cloudwatch.MetricDataResult{
|
||||
{
|
||||
Id: aws.String("minimum_0_0"),
|
||||
Label: aws.String("latency_minimum"),
|
||||
StatusCode: aws.String("completed"),
|
||||
Timestamps: []*time.Time{
|
||||
params.EndTime,
|
||||
},
|
||||
Values: []*float64{
|
||||
aws.Float64(0.1),
|
||||
},
|
||||
},
|
||||
{
|
||||
Id: aws.String("maximum_0_0"),
|
||||
Label: aws.String("latency_maximum"),
|
||||
StatusCode: aws.String("completed"),
|
||||
Timestamps: []*time.Time{
|
||||
params.EndTime,
|
||||
},
|
||||
Values: []*float64{
|
||||
aws.Float64(0.3),
|
||||
},
|
||||
},
|
||||
{
|
||||
Id: aws.String("average_0_0"),
|
||||
Label: aws.String("latency_average"),
|
||||
StatusCode: aws.String("completed"),
|
||||
Timestamps: []*time.Time{
|
||||
params.EndTime,
|
||||
},
|
||||
Values: []*float64{
|
||||
aws.Float64(0.2),
|
||||
},
|
||||
},
|
||||
{
|
||||
Id: aws.String("sum_0_0"),
|
||||
Label: aws.String("latency_sum"),
|
||||
StatusCode: aws.String("completed"),
|
||||
Timestamps: []*time.Time{
|
||||
params.EndTime,
|
||||
},
|
||||
Values: []*float64{
|
||||
aws.Float64(123),
|
||||
},
|
||||
},
|
||||
{
|
||||
Id: aws.String("sample_count_0_0"),
|
||||
Label: aws.String("latency_sample_count"),
|
||||
StatusCode: aws.String("completed"),
|
||||
Timestamps: []*time.Time{
|
||||
params.EndTime,
|
||||
},
|
||||
Values: []*float64{
|
||||
aws.Float64(100),
|
||||
},
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (m *mockGatherCloudWatchClient) GetMetricStatistics(params *cloudwatch.GetMetricStatisticsInput) (*cloudwatch.GetMetricStatisticsOutput, error) {
|
||||
dataPoint := &cloudwatch.Datapoint{
|
||||
Timestamp: params.EndTime,
|
||||
Minimum: aws.Float64(0.1),
|
||||
Maximum: aws.Float64(0.3),
|
||||
Average: aws.Float64(0.2),
|
||||
Sum: aws.Float64(123),
|
||||
SampleCount: aws.Float64(100),
|
||||
Unit: aws.String("Seconds"),
|
||||
}
|
||||
result := &cloudwatch.GetMetricStatisticsOutput{
|
||||
Label: aws.String("Latency"),
|
||||
Datapoints: []*cloudwatch.Datapoint{dataPoint},
|
||||
}
|
||||
return result, nil
|
||||
func TestSnakeCase(t *testing.T) {
|
||||
assert.Equal(t, "cluster_name", snakeCase("Cluster Name"))
|
||||
assert.Equal(t, "broker_id", snakeCase("Broker ID"))
|
||||
}
|
||||
|
||||
func TestGather(t *testing.T) {
|
||||
|
@ -64,7 +116,7 @@ func TestGather(t *testing.T) {
|
|||
var acc testutil.Accumulator
|
||||
c.client = &mockGatherCloudWatchClient{}
|
||||
|
||||
acc.GatherError(c.Gather)
|
||||
assert.NoError(t, acc.GatherError(c.Gather))
|
||||
|
||||
fields := map[string]interface{}{}
|
||||
fields["latency_minimum"] = 0.1
|
||||
|
@ -74,13 +126,11 @@ func TestGather(t *testing.T) {
|
|||
fields["latency_sample_count"] = 100.0
|
||||
|
||||
tags := map[string]string{}
|
||||
tags["unit"] = "seconds"
|
||||
tags["region"] = "us-east-1"
|
||||
tags["load_balancer_name"] = "p-example"
|
||||
|
||||
assert.True(t, acc.HasMeasurement("cloudwatch_aws_elb"))
|
||||
acc.AssertContainsTaggedFields(t, "cloudwatch_aws_elb", fields, tags)
|
||||
|
||||
}
|
||||
|
||||
type mockSelectMetricsCloudWatchClient struct{}
|
||||
|
@ -132,7 +182,7 @@ func (m *mockSelectMetricsCloudWatchClient) ListMetrics(params *cloudwatch.ListM
|
|||
return result, nil
|
||||
}
|
||||
|
||||
func (m *mockSelectMetricsCloudWatchClient) GetMetricStatistics(params *cloudwatch.GetMetricStatisticsInput) (*cloudwatch.GetMetricStatisticsOutput, error) {
|
||||
func (m *mockSelectMetricsCloudWatchClient) GetMetricData(params *cloudwatch.GetMetricDataInput) (*cloudwatch.GetMetricDataOutput, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
|
@ -164,10 +214,10 @@ func TestSelectMetrics(t *testing.T) {
|
|||
},
|
||||
}
|
||||
c.client = &mockSelectMetricsCloudWatchClient{}
|
||||
metrics, err := SelectMetrics(c)
|
||||
filtered, err := getFilteredMetrics(c)
|
||||
// We've asked for 2 (out of 4) metrics, over all 3 load balancers in all 2
|
||||
// AZs. We should get 12 metrics.
|
||||
assert.Equal(t, 12, len(metrics))
|
||||
assert.Equal(t, 12, len(filtered[0].metrics))
|
||||
assert.Nil(t, err)
|
||||
}
|
||||
|
||||
|
@ -199,25 +249,66 @@ func TestGenerateStatisticsInputParams(t *testing.T) {
|
|||
|
||||
c.updateWindow(now)
|
||||
|
||||
params := c.getStatisticsInput(m)
|
||||
statFilter, _ := filter.NewIncludeExcludeFilter(nil, nil)
|
||||
queries, _ := c.getDataQueries([]filteredMetric{{metrics: []*cloudwatch.Metric{m}, statFilter: statFilter}})
|
||||
params := c.getDataInputs(queries)
|
||||
|
||||
assert.EqualValues(t, *params.EndTime, now.Add(-c.Delay.Duration))
|
||||
assert.EqualValues(t, *params.StartTime, now.Add(-c.Period.Duration).Add(-c.Delay.Duration))
|
||||
assert.Len(t, params.Dimensions, 1)
|
||||
assert.Len(t, params.Statistics, 5)
|
||||
assert.EqualValues(t, *params.Period, 60)
|
||||
require.Len(t, params.MetricDataQueries, 5)
|
||||
assert.Len(t, params.MetricDataQueries[0].MetricStat.Metric.Dimensions, 1)
|
||||
assert.EqualValues(t, *params.MetricDataQueries[0].MetricStat.Period, 60)
|
||||
}
|
||||
|
||||
func TestGenerateStatisticsInputParamsFiltered(t *testing.T) {
|
||||
d := &cloudwatch.Dimension{
|
||||
Name: aws.String("LoadBalancerName"),
|
||||
Value: aws.String("p-example"),
|
||||
}
|
||||
|
||||
m := &cloudwatch.Metric{
|
||||
MetricName: aws.String("Latency"),
|
||||
Dimensions: []*cloudwatch.Dimension{d},
|
||||
}
|
||||
|
||||
duration, _ := time.ParseDuration("1m")
|
||||
internalDuration := internal.Duration{
|
||||
Duration: duration,
|
||||
}
|
||||
|
||||
c := &CloudWatch{
|
||||
Namespace: "AWS/ELB",
|
||||
Delay: internalDuration,
|
||||
Period: internalDuration,
|
||||
}
|
||||
|
||||
c.initializeCloudWatch()
|
||||
|
||||
now := time.Now()
|
||||
|
||||
c.updateWindow(now)
|
||||
|
||||
statFilter, _ := filter.NewIncludeExcludeFilter([]string{"average", "sample_count"}, nil)
|
||||
queries, _ := c.getDataQueries([]filteredMetric{{metrics: []*cloudwatch.Metric{m}, statFilter: statFilter}})
|
||||
params := c.getDataInputs(queries)
|
||||
|
||||
assert.EqualValues(t, *params.EndTime, now.Add(-c.Delay.Duration))
|
||||
assert.EqualValues(t, *params.StartTime, now.Add(-c.Period.Duration).Add(-c.Delay.Duration))
|
||||
require.Len(t, params.MetricDataQueries, 2)
|
||||
assert.Len(t, params.MetricDataQueries[0].MetricStat.Metric.Dimensions, 1)
|
||||
assert.EqualValues(t, *params.MetricDataQueries[0].MetricStat.Period, 60)
|
||||
}
|
||||
|
||||
func TestMetricsCacheTimeout(t *testing.T) {
|
||||
cache := &MetricCache{
|
||||
Metrics: []*cloudwatch.Metric{},
|
||||
Fetched: time.Now(),
|
||||
TTL: time.Minute,
|
||||
cache := &metricCache{
|
||||
metrics: []filteredMetric{},
|
||||
built: time.Now(),
|
||||
ttl: time.Minute,
|
||||
}
|
||||
|
||||
assert.True(t, cache.IsValid())
|
||||
cache.Fetched = time.Now().Add(-time.Minute)
|
||||
assert.False(t, cache.IsValid())
|
||||
assert.True(t, cache.isValid())
|
||||
cache.built = time.Now().Add(-time.Minute)
|
||||
assert.False(t, cache.isValid())
|
||||
}
|
||||
|
||||
func TestUpdateWindow(t *testing.T) {
|
||||
|
|
Loading…
Reference in New Issue