Output: Azure Monitor: Initial aggregated metric implementation

This commit is contained in:
Gunnar Aasen 2018-04-11 16:50:48 -07:00
parent 02d86b1b6e
commit 5c4c3a1ca9
7 changed files with 307 additions and 244 deletions

3
.gitignore vendored
View File

@ -2,6 +2,3 @@
/telegraf
/telegraf.exe
/telegraf.gz
*~
*#
*.tar.gz

4
Godeps
View File

@ -1,4 +1,7 @@
collectd.org 2ce144541b8903101fb8f1483cc0497a68798122
github.com/Azure/go-autorest 9ad9326b278af8fa5cc67c30c0ce9a58cc0862b2
github.com/Shopify/sarama 3b1b38866a79f06deddf0487d5c27ba0697ccd65
github.com/Sirupsen/logrus 61e43dc76f7ee59a82bdf3d71033dc12bea4c77d
github.com/aerospike/aerospike-client-go 95e1ad7791bdbca44707fedbb29be42024900d9c
github.com/amir/raidman c74861fe6a7bb8ede0a010ce4485bdbb4fc4c985
github.com/apache/thrift 4aaa92ece8503a6da9bc6701604f69acf2b99d07
@ -18,6 +21,7 @@ github.com/eapache/go-xerial-snappy bb955e01b9346ac19dc29eb16586c90ded99a98c
github.com/eapache/queue 44cc805cf13205b55f69e14bcb69867d1ae92f98
github.com/eclipse/paho.mqtt.golang aff15770515e3c57fc6109da73d42b0d46f7f483
github.com/go-logfmt/logfmt 390ab7935ee28ec6b286364bba9b4dd6410cb3d5
github.com/go-redis/redis 73b70592cdaa9e6abdfcfbf97b4a90d80728c836
github.com/go-sql-driver/mysql 2e00b5cd70399450106cec6431c2e2ce3cae5034
github.com/gobwas/glob bea32b9cd2d6f55753d94a28e959b13f0244797a
github.com/go-ini/ini 9144852efba7c4daf409943ee90767da62d55438

View File

@ -1,13 +0,0 @@
#!/bin/bash
TELEGRAF_VERSION=1.5.3-azmon
CONTAINER_URI=https://masdiagstore.blob.core.windows.net/share/
make
tar -cvfz telegraf-${TELEGRAF_VERSION}-l_amd64.tar.gz ./telegraf
azcopy \
--source ./telegraf-${TELEGRAF_VERSION}-l_amd64.tar.gz \
--destination $CONTAINER_URL/telegraf-${TELEGRAF_VERSION}-l_amd64.tar.gz \
--dest-key $AZURE_STORAGE_KEY

View File

@ -3,6 +3,7 @@ package all
import (
_ "github.com/influxdata/telegraf/plugins/outputs/amon"
_ "github.com/influxdata/telegraf/plugins/outputs/amqp"
_ "github.com/influxdata/telegraf/plugins/outputs/azuremonitor"
_ "github.com/influxdata/telegraf/plugins/outputs/cloudwatch"
_ "github.com/influxdata/telegraf/plugins/outputs/cratedb"
_ "github.com/influxdata/telegraf/plugins/outputs/datadog"

View File

@ -13,8 +13,8 @@ import (
"github.com/Azure/go-autorest/autorest/adal"
"github.com/Azure/go-autorest/autorest/azure"
"github.com/davecgh/go-spew/spew"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/outputs"
)
// AzureMonitor allows publishing of metrics to the Azure Monitor custom metrics service
@ -37,6 +37,41 @@ type AzureMonitor struct {
oauthConfig *adal.OAuthConfig
adalToken adal.OAuthTokenProvider
client *http.Client
cache map[uint64]azureMonitorMetric
period time.Duration
delay time.Duration
periodStart time.Time
periodEnd time.Time
metrics chan telegraf.Metric
shutdown chan struct{}
}
type azureMonitorMetric struct {
Time time.Time `json:"time"`
Data *azureMonitorData `json:"data"`
}
type azureMonitorData struct {
BaseData *azureMonitorBaseData `json:"baseData"`
}
type azureMonitorBaseData struct {
Metric string `json:"metric"`
Namespace string `json:"namespace"`
DimensionNames []string `json:"dimNames"`
Series []*azureMonitorSeries `json:"series"`
}
type azureMonitorSeries struct {
DimensionValues []string `json:"dimValues"`
Min float64 `json:"min"`
Max float64 `json:"max"`
Sum float64 `json:"sum"`
Count float64 `json:"count"`
}
var sampleConfig = `
@ -51,6 +86,8 @@ region = "useast"
httpPostTimeout = 15
## Whether or not to use managed service identity (defaults to true).
useManagedServiceIdentity = true
## Leave this section blank to use Managed Service Identity.
## TODO
azureSubscription = "TODO"
## TODO
@ -65,245 +102,311 @@ const (
azureMonitorDefaultRegion = "eastus"
)
// Description provides a description of the plugin
func (s *AzureMonitor) Description() string {
return "Configuration for Azure Monitor to send metrics to"
}
// SampleConfig provides a sample configuration for the plugin
func (s *AzureMonitor) SampleConfig() string {
return sampleConfig
}
// Connect initializes the plugin and validates connectivity
func (s *AzureMonitor) Connect() error {
func (a *AzureMonitor) Connect() error {
// Set defaults
// If no direct AD values provided, fall back to MSI
if s.AzureSubscriptionID == "" && s.AzureTenantID == "" && s.AzureClientID == "" && s.AzureClientSecret == "" {
s.useMsi = true
} else if s.AzureSubscriptionID == "" || s.AzureTenantID == "" || s.AzureClientID == "" || s.AzureClientSecret == "" {
if a.AzureSubscriptionID == "" && a.AzureTenantID == "" && a.AzureClientID == "" && a.AzureClientSecret == "" {
a.useMsi = true
} else if a.AzureSubscriptionID == "" || a.AzureTenantID == "" || a.AzureClientID == "" || a.AzureClientSecret == "" {
return fmt.Errorf("Must provide values for azureSubscription, azureTenant, azureClient and azureClientSecret, or leave all blank to default to MSI")
}
if s.useMsi == false {
if a.useMsi == false {
// If using direct AD authentication create the AD access client
oauthConfig, err := adal.NewOAuthConfig(azure.PublicCloud.ActiveDirectoryEndpoint, s.AzureTenantID)
oauthConfig, err := adal.NewOAuthConfig(azure.PublicCloud.ActiveDirectoryEndpoint, a.AzureTenantID)
if err != nil {
return fmt.Errorf("Could not initialize AD client: %s", err)
}
s.oauthConfig = oauthConfig
a.oauthConfig = oauthConfig
}
if s.HTTPPostTimeout == 0 {
s.HTTPPostTimeout = 10
if a.HTTPPostTimeout == 0 {
a.HTTPPostTimeout = 10
}
s.metadataService = &AzureInstanceMetadata{}
a.metadataService = &AzureInstanceMetadata{}
// For the metrics API the MSI resource has to be https://ingestion.monitor.azure.com
s.msiResource = "https://ingestion.monitor.azure.com/"
a.msiResource = "https://monitoring.azure.com/"
// Validate the resource identifier
if s.ResourceID == "" {
metadata, err := s.metadataService.GetInstanceMetadata()
if a.ResourceID == "" {
metadata, err := a.metadataService.GetInstanceMetadata()
if err != nil {
return fmt.Errorf("No resource id specified, and Azure Instance metadata service not available. If not running on an Azure VM, provide a value for resourceId")
}
s.ResourceID = metadata.AzureResourceID
a.ResourceID = metadata.AzureResourceID
if s.Region == "" {
s.Region = metadata.Compute.Location
if a.Region == "" {
a.Region = metadata.Compute.Location
}
}
if s.Region == "" {
s.Region = azureMonitorDefaultRegion
if a.Region == "" {
a.Region = azureMonitorDefaultRegion
}
// Validate credentials
err := s.validateCredentials()
err := a.validateCredentials()
if err != nil {
return err
}
a.reset()
go a.run()
return nil
}
// Description provides a description of the plugin
func (a *AzureMonitor) Description() string {
return "Configuration for sending aggregate metrics to Azure Monitor"
}
// SampleConfig provides a sample configuration for the plugin
func (a *AzureMonitor) SampleConfig() string {
return sampleConfig
}
// Close shuts down an any active connections
func (s *AzureMonitor) Close() error {
func (a *AzureMonitor) Close() error {
// Close connection to the URL here
close(a.shutdown)
return nil
}
// Write writes metrics to the remote endpoint
func (s *AzureMonitor) Write(metrics []telegraf.Metric) error {
// Flatten metrics into an Azure Monitor common schema compatible format
metricsList, err := s.flattenMetrics(metrics)
if err != nil {
log.Printf("Error translating metrics %s", err)
return err
}
func (a *AzureMonitor) Write(metrics []telegraf.Metric) error {
log.Printf("metrics collected: %+v", metrics)
for _, v := range metricsList {
jsonBytes, err := json.Marshal(&v)
_, err = s.postData(&jsonBytes)
if err != nil {
log.Printf("Error publishing metrics %s", err)
return err
// Assemble stats on incoming metrics
for _, metric := range metrics {
select {
case a.metrics <- metric:
default:
log.Printf("metrics buffer is full")
}
}
return nil
}
func (s *AzureMonitor) validateCredentials() error {
func (a *AzureMonitor) validateCredentials() error {
// Use managed service identity
if s.useMsi {
if a.useMsi {
// Check expiry on the token
if s.msiToken != nil {
expiryDuration := s.msiToken.ExpiresInDuration()
if expiryDuration > s.expiryWatermark {
if a.msiToken != nil {
expiryDuration := a.msiToken.ExpiresInDuration()
if expiryDuration > a.expiryWatermark {
return nil
}
// Token is about to expire
log.Printf("Bearer token expiring in %s; acquiring new token\n", expiryDuration.String())
s.msiToken = nil
a.msiToken = nil
}
// No token, acquire an MSI token
if s.msiToken == nil {
msiToken, err := s.metadataService.GetMsiToken(s.AzureClientID, s.msiResource)
if a.msiToken == nil {
msiToken, err := a.metadataService.GetMsiToken(a.AzureClientID, a.msiResource)
if err != nil {
return err
}
log.Printf("Bearer token acquired; expiring in %s\n", msiToken.ExpiresInDuration().String())
s.msiToken = msiToken
s.bearerToken = msiToken.AccessToken
a.msiToken = msiToken
a.bearerToken = msiToken.AccessToken
}
// Otherwise directory acquire a token
} else {
adToken, err := adal.NewServicePrincipalToken(
*(s.oauthConfig), s.AzureClientID, s.AzureClientSecret,
*(a.oauthConfig), a.AzureClientID, a.AzureClientSecret,
azure.PublicCloud.ActiveDirectoryEndpoint)
if err != nil {
return fmt.Errorf("Could not acquire ADAL token: %s", err)
}
s.adalToken = adToken
a.adalToken = adToken
}
return nil
}
type azureMonitorMetric struct {
Time time.Time `json:"time"`
Data azureMonitorData `json:"data"`
}
type azureMonitorData struct {
BaseData azureMonitorBaseData `json:"baseData"`
}
type azureMonitorBaseData struct {
Metric string `json:"metric"`
Namespace string `json:"namespace"`
DimensionNames []string `json:"dimNames"`
Series []azureMonitorSeries `json:"series"`
}
type azureMonitorSeries struct {
DimensionValues []string `json:"dimValues"`
Min int64 `json:"min"`
Max int64 `json:"max"`
Sum int64 `json:"sum"`
Count int64 `json:"count"`
}
func (s *AzureMonitor) flattenMetrics(metrics []telegraf.Metric) ([]azureMonitorMetric, error) {
var azureMetrics []azureMonitorMetric
for _, metric := range metrics {
// Get the list of custom dimensions (elevated tags and fields)
func (a *AzureMonitor) add(metric telegraf.Metric) {
id := metric.HashID()
if azm, ok := a.cache[id]; !ok {
// hit an uncached metric, create caches for first time:
var dimensionNames []string
var dimensionValues []string
for name, value := range metric.Fields() {
dimensionNames = append(dimensionNames, name)
dimensionValues = append(dimensionValues, s.formatField(value))
for i, tag := range metric.TagList() {
// Azure custom metrics service supports up to 10 dimensions
if i > 9 {
continue
}
dimensionNames = append(dimensionNames, tag.Key)
dimensionValues = append(dimensionValues, tag.Value)
}
// Field keys are stored as the last dimension
dimensionNames = append(dimensionNames, "field")
var seriesList []*azureMonitorSeries
// Store each field as a separate series with field key as a new dimension
for _, field := range metric.FieldList() {
azmseries := newAzureMonitorSeries(field, dimensionValues)
seriesList = append(seriesList, azmseries)
}
series := azureMonitorSeries{
DimensionValues: dimensionValues,
if len(seriesList) < 1 {
log.Printf("no valid fields for metric: %s", metric)
return
}
if v, ok := metric.Fields()["min"]; ok {
series.Min = s.formatInt(v)
}
if v, ok := metric.Fields()["max"]; ok {
series.Max = s.formatInt(v)
}
if v, ok := metric.Fields()["sum"]; ok {
series.Sum = s.formatInt(v)
}
if v, ok := metric.Fields()["count"]; ok {
series.Count = s.formatInt(v)
} else {
// Azure Monitor requires count >= 1
series.Count = 1
}
azureMetric := azureMonitorMetric{
a.cache[id] = azureMonitorMetric{
Time: metric.Time(),
Data: azureMonitorData{
BaseData: azureMonitorBaseData{
Data: &azureMonitorData{
BaseData: &azureMonitorBaseData{
Metric: metric.Name(),
Namespace: "default",
DimensionNames: dimensionNames,
Series: []azureMonitorSeries{series},
Series: seriesList,
},
},
}
} else {
for _, f := range metric.FieldList() {
fv, ok := convert(f.Value)
if !ok {
continue
}
azureMetrics = append(azureMetrics, azureMetric)
tmp, ok := azm.findSeriesWithField(f.Key)
if !ok {
// hit an uncached field of a cached metric
var dimensionValues []string
for i, tag := range metric.TagList() {
// Azure custom metrics service supports up to 10 dimensions
if i > 9 {
continue
}
dimensionValues = append(dimensionValues, tag.Value)
}
azm.Data.BaseData.Series = append(azm.Data.BaseData.Series, newAzureMonitorSeries(f, dimensionValues))
continue
}
//counter compute
n := tmp.Count + 1
tmp.Count = n
//max/min compute
if fv < tmp.Min {
tmp.Min = fv
} else if fv > tmp.Max {
tmp.Max = fv
}
//sum compute
tmp.Sum += fv
//store final data
a.cache[id].Data.BaseData.Series = append(a.cache[id].Data.BaseData.Series, tmp)
}
}
return azureMetrics, nil
}
func (s *AzureMonitor) formatInt(value interface{}) int64 {
return 0
func (b *azureMonitorMetric) findSeriesWithField(f string) (*azureMonitorSeries, bool) {
if len(b.Data.BaseData.Series) > 0 {
for _, s := range b.Data.BaseData.Series {
if f == s.DimensionValues[len(s.DimensionValues)-1] {
return s, true
}
}
}
return nil, false
}
func (s *AzureMonitor) formatField(value interface{}) string {
var ret string
func newAzureMonitorSeries(f *telegraf.Field, dv []string) *azureMonitorSeries {
fv, ok := convert(f.Value)
if !ok {
log.Printf("unable to convert field %s (type %T) to float type: %v", f.Key, fv, fv)
return nil
}
return &azureMonitorSeries{
DimensionValues: append(append([]string{}, dv...), f.Key),
Min: fv,
Max: fv,
Sum: fv,
Count: 1,
}
}
switch v := value.(type) {
func (a *AzureMonitor) reset() {
a.cache = make(map[uint64]azureMonitorMetric)
}
func convert(in interface{}) (float64, bool) {
switch v := in.(type) {
case int:
ret = strconv.FormatInt(int64(value.(int)), 10)
return float64(v), true
case int8:
ret = strconv.FormatInt(int64(value.(int8)), 10)
return float64(v), true
case int16:
ret = strconv.FormatInt(int64(value.(int16)), 10)
return float64(v), true
case int32:
ret = strconv.FormatInt(int64(value.(int32)), 10)
return float64(v), true
case int64:
ret = strconv.FormatInt(value.(int64), 10)
return float64(v), true
case uint:
return float64(v), true
case uint8:
return float64(v), true
case uint16:
return float64(v), true
case uint32:
return float64(v), true
case uint64:
return float64(v), true
case float32:
ret = strconv.FormatFloat(float64(value.(float32)), 'f', -1, 64)
return float64(v), true
case float64:
ret = strconv.FormatFloat(value.(float64), 'f', -1, 64)
return v, true
case string:
f, err := strconv.ParseFloat(v, 64)
if err != nil {
log.Printf("converted string: %s to %v", v, f)
return 0, false
}
return f, true
default:
spew.Printf("field is of unsupported value type %v\n", v)
log.Printf("did not convert %T: %s", v, v)
return 0, false
}
return ret
}
func (s *AzureMonitor) postData(msg *[]byte) (*http.Request, error) {
func (a *AzureMonitor) push() {
var body []byte
for _, metric := range a.cache {
jsonBytes, err := json.Marshal(&metric)
log.Printf("marshalled point %s", jsonBytes)
if err != nil {
log.Printf("Error marshalling metrics %s", err)
return
}
body = append(body, jsonBytes...)
body = append(body, '\n')
}
log.Printf("Publishing metrics %s", body)
_, err := a.postData(&body)
if err != nil {
log.Printf("Error publishing metrics %s", err)
return
}
return
}
func (a *AzureMonitor) postData(msg *[]byte) (*http.Request, error) {
metricsEndpoint := fmt.Sprintf("https://%s.monitoring.azure.com%s/metrics",
s.Region, s.ResourceID)
a.Region, a.ResourceID)
req, err := http.NewRequest("POST", metricsEndpoint, bytes.NewBuffer(*msg))
if err != nil {
@ -311,8 +414,8 @@ func (s *AzureMonitor) postData(msg *[]byte) (*http.Request, error) {
return nil, err
}
req.Header.Set("Authorization", "Bearer "+s.bearerToken)
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+a.bearerToken)
req.Header.Set("Content-Type", "application/x-ndjson")
tr := &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
@ -341,3 +444,64 @@ func (s *AzureMonitor) postData(msg *[]byte) (*http.Request, error) {
}
return req, nil
}
func (a *AzureMonitor) run() {
// The start of the period is truncated to the nearest minute.
//
// Every metric then gets it's timestamp checked and is dropped if it
// is not within:
//
// start < t < end + truncation + delay
//
// So if we start at now = 00:00.2 with a 10s period and 0.3s delay:
// now = 00:00.2
// start = 00:00
// truncation = 00:00.2
// end = 00:10
// 1st interval: 00:00 - 00:10.5
// 2nd interval: 00:10 - 00:20.5
// etc.
//
now := time.Now()
a.periodStart = now.Truncate(time.Minute)
truncation := now.Sub(a.periodStart)
a.periodEnd = a.periodStart.Add(a.period)
time.Sleep(a.delay)
periodT := time.NewTicker(a.period)
defer periodT.Stop()
for {
select {
case <-a.shutdown:
if len(a.metrics) > 0 {
// wait until metrics are flushed before exiting
continue
}
return
case m := <-a.metrics:
if m.Time().Before(a.periodStart) ||
m.Time().After(a.periodEnd.Add(truncation).Add(a.delay)) {
// the metric is outside the current aggregation period, so
// skip it.
continue
}
a.add(m)
case <-periodT.C:
a.periodStart = a.periodEnd
a.periodEnd = a.periodStart.Add(a.period)
a.push()
a.reset()
}
}
}
func init() {
outputs.Add("azuremonitor", func() telegraf.Output {
return &AzureMonitor{
period: time.Minute,
delay: time.Second * 5,
metrics: make(chan telegraf.Metric, 100),
shutdown: make(chan struct{}),
}
})
}

View File

@ -68,9 +68,12 @@ func TestPostData(t *testing.T) {
metrics := getMockMetrics()
t.Logf("mock metrics are %+v\n", metrics)
metricsList, err := azmon.flattenMetrics(metrics)
// metricsList, err := azmon.add(&metrics[0])
for _, m := range metrics {
azmon.add(m)
}
jsonBytes, err := json.Marshal(&metricsList[0])
jsonBytes, err := json.Marshal(azmon.cache)
t.Logf("json content is:\n----------\n%s\n----------\n", string(jsonBytes))
req, err := azmon.postData(&jsonBytes)

View File

@ -1,93 +0,0 @@
# Telegraf Configuration
# Global tags can be specified here in key="value" format.
[global_tags]
# dc = "us-east-1" # will tag all metrics with dc=us-east-1
# rack = "1a"
## Environment variables can be used as tags, and throughout the config file
# user = "$USER"
# Configuration for telegraf agent
[agent]
interval = "10s"
round_interval = true
metric_batch_size = 1000
metric_buffer_limit = 10000
collection_jitter = "0s"
flush_jitter = "0s"
precision = ""
debug = false
quiet = false
logfile = ""
hostname = ""
omit_hostname = false
# Configuration for Azure Log Analytics to send metrics
[[outputs.loganalytics]]
workspace = "$OMS_WORKSPACE"
sharedKey = "$OMS_KEY"
logname = "metrics"
includeTags = ["host", "dc"]
###############################################################################
# INPUT PLUGINS #
###############################################################################
# # Influx HTTP write listener
[[inputs.http_listener]]
## Address and port to host HTTP listener on
service_address = ":8186"
## timeouts
read_timeout = "10s"
write_timeout = "10s"
# Read metrics about cpu usage
[[inputs.cpu]]
## Whether to report per-cpu stats or not
percpu = false
## Whether to report total system cpu stats or not
totalcpu = true
## If true, collect raw CPU time metrics.
collect_cpu_time = false
## If true, compute and report the sum of all non-idle CPU states.
report_active = false
# # Read metrics exposed by fluentd in_monitor plugin
# [[inputs.fluentd]]
# ## This plugin reads information exposed by fluentd (using /api/plugins.json endpoint).
# ##
# ## Endpoint:
# ## - only one URI is allowed
# ## - https is not supported
# endpoint = "http://localhost:24220/api/plugins.json"
#
# ## Define which plugins have to be excluded (based on "type" field - e.g. monitor_agent)
# exclude = [
# "monitor_agent",
# "dummy",
# ]
# # Read InfluxDB-formatted JSON metrics from one or more HTTP endpoints
# [[inputs.influxdb]]
# ## Works with InfluxDB debug endpoints out of the box,
# ## but other services can use this format too.
# ## See the influxdb plugin's README for more details.
#
# ## Multiple URLs from which to read InfluxDB-formatted JSON
# ## Default is "http://localhost:8086/debug/vars".
# urls = [
# "http://localhost:8086/debug/vars"
# ]
#
# ## Optional SSL Config
# # ssl_ca = "/etc/telegraf/ca.pem"
# # ssl_cert = "/etc/telegraf/cert.pem"
# # ssl_key = "/etc/telegraf/key.pem"
# ## Use SSL but skip chain & host verification
# # insecure_skip_verify = false
#
# ## http request & header timeout
# timeout = "5s"