telegraf/plugins/outputs/stackdriver/stackdriver.go

package stackdriver

import (
	"context"
	"fmt"
	"hash/fnv"
	"log"
	"path"
	"sort"
	"strings"

	monitoring "cloud.google.com/go/monitoring/apiv3" // Imports the Stackdriver Monitoring client package.
	googlepb "github.com/golang/protobuf/ptypes/timestamp"
	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/internal"
	"github.com/influxdata/telegraf/plugins/outputs"
	"google.golang.org/api/option"
	metricpb "google.golang.org/genproto/googleapis/api/metric"
	monitoredrespb "google.golang.org/genproto/googleapis/api/monitoredres"
	monitoringpb "google.golang.org/genproto/googleapis/monitoring/v3"
)

// Stackdriver is the Google Stackdriver config info. The exported fields are
// the plugin's configuration; the client is created by Connect.
type Stackdriver struct {
	// Project is the GCP project the time series are written to. It is
	// required by Connect and is also stored as the "project_id" resource
	// label.
	Project        string
	// Namespace is required by Connect and becomes a path segment of every
	// custom metric type built in Write.
	Namespace      string
	// ResourceType is the monitored resource type attached to every time
	// series. Connect sets it to "global" when it is empty.
	ResourceType   string            `toml:"resource_type"`
	// ResourceLabels are the monitored resource labels attached to every time
	// series. Connect allocates the map when it is nil and always sets the
	// "project_id" entry.
	ResourceLabels map[string]string `toml:"resource_labels"`

	// client is the monitoring API client. Connect creates it only when it is
	// still nil.
	client *monitoring.MetricClient
}

const (
	// QuotaLabelsPerMetricDescriptor is the maximum number of labels (tags)
	// kept per metric; getStackdriverLabels drops any excess.
	QuotaLabelsPerMetricDescriptor = 10
	// QuotaStringLengthForLabelKey is the maximum length of a label key, as
	// measured by len (bytes); longer keys are dropped.
	QuotaStringLengthForLabelKey = 100
	// QuotaStringLengthForLabelValue is the maximum length of a label value,
	// as measured by len (bytes); labels with longer values are dropped.
	QuotaStringLengthForLabelValue = 1024

	// StartTime is the interval start, in Unix seconds, used for every
	// cumulative metric.
	StartTime = int64(1)
	// MaxInt is the largest value of the platform's int type. It equals the
	// max int64 value only where int is 64 bits wide.
	MaxInt = int(^uint(0) >> 1)

	// The errString* constants are substrings of error messages returned when
	// creating time series. Write treats an error containing any of them as
	// non-fatal: it is logged at debug level and nil is returned.
	errStringPointsOutOfOrder  = "One or more of the points specified had an older end time than the most recent point"
	errStringPointsTooOld      = "Data points cannot be written more than 24h in the past"
	errStringPointsTooFrequent = "One or more points were written more frequently than the maximum sampling period configured for the metric"
)

// sampleConfig is the example configuration returned by SampleConfig. The
// keys mirror the exported fields of Stackdriver; the commented-out entries
// are optional.
var sampleConfig = `
  ## GCP Project
  project = "erudite-bloom-151019"

  ## The namespace for the metric descriptor
  namespace = "telegraf"

  ## Custom resource type
  # resource_type = "generic_node"

  ## Additional resource labels
  # [outputs.stackdriver.resource_labels]
  #   node_id = "$HOSTNAME"
  #   namespace = "myapp"
  #   location = "eu-north0"
`

// Connect validates the configuration, applies defaults and makes sure a
// monitoring client exists.
//
// Project and Namespace must be non-empty. An empty ResourceType becomes
// "global", and the "project_id" resource label is always set to Project. A
// client is only created when none has been assigned yet; any error from
// creating it is returned unchanged.
func (s *Stackdriver) Connect() error {
	switch {
	case s.Project == "":
		return fmt.Errorf("Project is a required field for stackdriver output")
	case s.Namespace == "":
		return fmt.Errorf("Namespace is a required field for stackdriver output")
	}

	if s.ResourceType == "" {
		s.ResourceType = "global"
	}

	if s.ResourceLabels == nil {
		s.ResourceLabels = make(map[string]string, 1)
	}
	s.ResourceLabels["project_id"] = s.Project

	// Keep a client that was assigned beforehand.
	if s.client != nil {
		return nil
	}

	metricClient, err := monitoring.NewMetricClient(
		context.Background(),
		option.WithUserAgent(internal.ProductToken()),
	)
	if err != nil {
		return err
	}
	s.client = metricClient

	return nil
}

// sorted returns a copy of the metrics ordered by ascending timestamp. The
// caller's slice is left untouched because modifying the input metric slice
// is not allowed.
func sorted(metrics []telegraf.Metric) []telegraf.Metric {
	n := len(metrics)
	byTime := make([]telegraf.Metric, n)
	// Fill the copy back to front so it starts out in reverse input order
	// before it is sorted.
	for i, m := range metrics {
		byTime[n-1-i] = m
	}

	sort.Slice(byTime, func(a, b int) bool {
		return byTime[a].Time().Before(byTime[b].Time())
	})

	return byTime
}

// timeSeriesBuckets groups time series under a 64-bit hash computed from the
// metric name, the field key and the metric's tags, so that all points hashed
// to the same key end up in the same bucket, in insertion order.
type timeSeriesBuckets map[uint64][]*monitoringpb.TimeSeries

// Add appends ts to the bucket identified by the hash of m's name, f's key
// and m's tags.
//
// The tags are read from m.TagList(), a slice, rather than by ranging over
// the m.Tags() map: Go randomises map iteration order and the FNV hash
// depends on write order, so ranging over the map could give two points of
// the same series different keys whenever a metric has more than one tag.
// NOTE(review): this relies on TagList returning the tags in a consistent
// order for equal tag sets (telegraf is expected to keep them sorted by key)
// — confirm against the Metric implementation in use.
func (tsb timeSeriesBuckets) Add(m telegraf.Metric, f *telegraf.Field, ts *monitoringpb.TimeSeries) {
	h := fnv.New64a()
	// Each component is terminated by a newline so that adjacent components
	// are not simply concatenated into the hash input.
	h.Write([]byte(m.Name()))
	h.Write([]byte{'\n'})
	h.Write([]byte(f.Key))
	h.Write([]byte{'\n'})
	for _, tag := range m.TagList() {
		h.Write([]byte(tag.Key))
		h.Write([]byte{'\n'})
		h.Write([]byte(tag.Value))
		h.Write([]byte{'\n'})
	}
	k := h.Sum64()

	tsb[k] = append(tsb[k], ts)
}

// Write the metrics to Google Cloud Stackdriver.
//
// Every field of every metric becomes its own time series holding a single
// point. The time series are grouped into buckets (see timeSeriesBuckets) and
// sent in requests of at most 200 time series, where each request takes at
// most one time series from each bucket.
//
// Fields whose value, metric kind or time interval cannot be converted are
// logged and skipped; string fields are skipped silently. An error from
// creating time series that contains one of the errString* substrings is
// logged at debug level and Write returns nil; any other error is logged and
// returned. In both cases Write stops at the first failing request, so
// buckets that were not sent yet are not retried by this call.
func (s *Stackdriver) Write(metrics []telegraf.Metric) error {
	ctx := context.Background()

	// Work on a time-ordered copy; the input slice itself is not modified.
	batch := sorted(metrics)
	buckets := make(timeSeriesBuckets)
	for _, m := range batch {
		for _, f := range m.FieldList() {
			value, err := getStackdriverTypedValue(f.Value)
			if err != nil {
				log.Printf("E! [outputs.stackdriver] get type failed: %s", err)
				continue
			}

			// A nil value without an error marks a string field, which is
			// skipped.
			if value == nil {
				continue
			}

			metricKind, err := getStackdriverMetricKind(m.Type())
			if err != nil {
				log.Printf("E! [outputs.stackdriver] get metric failed: %s", err)
				continue
			}

			// The end time is the metric timestamp truncated to whole seconds.
			timeInterval, err := getStackdriverTimeInterval(metricKind, StartTime, m.Time().Unix())
			if err != nil {
				log.Printf("E! [outputs.stackdriver] get time interval failed: %s", err)
				continue
			}

			// Prepare an individual data point.
			dataPoint := &monitoringpb.Point{
				Interval: timeInterval,
				Value:    value,
			}

			// Prepare time series.
			timeSeries := &monitoringpb.TimeSeries{
				Metric: &metricpb.Metric{
					Type:   path.Join("custom.googleapis.com", s.Namespace, m.Name(), f.Key),
					Labels: getStackdriverLabels(m.TagList()),
				},
				MetricKind: metricKind,
				Resource: &monitoredrespb.MonitoredResource{
					Type:   s.ResourceType,
					Labels: s.ResourceLabels,
				},
				Points: []*monitoringpb.Point{
					dataPoint,
				},
			}

			buckets.Add(m, f, timeSeries)
		}
	}

	// process the buckets in order
	keys := make([]uint64, 0, len(buckets))
	for k := range buckets {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })

	// Each pass builds one request by taking the oldest remaining time series
	// from each bucket, until every bucket has been drained.
	for len(buckets) != 0 {
		// can send up to 200 time series to stackdriver
		timeSeries := make([]*monitoringpb.TimeSeries, 0, 200)
		for i := 0; i < len(keys) && len(timeSeries) < cap(timeSeries); i++ {
			k := keys[i]
			// NOTE: this s shadows the receiver for the rest of the loop body.
			s := buckets[k]
			timeSeries = append(timeSeries, s[0])
			if len(s) == 1 {
				// The bucket is now empty: drop it and its key, then step the
				// index back so the key that moved into position i is visited.
				delete(buckets, k)
				keys = append(keys[:i], keys[i+1:]...)
				i--
				continue
			}

			s = s[1:]
			buckets[k] = s
		}

		// Prepare time series request.
		timeSeriesRequest := &monitoringpb.CreateTimeSeriesRequest{
			Name:       monitoring.MetricProjectPath(s.Project),
			TimeSeries: timeSeries,
		}

		// Create the time series in Stackdriver.
		err := s.client.CreateTimeSeries(ctx, timeSeriesRequest)
		if err != nil {
			// These errors are matched by message substring and reported as
			// success to the caller.
			if strings.Contains(err.Error(), errStringPointsOutOfOrder) ||
				strings.Contains(err.Error(), errStringPointsTooOld) ||
				strings.Contains(err.Error(), errStringPointsTooFrequent) {
				log.Printf("D! [outputs.stackdriver] unable to write to Stackdriver: %s", err)
				return nil
			}
			log.Printf("E! [outputs.stackdriver] unable to write to Stackdriver: %s", err)
			return err
		}
	}

	return nil
}

// getStackdriverTimeInterval builds the time interval of a point for the
// given metric kind. start and end are Unix timestamps in seconds.
//
// A gauge interval carries only the end time; a cumulative interval carries
// both start and end. Every other kind yields a nil interval and an error
// naming the rejected kind.
func getStackdriverTimeInterval(
	m metricpb.MetricDescriptor_MetricKind,
	start int64,
	end int64,
) (*monitoringpb.TimeInterval, error) {
	switch m {
	case metricpb.MetricDescriptor_GAUGE:
		return &monitoringpb.TimeInterval{
			EndTime: &googlepb.Timestamp{
				Seconds: end,
			},
		}, nil
	case metricpb.MetricDescriptor_CUMULATIVE:
		return &monitoringpb.TimeInterval{
			StartTime: &googlepb.Timestamp{
				Seconds: start,
			},
			EndTime: &googlepb.Timestamp{
				Seconds: end,
			},
		}, nil
	case metricpb.MetricDescriptor_DELTA, metricpb.MetricDescriptor_METRIC_KIND_UNSPECIFIED:
		fallthrough
	default:
		// %v reports the offending kind; %T would only print the static type
		// name, which is the same for every value.
		return nil, fmt.Errorf("unsupported metric kind %v", m)
	}
}

// getStackdriverMetricKind maps a telegraf value type to a metric kind:
// untyped and gauge values become GAUGE, counters become CUMULATIVE. Any
// other value type, including histograms and summaries, yields
// METRIC_KIND_UNSPECIFIED together with an error.
func getStackdriverMetricKind(vt telegraf.ValueType) (metricpb.MetricDescriptor_MetricKind, error) {
	switch vt {
	case telegraf.Untyped, telegraf.Gauge:
		return metricpb.MetricDescriptor_GAUGE, nil
	case telegraf.Counter:
		return metricpb.MetricDescriptor_CUMULATIVE, nil
	}

	// Histogram, Summary and anything unrecognised end up here.
	return metricpb.MetricDescriptor_METRIC_KIND_UNSPECIFIED, fmt.Errorf("unsupported telegraf value type")
}

// getStackdriverTypedValue converts a field value into a typed value.
//
// uint64 and int64 values become Int64Value, float64 becomes DoubleValue and
// bool becomes BoolValue. A uint64 that does not fit into an int64 is clamped
// to the largest int64. A string yields (nil, nil) so that callers can skip
// it without treating it as a failure. Any other type yields an error.
func getStackdriverTypedValue(value interface{}) (*monitoringpb.TypedValue, error) {
	// The clamp bound is the int64 maximum regardless of platform. Using the
	// maximum of the platform's int instead would clamp at 2^31-1 on 32-bit
	// builds.
	const maxInt64 = 1<<63 - 1

	switch v := value.(type) {
	case uint64:
		clamped := int64(maxInt64)
		if v <= maxInt64 {
			clamped = int64(v)
		}
		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_Int64Value{
				Int64Value: clamped,
			},
		}, nil
	case int64:
		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_Int64Value{
				Int64Value: v,
			},
		}, nil
	case float64:
		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_DoubleValue{
				DoubleValue: v,
			},
		}, nil
	case bool:
		return &monitoringpb.TypedValue{
			Value: &monitoringpb.TypedValue_BoolValue{
				BoolValue: v,
			},
		}, nil
	case string:
		// String value types are not available for custom metrics
		return nil, nil
	default:
		return nil, fmt.Errorf("value type \"%T\" not supported for stackdriver custom metrics", v)
	}
}

// getStackdriverLabels turns a tag list into a label map that respects the
// label quotas.
//
// When a key occurs more than once in tags, the last value wins. Labels whose
// key or value is longer than the respective quota are dropped with a
// warning. If more labels remain than QuotaLabelsPerMetricDescriptor allows,
// the surplus is removed in map iteration order, so which labels survive is
// not predictable.
func getStackdriverLabels(tags []*telegraf.Tag) map[string]string {
	labels := make(map[string]string)
	for _, tag := range tags {
		labels[tag.Key] = tag.Value
	}

	// Deleting entries while ranging over a map is permitted in Go.
	for key, val := range labels {
		switch {
		case len(key) > QuotaStringLengthForLabelKey:
			log.Printf(
				"W! [outputs.stackdriver] removing tag [%s] key exceeds string length for label key [%d]",
				key,
				QuotaStringLengthForLabelKey,
			)
			delete(labels, key)
		case len(val) > QuotaStringLengthForLabelValue:
			log.Printf(
				"W! [outputs.stackdriver] removing tag [%s] value exceeds string length for label value [%d]",
				key,
				QuotaStringLengthForLabelValue,
			)
			delete(labels, key)
		}
	}

	if surplus := len(labels) - QuotaLabelsPerMetricDescriptor; surplus > 0 {
		log.Printf(
			"W! [outputs.stackdriver] tag count [%d] exceeds quota for stackdriver labels [%d] removing [%d] random tags",
			len(labels),
			QuotaLabelsPerMetricDescriptor,
			surplus,
		)
		for key := range labels {
			if surplus == 0 {
				break
			}
			delete(labels, key)
			surplus--
		}
	}

	return labels
}

// Close will terminate the session to the backend, returning error if an
// issue arises. It is a no-op when no client exists, which is the case when
// Connect was never called or returned before a client was created.
func (s *Stackdriver) Close() error {
	if s.client == nil {
		// Nothing was opened, so there is nothing to release; this also
		// avoids calling a method on a nil client.
		return nil
	}
	return s.client.Close()
}

// SampleConfig returns the formatted sample configuration for the plugin,
// i.e. the package-level sampleConfig string, unchanged.
func (s *Stackdriver) SampleConfig() string {
	return sampleConfig
}

// Description returns a one-line, human-readable summary of what the plugin
// is for.
func (s *Stackdriver) Description() string {
	const description = "Configuration for Google Cloud Stackdriver to send metrics to"
	return description
}

// newStackdriver returns a pointer to a zero-valued Stackdriver. Defaults are
// not applied here; Connect fills them in.
func newStackdriver() *Stackdriver {
	return new(Stackdriver)
}

func init() {
	outputs.Add("stackdriver", func() telegraf.Output {
		return newStackdriver()
	})
}