telegraf/plugins/inputs/kafka_consumer/kafka_consumer.go

454 lines
12 KiB
Go

package kafka_consumer
import (
"context"
"fmt"
"log"
"strings"
"sync"
"time"
"github.com/Shopify/sarama"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/internal/tls"
"github.com/influxdata/telegraf/plugins/common/kafka"
"github.com/influxdata/telegraf/plugins/inputs"
"github.com/influxdata/telegraf/plugins/parsers"
)
const sampleConfig = `
## Kafka brokers.
brokers = ["localhost:9092"]
## Topics to consume.
topics = ["telegraf"]
## When set this tag will be added to all metrics with the topic as the value.
# topic_tag = ""
## Optional Client id
# client_id = "Telegraf"
## Set the minimal supported Kafka version. Setting this enables the use of new
## Kafka features and APIs. Must be 0.10.2.0 or greater.
## ex: version = "1.1.0"
# version = ""
## Optional TLS Config
# enable_tls = true
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
## SASL authentication credentials. These settings should typically be used
## with TLS encryption enabled using the "enable_tls" option.
# sasl_username = "kafka"
# sasl_password = "secret"
## SASL protocol version. When connecting to Azure EventHub set to 0.
# sasl_version = 1
## Name of the consumer group.
# consumer_group = "telegraf_metrics_consumers"
## Initial offset position; one of "oldest" or "newest".
# offset = "oldest"
## Consumer group partition assignment strategy; one of "range", "roundrobin" or "sticky".
# balance_strategy = "range"
## Maximum length of a message to consume, in bytes (default 0/unlimited);
## larger messages are dropped
max_message_len = 1000000
## Maximum messages to read from the broker that have not been written by an
## output. For best throughput set based on the number of metrics within
## each message and the size of the output's metric_batch_size.
##
## For example, if each message from the queue contains 10 metrics and the
## output metric_batch_size is 1000, setting this to 100 will ensure that a
## full batch is collected and the write is triggered immediately without
## waiting until the next flush_interval.
# max_undelivered_messages = 1000
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"
`
const (
defaultMaxUndeliveredMessages = 1000
defaultMaxMessageLen = 1000000
defaultConsumerGroup = "telegraf_metrics_consumers"
reconnectDelay = 5 * time.Second
)
type empty struct{}
type semaphore chan empty
type KafkaConsumer struct {
Brokers []string `toml:"brokers"`
ClientID string `toml:"client_id"`
ConsumerGroup string `toml:"consumer_group"`
MaxMessageLen int `toml:"max_message_len"`
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
Offset string `toml:"offset"`
BalanceStrategy string `toml:"balance_strategy"`
Topics []string `toml:"topics"`
TopicTag string `toml:"topic_tag"`
Version string `toml:"version"`
SASLPassword string `toml:"sasl_password"`
SASLUsername string `toml:"sasl_username"`
SASLVersion *int `toml:"sasl_version"`
EnableTLS *bool `toml:"enable_tls"`
tls.ClientConfig
Log telegraf.Logger `toml:"-"`
ConsumerCreator ConsumerGroupCreator `toml:"-"`
consumer ConsumerGroup
config *sarama.Config
parser parsers.Parser
wg sync.WaitGroup
cancel context.CancelFunc
}
type ConsumerGroup interface {
Consume(ctx context.Context, topics []string, handler sarama.ConsumerGroupHandler) error
Errors() <-chan error
Close() error
}
type ConsumerGroupCreator interface {
Create(brokers []string, group string, config *sarama.Config) (ConsumerGroup, error)
}
type SaramaCreator struct{}
func (*SaramaCreator) Create(brokers []string, group string, config *sarama.Config) (ConsumerGroup, error) {
return sarama.NewConsumerGroup(brokers, group, config)
}
func (k *KafkaConsumer) SampleConfig() string {
return sampleConfig
}
func (k *KafkaConsumer) Description() string {
return "Read metrics from Kafka topics"
}
func (k *KafkaConsumer) SetParser(parser parsers.Parser) {
k.parser = parser
}
func (k *KafkaConsumer) Init() error {
if k.MaxUndeliveredMessages == 0 {
k.MaxUndeliveredMessages = defaultMaxUndeliveredMessages
}
if k.ConsumerGroup == "" {
k.ConsumerGroup = defaultConsumerGroup
}
config := sarama.NewConfig()
config.Consumer.Return.Errors = true
// Kafka version 0.10.2.0 is required for consumer groups.
config.Version = sarama.V0_10_2_0
if k.Version != "" {
version, err := sarama.ParseKafkaVersion(k.Version)
if err != nil {
return err
}
config.Version = version
}
if k.EnableTLS != nil && *k.EnableTLS {
config.Net.TLS.Enable = true
}
tlsConfig, err := k.ClientConfig.TLSConfig()
if err != nil {
return err
}
if tlsConfig != nil {
config.Net.TLS.Config = tlsConfig
// To maintain backwards compatibility, if the enable_tls option is not
// set TLS is enabled if a non-default TLS config is used.
if k.EnableTLS == nil {
k.Log.Warnf("Use of deprecated configuration: enable_tls should be set when using TLS")
config.Net.TLS.Enable = true
}
}
if k.SASLUsername != "" && k.SASLPassword != "" {
config.Net.SASL.User = k.SASLUsername
config.Net.SASL.Password = k.SASLPassword
config.Net.SASL.Enable = true
version, err := kafka.SASLVersion(config.Version, k.SASLVersion)
if err != nil {
return err
}
config.Net.SASL.Version = version
}
if k.ClientID != "" {
config.ClientID = k.ClientID
} else {
config.ClientID = "Telegraf"
}
switch strings.ToLower(k.Offset) {
case "oldest", "":
config.Consumer.Offsets.Initial = sarama.OffsetOldest
case "newest":
config.Consumer.Offsets.Initial = sarama.OffsetNewest
default:
return fmt.Errorf("invalid offset %q", k.Offset)
}
switch strings.ToLower(k.BalanceStrategy) {
case "range", "":
config.Consumer.Group.Rebalance.Strategy = sarama.BalanceStrategyRange
case "roundrobin":
config.Consumer.Group.Rebalance.Strategy = sarama.BalanceStrategyRoundRobin
case "sticky":
config.Consumer.Group.Rebalance.Strategy = sarama.BalanceStrategySticky
default:
return fmt.Errorf("invalid balance strategy %q", k.BalanceStrategy)
}
if k.ConsumerCreator == nil {
k.ConsumerCreator = &SaramaCreator{}
}
k.config = config
return nil
}
func (k *KafkaConsumer) Start(acc telegraf.Accumulator) error {
var err error
k.consumer, err = k.ConsumerCreator.Create(
k.Brokers,
k.ConsumerGroup,
k.config,
)
if err != nil {
return err
}
ctx, cancel := context.WithCancel(context.Background())
k.cancel = cancel
// Start consumer goroutine
k.wg.Add(1)
go func() {
defer k.wg.Done()
for ctx.Err() == nil {
handler := NewConsumerGroupHandler(acc, k.MaxUndeliveredMessages, k.parser)
handler.MaxMessageLen = k.MaxMessageLen
handler.TopicTag = k.TopicTag
err := k.consumer.Consume(ctx, k.Topics, handler)
if err != nil {
acc.AddError(err)
internal.SleepContext(ctx, reconnectDelay)
}
}
err = k.consumer.Close()
if err != nil {
acc.AddError(err)
}
}()
k.wg.Add(1)
go func() {
defer k.wg.Done()
for err := range k.consumer.Errors() {
acc.AddError(err)
}
}()
return nil
}
func (k *KafkaConsumer) Gather(acc telegraf.Accumulator) error {
return nil
}
func (k *KafkaConsumer) Stop() {
k.cancel()
k.wg.Wait()
}
// Message is an aggregate type binding the Kafka message and the session so
// that offsets can be updated.
type Message struct {
message *sarama.ConsumerMessage
session sarama.ConsumerGroupSession
}
func NewConsumerGroupHandler(acc telegraf.Accumulator, maxUndelivered int, parser parsers.Parser) *ConsumerGroupHandler {
handler := &ConsumerGroupHandler{
acc: acc.WithTracking(maxUndelivered),
sem: make(chan empty, maxUndelivered),
undelivered: make(map[telegraf.TrackingID]Message, maxUndelivered),
parser: parser,
}
return handler
}
// ConsumerGroupHandler is a sarama.ConsumerGroupHandler implementation.
type ConsumerGroupHandler struct {
MaxMessageLen int
TopicTag string
acc telegraf.TrackingAccumulator
sem semaphore
parser parsers.Parser
wg sync.WaitGroup
cancel context.CancelFunc
mu sync.Mutex
undelivered map[telegraf.TrackingID]Message
}
// Setup is called once when a new session is opened. It setups up the handler
// and begins processing delivered messages.
func (h *ConsumerGroupHandler) Setup(sarama.ConsumerGroupSession) error {
h.undelivered = make(map[telegraf.TrackingID]Message)
ctx, cancel := context.WithCancel(context.Background())
h.cancel = cancel
h.wg.Add(1)
go func() {
defer h.wg.Done()
h.run(ctx)
}()
return nil
}
// Run processes any delivered metrics during the lifetime of the session.
func (h *ConsumerGroupHandler) run(ctx context.Context) error {
for {
select {
case <-ctx.Done():
return nil
case track := <-h.acc.Delivered():
h.onDelivery(track)
}
}
}
func (h *ConsumerGroupHandler) onDelivery(track telegraf.DeliveryInfo) {
h.mu.Lock()
defer h.mu.Unlock()
msg, ok := h.undelivered[track.ID()]
if !ok {
log.Printf("E! [inputs.kafka_consumer] Could not mark message delivered: %d", track.ID())
return
}
if track.Delivered() {
msg.session.MarkMessage(msg.message, "")
}
delete(h.undelivered, track.ID())
<-h.sem
}
// Reserve blocks until there is an available slot for a new message.
func (h *ConsumerGroupHandler) Reserve(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case h.sem <- empty{}:
return nil
}
}
func (h *ConsumerGroupHandler) release() {
<-h.sem
}
// Handle processes a message and if successful saves it to be acknowledged
// after delivery.
func (h *ConsumerGroupHandler) Handle(session sarama.ConsumerGroupSession, msg *sarama.ConsumerMessage) error {
if h.MaxMessageLen != 0 && len(msg.Value) > h.MaxMessageLen {
session.MarkMessage(msg, "")
h.release()
return fmt.Errorf("message exceeds max_message_len (actual %d, max %d)",
len(msg.Value), h.MaxMessageLen)
}
metrics, err := h.parser.Parse(msg.Value)
if err != nil {
h.release()
return err
}
if len(h.TopicTag) > 0 {
for _, metric := range metrics {
metric.AddTag(h.TopicTag, msg.Topic)
}
}
h.mu.Lock()
id := h.acc.AddTrackingMetricGroup(metrics)
h.undelivered[id] = Message{session: session, message: msg}
h.mu.Unlock()
return nil
}
// ConsumeClaim is called once each claim in a goroutine and must be
// thread-safe. Should run until the claim is closed.
func (h *ConsumerGroupHandler) ConsumeClaim(session sarama.ConsumerGroupSession, claim sarama.ConsumerGroupClaim) error {
ctx := session.Context()
for {
err := h.Reserve(ctx)
if err != nil {
return nil
}
select {
case <-ctx.Done():
return nil
case msg, ok := <-claim.Messages():
if !ok {
return nil
}
err := h.Handle(session, msg)
if err != nil {
h.acc.AddError(err)
}
}
}
}
// Cleanup stops the internal goroutine and is called after all ConsumeClaim
// functions have completed.
func (h *ConsumerGroupHandler) Cleanup(sarama.ConsumerGroupSession) error {
h.cancel()
h.wg.Wait()
return nil
}
func init() {
inputs.Add("kafka_consumer", func() telegraf.Input {
return &KafkaConsumer{}
})
}