Remove outputs blocking inputs when output is slow (#4938)

2018-11-05 13:34:28 -08:00
parent 74667cd681
commit 6e5c2f8bb6
59 changed files with 3615 additions and 2189 deletions
--- a/plugins/inputs/kafka_consumer/README.md
+++ b/plugins/inputs/kafka_consumer/README.md
@@ -1,18 +1,14 @@
 # Kafka Consumer Input Plugin

-The [Kafka](http://kafka.apache.org/) consumer plugin polls a specified Kafka
-topic and adds messages to InfluxDB. The plugin assumes messages follow the
-line protocol. [Consumer Group](http://godoc.org/github.com/wvanbergen/kafka/consumergroup)
-is used to talk to the Kafka cluster so multiple instances of telegraf can read
-from the same topic in parallel.
+The [Kafka][kafka] consumer plugin reads from Kafka
+and creates metrics using one of the supported [input data formats][].

-For old kafka version (< 0.8), please use the kafka_consumer_legacy input plugin
+For old Kafka versions (< 0.8), please use the [kafka_consumer_legacy][] input plugin
 and use the old zookeeper connection method.

-## Configuration
+### Configuration

 ```toml
-# Read metrics from Kafka topic(s)
 [[inputs.kafka_consumer]]
  ## kafka servers
  brokers = ["localhost:9092"]
@@ -44,18 +40,27 @@ and use the old zookeeper connection method.
  ## Offset (must be either "oldest" or "newest")
  offset = "oldest"

+  ## Maximum length of a message to consume, in bytes (default 0/unlimited);
+  ## larger messages are dropped
+  max_message_len = 1000000
+
+  ## Maximum messages to read from the broker that have not been written by an
+  ## output.  For best throughput set based on the number of metrics within
+  ## each message and the size of the output's metric_batch_size.
+  ##
+  ## For example, if each message from the queue contains 10 metrics and the
+  ## output metric_batch_size is 1000, setting this to 100 will ensure that a
+  ## full batch is collected and the write is triggered immediately without
+  ## waiting until the next flush_interval.
+  # max_undelivered_messages = 1000
+
  ## Data format to consume.
  ## Each data format has its own unique set of configuration options, read
  ## more about them here:
  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
  data_format = "influx"
-
-  ## Maximum length of a message to consume, in bytes (default 0/unlimited);
-  ## larger messages are dropped
-  max_message_len = 1000000
 ```

-## Testing
-
-Running integration tests requires running Zookeeper & Kafka. See Makefile
-for kafka container command.
+[kafka]: https://kafka.apache.org
+[kafka_consumer_legacy]: /plugins/inputs/kafka_consumer_legacy/README.md
+[input data formats]: /docs/DATA_FORMATS_INPUT.md
--- a/plugins/inputs/kafka_consumer/kafka_consumer.go
+++ b/plugins/inputs/kafka_consumer/kafka_consumer.go
@@ -1,55 +1,54 @@
 package kafka_consumer

 import (
+	"context"
 	"fmt"
 	"log"
 	"strings"
 	"sync"

+	"github.com/Shopify/sarama"
+	cluster "github.com/bsm/sarama-cluster"
 	"github.com/influxdata/telegraf"
 	"github.com/influxdata/telegraf/internal/tls"
 	"github.com/influxdata/telegraf/plugins/inputs"
 	"github.com/influxdata/telegraf/plugins/parsers"
-
-	"github.com/Shopify/sarama"
-	cluster "github.com/bsm/sarama-cluster"
 )

+const (
+	defaultMaxUndeliveredMessages = 1000
+)
+
+type empty struct{}
+type semaphore chan empty
+
+type Consumer interface {
+	Errors() <-chan error
+	Messages() <-chan *sarama.ConsumerMessage
+	MarkOffset(msg *sarama.ConsumerMessage, metadata string)
+	Close() error
+}
+
 type Kafka struct {
-	ConsumerGroup string
-	ClientID      string `toml:"client_id"`
-	Topics        []string
-	Brokers       []string
-	MaxMessageLen int
-	Version       string `toml:"version"`
-
-	Cluster *cluster.Consumer
-
+	ConsumerGroup          string   `toml:"consumer_group"`
+	ClientID               string   `toml:"client_id"`
+	Topics                 []string `toml:"topics"`
+	Brokers                []string `toml:"brokers"`
+	MaxMessageLen          int      `toml:"max_message_len"`
+	Version                string   `toml:"version"`
+	MaxUndeliveredMessages int      `toml:"max_undelivered_messages"`
+	Offset                 string   `toml:"offset"`
+	SASLUsername           string   `toml:"sasl_username"`
+	SASLPassword           string   `toml:"sasl_password"`
 	tls.ClientConfig

-	// SASL Username
-	SASLUsername string `toml:"sasl_username"`
-	// SASL Password
-	SASLPassword string `toml:"sasl_password"`
+	cluster Consumer
+	parser  parsers.Parser
+	wg      *sync.WaitGroup
+	cancel  context.CancelFunc

-	// Legacy metric buffer support
-	MetricBuffer int
-	// TODO remove PointBuffer, legacy support
-	PointBuffer int
-
-	Offset string
-	parser parsers.Parser
-
-	sync.Mutex
-
-	// channel for all incoming kafka messages
-	in <-chan *sarama.ConsumerMessage
-	// channel for all kafka consumer errors
-	errs <-chan error
-	done chan struct{}
-
-	// keep the accumulator internally:
-	acc telegraf.Accumulator
+	// Unconfirmed messages
+	messages map[telegraf.TrackingID]*sarama.ConsumerMessage

 	// doNotCommitMsgs tells the parser not to call CommitUpTo on the consumer
 	// this is mostly for test purposes, but there may be a use-case for it later.
@@ -86,16 +85,25 @@ var sampleConfig = `
  consumer_group = "telegraf_metrics_consumers"
  ## Offset (must be either "oldest" or "newest")
  offset = "oldest"
+  ## Maximum length of a message to consume, in bytes (default 0/unlimited);
+  ## larger messages are dropped
+  max_message_len = 1000000
+
+  ## Maximum messages to read from the broker that have not been written by an
+  ## output.  For best throughput set based on the number of metrics within
+  ## each message and the size of the output's metric_batch_size.
+  ##
+  ## For example, if each message from the queue contains 10 metrics and the
+  ## output metric_batch_size is 1000, setting this to 100 will ensure that a
+  ## full batch is collected and the write is triggered immediately without
+  ## waiting until the next flush_interval.
+  # max_undelivered_messages = 1000

  ## Data format to consume.
  ## Each data format has its own unique set of configuration options, read
  ## more about them here:
  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
  data_format = "influx"
-
-  ## Maximum length of a message to consume, in bytes (default 0/unlimited);
-  ## larger messages are dropped
-  max_message_len = 1000000
 `

 func (k *Kafka) SampleConfig() string {
@@ -111,12 +119,8 @@ func (k *Kafka) SetParser(parser parsers.Parser) {
 }

 func (k *Kafka) Start(acc telegraf.Accumulator) error {
-	k.Lock()
-	defer k.Unlock()
 	var clusterErr error

-	k.acc = acc
-
 	config := cluster.NewConfig()

 	if k.Version != "" {
@@ -159,13 +163,13 @@ func (k *Kafka) Start(acc telegraf.Accumulator) error {
 	case "newest":
 		config.Consumer.Offsets.Initial = sarama.OffsetNewest
 	default:
-		log.Printf("I! WARNING: Kafka consumer invalid offset '%s', using 'oldest'\n",
+		log.Printf("I! WARNING: Kafka consumer invalid offset '%s', using 'oldest'",
 			k.Offset)
 		config.Consumer.Offsets.Initial = sarama.OffsetOldest
 	}

-	if k.Cluster == nil {
-		k.Cluster, clusterErr = cluster.NewConsumer(
+	if k.cluster == nil {
+		k.cluster, clusterErr = cluster.NewConsumer(
 			k.Brokers,
 			k.ConsumerGroup,
 			k.Topics,
@@ -173,67 +177,110 @@ func (k *Kafka) Start(acc telegraf.Accumulator) error {
 		)

 		if clusterErr != nil {
-			log.Printf("E! Error when creating Kafka Consumer, brokers: %v, topics: %v\n",
+			log.Printf("E! Error when creating Kafka Consumer, brokers: %v, topics: %v",
 				k.Brokers, k.Topics)
 			return clusterErr
 		}
-
-		// Setup message and error channels
-		k.in = k.Cluster.Messages()
-		k.errs = k.Cluster.Errors()
 	}

-	k.done = make(chan struct{})
-	// Start the kafka message reader
-	go k.receiver()
-	log.Printf("I! Started the kafka consumer service, brokers: %v, topics: %v\n",
+	ctx, cancel := context.WithCancel(context.Background())
+	k.cancel = cancel
+
+	// Start consumer goroutine
+	k.wg = &sync.WaitGroup{}
+	k.wg.Add(1)
+	go func() {
+		defer k.wg.Done()
+		k.receiver(ctx, acc)
+	}()
+
+	log.Printf("I! Started the kafka consumer service, brokers: %v, topics: %v",
 		k.Brokers, k.Topics)
 	return nil
 }

 // receiver() reads all incoming messages from the consumer, and parses them into
 // influxdb metric points.
-func (k *Kafka) receiver() {
+func (k *Kafka) receiver(ctx context.Context, ac telegraf.Accumulator) {
+	k.messages = make(map[telegraf.TrackingID]*sarama.ConsumerMessage)
+
+	acc := ac.WithTracking(k.MaxUndeliveredMessages)
+	sem := make(semaphore, k.MaxUndeliveredMessages)
+
 	for {
 		select {
-		case <-k.done:
+		case <-ctx.Done():
 			return
-		case err := <-k.errs:
-			if err != nil {
-				k.acc.AddError(fmt.Errorf("Consumer Error: %s\n", err))
-			}
-		case msg := <-k.in:
-			if k.MaxMessageLen != 0 && len(msg.Value) > k.MaxMessageLen {
-				k.acc.AddError(fmt.Errorf("Message longer than max_message_len (%d > %d)",
-					len(msg.Value), k.MaxMessageLen))
-			} else {
-				metrics, err := k.parser.Parse(msg.Value)
+		case track := <-acc.Delivered():
+			<-sem
+			k.onDelivery(track)
+		case err := <-k.cluster.Errors():
+			acc.AddError(err)
+		case sem <- empty{}:
+			select {
+			case <-ctx.Done():
+				return
+			case track := <-acc.Delivered():
+				// Once for the delivered message, once to leave the case
+				<-sem
+				<-sem
+				k.onDelivery(track)
+			case err := <-k.cluster.Errors():
+				<-sem
+				acc.AddError(err)
+			case msg := <-k.cluster.Messages():
+				err := k.onMessage(acc, msg)
 				if err != nil {
-					k.acc.AddError(fmt.Errorf("Message Parse Error\nmessage: %s\nerror: %s",
-						string(msg.Value), err.Error()))
+					acc.AddError(err)
+					<-sem
 				}
-				for _, metric := range metrics {
-					k.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time())
-				}
-			}
-
-			if !k.doNotCommitMsgs {
-				// TODO(cam) this locking can be removed if this PR gets merged:
-				// https://github.com/wvanbergen/kafka/pull/84
-				k.Lock()
-				k.Cluster.MarkOffset(msg, "")
-				k.Unlock()
 			}
 		}
 	}
 }

+func (k *Kafka) markOffset(msg *sarama.ConsumerMessage) {
+	if !k.doNotCommitMsgs {
+		k.cluster.MarkOffset(msg, "")
+	}
+}
+
+func (k *Kafka) onMessage(acc telegraf.TrackingAccumulator, msg *sarama.ConsumerMessage) error {
+	if k.MaxMessageLen != 0 && len(msg.Value) > k.MaxMessageLen {
+		k.markOffset(msg)
+		return fmt.Errorf("Message longer than max_message_len (%d > %d)",
+			len(msg.Value), k.MaxMessageLen)
+	}
+
+	metrics, err := k.parser.Parse(msg.Value)
+	if err != nil {
+		return err
+	}
+
+	id := acc.AddTrackingMetricGroup(metrics)
+	k.messages[id] = msg
+
+	return nil
+}
+
+// onDelivery marks the offset of the message behind a delivered tracking ID and
+// stops tracking it; an unknown ID is only logged, never marked with a nil message.
+func (k *Kafka) onDelivery(track telegraf.DeliveryInfo) {
+	msg, ok := k.messages[track.ID()]
+	if !ok {
+		log.Printf("E! [inputs.kafka_consumer] Could not mark message delivered: %d", track.ID())
+	} else if track.Delivered() {
+		k.markOffset(msg)
+	}
+	delete(k.messages, track.ID())
+}
+
 func (k *Kafka) Stop() {
-	k.Lock()
-	defer k.Unlock()
-	close(k.done)
-	if err := k.Cluster.Close(); err != nil {
-		k.acc.AddError(fmt.Errorf("Error closing consumer: %s\n", err.Error()))
+	k.cancel()
+	k.wg.Wait()
+
+	if err := k.cluster.Close(); err != nil {
+		log.Printf("E! [inputs.kafka_consumer] Error closing consumer: %v", err)
 	}
 }

@@ -243,6 +290,8 @@ func (k *Kafka) Gather(acc telegraf.Accumulator) error {

 func init() {
 	inputs.Add("kafka_consumer", func() telegraf.Input {
-		return &Kafka{}
+		return &Kafka{
+			MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
+		}
 	})
 }
--- a/plugins/inputs/kafka_consumer/kafka_consumer_integration_test.go
+++ b/plugins/inputs/kafka_consumer/kafka_consumer_integration_test.go
@@ -38,7 +38,6 @@ func TestReadsMetricsFromKafka(t *testing.T) {
 		ConsumerGroup: "telegraf_test_consumers",
 		Topics:        []string{testTopic},
 		Brokers:       brokerPeers,
-		PointBuffer:   100000,
 		Offset:        "oldest",
 	}
 	p, _ := parsers.NewInfluxParser()
--- a/plugins/inputs/kafka_consumer/kafka_consumer_test.go
+++ b/plugins/inputs/kafka_consumer/kafka_consumer_test.go
@@ -1,13 +1,14 @@
 package kafka_consumer

 import (
+	"context"
 	"strings"
 	"testing"

+	"github.com/Shopify/sarama"
+	"github.com/influxdata/telegraf"
 	"github.com/influxdata/telegraf/plugins/parsers"
 	"github.com/influxdata/telegraf/testutil"
-
-	"github.com/Shopify/sarama"
 	"github.com/stretchr/testify/assert"
 )

@@ -18,31 +19,57 @@ const (
 	invalidMsg      = "cpu_load_short,host=server01 1422568543702900257\n"
 )

-func newTestKafka() (*Kafka, chan *sarama.ConsumerMessage) {
-	in := make(chan *sarama.ConsumerMessage, 1000)
-	k := Kafka{
-		ConsumerGroup:   "test",
-		Topics:          []string{"telegraf"},
-		Brokers:         []string{"localhost:9092"},
-		Offset:          "oldest",
-		in:              in,
-		doNotCommitMsgs: true,
-		errs:            make(chan error, 1000),
-		done:            make(chan struct{}),
+type TestConsumer struct {
+	errors   chan error
+	messages chan *sarama.ConsumerMessage
+}
+
+func (c *TestConsumer) Errors() <-chan error {
+	return c.errors
+}
+
+func (c *TestConsumer) Messages() <-chan *sarama.ConsumerMessage {
+	return c.messages
+}
+
+func (c *TestConsumer) MarkOffset(msg *sarama.ConsumerMessage, metadata string) {
+}
+
+func (c *TestConsumer) Close() error {
+	return nil
+}
+
+func (c *TestConsumer) Inject(msg *sarama.ConsumerMessage) {
+	c.messages <- msg
+}
+
+func newTestKafka() (*Kafka, *TestConsumer) {
+	consumer := &TestConsumer{
+		errors:   make(chan error),
+		messages: make(chan *sarama.ConsumerMessage, 1000),
 	}
-	return &k, in
+	k := Kafka{
+		cluster:                consumer,
+		ConsumerGroup:          "test",
+		Topics:                 []string{"telegraf"},
+		Brokers:                []string{"localhost:9092"},
+		Offset:                 "oldest",
+		MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
+		doNotCommitMsgs:        true,
+		messages:               make(map[telegraf.TrackingID]*sarama.ConsumerMessage),
+	}
+	return &k, consumer
 }

 // Test that the parser parses kafka messages into points
 func TestRunParser(t *testing.T) {
-	k, in := newTestKafka()
+	k, consumer := newTestKafka()
 	acc := testutil.Accumulator{}
-	k.acc = &acc
-	defer close(k.done)
+	ctx := context.Background()

 	k.parser, _ = parsers.NewInfluxParser()
-	go k.receiver()
-	in <- saramaMsg(testMsg)
+	go k.receiver(ctx, &acc)
+	consumer.Inject(saramaMsg(testMsg))
 	acc.Wait(1)

 	assert.Equal(t, acc.NFields(), 1)
@@ -50,14 +77,13 @@ func TestRunParser(t *testing.T) {

 // Test that the parser ignores invalid messages
 func TestRunParserInvalidMsg(t *testing.T) {
-	k, in := newTestKafka()
+	k, consumer := newTestKafka()
 	acc := testutil.Accumulator{}
-	k.acc = &acc
-	defer close(k.done)
+	ctx := context.Background()

 	k.parser, _ = parsers.NewInfluxParser()
-	go k.receiver()
-	in <- saramaMsg(invalidMsg)
+	go k.receiver(ctx, &acc)
+	consumer.Inject(saramaMsg(invalidMsg))
 	acc.WaitError(1)

 	assert.Equal(t, acc.NFields(), 0)
@@ -66,15 +92,14 @@ func TestRunParserInvalidMsg(t *testing.T) {
 // Test that overlong messages are dropped
 func TestDropOverlongMsg(t *testing.T) {
 	const maxMessageLen = 64 * 1024
-	k, in := newTestKafka()
+	k, consumer := newTestKafka()
 	k.MaxMessageLen = maxMessageLen
 	acc := testutil.Accumulator{}
-	k.acc = &acc
-	defer close(k.done)
+	ctx := context.Background()
 	overlongMsg := strings.Repeat("v", maxMessageLen+1)

-	go k.receiver()
-	in <- saramaMsg(overlongMsg)
+	go k.receiver(ctx, &acc)
+	consumer.Inject(saramaMsg(overlongMsg))
 	acc.WaitError(1)

 	assert.Equal(t, acc.NFields(), 0)
@@ -82,14 +107,13 @@ func TestDropOverlongMsg(t *testing.T) {

 // Test that the parser parses kafka messages into points
 func TestRunParserAndGather(t *testing.T) {
-	k, in := newTestKafka()
+	k, consumer := newTestKafka()
 	acc := testutil.Accumulator{}
-	k.acc = &acc
-	defer close(k.done)
+	ctx := context.Background()

 	k.parser, _ = parsers.NewInfluxParser()
-	go k.receiver()
-	in <- saramaMsg(testMsg)
+	go k.receiver(ctx, &acc)
+	consumer.Inject(saramaMsg(testMsg))
 	acc.Wait(1)

 	acc.GatherError(k.Gather)
@@ -101,14 +125,13 @@ func TestRunParserAndGather(t *testing.T) {

 // Test that the parser parses kafka messages into points
 func TestRunParserAndGatherGraphite(t *testing.T) {
-	k, in := newTestKafka()
+	k, consumer := newTestKafka()
 	acc := testutil.Accumulator{}
-	k.acc = &acc
-	defer close(k.done)
+	ctx := context.Background()

 	k.parser, _ = parsers.NewGraphiteParser("_", []string{}, nil)
-	go k.receiver()
-	in <- saramaMsg(testMsgGraphite)
+	go k.receiver(ctx, &acc)
+	consumer.Inject(saramaMsg(testMsgGraphite))
 	acc.Wait(1)

 	acc.GatherError(k.Gather)
@@ -120,17 +143,16 @@ func TestRunParserAndGatherGraphite(t *testing.T) {

 // Test that the parser parses kafka messages into points
 func TestRunParserAndGatherJSON(t *testing.T) {
-	k, in := newTestKafka()
+	k, consumer := newTestKafka()
 	acc := testutil.Accumulator{}
-	k.acc = &acc
-	defer close(k.done)
+	ctx := context.Background()

 	k.parser, _ = parsers.NewParser(&parsers.Config{
 		DataFormat: "json",
 		MetricName: "kafka_json_test",
 	})
-	go k.receiver()
-	in <- saramaMsg(testMsgJSON)
+	go k.receiver(ctx, &acc)
+	consumer.Inject(saramaMsg(testMsgJSON))
 	acc.Wait(1)

 	acc.GatherError(k.Gather)