Remove outputs blocking inputs when output is slow (#4938)
This commit is contained in:
@@ -1,18 +1,14 @@
|
||||
# Kafka Consumer Input Plugin
|
||||
|
||||
The [Kafka](http://kafka.apache.org/) consumer plugin polls a specified Kafka
|
||||
topic and adds messages to InfluxDB. The plugin assumes messages follow the
|
||||
line protocol. [Consumer Group](http://godoc.org/github.com/wvanbergen/kafka/consumergroup)
|
||||
is used to talk to the Kafka cluster so multiple instances of telegraf can read
|
||||
from the same topic in parallel.
|
||||
The [Kafka][kafka] consumer plugin reads from Kafka
|
||||
and creates metrics using one of the supported [input data formats][].
|
||||
|
||||
For old kafka version (< 0.8), please use the kafka_consumer_legacy input plugin
|
||||
For old Kafka versions (< 0.8), please use the [kafka_consumer_legacy][] input plugin
|
||||
and use the old zookeeper connection method.
|
||||
|
||||
## Configuration
|
||||
### Configuration
|
||||
|
||||
```toml
|
||||
# Read metrics from Kafka topic(s)
|
||||
[[inputs.kafka_consumer]]
|
||||
## kafka servers
|
||||
brokers = ["localhost:9092"]
|
||||
@@ -44,18 +40,27 @@ and use the old zookeeper connection method.
|
||||
## Offset (must be either "oldest" or "newest")
|
||||
offset = "oldest"
|
||||
|
||||
## Maximum length of a message to consume, in bytes (default 0/unlimited);
|
||||
## larger messages are dropped
|
||||
max_message_len = 1000000
|
||||
|
||||
## Maximum messages to read from the broker that have not been written by an
|
||||
## output. For best throughput set based on the number of metrics within
|
||||
## each message and the size of the output's metric_batch_size.
|
||||
##
|
||||
## For example, if each message from the queue contains 10 metrics and the
|
||||
## output metric_batch_size is 1000, setting this to 100 will ensure that a
|
||||
## full batch is collected and the write is triggered immediately without
|
||||
## waiting until the next flush_interval.
|
||||
# max_undelivered_messages = 1000
|
||||
|
||||
## Data format to consume.
|
||||
## Each data format has its own unique set of configuration options, read
|
||||
## more about them here:
|
||||
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
|
||||
data_format = "influx"
|
||||
|
||||
## Maximum length of a message to consume, in bytes (default 0/unlimited);
|
||||
## larger messages are dropped
|
||||
max_message_len = 1000000
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
Running integration tests requires running Zookeeper & Kafka. See Makefile
|
||||
for kafka container command.
|
||||
[kafka]: https://kafka.apache.org
|
||||
[kafka_consumer_legacy]: /plugins/inputs/kafka_consumer_legacy/README.md
|
||||
[input data formats]: /docs/DATA_FORMATS_INPUT.md
|
||||
|
||||
@@ -1,55 +1,54 @@
|
||||
package kafka_consumer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/Shopify/sarama"
|
||||
cluster "github.com/bsm/sarama-cluster"
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/internal/tls"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
"github.com/influxdata/telegraf/plugins/parsers"
|
||||
|
||||
"github.com/Shopify/sarama"
|
||||
cluster "github.com/bsm/sarama-cluster"
|
||||
)
|
||||
|
||||
// defaultMaxUndeliveredMessages is the value given to
// Kafka.MaxUndeliveredMessages when a new plugin instance is created.
const defaultMaxUndeliveredMessages = 1000
|
||||
|
||||
type (
	// empty is a zero-size token; it carries no data and is only used as
	// the element type of semaphore.
	empty struct{}

	// semaphore is a channel of empty tokens. The receiver loop creates one
	// with a capacity of MaxUndeliveredMessages, sends a token to take a
	// slot and receives a token to give the slot back.
	semaphore chan empty
)
|
||||
|
||||
type Consumer interface {
|
||||
Errors() <-chan error
|
||||
Messages() <-chan *sarama.ConsumerMessage
|
||||
MarkOffset(msg *sarama.ConsumerMessage, metadata string)
|
||||
Close() error
|
||||
}
|
||||
|
||||
type Kafka struct {
|
||||
ConsumerGroup string
|
||||
ClientID string `toml:"client_id"`
|
||||
Topics []string
|
||||
Brokers []string
|
||||
MaxMessageLen int
|
||||
Version string `toml:"version"`
|
||||
|
||||
Cluster *cluster.Consumer
|
||||
|
||||
ConsumerGroup string `toml:"consumer_group"`
|
||||
ClientID string `toml:"client_id"`
|
||||
Topics []string `toml:"topics"`
|
||||
Brokers []string `toml:"brokers"`
|
||||
MaxMessageLen int `toml:"max_message_len"`
|
||||
Version string `toml:"version"`
|
||||
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
|
||||
Offset string `toml:"offset"`
|
||||
SASLUsername string `toml:"sasl_username"`
|
||||
SASLPassword string `toml:"sasl_password"`
|
||||
tls.ClientConfig
|
||||
|
||||
// SASL Username
|
||||
SASLUsername string `toml:"sasl_username"`
|
||||
// SASL Password
|
||||
SASLPassword string `toml:"sasl_password"`
|
||||
cluster Consumer
|
||||
parser parsers.Parser
|
||||
wg *sync.WaitGroup
|
||||
cancel context.CancelFunc
|
||||
|
||||
// Legacy metric buffer support
|
||||
MetricBuffer int
|
||||
// TODO remove PointBuffer, legacy support
|
||||
PointBuffer int
|
||||
|
||||
Offset string
|
||||
parser parsers.Parser
|
||||
|
||||
sync.Mutex
|
||||
|
||||
// channel for all incoming kafka messages
|
||||
in <-chan *sarama.ConsumerMessage
|
||||
// channel for all kafka consumer errors
|
||||
errs <-chan error
|
||||
done chan struct{}
|
||||
|
||||
// keep the accumulator internally:
|
||||
acc telegraf.Accumulator
|
||||
// Unconfirmed messages
|
||||
messages map[telegraf.TrackingID]*sarama.ConsumerMessage
|
||||
|
||||
// doNotCommitMsgs tells the plugin not to call MarkOffset on the consumer;
|
||||
// this is mostly for test purposes, but there may be a use-case for it later.
|
||||
@@ -86,16 +85,25 @@ var sampleConfig = `
|
||||
consumer_group = "telegraf_metrics_consumers"
|
||||
## Offset (must be either "oldest" or "newest")
|
||||
offset = "oldest"
|
||||
## Maximum length of a message to consume, in bytes (default 0/unlimited);
|
||||
## larger messages are dropped
|
||||
max_message_len = 1000000
|
||||
|
||||
## Maximum messages to read from the broker that have not been written by an
|
||||
## output. For best throughput set based on the number of metrics within
|
||||
## each message and the size of the output's metric_batch_size.
|
||||
##
|
||||
## For example, if each message from the queue contains 10 metrics and the
|
||||
## output metric_batch_size is 1000, setting this to 100 will ensure that a
|
||||
## full batch is collected and the write is triggered immediately without
|
||||
## waiting until the next flush_interval.
|
||||
# max_undelivered_messages = 1000
|
||||
|
||||
## Data format to consume.
|
||||
## Each data format has its own unique set of configuration options, read
|
||||
## more about them here:
|
||||
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
|
||||
data_format = "influx"
|
||||
|
||||
## Maximum length of a message to consume, in bytes (default 0/unlimited);
|
||||
## larger messages are dropped
|
||||
max_message_len = 1000000
|
||||
`
|
||||
|
||||
func (k *Kafka) SampleConfig() string {
|
||||
@@ -111,12 +119,8 @@ func (k *Kafka) SetParser(parser parsers.Parser) {
|
||||
}
|
||||
|
||||
func (k *Kafka) Start(acc telegraf.Accumulator) error {
|
||||
k.Lock()
|
||||
defer k.Unlock()
|
||||
var clusterErr error
|
||||
|
||||
k.acc = acc
|
||||
|
||||
config := cluster.NewConfig()
|
||||
|
||||
if k.Version != "" {
|
||||
@@ -159,13 +163,13 @@ func (k *Kafka) Start(acc telegraf.Accumulator) error {
|
||||
case "newest":
|
||||
config.Consumer.Offsets.Initial = sarama.OffsetNewest
|
||||
default:
|
||||
log.Printf("I! WARNING: Kafka consumer invalid offset '%s', using 'oldest'\n",
|
||||
log.Printf("I! WARNING: Kafka consumer invalid offset '%s', using 'oldest'",
|
||||
k.Offset)
|
||||
config.Consumer.Offsets.Initial = sarama.OffsetOldest
|
||||
}
|
||||
|
||||
if k.Cluster == nil {
|
||||
k.Cluster, clusterErr = cluster.NewConsumer(
|
||||
if k.cluster == nil {
|
||||
k.cluster, clusterErr = cluster.NewConsumer(
|
||||
k.Brokers,
|
||||
k.ConsumerGroup,
|
||||
k.Topics,
|
||||
@@ -173,67 +177,110 @@ func (k *Kafka) Start(acc telegraf.Accumulator) error {
|
||||
)
|
||||
|
||||
if clusterErr != nil {
|
||||
log.Printf("E! Error when creating Kafka Consumer, brokers: %v, topics: %v\n",
|
||||
log.Printf("E! Error when creating Kafka Consumer, brokers: %v, topics: %v",
|
||||
k.Brokers, k.Topics)
|
||||
return clusterErr
|
||||
}
|
||||
|
||||
// Setup message and error channels
|
||||
k.in = k.Cluster.Messages()
|
||||
k.errs = k.Cluster.Errors()
|
||||
}
|
||||
|
||||
k.done = make(chan struct{})
|
||||
// Start the kafka message reader
|
||||
go k.receiver()
|
||||
log.Printf("I! Started the kafka consumer service, brokers: %v, topics: %v\n",
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
k.cancel = cancel
|
||||
|
||||
// Start consumer goroutine
|
||||
k.wg = &sync.WaitGroup{}
|
||||
k.wg.Add(1)
|
||||
go func() {
|
||||
defer k.wg.Done()
|
||||
k.receiver(ctx, acc)
|
||||
}()
|
||||
|
||||
log.Printf("I! Started the kafka consumer service, brokers: %v, topics: %v",
|
||||
k.Brokers, k.Topics)
|
||||
return nil
|
||||
}
|
||||
|
||||
// receiver() reads all incoming messages from the consumer, and parses them into
|
||||
// influxdb metric points.
|
||||
func (k *Kafka) receiver() {
|
||||
func (k *Kafka) receiver(ctx context.Context, ac telegraf.Accumulator) {
|
||||
k.messages = make(map[telegraf.TrackingID]*sarama.ConsumerMessage)
|
||||
|
||||
acc := ac.WithTracking(k.MaxUndeliveredMessages)
|
||||
sem := make(semaphore, k.MaxUndeliveredMessages)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-k.done:
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case err := <-k.errs:
|
||||
if err != nil {
|
||||
k.acc.AddError(fmt.Errorf("Consumer Error: %s\n", err))
|
||||
}
|
||||
case msg := <-k.in:
|
||||
if k.MaxMessageLen != 0 && len(msg.Value) > k.MaxMessageLen {
|
||||
k.acc.AddError(fmt.Errorf("Message longer than max_message_len (%d > %d)",
|
||||
len(msg.Value), k.MaxMessageLen))
|
||||
} else {
|
||||
metrics, err := k.parser.Parse(msg.Value)
|
||||
case track := <-acc.Delivered():
|
||||
<-sem
|
||||
k.onDelivery(track)
|
||||
case err := <-k.cluster.Errors():
|
||||
acc.AddError(err)
|
||||
case sem <- empty{}:
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case track := <-acc.Delivered():
|
||||
// Once for the delivered message, once to leave the case
|
||||
<-sem
|
||||
<-sem
|
||||
k.onDelivery(track)
|
||||
case err := <-k.cluster.Errors():
|
||||
<-sem
|
||||
acc.AddError(err)
|
||||
case msg := <-k.cluster.Messages():
|
||||
err := k.onMessage(acc, msg)
|
||||
if err != nil {
|
||||
k.acc.AddError(fmt.Errorf("Message Parse Error\nmessage: %s\nerror: %s",
|
||||
string(msg.Value), err.Error()))
|
||||
acc.AddError(err)
|
||||
<-sem
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
k.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time())
|
||||
}
|
||||
}
|
||||
|
||||
if !k.doNotCommitMsgs {
|
||||
// TODO(cam) this locking can be removed if this PR gets merged:
|
||||
// https://github.com/wvanbergen/kafka/pull/84
|
||||
k.Lock()
|
||||
k.Cluster.MarkOffset(msg, "")
|
||||
k.Unlock()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (k *Kafka) markOffset(msg *sarama.ConsumerMessage) {
|
||||
if !k.doNotCommitMsgs {
|
||||
k.cluster.MarkOffset(msg, "")
|
||||
}
|
||||
}
|
||||
|
||||
func (k *Kafka) onMessage(acc telegraf.TrackingAccumulator, msg *sarama.ConsumerMessage) error {
|
||||
if k.MaxMessageLen != 0 && len(msg.Value) > k.MaxMessageLen {
|
||||
k.markOffset(msg)
|
||||
return fmt.Errorf("Message longer than max_message_len (%d > %d)",
|
||||
len(msg.Value), k.MaxMessageLen)
|
||||
}
|
||||
|
||||
metrics, err := k.parser.Parse(msg.Value)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
id := acc.AddTrackingMetricGroup(metrics)
|
||||
k.messages[id] = msg
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (k *Kafka) onDelivery(track telegraf.DeliveryInfo) {
|
||||
msg, ok := k.messages[track.ID()]
|
||||
if !ok {
|
||||
log.Printf("E! [inputs.kafka_consumer] Could not mark message delivered: %d", track.ID())
|
||||
}
|
||||
|
||||
if track.Delivered() {
|
||||
k.markOffset(msg)
|
||||
}
|
||||
delete(k.messages, track.ID())
|
||||
}
|
||||
|
||||
func (k *Kafka) Stop() {
|
||||
k.Lock()
|
||||
defer k.Unlock()
|
||||
close(k.done)
|
||||
if err := k.Cluster.Close(); err != nil {
|
||||
k.acc.AddError(fmt.Errorf("Error closing consumer: %s\n", err.Error()))
|
||||
k.cancel()
|
||||
k.wg.Wait()
|
||||
|
||||
if err := k.cluster.Close(); err != nil {
|
||||
log.Printf("E! [inputs.kafka_consumer] Error closing consumer: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,6 +290,8 @@ func (k *Kafka) Gather(acc telegraf.Accumulator) error {
|
||||
|
||||
func init() {
|
||||
inputs.Add("kafka_consumer", func() telegraf.Input {
|
||||
return &Kafka{}
|
||||
return &Kafka{
|
||||
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -38,7 +38,6 @@ func TestReadsMetricsFromKafka(t *testing.T) {
|
||||
ConsumerGroup: "telegraf_test_consumers",
|
||||
Topics: []string{testTopic},
|
||||
Brokers: brokerPeers,
|
||||
PointBuffer: 100000,
|
||||
Offset: "oldest",
|
||||
}
|
||||
p, _ := parsers.NewInfluxParser()
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
package kafka_consumer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/Shopify/sarama"
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/plugins/parsers"
|
||||
"github.com/influxdata/telegraf/testutil"
|
||||
|
||||
"github.com/Shopify/sarama"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
@@ -18,31 +19,57 @@ const (
|
||||
invalidMsg = "cpu_load_short,host=server01 1422568543702900257\n"
|
||||
)
|
||||
|
||||
func newTestKafka() (*Kafka, chan *sarama.ConsumerMessage) {
|
||||
in := make(chan *sarama.ConsumerMessage, 1000)
|
||||
k := Kafka{
|
||||
ConsumerGroup: "test",
|
||||
Topics: []string{"telegraf"},
|
||||
Brokers: []string{"localhost:9092"},
|
||||
Offset: "oldest",
|
||||
in: in,
|
||||
doNotCommitMsgs: true,
|
||||
errs: make(chan error, 1000),
|
||||
done: make(chan struct{}),
|
||||
type TestConsumer struct {
|
||||
errors chan error
|
||||
messages chan *sarama.ConsumerMessage
|
||||
}
|
||||
|
||||
func (c *TestConsumer) Errors() <-chan error {
|
||||
return c.errors
|
||||
}
|
||||
|
||||
func (c *TestConsumer) Messages() <-chan *sarama.ConsumerMessage {
|
||||
return c.messages
|
||||
}
|
||||
|
||||
func (c *TestConsumer) MarkOffset(msg *sarama.ConsumerMessage, metadata string) {
|
||||
}
|
||||
|
||||
func (c *TestConsumer) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *TestConsumer) Inject(msg *sarama.ConsumerMessage) {
|
||||
c.messages <- msg
|
||||
}
|
||||
|
||||
func newTestKafka() (*Kafka, *TestConsumer) {
|
||||
consumer := &TestConsumer{
|
||||
errors: make(chan error),
|
||||
messages: make(chan *sarama.ConsumerMessage, 1000),
|
||||
}
|
||||
return &k, in
|
||||
k := Kafka{
|
||||
cluster: consumer,
|
||||
ConsumerGroup: "test",
|
||||
Topics: []string{"telegraf"},
|
||||
Brokers: []string{"localhost:9092"},
|
||||
Offset: "oldest",
|
||||
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
|
||||
doNotCommitMsgs: true,
|
||||
messages: make(map[telegraf.TrackingID]*sarama.ConsumerMessage),
|
||||
}
|
||||
return &k, consumer
|
||||
}
|
||||
|
||||
// Test that the parser parses kafka messages into points
|
||||
func TestRunParser(t *testing.T) {
|
||||
k, in := newTestKafka()
|
||||
k, consumer := newTestKafka()
|
||||
acc := testutil.Accumulator{}
|
||||
k.acc = &acc
|
||||
defer close(k.done)
|
||||
ctx := context.Background()
|
||||
|
||||
k.parser, _ = parsers.NewInfluxParser()
|
||||
go k.receiver()
|
||||
in <- saramaMsg(testMsg)
|
||||
go k.receiver(ctx, &acc)
|
||||
consumer.Inject(saramaMsg(testMsg))
|
||||
acc.Wait(1)
|
||||
|
||||
assert.Equal(t, acc.NFields(), 1)
|
||||
@@ -50,14 +77,13 @@ func TestRunParser(t *testing.T) {
|
||||
|
||||
// Test that the parser ignores invalid messages
|
||||
func TestRunParserInvalidMsg(t *testing.T) {
|
||||
k, in := newTestKafka()
|
||||
k, consumer := newTestKafka()
|
||||
acc := testutil.Accumulator{}
|
||||
k.acc = &acc
|
||||
defer close(k.done)
|
||||
ctx := context.Background()
|
||||
|
||||
k.parser, _ = parsers.NewInfluxParser()
|
||||
go k.receiver()
|
||||
in <- saramaMsg(invalidMsg)
|
||||
go k.receiver(ctx, &acc)
|
||||
consumer.Inject(saramaMsg(invalidMsg))
|
||||
acc.WaitError(1)
|
||||
|
||||
assert.Equal(t, acc.NFields(), 0)
|
||||
@@ -66,15 +92,14 @@ func TestRunParserInvalidMsg(t *testing.T) {
|
||||
// Test that overlong messages are dropped
|
||||
func TestDropOverlongMsg(t *testing.T) {
|
||||
const maxMessageLen = 64 * 1024
|
||||
k, in := newTestKafka()
|
||||
k, consumer := newTestKafka()
|
||||
k.MaxMessageLen = maxMessageLen
|
||||
acc := testutil.Accumulator{}
|
||||
k.acc = &acc
|
||||
defer close(k.done)
|
||||
ctx := context.Background()
|
||||
overlongMsg := strings.Repeat("v", maxMessageLen+1)
|
||||
|
||||
go k.receiver()
|
||||
in <- saramaMsg(overlongMsg)
|
||||
go k.receiver(ctx, &acc)
|
||||
consumer.Inject(saramaMsg(overlongMsg))
|
||||
acc.WaitError(1)
|
||||
|
||||
assert.Equal(t, acc.NFields(), 0)
|
||||
@@ -82,14 +107,13 @@ func TestDropOverlongMsg(t *testing.T) {
|
||||
|
||||
// Test that the parser parses kafka messages into points
|
||||
func TestRunParserAndGather(t *testing.T) {
|
||||
k, in := newTestKafka()
|
||||
k, consumer := newTestKafka()
|
||||
acc := testutil.Accumulator{}
|
||||
k.acc = &acc
|
||||
defer close(k.done)
|
||||
ctx := context.Background()
|
||||
|
||||
k.parser, _ = parsers.NewInfluxParser()
|
||||
go k.receiver()
|
||||
in <- saramaMsg(testMsg)
|
||||
go k.receiver(ctx, &acc)
|
||||
consumer.Inject(saramaMsg(testMsg))
|
||||
acc.Wait(1)
|
||||
|
||||
acc.GatherError(k.Gather)
|
||||
@@ -101,14 +125,13 @@ func TestRunParserAndGather(t *testing.T) {
|
||||
|
||||
// Test that the parser parses kafka messages into points
|
||||
func TestRunParserAndGatherGraphite(t *testing.T) {
|
||||
k, in := newTestKafka()
|
||||
k, consumer := newTestKafka()
|
||||
acc := testutil.Accumulator{}
|
||||
k.acc = &acc
|
||||
defer close(k.done)
|
||||
ctx := context.Background()
|
||||
|
||||
k.parser, _ = parsers.NewGraphiteParser("_", []string{}, nil)
|
||||
go k.receiver()
|
||||
in <- saramaMsg(testMsgGraphite)
|
||||
go k.receiver(ctx, &acc)
|
||||
consumer.Inject(saramaMsg(testMsgGraphite))
|
||||
acc.Wait(1)
|
||||
|
||||
acc.GatherError(k.Gather)
|
||||
@@ -120,17 +143,16 @@ func TestRunParserAndGatherGraphite(t *testing.T) {
|
||||
|
||||
// Test that the parser parses kafka messages into points
|
||||
func TestRunParserAndGatherJSON(t *testing.T) {
|
||||
k, in := newTestKafka()
|
||||
k, consumer := newTestKafka()
|
||||
acc := testutil.Accumulator{}
|
||||
k.acc = &acc
|
||||
defer close(k.done)
|
||||
ctx := context.Background()
|
||||
|
||||
k.parser, _ = parsers.NewParser(&parsers.Config{
|
||||
DataFormat: "json",
|
||||
MetricName: "kafka_json_test",
|
||||
})
|
||||
go k.receiver()
|
||||
in <- saramaMsg(testMsgJSON)
|
||||
go k.receiver(ctx, &acc)
|
||||
consumer.Inject(saramaMsg(testMsgJSON))
|
||||
acc.Wait(1)
|
||||
|
||||
acc.GatherError(k.Gather)
|
||||
|
||||
Reference in New Issue
Block a user