Fix kafka plugin and rename to kafka_consumer

fixes #371
Cameron Sparr 2015-11-16 13:12:45 -07:00
parent a3feddd8ed
commit 970bfce997
11 changed files with 462 additions and 430 deletions

View File

@@ -1,12 +1,20 @@
## v0.2.3 [unreleased]
### Release Notes
- **breaking change** The `kafka` plugin has been renamed to `kafka_consumer`,
and most of the config option names have changed (see the before/after sketch
below). This only affects the kafka consumer _plugin_ (not the
output). There were a number of problems with the kafka plugin that led to it
only collecting data once at startup, so the plugin was basically
non-functional.
- Riemann output added
### Features
- [#379](https://github.com/influxdb/telegraf/pull/379): Riemann output, thanks @allenj!
- [#375](https://github.com/influxdb/telegraf/pull/375): kafka_consumer service plugin.
### Bugfixes
- [#371](https://github.com/influxdb/telegraf/issues/371): Kafka consumer plugin not functioning.
## v0.2.2 [2015-11-18]
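Editor's note: for anyone migrating, a minimal before/after of the consumer
config. The option names are taken from this commit; the values are
illustrative.

    # OLD (v0.2.2 and earlier)
    [kafka]
    topic = "topic_with_metrics"
    consumerGroupName = "telegraf_metrics_consumers"
    zookeeperPeers = ["localhost:2181"]
    batchSize = 1000

    # NEW (v0.2.3)
    [kafka_consumer]
    topics = ["telegraf"]
    zookeeper_peers = ["localhost:2181"]
    consumer_group = "telegraf_metrics_consumers"
    point_buffer = 100000
    offset = "oldest"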

View File

@@ -164,7 +164,6 @@ Telegraf currently has support for collecting metrics from:
* haproxy
* httpjson (generic JSON-emitting http service plugin)
* jolokia (remote JMX with JSON over HTTP)
* kafka_consumer
* leofs
* lustre2
* memcached
@@ -197,6 +196,7 @@ Telegraf currently has support for collecting metrics from:
Telegraf can collect metrics via the following services:
* statsd
* kafka_consumer
We'll be adding support for many more over the coming months. Read on if you
want to add support for another service or third-party API.
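Editor's note: a service plugin differs from a regular plugin in that it keeps
running between collection intervals. A minimal sketch of the shape such a
plugin takes; the exact plugins.ServicePlugin interface is an assumption
inferred from the Start/Stop/Gather methods that statsd and kafka_consumer
implement in this commit.

    package plugins

    // Sketch only: a service plugin implements the usual plugin methods
    // plus Start/Stop, so it can listen or consume in the background and
    // hand buffered points to Gather on each interval.
    type ServicePlugin interface {
        SampleConfig() string
        Description() string
        Gather(Accumulator) error
        Start() error // launch background listeners/consumers
        Stop()        // shut the background goroutines down
    }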

View File

@@ -8,7 +8,7 @@ import (
"github.com/influxdb/telegraf/plugins"
"github.com/influxdb/telegraf/plugins/exec"
"github.com/influxdb/telegraf/plugins/kafka_consumer"
"github.com/influxdb/telegraf/plugins/memcached"
"github.com/influxdb/telegraf/plugins/procstat"
"github.com/naoina/toml"
"github.com/naoina/toml/ast"
@@ -205,17 +205,14 @@ func TestConfig_parsePlugin(t *testing.T) {
pluginConfigurationFieldsSet: make(map[string][]string),
}
subtbl := tbl.Fields["kafka"].(*ast.Table)
err = c.parsePlugin("kafka", subtbl)
subtbl := tbl.Fields["memcached"].(*ast.Table)
err = c.parsePlugin("memcached", subtbl)
kafka := plugins.Plugins["kafka"]().(*kafka_consumer.Kafka)
kafka.ConsumerGroupName = "telegraf_metrics_consumers"
kafka.Topic = "topic_with_metrics"
kafka.ZookeeperPeers = []string{"test.example.com:2181"}
kafka.BatchSize = 1000
memcached := plugins.Plugins["memcached"]().(*memcached.Memcached)
memcached.Servers = []string{"localhost"}
kConfig := &ConfiguredPlugin{
Name: "kafka",
mConfig := &ConfiguredPlugin{
Name: "memcached",
Drop: []string{"other", "stuff"},
Pass: []string{"some", "strings"},
TagDrop: []TagFilter{
@@ -233,10 +230,10 @@ func TestConfig_parsePlugin(t *testing.T) {
Interval: 5 * time.Second,
}
assert.Equal(t, kafka, c.plugins["kafka"],
"Testdata did not produce a correct kafka struct.")
assert.Equal(t, kConfig, c.pluginConfigurations["kafka"],
"Testdata did not produce correct kafka metadata.")
assert.Equal(t, memcached, c.plugins["memcached"],
"Testdata did not produce a correct memcached struct.")
assert.Equal(t, mConfig, c.pluginConfigurations["memcached"],
"Testdata did not produce correct memcached metadata.")
}
func TestConfig_LoadDirectory(t *testing.T) {
@@ -249,14 +246,11 @@ func TestConfig_LoadDirectory(t *testing.T) {
t.Error(err)
}
kafka := plugins.Plugins["kafka"]().(*kafka_consumer.Kafka)
kafka.ConsumerGroupName = "telegraf_metrics_consumers"
kafka.Topic = "topic_with_metrics"
kafka.ZookeeperPeers = []string{"test.example.com:2181"}
kafka.BatchSize = 10000
memcached := plugins.Plugins["memcached"]().(*memcached.Memcached)
memcached.Servers = []string{"192.168.1.1"}
kConfig := &ConfiguredPlugin{
Name: "kafka",
mConfig := &ConfiguredPlugin{
Name: "memcached",
Drop: []string{"other", "stuff"},
Pass: []string{"some", "strings"},
TagDrop: []TagFilter{
@@ -296,10 +290,10 @@ func TestConfig_LoadDirectory(t *testing.T) {
pConfig := &ConfiguredPlugin{Name: "procstat"}
assert.Equal(t, kafka, c.plugins["kafka"],
"Merged Testdata did not produce a correct kafka struct.")
assert.Equal(t, kConfig, c.pluginConfigurations["kafka"],
"Merged Testdata did not produce correct kafka metadata.")
assert.Equal(t, memcached, c.plugins["memcached"],
"Merged Testdata did not produce a correct memcached struct.")
assert.Equal(t, mConfig, c.pluginConfigurations["memcached"],
"Merged Testdata did not produce correct memcached metadata.")
assert.Equal(t, ex, c.plugins["exec"],
"Merged Testdata did not produce a correct exec struct.")

View File

@@ -29,8 +29,9 @@ type InfluxDB struct {
}
var sampleConfig = `
# The full HTTP or UDP endpoint URL for your InfluxDB instance
# Multiple urls can be specified for InfluxDB cluster support.
# The full HTTP or UDP endpoint URL for your InfluxDB instance.
# Multiple urls can be specified, but it is assumed that they are all part of
# the same cluster; only ONE of the urls will be written to each interval.
# urls = ["udp://localhost:8089"] # UDP endpoint example
urls = ["http://localhost:8086"] # required
# The target database for metrics (telegraf will create it if not exists)
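Editor's note: a minimal sketch of the behavior the new comment describes:
each interval, the output writes to a single member of the cluster rather
than to every url. The random choice and helper name here are illustrative,
not necessarily how the output selects a url.

    package main

    import (
        "fmt"
        "math/rand"
    )

    // pickURL returns one url from the configured list; with several urls
    // for the same cluster, each flush goes to exactly one of them.
    func pickURL(urls []string) string {
        return urls[rand.Intn(len(urls))]
    }

    func main() {
        urls := []string{"http://localhost:8086", "http://localhost:8087"}
        fmt.Println("writing this interval's points to", pickURL(urls))
    }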

View File

@@ -1,36 +1,51 @@
package kafka_consumer
import (
"os"
"os/signal"
"time"
"log"
"strings"
"sync"
"github.com/Shopify/sarama"
"github.com/influxdb/influxdb/models"
"github.com/influxdb/telegraf/plugins"
"github.com/Shopify/sarama"
"github.com/wvanbergen/kafka/consumergroup"
)
type Kafka struct {
ConsumerGroupName string
Topic string
ConsumerGroup string
Topics []string
ZookeeperPeers []string
Consumer *consumergroup.ConsumerGroup
BatchSize int
PointBuffer int
Offset string
sync.Mutex
// channel for all incoming kafka messages
in <-chan *sarama.ConsumerMessage
// channel for all kafka consumer errors
errs <-chan *sarama.ConsumerError
// channel for all incoming parsed kafka points
pointChan chan models.Point
done chan struct{}
// doNotCommitMsgs tells the parser not to call CommitUpTo on the consumer
// this is mostly for test purposes, but there may be a use-case for it later.
doNotCommitMsgs bool
}
var sampleConfig = `
# topic to consume
topic = "topic_with_metrics"
# the name of the consumer group
consumerGroupName = "telegraf_metrics_consumers"
# topic(s) to consume
topics = ["telegraf"]
# an array of Zookeeper connection strings
zookeeperPeers = ["localhost:2181"]
# Batch size of points sent to InfluxDB
batchSize = 1000
zookeeper_peers = ["localhost:2181"]
# the name of the consumer group
consumer_group = "telegraf_metrics_consumers"
# Maximum number of points to buffer between collection intervals
point_buffer = 100000
# Offset (must be either "oldest" or "newest")
offset = "oldest"
`
func (k *Kafka) SampleConfig() string {
@@ -38,127 +53,114 @@ func (k *Kafka) SampleConfig() string {
}
func (k *Kafka) Description() string {
return "read metrics from a Kafka topic"
return "Read line-protocol metrics from Kafka topic(s)"
}
type Metric struct {
Measurement string `json:"measurement"`
Values map[string]interface{} `json:"values"`
Tags map[string]string `json:"tags"`
Time time.Time `json:"time"`
}
func (k *Kafka) Gather(acc plugins.Accumulator) error {
func (k *Kafka) Start() error {
k.Lock()
defer k.Unlock()
var consumerErr error
metricQueue := make(chan []byte, 200)
if k.Consumer == nil {
config := consumergroup.NewConfig()
switch strings.ToLower(k.Offset) {
case "oldest", "":
config.Offsets.Initial = sarama.OffsetOldest
case "newest":
config.Offsets.Initial = sarama.OffsetNewest
default:
log.Printf("WARNING: Kafka consumer invalid offset '%s', using 'oldest'\n",
k.Offset)
config.Offsets.Initial = sarama.OffsetOldest
}
if k.Consumer == nil || k.Consumer.Closed() {
k.Consumer, consumerErr = consumergroup.JoinConsumerGroup(
k.ConsumerGroupName,
[]string{k.Topic},
k.ConsumerGroup,
k.Topics,
k.ZookeeperPeers,
nil,
config,
)
if consumerErr != nil {
return consumerErr
}
c := make(chan os.Signal, 1)
halt := make(chan bool, 1)
signal.Notify(c, os.Interrupt)
go func() {
<-c
halt <- true
emitMetrics(k, acc, metricQueue)
k.Consumer.Close()
}()
go readFromKafka(k.Consumer.Messages(),
metricQueue,
k.BatchSize,
k.Consumer.CommitUpto,
halt)
// Setup message and error channels
k.in = k.Consumer.Messages()
k.errs = k.Consumer.Errors()
}
return emitMetrics(k, acc, metricQueue)
k.done = make(chan struct{})
if k.PointBuffer == 0 {
k.PointBuffer = 100000
}
k.pointChan = make(chan models.Point, k.PointBuffer)
// Start the kafka message reader
go k.parser()
log.Printf("Started the kafka consumer service, peers: %v, topics: %v\n",
k.ZookeeperPeers, k.Topics)
return nil
}
func emitMetrics(k *Kafka, acc plugins.Accumulator, metricConsumer <-chan []byte) error {
timeout := time.After(1 * time.Second)
// parser() reads all incoming messages from the consumer, and parses them into
// influxdb metric points.
func (k *Kafka) parser() {
for {
select {
case batch := <-metricConsumer:
var points []models.Point
var err error
if points, err = models.ParsePoints(batch); err != nil {
return err
case <-k.done:
return
case err := <-k.errs:
log.Printf("Kafka Consumer Error: %s\n", err.Error())
case msg := <-k.in:
points, err := models.ParsePoints(msg.Value)
if err != nil {
log.Printf("Could not parse kafka message: %s, error: %s",
string(msg.Value), err.Error())
}
for _, point := range points {
acc.AddFields(point.Name(), point.Fields(), point.Tags(), point.Time())
select {
case k.pointChan <- point:
continue
default:
log.Printf("Kafka Consumer buffer is full, dropping a point." +
" You may want to increase the point_buffer setting")
}
}
if !k.doNotCommitMsgs {
// TODO(cam) this locking can be removed if this PR gets merged:
// https://github.com/wvanbergen/kafka/pull/84
k.Lock()
k.Consumer.CommitUpto(msg)
k.Unlock()
}
case <-timeout:
return nil
}
}
}
const millisecond = 1000000 * time.Nanosecond
type ack func(*sarama.ConsumerMessage) error
func readFromKafka(
kafkaMsgs <-chan *sarama.ConsumerMessage,
metricProducer chan<- []byte,
maxBatchSize int,
ackMsg ack,
halt <-chan bool,
) {
batch := make([]byte, 0)
currentBatchSize := 0
timeout := time.After(500 * millisecond)
var msg *sarama.ConsumerMessage
for {
select {
case msg = <-kafkaMsgs:
if currentBatchSize != 0 {
batch = append(batch, '\n')
func (k *Kafka) Stop() {
k.Lock()
defer k.Unlock()
close(k.done)
if err := k.Consumer.Close(); err != nil {
log.Printf("Error closing kafka consumer: %s\n", err.Error())
}
}
batch = append(batch, msg.Value...)
currentBatchSize++
if currentBatchSize == maxBatchSize {
metricProducer <- batch
currentBatchSize = 0
batch = make([]byte, 0)
ackMsg(msg)
}
case <-timeout:
if currentBatchSize != 0 {
metricProducer <- batch
currentBatchSize = 0
batch = make([]byte, 0)
ackMsg(msg)
}
timeout = time.After(500 * millisecond)
case <-halt:
if currentBatchSize != 0 {
metricProducer <- batch
ackMsg(msg)
}
return
}
func (k *Kafka) Gather(acc plugins.Accumulator) error {
k.Lock()
defer k.Unlock()
npoints := len(k.pointChan)
for i := 0; i < npoints; i++ {
point := <-k.pointChan
acc.AddFields(point.Name(), point.Fields(), point.Tags(), point.Time())
}
return nil
}
func init() {
plugins.Add("kafka", func() plugins.Plugin {
plugins.Add("kafka_consumer", func() plugins.Plugin {
return &Kafka{}
})
}
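Editor's note: a rough sketch of how the agent is expected to drive the new
service plugin. The driving loop below is hypothetical and simplified; the
field names and Start/Gather/Stop calls match the code above, and
testutil.Accumulator stands in for the agent's real accumulator, as it does
in the tests.

    package main

    import (
        "log"
        "time"

        "github.com/influxdb/telegraf/plugins/kafka_consumer"
        "github.com/influxdb/telegraf/testutil"
    )

    func main() {
        k := &kafka_consumer.Kafka{
            ConsumerGroup:  "telegraf_metrics_consumers",
            Topics:         []string{"telegraf"},
            ZookeeperPeers: []string{"localhost:2181"},
            PointBuffer:    100000,
            Offset:         "oldest",
        }

        // Start once: joins the consumer group and launches the parser
        // goroutine that buffers parsed points between intervals.
        if err := k.Start(); err != nil {
            log.Fatal(err)
        }
        defer k.Stop()

        // Each interval, Gather drains whatever points have buffered since
        // the last call; it never blocks waiting for new messages.
        var acc testutil.Accumulator
        for i := 0; i < 3; i++ {
            time.Sleep(10 * time.Second)
            if err := k.Gather(&acc); err != nil {
                log.Println(err)
            }
        }
    }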

View File

@@ -15,36 +15,48 @@ func TestReadsMetricsFromKafka(t *testing.T) {
if testing.Short() {
t.Skip("Skipping integration test in short mode")
}
var zkPeers, brokerPeers []string
zkPeers = []string{testutil.GetLocalHost() + ":2181"}
brokerPeers = []string{testutil.GetLocalHost() + ":9092"}
k := &Kafka{
ConsumerGroupName: "telegraf_test_consumers",
Topic: fmt.Sprintf("telegraf_test_topic_%d", time.Now().Unix()),
ZookeeperPeers: zkPeers,
}
brokerPeers := []string{testutil.GetLocalHost() + ":9092"}
zkPeers := []string{testutil.GetLocalHost() + ":2181"}
testTopic := fmt.Sprintf("telegraf_test_topic_%d", time.Now().Unix())
// Send a Kafka message to the kafka host
msg := "cpu_load_short,direction=in,host=server01,region=us-west value=23422.0 1422568543702900257"
producer, err := sarama.NewSyncProducer(brokerPeers, nil)
require.NoError(t, err)
_, _, err = producer.SendMessage(&sarama.ProducerMessage{Topic: k.Topic, Value: sarama.StringEncoder(msg)})
_, _, err = producer.SendMessage(
&sarama.ProducerMessage{
Topic: testTopic,
Value: sarama.StringEncoder(msg),
})
require.NoError(t, err)
defer producer.Close()
producer.Close()
// Start the Kafka Consumer
k := &Kafka{
ConsumerGroup: "telegraf_test_consumers",
Topics: []string{testTopic},
ZookeeperPeers: zkPeers,
PointBuffer: 100000,
Offset: "oldest",
}
if err := k.Start(); err != nil {
t.Fatal(err.Error())
} else {
defer k.Stop()
}
waitForPoint(k, t)
// Verify that we can now gather the sent message
var acc testutil.Accumulator
// Sanity check
assert.Equal(t, 0, len(acc.Points), "there should not be any points")
assert.Equal(t, 0, len(acc.Points), "There should not be any points")
// Gather points
err = k.Gather(&acc)
require.NoError(t, err)
assert.Equal(t, 1, len(acc.Points), "there should be a single point")
if len(acc.Points) == 1 {
point := acc.Points[0]
assert.Equal(t, "cpu_load_short", point.Measurement)
assert.Equal(t, map[string]interface{}{"value": 23422.0}, point.Fields)
@@ -54,4 +66,26 @@ func TestReadsMetricsFromKafka(t *testing.T) {
"region": "us-west",
}, point.Tags)
assert.Equal(t, time.Unix(0, 1422568543702900257).Unix(), point.Time.Unix())
} else {
t.Errorf("No points found in accumulator, expected 1")
}
}
// Waits for the metric that was sent to the kafka broker to arrive at the kafka
// consumer
func waitForPoint(k *Kafka, t *testing.T) {
// Give the kafka container up to 5 seconds to get the point to the consumer
ticker := time.NewTicker(5 * time.Millisecond)
counter := 0
for {
select {
case <-ticker.C:
counter++
if counter > 1000 {
t.Fatal("Waited for 5s, point never arrived to consumer")
} else if len(k.pointChan) == 1 {
return
}
}
}
}
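Editor's note: waitForPoint polls on a 5ms ticker and counts fires to build
its 5s timeout. A sketch of the same wait with an explicit deadline, in case
the counter arithmetic is hard to follow; the helper name is hypothetical and
the imports are those already used by this test file.

    // waitForPointDeadline is a hypothetical variant of waitForPoint that
    // makes the timeout explicit instead of counting ticker fires.
    func waitForPointDeadline(k *Kafka, t *testing.T) {
        deadline := time.After(5 * time.Second)
        tick := time.NewTicker(5 * time.Millisecond)
        defer tick.Stop()
        for {
            select {
            case <-deadline:
                t.Fatal("Waited for 5s, point never arrived at the consumer")
            case <-tick.C:
                if len(k.pointChan) == 1 {
                    return
                }
            }
        }
    }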

View File

@@ -1,92 +1,91 @@
package kafka_consumer
import (
"strings"
"testing"
"time"
"github.com/Shopify/sarama"
"github.com/influxdb/influxdb/models"
"github.com/influxdb/telegraf/testutil"
"github.com/Shopify/sarama"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
const testMsg = "cpu_load_short,direction=in,host=server01,region=us-west value=23422.0 1422568543702900257"
const (
testMsg = "cpu_load_short,host=server01 value=23422.0 1422568543702900257"
invalidMsg = "cpu_load_short,host=server01 1422568543702900257"
pointBuffer = 5
)
func TestReadFromKafkaBatchesMsgsOnBatchSize(t *testing.T) {
halt := make(chan bool, 1)
metricChan := make(chan []byte, 1)
kafkaChan := make(chan *sarama.ConsumerMessage, 10)
for i := 0; i < 10; i++ {
kafkaChan <- saramaMsg(testMsg)
func NewTestKafka() (*Kafka, chan *sarama.ConsumerMessage) {
in := make(chan *sarama.ConsumerMessage, pointBuffer)
k := Kafka{
ConsumerGroup: "test",
Topics: []string{"telegraf"},
ZookeeperPeers: []string{"localhost:2181"},
PointBuffer: pointBuffer,
Offset: "oldest",
in: in,
doNotCommitMsgs: true,
errs: make(chan *sarama.ConsumerError, pointBuffer),
done: make(chan struct{}),
pointChan: make(chan models.Point, pointBuffer),
}
expectedBatch := strings.Repeat(testMsg+"\n", 9) + testMsg
readFromKafka(kafkaChan, metricChan, 10, func(msg *sarama.ConsumerMessage) error {
batch := <-metricChan
assert.Equal(t, expectedBatch, string(batch))
halt <- true
return nil
}, halt)
return &k, in
}
func TestReadFromKafkaBatchesMsgsOnTimeout(t *testing.T) {
halt := make(chan bool, 1)
metricChan := make(chan []byte, 1)
kafkaChan := make(chan *sarama.ConsumerMessage, 10)
for i := 0; i < 3; i++ {
kafkaChan <- saramaMsg(testMsg)
}
// Test that the parser parses kafka messages into points
func TestRunParser(t *testing.T) {
k, in := NewTestKafka()
defer close(k.done)
expectedBatch := strings.Repeat(testMsg+"\n", 2) + testMsg
readFromKafka(kafkaChan, metricChan, 10, func(msg *sarama.ConsumerMessage) error {
batch := <-metricChan
assert.Equal(t, expectedBatch, string(batch))
go k.parser()
in <- saramaMsg(testMsg)
time.Sleep(time.Millisecond)
halt <- true
return nil
}, halt)
assert.Equal(t, len(k.pointChan), 1)
}
func TestEmitMetricsSendMetricsToAcc(t *testing.T) {
k := &Kafka{}
var acc testutil.Accumulator
testChan := make(chan []byte, 1)
testChan <- []byte(testMsg)
// Test that the parser ignores invalid messages
func TestRunParserInvalidMsg(t *testing.T) {
k, in := NewTestKafka()
defer close(k.done)
err := emitMetrics(k, &acc, testChan)
require.NoError(t, err)
go k.parser()
in <- saramaMsg(invalidMsg)
time.Sleep(time.Millisecond)
assert.Equal(t, 1, len(acc.Points), "there should be a single point")
point := acc.Points[0]
assert.Equal(t, "cpu_load_short", point.Measurement)
assert.Equal(t, map[string]interface{}{"value": 23422.0}, point.Fields)
assert.Equal(t, map[string]string{
"host": "server01",
"direction": "in",
"region": "us-west",
}, point.Tags)
if time.Unix(0, 1422568543702900257).Unix() != point.Time.Unix() {
t.Errorf("Expected: %v, received %v\n",
time.Unix(0, 1422568543702900257).Unix(),
point.Time.Unix())
}
assert.Equal(t, len(k.pointChan), 0)
}
func TestEmitMetricsTimesOut(t *testing.T) {
k := &Kafka{}
var acc testutil.Accumulator
testChan := make(chan []byte)
// Test that points are dropped when we hit the buffer limit
func TestRunParserRespectsBuffer(t *testing.T) {
k, in := NewTestKafka()
defer close(k.done)
err := emitMetrics(k, &acc, testChan)
require.NoError(t, err)
go k.parser()
for i := 0; i < pointBuffer+1; i++ {
in <- saramaMsg(testMsg)
}
time.Sleep(time.Millisecond)
assert.Equal(t, 0, len(acc.Points), "there should not be a any points")
assert.Equal(t, len(k.pointChan), 5)
}
// Test that points parsed from kafka messages are delivered by Gather
func TestRunParserAndGather(t *testing.T) {
k, in := NewTestKafka()
defer close(k.done)
go k.parser()
in <- saramaMsg(testMsg)
time.Sleep(time.Millisecond)
acc := testutil.Accumulator{}
k.Gather(&acc)
assert.Equal(t, len(acc.Points), 1)
assert.True(t, acc.CheckValue("cpu_load_short", 23422.0))
}
func saramaMsg(val string) *sarama.ConsumerMessage {

View File

@@ -183,8 +183,6 @@ func (s *Statsd) Gather(acc plugins.Accumulator) error {
}
func (s *Statsd) Start() error {
log.Println("Starting up the statsd service")
// Make data structures
s.done = make(chan struct{})
s.in = make(chan string, s.AllowedPendingMessages)
@@ -197,6 +195,7 @@ func (s *Statsd) Start() error {
go s.udpListen()
// Start the line parser
go s.parser()
log.Printf("Started the statsd service on %s\n", s.ServiceAddress)
return nil
}

View File

@@ -1,12 +1,9 @@
[kafka]
topic = "topic_with_metrics"
consumerGroupName = "telegraf_metrics_consumers"
zookeeperPeers = ["test.example.com:2181"]
batchSize = 1000
[memcached]
servers = ["localhost"]
pass = ["some", "strings"]
drop = ["other", "stuff"]
interval = "5s"
[kafka.tagpass]
[memcached.tagpass]
goodtag = ["mytag"]
[kafka.tagdrop]
[memcached.tagdrop]
badtag = ["othertag"]

View File

@@ -1,10 +1,9 @@
[kafka]
zookeeperPeers = ["test.example.com:2181"]
batchSize = 10000
[memcached]
servers = ["192.168.1.1"]
pass = ["some", "strings"]
drop = ["other", "stuff"]
interval = "5s"
[kafka.tagpass]
[memcached.tagpass]
goodtag = ["mytag"]
[kafka.tagdrop]
[memcached.tagdrop]
badtag = ["othertag"]

View File

@@ -171,18 +171,17 @@ urls = ["http://localhost/server-status?auto"]
# no configuration
# read metrics from a Kafka topic
[kafka]
# topic to consume
topic = "topic_with_metrics"
# the name of the consumer group
consumerGroupName = "telegraf_metrics_consumers"
[kafka_consumer]
# topic(s) to consume
topics = ["telegraf"]
# an array of Zookeeper connection strings
zookeeperPeers = ["localhost:2181"]
# Batch size of points sent to InfluxDB
batchSize = 1000
zookeeper_peers = ["localhost:2181"]
# the name of the consumer group
consumer_group = "telegraf_metrics_consumers"
# Maximum number of points to buffer between collection intervals
point_buffer = 100000
# Offset (must be either "oldest" or "newest")
offset = "oldest"
# Read metrics from a LeoFS Server via SNMP
[leofs]
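Editor's note: to smoke-test the new config by hand, a minimal producer
sketch that publishes one line-protocol point, mirroring the integration
test above. The broker address and topic match the sample config defaults;
adjust as needed.

    package main

    import (
        "log"

        "github.com/Shopify/sarama"
    )

    func main() {
        // Line-protocol payload, same shape as the test messages above.
        msg := "cpu_load_short,host=server01 value=23422.0 1422568543702900257"

        producer, err := sarama.NewSyncProducer([]string{"localhost:9092"}, nil)
        if err != nil {
            log.Fatal(err)
        }
        defer producer.Close()

        _, _, err = producer.SendMessage(&sarama.ProducerMessage{
            Topic: "telegraf",
            Value: sarama.StringEncoder(msg),
        })
        if err != nil {
            log.Fatal(err)
        }
    }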