Remove outputs blocking inputs when output is slow (#4938)

2018-11-05 13:34:28 -08:00
parent 74667cd681
commit 6e5c2f8bb6
59 changed files with 3615 additions and 2189 deletions
--- a/plugins/inputs/nsq_consumer/README.md
+++ b/plugins/inputs/nsq_consumer/README.md
@@ -1,9 +1,9 @@
 # NSQ Consumer Input Plugin

-The [NSQ](http://nsq.io/) consumer plugin polls a specified NSQD
-topic and adds messages to InfluxDB. This plugin allows a message to be in any of the supported `data_format` types.
+The [NSQ][nsq] consumer plugin reads from NSQD and creates metrics using one
+of the supported [input data formats][].

-## Configuration
+### Configuration:

 ```toml
 # Read metrics from NSQD topic(s)
@@ -18,6 +18,16 @@ topic and adds messages to InfluxDB. This plugin allows a message to be in any o
  channel = "consumer"
  max_in_flight = 100

+  ## Maximum messages to read from the broker that have not been written by an
+  ## output.  For best throughput set based on the number of metrics within
+  ## each message and the size of the output's metric_batch_size.
+  ##
+  ## For example, if each message from the queue contains 10 metrics and the
+  ## output metric_batch_size is 1000, setting this to 100 will ensure that a
+  ## full batch is collected and the write is triggered immediately without
+  ## waiting until the next flush_interval.
+  # max_undelivered_messages = 1000
+
  ## Data format to consume.
  ## Each data format has its own unique set of configuration options, read
  ## more about them here:
@@ -25,5 +35,5 @@ topic and adds messages to InfluxDB. This plugin allows a message to be in any o
  data_format = "influx"
 ```

-## Testing
-The `nsq_consumer_test` mocks out the interaction with `NSQD`. It requires no outside dependencies.
+[nsq]: https://nsq.io
+[input data formats]: /docs/DATA_FORMATS_INPUT.md
--- a/plugins/inputs/nsq_consumer/nsq_consumer.go
+++ b/plugins/inputs/nsq_consumer/nsq_consumer.go
@@ -1,7 +1,9 @@
 package nsq_consumer

 import (
-	"fmt"
+	"context"
+	"log"
+	"sync"

 	"github.com/influxdata/telegraf"
 	"github.com/influxdata/telegraf/plugins/inputs"
@@ -9,17 +11,38 @@ import (
 	nsq "github.com/nsqio/go-nsq"
 )

+const (
+	defaultMaxUndeliveredMessages = 1000 // default for MaxUndeliveredMessages, applied in init()
+)
+
+type empty struct{}       // zero-size token carried by a semaphore
+type semaphore chan empty // channel of tokens; Start sizes it to MaxUndeliveredMessages
+
+type logger struct{} // handed to n.consumer.SetLogger in Start
+
+func (l *logger) Output(calldepth int, s string) error {
+	log.Println("D! [inputs.nsq_consumer] " + s) // calldepth is unused
+	return nil
+}
+
 //NSQConsumer represents the configuration of the plugin
 type NSQConsumer struct {
-	Server      string
-	Nsqd        []string
-	Nsqlookupd  []string
-	Topic       string
-	Channel     string
-	MaxInFlight int
-	parser      parsers.Parser
-	consumer    *nsq.Consumer
-	acc         telegraf.Accumulator
+	Server      string   `toml:"server"`
+	Nsqd        []string `toml:"nsqd"`
+	Nsqlookupd  []string `toml:"nsqlookupd"`
+	Topic       string   `toml:"topic"`
+	Channel     string   `toml:"channel"`
+	MaxInFlight int      `toml:"max_in_flight"`
+
+	MaxUndeliveredMessages int `toml:"max_undelivered_messages"` // sizes the tracking accumulator, semaphore and messages map in Start
+
+	parser   parsers.Parser
+	consumer *nsq.Consumer
+
+	mu       sync.Mutex                           // guards messages
+	messages map[telegraf.TrackingID]*nsq.Message // messages awaiting a delivery report, keyed by tracking ID
+	wg       sync.WaitGroup                       // tracks the onDelivery goroutine; Stop waits on it
+	cancel   context.CancelFunc                   // set in Start, called by Stop
 }

 var sampleConfig = `
@@ -33,6 +56,16 @@ var sampleConfig = `
  channel = "consumer"
  max_in_flight = 100

+  ## Maximum messages to read from the broker that have not been written by an
+  ## output.  For best throughput set based on the number of metrics within
+  ## each message and the size of the output's metric_batch_size.
+  ##
+  ## For example, if each message from the queue contains 10 metrics and the
+  ## output metric_batch_size is 1000, setting this to 100 will ensure that a
+  ## full batch is collected and the write is triggered immediately without
+  ## waiting until the next flush_interval.
+  # max_undelivered_messages = 1000
+
  ## Data format to consume.
  ## Each data format has its own unique set of configuration options, read
  ## more about them here:
@@ -40,12 +73,6 @@ var sampleConfig = `
  data_format = "influx"
 `

-func init() {
-	inputs.Add("nsq_consumer", func() telegraf.Input {
-		return &NSQConsumer{}
-	})
-}
-
 // SetParser takes the data_format from the config and finds the right parser for that format
 func (n *NSQConsumer) SetParser(parser parsers.Parser) {
 	n.parser = parser
@@ -62,32 +89,88 @@ func (n *NSQConsumer) Description() string {
 }

 // Start pulls data from nsq
-func (n *NSQConsumer) Start(acc telegraf.Accumulator) error {
-	n.acc = acc
+func (n *NSQConsumer) Start(ac telegraf.Accumulator) error {
+	acc := ac.WithTracking(n.MaxUndeliveredMessages) // its Delivered() channel is drained by onDelivery
+	sem := make(semaphore, n.MaxUndeliveredMessages) // one slot per message awaiting a delivery report
+	n.messages = make(map[telegraf.TrackingID]*nsq.Message, n.MaxUndeliveredMessages)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	n.cancel = cancel // invoked by Stop
+
 	n.connect()
-	n.consumer.AddConcurrentHandlers(nsq.HandlerFunc(func(message *nsq.Message) error {
+	n.consumer.SetLogger(&logger{}, nsq.LogLevelInfo)
+	n.consumer.AddHandler(nsq.HandlerFunc(func(message *nsq.Message) error {
 		metrics, err := n.parser.Parse(message.Body)
 		if err != nil {
-			acc.AddError(fmt.Errorf("E! NSQConsumer Parse Error\nmessage:%s\nerror:%s", string(message.Body), err.Error()))
+			acc.AddError(err)
+			// Remove the message from the queue
+			message.Finish()
 			return nil
 		}
-		for _, metric := range metrics {
-			n.acc.AddFields(metric.Name(), metric.Fields(), metric.Tags(), metric.Time())
+		if len(metrics) == 0 {
+			message.Finish() // nothing to track, so acknowledge right away
+			return nil
 		}
-		message.Finish()
+
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case sem <- empty{}: // blocks while MaxUndeliveredMessages slots are all taken
+			break
+		}
+
+		n.mu.Lock()
+		id := acc.AddTrackingMetricGroup(metrics)
+		n.messages[id] = message // looked up by onDelivery when this group is reported
+		n.mu.Unlock()
+		message.DisableAutoResponse() // onDelivery calls Finish or Requeue on the message instead
 		return nil
-	}), n.MaxInFlight)
+	}))

 	if len(n.Nsqlookupd) > 0 {
 		n.consumer.ConnectToNSQLookupds(n.Nsqlookupd)
 	}
 	n.consumer.ConnectToNSQDs(append(n.Nsqd, n.Server))
+
+	n.wg.Add(1) // Stop waits for the goroutine started below
+	go func() {
+		defer n.wg.Done()
+		n.onDelivery(ctx, acc, sem)
+	}()
 	return nil
 }

+func (n *NSQConsumer) onDelivery(ctx context.Context, acc telegraf.TrackingAccumulator, sem semaphore) {
+	for {
+		select {
+		case <-ctx.Done():
+			return // Stop cancelled the context created in Start
+		case info := <-acc.Delivered():
+			n.mu.Lock()
+			msg, ok := n.messages[info.ID()]
+			if !ok {
+				n.mu.Unlock()
+				continue // no message recorded under this tracking ID
+			}
+			<-sem // free the slot the handler took for this message
+			delete(n.messages, info.ID())
+			n.mu.Unlock()
+
+			if info.Delivered() {
+				msg.Finish()
+			} else {
+				msg.Requeue(-1) // not delivered: hand it back to NSQ; -1 presumably lets go-nsq choose the delay (confirm in go-nsq docs)
+			}
+		}
+	}
+}
+
 // Stop processing messages
 func (n *NSQConsumer) Stop() {
+	n.cancel()  // cancels the context created in Start
+	n.wg.Wait() // waits for the onDelivery goroutine to return
 	n.consumer.Stop()
+	<-n.consumer.StopChan // presumably blocks until go-nsq reports the consumer stopped (confirm in go-nsq docs)
 }

 // Gather is a noop
@@ -107,3 +190,11 @@ func (n *NSQConsumer) connect() error {
 	}
 	return nil
 }
+
+func init() {
+	inputs.Add("nsq_consumer", func() telegraf.Input {
+		return &NSQConsumer{
+			MaxUndeliveredMessages: defaultMaxUndeliveredMessages, // default; field is tagged toml:"max_undelivered_messages"
+		}
+	})
+}
--- a/plugins/inputs/nsq_consumer/nsq_consumer_test.go
+++ b/plugins/inputs/nsq_consumer/nsq_consumer_test.go
@@ -36,11 +36,12 @@ func TestReadsMetricsFromNSQ(t *testing.T) {
 	newMockNSQD(script, addr.String())

 	consumer := &NSQConsumer{
-		Server:      "127.0.0.1:4155",
-		Topic:       "telegraf",
-		Channel:     "consume",
-		MaxInFlight: 1,
-		Nsqd:        []string{"127.0.0.1:4155"},
+		Server:                 "127.0.0.1:4155",
+		Topic:                  "telegraf",
+		Channel:                "consume",
+		MaxInFlight:            1,
+		MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
+		Nsqd: []string{"127.0.0.1:4155"},
 	}

 	p, _ := parsers.NewInfluxParser()