telegraf/plugins/inputs/nats_consumer/nats_consumer.go

package natsconsumer

import (
	"context"
	"fmt"
	"log"
	"sync"

	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/plugins/inputs"
	"github.com/influxdata/telegraf/plugins/parsers"
	nats "github.com/nats-io/go-nats"
)

var (
	// defaultMaxUndeliveredMessages is the MaxUndeliveredMessages value given
	// to consumers created by the plugin factory registered in init().
	defaultMaxUndeliveredMessages = 1000
)

// empty is the zero-size token stored in a semaphore.
type empty struct{}

// semaphore is a counting semaphore built on a buffered channel: sending an
// empty value acquires a slot and receiving one releases it. The receiver
// loop uses it to bound the number of messages that are in flight.
type semaphore chan empty

// natsError wraps an error passed to natsErrHandler together with the
// connection and subscription it was reported for, so that Error() can
// include them in the message.
type natsError struct {
	conn *nats.Conn
	sub  *nats.Subscription
	err  error
}

// Error implements the error interface. The message consists of the wrapped
// error text followed by the connected server URL, the server id, and the
// subject and queue of the subscription the error was reported for.
//
// The subject and queue are rendered as empty strings when no subscription
// is attached, instead of dereferencing a nil pointer.
// NOTE(review): the NATS client is believed to invoke the error handler with
// a nil subscription for connection-level errors (e.g. permission
// violations) — confirm against the go-nats version in use.
func (e natsError) Error() string {
	var subject, queue string
	if e.sub != nil {
		subject, queue = e.sub.Subject, e.sub.Queue
	}

	return fmt.Sprintf("%s url:%s id:%s sub:%s queue:%s",
		e.err.Error(), e.conn.ConnectedUrl(), e.conn.ConnectedServerId(), subject, queue)
}

// natsConsumer is the nats_consumer input. The exported fields are filled in
// from the plugin's TOML configuration; the unexported fields hold runtime
// state that Start sets up and Stop/clean tear down:
//
//   - conn, subs: the NATS connection and the subscriptions created on it
//   - parser:     decodes message payloads into metrics (set via SetParser)
//   - in, errs:   hand messages and asynchronous errors to the receiver loop
//   - acc:        tracking accumulator that metrics and errors are added to
//   - wg, cancel: used by Stop to cancel the receiver's context and wait
type natsConsumer struct {
	// Subscription and connection settings.
	QueueGroup string   `toml:"queue_group"`
	Subjects   []string `toml:"subjects"`
	Servers    []string `toml:"servers"`
	Secure     bool     `toml:"secure"`

	// Client pending limits:
	// applied to every subscription through SetPendingLimits in Start.
	PendingMessageLimit int `toml:"pending_message_limit"`
	PendingBytesLimit   int `toml:"pending_bytes_limit"`

	// MaxUndeliveredMessages is passed to the accumulator's WithTracking and
	// is also the capacity of the receiver's semaphore.
	MaxUndeliveredMessages int `toml:"max_undelivered_messages"`

	// Legacy metric buffer support; deprecated in v0.10.3
	// Nothing else in this file reads this field.
	MetricBuffer int

	conn *nats.Conn
	subs []*nats.Subscription

	parser parsers.Parser
	// channel for all incoming NATS messages
	in chan *nats.Msg
	// channel for all NATS read errors
	errs   chan error
	acc    telegraf.TrackingAccumulator
	wg     sync.WaitGroup
	cancel context.CancelFunc
}

// sampleConfig is the example configuration text returned by SampleConfig.
// The values shown for servers, subjects and queue_group match the defaults
// assigned by the factory in init().
var sampleConfig = `
  ## urls of NATS servers
  servers = ["nats://localhost:4222"]
  ## Use Transport Layer Security
  secure = false
  ## subject(s) to consume
  subjects = ["telegraf"]
  ## name a queue group
  queue_group = "telegraf_consumers"

  ## Sets the limits for pending msgs and bytes for each subscription
  ## These shouldn't need to be adjusted except in very high throughput scenarios
  # pending_message_limit = 65536
  # pending_bytes_limit = 67108864

  ## Maximum messages to read from the broker that have not been written by an
  ## output.  For best throughput set based on the number of metrics within
  ## each message and the size of the output's metric_batch_size.
  ##
  ## For example, if each message from the queue contains 10 metrics and the
  ## output metric_batch_size is 1000, setting this to 100 will ensure that a
  ## full batch is collected and the write is triggered immediately without
  ## waiting until the next flush_interval.
  # max_undelivered_messages = 1000

  ## Data format to consume.
  ## Each data format has its own unique set of configuration options, read
  ## more about them here:
  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
  data_format = "influx"
`

// SampleConfig returns the example configuration text for this plugin.
func (n *natsConsumer) SampleConfig() string {
	return sampleConfig
}

// Description returns a one-line summary of what the plugin does.
func (n *natsConsumer) Description() string {
	return "Read metrics from NATS subject(s)"
}

// SetParser stores the parser that the receiver loop uses to turn each
// message payload into metrics.
func (n *natsConsumer) SetParser(parser parsers.Parser) {
	n.parser = parser
}

// natsErrHandler is the callback Start registers on the connection with
// SetErrorHandler. It wraps the reported error together with its connection
// and subscription and offers it to the receiver loop over n.errs.
//
// The hand-off never blocks: when nothing is ready to receive on n.errs the
// error is dropped.
func (n *natsConsumer) natsErrHandler(c *nats.Conn, s *nats.Subscription, e error) {
	report := natsError{conn: c, sub: s, err: e}

	select {
	case n.errs <- report:
		// handed to the receiver loop
	default:
		// no reader ready; discard rather than stall the caller
	}
}

// Start the nats consumer. Caller must call *natsConsumer.Stop() to clean up.
//
// Start wraps acc in a tracking accumulator limited to
// MaxUndeliveredMessages. Unless a live connection already exists it then
// connects to the configured servers (with unlimited reconnect attempts) and
// queue-subscribes to every configured subject. Finally it launches the
// receiver goroutine that parses incoming messages into metrics.
//
// It returns the first error hit while connecting, subscribing, flushing or
// applying the pending limits. If subscribing fails the freshly opened
// connection is closed again, so that a later Start begins from a clean state
// instead of reusing a half-subscribed connection.
func (n *natsConsumer) Start(acc telegraf.Accumulator) error {
	n.acc = acc.WithTracking(n.MaxUndeliveredMessages)

	if n.conn == nil || n.conn.IsClosed() {
		// set default NATS connection options
		opts := nats.DefaultOptions

		// override max reconnection tries
		opts.MaxReconnect = -1

		// use the configured servers and TLS setting
		opts.Servers = n.Servers
		opts.Secure = n.Secure

		var connectErr error
		n.conn, connectErr = opts.Connect()
		if connectErr != nil {
			return connectErr
		}

		// Setup message and error channels
		n.errs = make(chan error)
		n.conn.SetErrorHandler(n.natsErrHandler)
		n.in = make(chan *nats.Msg, 1000)

		// Subscriptions recorded for an earlier connection are no longer
		// usable; forget them so clean() only unsubscribes the ones that
		// belong to this connection.
		n.subs = nil

		if err := n.subscribeAll(); err != nil {
			n.conn.Close()
			return err
		}
	}

	ctx, cancel := context.WithCancel(context.Background())
	n.cancel = cancel

	// Start the message reader. The receiver runs directly inside the
	// goroutine tracked by the WaitGroup, so that wg.Wait() in Stop does not
	// return until the receiver has actually exited.
	n.wg.Add(1)
	go func() {
		defer n.wg.Done()
		n.receiver(ctx)
	}()

	log.Printf("I! Started the NATS consumer service, nats: %v, subjects: %v, queue: %v\n",
		n.conn.ConnectedUrl(), n.Subjects, n.QueueGroup)

	return nil
}

// subscribeAll queue-subscribes to every configured subject on the current
// connection, applies the pending limits and records each subscription in
// n.subs. It stops at, and returns, the first error.
func (n *natsConsumer) subscribeAll() error {
	for _, subj := range n.Subjects {
		sub, err := n.conn.QueueSubscribe(subj, n.QueueGroup, func(m *nats.Msg) {
			n.in <- m
		})
		if err != nil {
			return err
		}
		// ensure that the subscription has been processed by the server
		if err = n.conn.Flush(); err != nil {
			return err
		}
		// set the subscription pending limits
		if err = sub.SetPendingLimits(n.PendingMessageLimit, n.PendingBytesLimit); err != nil {
			return err
		}
		n.subs = append(n.subs, sub)
	}
	return nil
}

// receiver() reads all incoming messages from NATS, and parses them into
// telegraf metrics.
//
// It loops until ctx is cancelled. A semaphore with MaxUndeliveredMessages
// slots bounds the number of messages in flight: a slot is acquired before a
// message is taken from n.in and is released again when the accumulator
// signals on Delivered(), or immediately if the message fails to parse.
// Errors arriving on n.errs are forwarded to the accumulator.
func (n *natsConsumer) receiver(ctx context.Context) {
	sem := make(semaphore, n.MaxUndeliveredMessages)

	for {
		select {
		case <-ctx.Done():
			return
		case <-n.acc.Delivered():
			// a tracked group was delivered: release its slot
			<-sem
		case err := <-n.errs:
			n.acc.AddError(err)
		case sem <- empty{}:
			// A slot was acquired (only possible while the semaphore is not
			// full). Wait for whichever event comes next and decide what
			// happens to that slot.
			select {
			case <-ctx.Done():
				return
			case err := <-n.errs:
				// no message was consumed: give the slot back
				<-sem
				n.acc.AddError(err)
			case <-n.acc.Delivered():
				// release two slots: the one just acquired (no message was
				// consumed) and the one held by the delivered group
				<-sem
				<-sem
			case msg := <-n.in:
				metrics, err := n.parser.Parse(msg.Data)
				if err != nil {
					n.acc.AddError(fmt.Errorf("subject: %s, error: %s", msg.Subject, err.Error()))
					// nothing will be tracked for this message: give the
					// slot back
					<-sem
					continue
				}

				// the slot stays held until Delivered() fires for this group
				n.acc.AddTrackingMetricGroup(metrics)
			}
		}
	}
}

// clean unsubscribes every recorded subscription and then closes the
// connection if it is still open. Unsubscribe failures are reported to the
// accumulator and do not stop the remaining cleanup.
//
// The recorded subscriptions are forgotten afterwards, so calling clean
// again (for example after a later Start/Stop cycle) does not try to
// unsubscribe them a second time.
func (n *natsConsumer) clean() {
	for _, sub := range n.subs {
		if err := sub.Unsubscribe(); err != nil {
			n.acc.AddError(fmt.Errorf("error unsubscribing from subject %s in queue %s: %s",
				sub.Subject, sub.Queue, err.Error()))
		}
	}
	n.subs = nil

	if n.conn != nil && !n.conn.IsClosed() {
		n.conn.Close()
	}
}

// Stop cancels the context handed to the receiver, waits on the WaitGroup
// registered by Start, and then unsubscribes and closes the connection via
// clean().
//
// It is safe to call on a consumer whose Start never got as far as creating
// the context: the cancel step is skipped when no cancel function was set.
func (n *natsConsumer) Stop() {
	if n.cancel != nil {
		n.cancel()
	}
	n.wg.Wait()
	n.clean()
}

// Gather does nothing and always returns nil; acc is ignored. Metrics are
// added to the accumulator given to Start by the receiver loop instead.
func (n *natsConsumer) Gather(acc telegraf.Accumulator) error {
	return nil
}

func init() {
	inputs.Add("nats_consumer", func() telegraf.Input {
		return &natsConsumer{
			Servers:                []string{"nats://localhost:4222"},
			Secure:                 false,
			Subjects:               []string{"telegraf"},
			QueueGroup:             "telegraf_consumers",
			PendingBytesLimit:      nats.DefaultSubPendingBytesLimit,
			PendingMessageLimit:    nats.DefaultSubPendingMsgsLimit,
			MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
		}
	})
}
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`package natsconsumer`

			`import (`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`"context"`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`"fmt"`
			`"log"`
			`"sync"`

			`"github.com/influxdata/telegraf"`
			`"github.com/influxdata/telegraf/plugins/inputs"`
			`"github.com/influxdata/telegraf/plugins/parsers"`
Use nats-io/go-nats instead of nats-io/nats 2018-06-11 22:24:45 +00:00			`nats "github.com/nats-io/go-nats"`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`)`

Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`var (`
			`defaultMaxUndeliveredMessages = 1000`
			`)`

			`type empty struct{}`
			`type semaphore chan empty`

Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`type natsError struct {`
			`conn *nats.Conn`
			`sub *nats.Subscription`
			`err error`
			`}`

			`func (e natsError) Error() string {`
			`return fmt.Sprintf("%s url:%s id:%s sub:%s queue:%s",`
			`e.err.Error(), e.conn.ConnectedUrl(), e.conn.ConnectedServerId(), e.sub.Subject, e.sub.Queue)`
			`}`

			`type natsConsumer struct {`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			QueueGroup string `toml:"queue_group"`
			Subjects []string `toml:"subjects"`
			Servers []string `toml:"servers"`
			Secure bool `toml:"secure"`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`// Client pending limits:`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			PendingMessageLimit int `toml:"pending_message_limit"`
			PendingBytesLimit int `toml:"pending_bytes_limit"`

			MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00
Add deprecation version for MetricBuffer 2018-10-19 22:46:20 +00:00			`// Legacy metric buffer support; deprecated in v0.10.3`
Change point_buffer to metric_buffer to conform will changes in https://github.com/influxdata/telegraf/pull/676 closes #680 2016-02-12 10:05:33 +00:00			`MetricBuffer int`
Flush based on buffer size rather than time this includes: - Add Accumulator to the Start() function of service inputs - For message consumer plugins, use the Accumulator to constantly add metrics and make Gather a dummy function - rework unit tests to match this new behavior. - make "flush_buffer_when_full" a config option that defaults to true closes #666 2016-02-16 00:21:38 +00:00
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`conn *nats.Conn`
			`subs []*nats.Subscription`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`parser parsers.Parser`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`// channel for all incoming NATS messages`
			`in chan *nats.Msg`
			`// channel for all NATS read errors`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`errs chan error`
			`acc telegraf.TrackingAccumulator`
			`wg sync.WaitGroup`
			`cancel context.CancelFunc`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`}`

			var sampleConfig = `
Seems to be a toml parse bug around triple pounds 2016-02-18 21:26:51 +00:00			`## urls of NATS servers`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`servers = ["nats://localhost:4222"]`
Seems to be a toml parse bug around triple pounds 2016-02-18 21:26:51 +00:00			`## Use Transport Layer Security`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`secure = false`
Seems to be a toml parse bug around triple pounds 2016-02-18 21:26:51 +00:00			`## subject(s) to consume`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`subjects = ["telegraf"]`
Seems to be a toml parse bug around triple pounds 2016-02-18 21:26:51 +00:00			`## name a queue group`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`queue_group = "telegraf_consumers"`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00
			`## Sets the limits for pending msgs and bytes for each subscription`
			`## These shouldn't need to be adjusted except in very high throughput scenarios`
			`# pending_message_limit = 65536`
			`# pending_bytes_limit = 67108864`
Flush based on buffer size rather than time this includes: - Add Accumulator to the Start() function of service inputs - For message consumer plugins, use the Accumulator to constantly add metrics and make Gather a dummy function - rework unit tests to match this new behavior. - make "flush_buffer_when_full" a config option that defaults to true closes #666 2016-02-16 00:21:38 +00:00
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`## Maximum messages to read from the broker that have not been written by an`
			`## output. For best throughput set based on the number of metrics within`
			`## each message and the size of the output's metric_batch_size.`
			`##`
			`## For example, if each message from the queue contains 10 metrics and the`
			`## output metric_batch_size is 1000, setting this to 100 will ensure that a`
			`## full batch is collected and the write is triggered immediately without`
			`## waiting until the next flush_interval.`
			`# max_undelivered_messages = 1000`

Cleanup & standardize config file changes: - -sample-config will now comment out all but a few default plugins. - config file parse errors will output path to bad conf file. - cleanup 80-char line-length and some other style issues. - default package conf file will now have all plugins, but commented out. closes #199 closes #944 2016-03-31 23:50:24 +00:00			`## Data format to consume.`
Fix grammar 2017-04-27 21:59:18 +00:00			`## Each data format has its own unique set of configuration options, read`
Seems to be a toml parse bug around triple pounds 2016-02-18 21:26:51 +00:00			`## more about them here:`
			`## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`data_format = "influx"`
			`

			`func (n *natsConsumer) SampleConfig() string {`
			`return sampleConfig`
			`}`

			`func (n *natsConsumer) Description() string {`
			`return "Read metrics from NATS subject(s)"`
			`}`

			`func (n *natsConsumer) SetParser(parser parsers.Parser) {`
			`n.parser = parser`
			`}`

			`func (n natsConsumer) natsErrHandler(c nats.Conn, s *nats.Subscription, e error) {`
			`select {`
			`case n.errs <- natsError{conn: c, sub: s, err: e}:`
			`default:`
			`return`
			`}`
			`}`

			`// Start the nats consumer. Caller must call *natsConsumer.Stop() to clean up.`
Flush based on buffer size rather than time this includes: - Add Accumulator to the Start() function of service inputs - For message consumer plugins, use the Accumulator to constantly add metrics and make Gather a dummy function - rework unit tests to match this new behavior. - make "flush_buffer_when_full" a config option that defaults to true closes #666 2016-02-16 00:21:38 +00:00			`func (n *natsConsumer) Start(acc telegraf.Accumulator) error {`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`n.acc = acc.WithTracking(n.MaxUndeliveredMessages)`
Flush based on buffer size rather than time this includes: - Add Accumulator to the Start() function of service inputs - For message consumer plugins, use the Accumulator to constantly add metrics and make Gather a dummy function - rework unit tests to match this new behavior. - make "flush_buffer_when_full" a config option that defaults to true closes #666 2016-02-16 00:21:38 +00:00
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`var connectErr error`

Fix NATS plug-ins reconnection logic (#1955) * NATS output plug-in now retries to reconnect forever after a lost connection. * NATS input plug-in now retries to reconnect forever after a lost connection. * Fixes #1953 2016-10-26 14:45:33 +00:00			`// set default NATS connection options`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`opts := nats.DefaultOptions`
Fix NATS plug-ins reconnection logic (#1955) * NATS output plug-in now retries to reconnect forever after a lost connection. * NATS input plug-in now retries to reconnect forever after a lost connection. * Fixes #1953 2016-10-26 14:45:33 +00:00
			`// override max reconnection tries`
			`opts.MaxReconnect = -1`

			`// override servers if any were specified`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`opts.Servers = n.Servers`
Fix NATS plug-ins reconnection logic (#1955) * NATS output plug-in now retries to reconnect forever after a lost connection. * NATS input plug-in now retries to reconnect forever after a lost connection. * Fixes #1953 2016-10-26 14:45:33 +00:00
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`opts.Secure = n.Secure`

Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`if n.conn == nil \|\| n.conn.IsClosed() {`
			`n.conn, connectErr = opts.Connect()`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`if connectErr != nil {`
			`return connectErr`
			`}`

			`// Setup message and error channels`
			`n.errs = make(chan error)`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`n.conn.SetErrorHandler(n.natsErrHandler)`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`n.in = make(chan *nats.Msg, 1000)`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`for _, subj := range n.Subjects {`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`sub, err := n.conn.QueueSubscribe(subj, n.QueueGroup, func(m *nats.Msg) {`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`n.in <- m`
			`})`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`if err != nil {`
			`return err`
			`}`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`// ensure that the subscription has been processed by the server`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`if err = n.conn.Flush(); err != nil {`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`return err`
			`}`
			`// set the subscription pending limits`
			`if err = sub.SetPendingLimits(n.PendingMessageLimit, n.PendingBytesLimit); err != nil {`
			`return err`
			`}`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`n.subs = append(n.subs, sub)`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`}`
			`}`

Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`ctx, cancel := context.WithCancel(context.Background())`
			`n.cancel = cancel`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00
			`// Start the message reader`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`n.wg.Add(1)`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`go func() {`
			`defer n.wg.Done()`
			`go n.receiver(ctx)`
			`}()`

Major Logging Overhaul in this commit: - centralize logging output handler. - set global Info/Debug/Error log levels based on config file or flags. - remove per-plugin debug arg handling. - add a I!, D!, or E! to every log message. - add configuration option to specify where to send logs. closes #1786 2016-09-30 21:37:56 +00:00			`log.Printf("I! Started the NATS consumer service, nats: %v, subjects: %v, queue: %v\n",`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`n.conn.ConnectedUrl(), n.Subjects, n.QueueGroup)`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00
			`return nil`
			`}`

			`// receiver() reads all incoming messages from NATS, and parses them into`
Change point_buffer to metric_buffer to conform will changes in https://github.com/influxdata/telegraf/pull/676 closes #680 2016-02-12 10:05:33 +00:00			`// telegraf metrics.`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`func (n *natsConsumer) receiver(ctx context.Context) {`
			`sem := make(semaphore, n.MaxUndeliveredMessages)`

Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`for {`
			`select {`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`case <-ctx.Done():`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`return`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`case <-n.acc.Delivered():`
			`<-sem`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`case err := <-n.errs:`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`n.acc.AddError(err)`
			`case sem <- empty{}:`
			`select {`
			`case <-ctx.Done():`
			`return`
			`case err := <-n.errs:`
			`<-sem`
			`n.acc.AddError(err)`
			`case <-n.acc.Delivered():`
			`<-sem`
			`<-sem`
			`case msg := <-n.in:`
			`metrics, err := n.parser.Parse(msg.Data)`
			`if err != nil {`
			`n.acc.AddError(fmt.Errorf("subject: %s, error: %s", msg.Subject, err.Error()))`
			`<-sem`
			`continue`
			`}`

			`n.acc.AddTrackingMetricGroup(metrics)`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`}`
			`}`
			`}`
			`}`

			`func (n *natsConsumer) clean() {`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`for _, sub := range n.subs {`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`if err := sub.Unsubscribe(); err != nil {`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`n.acc.AddError(fmt.Errorf("Error unsubscribing from subject %s in queue %s: %s\n",`
remove sleep from tests (#2555) 2017-03-24 19:03:36 +00:00			`sub.Subject, sub.Queue, err.Error()))`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`}`
			`}`

Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`if n.conn != nil && !n.conn.IsClosed() {`
			`n.conn.Close()`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`}`
			`}`

			`func (n *natsConsumer) Stop() {`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`n.cancel()`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`n.wg.Wait()`
			`n.clean()`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`}`

			`func (n *natsConsumer) Gather(acc telegraf.Accumulator) error {`
			`return nil`
			`}`

			`func init() {`
			`inputs.Add("nats_consumer", func() telegraf.Input {`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`return &natsConsumer{`
Remove outputs blocking inputs when output is slow (#4938) 2018-11-05 21:34:28 +00:00			`Servers: []string{"nats://localhost:4222"},`
			`Secure: false,`
			`Subjects: []string{"telegraf"},`
			`QueueGroup: "telegraf_consumers",`
			`PendingBytesLimit: nats.DefaultSubPendingBytesLimit,`
			`PendingMessageLimit: nats.DefaultSubPendingMsgsLimit,`
			`MaxUndeliveredMessages: defaultMaxUndeliveredMessages,`
nats_consumer: buffer incoming messages fixes #1956 2016-10-26 15:38:56 +00:00			`}`
Add NATS consumer input plugin. 2016-02-10 23:28:52 +00:00			`})`
			`}`