Retry restarting receiver on PubSub service error (#5458)
This commit is contained in:
parent
c234ba291e
commit
0a01713bcc
|
@ -26,6 +26,12 @@ and creates metrics using one of the supported [input data formats][].
|
||||||
## Application Default Credentials, which is preferred.
|
## Application Default Credentials, which is preferred.
|
||||||
# credentials_file = "path/to/my/creds.json"
|
# credentials_file = "path/to/my/creds.json"
|
||||||
|
|
||||||
|
## Optional. Number of seconds to wait before attempting to restart the
|
||||||
|
## PubSub subscription receiver after an unexpected error.
|
||||||
|
## If the streaming pull for a PubSub Subscription fails (receiver),
|
||||||
|
## the agent attempts to restart receiving messages after this many seconds.
|
||||||
|
# retry_delay_seconds = 5
|
||||||
|
|
||||||
## Optional. Maximum byte length of a message to consume.
|
## Optional. Maximum byte length of a message to consume.
|
||||||
## Larger messages are dropped with an error. If less than 0 or unspecified,
|
## Larger messages are dropped with an error. If less than 0 or unspecified,
|
||||||
## treated as no limit.
|
## treated as no limit.
|
||||||
|
|
|
@ -12,14 +12,19 @@ import (
|
||||||
"github.com/influxdata/telegraf/plugins/parsers"
|
"github.com/influxdata/telegraf/plugins/parsers"
|
||||||
"golang.org/x/oauth2/google"
|
"golang.org/x/oauth2/google"
|
||||||
"google.golang.org/api/option"
|
"google.golang.org/api/option"
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
type empty struct{}
|
type empty struct{}
|
||||||
type semaphore chan empty
|
type semaphore chan empty
|
||||||
|
|
||||||
const defaultMaxUndeliveredMessages = 1000
|
const defaultMaxUndeliveredMessages = 1000
|
||||||
|
const defaultRetryDelaySeconds = 5
|
||||||
|
|
||||||
type PubSub struct {
|
type PubSub struct {
|
||||||
|
sync.Mutex
|
||||||
|
|
||||||
CredentialsFile string `toml:"credentials_file"`
|
CredentialsFile string `toml:"credentials_file"`
|
||||||
Project string `toml:"project"`
|
Project string `toml:"project"`
|
||||||
Subscription string `toml:"subscription"`
|
Subscription string `toml:"subscription"`
|
||||||
|
@ -33,6 +38,7 @@ type PubSub struct {
|
||||||
// Agent settings
|
// Agent settings
|
||||||
MaxMessageLen int `toml:"max_message_len"`
|
MaxMessageLen int `toml:"max_message_len"`
|
||||||
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
|
MaxUndeliveredMessages int `toml:"max_undelivered_messages"`
|
||||||
|
RetryReceiveDelaySeconds int `toml:"retry_delay_seconds"`
|
||||||
|
|
||||||
sub subscription
|
sub subscription
|
||||||
stubSub func() subscription
|
stubSub func() subscription
|
||||||
|
@ -42,7 +48,6 @@ type PubSub struct {
|
||||||
parser parsers.Parser
|
parser parsers.Parser
|
||||||
wg *sync.WaitGroup
|
wg *sync.WaitGroup
|
||||||
acc telegraf.TrackingAccumulator
|
acc telegraf.TrackingAccumulator
|
||||||
mu sync.Mutex
|
|
||||||
|
|
||||||
undelivered map[telegraf.TrackingID]message
|
undelivered map[telegraf.TrackingID]message
|
||||||
sem semaphore
|
sem semaphore
|
||||||
|
@ -78,35 +83,36 @@ func (ps *PubSub) Start(ac telegraf.Accumulator) error {
|
||||||
return fmt.Errorf(`"project" is required`)
|
return fmt.Errorf(`"project" is required`)
|
||||||
}
|
}
|
||||||
|
|
||||||
cctx, cancel := context.WithCancel(context.Background())
|
ps.sem = make(semaphore, ps.MaxUndeliveredMessages)
|
||||||
|
ps.acc = ac.WithTracking(ps.MaxUndeliveredMessages)
|
||||||
|
|
||||||
|
// Create top-level context with cancel that will be called on Stop().
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
ps.cancel = cancel
|
ps.cancel = cancel
|
||||||
|
|
||||||
if ps.stubSub != nil {
|
if ps.stubSub != nil {
|
||||||
ps.sub = ps.stubSub()
|
ps.sub = ps.stubSub()
|
||||||
} else {
|
} else {
|
||||||
subRef, err := ps.getGCPSubscription(cctx, ps.Subscription)
|
subRef, err := ps.getGCPSubscription(ps.Subscription)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return fmt.Errorf("unable to create subscription handle: %v", err)
|
||||||
}
|
}
|
||||||
ps.sub = subRef
|
ps.sub = subRef
|
||||||
}
|
}
|
||||||
|
|
||||||
ps.wg = &sync.WaitGroup{}
|
ps.wg = &sync.WaitGroup{}
|
||||||
ps.acc = ac.WithTracking(ps.MaxUndeliveredMessages)
|
|
||||||
ps.sem = make(semaphore, ps.MaxUndeliveredMessages)
|
|
||||||
|
|
||||||
// Start receiver in new goroutine for each subscription.
|
|
||||||
ps.wg.Add(1)
|
|
||||||
go func() {
|
|
||||||
defer ps.wg.Done()
|
|
||||||
ps.subReceive(cctx)
|
|
||||||
}()
|
|
||||||
|
|
||||||
// Start goroutine to handle delivery notifications from accumulator.
|
// Start goroutine to handle delivery notifications from accumulator.
|
||||||
ps.wg.Add(1)
|
ps.wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer ps.wg.Done()
|
defer ps.wg.Done()
|
||||||
ps.receiveDelivered(cctx)
|
ps.waitForDelivery(ctx)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Start goroutine for subscription receiver.
|
||||||
|
ps.wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer ps.wg.Done()
|
||||||
|
ps.receiveWithRetry(ctx)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -119,13 +125,41 @@ func (ps *PubSub) Stop() {
|
||||||
ps.wg.Wait()
|
ps.wg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ps *PubSub) subReceive(cctx context.Context) {
|
// receiveWithRetry is called within a goroutine and manages keeping a
|
||||||
|
// subscription.Receive() up and running while the plugin has not been stopped.
|
||||||
|
func (ps *PubSub) receiveWithRetry(parentCtx context.Context) {
|
||||||
|
err := ps.startReceiver(parentCtx)
|
||||||
|
|
||||||
|
for err != nil && parentCtx.Err() == nil {
|
||||||
|
log.Printf("E! [inputs.cloud_pubsub] Receiver for subscription %s exited with error: %v", ps.sub.ID(), err)
|
||||||
|
|
||||||
|
delay := defaultRetryDelaySeconds
|
||||||
|
if ps.RetryReceiveDelaySeconds > 0 {
|
||||||
|
delay = ps.RetryReceiveDelaySeconds
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("I! [inputs.cloud_pubsub] Waiting %d seconds before attempting to restart receiver...", delay)
|
||||||
|
time.Sleep(time.Duration(delay) * time.Second)
|
||||||
|
|
||||||
|
err = ps.startReceiver(parentCtx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ps *PubSub) startReceiver(parentCtx context.Context) error {
|
||||||
|
log.Printf("I! [inputs.cloud_pubsub] Starting receiver for subscription %s...", ps.sub.ID())
|
||||||
|
cctx, ccancel := context.WithCancel(parentCtx)
|
||||||
err := ps.sub.Receive(cctx, func(ctx context.Context, msg message) {
|
err := ps.sub.Receive(cctx, func(ctx context.Context, msg message) {
|
||||||
if err := ps.onMessage(ctx, msg); err != nil {
|
if err := ps.onMessage(ctx, msg); err != nil {
|
||||||
ps.acc.AddError(fmt.Errorf("unable to add message from subscription %s: %v", ps.sub.ID(), err))
|
ps.acc.AddError(fmt.Errorf("unable to add message from subscription %s: %v", ps.sub.ID(), err))
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
ps.acc.AddError(fmt.Errorf("receiver for subscription %s exited: %v", ps.sub.ID(), err))
|
ps.acc.AddError(fmt.Errorf("receiver for subscription %s exited: %v", ps.sub.ID(), err))
|
||||||
|
} else {
|
||||||
|
log.Printf("I! [inputs.cloud_pubsub] subscription pull ended (no error, most likely stopped)")
|
||||||
|
}
|
||||||
|
ccancel()
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// onMessage handles parsing and adding a received message to the accumulator.
|
// onMessage handles parsing and adding a received message to the accumulator.
|
||||||
|
@ -153,8 +187,8 @@ func (ps *PubSub) onMessage(ctx context.Context, msg message) error {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
ps.mu.Lock()
|
ps.Lock()
|
||||||
defer ps.mu.Unlock()
|
defer ps.Unlock()
|
||||||
|
|
||||||
id := ps.acc.AddTrackingMetricGroup(metrics)
|
id := ps.acc.AddTrackingMetricGroup(metrics)
|
||||||
if ps.undelivered == nil {
|
if ps.undelivered == nil {
|
||||||
|
@ -165,10 +199,10 @@ func (ps *PubSub) onMessage(ctx context.Context, msg message) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ps *PubSub) receiveDelivered(ctx context.Context) {
|
func (ps *PubSub) waitForDelivery(parentCtx context.Context) {
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-parentCtx.Done():
|
||||||
return
|
return
|
||||||
case info := <-ps.acc.Delivered():
|
case info := <-ps.acc.Delivered():
|
||||||
<-ps.sem
|
<-ps.sem
|
||||||
|
@ -182,8 +216,8 @@ func (ps *PubSub) receiveDelivered(ctx context.Context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ps *PubSub) removeDelivered(id telegraf.TrackingID) message {
|
func (ps *PubSub) removeDelivered(id telegraf.TrackingID) message {
|
||||||
ps.mu.Lock()
|
ps.Lock()
|
||||||
defer ps.mu.Unlock()
|
defer ps.Unlock()
|
||||||
|
|
||||||
msg, ok := ps.undelivered[id]
|
msg, ok := ps.undelivered[id]
|
||||||
if !ok {
|
if !ok {
|
||||||
|
@ -219,7 +253,7 @@ func (ps *PubSub) getPubSubClient() (*pubsub.Client, error) {
|
||||||
return client, nil
|
return client, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ps *PubSub) getGCPSubscription(ctx context.Context, subId string) (subscription, error) {
|
func (ps *PubSub) getGCPSubscription(subId string) (subscription, error) {
|
||||||
client, err := ps.getPubSubClient()
|
client, err := ps.getPubSubClient()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -262,6 +296,12 @@ const sampleConfig = `
|
||||||
## Application Default Credentials, which is preferred.
|
## Application Default Credentials, which is preferred.
|
||||||
# credentials_file = "path/to/my/creds.json"
|
# credentials_file = "path/to/my/creds.json"
|
||||||
|
|
||||||
|
## Optional. Number of seconds to wait before attempting to restart the
|
||||||
|
## PubSub subscription receiver after an unexpected error.
|
||||||
|
## If the streaming pull for a PubSub Subscription fails (receiver),
|
||||||
|
## the agent attempts to restart receiving messages after this many seconds.
|
||||||
|
# retry_delay_seconds = 5
|
||||||
|
|
||||||
## Optional. Maximum byte length of a message to consume.
|
## Optional. Maximum byte length of a message to consume.
|
||||||
## Larger messages are dropped with an error. If less than 0 or unspecified,
|
## Larger messages are dropped with an error. If less than 0 or unspecified,
|
||||||
## treated as no limit.
|
## treated as no limit.
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package cloud_pubsub
|
package cloud_pubsub
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"github.com/influxdata/telegraf/plugins/parsers"
|
"github.com/influxdata/telegraf/plugins/parsers"
|
||||||
"github.com/influxdata/telegraf/testutil"
|
"github.com/influxdata/telegraf/testutil"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
@ -21,6 +22,7 @@ func TestRunParse(t *testing.T) {
|
||||||
id: subId,
|
id: subId,
|
||||||
messages: make(chan *testMsg, 100),
|
messages: make(chan *testMsg, 100),
|
||||||
}
|
}
|
||||||
|
sub.receiver = testMessagesReceive(sub)
|
||||||
|
|
||||||
ps := &PubSub{
|
ps := &PubSub{
|
||||||
parser: testParser,
|
parser: testParser,
|
||||||
|
@ -62,6 +64,7 @@ func TestRunInvalidMessages(t *testing.T) {
|
||||||
id: subId,
|
id: subId,
|
||||||
messages: make(chan *testMsg, 100),
|
messages: make(chan *testMsg, 100),
|
||||||
}
|
}
|
||||||
|
sub.receiver = testMessagesReceive(sub)
|
||||||
|
|
||||||
ps := &PubSub{
|
ps := &PubSub{
|
||||||
parser: testParser,
|
parser: testParser,
|
||||||
|
@ -107,6 +110,7 @@ func TestRunOverlongMessages(t *testing.T) {
|
||||||
id: subId,
|
id: subId,
|
||||||
messages: make(chan *testMsg, 100),
|
messages: make(chan *testMsg, 100),
|
||||||
}
|
}
|
||||||
|
sub.receiver = testMessagesReceive(sub)
|
||||||
|
|
||||||
ps := &PubSub{
|
ps := &PubSub{
|
||||||
parser: testParser,
|
parser: testParser,
|
||||||
|
@ -141,6 +145,41 @@ func TestRunOverlongMessages(t *testing.T) {
|
||||||
assert.Equal(t, acc.NFields(), 0)
|
assert.Equal(t, acc.NFields(), 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunErrorInSubscriber(t *testing.T) {
|
||||||
|
subId := "sub-unexpected-error"
|
||||||
|
|
||||||
|
acc := &testutil.Accumulator{}
|
||||||
|
|
||||||
|
testParser, _ := parsers.NewInfluxParser()
|
||||||
|
|
||||||
|
sub := &stubSub{
|
||||||
|
id: subId,
|
||||||
|
messages: make(chan *testMsg, 100),
|
||||||
|
}
|
||||||
|
fakeErrStr := "a fake error"
|
||||||
|
sub.receiver = testMessagesError(sub, errors.New("a fake error"))
|
||||||
|
|
||||||
|
ps := &PubSub{
|
||||||
|
parser: testParser,
|
||||||
|
stubSub: func() subscription { return sub },
|
||||||
|
Project: "projectIDontMatterForTests",
|
||||||
|
Subscription: subId,
|
||||||
|
MaxUndeliveredMessages: defaultMaxUndeliveredMessages,
|
||||||
|
RetryReceiveDelaySeconds: 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := ps.Start(acc); err != nil {
|
||||||
|
t.Fatalf("test PubSub failed to start: %s", err)
|
||||||
|
}
|
||||||
|
defer ps.Stop()
|
||||||
|
|
||||||
|
if ps.sub == nil {
|
||||||
|
t.Fatal("expected plugin subscription to be non-nil")
|
||||||
|
}
|
||||||
|
acc.WaitError(1)
|
||||||
|
assert.Regexp(t, fakeErrStr, acc.Errors[0])
|
||||||
|
}
|
||||||
|
|
||||||
func validateTestInfluxMetric(t *testing.T, m *testutil.Metric) {
|
func validateTestInfluxMetric(t *testing.T, m *testutil.Metric) {
|
||||||
assert.Equal(t, "cpu_load_short", m.Measurement)
|
assert.Equal(t, "cpu_load_short", m.Measurement)
|
||||||
assert.Equal(t, "server01", m.Tags["host"])
|
assert.Equal(t, "server01", m.Tags["host"])
|
||||||
|
|
|
@ -9,6 +9,7 @@ import (
|
||||||
type stubSub struct {
|
type stubSub struct {
|
||||||
id string
|
id string
|
||||||
messages chan *testMsg
|
messages chan *testMsg
|
||||||
|
receiver receiveFunc
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *stubSub) ID() string {
|
func (s *stubSub) ID() string {
|
||||||
|
@ -16,6 +17,19 @@ func (s *stubSub) ID() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *stubSub) Receive(ctx context.Context, f func(context.Context, message)) error {
|
func (s *stubSub) Receive(ctx context.Context, f func(context.Context, message)) error {
|
||||||
|
return s.receiver(ctx, f)
|
||||||
|
}
|
||||||
|
|
||||||
|
type receiveFunc func(ctx context.Context, f func(context.Context, message)) error
|
||||||
|
|
||||||
|
func testMessagesError(s *stubSub, expectedErr error) receiveFunc {
|
||||||
|
return func(ctx context.Context, f func(context.Context, message)) error {
|
||||||
|
return expectedErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func testMessagesReceive(s *stubSub) receiveFunc {
|
||||||
|
return func(ctx context.Context, f func(context.Context, message)) error {
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
@ -25,6 +39,7 @@ func (s *stubSub) Receive(ctx context.Context, f func(context.Context, message))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type testMsg struct {
|
type testMsg struct {
|
||||||
id string
|
id string
|
||||||
|
|
Loading…
Reference in New Issue