telegraf/plugins/inputs/burrow/burrow.go

486 lines
11 KiB
Go

package burrow
import (
"encoding/json"
"fmt"
"net/http"
"net/url"
"strconv"
"strings"
"sync"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/filter"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/internal/tls"
"github.com/influxdata/telegraf/plugins/inputs"
)
const (
defaultBurrowPrefix = "/v3/kafka"
defaultConcurrentConnections = 20
defaultResponseTimeout = time.Second * 5
defaultServer = "http://localhost:8000"
)
const configSample = `
## Burrow API endpoints in format "schema://host:port".
## Default is "http://localhost:8000".
servers = ["http://localhost:8000"]
## Override Burrow API prefix.
## Useful when Burrow is behind reverse-proxy.
# api_prefix = "/v3/kafka"
## Maximum time to receive response.
# response_timeout = "5s"
## Limit per-server concurrent connections.
## Useful in case of large number of topics or consumer groups.
# concurrent_connections = 20
## Filter clusters, default is no filtering.
## Values can be specified as glob patterns.
# clusters_include = []
# clusters_exclude = []
## Filter consumer groups, default is no filtering.
## Values can be specified as glob patterns.
# groups_include = []
# groups_exclude = []
## Filter topics, default is no filtering.
## Values can be specified as glob patterns.
# topics_include = []
# topics_exclude = []
## Credentials for basic HTTP authentication.
# username = ""
# password = ""
## Optional SSL config
# ssl_ca = "/etc/telegraf/ca.pem"
# ssl_cert = "/etc/telegraf/cert.pem"
# ssl_key = "/etc/telegraf/key.pem"
# insecure_skip_verify = false
`
type (
burrow struct {
tls.ClientConfig
Servers []string
Username string
Password string
ResponseTimeout internal.Duration
ConcurrentConnections int
APIPrefix string `toml:"api_prefix"`
ClustersExclude []string
ClustersInclude []string
GroupsExclude []string
GroupsInclude []string
TopicsExclude []string
TopicsInclude []string
client *http.Client
filterClusters filter.Filter
filterGroups filter.Filter
filterTopics filter.Filter
}
// response
apiResponse struct {
Clusters []string `json:"clusters"`
Groups []string `json:"consumers"`
Topics []string `json:"topics"`
Offsets []int64 `json:"offsets"`
Status apiStatusResponse `json:"status"`
}
// response: status field
apiStatusResponse struct {
Partitions []apiStatusResponseLag `json:"partitions"`
Status string `json:"status"`
PartitionCount int `json:"partition_count"`
Maxlag *apiStatusResponseLag `json:"maxlag"`
TotalLag int64 `json:"totallag"`
}
// response: lag field
apiStatusResponseLag struct {
Topic string `json:"topic"`
Partition int32 `json:"partition"`
Status string `json:"status"`
Start apiStatusResponseLagItem `json:"start"`
End apiStatusResponseLagItem `json:"end"`
CurrentLag int64 `json:"current_lag"`
Owner string `json:"owner"`
}
// response: lag field item
apiStatusResponseLagItem struct {
Offset int64 `json:"offset"`
Timestamp int64 `json:"timestamp"`
Lag int64 `json:"lag"`
}
)
func init() {
inputs.Add("burrow", func() telegraf.Input {
return &burrow{}
})
}
func (b *burrow) SampleConfig() string {
return configSample
}
func (b *burrow) Description() string {
return "Collect Kafka topics and consumers status from Burrow HTTP API."
}
func (b *burrow) Gather(acc telegraf.Accumulator) error {
var wg sync.WaitGroup
if len(b.Servers) == 0 {
b.Servers = []string{defaultServer}
}
if b.client == nil {
b.setDefaults()
if err := b.compileGlobs(); err != nil {
return err
}
c, err := b.createClient()
if err != nil {
return err
}
b.client = c
}
for _, addr := range b.Servers {
u, err := url.Parse(addr)
if err != nil {
acc.AddError(fmt.Errorf("unable to parse address '%s': %s", addr, err))
continue
}
if u.Path == "" {
u.Path = b.APIPrefix
}
wg.Add(1)
go func(u *url.URL) {
defer wg.Done()
acc.AddError(b.gatherServer(u, acc))
}(u)
}
wg.Wait()
return nil
}
func (b *burrow) setDefaults() {
if b.APIPrefix == "" {
b.APIPrefix = defaultBurrowPrefix
}
if b.ConcurrentConnections < 1 {
b.ConcurrentConnections = defaultConcurrentConnections
}
if b.ResponseTimeout.Duration < time.Second {
b.ResponseTimeout = internal.Duration{
Duration: defaultResponseTimeout,
}
}
}
func (b *burrow) compileGlobs() error {
var err error
// compile glob patterns
b.filterClusters, err = filter.NewIncludeExcludeFilter(b.ClustersInclude, b.ClustersExclude)
if err != nil {
return err
}
b.filterGroups, err = filter.NewIncludeExcludeFilter(b.GroupsInclude, b.GroupsExclude)
if err != nil {
return err
}
b.filterTopics, err = filter.NewIncludeExcludeFilter(b.TopicsInclude, b.TopicsExclude)
if err != nil {
return err
}
return nil
}
func (b *burrow) createClient() (*http.Client, error) {
tlsCfg, err := b.ClientConfig.TLSConfig()
if err != nil {
return nil, err
}
client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: tlsCfg,
},
Timeout: b.ResponseTimeout.Duration,
}
return client, nil
}
func (b *burrow) getResponse(u *url.URL) (*apiResponse, error) {
req, err := http.NewRequest(http.MethodGet, u.String(), nil)
if err != nil {
return nil, err
}
if b.Username != "" {
req.SetBasicAuth(b.Username, b.Password)
}
res, err := b.client.Do(req)
if err != nil {
return nil, err
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
return nil, fmt.Errorf("wrong response: %d", res.StatusCode)
}
ares := &apiResponse{}
dec := json.NewDecoder(res.Body)
return ares, dec.Decode(ares)
}
func (b *burrow) gatherServer(src *url.URL, acc telegraf.Accumulator) error {
var wg sync.WaitGroup
r, err := b.getResponse(src)
if err != nil {
return err
}
guard := make(chan struct{}, b.ConcurrentConnections)
for _, cluster := range r.Clusters {
if !b.filterClusters.Match(cluster) {
continue
}
wg.Add(1)
go func(cluster string) {
defer wg.Done()
// fetch topic list
// endpoint: <api_prefix>/(cluster)/topic
ut := appendPathToURL(src, cluster, "topic")
b.gatherTopics(guard, ut, cluster, acc)
}(cluster)
wg.Add(1)
go func(cluster string) {
defer wg.Done()
// fetch consumer group list
// endpoint: <api_prefix>/(cluster)/consumer
uc := appendPathToURL(src, cluster, "consumer")
b.gatherGroups(guard, uc, cluster, acc)
}(cluster)
}
wg.Wait()
return nil
}
func (b *burrow) gatherTopics(guard chan struct{}, src *url.URL, cluster string, acc telegraf.Accumulator) {
var wg sync.WaitGroup
r, err := b.getResponse(src)
if err != nil {
acc.AddError(err)
return
}
for _, topic := range r.Topics {
if !b.filterTopics.Match(topic) {
continue
}
guard <- struct{}{}
wg.Add(1)
go func(topic string) {
defer func() {
<-guard
wg.Done()
}()
// fetch topic offsets
// endpoint: <api_prefix>/<cluster>/topic/<topic>
tu := appendPathToURL(src, topic)
tr, err := b.getResponse(tu)
if err != nil {
acc.AddError(err)
return
}
b.genTopicMetrics(tr, cluster, topic, acc)
}(topic)
}
wg.Wait()
}
func (b *burrow) genTopicMetrics(r *apiResponse, cluster, topic string, acc telegraf.Accumulator) {
for i, offset := range r.Offsets {
tags := map[string]string{
"cluster": cluster,
"topic": topic,
"partition": strconv.Itoa(i),
}
acc.AddFields(
"burrow_topic",
map[string]interface{}{
"offset": offset,
},
tags,
)
}
}
func (b *burrow) gatherGroups(guard chan struct{}, src *url.URL, cluster string, acc telegraf.Accumulator) {
var wg sync.WaitGroup
r, err := b.getResponse(src)
if err != nil {
acc.AddError(err)
return
}
for _, group := range r.Groups {
if !b.filterGroups.Match(group) {
continue
}
guard <- struct{}{}
wg.Add(1)
go func(group string) {
defer func() {
<-guard
wg.Done()
}()
// fetch consumer group status
// endpoint: <api_prefix>/<cluster>/consumer/<group>/lag
gl := appendPathToURL(src, group, "lag")
gr, err := b.getResponse(gl)
if err != nil {
acc.AddError(err)
return
}
b.genGroupStatusMetrics(gr, cluster, group, acc)
b.genGroupLagMetrics(gr, cluster, group, acc)
}(group)
}
wg.Wait()
}
func (b *burrow) genGroupStatusMetrics(r *apiResponse, cluster, group string, acc telegraf.Accumulator) {
partitionCount := r.Status.PartitionCount
if partitionCount == 0 {
partitionCount = len(r.Status.Partitions)
}
// get max timestamp and total offset from partitions list
offset := int64(0)
timestamp := int64(0)
for _, partition := range r.Status.Partitions {
offset += partition.End.Offset
if partition.End.Timestamp > timestamp {
timestamp = partition.End.Timestamp
}
}
lag := int64(0)
if r.Status.Maxlag != nil {
lag = r.Status.Maxlag.CurrentLag
}
acc.AddFields(
"burrow_group",
map[string]interface{}{
"status": r.Status.Status,
"status_code": mapStatusToCode(r.Status.Status),
"partition_count": partitionCount,
"total_lag": r.Status.TotalLag,
"lag": lag,
"offset": offset,
"timestamp": timestamp,
},
map[string]string{
"cluster": cluster,
"group": group,
},
)
}
func (b *burrow) genGroupLagMetrics(r *apiResponse, cluster, group string, acc telegraf.Accumulator) {
for _, partition := range r.Status.Partitions {
acc.AddFields(
"burrow_partition",
map[string]interface{}{
"status": partition.Status,
"status_code": mapStatusToCode(partition.Status),
"lag": partition.CurrentLag,
"offset": partition.End.Offset,
"timestamp": partition.End.Timestamp,
},
map[string]string{
"cluster": cluster,
"group": group,
"topic": partition.Topic,
"partition": strconv.FormatInt(int64(partition.Partition), 10),
"owner": partition.Owner,
},
)
}
}
func appendPathToURL(src *url.URL, parts ...string) *url.URL {
dst := new(url.URL)
*dst = *src
for i, part := range parts {
parts[i] = url.PathEscape(part)
}
ext := strings.Join(parts, "/")
dst.Path = fmt.Sprintf("%s/%s", src.Path, ext)
return dst
}
func mapStatusToCode(src string) int {
switch src {
case "OK":
return 1
case "NOT_FOUND":
return 2
case "WARN":
return 3
case "ERR":
return 4
case "STOP":
return 5
case "STALL":
return 6
default:
return 0
}
}