telegraf/plugins/inputs/elasticsearch/elasticsearch.go

664 lines
20 KiB
Go

package elasticsearch
import (
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"regexp"
"sort"
"strings"
"sync"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/internal/tls"
"github.com/influxdata/telegraf/plugins/inputs"
jsonparser "github.com/influxdata/telegraf/plugins/parsers/json"
)
// mask for masking username/password from error messages
var mask = regexp.MustCompile(`https?:\/\/\S+:\S+@`)
// Nodestats are always generated, so simply define a constant for these endpoints
const statsPath = "/_nodes/stats"
const statsPathLocal = "/_nodes/_local/stats"
type nodeStat struct {
Host string `json:"host"`
Name string `json:"name"`
Roles []string `json:"roles"`
Attributes map[string]string `json:"attributes"`
Indices interface{} `json:"indices"`
OS interface{} `json:"os"`
Process interface{} `json:"process"`
JVM interface{} `json:"jvm"`
ThreadPool interface{} `json:"thread_pool"`
FS interface{} `json:"fs"`
Transport interface{} `json:"transport"`
HTTP interface{} `json:"http"`
Breakers interface{} `json:"breakers"`
}
type clusterHealth struct {
ActivePrimaryShards int `json:"active_primary_shards"`
ActiveShards int `json:"active_shards"`
ActiveShardsPercentAsNumber float64 `json:"active_shards_percent_as_number"`
ClusterName string `json:"cluster_name"`
DelayedUnassignedShards int `json:"delayed_unassigned_shards"`
InitializingShards int `json:"initializing_shards"`
NumberOfDataNodes int `json:"number_of_data_nodes"`
NumberOfInFlightFetch int `json:"number_of_in_flight_fetch"`
NumberOfNodes int `json:"number_of_nodes"`
NumberOfPendingTasks int `json:"number_of_pending_tasks"`
RelocatingShards int `json:"relocating_shards"`
Status string `json:"status"`
TaskMaxWaitingInQueueMillis int `json:"task_max_waiting_in_queue_millis"`
TimedOut bool `json:"timed_out"`
UnassignedShards int `json:"unassigned_shards"`
Indices map[string]indexHealth `json:"indices"`
}
type indexHealth struct {
ActivePrimaryShards int `json:"active_primary_shards"`
ActiveShards int `json:"active_shards"`
InitializingShards int `json:"initializing_shards"`
NumberOfReplicas int `json:"number_of_replicas"`
NumberOfShards int `json:"number_of_shards"`
RelocatingShards int `json:"relocating_shards"`
Status string `json:"status"`
UnassignedShards int `json:"unassigned_shards"`
}
type clusterStats struct {
NodeName string `json:"node_name"`
ClusterName string `json:"cluster_name"`
Status string `json:"status"`
Indices interface{} `json:"indices"`
Nodes interface{} `json:"nodes"`
}
type indexStat struct {
Primaries interface{} `json:"primaries"`
Total interface{} `json:"total"`
Shards map[string][]interface{} `json:"shards"`
}
const sampleConfig = `
## specify a list of one or more Elasticsearch servers
# you can add username and password to your url to use basic authentication:
# servers = ["http://user:pass@localhost:9200"]
servers = ["http://localhost:9200"]
## Timeout for HTTP requests to the elastic search server(s)
http_timeout = "5s"
## When local is true (the default), the node will read only its own stats.
## Set local to false when you want to read the node stats from all nodes
## of the cluster.
local = true
## Set cluster_health to true when you want to also obtain cluster health stats
cluster_health = false
## Adjust cluster_health_level when you want to also obtain detailed health stats
## The options are
## - indices (default)
## - cluster
# cluster_health_level = "indices"
## Set cluster_stats to true when you want to also obtain cluster stats.
cluster_stats = false
## Only gather cluster_stats from the master node. To work this require local = true
cluster_stats_only_from_master = true
## Indices to collect; can be one or more indices names or _all
indices_include = ["_all"]
## One of "shards", "cluster", "indices"
indices_level = "shards"
## node_stats is a list of sub-stats that you want to have gathered. Valid options
## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http",
## "breaker". Per default, all stats are gathered.
# node_stats = ["jvm", "http"]
## HTTP Basic Authentication username and password.
# username = ""
# password = ""
## Optional TLS Config
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
`
// Elasticsearch is a plugin to read stats from one or many Elasticsearch
// servers.
type Elasticsearch struct {
Local bool `toml:"local"`
Servers []string `toml:"servers"`
HTTPTimeout internal.Duration `toml:"http_timeout"`
ClusterHealth bool `toml:"cluster_health"`
ClusterHealthLevel string `toml:"cluster_health_level"`
ClusterStats bool `toml:"cluster_stats"`
ClusterStatsOnlyFromMaster bool `toml:"cluster_stats_only_from_master"`
IndicesInclude []string `toml:"indices_include"`
IndicesLevel string `toml:"indices_level"`
NodeStats []string `toml:"node_stats"`
Username string `toml:"username"`
Password string `toml:"password"`
tls.ClientConfig
client *http.Client
serverInfo map[string]serverInfo
serverInfoMutex sync.Mutex
}
type serverInfo struct {
nodeID string
masterID string
}
func (i serverInfo) isMaster() bool {
return i.nodeID == i.masterID
}
// NewElasticsearch return a new instance of Elasticsearch
func NewElasticsearch() *Elasticsearch {
return &Elasticsearch{
HTTPTimeout: internal.Duration{Duration: time.Second * 5},
ClusterStatsOnlyFromMaster: true,
ClusterHealthLevel: "indices",
}
}
// perform status mapping
func mapHealthStatusToCode(s string) int {
switch strings.ToLower(s) {
case "green":
return 1
case "yellow":
return 2
case "red":
return 3
}
return 0
}
// perform shard status mapping
func mapShardStatusToCode(s string) int {
switch strings.ToUpper(s) {
case "UNASSIGNED":
return 1
case "INITIALIZING":
return 2
case "STARTED":
return 3
case "RELOCATING":
return 4
}
return 0
}
// SampleConfig returns sample configuration for this plugin.
func (e *Elasticsearch) SampleConfig() string {
return sampleConfig
}
// Description returns the plugin description.
func (e *Elasticsearch) Description() string {
return "Read stats from one or more Elasticsearch servers or clusters"
}
// Gather reads the stats from Elasticsearch and writes it to the
// Accumulator.
func (e *Elasticsearch) Gather(acc telegraf.Accumulator) error {
if e.client == nil {
client, err := e.createHTTPClient()
if err != nil {
return err
}
e.client = client
}
if e.ClusterStats || len(e.IndicesInclude) > 0 || len(e.IndicesLevel) > 0 {
var wgC sync.WaitGroup
wgC.Add(len(e.Servers))
e.serverInfo = make(map[string]serverInfo)
for _, serv := range e.Servers {
go func(s string, acc telegraf.Accumulator) {
defer wgC.Done()
info := serverInfo{}
var err error
// Gather node ID
if info.nodeID, err = e.gatherNodeID(s + "/_nodes/_local/name"); err != nil {
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
return
}
// get cat/master information here so NodeStats can determine
// whether this node is the Master
if info.masterID, err = e.getCatMaster(s + "/_cat/master"); err != nil {
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
return
}
e.serverInfoMutex.Lock()
e.serverInfo[s] = info
e.serverInfoMutex.Unlock()
}(serv, acc)
}
wgC.Wait()
}
var wg sync.WaitGroup
wg.Add(len(e.Servers))
for _, serv := range e.Servers {
go func(s string, acc telegraf.Accumulator) {
defer wg.Done()
url := e.nodeStatsURL(s)
// Always gather node stats
if err := e.gatherNodeStats(url, acc); err != nil {
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
return
}
if e.ClusterHealth {
url = s + "/_cluster/health"
if e.ClusterHealthLevel != "" {
url = url + "?level=" + e.ClusterHealthLevel
}
if err := e.gatherClusterHealth(url, acc); err != nil {
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
return
}
}
if e.ClusterStats && (e.serverInfo[s].isMaster() || !e.ClusterStatsOnlyFromMaster || !e.Local) {
if err := e.gatherClusterStats(s+"/_cluster/stats", acc); err != nil {
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
return
}
}
if len(e.IndicesInclude) > 0 && (e.serverInfo[s].isMaster() || !e.ClusterStatsOnlyFromMaster || !e.Local) {
if e.IndicesLevel != "shards" {
if err := e.gatherIndicesStats(s+"/"+strings.Join(e.IndicesInclude, ",")+"/_stats", acc); err != nil {
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
return
}
} else {
if err := e.gatherIndicesStats(s+"/"+strings.Join(e.IndicesInclude, ",")+"/_stats?level=shards", acc); err != nil {
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
return
}
}
}
}(serv, acc)
}
wg.Wait()
return nil
}
func (e *Elasticsearch) createHTTPClient() (*http.Client, error) {
tlsCfg, err := e.ClientConfig.TLSConfig()
if err != nil {
return nil, err
}
tr := &http.Transport{
ResponseHeaderTimeout: e.HTTPTimeout.Duration,
TLSClientConfig: tlsCfg,
}
client := &http.Client{
Transport: tr,
Timeout: e.HTTPTimeout.Duration,
}
return client, nil
}
func (e *Elasticsearch) nodeStatsURL(baseURL string) string {
var url string
if e.Local {
url = baseURL + statsPathLocal
} else {
url = baseURL + statsPath
}
if len(e.NodeStats) == 0 {
return url
}
return fmt.Sprintf("%s/%s", url, strings.Join(e.NodeStats, ","))
}
func (e *Elasticsearch) gatherNodeID(url string) (string, error) {
nodeStats := &struct {
ClusterName string `json:"cluster_name"`
Nodes map[string]*nodeStat `json:"nodes"`
}{}
if err := e.gatherJSONData(url, nodeStats); err != nil {
return "", err
}
// Only 1 should be returned
for id := range nodeStats.Nodes {
return id, nil
}
return "", nil
}
func (e *Elasticsearch) gatherNodeStats(url string, acc telegraf.Accumulator) error {
nodeStats := &struct {
ClusterName string `json:"cluster_name"`
Nodes map[string]*nodeStat `json:"nodes"`
}{}
if err := e.gatherJSONData(url, nodeStats); err != nil {
return err
}
for id, n := range nodeStats.Nodes {
sort.Strings(n.Roles)
tags := map[string]string{
"node_id": id,
"node_host": n.Host,
"node_name": n.Name,
"cluster_name": nodeStats.ClusterName,
"node_roles": strings.Join(n.Roles, ","),
}
for k, v := range n.Attributes {
tags["node_attribute_"+k] = v
}
stats := map[string]interface{}{
"indices": n.Indices,
"os": n.OS,
"process": n.Process,
"jvm": n.JVM,
"thread_pool": n.ThreadPool,
"fs": n.FS,
"transport": n.Transport,
"http": n.HTTP,
"breakers": n.Breakers,
}
now := time.Now()
for p, s := range stats {
// if one of the individual node stats is not even in the
// original result
if s == nil {
continue
}
f := jsonparser.JSONFlattener{}
// parse Json, ignoring strings and bools
err := f.FlattenJSON("", s)
if err != nil {
return err
}
acc.AddFields("elasticsearch_"+p, f.Fields, tags, now)
}
}
return nil
}
func (e *Elasticsearch) gatherClusterHealth(url string, acc telegraf.Accumulator) error {
healthStats := &clusterHealth{}
if err := e.gatherJSONData(url, healthStats); err != nil {
return err
}
measurementTime := time.Now()
clusterFields := map[string]interface{}{
"active_primary_shards": healthStats.ActivePrimaryShards,
"active_shards": healthStats.ActiveShards,
"active_shards_percent_as_number": healthStats.ActiveShardsPercentAsNumber,
"delayed_unassigned_shards": healthStats.DelayedUnassignedShards,
"initializing_shards": healthStats.InitializingShards,
"number_of_data_nodes": healthStats.NumberOfDataNodes,
"number_of_in_flight_fetch": healthStats.NumberOfInFlightFetch,
"number_of_nodes": healthStats.NumberOfNodes,
"number_of_pending_tasks": healthStats.NumberOfPendingTasks,
"relocating_shards": healthStats.RelocatingShards,
"status": healthStats.Status,
"status_code": mapHealthStatusToCode(healthStats.Status),
"task_max_waiting_in_queue_millis": healthStats.TaskMaxWaitingInQueueMillis,
"timed_out": healthStats.TimedOut,
"unassigned_shards": healthStats.UnassignedShards,
}
acc.AddFields(
"elasticsearch_cluster_health",
clusterFields,
map[string]string{"name": healthStats.ClusterName},
measurementTime,
)
for name, health := range healthStats.Indices {
indexFields := map[string]interface{}{
"active_primary_shards": health.ActivePrimaryShards,
"active_shards": health.ActiveShards,
"initializing_shards": health.InitializingShards,
"number_of_replicas": health.NumberOfReplicas,
"number_of_shards": health.NumberOfShards,
"relocating_shards": health.RelocatingShards,
"status": health.Status,
"status_code": mapHealthStatusToCode(health.Status),
"unassigned_shards": health.UnassignedShards,
}
acc.AddFields(
"elasticsearch_cluster_health_indices",
indexFields,
map[string]string{"index": name, "name": healthStats.ClusterName},
measurementTime,
)
}
return nil
}
func (e *Elasticsearch) gatherClusterStats(url string, acc telegraf.Accumulator) error {
clusterStats := &clusterStats{}
if err := e.gatherJSONData(url, clusterStats); err != nil {
return err
}
now := time.Now()
tags := map[string]string{
"node_name": clusterStats.NodeName,
"cluster_name": clusterStats.ClusterName,
"status": clusterStats.Status,
}
stats := map[string]interface{}{
"nodes": clusterStats.Nodes,
"indices": clusterStats.Indices,
}
for p, s := range stats {
f := jsonparser.JSONFlattener{}
// parse json, including bools and strings
err := f.FullFlattenJSON("", s, true, true)
if err != nil {
return err
}
acc.AddFields("elasticsearch_clusterstats_"+p, f.Fields, tags, now)
}
return nil
}
func (e *Elasticsearch) gatherIndicesStats(url string, acc telegraf.Accumulator) error {
indicesStats := &struct {
Shards map[string]interface{} `json:"_shards"`
All map[string]interface{} `json:"_all"`
Indices map[string]indexStat `json:"indices"`
}{}
if err := e.gatherJSONData(url, indicesStats); err != nil {
return err
}
now := time.Now()
// Total Shards Stats
shardsStats := map[string]interface{}{}
for k, v := range indicesStats.Shards {
shardsStats[k] = v
}
acc.AddFields("elasticsearch_indices_stats_shards_total", shardsStats, map[string]string{}, now)
// All Stats
for m, s := range indicesStats.All {
// parse Json, ignoring strings and bools
jsonParser := jsonparser.JSONFlattener{}
err := jsonParser.FullFlattenJSON("_", s, true, true)
if err != nil {
return err
}
acc.AddFields("elasticsearch_indices_stats_"+m, jsonParser.Fields, map[string]string{"index_name": "_all"}, now)
}
// Individual Indices stats
for id, index := range indicesStats.Indices {
indexTag := map[string]string{"index_name": id}
stats := map[string]interface{}{
"primaries": index.Primaries,
"total": index.Total,
}
for m, s := range stats {
f := jsonparser.JSONFlattener{}
// parse Json, getting strings and bools
err := f.FullFlattenJSON("", s, true, true)
if err != nil {
return err
}
acc.AddFields("elasticsearch_indices_stats_"+m, f.Fields, indexTag, now)
}
if e.IndicesLevel == "shards" {
for shardNumber, shards := range index.Shards {
for _, shard := range shards {
// Get Shard Stats
flattened := jsonparser.JSONFlattener{}
err := flattened.FullFlattenJSON("", shard, true, true)
if err != nil {
return err
}
// determine shard tag and primary/replica designation
shardType := "replica"
if flattened.Fields["routing_primary"] == true {
shardType = "primary"
}
delete(flattened.Fields, "routing_primary")
routingState, ok := flattened.Fields["routing_state"].(string)
if ok {
flattened.Fields["routing_state"] = mapShardStatusToCode(routingState)
}
routingNode, _ := flattened.Fields["routing_node"].(string)
shardTags := map[string]string{
"index_name": id,
"node_id": routingNode,
"shard_name": string(shardNumber),
"type": shardType,
}
for key, field := range flattened.Fields {
switch field.(type) {
case string, bool:
delete(flattened.Fields, key)
}
}
acc.AddFields("elasticsearch_indices_stats_shards",
flattened.Fields,
shardTags,
now)
}
}
}
}
return nil
}
func (e *Elasticsearch) getCatMaster(url string) (string, error) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return "", err
}
if e.Username != "" || e.Password != "" {
req.SetBasicAuth(e.Username, e.Password)
}
r, err := e.client.Do(req)
if err != nil {
return "", err
}
defer r.Body.Close()
if r.StatusCode != http.StatusOK {
// NOTE: we are not going to read/discard r.Body under the assumption we'd prefer
// to let the underlying transport close the connection and re-establish a new one for
// future calls.
return "", fmt.Errorf("elasticsearch: Unable to retrieve master node information. API responded with status-code %d, expected %d", r.StatusCode, http.StatusOK)
}
response, err := ioutil.ReadAll(r.Body)
if err != nil {
return "", err
}
masterID := strings.Split(string(response), " ")[0]
return masterID, nil
}
func (e *Elasticsearch) gatherJSONData(url string, v interface{}) error {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return err
}
if e.Username != "" || e.Password != "" {
req.SetBasicAuth(e.Username, e.Password)
}
r, err := e.client.Do(req)
if err != nil {
return err
}
defer r.Body.Close()
if r.StatusCode != http.StatusOK {
// NOTE: we are not going to read/discard r.Body under the assumption we'd prefer
// to let the underlying transport close the connection and re-establish a new one for
// future calls.
return fmt.Errorf("elasticsearch: API responded with status-code %d, expected %d",
r.StatusCode, http.StatusOK)
}
if err = json.NewDecoder(r.Body).Decode(v); err != nil {
return err
}
return nil
}
func init() {
inputs.Add("elasticsearch", func() telegraf.Input {
return NewElasticsearch()
})
}