506 lines
15 KiB
Go
506 lines
15 KiB
Go
package elasticsearch
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/influxdata/telegraf"
|
|
"github.com/influxdata/telegraf/internal"
|
|
"github.com/influxdata/telegraf/internal/tls"
|
|
"github.com/influxdata/telegraf/plugins/inputs"
|
|
jsonparser "github.com/influxdata/telegraf/plugins/parsers/json"
|
|
)
|
|
|
|
// mask for masking username/password from error messages
|
|
var mask = regexp.MustCompile(`https?:\/\/\S+:\S+@`)
|
|
|
|
// Nodestats are always generated, so simply define a constant for these endpoints
|
|
const statsPath = "/_nodes/stats"
|
|
const statsPathLocal = "/_nodes/_local/stats"
|
|
|
|
type nodeStat struct {
|
|
Host string `json:"host"`
|
|
Name string `json:"name"`
|
|
Roles []string `json:"roles"`
|
|
Attributes map[string]string `json:"attributes"`
|
|
Indices interface{} `json:"indices"`
|
|
OS interface{} `json:"os"`
|
|
Process interface{} `json:"process"`
|
|
JVM interface{} `json:"jvm"`
|
|
ThreadPool interface{} `json:"thread_pool"`
|
|
FS interface{} `json:"fs"`
|
|
Transport interface{} `json:"transport"`
|
|
HTTP interface{} `json:"http"`
|
|
Breakers interface{} `json:"breakers"`
|
|
}
|
|
|
|
type clusterHealth struct {
|
|
ActivePrimaryShards int `json:"active_primary_shards"`
|
|
ActiveShards int `json:"active_shards"`
|
|
ActiveShardsPercentAsNumber float64 `json:"active_shards_percent_as_number"`
|
|
ClusterName string `json:"cluster_name"`
|
|
DelayedUnassignedShards int `json:"delayed_unassigned_shards"`
|
|
InitializingShards int `json:"initializing_shards"`
|
|
NumberOfDataNodes int `json:"number_of_data_nodes"`
|
|
NumberOfInFlightFetch int `json:"number_of_in_flight_fetch"`
|
|
NumberOfNodes int `json:"number_of_nodes"`
|
|
NumberOfPendingTasks int `json:"number_of_pending_tasks"`
|
|
RelocatingShards int `json:"relocating_shards"`
|
|
Status string `json:"status"`
|
|
TaskMaxWaitingInQueueMillis int `json:"task_max_waiting_in_queue_millis"`
|
|
TimedOut bool `json:"timed_out"`
|
|
UnassignedShards int `json:"unassigned_shards"`
|
|
Indices map[string]indexHealth `json:"indices"`
|
|
}
|
|
|
|
type indexHealth struct {
|
|
ActivePrimaryShards int `json:"active_primary_shards"`
|
|
ActiveShards int `json:"active_shards"`
|
|
InitializingShards int `json:"initializing_shards"`
|
|
NumberOfReplicas int `json:"number_of_replicas"`
|
|
NumberOfShards int `json:"number_of_shards"`
|
|
RelocatingShards int `json:"relocating_shards"`
|
|
Status string `json:"status"`
|
|
UnassignedShards int `json:"unassigned_shards"`
|
|
}
|
|
|
|
type clusterStats struct {
|
|
NodeName string `json:"node_name"`
|
|
ClusterName string `json:"cluster_name"`
|
|
Status string `json:"status"`
|
|
Indices interface{} `json:"indices"`
|
|
Nodes interface{} `json:"nodes"`
|
|
}
|
|
|
|
type catMaster struct {
|
|
NodeID string `json:"id"`
|
|
NodeIP string `json:"ip"`
|
|
NodeName string `json:"node"`
|
|
}
|
|
|
|
const sampleConfig = `
|
|
## specify a list of one or more Elasticsearch servers
|
|
# you can add username and password to your url to use basic authentication:
|
|
# servers = ["http://user:pass@localhost:9200"]
|
|
servers = ["http://localhost:9200"]
|
|
|
|
## Timeout for HTTP requests to the elastic search server(s)
|
|
http_timeout = "5s"
|
|
|
|
## When local is true (the default), the node will read only its own stats.
|
|
## Set local to false when you want to read the node stats from all nodes
|
|
## of the cluster.
|
|
local = true
|
|
|
|
## Set cluster_health to true when you want to also obtain cluster health stats
|
|
cluster_health = false
|
|
|
|
## Adjust cluster_health_level when you want to also obtain detailed health stats
|
|
## The options are
|
|
## - indices (default)
|
|
## - cluster
|
|
# cluster_health_level = "indices"
|
|
|
|
## Set cluster_stats to true when you want to also obtain cluster stats.
|
|
cluster_stats = false
|
|
|
|
## Only gather cluster_stats from the master node. To work this require local = true
|
|
cluster_stats_only_from_master = true
|
|
|
|
## node_stats is a list of sub-stats that you want to have gathered. Valid options
|
|
## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http",
|
|
## "breaker". Per default, all stats are gathered.
|
|
# node_stats = ["jvm", "http"]
|
|
|
|
## Optional TLS Config
|
|
# tls_ca = "/etc/telegraf/ca.pem"
|
|
# tls_cert = "/etc/telegraf/cert.pem"
|
|
# tls_key = "/etc/telegraf/key.pem"
|
|
## Use TLS but skip chain & host verification
|
|
# insecure_skip_verify = false
|
|
`
|
|
|
|
// Elasticsearch is a plugin to read stats from one or many Elasticsearch
|
|
// servers.
|
|
type Elasticsearch struct {
|
|
Local bool
|
|
Servers []string
|
|
HttpTimeout internal.Duration
|
|
ClusterHealth bool
|
|
ClusterHealthLevel string
|
|
ClusterStats bool
|
|
ClusterStatsOnlyFromMaster bool
|
|
NodeStats []string
|
|
tls.ClientConfig
|
|
|
|
client *http.Client
|
|
serverInfo map[string]serverInfo
|
|
serverInfoMutex sync.Mutex
|
|
}
|
|
type serverInfo struct {
|
|
nodeID string
|
|
masterID string
|
|
}
|
|
|
|
func (i serverInfo) isMaster() bool {
|
|
return i.nodeID == i.masterID
|
|
}
|
|
|
|
// NewElasticsearch return a new instance of Elasticsearch
|
|
func NewElasticsearch() *Elasticsearch {
|
|
return &Elasticsearch{
|
|
HttpTimeout: internal.Duration{Duration: time.Second * 5},
|
|
ClusterStatsOnlyFromMaster: true,
|
|
ClusterHealthLevel: "indices",
|
|
}
|
|
}
|
|
|
|
// perform status mapping
|
|
func mapHealthStatusToCode(s string) int {
|
|
switch strings.ToLower(s) {
|
|
case "green":
|
|
return 1
|
|
case "yellow":
|
|
return 2
|
|
case "red":
|
|
return 3
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// SampleConfig returns sample configuration for this plugin.
|
|
func (e *Elasticsearch) SampleConfig() string {
|
|
return sampleConfig
|
|
}
|
|
|
|
// Description returns the plugin description.
|
|
func (e *Elasticsearch) Description() string {
|
|
return "Read stats from one or more Elasticsearch servers or clusters"
|
|
}
|
|
|
|
// Gather reads the stats from Elasticsearch and writes it to the
|
|
// Accumulator.
|
|
func (e *Elasticsearch) Gather(acc telegraf.Accumulator) error {
|
|
if e.client == nil {
|
|
client, err := e.createHttpClient()
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
e.client = client
|
|
}
|
|
|
|
if e.ClusterStats {
|
|
var wgC sync.WaitGroup
|
|
wgC.Add(len(e.Servers))
|
|
|
|
e.serverInfo = make(map[string]serverInfo)
|
|
for _, serv := range e.Servers {
|
|
go func(s string, acc telegraf.Accumulator) {
|
|
defer wgC.Done()
|
|
info := serverInfo{}
|
|
|
|
var err error
|
|
|
|
// Gather node ID
|
|
if info.nodeID, err = e.gatherNodeID(s + "/_nodes/_local/name"); err != nil {
|
|
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
|
|
return
|
|
}
|
|
|
|
// get cat/master information here so NodeStats can determine
|
|
// whether this node is the Master
|
|
if info.masterID, err = e.getCatMaster(s + "/_cat/master"); err != nil {
|
|
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
|
|
return
|
|
}
|
|
|
|
e.serverInfoMutex.Lock()
|
|
e.serverInfo[s] = info
|
|
e.serverInfoMutex.Unlock()
|
|
|
|
}(serv, acc)
|
|
}
|
|
wgC.Wait()
|
|
}
|
|
|
|
var wg sync.WaitGroup
|
|
wg.Add(len(e.Servers))
|
|
|
|
for _, serv := range e.Servers {
|
|
go func(s string, acc telegraf.Accumulator) {
|
|
defer wg.Done()
|
|
url := e.nodeStatsUrl(s)
|
|
|
|
// Always gather node stats
|
|
if err := e.gatherNodeStats(url, acc); err != nil {
|
|
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
|
|
return
|
|
}
|
|
|
|
if e.ClusterHealth {
|
|
url = s + "/_cluster/health"
|
|
if e.ClusterHealthLevel != "" {
|
|
url = url + "?level=" + e.ClusterHealthLevel
|
|
}
|
|
if err := e.gatherClusterHealth(url, acc); err != nil {
|
|
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
|
|
return
|
|
}
|
|
}
|
|
|
|
if e.ClusterStats && (e.serverInfo[s].isMaster() || !e.ClusterStatsOnlyFromMaster || !e.Local) {
|
|
if err := e.gatherClusterStats(s+"/_cluster/stats", acc); err != nil {
|
|
acc.AddError(fmt.Errorf(mask.ReplaceAllString(err.Error(), "http(s)://XXX:XXX@")))
|
|
return
|
|
}
|
|
}
|
|
}(serv, acc)
|
|
}
|
|
|
|
wg.Wait()
|
|
return nil
|
|
}
|
|
|
|
func (e *Elasticsearch) createHttpClient() (*http.Client, error) {
|
|
tlsCfg, err := e.ClientConfig.TLSConfig()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
tr := &http.Transport{
|
|
ResponseHeaderTimeout: e.HttpTimeout.Duration,
|
|
TLSClientConfig: tlsCfg,
|
|
}
|
|
client := &http.Client{
|
|
Transport: tr,
|
|
Timeout: e.HttpTimeout.Duration,
|
|
}
|
|
|
|
return client, nil
|
|
}
|
|
|
|
func (e *Elasticsearch) nodeStatsUrl(baseUrl string) string {
|
|
var url string
|
|
|
|
if e.Local {
|
|
url = baseUrl + statsPathLocal
|
|
} else {
|
|
url = baseUrl + statsPath
|
|
}
|
|
|
|
if len(e.NodeStats) == 0 {
|
|
return url
|
|
}
|
|
|
|
return fmt.Sprintf("%s/%s", url, strings.Join(e.NodeStats, ","))
|
|
}
|
|
|
|
func (e *Elasticsearch) gatherNodeID(url string) (string, error) {
|
|
nodeStats := &struct {
|
|
ClusterName string `json:"cluster_name"`
|
|
Nodes map[string]*nodeStat `json:"nodes"`
|
|
}{}
|
|
if err := e.gatherJsonData(url, nodeStats); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
// Only 1 should be returned
|
|
for id := range nodeStats.Nodes {
|
|
return id, nil
|
|
}
|
|
return "", nil
|
|
}
|
|
|
|
func (e *Elasticsearch) gatherNodeStats(url string, acc telegraf.Accumulator) error {
|
|
nodeStats := &struct {
|
|
ClusterName string `json:"cluster_name"`
|
|
Nodes map[string]*nodeStat `json:"nodes"`
|
|
}{}
|
|
if err := e.gatherJsonData(url, nodeStats); err != nil {
|
|
return err
|
|
}
|
|
|
|
for id, n := range nodeStats.Nodes {
|
|
sort.Strings(n.Roles)
|
|
tags := map[string]string{
|
|
"node_id": id,
|
|
"node_host": n.Host,
|
|
"node_name": n.Name,
|
|
"cluster_name": nodeStats.ClusterName,
|
|
"node_roles": strings.Join(n.Roles, ","),
|
|
}
|
|
|
|
for k, v := range n.Attributes {
|
|
tags["node_attribute_"+k] = v
|
|
}
|
|
|
|
stats := map[string]interface{}{
|
|
"indices": n.Indices,
|
|
"os": n.OS,
|
|
"process": n.Process,
|
|
"jvm": n.JVM,
|
|
"thread_pool": n.ThreadPool,
|
|
"fs": n.FS,
|
|
"transport": n.Transport,
|
|
"http": n.HTTP,
|
|
"breakers": n.Breakers,
|
|
}
|
|
|
|
now := time.Now()
|
|
for p, s := range stats {
|
|
// if one of the individual node stats is not even in the
|
|
// original result
|
|
if s == nil {
|
|
continue
|
|
}
|
|
f := jsonparser.JSONFlattener{}
|
|
// parse Json, ignoring strings and bools
|
|
err := f.FlattenJSON("", s)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
acc.AddFields("elasticsearch_"+p, f.Fields, tags, now)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (e *Elasticsearch) gatherClusterHealth(url string, acc telegraf.Accumulator) error {
|
|
healthStats := &clusterHealth{}
|
|
if err := e.gatherJsonData(url, healthStats); err != nil {
|
|
return err
|
|
}
|
|
measurementTime := time.Now()
|
|
clusterFields := map[string]interface{}{
|
|
"active_primary_shards": healthStats.ActivePrimaryShards,
|
|
"active_shards": healthStats.ActiveShards,
|
|
"active_shards_percent_as_number": healthStats.ActiveShardsPercentAsNumber,
|
|
"delayed_unassigned_shards": healthStats.DelayedUnassignedShards,
|
|
"initializing_shards": healthStats.InitializingShards,
|
|
"number_of_data_nodes": healthStats.NumberOfDataNodes,
|
|
"number_of_in_flight_fetch": healthStats.NumberOfInFlightFetch,
|
|
"number_of_nodes": healthStats.NumberOfNodes,
|
|
"number_of_pending_tasks": healthStats.NumberOfPendingTasks,
|
|
"relocating_shards": healthStats.RelocatingShards,
|
|
"status": healthStats.Status,
|
|
"status_code": mapHealthStatusToCode(healthStats.Status),
|
|
"task_max_waiting_in_queue_millis": healthStats.TaskMaxWaitingInQueueMillis,
|
|
"timed_out": healthStats.TimedOut,
|
|
"unassigned_shards": healthStats.UnassignedShards,
|
|
}
|
|
acc.AddFields(
|
|
"elasticsearch_cluster_health",
|
|
clusterFields,
|
|
map[string]string{"name": healthStats.ClusterName},
|
|
measurementTime,
|
|
)
|
|
|
|
for name, health := range healthStats.Indices {
|
|
indexFields := map[string]interface{}{
|
|
"active_primary_shards": health.ActivePrimaryShards,
|
|
"active_shards": health.ActiveShards,
|
|
"initializing_shards": health.InitializingShards,
|
|
"number_of_replicas": health.NumberOfReplicas,
|
|
"number_of_shards": health.NumberOfShards,
|
|
"relocating_shards": health.RelocatingShards,
|
|
"status": health.Status,
|
|
"status_code": mapHealthStatusToCode(health.Status),
|
|
"unassigned_shards": health.UnassignedShards,
|
|
}
|
|
acc.AddFields(
|
|
"elasticsearch_cluster_health_indices",
|
|
indexFields,
|
|
map[string]string{"index": name, "name": healthStats.ClusterName},
|
|
measurementTime,
|
|
)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (e *Elasticsearch) gatherClusterStats(url string, acc telegraf.Accumulator) error {
|
|
clusterStats := &clusterStats{}
|
|
if err := e.gatherJsonData(url, clusterStats); err != nil {
|
|
return err
|
|
}
|
|
now := time.Now()
|
|
tags := map[string]string{
|
|
"node_name": clusterStats.NodeName,
|
|
"cluster_name": clusterStats.ClusterName,
|
|
"status": clusterStats.Status,
|
|
}
|
|
|
|
stats := map[string]interface{}{
|
|
"nodes": clusterStats.Nodes,
|
|
"indices": clusterStats.Indices,
|
|
}
|
|
|
|
for p, s := range stats {
|
|
f := jsonparser.JSONFlattener{}
|
|
// parse json, including bools and strings
|
|
err := f.FullFlattenJSON("", s, true, true)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
acc.AddFields("elasticsearch_clusterstats_"+p, f.Fields, tags, now)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (e *Elasticsearch) getCatMaster(url string) (string, error) {
|
|
r, err := e.client.Get(url)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer r.Body.Close()
|
|
if r.StatusCode != http.StatusOK {
|
|
// NOTE: we are not going to read/discard r.Body under the assumption we'd prefer
|
|
// to let the underlying transport close the connection and re-establish a new one for
|
|
// future calls.
|
|
return "", fmt.Errorf("elasticsearch: Unable to retrieve master node information. API responded with status-code %d, expected %d", r.StatusCode, http.StatusOK)
|
|
}
|
|
response, err := ioutil.ReadAll(r.Body)
|
|
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
masterID := strings.Split(string(response), " ")[0]
|
|
|
|
return masterID, nil
|
|
}
|
|
|
|
func (e *Elasticsearch) gatherJsonData(url string, v interface{}) error {
|
|
r, err := e.client.Get(url)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer r.Body.Close()
|
|
if r.StatusCode != http.StatusOK {
|
|
// NOTE: we are not going to read/discard r.Body under the assumption we'd prefer
|
|
// to let the underlying transport close the connection and re-establish a new one for
|
|
// future calls.
|
|
return fmt.Errorf("elasticsearch: API responded with status-code %d, expected %d",
|
|
r.StatusCode, http.StatusOK)
|
|
}
|
|
|
|
if err = json.NewDecoder(r.Body).Decode(v); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func init() {
|
|
inputs.Add("elasticsearch", func() telegraf.Input {
|
|
return NewElasticsearch()
|
|
})
|
|
}
|