320 lines
7.1 KiB
Go
320 lines
7.1 KiB
Go
package mesos
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"io/ioutil"
|
|
"log"
|
|
"net"
|
|
"net/http"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
|
|
"github.com/influxdata/telegraf"
|
|
"github.com/influxdata/telegraf/internal"
|
|
"github.com/influxdata/telegraf/plugins/inputs"
|
|
)
|
|
|
|
type Mesos struct {
|
|
Timeout int
|
|
Servers []string
|
|
MetricsCol []string `toml:"metrics_collection"`
|
|
}
|
|
|
|
var defaultMetrics = []string{
|
|
"resources", "master", "system", "slaves", "frameworks",
|
|
"tasks", "messages", "evqueue", "messages", "registrar",
|
|
}
|
|
|
|
// SampleConfig returns a sample configuration block
|
|
func (m *Mesos) SampleConfig() string {
|
|
return sampleConfig
|
|
}
|
|
|
|
// Description just returns a short description of the Mesos plugin
|
|
func (m *Mesos) Description() string {
|
|
return "Telegraf plugin for gathering metrics from N Mesos masters"
|
|
}
|
|
|
|
func (m *Mesos) Gather(acc telegraf.Accumulator) error {
|
|
var wg sync.WaitGroup
|
|
var errorChannel chan error
|
|
|
|
if len(m.Servers) == 0 {
|
|
m.Servers = []string{"localhost:5050"}
|
|
}
|
|
|
|
errorChannel = make(chan error, len(m.Servers)*2)
|
|
|
|
for _, v := range m.Servers {
|
|
wg.Add(1)
|
|
go func() {
|
|
errorChannel <- m.gatherMetrics(v, acc)
|
|
wg.Done()
|
|
return
|
|
}()
|
|
}
|
|
|
|
wg.Wait()
|
|
close(errorChannel)
|
|
errorStrings := []string{}
|
|
|
|
for err := range errorChannel {
|
|
if err != nil {
|
|
errorStrings = append(errorStrings, err.Error())
|
|
}
|
|
}
|
|
|
|
if len(errorStrings) > 0 {
|
|
return errors.New(strings.Join(errorStrings, "\n"))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func metricsDiff(w []string) []string {
|
|
b := []string{}
|
|
s := make(map[string]bool)
|
|
|
|
if len(w) == 0 {
|
|
return b
|
|
}
|
|
|
|
for _, v := range w {
|
|
s[v] = true
|
|
}
|
|
|
|
for _, d := range defaultMetrics {
|
|
if _, ok := s[d]; !ok {
|
|
b = append(b, d)
|
|
}
|
|
}
|
|
|
|
return b
|
|
}
|
|
|
|
func masterBlocks(g string) []string {
|
|
var m map[string][]string
|
|
|
|
m = make(map[string][]string)
|
|
|
|
m["resources"] = []string{
|
|
"master/cpus_percent",
|
|
"master/cpus_used",
|
|
"master/cpus_total",
|
|
"master/cpus_revocable_percent",
|
|
"master/cpus_revocable_total",
|
|
"master/cpus_revocable_used",
|
|
"master/disk_percent",
|
|
"master/disk_used",
|
|
"master/disk_total",
|
|
"master/disk_revocable_percent",
|
|
"master/disk_revocable_total",
|
|
"master/disk_revocable_used",
|
|
"master/mem_percent",
|
|
"master/mem_used",
|
|
"master/mem_total",
|
|
"master/mem_revocable_percent",
|
|
"master/mem_revocable_total",
|
|
"master/mem_revocable_used",
|
|
}
|
|
|
|
m["master"] = []string{
|
|
"master/elected",
|
|
"master/uptime_secs",
|
|
}
|
|
|
|
m["system"] = []string{
|
|
"system/cpus_total",
|
|
"system/load_15min",
|
|
"system/load_5min",
|
|
"system/load_1min",
|
|
"system/mem_free_bytes",
|
|
"system/mem_total_bytes",
|
|
}
|
|
|
|
m["slaves"] = []string{
|
|
"master/slave_registrations",
|
|
"master/slave_removals",
|
|
"master/slave_reregistrations",
|
|
"master/slave_shutdowns_scheduled",
|
|
"master/slave_shutdowns_canceled",
|
|
"master/slave_shutdowns_completed",
|
|
"master/slaves_active",
|
|
"master/slaves_connected",
|
|
"master/slaves_disconnected",
|
|
"master/slaves_inactive",
|
|
}
|
|
|
|
m["frameworks"] = []string{
|
|
"master/frameworks_active",
|
|
"master/frameworks_connected",
|
|
"master/frameworks_disconnected",
|
|
"master/frameworks_inactive",
|
|
"master/outstanding_offers",
|
|
}
|
|
|
|
m["tasks"] = []string{
|
|
"master/tasks_error",
|
|
"master/tasks_failed",
|
|
"master/tasks_finished",
|
|
"master/tasks_killed",
|
|
"master/tasks_lost",
|
|
"master/tasks_running",
|
|
"master/tasks_staging",
|
|
"master/tasks_starting",
|
|
}
|
|
|
|
m["messages"] = []string{
|
|
"master/invalid_executor_to_framework_messages",
|
|
"master/invalid_framework_to_executor_messages",
|
|
"master/invalid_status_update_acknowledgements",
|
|
"master/invalid_status_updates",
|
|
"master/dropped_messages",
|
|
"master/messages_authenticate",
|
|
"master/messages_deactivate_framework",
|
|
"master/messages_decline_offers",
|
|
"master/messages_executor_to_framework",
|
|
"master/messages_exited_executor",
|
|
"master/messages_framework_to_executor",
|
|
"master/messages_kill_task",
|
|
"master/messages_launch_tasks",
|
|
"master/messages_reconcile_tasks",
|
|
"master/messages_register_framework",
|
|
"master/messages_register_slave",
|
|
"master/messages_reregister_framework",
|
|
"master/messages_reregister_slave",
|
|
"master/messages_resource_request",
|
|
"master/messages_revive_offers",
|
|
"master/messages_status_update",
|
|
"master/messages_status_update_acknowledgement",
|
|
"master/messages_unregister_framework",
|
|
"master/messages_unregister_slave",
|
|
"master/messages_update_slave",
|
|
"master/recovery_slave_removals",
|
|
"master/slave_removals/reason_registered",
|
|
"master/slave_removals/reason_unhealthy",
|
|
"master/slave_removals/reason_unregistered",
|
|
"master/valid_framework_to_executor_messages",
|
|
"master/valid_status_update_acknowledgements",
|
|
"master/valid_status_updates",
|
|
"master/task_lost/source_master/reason_invalid_offers",
|
|
"master/task_lost/source_master/reason_slave_removed",
|
|
"master/task_lost/source_slave/reason_executor_terminated",
|
|
"master/valid_executor_to_framework_messages",
|
|
}
|
|
|
|
m["evqueue"] = []string{
|
|
"master/event_queue_dispatches",
|
|
"master/event_queue_http_requests",
|
|
"master/event_queue_messages",
|
|
}
|
|
|
|
m["registrar"] = []string{
|
|
"registrar/state_fetch_ms",
|
|
"registrar/state_store_ms",
|
|
"registrar/state_store_ms/max",
|
|
"registrar/state_store_ms/min",
|
|
"registrar/state_store_ms/p50",
|
|
"registrar/state_store_ms/p90",
|
|
"registrar/state_store_ms/p95",
|
|
"registrar/state_store_ms/p99",
|
|
"registrar/state_store_ms/p999",
|
|
"registrar/state_store_ms/p9999",
|
|
}
|
|
|
|
ret, ok := m[g]
|
|
|
|
if !ok {
|
|
log.Println("[mesos] Unkown metrics group: ", g)
|
|
return []string{}
|
|
}
|
|
|
|
return ret
|
|
}
|
|
|
|
var sampleConfig = `
|
|
# Timeout, in ms.
|
|
timeout = 100
|
|
# A list of Mesos masters. e.g. master1:5050, master2:5080, etc.
|
|
# The port can be skipped if using the default (5050)
|
|
# Default value is localhost:5050.
|
|
servers = ["localhost:5050"]
|
|
# Metrics groups to be collected.
|
|
# Default, all enabled.
|
|
metrics_collection = ["resources","master","system","slaves","frameworks","messages","evqueue","registrar"]
|
|
`
|
|
|
|
// removeGroup(), remove unwanted groups
|
|
func (m *Mesos) removeGroup(j *map[string]interface{}) {
|
|
var ok bool
|
|
|
|
b := metricsDiff(m.MetricsCol)
|
|
|
|
for _, k := range b {
|
|
for _, v := range masterBlocks(k) {
|
|
if _, ok = (*j)[v]; ok {
|
|
delete((*j), v)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// This should not belong to the object
|
|
func (m *Mesos) gatherMetrics(a string, acc telegraf.Accumulator) error {
|
|
var jsonOut map[string]interface{}
|
|
|
|
host, _, err := net.SplitHostPort(a)
|
|
if err != nil {
|
|
host = a
|
|
a = a + ":5050"
|
|
}
|
|
|
|
tags := map[string]string{
|
|
"server": host,
|
|
}
|
|
|
|
if m.Timeout == 0 {
|
|
log.Println("[mesos] Missing timeout value, setting default value (100ms)")
|
|
m.Timeout = 100
|
|
}
|
|
|
|
ts := strconv.Itoa(m.Timeout) + "ms"
|
|
|
|
resp, err := http.Get("http://" + a + "/metrics/snapshot?timeout=" + ts)
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
data, err := ioutil.ReadAll(resp.Body)
|
|
resp.Body.Close()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err = json.Unmarshal([]byte(data), &jsonOut); err != nil {
|
|
return errors.New("Error decoding JSON response")
|
|
}
|
|
|
|
m.removeGroup(&jsonOut)
|
|
|
|
jf := internal.JSONFlattener{}
|
|
|
|
err = jf.FlattenJSON("", jsonOut)
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
acc.AddFields("mesos", jf.Fields, tags)
|
|
|
|
return nil
|
|
}
|
|
|
|
func init() {
|
|
inputs.Add("mesos", func() telegraf.Input {
|
|
return &Mesos{}
|
|
})
|
|
}
|