Collect framework_offers and allocator metrics in mesos input (#5719)

This commit is contained in:
Branden Rolston
2019-08-09 17:27:59 -07:00
committed by Daniel Nelson
parent 337a579dd0
commit f5a4d72382
3 changed files with 400 additions and 204 deletions

View File

@@ -8,6 +8,7 @@ import (
"net/http/httptest"
"net/url"
"os"
"strings"
"testing"
"github.com/influxdata/telegraf/testutil"
@@ -27,194 +28,262 @@ func randUUID() string {
return fmt.Sprintf("%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:])
}
// masterMetricNames lists the names of the master metrics that
// generateMetrics() populates: each name becomes a key of masterMetrics with
// a random float64 value.
//
// The "master/frameworks/..." and "allocator/..." entries are sample names
// for the framework_offers and allocator groups. TestMasterFilter locates
// those groups by name prefix rather than through getMetrics().
var masterMetricNames = []string{
	// resources
	"master/cpus_percent",
	"master/cpus_used",
	"master/cpus_total",
	"master/cpus_revocable_percent",
	"master/cpus_revocable_total",
	"master/cpus_revocable_used",
	"master/disk_percent",
	"master/disk_used",
	"master/disk_total",
	"master/disk_revocable_percent",
	"master/disk_revocable_total",
	"master/disk_revocable_used",
	"master/gpus_percent",
	"master/gpus_used",
	"master/gpus_total",
	"master/gpus_revocable_percent",
	"master/gpus_revocable_total",
	"master/gpus_revocable_used",
	"master/mem_percent",
	"master/mem_used",
	"master/mem_total",
	"master/mem_revocable_percent",
	"master/mem_revocable_total",
	"master/mem_revocable_used",
	// master
	"master/elected",
	"master/uptime_secs",
	// system
	"system/cpus_total",
	"system/load_15min",
	"system/load_5min",
	"system/load_1min",
	"system/mem_free_bytes",
	"system/mem_total_bytes",
	// agents
	"master/slave_registrations",
	"master/slave_removals",
	"master/slave_reregistrations",
	"master/slave_shutdowns_scheduled",
	"master/slave_shutdowns_canceled",
	"master/slave_shutdowns_completed",
	"master/slaves_active",
	"master/slaves_connected",
	"master/slaves_disconnected",
	"master/slaves_inactive",
	"master/slave_unreachable_canceled",
	"master/slave_unreachable_completed",
	"master/slave_unreachable_scheduled",
	"master/slaves_unreachable",
	// frameworks
	"master/frameworks_active",
	"master/frameworks_connected",
	"master/frameworks_disconnected",
	"master/frameworks_inactive",
	"master/outstanding_offers",
	// framework offers
	"master/frameworks/marathon/abc-123/calls",
	"master/frameworks/marathon/abc-123/calls/accept",
	"master/frameworks/marathon/abc-123/events",
	"master/frameworks/marathon/abc-123/events/error",
	"master/frameworks/marathon/abc-123/offers/sent",
	"master/frameworks/marathon/abc-123/operations",
	"master/frameworks/marathon/abc-123/operations/create",
	"master/frameworks/marathon/abc-123/roles/*/suppressed",
	"master/frameworks/marathon/abc-123/subscribed",
	"master/frameworks/marathon/abc-123/tasks/active/task_killing",
	"master/frameworks/marathon/abc-123/tasks/active/task_dropped",
	"master/frameworks/marathon/abc-123/tasks/terminal/task_dropped",
	"master/frameworks/marathon/abc-123/unknown/unknown", // test case for unknown metric type
	// tasks
	"master/tasks_error",
	"master/tasks_failed",
	"master/tasks_finished",
	"master/tasks_killed",
	"master/tasks_lost",
	"master/tasks_running",
	"master/tasks_staging",
	"master/tasks_starting",
	"master/tasks_dropped",
	"master/tasks_gone",
	"master/tasks_gone_by_operator",
	"master/tasks_killing",
	"master/tasks_unreachable",
	// messages
	"master/invalid_executor_to_framework_messages",
	"master/invalid_framework_to_executor_messages",
	"master/invalid_status_update_acknowledgements",
	"master/invalid_status_updates",
	"master/dropped_messages",
	"master/messages_authenticate",
	"master/messages_deactivate_framework",
	"master/messages_decline_offers",
	"master/messages_executor_to_framework",
	"master/messages_exited_executor",
	"master/messages_framework_to_executor",
	"master/messages_kill_task",
	"master/messages_launch_tasks",
	"master/messages_reconcile_tasks",
	"master/messages_register_framework",
	"master/messages_register_slave",
	"master/messages_reregister_framework",
	"master/messages_reregister_slave",
	"master/messages_resource_request",
	"master/messages_revive_offers",
	"master/messages_status_update",
	"master/messages_status_update_acknowledgement",
	"master/messages_unregister_framework",
	"master/messages_unregister_slave",
	"master/messages_update_slave",
	"master/recovery_slave_removals",
	"master/slave_removals/reason_registered",
	"master/slave_removals/reason_unhealthy",
	"master/slave_removals/reason_unregistered",
	"master/valid_framework_to_executor_messages",
	"master/valid_status_update_acknowledgements",
	"master/valid_status_updates",
	"master/task_lost/source_master/reason_invalid_offers",
	"master/task_lost/source_master/reason_slave_removed",
	"master/task_lost/source_slave/reason_executor_terminated",
	"master/valid_executor_to_framework_messages",
	"master/invalid_operation_status_update_acknowledgements",
	"master/messages_operation_status_update_acknowledgement",
	"master/messages_reconcile_operations",
	"master/messages_suppress_offers",
	"master/valid_operation_status_update_acknowledgements",
	// event queue
	"master/event_queue_dispatches",
	"master/event_queue_http_requests",
	"master/event_queue_messages",
	"master/operator_event_stream_subscribers",
	// registrar
	"registrar/log/ensemble_size",
	"registrar/log/recovered",
	"registrar/queued_operations",
	"registrar/registry_size_bytes",
	"registrar/state_fetch_ms",
	"registrar/state_store_ms",
	"registrar/state_store_ms/max",
	"registrar/state_store_ms/min",
	"registrar/state_store_ms/p50",
	"registrar/state_store_ms/p90",
	"registrar/state_store_ms/p95",
	"registrar/state_store_ms/p99",
	"registrar/state_store_ms/p999",
	"registrar/state_store_ms/p9999",
	"registrar/state_store_ms/count",
	// allocator
	"allocator/mesos/allocation_run_ms",
	"allocator/mesos/allocation_run_ms/count",
	"allocator/mesos/allocation_run_ms/max",
	"allocator/mesos/allocation_run_ms/min",
	"allocator/mesos/allocation_run_ms/p50",
	"allocator/mesos/allocation_run_ms/p90",
	"allocator/mesos/allocation_run_ms/p95",
	"allocator/mesos/allocation_run_ms/p99",
	"allocator/mesos/allocation_run_ms/p999",
	"allocator/mesos/allocation_run_ms/p9999",
	"allocator/mesos/allocation_runs",
	"allocator/mesos/allocation_run_latency_ms",
	"allocator/mesos/allocation_run_latency_ms/count",
	"allocator/mesos/allocation_run_latency_ms/max",
	"allocator/mesos/allocation_run_latency_ms/min",
	"allocator/mesos/allocation_run_latency_ms/p50",
	"allocator/mesos/allocation_run_latency_ms/p90",
	"allocator/mesos/allocation_run_latency_ms/p95",
	"allocator/mesos/allocation_run_latency_ms/p99",
	"allocator/mesos/allocation_run_latency_ms/p999",
	"allocator/mesos/allocation_run_latency_ms/p9999",
	"allocator/mesos/roles/*/shares/dominant",
	"allocator/mesos/event_queue_dispatches",
	"allocator/mesos/offer_filters/roles/*/active",
	"allocator/mesos/quota/roles/*/resources/disk/offered_or_allocated",
	"allocator/mesos/quota/roles/*/resources/mem/guarantee",
	"allocator/mesos/quota/roles/*/resources/disk/guarantee",
	"allocator/mesos/resources/cpus/offered_or_allocated",
	"allocator/mesos/resources/cpus/total",
	"allocator/mesos/resources/disk/offered_or_allocated",
	"allocator/mesos/resources/disk/total",
	"allocator/mesos/resources/mem/offered_or_allocated",
	"allocator/mesos/resources/mem/total",
}
// slaveMetricNames lists the names of the slave (agent) metrics that
// generateMetrics() populates: each name becomes a key of slaveMetrics with
// a random float64 value.
var slaveMetricNames = []string{
	// resources
	"slave/cpus_percent",
	"slave/cpus_used",
	"slave/cpus_total",
	"slave/cpus_revocable_percent",
	"slave/cpus_revocable_total",
	"slave/cpus_revocable_used",
	"slave/disk_percent",
	"slave/disk_used",
	"slave/disk_total",
	"slave/disk_revocable_percent",
	"slave/disk_revocable_total",
	"slave/disk_revocable_used",
	"slave/gpus_percent",
	"slave/gpus_used",
	"slave/gpus_total",
	"slave/gpus_revocable_percent",
	"slave/gpus_revocable_total",
	"slave/gpus_revocable_used",
	"slave/mem_percent",
	"slave/mem_used",
	"slave/mem_total",
	"slave/mem_revocable_percent",
	"slave/mem_revocable_total",
	"slave/mem_revocable_used",
	// agent
	"slave/registered",
	"slave/uptime_secs",
	// system
	"system/cpus_total",
	"system/load_15min",
	"system/load_5min",
	"system/load_1min",
	"system/mem_free_bytes",
	"system/mem_total_bytes",
	// executors
	"containerizer/mesos/container_destroy_errors",
	"slave/container_launch_errors",
	"slave/executors_preempted",
	"slave/frameworks_active",
	"slave/executor_directory_max_allowed_age_secs",
	"slave/executors_registering",
	"slave/executors_running",
	"slave/executors_terminated",
	"slave/executors_terminating",
	"slave/recovery_errors",
	// tasks
	"slave/tasks_failed",
	"slave/tasks_finished",
	"slave/tasks_killed",
	"slave/tasks_lost",
	"slave/tasks_running",
	"slave/tasks_staging",
	"slave/tasks_starting",
	// messages
	"slave/invalid_framework_messages",
	"slave/invalid_status_updates",
	"slave/valid_framework_messages",
	"slave/valid_status_updates",
}
func generateMetrics() {
masterMetrics = make(map[string]interface{})
metricNames := []string{
// resources
"master/cpus_percent",
"master/cpus_used",
"master/cpus_total",
"master/cpus_revocable_percent",
"master/cpus_revocable_total",
"master/cpus_revocable_used",
"master/disk_percent",
"master/disk_used",
"master/disk_total",
"master/disk_revocable_percent",
"master/disk_revocable_total",
"master/disk_revocable_used",
"master/gpus_percent",
"master/gpus_used",
"master/gpus_total",
"master/gpus_revocable_percent",
"master/gpus_revocable_total",
"master/gpus_revocable_used",
"master/mem_percent",
"master/mem_used",
"master/mem_total",
"master/mem_revocable_percent",
"master/mem_revocable_total",
"master/mem_revocable_used",
// master
"master/elected",
"master/uptime_secs",
// system
"system/cpus_total",
"system/load_15min",
"system/load_5min",
"system/load_1min",
"system/mem_free_bytes",
"system/mem_total_bytes",
// agents
"master/slave_registrations",
"master/slave_removals",
"master/slave_reregistrations",
"master/slave_shutdowns_scheduled",
"master/slave_shutdowns_canceled",
"master/slave_shutdowns_completed",
"master/slaves_active",
"master/slaves_connected",
"master/slaves_disconnected",
"master/slaves_inactive",
// frameworks
"master/frameworks_active",
"master/frameworks_connected",
"master/frameworks_disconnected",
"master/frameworks_inactive",
"master/outstanding_offers",
// tasks
"master/tasks_error",
"master/tasks_failed",
"master/tasks_finished",
"master/tasks_killed",
"master/tasks_lost",
"master/tasks_running",
"master/tasks_staging",
"master/tasks_starting",
// messages
"master/invalid_executor_to_framework_messages",
"master/invalid_framework_to_executor_messages",
"master/invalid_status_update_acknowledgements",
"master/invalid_status_updates",
"master/dropped_messages",
"master/messages_authenticate",
"master/messages_deactivate_framework",
"master/messages_decline_offers",
"master/messages_executor_to_framework",
"master/messages_exited_executor",
"master/messages_framework_to_executor",
"master/messages_kill_task",
"master/messages_launch_tasks",
"master/messages_reconcile_tasks",
"master/messages_register_framework",
"master/messages_register_slave",
"master/messages_reregister_framework",
"master/messages_reregister_slave",
"master/messages_resource_request",
"master/messages_revive_offers",
"master/messages_status_update",
"master/messages_status_update_acknowledgement",
"master/messages_unregister_framework",
"master/messages_unregister_slave",
"master/messages_update_slave",
"master/recovery_slave_removals",
"master/slave_removals/reason_registered",
"master/slave_removals/reason_unhealthy",
"master/slave_removals/reason_unregistered",
"master/valid_framework_to_executor_messages",
"master/valid_status_update_acknowledgements",
"master/valid_status_updates",
"master/task_lost/source_master/reason_invalid_offers",
"master/task_lost/source_master/reason_slave_removed",
"master/task_lost/source_slave/reason_executor_terminated",
"master/valid_executor_to_framework_messages",
// evgqueue
"master/event_queue_dispatches",
"master/event_queue_http_requests",
"master/event_queue_messages",
// registrar
"registrar/state_fetch_ms",
"registrar/state_store_ms",
"registrar/state_store_ms/max",
"registrar/state_store_ms/min",
"registrar/state_store_ms/p50",
"registrar/state_store_ms/p90",
"registrar/state_store_ms/p95",
"registrar/state_store_ms/p99",
"registrar/state_store_ms/p999",
"registrar/state_store_ms/p9999",
}
for _, k := range metricNames {
for _, k := range masterMetricNames {
masterMetrics[k] = rand.Float64()
}
slaveMetrics = make(map[string]interface{})
metricNames = []string{
// resources
"slave/cpus_percent",
"slave/cpus_used",
"slave/cpus_total",
"slave/cpus_revocable_percent",
"slave/cpus_revocable_total",
"slave/cpus_revocable_used",
"slave/disk_percent",
"slave/disk_used",
"slave/disk_total",
"slave/disk_revocable_percent",
"slave/disk_revocable_total",
"slave/disk_revocable_used",
"slave/gpus_percent",
"slave/gpus_used",
"slave/gpus_total",
"slave/gpus_revocable_percent",
"slave/gpus_revocable_total",
"slave/gpus_revocable_used",
"slave/mem_percent",
"slave/mem_used",
"slave/mem_total",
"slave/mem_revocable_percent",
"slave/mem_revocable_total",
"slave/mem_revocable_used",
// agent
"slave/registered",
"slave/uptime_secs",
// system
"system/cpus_total",
"system/load_15min",
"system/load_5min",
"system/load_1min",
"system/mem_free_bytes",
"system/mem_total_bytes",
// executors
"containerizer/mesos/container_destroy_errors",
"slave/container_launch_errors",
"slave/executors_preempted",
"slave/frameworks_active",
"slave/executor_directory_max_allowed_age_secs",
"slave/executors_registering",
"slave/executors_running",
"slave/executors_terminated",
"slave/executors_terminating",
"slave/recovery_errors",
// tasks
"slave/tasks_failed",
"slave/tasks_finished",
"slave/tasks_killed",
"slave/tasks_lost",
"slave/tasks_running",
"slave/tasks_staging",
"slave/tasks_starting",
// messages
"slave/invalid_framework_messages",
"slave/invalid_status_updates",
"slave/valid_framework_messages",
"slave/valid_status_updates",
}
for _, k := range metricNames {
for _, k := range slaveMetricNames {
slaveMetrics[k] = rand.Float64()
}
@@ -296,7 +365,7 @@ func TestMesosMaster(t *testing.T) {
func TestMasterFilter(t *testing.T) {
m := Mesos{
MasterCols: []string{
"resources", "master", "registrar",
"resources", "master", "registrar", "allocator",
},
}
b := []string{
@@ -306,6 +375,26 @@ func TestMasterFilter(t *testing.T) {
m.filterMetrics(MASTER, &masterMetrics)
// Assert expected metrics are present.
for _, v := range m.MasterCols {
for _, x := range getMetrics(MASTER, v) {
if _, ok := masterMetrics[x]; !ok {
t.Errorf("Didn't find key %s, it should present.", x)
}
}
}
// m.MasterCols includes "allocator", so allocator metrics should be present.
// allocator metrics have unpredictable names, so we can't rely on the list of metrics returned from
// getMetrics(). We have to find them by checking name prefixes.
for _, x := range masterMetricNames {
if strings.HasPrefix(x, "allocator/") {
if _, ok := masterMetrics[x]; !ok {
t.Errorf("Didn't find key %s, it should be present.", x)
}
}
}
// Assert unexpected metrics are not present.
for _, v := range b {
for _, x := range getMetrics(MASTER, v) {
if _, ok := masterMetrics[x]; ok {
@@ -313,11 +402,12 @@ func TestMasterFilter(t *testing.T) {
}
}
}
for _, v := range m.MasterCols {
for _, x := range getMetrics(MASTER, v) {
if _, ok := masterMetrics[x]; !ok {
t.Errorf("Didn't find key %s, it should present.", x)
}
// m.MasterCols does not include "framework_offers", so framework_offers metrics should not be present.
// framework_offers metrics have unpredictable names, so we can't rely on the list of metrics returned from
// getMetrics(). We have to find them by checking name prefixes.
for k := range masterMetrics {
if strings.HasPrefix(k, "master/frameworks/") || strings.HasPrefix(k, "frameworks/") {
t.Errorf("Found key %s, it should be gone.", k)
}
}
}
@@ -339,18 +429,6 @@ func TestMesosSlave(t *testing.T) {
}
acc.AssertContainsFields(t, "mesos", slaveMetrics)
// expectedFields := make(map[string]interface{}, len(slaveTaskMetrics["statistics"].(map[string]interface{}))+1)
// for k, v := range slaveTaskMetrics["statistics"].(map[string]interface{}) {
// expectedFields[k] = v
// }
// expectedFields["executor_id"] = slaveTaskMetrics["executor_id"]
// acc.AssertContainsTaggedFields(
// t,
// "mesos_tasks",
// expectedFields,
// map[string]string{"server": "127.0.0.1", "framework_id": slaveTaskMetrics["framework_id"].(string)})
}
func TestSlaveFilter(t *testing.T) {