Ceph Cluster Performance Input Plugin

The existing ceph input plugin only has access to the local admin daemon socket
on the local host, and as such has access to a limited subset of data.  This
extends the plugin to use CLI commands to get access to the full spread of Ceph
data.  This patch collects global OSD map and IO statistics, PG state and per pool
IO and utilization statistics.

closes #1513
This commit is contained in:
Simon Murray
2016-07-13 09:50:38 +00:00
committed by Cameron Sparr
parent 5c5984bfe1
commit 38d877165a
5 changed files with 417 additions and 34 deletions

View File

@@ -23,33 +23,15 @@ const (
)
type Ceph struct {
CephBinary string
OsdPrefix string
MonPrefix string
SocketDir string
SocketSuffix string
}
func (c *Ceph) setDefaults() {
if c.CephBinary == "" {
c.CephBinary = "/usr/bin/ceph"
}
if c.OsdPrefix == "" {
c.OsdPrefix = osdPrefix
}
if c.MonPrefix == "" {
c.MonPrefix = monPrefix
}
if c.SocketDir == "" {
c.SocketDir = "/var/run/ceph"
}
if c.SocketSuffix == "" {
c.SocketSuffix = sockSuffix
}
CephBinary string
OsdPrefix string
MonPrefix string
SocketDir string
SocketSuffix string
CephUser string
CephConfig string
GatherAdminSocketStats bool
GatherClusterStats bool
}
func (c *Ceph) Description() string {
@@ -57,6 +39,10 @@ func (c *Ceph) Description() string {
}
var sampleConfig = `
## This is the recommended interval to poll. Too frequent and you will lose
## data points due to timeouts during rebalancing and recovery
interval = '1m'
## All configuration values are optional, defaults are shown below
## location of ceph binary
@@ -71,6 +57,18 @@ var sampleConfig = `
## suffix used to identify socket files
socket_suffix = "asok"
## Ceph user to authenticate as
ceph_user = "client.admin"
## Ceph configuration to use to locate the cluster
ceph_config = "/etc/ceph/ceph.conf"
## Whether to gather statistics via the admin socket
gather_admin_socket_stats = true
## Whether to gather statistics via ceph commands
gather_cluster_stats = true
`
func (c *Ceph) SampleConfig() string {
@@ -78,7 +76,22 @@ func (c *Ceph) SampleConfig() string {
}
func (c *Ceph) Gather(acc telegraf.Accumulator) error {
c.setDefaults()
if c.GatherAdminSocketStats {
if err := c.gatherAdminSocketStats(acc); err != nil {
return err
}
}
if c.GatherClusterStats {
if err := c.gatherClusterStats(acc); err != nil {
return err
}
}
return nil
}
func (c *Ceph) gatherAdminSocketStats(acc telegraf.Accumulator) error {
sockets, err := findSockets(c)
if err != nil {
return fmt.Errorf("failed to find sockets at path '%s': %v", c.SocketDir, err)
@@ -104,8 +117,46 @@ func (c *Ceph) Gather(acc telegraf.Accumulator) error {
return nil
}
func (c *Ceph) gatherClusterStats(acc telegraf.Accumulator) error {
jobs := []struct {
command string
parser func(telegraf.Accumulator, string) error
}{
{"status", decodeStatus},
{"df", decodeDf},
{"osd pool stats", decodeOsdPoolStats},
}
// For each job, execute against the cluster, parse and accumulate the data points
for _, job := range jobs {
output, err := c.exec(job.command)
if err != nil {
return fmt.Errorf("error executing command: %v", err)
}
err = job.parser(acc, output)
if err != nil {
return fmt.Errorf("error parsing output: %v", err)
}
}
return nil
}
func init() {
inputs.Add(measurement, func() telegraf.Input { return &Ceph{} })
c := Ceph{
CephBinary: "/usr/bin/ceph",
OsdPrefix: osdPrefix,
MonPrefix: monPrefix,
SocketDir: "/var/run/ceph",
SocketSuffix: sockSuffix,
CephUser: "client.admin",
CephConfig: "/etc/ceph/ceph.conf",
GatherAdminSocketStats: true,
GatherClusterStats: false,
}
inputs.Add(measurement, func() telegraf.Input { return &c })
}
var perfDump = func(binary string, socket *socket) (string, error) {
@@ -247,3 +298,192 @@ func flatten(data interface{}) []*metric {
return metrics
}
func (c *Ceph) exec(command string) (string, error) {
cmdArgs := []string{"--conf", c.CephConfig, "--name", c.CephUser, "--format", "json"}
cmdArgs = append(cmdArgs, strings.Split(command, " ")...)
cmd := exec.Command(c.CephBinary, cmdArgs...)
var out bytes.Buffer
cmd.Stdout = &out
err := cmd.Run()
if err != nil {
return "", fmt.Errorf("error running ceph %v: %s", command, err)
}
output := out.String()
// Ceph doesn't sanitize its output, and may return invalid JSON. Patch this
// up for them, as having some inaccurate data is better than none.
output = strings.Replace(output, "-inf", "0", -1)
output = strings.Replace(output, "inf", "0", -1)
return output, nil
}
func decodeStatus(acc telegraf.Accumulator, input string) error {
data := make(map[string]interface{})
err := json.Unmarshal([]byte(input), &data)
if err != nil {
return fmt.Errorf("failed to parse json: '%s': %v", input, err)
}
err = decodeStatusOsdmap(acc, data)
if err != nil {
return err
}
err = decodeStatusPgmap(acc, data)
if err != nil {
return err
}
err = decodeStatusPgmapState(acc, data)
if err != nil {
return err
}
return nil
}
func decodeStatusOsdmap(acc telegraf.Accumulator, data map[string]interface{}) error {
osdmap, ok := data["osdmap"].(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode osdmap", measurement)
}
fields, ok := osdmap["osdmap"].(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode osdmap", measurement)
}
acc.AddFields("ceph_osdmap", fields, map[string]string{})
return nil
}
func decodeStatusPgmap(acc telegraf.Accumulator, data map[string]interface{}) error {
pgmap, ok := data["pgmap"].(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode pgmap", measurement)
}
fields := make(map[string]interface{})
for key, value := range pgmap {
switch value.(type) {
case float64:
fields[key] = value
}
}
acc.AddFields("ceph_pgmap", fields, map[string]string{})
return nil
}
func decodeStatusPgmapState(acc telegraf.Accumulator, data map[string]interface{}) error {
pgmap, ok := data["pgmap"].(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode pgmap", measurement)
}
fields := make(map[string]interface{})
for key, value := range pgmap {
switch value.(type) {
case []interface{}:
if key != "pgs_by_state" {
continue
}
for _, state := range value.([]interface{}) {
state_map, ok := state.(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode pg state", measurement)
}
state_name, ok := state_map["state_name"].(string)
if !ok {
return fmt.Errorf("WARNING %s - unable to decode pg state name", measurement)
}
state_count, ok := state_map["count"].(float64)
if !ok {
return fmt.Errorf("WARNING %s - unable to decode pg state count", measurement)
}
fields[state_name] = state_count
}
}
}
acc.AddFields("ceph_pgmap_state", fields, map[string]string{})
return nil
}
func decodeDf(acc telegraf.Accumulator, input string) error {
data := make(map[string]interface{})
err := json.Unmarshal([]byte(input), &data)
if err != nil {
return fmt.Errorf("failed to parse json: '%s': %v", input, err)
}
// ceph.usage: records global utilization and number of objects
stats_fields, ok := data["stats"].(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode df stats", measurement)
}
acc.AddFields("ceph_usage", stats_fields, map[string]string{})
// ceph.pool.usage: records per pool utilization and number of objects
pools, ok := data["pools"].([]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode df pools", measurement)
}
for _, pool := range pools {
pool_map, ok := pool.(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode df pool", measurement)
}
pool_name, ok := pool_map["name"].(string)
if !ok {
return fmt.Errorf("WARNING %s - unable to decode df pool name", measurement)
}
fields, ok := pool_map["stats"].(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode df pool stats", measurement)
}
tags := map[string]string{
"name": pool_name,
}
acc.AddFields("ceph_pool_usage", fields, tags)
}
return nil
}
func decodeOsdPoolStats(acc telegraf.Accumulator, input string) error {
data := make([]map[string]interface{}, 0)
err := json.Unmarshal([]byte(input), &data)
if err != nil {
return fmt.Errorf("failed to parse json: '%s': %v", input, err)
}
// ceph.pool.stats: records pre pool IO and recovery throughput
for _, pool := range data {
pool_name, ok := pool["pool_name"].(string)
if !ok {
return fmt.Errorf("WARNING %s - unable to decode osd pool stats name", measurement)
}
// Note: the 'recovery' object looks broken (in hammer), so it's omitted
objects := []string{
"client_io_rate",
"recovery_rate",
}
fields := make(map[string]interface{})
for _, object := range objects {
perfdata, ok := pool[object].(map[string]interface{})
if !ok {
return fmt.Errorf("WARNING %s - unable to decode osd pool stats", measurement)
}
for key, value := range perfdata {
fields[key] = value
}
}
tags := map[string]string{
"name": pool_name,
}
acc.AddFields("ceph_pool_stats", fields, tags)
}
return nil
}