From 160c96ccfee0a9df3246bbd540891068cd95e7a0 Mon Sep 17 00:00:00 2001 From: Jack Zampolin Date: Tue, 17 Apr 2018 13:40:55 -0700 Subject: [PATCH] Add nvidia_smi input to monitor nvidia GPUs (#4026) --- plugins/inputs/all/all.go | 1 + plugins/inputs/nvidia_smi/README.md | 47 ++++++ plugins/inputs/nvidia_smi/nvidia_smi.go | 149 +++++++++++++++++++ plugins/inputs/nvidia_smi/nvidia_smi_test.go | 35 +++++ 4 files changed, 232 insertions(+) create mode 100644 plugins/inputs/nvidia_smi/README.md create mode 100644 plugins/inputs/nvidia_smi/nvidia_smi.go create mode 100644 plugins/inputs/nvidia_smi/nvidia_smi_test.go diff --git a/plugins/inputs/all/all.go b/plugins/inputs/all/all.go index e3264ef8b..440d1e9a5 100644 --- a/plugins/inputs/all/all.go +++ b/plugins/inputs/all/all.go @@ -65,6 +65,7 @@ import ( _ "github.com/influxdata/telegraf/plugins/inputs/nsq_consumer" _ "github.com/influxdata/telegraf/plugins/inputs/nstat" _ "github.com/influxdata/telegraf/plugins/inputs/ntpq" + _ "github.com/influxdata/telegraf/plugins/inputs/nvidia_smi" _ "github.com/influxdata/telegraf/plugins/inputs/openldap" _ "github.com/influxdata/telegraf/plugins/inputs/opensmtpd" _ "github.com/influxdata/telegraf/plugins/inputs/passenger" diff --git a/plugins/inputs/nvidia_smi/README.md b/plugins/inputs/nvidia_smi/README.md new file mode 100644 index 000000000..84b8527f2 --- /dev/null +++ b/plugins/inputs/nvidia_smi/README.md @@ -0,0 +1,47 @@ +# `nvidia-smi` Input Plugin + +This plugin uses a query on the [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) binary to pull GPU stats including memory and GPU usage, temp and other. 
+ +### Configuration + +```toml +# Pulls statistics from nvidia GPUs attached to the host +[[inputs.nvidia_smi]] +## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath +# bin_path = "/usr/bin/nvidia-smi" + +## Optional: timeout for GPU polling +# timeout = "5s" +``` + +### Metrics +- measurement: `nvidia_smi` + - tags + - `name` (type of GPU e.g. `GeForce GTX 1070 Ti`) + - `compute_mode` (The compute mode of the GPU e.g. `Default`) + - `index` (The port index where the GPU is connected to the motherboard e.g. `1`) + - `pstate` (Performance state of the GPU e.g. `P0`) + - `uuid` (A unique identifier for the GPU e.g. `GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665`) + - fields + - `fan_speed` (integer, percentage) + - `memory_free` (integer, MiB) + - `memory_used` (integer, MiB) + - `memory_total` (integer, MiB) + - `temperature_gpu` (integer, degrees C) + - `utilization_gpu` (integer, percentage) + - `utilization_memory` (integer, percentage) + +### Sample Query + +The query below returns the per-minute average temperature of each GPU over the last 5 minutes and could be used as the basis for an alert: + +``` +SELECT mean("temperature_gpu") FROM "nvidia_smi" WHERE time > now() - 5m GROUP BY time(1m), "index", "name", "host" +``` + +### Example Output +``` +nvidia_smi,compute_mode=Default,host=8218cf,index=0,name=GeForce\ GTX\ 1070,pstate=P2,uuid=GPU-823bc202-6279-6f2c-d729-868a30f14d96 fan_speed=100i,memory_free=7563i,memory_total=8112i,memory_used=549i,temperature_gpu=53i,utilization_gpu=100i,utilization_memory=90i 1523991122000000000 +nvidia_smi,compute_mode=Default,host=8218cf,index=1,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665 fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=50i,utilization_gpu=100i,utilization_memory=85i 1523991122000000000 +nvidia_smi,compute_mode=Default,host=8218cf,index=2,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-d4cfc28d-0481-8d07-b81a-ddfc63d74adf 
fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=58i,utilization_gpu=100i,utilization_memory=86i 1523991122000000000
+```
diff --git a/plugins/inputs/nvidia_smi/nvidia_smi.go b/plugins/inputs/nvidia_smi/nvidia_smi.go
new file mode 100644
index 000000000..0cf9bd9e7
--- /dev/null
+++ b/plugins/inputs/nvidia_smi/nvidia_smi.go
@@ -0,0 +1,167 @@
+package nvidia_smi
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/internal"
+	"github.com/influxdata/telegraf/plugins/inputs"
+)
+
+var (
+	measurement = "nvidia_smi"
+	// metrics is the column list passed to --query-gpu. Its order must
+	// match metricNames, which is applied to the output by position.
+	metrics = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index"
+	// metricNames gives, for each queried column, the name it is emitted
+	// under and whether it becomes a "tag" or an integer "field".
+	metricNames = [][]string{
+		{"fan_speed", "field"},
+		{"memory_total", "field"},
+		{"memory_used", "field"},
+		{"memory_free", "field"},
+		{"pstate", "tag"},
+		{"temperature_gpu", "field"},
+		{"name", "tag"},
+		{"uuid", "tag"},
+		{"compute_mode", "tag"},
+		{"utilization_gpu", "field"},
+		{"utilization_memory", "field"},
+		{"index", "tag"},
+	}
+)
+
+// NvidiaSMI holds the methods for this plugin
+type NvidiaSMI struct {
+	BinPath string
+	// Timeout is an internal.Duration so it can be set from a TOML string such as "5s"
+	Timeout internal.Duration
+
+	metrics string
+}
+
+// Description returns the description of the NvidiaSMI plugin
+func (smi *NvidiaSMI) Description() string {
+	return "Pulls statistics from nvidia GPUs attached to the host"
+}
+
+// SampleConfig returns the sample configuration for the NvidiaSMI plugin
+func (smi *NvidiaSMI) SampleConfig() string {
+	return `
+## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
+# bin_path = "/usr/bin/nvidia-smi"
+
+## Optional: timeout for GPU polling
+# timeout = "5s"
+`
+}
+
+// Gather implements the telegraf interface. If nothing exists at BinPath
+// the binary is looked up in $PATH, as the sample config promises, and the
+// resolved path is remembered for later calls.
+func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
+	if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
+		binPath, lookErr := exec.LookPath("nvidia-smi")
+		if lookErr != nil {
+			return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
+		}
+		smi.BinPath = binPath
+	}
+
+	data, err := smi.pollSMI()
+	if err != nil {
+		return err
+	}
+
+	return gatherNvidiaSMI(data, acc)
+}
+
+func init() {
+	inputs.Add("nvidia_smi", func() telegraf.Input {
+		return &NvidiaSMI{
+			BinPath: "/usr/bin/nvidia-smi",
+			Timeout: internal.Duration{Duration: 5 * time.Second},
+			metrics: metrics,
+		}
+	})
+}
+
+// pollSMI runs nvidia-smi with the configured metrics query and returns its
+// combined output, giving up after Timeout
+func (smi *NvidiaSMI) pollSMI() (string, error) {
+	// Construct and execute metrics query
+	opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)}
+	ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout.Duration)
+	if err != nil {
+		return "", err
+	}
+	return string(ret), nil
+}
+
+// gatherNvidiaSMI parses the CSV output of nvidia-smi, one GPU per line, and
+// adds one metric per line to the accumulator
+func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error {
+	// First split the lines up and handle each one
+	scanner := bufio.NewScanner(strings.NewReader(ret))
+	for scanner.Scan() {
+		tags, fields, err := parseLine(scanner.Text())
+		if err != nil {
+			return err
+		}
+		acc.AddFields(measurement, fields, tags)
+	}
+
+	if err := scanner.Err(); err != nil {
+		return fmt.Errorf("error scanning text %s: %v", ret, err)
+	}
+
+	return nil
+}
+
+// parseLine splits one CSV line of nvidia-smi output into tags and integer
+// fields according to metricNames. It returns an error, and nil maps, when
+// the number of columns differs from the number of queried metrics or a
+// field value is not an integer.
+func parseLine(line string) (map[string]string, map[string]interface{}, error) {
+	// Split up the comma delimited metrics
+	met := strings.Split(line, ",")
+
+	// Make sure there are as many metrics in the line as there were queried.
+	if len(met) != len(metricNames) {
+		return nil, nil, fmt.Errorf("different number of metrics returned (%d) than expected (%d)", len(met), len(metricNames))
+	}
+
+	tags := make(map[string]string)
+	fields := make(map[string]interface{})
+	for i, m := range metricNames {
+		col := strings.TrimSpace(met[i])
+
+		// First handle the tags
+		if m[1] == "tag" {
+			tags[m[0]] = col
+			continue
+		}
+
+		// nvidia-smi prints a bracketed placeholder instead of a number
+		// when a GPU does not expose a metric; skip that field rather
+		// than failing the whole line.
+		if col == "[Not Supported]" || col == "[N/A]" {
+			continue
+		}
+
+		// Then parse the integers out of the fields
+		out, err := strconv.ParseInt(col, 10, 64)
+		if err != nil {
+			return nil, nil, fmt.Errorf("parsing %s value %q: %v", m[0], col, err)
+		}
+		fields[m[0]] = out
+	}
+
+	return tags, fields, nil
+}
diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go
new file mode 100644
index 000000000..62ddee3b8
--- /dev/null
+++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go
@@ -0,0 +1,46 @@
+package nvidia_smi
+
+import (
+	"testing"
+)
+
+func TestParseLineStandard(t *testing.T) {
+	line := "85, 8114, 553, 7561, P2, 61, GeForce GTX 1070 Ti, GPU-d1911b8a-f5c8-5e66-057c-486561269de8, Default, 100, 93, 1\n"
+	tags, fields, err := parseLine(line)
+	if err != nil {
+		t.Fail()
+	}
+	if tags["name"] != "GeForce GTX 1070 Ti" {
+		t.Fail()
+	}
+	if temp, ok := fields["temperature_gpu"].(int64); !ok || temp != 61 {
+		t.Fail()
+	}
+}
+
+func TestParseLineEmptyLine(t *testing.T) {
+	line := "\n"
+	_, _, err := parseLine(line)
+	if err == nil {
+		t.Fail()
+	}
+}
+
+func TestParseLineBad(t *testing.T) {
+	line := "the quick brown fox jumped over the lazy dog"
+	_, _, err := parseLine(line)
+	if err == nil {
+		t.Fail()
+	}
+}
+
+func TestParseLineNotSupported(t *testing.T) {
+	line := "[Not Supported], 7606, 0, 7606, P0, 38, Tesla P4, GPU-d1911b8a-f5c8-5e66-057c-486561269de8, Default, 0, 0, 0\n"
+	_, fields, err := parseLine(line)
+	if err != nil {
+		t.Fail()
+	}
+	if _, ok := fields["fan_speed"]; ok {
+		t.Fail()
+	}
+}