Add nvidia_smi input to monitor nvidia GPUs (#4026)
This commit is contained in:
parent
bcf1cf59c1
commit
3046f957d5
|
@ -65,6 +65,7 @@ import (
|
||||||
_ "github.com/influxdata/telegraf/plugins/inputs/nsq_consumer"
|
_ "github.com/influxdata/telegraf/plugins/inputs/nsq_consumer"
|
||||||
_ "github.com/influxdata/telegraf/plugins/inputs/nstat"
|
_ "github.com/influxdata/telegraf/plugins/inputs/nstat"
|
||||||
_ "github.com/influxdata/telegraf/plugins/inputs/ntpq"
|
_ "github.com/influxdata/telegraf/plugins/inputs/ntpq"
|
||||||
|
_ "github.com/influxdata/telegraf/plugins/inputs/nvidia_smi"
|
||||||
_ "github.com/influxdata/telegraf/plugins/inputs/openldap"
|
_ "github.com/influxdata/telegraf/plugins/inputs/openldap"
|
||||||
_ "github.com/influxdata/telegraf/plugins/inputs/opensmtpd"
|
_ "github.com/influxdata/telegraf/plugins/inputs/opensmtpd"
|
||||||
_ "github.com/influxdata/telegraf/plugins/inputs/passenger"
|
_ "github.com/influxdata/telegraf/plugins/inputs/passenger"
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
# `nvidia-smi` Input Plugin
|
||||||
|
|
||||||
|
This plugin queries the [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) binary to pull GPU stats, including memory usage, GPU utilization, temperature, and fan speed.
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
```toml
|
||||||
|
# Pulls statistics from nvidia GPUs attached to the host
|
||||||
|
[[inputs.nvidia_smi]]
|
||||||
|
## Optional: path to nvidia-smi binary, defaults to /usr/bin/nvidia-smi
|
||||||
|
# bin_path = "/usr/bin/nvidia-smi"
|
||||||
|
|
||||||
|
## Optional: timeout for GPU polling
|
||||||
|
# timeout = 5s
|
||||||
|
```
|
||||||
|
|
||||||
|
### Metrics
|
||||||
|
- measurement: `nvidia_smi`
|
||||||
|
- tags
|
||||||
|
- `name` (type of GPU e.g. `GeForce GTX 1070 Ti`)
|
||||||
|
- `compute_mode` (The compute mode of the GPU e.g. `Default`)
|
||||||
|
- `index` (The port index where the GPU is connected to the motherboard e.g. `1`)
|
||||||
|
- `pstate` (Performance state of the GPU e.g. `P0`)
|
||||||
|
- `uuid` (A unique identifier for the GPU e.g. `GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665`)
|
||||||
|
- fields
|
||||||
|
- `fan_speed` (integer, percentage)
|
||||||
|
- `memory_free` (integer, MiB)
|
||||||
|
- `memory_used` (integer, MiB)
|
||||||
|
- `memory_total` (integer, MiB)
|
||||||
|
- `temperature_gpu` (integer, degrees C)
|
||||||
|
- `utilization_gpu` (integer, percentage)
|
||||||
|
- `utilization_memory` (integer, percentage)
|
||||||
|
|
||||||
|
### Sample Query
|
||||||
|
|
||||||
|
The query below could be used to alert on the per-minute average temperature of your GPUs over the last five minutes.
|
||||||
|
|
||||||
|
```
|
||||||
|
SELECT mean("temperature_gpu") FROM "nvidia_smi" WHERE time > now() - 5m GROUP BY time(1m), "index", "name", "host"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Output
|
||||||
|
```
|
||||||
|
nvidia_smi,compute_mode=Default,host=8218cf,index=0,name=GeForce\ GTX\ 1070,pstate=P2,uuid=GPU-823bc202-6279-6f2c-d729-868a30f14d96 fan_speed=100i,memory_free=7563i,memory_total=8112i,memory_used=549i,temperature_gpu=53i,utilization_gpu=100i,utilization_memory=90i 1523991122000000000
|
||||||
|
nvidia_smi,compute_mode=Default,host=8218cf,index=1,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665 fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=50i,utilization_gpu=100i,utilization_memory=85i 1523991122000000000
|
||||||
|
nvidia_smi,compute_mode=Default,host=8218cf,index=2,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-d4cfc28d-0481-8d07-b81a-ddfc63d74adf fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=58i,utilization_gpu=100i,utilization_memory=86i 1523991122000000000
|
||||||
|
```
|
|
@ -0,0 +1,149 @@
|
||||||
|
package nvidia_smi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/influxdata/telegraf"
|
||||||
|
"github.com/influxdata/telegraf/internal"
|
||||||
|
"github.com/influxdata/telegraf/plugins/inputs"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Query configuration shared by every NvidiaSMI instance in this package.
var (
	// measurement is the measurement name used when points are added to the
	// accumulator.
	measurement = "nvidia_smi"

	// metrics is the comma-separated property list handed to nvidia-smi via
	// --query-gpu. parseLine matches output columns to metricNames by
	// position, so the two lists must stay in the same order.
	metrics = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index"

	// metricNames holds one {key, kind} pair per queried property, where
	// kind is either "field" or "tag".
	metricNames = [][]string{
		{"fan_speed", "field"},
		{"memory_total", "field"},
		{"memory_used", "field"},
		{"memory_free", "field"},
		{"pstate", "tag"},
		{"temperature_gpu", "field"},
		{"name", "tag"},
		{"uuid", "tag"},
		{"compute_mode", "tag"},
		{"utilization_gpu", "field"},
		{"utilization_memory", "field"},
		{"index", "tag"},
	}
)
|
||||||
|
|
||||||
|
// NvidiaSMI is the input plugin type; its methods collect GPU statistics by
// running the nvidia-smi binary.
type NvidiaSMI struct {
	// BinPath is the path of the nvidia-smi executable to run.
	BinPath string
	// Timeout is handed to internal.CombinedOutputTimeout when the binary
	// is executed.
	Timeout time.Duration

	// metrics is the property list substituted into --query-gpu.
	metrics string
}
|
||||||
|
|
||||||
|
// Description returns the description of the NvidiaSMI plugin
|
||||||
|
func (smi *NvidiaSMI) Description() string {
|
||||||
|
return "Pulls statistics from nvidia GPUs attached to the host"
|
||||||
|
}
|
||||||
|
|
||||||
|
// SampleConfig returns the sample configuration for the NvidiaSMI plugin
|
||||||
|
func (smi *NvidiaSMI) SampleConfig() string {
|
||||||
|
return `
|
||||||
|
## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
|
||||||
|
# bin_path = /usr/bin/nvidia-smi
|
||||||
|
|
||||||
|
## Optional: timeout for GPU polling
|
||||||
|
# timeout = 5s
|
||||||
|
`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Gather implements the telegraf interface
|
||||||
|
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
||||||
|
|
||||||
|
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
||||||
|
return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := smi.pollSMI()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = gatherNvidiaSMI(data, acc)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
inputs.Add("nvidia_smi", func() telegraf.Input {
|
||||||
|
return &NvidiaSMI{
|
||||||
|
BinPath: "/usr/bin/nvidia-smi",
|
||||||
|
Timeout: 5 * time.Second,
|
||||||
|
metrics: metrics,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (smi *NvidiaSMI) pollSMI() (string, error) {
|
||||||
|
// Construct and execute metrics query
|
||||||
|
opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)}
|
||||||
|
ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return string(ret), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error {
|
||||||
|
// First split the lines up and handle each one
|
||||||
|
scanner := bufio.NewScanner(strings.NewReader(ret))
|
||||||
|
for scanner.Scan() {
|
||||||
|
tags, fields, err := parseLine(scanner.Text())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
acc.AddFields(measurement, fields, tags)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
return fmt.Errorf("Error scanning text %s", ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseLine(line string) (map[string]string, map[string]interface{}, error) {
|
||||||
|
tags := make(map[string]string, 0)
|
||||||
|
fields := make(map[string]interface{}, 0)
|
||||||
|
|
||||||
|
// Next split up the comma delimited metrics
|
||||||
|
met := strings.Split(line, ",")
|
||||||
|
|
||||||
|
// Make sure there are as many metrics in the line as there were queried.
|
||||||
|
if len(met) == len(metricNames) {
|
||||||
|
for i, m := range metricNames {
|
||||||
|
|
||||||
|
// First handle the tags
|
||||||
|
if m[1] == "tag" {
|
||||||
|
tags[m[0]] = strings.TrimSpace(met[i])
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then parse the integers out of the fields
|
||||||
|
out, err := strconv.ParseInt(strings.TrimSpace(met[i]), 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
return tags, fields, err
|
||||||
|
}
|
||||||
|
fields[m[0]] = out
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the tags and fields
|
||||||
|
return tags, fields, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the line is empty return an emptyline error
|
||||||
|
return tags, fields, fmt.Errorf("Different number of metrics returned (%d) than expeced (%d)", len(met), len(metricNames))
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
package nvidia_smi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseLineStandard(t *testing.T) {
|
||||||
|
line := "85, 8114, 553, 7561, P2, 61, GeForce GTX 1070 Ti, GPU-d1911b8a-f5c8-5e66-057c-486561269de8, Default, 100, 93, 1\n"
|
||||||
|
tags, fields, err := parseLine(line)
|
||||||
|
if err != nil {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
if tags["name"] != "GeForce GTX 1070 Ti" {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
if temp, ok := fields["temperature_gpu"].(int); ok && temp == 61 {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseLineEmptyLine(t *testing.T) {
|
||||||
|
line := "\n"
|
||||||
|
_, _, err := parseLine(line)
|
||||||
|
if err == nil {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseLineBad(t *testing.T) {
|
||||||
|
line := "the quick brown fox jumped over the lazy dog"
|
||||||
|
_, _, err := parseLine(line)
|
||||||
|
if err == nil {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue