Add nvidia_smi input to monitor nvidia GPUs (#4026)
This commit is contained in:
		
							parent
							
								
									bcf1cf59c1
								
							
						
					
					
						commit
						3046f957d5
					
				|  | @ -65,6 +65,7 @@ import ( | |||
| 	_ "github.com/influxdata/telegraf/plugins/inputs/nsq_consumer" | ||||
| 	_ "github.com/influxdata/telegraf/plugins/inputs/nstat" | ||||
| 	_ "github.com/influxdata/telegraf/plugins/inputs/ntpq" | ||||
| 	_ "github.com/influxdata/telegraf/plugins/inputs/nvidia_smi" | ||||
| 	_ "github.com/influxdata/telegraf/plugins/inputs/openldap" | ||||
| 	_ "github.com/influxdata/telegraf/plugins/inputs/opensmtpd" | ||||
| 	_ "github.com/influxdata/telegraf/plugins/inputs/passenger" | ||||
|  |  | |||
|  | @ -0,0 +1,47 @@ | |||
| # `nvidia-smi` Input Plugin | ||||
| 
 | ||||
| This plugin queries the [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) binary to pull GPU stats, including memory and GPU usage, temperature, fan speed, and more. | ||||
| 
 | ||||
| ### Configuration | ||||
| 
 | ||||
| ```toml | ||||
| # Pulls statistics from nvidia GPUs attached to the host | ||||
| [[inputs.nvidia_smi]] | ||||
| ## Optional: path to nvidia-smi binary, defaults to /usr/bin/nvidia-smi | ||||
| # bin_path = "/usr/bin/nvidia-smi" | ||||
| 
 | ||||
| ## Optional: timeout for GPU polling | ||||
| # timeout = 5s | ||||
| ``` | ||||
| 
 | ||||
| ### Metrics | ||||
| - measurement: `nvidia_smi` | ||||
|   - tags | ||||
|     - `name` (type of GPU e.g. `GeForce GTX 1070 Ti`) | ||||
|     - `compute_mode` (The compute mode of the GPU e.g. `Default`) | ||||
|     - `index` (The port index where the GPU is connected to the motherboard e.g. `1`) | ||||
|     - `pstate` (Performance state of the GPU, from `P0` (maximum performance) to `P12` (minimum performance), e.g. `P0`) | ||||
|     - `uuid` (A unique identifier for the GPU e.g. `GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665`) | ||||
|   - fields | ||||
|     - `fan_speed` (integer, percentage) | ||||
|     - `memory_free` (integer, MiB) | ||||
|     - `memory_used` (integer, MiB) | ||||
|     - `memory_total` (integer, MiB) | ||||
|     - `temperature_gpu` (integer, degrees C) | ||||
|     - `utilization_gpu` (integer, percentage) | ||||
|     - `utilization_memory` (integer, percentage) | ||||
| 
 | ||||
| ### Sample Query | ||||
| 
 | ||||
| The query below could be used to alert on the average temperature of your GPUs, computed per minute over the last five minutes | ||||
| 
 | ||||
| ``` | ||||
| SELECT mean("temperature_gpu") FROM "nvidia_smi" WHERE time > now() - 5m GROUP BY time(1m), "index", "name", "host" | ||||
| ``` | ||||
| 
 | ||||
| ### Example Output | ||||
| ``` | ||||
| nvidia_smi,compute_mode=Default,host=8218cf,index=0,name=GeForce\ GTX\ 1070,pstate=P2,uuid=GPU-823bc202-6279-6f2c-d729-868a30f14d96 fan_speed=100i,memory_free=7563i,memory_total=8112i,memory_used=549i,temperature_gpu=53i,utilization_gpu=100i,utilization_memory=90i 1523991122000000000 | ||||
| nvidia_smi,compute_mode=Default,host=8218cf,index=1,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665 fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=50i,utilization_gpu=100i,utilization_memory=85i 1523991122000000000 | ||||
| nvidia_smi,compute_mode=Default,host=8218cf,index=2,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-d4cfc28d-0481-8d07-b81a-ddfc63d74adf fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=58i,utilization_gpu=100i,utilization_memory=86i 1523991122000000000 | ||||
| ``` | ||||
|  | @ -0,0 +1,149 @@ | |||
| package nvidia_smi | ||||
| 
 | ||||
| import ( | ||||
| 	"bufio" | ||||
| 	"fmt" | ||||
| 	"os" | ||||
| 	"os/exec" | ||||
| 	"strconv" | ||||
| 	"strings" | ||||
| 	"time" | ||||
| 
 | ||||
| 	"github.com/influxdata/telegraf" | ||||
| 	"github.com/influxdata/telegraf/internal" | ||||
| 	"github.com/influxdata/telegraf/plugins/inputs" | ||||
| ) | ||||
| 
 | ||||
var (
	// measurement is the measurement name every GPU metric is reported under.
	measurement = "nvidia_smi"

	// metrics is the comma separated property list passed to
	// `nvidia-smi --query-gpu=`. nvidia-smi prints one CSV column per
	// property, in this order, so the list must stay in sync with metricNames.
	metrics = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index"

	// metricNames describes the CSV columns position by position. Each entry
	// is a {name, kind} pair where kind is "tag" (kept as a string tag) or
	// "field" (parsed as an integer field).
	metricNames = [][]string{
		{"fan_speed", "field"},
		{"memory_total", "field"},
		{"memory_used", "field"},
		{"memory_free", "field"},
		{"pstate", "tag"},
		{"temperature_gpu", "field"},
		{"name", "tag"},
		{"uuid", "tag"},
		{"compute_mode", "tag"},
		{"utilization_gpu", "field"},
		{"utilization_memory", "field"},
		{"index", "tag"},
	}
)
| 
 | ||||
// NvidiaSMI is the input plugin that collects GPU statistics by invoking the
// nvidia-smi binary.
type NvidiaSMI struct {
	// BinPath is the location of the nvidia-smi executable.
	BinPath string
	// Timeout bounds how long a single nvidia-smi invocation may run.
	Timeout time.Duration

	// metrics is the property list sent to nvidia-smi via --query-gpu.
	metrics string
}
| 
 | ||||
| // Description returns the description of the NvidiaSMI plugin
 | ||||
| func (smi *NvidiaSMI) Description() string { | ||||
| 	return "Pulls statistics from nvidia GPUs attached to the host" | ||||
| } | ||||
| 
 | ||||
| // SampleConfig returns the sample configuration for the NvidiaSMI plugin
 | ||||
| func (smi *NvidiaSMI) SampleConfig() string { | ||||
| 	return ` | ||||
| ## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath | ||||
| # bin_path = /usr/bin/nvidia-smi | ||||
| 
 | ||||
| ## Optional: timeout for GPU polling | ||||
| # timeout = 5s | ||||
| ` | ||||
| } | ||||
| 
 | ||||
| // Gather implements the telegraf interface
 | ||||
| func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error { | ||||
| 
 | ||||
| 	if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) { | ||||
| 		return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath) | ||||
| 	} | ||||
| 
 | ||||
| 	data, err := smi.pollSMI() | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	err = gatherNvidiaSMI(data, acc) | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 
 | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func init() { | ||||
| 	inputs.Add("nvidia_smi", func() telegraf.Input { | ||||
| 		return &NvidiaSMI{ | ||||
| 			BinPath: "/usr/bin/nvidia-smi", | ||||
| 			Timeout: 5 * time.Second, | ||||
| 			metrics: metrics, | ||||
| 		} | ||||
| 	}) | ||||
| } | ||||
| 
 | ||||
| func (smi *NvidiaSMI) pollSMI() (string, error) { | ||||
| 	// Construct and execute metrics query
 | ||||
| 	opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)} | ||||
| 	ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout) | ||||
| 	if err != nil { | ||||
| 		return "", err | ||||
| 	} | ||||
| 	return string(ret), nil | ||||
| } | ||||
| 
 | ||||
| func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error { | ||||
| 	// First split the lines up and handle each one
 | ||||
| 	scanner := bufio.NewScanner(strings.NewReader(ret)) | ||||
| 	for scanner.Scan() { | ||||
| 		tags, fields, err := parseLine(scanner.Text()) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 		acc.AddFields(measurement, fields, tags) | ||||
| 	} | ||||
| 
 | ||||
| 	if err := scanner.Err(); err != nil { | ||||
| 		return fmt.Errorf("Error scanning text %s", ret) | ||||
| 	} | ||||
| 
 | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func parseLine(line string) (map[string]string, map[string]interface{}, error) { | ||||
| 	tags := make(map[string]string, 0) | ||||
| 	fields := make(map[string]interface{}, 0) | ||||
| 
 | ||||
| 	// Next split up the comma delimited metrics
 | ||||
| 	met := strings.Split(line, ",") | ||||
| 
 | ||||
| 	// Make sure there are as many metrics in the line as there were queried.
 | ||||
| 	if len(met) == len(metricNames) { | ||||
| 		for i, m := range metricNames { | ||||
| 
 | ||||
| 			// First handle the tags
 | ||||
| 			if m[1] == "tag" { | ||||
| 				tags[m[0]] = strings.TrimSpace(met[i]) | ||||
| 				continue | ||||
| 			} | ||||
| 
 | ||||
| 			// Then parse the integers out of the fields
 | ||||
| 			out, err := strconv.ParseInt(strings.TrimSpace(met[i]), 10, 64) | ||||
| 			if err != nil { | ||||
| 				return tags, fields, err | ||||
| 			} | ||||
| 			fields[m[0]] = out | ||||
| 		} | ||||
| 
 | ||||
| 		// Return the tags and fields
 | ||||
| 		return tags, fields, nil | ||||
| 	} | ||||
| 
 | ||||
| 	// If the line is empty return an emptyline error
 | ||||
| 	return tags, fields, fmt.Errorf("Different number of metrics returned (%d) than expeced (%d)", len(met), len(metricNames)) | ||||
| } | ||||
|  | @ -0,0 +1,35 @@ | |||
| package nvidia_smi | ||||
| 
 | ||||
| import ( | ||||
| 	"testing" | ||||
| ) | ||||
| 
 | ||||
| func TestParseLineStandard(t *testing.T) { | ||||
| 	line := "85, 8114, 553, 7561, P2, 61, GeForce GTX 1070 Ti, GPU-d1911b8a-f5c8-5e66-057c-486561269de8, Default, 100, 93, 1\n" | ||||
| 	tags, fields, err := parseLine(line) | ||||
| 	if err != nil { | ||||
| 		t.Fail() | ||||
| 	} | ||||
| 	if tags["name"] != "GeForce GTX 1070 Ti" { | ||||
| 		t.Fail() | ||||
| 	} | ||||
| 	if temp, ok := fields["temperature_gpu"].(int); ok && temp == 61 { | ||||
| 		t.Fail() | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func TestParseLineEmptyLine(t *testing.T) { | ||||
| 	line := "\n" | ||||
| 	_, _, err := parseLine(line) | ||||
| 	if err == nil { | ||||
| 		t.Fail() | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func TestParseLineBad(t *testing.T) { | ||||
| 	line := "the quick brown fox jumped over the lazy dog" | ||||
| 	_, _, err := parseLine(line) | ||||
| 	if err == nil { | ||||
| 		t.Fail() | ||||
| 	} | ||||
| } | ||||
		Loading…
	
		Reference in New Issue