176 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			176 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			Go
		
	
	
	
| package nvidia_smi
 | |
| 
 | |
| import (
 | |
| 	"bufio"
 | |
| 	"fmt"
 | |
| 	"os"
 | |
| 	"os/exec"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"time"
 | |
| 
 | |
| 	"github.com/influxdata/telegraf"
 | |
| 	"github.com/influxdata/telegraf/internal"
 | |
| 	"github.com/influxdata/telegraf/plugins/inputs"
 | |
| )
 | |
| 
 | |
// measurement is the measurement name used for every GPU row handed to the
// accumulator.
var measurement = "nvidia_smi"

// metrics is the comma-separated property list placed after --query-gpu= when
// nvidia-smi is invoked. Its order must stay in step with metricNames, because
// output columns are matched to names purely by position.
var metrics = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index,power.draw,pcie.link.gen.current,pcie.link.width.current,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,clocks.current.graphics,clocks.current.sm,clocks.current.memory,clocks.current.video"

// metricNames describes each output column in query order. Every entry is a
// {name, kind} pair where kind is "tag", "integer" or "float"; parseLine uses
// the kind to decide whether a column becomes a tag or a parsed field.
var metricNames = [][]string{
	// Fan and memory.
	{"fan_speed", "integer"},
	{"memory_total", "integer"},
	{"memory_used", "integer"},
	{"memory_free", "integer"},

	// Device state and identity.
	{"pstate", "tag"},
	{"temperature_gpu", "integer"},
	{"name", "tag"},
	{"uuid", "tag"},
	{"compute_mode", "tag"},

	// Utilization.
	{"utilization_gpu", "integer"},
	{"utilization_memory", "integer"},
	{"index", "tag"},

	// Power and PCIe link.
	{"power_draw", "float"},
	{"pcie_link_gen_current", "integer"},
	{"pcie_link_width_current", "integer"},

	// Encoder statistics.
	{"encoder_stats_session_count", "integer"},
	{"encoder_stats_average_fps", "integer"},
	{"encoder_stats_average_latency", "integer"},

	// Clocks.
	{"clocks_current_graphics", "integer"},
	{"clocks_current_sm", "integer"},
	{"clocks_current_memory", "integer"},
	{"clocks_current_video", "integer"},
}
 | |
| 
 | |
| // NvidiaSMI holds the methods for this plugin
 | |
| type NvidiaSMI struct {
 | |
| 	BinPath string
 | |
| 	Timeout internal.Duration
 | |
| 
 | |
| 	metrics string
 | |
| }
 | |
| 
 | |
| // Description returns the description of the NvidiaSMI plugin
 | |
| func (smi *NvidiaSMI) Description() string {
 | |
| 	return "Pulls statistics from nvidia GPUs attached to the host"
 | |
| }
 | |
| 
 | |
| // SampleConfig returns the sample configuration for the NvidiaSMI plugin
 | |
| func (smi *NvidiaSMI) SampleConfig() string {
 | |
| 	return `
 | |
|   ## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
 | |
|   # bin_path = "/usr/bin/nvidia-smi"
 | |
| 
 | |
|   ## Optional: timeout for GPU polling
 | |
|   # timeout = "5s"
 | |
| `
 | |
| }
 | |
| 
 | |
| // Gather implements the telegraf interface
 | |
| func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
 | |
| 
 | |
| 	if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
 | |
| 		return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
 | |
| 	}
 | |
| 
 | |
| 	data, err := smi.pollSMI()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	err = gatherNvidiaSMI(data, acc)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func init() {
 | |
| 	inputs.Add("nvidia_smi", func() telegraf.Input {
 | |
| 		return &NvidiaSMI{
 | |
| 			BinPath: "/usr/bin/nvidia-smi",
 | |
| 			Timeout: internal.Duration{Duration: 5 * time.Second},
 | |
| 			metrics: metrics,
 | |
| 		}
 | |
| 	})
 | |
| }
 | |
| 
 | |
| func (smi *NvidiaSMI) pollSMI() (string, error) {
 | |
| 	// Construct and execute metrics query
 | |
| 	opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)}
 | |
| 	ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout.Duration)
 | |
| 	if err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 	return string(ret), nil
 | |
| }
 | |
| 
 | |
| func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error {
 | |
| 	// First split the lines up and handle each one
 | |
| 	scanner := bufio.NewScanner(strings.NewReader(ret))
 | |
| 	for scanner.Scan() {
 | |
| 		tags, fields, err := parseLine(scanner.Text())
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		acc.AddFields(measurement, fields, tags)
 | |
| 	}
 | |
| 
 | |
| 	if err := scanner.Err(); err != nil {
 | |
| 		return fmt.Errorf("Error scanning text %s", ret)
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func parseLine(line string) (map[string]string, map[string]interface{}, error) {
 | |
| 	tags := make(map[string]string, 0)
 | |
| 	fields := make(map[string]interface{}, 0)
 | |
| 
 | |
| 	// Next split up the comma delimited metrics
 | |
| 	met := strings.Split(line, ",")
 | |
| 
 | |
| 	// Make sure there are as many metrics in the line as there were queried.
 | |
| 	if len(met) == len(metricNames) {
 | |
| 		for i, m := range metricNames {
 | |
| 			col := strings.TrimSpace(met[i])
 | |
| 
 | |
| 			// Handle the tags
 | |
| 			if m[1] == "tag" {
 | |
| 				tags[m[0]] = col
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			if strings.Contains(col, "[Not Supported]") {
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			// Parse the integers
 | |
| 			if m[1] == "integer" {
 | |
| 				out, err := strconv.ParseInt(col, 10, 64)
 | |
| 				if err != nil {
 | |
| 					return tags, fields, err
 | |
| 				}
 | |
| 				fields[m[0]] = out
 | |
| 			}
 | |
| 
 | |
| 			// Parse the floats
 | |
| 			if m[1] == "float" {
 | |
| 				out, err := strconv.ParseFloat(col, 64)
 | |
| 				if err != nil {
 | |
| 					return tags, fields, err
 | |
| 				}
 | |
| 				fields[m[0]] = out
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		// Return the tags and fields
 | |
| 		return tags, fields, nil
 | |
| 	}
 | |
| 
 | |
| 	// If the line is empty return an emptyline error
 | |
| 	return tags, fields, fmt.Errorf("Different number of metrics returned (%d) than expeced (%d)", len(met), len(metricNames))
 | |
| }
 |