telegraf/plugins/inputs/nvidia_smi/nvidia_smi.go

240 lines
6.6 KiB
Go

package nvidia_smi
import (
"encoding/xml"
"fmt"
"os"
"os/exec"
"strconv"
"strings"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs"
)
// measurement is the measurement name under which every GPU metric is added
// to the accumulator.
const measurement = "nvidia_smi"
// NvidiaSMI holds the configuration and the methods for this plugin.
type NvidiaSMI struct {
// BinPath is the path of the nvidia-smi binary that gets executed.
BinPath string
// Timeout is the duration handed to the timed execution of nvidia-smi.
Timeout internal.Duration
}
// Description returns a one-line summary of what the NvidiaSMI plugin collects.
func (smi *NvidiaSMI) Description() string {
	const description = "Pulls statistics from nvidia GPUs attached to the host"
	return description
}
// SampleConfig returns the commented example configuration of the NvidiaSMI
// plugin, listing its optional bin_path and timeout settings.
func (smi *NvidiaSMI) SampleConfig() string {
	const sampleConfig = `
## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
# bin_path = "/usr/bin/nvidia-smi"
## Optional: timeout for GPU polling
# timeout = "5s"
`
	return sampleConfig
}
// Gather implements the telegraf interface. It locates the nvidia-smi binary,
// runs it once and adds one set of tags and fields per reported GPU to acc.
//
// When nothing exists at the configured BinPath, the binary is looked up in
// $PATH via exec.LookPath, as the sample configuration advertises, and the
// resolved location is stored in BinPath so that it is the one executed.
//
// An error is returned when the binary cannot be found, when running it
// fails, or when its output cannot be decoded.
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
	if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
		// Fall back to $PATH before giving up on the configured location.
		resolved, lookErr := exec.LookPath("nvidia-smi")
		if lookErr != nil {
			return fmt.Errorf("nvidia-smi binary not at path %s and not found in $PATH, cannot gather GPU data", smi.BinPath)
		}
		smi.BinPath = resolved
	}

	data, err := smi.pollSMI()
	if err != nil {
		return err
	}

	return gatherNvidiaSMI(data, acc)
}
func init() {
inputs.Add("nvidia_smi", func() telegraf.Input {
return &NvidiaSMI{
BinPath: "/usr/bin/nvidia-smi",
Timeout: internal.Duration{Duration: 5 * time.Second},
}
})
}
// pollSMI runs the binary at BinPath with the arguments "-q -x" through
// internal.CombinedOutputTimeout, using the configured timeout, and returns
// the bytes that call yields. On failure only the error is returned and the
// byte slice is nil.
func (smi *NvidiaSMI) pollSMI() ([]byte, error) {
	query := exec.Command(smi.BinPath, "-q", "-x")

	output, err := internal.CombinedOutputTimeout(query, smi.Timeout.Duration)
	if err == nil {
		return output, nil
	}
	return nil, err
}
// gatherNvidiaSMI decodes ret as the XML report of nvidia-smi and adds the
// tags and fields of every GPU in it to acc under the plugin's measurement
// name. A decoding error is returned as-is, in which case nothing is added.
func gatherNvidiaSMI(ret []byte, acc telegraf.Accumulator) error {
	var report SMI
	if err := xml.Unmarshal(ret, &report); err != nil {
		return err
	}

	for _, m := range report.genTagsFields() {
		acc.AddFields(measurement, m.fields, m.tags)
	}
	return nil
}
// metric pairs the tags and the fields collected for a single GPU.
type metric struct {
// tags holds the string tags attached to the GPU's fields.
tags map[string]string
// fields holds the parsed readings keyed by field name.
fields map[string]interface{}
}
// genTagsFields converts every GPU entry of the decoded report into a metric.
//
// Each metric is tagged with the GPU's position in the report ("index") and
// with every descriptive value that is non-empty. Its fields are the readings
// that setIfUsed manages to parse as numbers; unparsable readings are simply
// absent. The returned slice is non-nil even when the report has no GPUs.
func (s *SMI) genTagsFields() []metric {
	type entry struct{ key, value string }

	metrics := make([]metric, 0, len(s.GPU))
	for i := range s.GPU {
		gpu := &s.GPU[i]

		tags := map[string]string{"index": strconv.Itoa(i)}
		for _, tag := range []entry{
			{"pstate", gpu.PState},
			{"name", gpu.ProdName},
			{"uuid", gpu.UUID},
			{"compute_mode", gpu.ComputeMode},
		} {
			setTagIfUsed(tags, tag.key, tag.value)
		}

		fields := map[string]interface{}{}
		for _, reading := range []entry{
			{"fan_speed", gpu.FanSpeed},
			{"memory_total", gpu.Memory.Total},
			{"memory_used", gpu.Memory.Used},
			{"memory_free", gpu.Memory.Free},
			{"temperature_gpu", gpu.Temp.GPUTemp},
			{"utilization_gpu", gpu.Utilization.GPU},
			{"utilization_memory", gpu.Utilization.Memory},
			{"pcie_link_gen_current", gpu.PCI.LinkInfo.PCIEGen.CurrentLinkGen},
			{"pcie_link_width_current", gpu.PCI.LinkInfo.LinkWidth.CurrentLinkWidth},
			{"encoder_stats_session_count", gpu.Encoder.SessionCount},
			{"encoder_stats_average_fps", gpu.Encoder.AverageFPS},
			{"encoder_stats_average_latency", gpu.Encoder.AverageLatency},
			{"clocks_current_graphics", gpu.Clocks.Graphics},
			{"clocks_current_sm", gpu.Clocks.SM},
			{"clocks_current_memory", gpu.Clocks.Memory},
			{"clocks_current_video", gpu.Clocks.Video},
		} {
			setIfUsed("int", fields, reading.key, reading.value)
		}
		// The power draw is the only reading stored as a float.
		setIfUsed("float", fields, "power_draw", gpu.Power.PowerDraw)

		metrics = append(metrics, metric{tags: tags, fields: fields})
	}
	return metrics
}
// setTagIfUsed stores v under key k in m. An empty v is skipped, leaving m
// untouched.
func setTagIfUsed(m map[string]string, k, v string) {
	if v == "" {
		return
	}
	m[k] = v
}
// setIfUsed parses the first whitespace-separated token of v and, when the
// parse succeeds, stores the number under key k in m.
//
// t selects the conversion: "int" stores an int, "float" stores a float64;
// any other t stores nothing. For the key "pcie_link_width_current" a
// trailing "x" is removed from the token before parsing. A blank v, an empty
// token, or a token that does not parse leaves m untouched.
func setIfUsed(t string, m map[string]interface{}, k, v string) {
	tokens := strings.Fields(v)
	if len(tokens) == 0 {
		return
	}

	token := tokens[0]
	if k == "pcie_link_width_current" {
		token = strings.TrimSuffix(token, "x")
	}
	// Only the suffix trimming above can produce an empty token.
	if token == "" {
		return
	}

	switch t {
	case "int":
		if n, err := strconv.Atoi(token); err == nil {
			m[k] = n
		}
	case "float":
		if f, err := strconv.ParseFloat(token, 64); err == nil {
			m[k] = f
		}
	}
}
// SMI defines the structure for the output of _nvidia-smi -q -x_.
// It is the target the XML report is decoded into.
type SMI struct {
// GPU holds the entries decoded from the <gpu> elements of the report.
GPU GPU `xml:"gpu"`
}
// GPU defines the structure of the GPU portion of the smi output.
// Every value is kept as the text of its XML element. Values marked int or
// float are converted by setIfUsed when metrics are generated; PState,
// ProdName, UUID and ComputeMode are used verbatim as tags.
type GPU []struct {
FanSpeed string `xml:"fan_speed"` // int
Memory MemoryStats `xml:"fb_memory_usage"`
PState string `xml:"performance_state"` // tag "pstate"
Temp TempStats `xml:"temperature"`
ProdName string `xml:"product_name"` // tag "name"
UUID string `xml:"uuid"` // tag "uuid"
ComputeMode string `xml:"compute_mode"` // tag "compute_mode"
Utilization UtilizationStats `xml:"utilization"`
Power PowerReadings `xml:"power_readings"`
PCI PCI `xml:"pci"`
Encoder EncoderStats `xml:"encoder_stats"`
Clocks ClockStats `xml:"clocks"`
}
// MemoryStats defines the structure of the memory portions in the smi output.
// The values are reported as the fields memory_total, memory_used and
// memory_free.
type MemoryStats struct {
Total string `xml:"total"` // int
Used string `xml:"used"` // int
Free string `xml:"free"` // int
}
// TempStats defines the structure of the temperature portion of the smi output.
// The value is reported as the field temperature_gpu.
type TempStats struct {
GPUTemp string `xml:"gpu_temp"` // int
}
// UtilizationStats defines the structure of the utilization portion of the smi output.
// The values are reported as the fields utilization_gpu and utilization_memory.
type UtilizationStats struct {
GPU string `xml:"gpu_util"` // int
Memory string `xml:"memory_util"` // int
}
// PowerReadings defines the structure of the power_readings portion of the smi output.
// The value is reported as the field power_draw, the only float field.
type PowerReadings struct {
PowerDraw string `xml:"power_draw"` // float
}
// PCI defines the structure of the pci portion of the smi output.
// Only the current link generation and the current link width are decoded.
type PCI struct {
// LinkInfo mirrors the <pci_gpu_link_info> element.
LinkInfo struct {
// PCIEGen mirrors the <pcie_gen> element.
PCIEGen struct {
CurrentLinkGen string `xml:"current_link_gen"` // int
} `xml:"pcie_gen"`
// LinkWidth mirrors the <link_widths> element.
LinkWidth struct {
// A trailing "x" is stripped from this value before it is parsed.
CurrentLinkWidth string `xml:"current_link_width"` // int
} `xml:"link_widths"`
} `xml:"pci_gpu_link_info"`
}
// EncoderStats defines the structure of the encoder_stats portion of the smi output.
// The values are reported as fields prefixed with encoder_stats_.
type EncoderStats struct {
SessionCount string `xml:"session_count"` // int
AverageFPS string `xml:"average_fps"` // int
AverageLatency string `xml:"average_latency"` // int
}
// ClockStats defines the structure of the clocks portion of the smi output.
// The values are reported as fields prefixed with clocks_current_.
type ClockStats struct {
Graphics string `xml:"graphics_clock"` // int
SM string `xml:"sm_clock"` // int
Memory string `xml:"mem_clock"` // int
Video string `xml:"video_clock"` // int
}