2018-04-17 20:40:55 +00:00
|
|
|
package nvidia_smi
|
|
|
|
|
|
|
|
import (
|
2019-11-13 00:12:15 +00:00
|
|
|
"encoding/xml"
|
2018-04-17 20:40:55 +00:00
|
|
|
"fmt"
|
|
|
|
"os"
|
|
|
|
"os/exec"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/influxdata/telegraf"
|
|
|
|
"github.com/influxdata/telegraf/internal"
|
|
|
|
"github.com/influxdata/telegraf/plugins/inputs"
|
|
|
|
)
|
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
// measurement is the measurement name passed to the accumulator for every
// GPU metric this plugin emits.
const measurement = "nvidia_smi"
|
2018-04-17 20:40:55 +00:00
|
|
|
|
|
|
|
// NvidiaSMI holds the methods for this plugin
|
|
|
|
type NvidiaSMI struct {
|
|
|
|
BinPath string
|
2018-04-24 21:40:19 +00:00
|
|
|
Timeout internal.Duration
|
2018-04-17 20:40:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Description returns the description of the NvidiaSMI plugin
|
|
|
|
func (smi *NvidiaSMI) Description() string {
|
|
|
|
return "Pulls statistics from nvidia GPUs attached to the host"
|
|
|
|
}
|
|
|
|
|
|
|
|
// SampleConfig returns the sample configuration for the NvidiaSMI plugin
|
|
|
|
func (smi *NvidiaSMI) SampleConfig() string {
|
|
|
|
return `
|
2018-07-11 03:20:44 +00:00
|
|
|
## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
|
|
|
|
# bin_path = "/usr/bin/nvidia-smi"
|
2018-04-17 20:40:55 +00:00
|
|
|
|
2018-07-11 03:20:44 +00:00
|
|
|
## Optional: timeout for GPU polling
|
|
|
|
# timeout = "5s"
|
2018-04-17 20:40:55 +00:00
|
|
|
`
|
|
|
|
}
|
|
|
|
|
|
|
|
// Gather implements the telegraf interface
|
|
|
|
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
|
|
|
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
|
|
|
return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
|
|
|
|
}
|
|
|
|
|
|
|
|
data, err := smi.pollSMI()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
err = gatherNvidiaSMI(data, acc)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func init() {
|
|
|
|
inputs.Add("nvidia_smi", func() telegraf.Input {
|
|
|
|
return &NvidiaSMI{
|
|
|
|
BinPath: "/usr/bin/nvidia-smi",
|
2018-04-24 21:40:19 +00:00
|
|
|
Timeout: internal.Duration{Duration: 5 * time.Second},
|
2018-04-17 20:40:55 +00:00
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
func (smi *NvidiaSMI) pollSMI() ([]byte, error) {
|
2018-04-17 20:40:55 +00:00
|
|
|
// Construct and execute metrics query
|
2019-11-13 00:12:15 +00:00
|
|
|
ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), smi.Timeout.Duration)
|
2018-04-17 20:40:55 +00:00
|
|
|
if err != nil {
|
2019-11-13 00:12:15 +00:00
|
|
|
return nil, err
|
2018-04-17 20:40:55 +00:00
|
|
|
}
|
2019-11-13 00:12:15 +00:00
|
|
|
return ret, nil
|
2018-04-17 20:40:55 +00:00
|
|
|
}
|
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
func gatherNvidiaSMI(ret []byte, acc telegraf.Accumulator) error {
|
|
|
|
smi := &SMI{}
|
|
|
|
err := xml.Unmarshal(ret, smi)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2018-04-17 20:40:55 +00:00
|
|
|
}
|
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
metrics := smi.genTagsFields()
|
|
|
|
|
|
|
|
for _, metric := range metrics {
|
|
|
|
acc.AddFields(measurement, metric.fields, metric.tags)
|
2018-04-17 20:40:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
// metric pairs the tag set and the field set gathered for a single GPU.
type metric struct {
	// tags maps tag keys to their string values.
	tags map[string]string
	// fields maps field keys to their parsed values.
	fields map[string]interface{}
}
|
2018-04-17 20:40:55 +00:00
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
func (s *SMI) genTagsFields() []metric {
|
|
|
|
metrics := []metric{}
|
|
|
|
for i, gpu := range s.GPU {
|
|
|
|
tags := map[string]string{
|
|
|
|
"index": strconv.Itoa(i),
|
|
|
|
}
|
|
|
|
fields := map[string]interface{}{}
|
|
|
|
|
|
|
|
setTagIfUsed(tags, "pstate", gpu.PState)
|
|
|
|
setTagIfUsed(tags, "name", gpu.ProdName)
|
|
|
|
setTagIfUsed(tags, "uuid", gpu.UUID)
|
|
|
|
setTagIfUsed(tags, "compute_mode", gpu.ComputeMode)
|
|
|
|
|
|
|
|
setIfUsed("int", fields, "fan_speed", gpu.FanSpeed)
|
|
|
|
setIfUsed("int", fields, "memory_total", gpu.Memory.Total)
|
|
|
|
setIfUsed("int", fields, "memory_used", gpu.Memory.Used)
|
|
|
|
setIfUsed("int", fields, "memory_free", gpu.Memory.Free)
|
|
|
|
setIfUsed("int", fields, "temperature_gpu", gpu.Temp.GPUTemp)
|
|
|
|
setIfUsed("int", fields, "utilization_gpu", gpu.Utilization.GPU)
|
|
|
|
setIfUsed("int", fields, "utilization_memory", gpu.Utilization.Memory)
|
|
|
|
setIfUsed("int", fields, "pcie_link_gen_current", gpu.PCI.LinkInfo.PCIEGen.CurrentLinkGen)
|
|
|
|
setIfUsed("int", fields, "pcie_link_width_current", gpu.PCI.LinkInfo.LinkWidth.CurrentLinkWidth)
|
|
|
|
setIfUsed("int", fields, "encoder_stats_session_count", gpu.Encoder.SessionCount)
|
|
|
|
setIfUsed("int", fields, "encoder_stats_average_fps", gpu.Encoder.AverageFPS)
|
|
|
|
setIfUsed("int", fields, "encoder_stats_average_latency", gpu.Encoder.AverageLatency)
|
|
|
|
setIfUsed("int", fields, "clocks_current_graphics", gpu.Clocks.Graphics)
|
|
|
|
setIfUsed("int", fields, "clocks_current_sm", gpu.Clocks.SM)
|
|
|
|
setIfUsed("int", fields, "clocks_current_memory", gpu.Clocks.Memory)
|
|
|
|
setIfUsed("int", fields, "clocks_current_video", gpu.Clocks.Video)
|
|
|
|
|
|
|
|
setIfUsed("float", fields, "power_draw", gpu.Power.PowerDraw)
|
|
|
|
metrics = append(metrics, metric{tags, fields})
|
|
|
|
}
|
|
|
|
return metrics
|
|
|
|
}
|
2018-04-17 20:40:55 +00:00
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
// setTagIfUsed stores v under key k in m. An empty v is skipped, leaving m
// untouched.
func setTagIfUsed(m map[string]string, k, v string) {
	if v == "" {
		return
	}
	m[k] = v
}
|
2018-04-17 20:40:55 +00:00
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
// setIfUsed parses the first whitespace-separated token of v and, when the
// parse succeeds, stores the result under key k in m.
//
// t selects the conversion: "int" uses strconv.Atoi and "float" uses
// strconv.ParseFloat with 64-bit precision. Any other t stores nothing. A
// blank v, or a token that does not parse, is skipped silently. For the key
// "pcie_link_width_current" a trailing "x" is removed from the token before
// parsing.
func setIfUsed(t string, m map[string]interface{}, k, v string) {
	tokens := strings.Fields(v)
	if len(tokens) == 0 {
		return
	}

	raw := tokens[0]
	if k == "pcie_link_width_current" {
		raw = strings.TrimSuffix(raw, "x")
	}
	// Only the suffix trim above can leave an empty token.
	if raw == "" {
		return
	}

	if t == "float" {
		if parsed, err := strconv.ParseFloat(raw, 64); err == nil {
			m[k] = parsed
		}
	} else if t == "int" {
		if parsed, err := strconv.Atoi(raw); err == nil {
			m[k] = parsed
		}
	}
}
|
|
|
|
|
|
|
|
// SMI defines the structure for the output of _nvidia-smi -q -x_.
|
|
|
|
type SMI struct {
|
|
|
|
GPU GPU `xml:"gpu"`
|
|
|
|
}
|
|
|
|
|
|
|
|
// GPU defines the structure of the GPU portion of the smi output.
|
|
|
|
type GPU []struct {
|
|
|
|
FanSpeed string `xml:"fan_speed"` // int
|
|
|
|
Memory MemoryStats `xml:"fb_memory_usage"`
|
|
|
|
PState string `xml:"performance_state"`
|
|
|
|
Temp TempStats `xml:"temperature"`
|
|
|
|
ProdName string `xml:"product_name"`
|
|
|
|
UUID string `xml:"uuid"`
|
|
|
|
ComputeMode string `xml:"compute_mode"`
|
|
|
|
Utilization UtilizationStats `xml:"utilization"`
|
|
|
|
Power PowerReadings `xml:"power_readings"`
|
|
|
|
PCI PCI `xml:"pci"`
|
|
|
|
Encoder EncoderStats `xml:"encoder_stats"`
|
|
|
|
Clocks ClockStats `xml:"clocks"`
|
|
|
|
}
|
|
|
|
|
|
|
|
// MemoryStats holds the memory readings from the smi output as raw text.
// Each value is later parsed as an int.
type MemoryStats struct {
	// Total is the text of the <total> element.
	Total string `xml:"total"`
	// Used is the text of the <used> element.
	Used string `xml:"used"`
	// Free is the text of the <free> element.
	Free string `xml:"free"`
}
|
|
|
|
|
|
|
|
// TempStats holds the temperature reading from the smi output as raw text.
type TempStats struct {
	// GPUTemp is the text of the <gpu_temp> element; later parsed as an int.
	GPUTemp string `xml:"gpu_temp"`
}
|
|
|
|
|
|
|
|
// UtilizationStats holds the utilization readings from the smi output as raw
// text. Each value is later parsed as an int.
type UtilizationStats struct {
	// GPU is the text of the <gpu_util> element.
	GPU string `xml:"gpu_util"`
	// Memory is the text of the <memory_util> element.
	Memory string `xml:"memory_util"`
}
|
|
|
|
|
|
|
|
// PowerReadings holds the power_readings portion of the smi output as raw
// text.
type PowerReadings struct {
	// PowerDraw is the text of the <power_draw> element; later parsed as a
	// float.
	PowerDraw string `xml:"power_draw"`
}
|
|
|
|
|
|
|
|
// PCI holds the pci portion of the smi output. Only the current link
// generation and current link width are decoded, both as raw text that is
// later parsed as an int.
type PCI struct {
	// LinkInfo mirrors the <pci_gpu_link_info> element.
	LinkInfo struct {
		// PCIEGen mirrors the <pcie_gen> element.
		PCIEGen struct {
			CurrentLinkGen string `xml:"current_link_gen"`
		} `xml:"pcie_gen"`
		// LinkWidth mirrors the <link_widths> element.
		LinkWidth struct {
			CurrentLinkWidth string `xml:"current_link_width"`
		} `xml:"link_widths"`
	} `xml:"pci_gpu_link_info"`
}
|
|
|
|
|
|
|
|
// EncoderStats holds the encoder_stats portion of the smi output as raw
// text. Each value is later parsed as an int.
type EncoderStats struct {
	// SessionCount is the text of the <session_count> element.
	SessionCount string `xml:"session_count"`
	// AverageFPS is the text of the <average_fps> element.
	AverageFPS string `xml:"average_fps"`
	// AverageLatency is the text of the <average_latency> element.
	AverageLatency string `xml:"average_latency"`
}
|
2018-04-17 20:40:55 +00:00
|
|
|
|
2019-11-13 00:12:15 +00:00
|
|
|
// ClockStats holds the clocks portion of the smi output as raw text. Each
// value is later parsed as an int.
type ClockStats struct {
	// Graphics is the text of the <graphics_clock> element.
	Graphics string `xml:"graphics_clock"`
	// SM is the text of the <sm_clock> element.
	SM string `xml:"sm_clock"`
	// Memory is the text of the <mem_clock> element.
	Memory string `xml:"mem_clock"`
	// Video is the text of the <video_clock> element.
	Video string `xml:"video_clock"`
}
|