Update nvidia-smi input to use xml (#6639)

This commit is contained in:
Greg 2019-11-12 17:12:15 -07:00 committed by Daniel Nelson
parent 55b78a5f66
commit 2cf5116d14
2 changed files with 240 additions and 137 deletions

View File

@ -1,7 +1,7 @@
package nvidia_smi
import (
"bufio"
"encoding/xml"
"fmt"
"os"
"os/exec"
@ -14,41 +14,12 @@ import (
"github.com/influxdata/telegraf/plugins/inputs"
)
// Settings used by the CSV query variant of the plugin.
var (
// measurement is the measurement name handed to acc.AddFields for every parsed line.
measurement = "nvidia_smi"
// metrics is the comma-separated property list that pollSMI interpolates into
// the --query-gpu option (via the NvidiaSMI.metrics field set in init).
metrics = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index,power.draw,pcie.link.gen.current,pcie.link.width.current,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,clocks.current.graphics,clocks.current.sm,clocks.current.memory,clocks.current.video"
// metricNames describes each CSV column, in the same order as the properties
// listed in metrics. Every entry is {output key, kind}; parseLine stores
// "tag" columns as tags and parses "integer" / "float" columns into fields.
metricNames = [][]string{
{"fan_speed", "integer"},
{"memory_total", "integer"},
{"memory_used", "integer"},
{"memory_free", "integer"},
{"pstate", "tag"},
{"temperature_gpu", "integer"},
{"name", "tag"},
{"uuid", "tag"},
{"compute_mode", "tag"},
{"utilization_gpu", "integer"},
{"utilization_memory", "integer"},
{"index", "tag"},
{"power_draw", "float"},
{"pcie_link_gen_current", "integer"},
{"pcie_link_width_current", "integer"},
{"encoder_stats_session_count", "integer"},
{"encoder_stats_average_fps", "integer"},
{"encoder_stats_average_latency", "integer"},
{"clocks_current_graphics", "integer"},
{"clocks_current_sm", "integer"},
{"clocks_current_memory", "integer"},
{"clocks_current_video", "integer"},
}
)
// measurement is the measurement name under which gatherNvidiaSMI adds every
// GPU's fields and tags to the accumulator.
const measurement = "nvidia_smi"
// NvidiaSMI holds the methods for this plugin
type NvidiaSMI struct {
// BinPath is the location of the nvidia-smi executable. Gather checks that it
// exists and pollSMI executes it. init defaults it to /usr/bin/nvidia-smi.
BinPath string
// Timeout is passed to internal.CombinedOutputTimeout when pollSMI runs the
// command. init defaults it to 5 seconds.
Timeout internal.Duration
// metrics is the property list that the CSV variant of pollSMI interpolates
// into the --query-gpu option.
metrics string
}
// Description returns the description of the NvidiaSMI plugin
@ -69,7 +40,6 @@ func (smi *NvidiaSMI) SampleConfig() string {
// Gather implements the telegraf interface
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
}
@ -92,93 +62,178 @@ func init() {
return &NvidiaSMI{
BinPath: "/usr/bin/nvidia-smi",
Timeout: internal.Duration{Duration: 5 * time.Second},
metrics: metrics,
}
})
}
// pollSMI executes the binary at smi.BinPath through
// internal.CombinedOutputTimeout, passing smi.Timeout.Duration, and returns the
// command output, or the error reported by that call.
//
// NOTE(review): two revisions of this function are interleaved in this span.
// One builds a CSV query (--format=noheader,nounits,csv with --query-gpu) and
// returns the output as a string; the other runs `-q -x` and returns the raw
// []byte, which is the form the XML-decoding gatherNvidiaSMI consumes.
func (smi *NvidiaSMI) pollSMI() (string, error) {
func (smi *NvidiaSMI) pollSMI() ([]byte, error) {
// Construct and execute metrics query
opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)}
ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout.Duration)
ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), smi.Timeout.Duration)
if err != nil {
return "", err
return nil, err
}
return string(ret), nil
return ret, nil
}
// gatherNvidiaSMI converts nvidia-smi output into measurements on acc.
//
// NOTE(review): two revisions of this function are interleaved in this span.
// The string variant scans the output line by line, runs each line through
// parseLine and adds the resulting fields and tags. The []byte variant
// unmarshals the XML document into an SMI value and adds one measurement per
// entry returned by genTagsFields; an unmarshal failure is returned as-is.
func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error {
// First split the lines up and handle each one
scanner := bufio.NewScanner(strings.NewReader(ret))
for scanner.Scan() {
tags, fields, err := parseLine(scanner.Text())
if err != nil {
return err
}
acc.AddFields(measurement, fields, tags)
func gatherNvidiaSMI(ret []byte, acc telegraf.Accumulator) error {
smi := &SMI{}
err := xml.Unmarshal(ret, smi)
if err != nil {
return err
}
if err := scanner.Err(); err != nil {
return fmt.Errorf("Error scanning text %s", ret)
metrics := smi.genTagsFields()
for _, metric := range metrics {
acc.AddFields(measurement, metric.fields, metric.tags)
}
return nil
}
func parseLine(line string) (map[string]string, map[string]interface{}, error) {
tags := make(map[string]string, 0)
fields := make(map[string]interface{}, 0)
// metric pairs the tag set and the field set that genTagsFields builds for one
// GPU; gatherNvidiaSMI passes both to acc.AddFields together.
type metric struct {
tags map[string]string
fields map[string]interface{}
}
// Next split up the comma delimited metrics
met := strings.Split(line, ",")
// Make sure there are as many metrics in the line as there were queried.
if len(met) == len(metricNames) {
for i, m := range metricNames {
col := strings.TrimSpace(met[i])
// Handle the tags
if m[1] == "tag" {
tags[m[0]] = col
continue
}
// In some cases we may not be able to get data.
// One such case is when the memory is overclocked.
// nvidia-smi reads the max supported memory clock from the stock value.
// If the current memory clock is greater than the max detected memory clock then we receive [Unknown Error] as a value.
// For example, the stock max memory clock speed on a 2080 Ti is 7000 MHz which nvidia-smi detects.
// The user has overclocked their memory using an offset of +1000 so under load the memory clock reaches 8000 MHz.
// Now when nvidia-smi tries to read the current memory clock it fails and spits back [Unknown Error] as the value.
// This value will break the parsing logic below unless it is accounted for here.
if strings.Contains(col, "[Not Supported]") || strings.Contains(col, "[Unknown Error]") {
continue
}
// Parse the integers
if m[1] == "integer" {
out, err := strconv.ParseInt(col, 10, 64)
if err != nil {
return tags, fields, err
}
fields[m[0]] = out
}
// Parse the floats
if m[1] == "float" {
out, err := strconv.ParseFloat(col, 64)
if err != nil {
return tags, fields, err
}
fields[m[0]] = out
}
// genTagsFields converts every GPU entry of the parsed nvidia-smi document
// into one metric.
//
// Each metric is tagged with "index" (the entry's position in s.GPU) plus
// whichever of pstate, name, uuid and compute_mode are non-empty. Numeric
// readings become fields only when setIfUsed can parse them, so readings
// reported as e.g. "N/A" are left out rather than causing an error.
//
// The returned slice is never nil; it is empty when the document has no GPUs.
func (s *SMI) genTagsFields() []metric {
	metrics := make([]metric, 0, len(s.GPU))
	for i, gpu := range s.GPU {
		tags := map[string]string{
			"index": strconv.Itoa(i),
		}
		fields := map[string]interface{}{}

		setTagIfUsed(tags, "pstate", gpu.PState)
		setTagIfUsed(tags, "name", gpu.ProdName)
		setTagIfUsed(tags, "uuid", gpu.UUID)
		setTagIfUsed(tags, "compute_mode", gpu.ComputeMode)

		setIfUsed("int", fields, "fan_speed", gpu.FanSpeed)
		setIfUsed("int", fields, "memory_total", gpu.Memory.Total)
		setIfUsed("int", fields, "memory_used", gpu.Memory.Used)
		setIfUsed("int", fields, "memory_free", gpu.Memory.Free)
		setIfUsed("int", fields, "temperature_gpu", gpu.Temp.GPUTemp)
		setIfUsed("int", fields, "utilization_gpu", gpu.Utilization.GPU)
		setIfUsed("int", fields, "utilization_memory", gpu.Utilization.Memory)
		setIfUsed("int", fields, "pcie_link_gen_current", gpu.PCI.LinkInfo.PCIEGen.CurrentLinkGen)
		setIfUsed("int", fields, "pcie_link_width_current", gpu.PCI.LinkInfo.LinkWidth.CurrentLinkWidth)
		setIfUsed("int", fields, "encoder_stats_session_count", gpu.Encoder.SessionCount)
		setIfUsed("int", fields, "encoder_stats_average_fps", gpu.Encoder.AverageFPS)
		setIfUsed("int", fields, "encoder_stats_average_latency", gpu.Encoder.AverageLatency)
		setIfUsed("int", fields, "clocks_current_graphics", gpu.Clocks.Graphics)
		setIfUsed("int", fields, "clocks_current_sm", gpu.Clocks.SM)
		setIfUsed("int", fields, "clocks_current_memory", gpu.Clocks.Memory)
		setIfUsed("int", fields, "clocks_current_video", gpu.Clocks.Video)
		setIfUsed("float", fields, "power_draw", gpu.Power.PowerDraw)

		metrics = append(metrics, metric{tags: tags, fields: fields})
	}
	return metrics
}
// setTagIfUsed records v under key k in the tag map m. Empty values are
// skipped so that attributes missing from the nvidia-smi output never produce
// empty tags.
func setTagIfUsed(m map[string]string, k, v string) {
	if v == "" {
		return
	}
	m[k] = v
}
// setIfUsed parses a raw nvidia-smi reading and, when it is usable, stores the
// number in the field map m under key k.
//
// Only the first whitespace-separated token of v is considered, which drops
// unit suffixes such as "MiB", "%", "C" or "MHz". For the key
// "pcie_link_width_current" a trailing "x" (as in "16x") is removed as well.
//
// t selects the target type: "int" stores an int, "float" stores a float64;
// any other value of t stores nothing. Blank input and tokens that do not
// parse (for example "N/A") are skipped silently, leaving m unchanged.
func setIfUsed(t string, m map[string]interface{}, k, v string) {
	vals := strings.Fields(v)
	if len(vals) < 1 {
		return
	}

	val := vals[0]
	if k == "pcie_link_width_current" {
		val = strings.TrimSuffix(val, "x")
	}

	// The parsers reject the empty string themselves, so no separate
	// emptiness check is needed before calling them.
	switch t {
	case "float":
		if f, err := strconv.ParseFloat(val, 64); err == nil {
			m[k] = f
		}
	case "int":
		if i, err := strconv.Atoi(val); err == nil {
			m[k] = i
		}
	}
}
// SMI defines the structure for the output of _nvidia-smi -q -x_.
// gatherNvidiaSMI unmarshals the whole XML document into it.
type SMI struct {
// GPU holds the decoded <gpu> elements of the document.
GPU GPU `xml:"gpu"`
}
// GPU defines the structure of the GPU portion of the smi output.
// It is a slice with one element per GPU; genTagsFields uses an element's
// position as the "index" tag. All readings are kept as the raw XML text and
// converted later by setTagIfUsed (tags) and setIfUsed (numeric fields).
type GPU []struct {
FanSpeed string `xml:"fan_speed"` // int
Memory MemoryStats `xml:"fb_memory_usage"`
PState string `xml:"performance_state"` // reported as the "pstate" tag
Temp TempStats `xml:"temperature"`
ProdName string `xml:"product_name"` // reported as the "name" tag
UUID string `xml:"uuid"` // reported as the "uuid" tag
ComputeMode string `xml:"compute_mode"` // reported as the "compute_mode" tag
Utilization UtilizationStats `xml:"utilization"`
Power PowerReadings `xml:"power_readings"`
PCI PCI `xml:"pci"`
Encoder EncoderStats `xml:"encoder_stats"`
Clocks ClockStats `xml:"clocks"`
}
// MemoryStats defines the structure of the memory portions in the smi output.
// It maps the <fb_memory_usage> element. Values are raw text such as
// "4096 MiB" in the test payload; setIfUsed keeps only the leading number.
type MemoryStats struct {
Total string `xml:"total"` // int
Used string `xml:"used"` // int
Free string `xml:"free"` // int
}
// TempStats defines the structure of the temperature portion of the smi output.
// It maps the <temperature> element; the test payload carries "39 C", which is
// reported as the integer field temperature_gpu.
type TempStats struct {
GPUTemp string `xml:"gpu_temp"` // int
}
// UtilizationStats defines the structure of the utilization portion of the smi output.
// It maps the <utilization> element; values look like "0 %" in the test
// payload and are reported as utilization_gpu and utilization_memory.
type UtilizationStats struct {
GPU string `xml:"gpu_util"` // int
Memory string `xml:"memory_util"` // int
}
// PowerReadings defines the structure of the power_readings portion of the smi output.
// The reading may be non-numeric ("N/A" in the test payload); in that case
// setIfUsed fails to parse it and no power_draw field is emitted.
type PowerReadings struct {
PowerDraw string `xml:"power_draw"` // float
}
// PCI defines the structure of the pci portion of the smi output.
// Only the current link generation and link width under <pci_gpu_link_info>
// are mapped.
type PCI struct {
LinkInfo struct {
PCIEGen struct {
CurrentLinkGen string `xml:"current_link_gen"` // int
} `xml:"pcie_gen"`
LinkWidth struct {
// Carries an "x" suffix ("16x" in the test payload), which setIfUsed
// strips for the pcie_link_width_current key before parsing.
CurrentLinkWidth string `xml:"current_link_width"` // int
} `xml:"link_widths"`
} `xml:"pci_gpu_link_info"`
}
// EncoderStats defines the structure of the encoder_stats portion of the smi output.
// The three values are reported as the integer fields
// encoder_stats_session_count, encoder_stats_average_fps and
// encoder_stats_average_latency.
type EncoderStats struct {
SessionCount string `xml:"session_count"` // int
AverageFPS string `xml:"average_fps"` // int
AverageLatency string `xml:"average_latency"` // int
}
// ClockStats defines the structure of the clocks portion of the smi output.
// Values are raw text such as "135 MHz" in the test payload; they are reported
// as the integer fields clocks_current_graphics, clocks_current_sm,
// clocks_current_memory and clocks_current_video.
type ClockStats struct {
Graphics string `xml:"graphics_clock"` // int
SM string `xml:"sm_clock"` // int
Memory string `xml:"mem_clock"` // int
Video string `xml:"video_clock"` // int
}

View File

@ -1,51 +1,99 @@
package nvidia_smi
import (
"fmt"
"testing"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)
// TestParseLineStandard checks that a well-formed CSV line from nvidia-smi is
// split into the expected tag and field values.
func TestParseLineStandard(t *testing.T) {
	line := "41, 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"
	tags, fields, err := parseLine(line)
	if err != nil {
		t.Fatalf("parseLine returned an unexpected error: %v", err)
	}
	if got := tags["name"]; got != "GeForce RTX 2080 Ti" {
		t.Fatalf("name tag = %q, want %q", got, "GeForce RTX 2080 Ti")
	}
	// parseLine stores integer columns as int64 (strconv.ParseInt). Asserting
	// on int would always yield ok == false and silently skip the comparison,
	// so a missing or wrongly typed field must fail the test too.
	temp, ok := fields["temperature_gpu"].(int64)
	if !ok || temp != 32 {
		t.Fatalf("temperature_gpu = %#v, want int64(32)", fields["temperature_gpu"])
	}
}
// payload is a trimmed sample of `nvidia-smi -q -x` output for a single GPU,
// used as input for TestGatherSMI. power_draw is deliberately "N/A", so no
// power_draw field is expected from it.
var payload = []byte(`<?xml version="1.0" ?>
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v10.dtd">
<nvidia_smi_log>
<gpu id="00000000:01:00.0">
<product_name>GeForce GTX 1070 Ti</product_name>
<uuid>GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665</uuid>
<pci>
<pci_gpu_link_info>
<pcie_gen>
<current_link_gen>1</current_link_gen>
</pcie_gen>
<link_widths>
<current_link_width>16x</current_link_width>
</link_widths>
</pci_gpu_link_info>
</pci>
<fan_speed>100 %</fan_speed>
<performance_state>P8</performance_state>
<fb_memory_usage>
<total>4096 MiB</total>
<used>42 MiB</used>
<free>4054 MiB</free>
</fb_memory_usage>
<compute_mode>Default</compute_mode>
<utilization>
<gpu_util>0 %</gpu_util>
<memory_util>0 %</memory_util>
</utilization>
<encoder_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0</average_latency>
</encoder_stats>
<temperature>
<gpu_temp>39 C</gpu_temp>
</temperature>
<power_readings>
<power_draw>N/A</power_draw>
</power_readings>
<clocks>
<graphics_clock>135 MHz</graphics_clock>
<sm_clock>135 MHz</sm_clock>
<mem_clock>405 MHz</mem_clock>
<video_clock>405 MHz</video_clock>
</clocks>
</gpu>
</nvidia_smi_log>`)
func TestParseLineEmptyLine(t *testing.T) {
line := "\n"
_, _, err := parseLine(line)
if err == nil {
t.Fail()
func TestGatherSMI(t *testing.T) {
var expectedMetric = struct {
tags map[string]string
fields map[string]interface{}
}{
tags: map[string]string{
"name": "GeForce GTX 1070 Ti",
"compute_mode": "Default",
"index": "0",
"pstate": "P8",
"uuid": "GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665",
},
fields: map[string]interface{}{
"fan_speed": 100,
"memory_free": 4054,
"memory_used": 42,
"memory_total": 4096,
"temperature_gpu": 39,
"utilization_gpu": 0,
"utilization_memory": 0,
"pcie_link_gen_current": 1,
"pcie_link_width_current": 16,
"encoder_stats_session_count": 0,
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
"clocks_current_graphics": 135,
"clocks_current_sm": 135,
"clocks_current_memory": 405,
"clocks_current_video": 405,
},
}
}
// TestParseLineBad checks that parseLine reports an error for input that is
// not comma-separated metric output.
func TestParseLineBad(t *testing.T) {
	const input = "the quick brown fox jumped over the lazy dog"
	if _, _, err := parseLine(input); err == nil {
		t.Fail()
	}
}
acc := &testutil.Accumulator{}
// TestParseLineNotSupported checks that a "[Not Supported]" column is skipped:
// parsing succeeds and no fan_speed field is produced.
func TestParseLineNotSupported(t *testing.T) {
	const input = "[Not Supported], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"

	_, parsed, err := parseLine(input)
	require.NoError(t, err)
	require.Equal(t, nil, parsed["fan_speed"])
}
gatherNvidiaSMI(payload, acc)
fmt.Println()
func TestParseLineUnknownError(t *testing.T) {
line := "[Unknown Error], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"
_, fields, err := parseLine(line)
require.NoError(t, err)
require.Equal(t, nil, fields["fan_speed"])
require.Equal(t, 1, len(acc.Metrics))
require.Equal(t, expectedMetric.fields, acc.Metrics[0].Fields)
require.Equal(t, expectedMetric.tags, acc.Metrics[0].Tags)
}