Update nvidia-smi input to use xml (#6639)
This commit is contained in:
parent
55b78a5f66
commit
2cf5116d14
|
@ -1,7 +1,7 @@
|
||||||
package nvidia_smi
|
package nvidia_smi
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"encoding/xml"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
@ -14,41 +14,12 @@ import (
|
||||||
"github.com/influxdata/telegraf/plugins/inputs"
|
"github.com/influxdata/telegraf/plugins/inputs"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
const measurement = "nvidia_smi"
|
||||||
measurement = "nvidia_smi"
|
|
||||||
metrics = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index,power.draw,pcie.link.gen.current,pcie.link.width.current,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,clocks.current.graphics,clocks.current.sm,clocks.current.memory,clocks.current.video"
|
|
||||||
metricNames = [][]string{
|
|
||||||
{"fan_speed", "integer"},
|
|
||||||
{"memory_total", "integer"},
|
|
||||||
{"memory_used", "integer"},
|
|
||||||
{"memory_free", "integer"},
|
|
||||||
{"pstate", "tag"},
|
|
||||||
{"temperature_gpu", "integer"},
|
|
||||||
{"name", "tag"},
|
|
||||||
{"uuid", "tag"},
|
|
||||||
{"compute_mode", "tag"},
|
|
||||||
{"utilization_gpu", "integer"},
|
|
||||||
{"utilization_memory", "integer"},
|
|
||||||
{"index", "tag"},
|
|
||||||
{"power_draw", "float"},
|
|
||||||
{"pcie_link_gen_current", "integer"},
|
|
||||||
{"pcie_link_width_current", "integer"},
|
|
||||||
{"encoder_stats_session_count", "integer"},
|
|
||||||
{"encoder_stats_average_fps", "integer"},
|
|
||||||
{"encoder_stats_average_latency", "integer"},
|
|
||||||
{"clocks_current_graphics", "integer"},
|
|
||||||
{"clocks_current_sm", "integer"},
|
|
||||||
{"clocks_current_memory", "integer"},
|
|
||||||
{"clocks_current_video", "integer"},
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
// NvidiaSMI holds the methods for this plugin
|
// NvidiaSMI holds the methods for this plugin
|
||||||
type NvidiaSMI struct {
|
type NvidiaSMI struct {
|
||||||
BinPath string
|
BinPath string
|
||||||
Timeout internal.Duration
|
Timeout internal.Duration
|
||||||
|
|
||||||
metrics string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Description returns the description of the NvidiaSMI plugin
|
// Description returns the description of the NvidiaSMI plugin
|
||||||
|
@ -69,7 +40,6 @@ func (smi *NvidiaSMI) SampleConfig() string {
|
||||||
|
|
||||||
// Gather implements the telegraf interface
|
// Gather implements the telegraf interface
|
||||||
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
||||||
|
|
||||||
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
||||||
return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
|
return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
|
||||||
}
|
}
|
||||||
|
@ -92,93 +62,178 @@ func init() {
|
||||||
return &NvidiaSMI{
|
return &NvidiaSMI{
|
||||||
BinPath: "/usr/bin/nvidia-smi",
|
BinPath: "/usr/bin/nvidia-smi",
|
||||||
Timeout: internal.Duration{Duration: 5 * time.Second},
|
Timeout: internal.Duration{Duration: 5 * time.Second},
|
||||||
metrics: metrics,
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (smi *NvidiaSMI) pollSMI() (string, error) {
|
func (smi *NvidiaSMI) pollSMI() ([]byte, error) {
|
||||||
// Construct and execute metrics query
|
// Construct and execute metrics query
|
||||||
opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)}
|
ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), smi.Timeout.Duration)
|
||||||
ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout.Duration)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return nil, err
|
||||||
}
|
}
|
||||||
return string(ret), nil
|
return ret, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error {
|
func gatherNvidiaSMI(ret []byte, acc telegraf.Accumulator) error {
|
||||||
// First split the lines up and handle each one
|
smi := &SMI{}
|
||||||
scanner := bufio.NewScanner(strings.NewReader(ret))
|
err := xml.Unmarshal(ret, smi)
|
||||||
for scanner.Scan() {
|
if err != nil {
|
||||||
tags, fields, err := parseLine(scanner.Text())
|
return err
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
acc.AddFields(measurement, fields, tags)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := scanner.Err(); err != nil {
|
metrics := smi.genTagsFields()
|
||||||
return fmt.Errorf("Error scanning text %s", ret)
|
|
||||||
|
for _, metric := range metrics {
|
||||||
|
acc.AddFields(measurement, metric.fields, metric.tags)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseLine(line string) (map[string]string, map[string]interface{}, error) {
|
type metric struct {
|
||||||
tags := make(map[string]string, 0)
|
tags map[string]string
|
||||||
fields := make(map[string]interface{}, 0)
|
fields map[string]interface{}
|
||||||
|
}
|
||||||
|
|
||||||
// Next split up the comma delimited metrics
|
func (s *SMI) genTagsFields() []metric {
|
||||||
met := strings.Split(line, ",")
|
metrics := []metric{}
|
||||||
|
for i, gpu := range s.GPU {
|
||||||
// Make sure there are as many metrics in the line as there were queried.
|
tags := map[string]string{
|
||||||
if len(met) == len(metricNames) {
|
"index": strconv.Itoa(i),
|
||||||
for i, m := range metricNames {
|
|
||||||
col := strings.TrimSpace(met[i])
|
|
||||||
|
|
||||||
// Handle the tags
|
|
||||||
if m[1] == "tag" {
|
|
||||||
tags[m[0]] = col
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// In some cases we may not be able to get data.
|
|
||||||
// One such case is when the memory is overclocked.
|
|
||||||
// nvidia-smi reads the max supported memory clock from the stock value.
|
|
||||||
// If the current memory clock is greater than the max detected memory clock then we receive [Unknown Error] as a value.
|
|
||||||
|
|
||||||
// For example, the stock max memory clock speed on a 2080 Ti is 7000 MHz which nvidia-smi detects.
|
|
||||||
// The user has overclocked their memory using an offset of +1000 so under load the memory clock reaches 8000 MHz.
|
|
||||||
// Now when nvidia-smi tries to read the current memory clock it fails and spits back [Unknown Error] as the value.
|
|
||||||
// This value will break the parsing logic below unless it is accounted for here.
|
|
||||||
if strings.Contains(col, "[Not Supported]") || strings.Contains(col, "[Unknown Error]") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse the integers
|
|
||||||
if m[1] == "integer" {
|
|
||||||
out, err := strconv.ParseInt(col, 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
return tags, fields, err
|
|
||||||
}
|
|
||||||
fields[m[0]] = out
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse the floats
|
|
||||||
if m[1] == "float" {
|
|
||||||
out, err := strconv.ParseFloat(col, 64)
|
|
||||||
if err != nil {
|
|
||||||
return tags, fields, err
|
|
||||||
}
|
|
||||||
fields[m[0]] = out
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
fields := map[string]interface{}{}
|
||||||
|
|
||||||
// Return the tags and fields
|
setTagIfUsed(tags, "pstate", gpu.PState)
|
||||||
return tags, fields, nil
|
setTagIfUsed(tags, "name", gpu.ProdName)
|
||||||
|
setTagIfUsed(tags, "uuid", gpu.UUID)
|
||||||
|
setTagIfUsed(tags, "compute_mode", gpu.ComputeMode)
|
||||||
|
|
||||||
|
setIfUsed("int", fields, "fan_speed", gpu.FanSpeed)
|
||||||
|
setIfUsed("int", fields, "memory_total", gpu.Memory.Total)
|
||||||
|
setIfUsed("int", fields, "memory_used", gpu.Memory.Used)
|
||||||
|
setIfUsed("int", fields, "memory_free", gpu.Memory.Free)
|
||||||
|
setIfUsed("int", fields, "temperature_gpu", gpu.Temp.GPUTemp)
|
||||||
|
setIfUsed("int", fields, "utilization_gpu", gpu.Utilization.GPU)
|
||||||
|
setIfUsed("int", fields, "utilization_memory", gpu.Utilization.Memory)
|
||||||
|
setIfUsed("int", fields, "pcie_link_gen_current", gpu.PCI.LinkInfo.PCIEGen.CurrentLinkGen)
|
||||||
|
setIfUsed("int", fields, "pcie_link_width_current", gpu.PCI.LinkInfo.LinkWidth.CurrentLinkWidth)
|
||||||
|
setIfUsed("int", fields, "encoder_stats_session_count", gpu.Encoder.SessionCount)
|
||||||
|
setIfUsed("int", fields, "encoder_stats_average_fps", gpu.Encoder.AverageFPS)
|
||||||
|
setIfUsed("int", fields, "encoder_stats_average_latency", gpu.Encoder.AverageLatency)
|
||||||
|
setIfUsed("int", fields, "clocks_current_graphics", gpu.Clocks.Graphics)
|
||||||
|
setIfUsed("int", fields, "clocks_current_sm", gpu.Clocks.SM)
|
||||||
|
setIfUsed("int", fields, "clocks_current_memory", gpu.Clocks.Memory)
|
||||||
|
setIfUsed("int", fields, "clocks_current_video", gpu.Clocks.Video)
|
||||||
|
|
||||||
|
setIfUsed("float", fields, "power_draw", gpu.Power.PowerDraw)
|
||||||
|
metrics = append(metrics, metric{tags, fields})
|
||||||
|
}
|
||||||
|
return metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
func setTagIfUsed(m map[string]string, k, v string) {
|
||||||
|
if v != "" {
|
||||||
|
m[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func setIfUsed(t string, m map[string]interface{}, k, v string) {
|
||||||
|
vals := strings.Fields(v)
|
||||||
|
if len(vals) < 1 {
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the line is empty return an emptyline error
|
val := vals[0]
|
||||||
return tags, fields, fmt.Errorf("Different number of metrics returned (%d) than expeced (%d)", len(met), len(metricNames))
|
if k == "pcie_link_width_current" {
|
||||||
|
val = strings.TrimSuffix(vals[0], "x")
|
||||||
|
}
|
||||||
|
|
||||||
|
switch t {
|
||||||
|
case "float":
|
||||||
|
if val != "" {
|
||||||
|
f, err := strconv.ParseFloat(val, 64)
|
||||||
|
if err == nil {
|
||||||
|
m[k] = f
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "int":
|
||||||
|
if val != "" {
|
||||||
|
i, err := strconv.Atoi(val)
|
||||||
|
if err == nil {
|
||||||
|
m[k] = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// SMI defines the structure for the output of _nvidia-smi -q -x_.
|
||||||
|
type SMI struct {
|
||||||
|
GPU GPU `xml:"gpu"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// GPU defines the structure of the GPU portion of the smi output.
|
||||||
|
type GPU []struct {
|
||||||
|
FanSpeed string `xml:"fan_speed"` // int
|
||||||
|
Memory MemoryStats `xml:"fb_memory_usage"`
|
||||||
|
PState string `xml:"performance_state"`
|
||||||
|
Temp TempStats `xml:"temperature"`
|
||||||
|
ProdName string `xml:"product_name"`
|
||||||
|
UUID string `xml:"uuid"`
|
||||||
|
ComputeMode string `xml:"compute_mode"`
|
||||||
|
Utilization UtilizationStats `xml:"utilization"`
|
||||||
|
Power PowerReadings `xml:"power_readings"`
|
||||||
|
PCI PCI `xml:"pci"`
|
||||||
|
Encoder EncoderStats `xml:"encoder_stats"`
|
||||||
|
Clocks ClockStats `xml:"clocks"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// MemoryStats defines the structure of the memory portions in the smi output.
|
||||||
|
type MemoryStats struct {
|
||||||
|
Total string `xml:"total"` // int
|
||||||
|
Used string `xml:"used"` // int
|
||||||
|
Free string `xml:"free"` // int
|
||||||
|
}
|
||||||
|
|
||||||
|
// TempStats defines the structure of the temperature portion of the smi output.
|
||||||
|
type TempStats struct {
|
||||||
|
GPUTemp string `xml:"gpu_temp"` // int
|
||||||
|
}
|
||||||
|
|
||||||
|
// UtilizationStats defines the structure of the utilization portion of the smi output.
|
||||||
|
type UtilizationStats struct {
|
||||||
|
GPU string `xml:"gpu_util"` // int
|
||||||
|
Memory string `xml:"memory_util"` // int
|
||||||
|
}
|
||||||
|
|
||||||
|
// PowerReadings defines the structure of the power_readings portion of the smi output.
|
||||||
|
type PowerReadings struct {
|
||||||
|
PowerDraw string `xml:"power_draw"` // float
|
||||||
|
}
|
||||||
|
|
||||||
|
// PCI defines the structure of the pci portion of the smi output.
|
||||||
|
type PCI struct {
|
||||||
|
LinkInfo struct {
|
||||||
|
PCIEGen struct {
|
||||||
|
CurrentLinkGen string `xml:"current_link_gen"` // int
|
||||||
|
} `xml:"pcie_gen"`
|
||||||
|
LinkWidth struct {
|
||||||
|
CurrentLinkWidth string `xml:"current_link_width"` // int
|
||||||
|
} `xml:"link_widths"`
|
||||||
|
} `xml:"pci_gpu_link_info"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// EncoderStats defines the structure of the encoder_stats portion of the smi output.
|
||||||
|
type EncoderStats struct {
|
||||||
|
SessionCount string `xml:"session_count"` // int
|
||||||
|
AverageFPS string `xml:"average_fps"` // int
|
||||||
|
AverageLatency string `xml:"average_latency"` // int
|
||||||
|
}
|
||||||
|
|
||||||
|
// ClockStats defines the structure of the clocks portion of the smi output.
|
||||||
|
type ClockStats struct {
|
||||||
|
Graphics string `xml:"graphics_clock"` // int
|
||||||
|
SM string `xml:"sm_clock"` // int
|
||||||
|
Memory string `xml:"mem_clock"` // int
|
||||||
|
Video string `xml:"video_clock"` // int
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,51 +1,99 @@
|
||||||
package nvidia_smi
|
package nvidia_smi
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/influxdata/telegraf/testutil"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestParseLineStandard(t *testing.T) {
|
var payload = []byte(`<?xml version="1.0" ?>
|
||||||
line := "41, 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"
|
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v10.dtd">
|
||||||
tags, fields, err := parseLine(line)
|
<nvidia_smi_log>
|
||||||
if err != nil {
|
<gpu id="00000000:01:00.0">
|
||||||
t.Fail()
|
<product_name>GeForce GTX 1070 Ti</product_name>
|
||||||
}
|
<uuid>GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665</uuid>
|
||||||
if tags["name"] != "GeForce RTX 2080 Ti" {
|
<pci>
|
||||||
t.Fail()
|
<pci_gpu_link_info>
|
||||||
}
|
<pcie_gen>
|
||||||
if temp, ok := fields["temperature_gpu"].(int); ok && temp != 32 {
|
<current_link_gen>1</current_link_gen>
|
||||||
t.Fail()
|
</pcie_gen>
|
||||||
}
|
<link_widths>
|
||||||
}
|
<current_link_width>16x</current_link_width>
|
||||||
|
</link_widths>
|
||||||
|
</pci_gpu_link_info>
|
||||||
|
</pci>
|
||||||
|
<fan_speed>100 %</fan_speed>
|
||||||
|
<performance_state>P8</performance_state>
|
||||||
|
<fb_memory_usage>
|
||||||
|
<total>4096 MiB</total>
|
||||||
|
<used>42 MiB</used>
|
||||||
|
<free>4054 MiB</free>
|
||||||
|
</fb_memory_usage>
|
||||||
|
<compute_mode>Default</compute_mode>
|
||||||
|
<utilization>
|
||||||
|
<gpu_util>0 %</gpu_util>
|
||||||
|
<memory_util>0 %</memory_util>
|
||||||
|
</utilization>
|
||||||
|
<encoder_stats>
|
||||||
|
<session_count>0</session_count>
|
||||||
|
<average_fps>0</average_fps>
|
||||||
|
<average_latency>0</average_latency>
|
||||||
|
</encoder_stats>
|
||||||
|
<temperature>
|
||||||
|
<gpu_temp>39 C</gpu_temp>
|
||||||
|
</temperature>
|
||||||
|
<power_readings>
|
||||||
|
<power_draw>N/A</power_draw>
|
||||||
|
</power_readings>
|
||||||
|
<clocks>
|
||||||
|
<graphics_clock>135 MHz</graphics_clock>
|
||||||
|
<sm_clock>135 MHz</sm_clock>
|
||||||
|
<mem_clock>405 MHz</mem_clock>
|
||||||
|
<video_clock>405 MHz</video_clock>
|
||||||
|
</clocks>
|
||||||
|
</gpu>
|
||||||
|
</nvidia_smi_log>`)
|
||||||
|
|
||||||
func TestParseLineEmptyLine(t *testing.T) {
|
func TestGatherSMI(t *testing.T) {
|
||||||
line := "\n"
|
var expectedMetric = struct {
|
||||||
_, _, err := parseLine(line)
|
tags map[string]string
|
||||||
if err == nil {
|
fields map[string]interface{}
|
||||||
t.Fail()
|
}{
|
||||||
|
tags: map[string]string{
|
||||||
|
"name": "GeForce GTX 1070 Ti",
|
||||||
|
"compute_mode": "Default",
|
||||||
|
"index": "0",
|
||||||
|
"pstate": "P8",
|
||||||
|
"uuid": "GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665",
|
||||||
|
},
|
||||||
|
fields: map[string]interface{}{
|
||||||
|
"fan_speed": 100,
|
||||||
|
"memory_free": 4054,
|
||||||
|
"memory_used": 42,
|
||||||
|
"memory_total": 4096,
|
||||||
|
"temperature_gpu": 39,
|
||||||
|
"utilization_gpu": 0,
|
||||||
|
"utilization_memory": 0,
|
||||||
|
"pcie_link_gen_current": 1,
|
||||||
|
"pcie_link_width_current": 16,
|
||||||
|
"encoder_stats_session_count": 0,
|
||||||
|
"encoder_stats_average_fps": 0,
|
||||||
|
"encoder_stats_average_latency": 0,
|
||||||
|
"clocks_current_graphics": 135,
|
||||||
|
"clocks_current_sm": 135,
|
||||||
|
"clocks_current_memory": 405,
|
||||||
|
"clocks_current_video": 405,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
func TestParseLineBad(t *testing.T) {
|
acc := &testutil.Accumulator{}
|
||||||
line := "the quick brown fox jumped over the lazy dog"
|
|
||||||
_, _, err := parseLine(line)
|
|
||||||
if err == nil {
|
|
||||||
t.Fail()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestParseLineNotSupported(t *testing.T) {
|
gatherNvidiaSMI(payload, acc)
|
||||||
line := "[Not Supported], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"
|
fmt.Println()
|
||||||
_, fields, err := parseLine(line)
|
|
||||||
require.NoError(t, err)
|
|
||||||
require.Equal(t, nil, fields["fan_speed"])
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestParseLineUnknownError(t *testing.T) {
|
require.Equal(t, 1, len(acc.Metrics))
|
||||||
line := "[Unknown Error], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"
|
require.Equal(t, expectedMetric.fields, acc.Metrics[0].Fields)
|
||||||
_, fields, err := parseLine(line)
|
require.Equal(t, expectedMetric.tags, acc.Metrics[0].Tags)
|
||||||
require.NoError(t, err)
|
|
||||||
require.Equal(t, nil, fields["fan_speed"])
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue