Update nvidia-smi input to use xml (#6639)
This commit is contained in:
parent
55b78a5f66
commit
2cf5116d14
|
@ -1,7 +1,7 @@
|
|||
package nvidia_smi
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
|
@ -14,41 +14,12 @@ import (
|
|||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
||||
var (
|
||||
measurement = "nvidia_smi"
|
||||
metrics = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index,power.draw,pcie.link.gen.current,pcie.link.width.current,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,clocks.current.graphics,clocks.current.sm,clocks.current.memory,clocks.current.video"
|
||||
metricNames = [][]string{
|
||||
{"fan_speed", "integer"},
|
||||
{"memory_total", "integer"},
|
||||
{"memory_used", "integer"},
|
||||
{"memory_free", "integer"},
|
||||
{"pstate", "tag"},
|
||||
{"temperature_gpu", "integer"},
|
||||
{"name", "tag"},
|
||||
{"uuid", "tag"},
|
||||
{"compute_mode", "tag"},
|
||||
{"utilization_gpu", "integer"},
|
||||
{"utilization_memory", "integer"},
|
||||
{"index", "tag"},
|
||||
{"power_draw", "float"},
|
||||
{"pcie_link_gen_current", "integer"},
|
||||
{"pcie_link_width_current", "integer"},
|
||||
{"encoder_stats_session_count", "integer"},
|
||||
{"encoder_stats_average_fps", "integer"},
|
||||
{"encoder_stats_average_latency", "integer"},
|
||||
{"clocks_current_graphics", "integer"},
|
||||
{"clocks_current_sm", "integer"},
|
||||
{"clocks_current_memory", "integer"},
|
||||
{"clocks_current_video", "integer"},
|
||||
}
|
||||
)
|
||||
const measurement = "nvidia_smi"
|
||||
|
||||
// NvidiaSMI holds the methods for this plugin
|
||||
type NvidiaSMI struct {
|
||||
BinPath string
|
||||
Timeout internal.Duration
|
||||
|
||||
metrics string
|
||||
}
|
||||
|
||||
// Description returns the description of the NvidiaSMI plugin
|
||||
|
@ -69,7 +40,6 @@ func (smi *NvidiaSMI) SampleConfig() string {
|
|||
|
||||
// Gather implements the telegraf interface
|
||||
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
||||
|
||||
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
||||
return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath)
|
||||
}
|
||||
|
@ -92,93 +62,178 @@ func init() {
|
|||
return &NvidiaSMI{
|
||||
BinPath: "/usr/bin/nvidia-smi",
|
||||
Timeout: internal.Duration{Duration: 5 * time.Second},
|
||||
metrics: metrics,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (smi *NvidiaSMI) pollSMI() (string, error) {
|
||||
func (smi *NvidiaSMI) pollSMI() ([]byte, error) {
|
||||
// Construct and execute metrics query
|
||||
opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)}
|
||||
ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout.Duration)
|
||||
ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), smi.Timeout.Duration)
|
||||
if err != nil {
|
||||
return "", err
|
||||
return nil, err
|
||||
}
|
||||
return string(ret), nil
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error {
|
||||
// First split the lines up and handle each one
|
||||
scanner := bufio.NewScanner(strings.NewReader(ret))
|
||||
for scanner.Scan() {
|
||||
tags, fields, err := parseLine(scanner.Text())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
acc.AddFields(measurement, fields, tags)
|
||||
func gatherNvidiaSMI(ret []byte, acc telegraf.Accumulator) error {
|
||||
smi := &SMI{}
|
||||
err := xml.Unmarshal(ret, smi)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return fmt.Errorf("Error scanning text %s", ret)
|
||||
metrics := smi.genTagsFields()
|
||||
|
||||
for _, metric := range metrics {
|
||||
acc.AddFields(measurement, metric.fields, metric.tags)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseLine(line string) (map[string]string, map[string]interface{}, error) {
|
||||
tags := make(map[string]string, 0)
|
||||
fields := make(map[string]interface{}, 0)
|
||||
// metric pairs the tags and the fields that are reported together for a
// single GPU.
type metric struct {
	tags   map[string]string
	fields map[string]interface{}
}
|
||||
|
||||
// Next split up the comma delimited metrics
|
||||
met := strings.Split(line, ",")
|
||||
|
||||
// Make sure there are as many metrics in the line as there were queried.
|
||||
if len(met) == len(metricNames) {
|
||||
for i, m := range metricNames {
|
||||
col := strings.TrimSpace(met[i])
|
||||
|
||||
// Handle the tags
|
||||
if m[1] == "tag" {
|
||||
tags[m[0]] = col
|
||||
continue
|
||||
}
|
||||
|
||||
// In some cases we may not be able to get data.
|
||||
// One such case is when the memory is overclocked.
|
||||
// nvidia-smi reads the max supported memory clock from the stock value.
|
||||
// If the current memory clock is greater than the max detected memory clock then we receive [Unknown Error] as a value.
|
||||
|
||||
// For example, the stock max memory clock speed on a 2080 Ti is 7000 MHz which nvidia-smi detects.
|
||||
// The user has overclocked their memory using an offset of +1000 so under load the memory clock reaches 8000 MHz.
|
||||
// Now when nvidia-smi tries to read the current memory clock it fails and spits back [Unknown Error] as the value.
|
||||
// This value will break the parsing logic below unless it is accounted for here.
|
||||
if strings.Contains(col, "[Not Supported]") || strings.Contains(col, "[Unknown Error]") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse the integers
|
||||
if m[1] == "integer" {
|
||||
out, err := strconv.ParseInt(col, 10, 64)
|
||||
if err != nil {
|
||||
return tags, fields, err
|
||||
}
|
||||
fields[m[0]] = out
|
||||
}
|
||||
|
||||
// Parse the floats
|
||||
if m[1] == "float" {
|
||||
out, err := strconv.ParseFloat(col, 64)
|
||||
if err != nil {
|
||||
return tags, fields, err
|
||||
}
|
||||
fields[m[0]] = out
|
||||
}
|
||||
func (s *SMI) genTagsFields() []metric {
|
||||
metrics := []metric{}
|
||||
for i, gpu := range s.GPU {
|
||||
tags := map[string]string{
|
||||
"index": strconv.Itoa(i),
|
||||
}
|
||||
fields := map[string]interface{}{}
|
||||
|
||||
// Return the tags and fields
|
||||
return tags, fields, nil
|
||||
setTagIfUsed(tags, "pstate", gpu.PState)
|
||||
setTagIfUsed(tags, "name", gpu.ProdName)
|
||||
setTagIfUsed(tags, "uuid", gpu.UUID)
|
||||
setTagIfUsed(tags, "compute_mode", gpu.ComputeMode)
|
||||
|
||||
setIfUsed("int", fields, "fan_speed", gpu.FanSpeed)
|
||||
setIfUsed("int", fields, "memory_total", gpu.Memory.Total)
|
||||
setIfUsed("int", fields, "memory_used", gpu.Memory.Used)
|
||||
setIfUsed("int", fields, "memory_free", gpu.Memory.Free)
|
||||
setIfUsed("int", fields, "temperature_gpu", gpu.Temp.GPUTemp)
|
||||
setIfUsed("int", fields, "utilization_gpu", gpu.Utilization.GPU)
|
||||
setIfUsed("int", fields, "utilization_memory", gpu.Utilization.Memory)
|
||||
setIfUsed("int", fields, "pcie_link_gen_current", gpu.PCI.LinkInfo.PCIEGen.CurrentLinkGen)
|
||||
setIfUsed("int", fields, "pcie_link_width_current", gpu.PCI.LinkInfo.LinkWidth.CurrentLinkWidth)
|
||||
setIfUsed("int", fields, "encoder_stats_session_count", gpu.Encoder.SessionCount)
|
||||
setIfUsed("int", fields, "encoder_stats_average_fps", gpu.Encoder.AverageFPS)
|
||||
setIfUsed("int", fields, "encoder_stats_average_latency", gpu.Encoder.AverageLatency)
|
||||
setIfUsed("int", fields, "clocks_current_graphics", gpu.Clocks.Graphics)
|
||||
setIfUsed("int", fields, "clocks_current_sm", gpu.Clocks.SM)
|
||||
setIfUsed("int", fields, "clocks_current_memory", gpu.Clocks.Memory)
|
||||
setIfUsed("int", fields, "clocks_current_video", gpu.Clocks.Video)
|
||||
|
||||
setIfUsed("float", fields, "power_draw", gpu.Power.PowerDraw)
|
||||
metrics = append(metrics, metric{tags, fields})
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
// setTagIfUsed stores v under key k in m, unless v is the empty string, in
// which case m is left untouched (an existing entry for k is not removed).
func setTagIfUsed(m map[string]string, k, v string) {
	if v == "" {
		return
	}
	m[k] = v
}
|
||||
|
||||
// setIfUsed parses the leading whitespace-separated token of v and, when the
// parse succeeds, stores the result under key k in m.
//
// t selects the target type: "float" stores a float64 (strconv.ParseFloat),
// "int" stores an int (strconv.Atoi). Any other t stores nothing.
//
// Only the first token is looked at, so a trailing unit such as in "100 %" or
// "135 MHz" is ignored. For the key "pcie_link_width_current" a trailing "x"
// is stripped from the token first ("16x" becomes 16).
//
// Nothing is stored, and no error is reported, when v is empty or all
// whitespace, or when the token does not parse (for example "N/A").
func setIfUsed(t string, m map[string]interface{}, k, v string) {
	vals := strings.Fields(v)
	if len(vals) < 1 {
		return
	}

	val := vals[0]
	if k == "pcie_link_width_current" {
		val = strings.TrimSuffix(val, "x")
	}

	// strings.Fields never yields an empty token, and an empty string left
	// over after TrimSuffix fails to parse, so no separate emptiness check is
	// needed before parsing.
	switch t {
	case "float":
		if f, err := strconv.ParseFloat(val, 64); err == nil {
			m[k] = f
		}
	case "int":
		if i, err := strconv.Atoi(val); err == nil {
			m[k] = i
		}
	}
}
|
||||
|
||||
// SMI defines the structure for the output of _nvidia-smi -q -x_.
|
||||
type SMI struct {
|
||||
GPU GPU `xml:"gpu"`
|
||||
}
|
||||
|
||||
// GPU defines the structure of the GPU portion of the smi output.
|
||||
type GPU []struct {
|
||||
FanSpeed string `xml:"fan_speed"` // int
|
||||
Memory MemoryStats `xml:"fb_memory_usage"`
|
||||
PState string `xml:"performance_state"`
|
||||
Temp TempStats `xml:"temperature"`
|
||||
ProdName string `xml:"product_name"`
|
||||
UUID string `xml:"uuid"`
|
||||
ComputeMode string `xml:"compute_mode"`
|
||||
Utilization UtilizationStats `xml:"utilization"`
|
||||
Power PowerReadings `xml:"power_readings"`
|
||||
PCI PCI `xml:"pci"`
|
||||
Encoder EncoderStats `xml:"encoder_stats"`
|
||||
Clocks ClockStats `xml:"clocks"`
|
||||
}
|
||||
|
||||
// MemoryStats defines the structure of the memory portions in the smi output.
//
// Values are kept as raw strings, unit included (for example "4096 MiB").
type MemoryStats struct {
	Total string `xml:"total"` // int
	Used  string `xml:"used"`  // int
	Free  string `xml:"free"`  // int
}
|
||||
|
||||
// TempStats defines the structure of the temperature portion of the smi output.
//
// The value is kept as a raw string, unit included (for example "39 C").
type TempStats struct {
	GPUTemp string `xml:"gpu_temp"` // int
}
|
||||
|
||||
// UtilizationStats defines the structure of the utilization portion of the smi output.
//
// Values are kept as raw strings, unit included (for example "0 %").
type UtilizationStats struct {
	GPU    string `xml:"gpu_util"`    // int
	Memory string `xml:"memory_util"` // int
}
|
||||
|
||||
// PowerReadings defines the structure of the power_readings portion of the smi output.
//
// The value is kept as a raw string and may be a non-numeric placeholder
// such as "N/A".
type PowerReadings struct {
	PowerDraw string `xml:"power_draw"` // float
}
|
||||
|
||||
// PCI defines the structure of the pci portion of the smi output.
//
// Only the current link generation and current link width, nested under
// <pci_gpu_link_info>, are mapped. The width is kept as a raw string that may
// carry an "x" suffix (for example "16x").
type PCI struct {
	LinkInfo struct {
		PCIEGen struct {
			CurrentLinkGen string `xml:"current_link_gen"` // int
		} `xml:"pcie_gen"`
		LinkWidth struct {
			CurrentLinkWidth string `xml:"current_link_width"` // int
		} `xml:"link_widths"`
	} `xml:"pci_gpu_link_info"`
}
|
||||
|
||||
// EncoderStats defines the structure of the encoder_stats portion of the smi output.
//
// Values are kept as the raw strings found in the XML.
type EncoderStats struct {
	SessionCount   string `xml:"session_count"`   // int
	AverageFPS     string `xml:"average_fps"`     // int
	AverageLatency string `xml:"average_latency"` // int
}
|
||||
|
||||
// ClockStats defines the structure of the clocks portion of the smi output.
//
// Values are kept as raw strings, unit included (for example "135 MHz").
type ClockStats struct {
	Graphics string `xml:"graphics_clock"` // int
	SM       string `xml:"sm_clock"`       // int
	Memory   string `xml:"mem_clock"`      // int
	Video    string `xml:"video_clock"`    // int
}
|
||||
|
|
|
@ -1,51 +1,99 @@
|
|||
package nvidia_smi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/influxdata/telegraf/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestParseLineStandard(t *testing.T) {
|
||||
line := "41, 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"
|
||||
tags, fields, err := parseLine(line)
|
||||
if err != nil {
|
||||
t.Fail()
|
||||
}
|
||||
if tags["name"] != "GeForce RTX 2080 Ti" {
|
||||
t.Fail()
|
||||
}
|
||||
if temp, ok := fields["temperature_gpu"].(int); ok && temp != 32 {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
var payload = []byte(`<?xml version="1.0" ?>
|
||||
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v10.dtd">
|
||||
<nvidia_smi_log>
|
||||
<gpu id="00000000:01:00.0">
|
||||
<product_name>GeForce GTX 1070 Ti</product_name>
|
||||
<uuid>GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665</uuid>
|
||||
<pci>
|
||||
<pci_gpu_link_info>
|
||||
<pcie_gen>
|
||||
<current_link_gen>1</current_link_gen>
|
||||
</pcie_gen>
|
||||
<link_widths>
|
||||
<current_link_width>16x</current_link_width>
|
||||
</link_widths>
|
||||
</pci_gpu_link_info>
|
||||
</pci>
|
||||
<fan_speed>100 %</fan_speed>
|
||||
<performance_state>P8</performance_state>
|
||||
<fb_memory_usage>
|
||||
<total>4096 MiB</total>
|
||||
<used>42 MiB</used>
|
||||
<free>4054 MiB</free>
|
||||
</fb_memory_usage>
|
||||
<compute_mode>Default</compute_mode>
|
||||
<utilization>
|
||||
<gpu_util>0 %</gpu_util>
|
||||
<memory_util>0 %</memory_util>
|
||||
</utilization>
|
||||
<encoder_stats>
|
||||
<session_count>0</session_count>
|
||||
<average_fps>0</average_fps>
|
||||
<average_latency>0</average_latency>
|
||||
</encoder_stats>
|
||||
<temperature>
|
||||
<gpu_temp>39 C</gpu_temp>
|
||||
</temperature>
|
||||
<power_readings>
|
||||
<power_draw>N/A</power_draw>
|
||||
</power_readings>
|
||||
<clocks>
|
||||
<graphics_clock>135 MHz</graphics_clock>
|
||||
<sm_clock>135 MHz</sm_clock>
|
||||
<mem_clock>405 MHz</mem_clock>
|
||||
<video_clock>405 MHz</video_clock>
|
||||
</clocks>
|
||||
</gpu>
|
||||
</nvidia_smi_log>`)
|
||||
|
||||
func TestParseLineEmptyLine(t *testing.T) {
|
||||
line := "\n"
|
||||
_, _, err := parseLine(line)
|
||||
if err == nil {
|
||||
t.Fail()
|
||||
func TestGatherSMI(t *testing.T) {
|
||||
var expectedMetric = struct {
|
||||
tags map[string]string
|
||||
fields map[string]interface{}
|
||||
}{
|
||||
tags: map[string]string{
|
||||
"name": "GeForce GTX 1070 Ti",
|
||||
"compute_mode": "Default",
|
||||
"index": "0",
|
||||
"pstate": "P8",
|
||||
"uuid": "GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665",
|
||||
},
|
||||
fields: map[string]interface{}{
|
||||
"fan_speed": 100,
|
||||
"memory_free": 4054,
|
||||
"memory_used": 42,
|
||||
"memory_total": 4096,
|
||||
"temperature_gpu": 39,
|
||||
"utilization_gpu": 0,
|
||||
"utilization_memory": 0,
|
||||
"pcie_link_gen_current": 1,
|
||||
"pcie_link_width_current": 16,
|
||||
"encoder_stats_session_count": 0,
|
||||
"encoder_stats_average_fps": 0,
|
||||
"encoder_stats_average_latency": 0,
|
||||
"clocks_current_graphics": 135,
|
||||
"clocks_current_sm": 135,
|
||||
"clocks_current_memory": 405,
|
||||
"clocks_current_video": 405,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLineBad(t *testing.T) {
|
||||
line := "the quick brown fox jumped over the lazy dog"
|
||||
_, _, err := parseLine(line)
|
||||
if err == nil {
|
||||
t.Fail()
|
||||
}
|
||||
}
|
||||
acc := &testutil.Accumulator{}
|
||||
|
||||
func TestParseLineNotSupported(t *testing.T) {
|
||||
line := "[Not Supported], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"
|
||||
_, fields, err := parseLine(line)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, nil, fields["fan_speed"])
|
||||
}
|
||||
gatherNvidiaSMI(payload, acc)
|
||||
fmt.Println()
|
||||
|
||||
func TestParseLineUnknownError(t *testing.T) {
|
||||
line := "[Unknown Error], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"
|
||||
_, fields, err := parseLine(line)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, nil, fields["fan_speed"])
|
||||
require.Equal(t, 1, len(acc.Metrics))
|
||||
require.Equal(t, expectedMetric.fields, acc.Metrics[0].Fields)
|
||||
require.Equal(t, expectedMetric.tags, acc.Metrics[0].Tags)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue