From 2cf5116d14472600e315d65b49ac62fc79c20390 Mon Sep 17 00:00:00 2001 From: Greg <2653109+glinton@users.noreply.github.com> Date: Tue, 12 Nov 2019 17:12:15 -0700 Subject: [PATCH] Update nvidia-smi input to use xml (#6639) --- plugins/inputs/nvidia_smi/nvidia_smi.go | 255 +++++++++++-------- plugins/inputs/nvidia_smi/nvidia_smi_test.go | 122 ++++++--- 2 files changed, 240 insertions(+), 137 deletions(-) diff --git a/plugins/inputs/nvidia_smi/nvidia_smi.go b/plugins/inputs/nvidia_smi/nvidia_smi.go index e2ec19959..b21e390c6 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi.go @@ -1,7 +1,7 @@ package nvidia_smi import ( - "bufio" + "encoding/xml" "fmt" "os" "os/exec" @@ -14,41 +14,12 @@ import ( "github.com/influxdata/telegraf/plugins/inputs" ) -var ( - measurement = "nvidia_smi" - metrics = "fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index,power.draw,pcie.link.gen.current,pcie.link.width.current,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,clocks.current.graphics,clocks.current.sm,clocks.current.memory,clocks.current.video" - metricNames = [][]string{ - {"fan_speed", "integer"}, - {"memory_total", "integer"}, - {"memory_used", "integer"}, - {"memory_free", "integer"}, - {"pstate", "tag"}, - {"temperature_gpu", "integer"}, - {"name", "tag"}, - {"uuid", "tag"}, - {"compute_mode", "tag"}, - {"utilization_gpu", "integer"}, - {"utilization_memory", "integer"}, - {"index", "tag"}, - {"power_draw", "float"}, - {"pcie_link_gen_current", "integer"}, - {"pcie_link_width_current", "integer"}, - {"encoder_stats_session_count", "integer"}, - {"encoder_stats_average_fps", "integer"}, - {"encoder_stats_average_latency", "integer"}, - {"clocks_current_graphics", "integer"}, - {"clocks_current_sm", "integer"}, - {"clocks_current_memory", "integer"}, - {"clocks_current_video", "integer"}, - } -) +const measurement = "nvidia_smi" // NvidiaSMI holds the methods for this plugin type NvidiaSMI struct { BinPath string Timeout internal.Duration - - metrics string } // Description returns the description of the NvidiaSMI plugin @@ -69,7 +40,6 @@ func (smi *NvidiaSMI) SampleConfig() string { // Gather implements the telegraf interface func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error { - if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) { return fmt.Errorf("nvidia-smi binary not at path %s, cannot gather GPU data", smi.BinPath) } @@ -92,93 +62,178 @@ func init() { return &NvidiaSMI{ BinPath: "/usr/bin/nvidia-smi", Timeout: internal.Duration{Duration: 5 * time.Second}, - metrics: metrics, } }) } -func (smi *NvidiaSMI) pollSMI() (string, error) { +func (smi *NvidiaSMI) pollSMI() ([]byte, error) { // Construct and execute metrics query - opts := []string{"--format=noheader,nounits,csv", fmt.Sprintf("--query-gpu=%s", smi.metrics)} - ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, opts...), smi.Timeout.Duration) + ret, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), smi.Timeout.Duration) if err != nil { - return "", err + return nil, err } - return string(ret), nil + return ret, nil } -func gatherNvidiaSMI(ret string, acc telegraf.Accumulator) error { - // First split the lines up and handle each one - scanner := bufio.NewScanner(strings.NewReader(ret)) - for scanner.Scan() { - tags, fields, err := parseLine(scanner.Text()) - if err != nil { - return err - } - acc.AddFields(measurement, fields, tags) +func gatherNvidiaSMI(ret []byte, acc telegraf.Accumulator) error { + smi := &SMI{} + err := xml.Unmarshal(ret, smi) + if err != nil { + return err } - if err := scanner.Err(); err != nil { - return fmt.Errorf("Error scanning text %s", ret) + metrics := smi.genTagsFields() + + for _, metric := range metrics { + acc.AddFields(measurement, metric.fields, metric.tags) } return nil } -func parseLine(line string) (map[string]string, map[string]interface{}, error) { - tags := make(map[string]string, 0) - fields := make(map[string]interface{}, 0) +type metric struct { + tags map[string]string + fields map[string]interface{} +} - // Next split up the comma delimited metrics - met := strings.Split(line, ",") - - // Make sure there are as many metrics in the line as there were queried. - if len(met) == len(metricNames) { - for i, m := range metricNames { - col := strings.TrimSpace(met[i]) - - // Handle the tags - if m[1] == "tag" { - tags[m[0]] = col - continue - } - - // In some cases we may not be able to get data. - // One such case is when the memory is overclocked. - // nvidia-smi reads the max supported memory clock from the stock value. - // If the current memory clock is greater than the max detected memory clock then we receive [Unknown Error] as a value. - - // For example, the stock max memory clock speed on a 2080 Ti is 7000 MHz which nvidia-smi detects. - // The user has overclocked their memory using an offset of +1000 so under load the memory clock reaches 8000 MHz. - // Now when nvidia-smi tries to read the current memory clock it fails and spits back [Unknown Error] as the value. - // This value will break the parsing logic below unless it is accounted for here. - if strings.Contains(col, "[Not Supported]") || strings.Contains(col, "[Unknown Error]") { - continue - } - - // Parse the integers - if m[1] == "integer" { - out, err := strconv.ParseInt(col, 10, 64) - if err != nil { - return tags, fields, err - } - fields[m[0]] = out - } - - // Parse the floats - if m[1] == "float" { - out, err := strconv.ParseFloat(col, 64) - if err != nil { - return tags, fields, err - } - fields[m[0]] = out - } +func (s *SMI) genTagsFields() []metric { + metrics := []metric{} + for i, gpu := range s.GPU { + tags := map[string]string{ + "index": strconv.Itoa(i), } + fields := map[string]interface{}{} - // Return the tags and fields - return tags, fields, nil + setTagIfUsed(tags, "pstate", gpu.PState) + setTagIfUsed(tags, "name", gpu.ProdName) + setTagIfUsed(tags, "uuid", gpu.UUID) + setTagIfUsed(tags, "compute_mode", gpu.ComputeMode) + + setIfUsed("int", fields, "fan_speed", gpu.FanSpeed) + setIfUsed("int", fields, "memory_total", gpu.Memory.Total) + setIfUsed("int", fields, "memory_used", gpu.Memory.Used) + setIfUsed("int", fields, "memory_free", gpu.Memory.Free) + setIfUsed("int", fields, "temperature_gpu", gpu.Temp.GPUTemp) + setIfUsed("int", fields, "utilization_gpu", gpu.Utilization.GPU) + setIfUsed("int", fields, "utilization_memory", gpu.Utilization.Memory) + setIfUsed("int", fields, "pcie_link_gen_current", gpu.PCI.LinkInfo.PCIEGen.CurrentLinkGen) + setIfUsed("int", fields, "pcie_link_width_current", gpu.PCI.LinkInfo.LinkWidth.CurrentLinkWidth) + setIfUsed("int", fields, "encoder_stats_session_count", gpu.Encoder.SessionCount) + setIfUsed("int", fields, "encoder_stats_average_fps", gpu.Encoder.AverageFPS) + setIfUsed("int", fields, "encoder_stats_average_latency", gpu.Encoder.AverageLatency) + setIfUsed("int", fields, "clocks_current_graphics", gpu.Clocks.Graphics) + setIfUsed("int", fields, "clocks_current_sm", gpu.Clocks.SM) + setIfUsed("int", fields, "clocks_current_memory", gpu.Clocks.Memory) + setIfUsed("int", fields, "clocks_current_video", gpu.Clocks.Video) + + setIfUsed("float", fields, "power_draw", gpu.Power.PowerDraw) + metrics = append(metrics, metric{tags, fields}) + } + return metrics +} + +func setTagIfUsed(m map[string]string, k, v string) { + if v != "" { + m[k] = v + } +} + +func setIfUsed(t string, m map[string]interface{}, k, v string) { + vals := strings.Fields(v) + if len(vals) < 1 { + return } - // If the line is empty return an emptyline error - return tags, fields, fmt.Errorf("Different number of metrics returned (%d) than expeced (%d)", len(met), len(metricNames)) + val := vals[0] + if k == "pcie_link_width_current" { + val = strings.TrimSuffix(vals[0], "x") + } + + switch t { + case "float": + if val != "" { + f, err := strconv.ParseFloat(val, 64) + if err == nil { + m[k] = f + } + } + case "int": + if val != "" { + i, err := strconv.Atoi(val) + if err == nil { + m[k] = i + } + } + } +} + +// SMI defines the structure for the output of _nvidia-smi -q -x_. +type SMI struct { + GPU GPU `xml:"gpu"` +} + +// GPU defines the structure of the GPU portion of the smi output. +type GPU []struct { + FanSpeed string `xml:"fan_speed"` // int + Memory MemoryStats `xml:"fb_memory_usage"` + PState string `xml:"performance_state"` + Temp TempStats `xml:"temperature"` + ProdName string `xml:"product_name"` + UUID string `xml:"uuid"` + ComputeMode string `xml:"compute_mode"` + Utilization UtilizationStats `xml:"utilization"` + Power PowerReadings `xml:"power_readings"` + PCI PCI `xml:"pci"` + Encoder EncoderStats `xml:"encoder_stats"` + Clocks ClockStats `xml:"clocks"` +} + +// MemoryStats defines the structure of the memory portions in the smi output. +type MemoryStats struct { + Total string `xml:"total"` // int + Used string `xml:"used"` // int + Free string `xml:"free"` // int +} + +// TempStats defines the structure of the temperature portion of the smi output. +type TempStats struct { + GPUTemp string `xml:"gpu_temp"` // int +} + +// UtilizationStats defines the structure of the utilization portion of the smi output. +type UtilizationStats struct { + GPU string `xml:"gpu_util"` // int + Memory string `xml:"memory_util"` // int +} + +// PowerReadings defines the structure of the power_readings portion of the smi output. +type PowerReadings struct { + PowerDraw string `xml:"power_draw"` // float +} + +// PCI defines the structure of the pci portion of the smi output. +type PCI struct { + LinkInfo struct { + PCIEGen struct { + CurrentLinkGen string `xml:"current_link_gen"` // int + } `xml:"pcie_gen"` + LinkWidth struct { + CurrentLinkWidth string `xml:"current_link_width"` // int + } `xml:"link_widths"` + } `xml:"pci_gpu_link_info"` +} + +// EncoderStats defines the structure of the encoder_stats portion of the smi output. +type EncoderStats struct { + SessionCount string `xml:"session_count"` // int + AverageFPS string `xml:"average_fps"` // int + AverageLatency string `xml:"average_latency"` // int +} + +// ClockStats defines the structure of the clocks portion of the smi output. +type ClockStats struct { + Graphics string `xml:"graphics_clock"` // int + SM string `xml:"sm_clock"` // int + Memory string `xml:"mem_clock"` // int + Video string `xml:"video_clock"` // int } diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go index a16447d69..7d0ec4666 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi_test.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go @@ -1,51 +1,99 @@ package nvidia_smi import ( + "fmt" "testing" + "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/require" ) -func TestParseLineStandard(t *testing.T) { - line := "41, 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n" - tags, fields, err := parseLine(line) - if err != nil { - t.Fail() - } - if tags["name"] != "GeForce RTX 2080 Ti" { - t.Fail() - } - if temp, ok := fields["temperature_gpu"].(int); ok && temp != 32 { - t.Fail() - } -} +var payload = []byte(` + + + + GeForce GTX 1070 Ti + GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665 + + + + 1 + + + 16x + + + + 100 % + P8 + + 4096 MiB + 42 MiB + 4054 MiB + + Default + + 0 % + 0 % + + + 0 + 0 + 0 + + + 39 C + + + N/A + + + 135 MHz + 135 MHz + 405 MHz + 405 MHz + + +`) -func TestParseLineEmptyLine(t *testing.T) { - line := "\n" - _, _, err := parseLine(line) - if err == nil { - t.Fail() +func TestGatherSMI(t *testing.T) { + var expectedMetric = struct { + tags map[string]string + fields map[string]interface{} + }{ + tags: map[string]string{ + "name": "GeForce GTX 1070 Ti", + "compute_mode": "Default", + "index": "0", + "pstate": "P8", + "uuid": "GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665", + }, + fields: map[string]interface{}{ + "fan_speed": 100, + "memory_free": 4054, + "memory_used": 42, + "memory_total": 4096, + "temperature_gpu": 39, + "utilization_gpu": 0, + "utilization_memory": 0, + "pcie_link_gen_current": 1, + "pcie_link_width_current": 16, + "encoder_stats_session_count": 0, + "encoder_stats_average_fps": 0, + "encoder_stats_average_latency": 0, + "clocks_current_graphics": 135, + "clocks_current_sm": 135, + "clocks_current_memory": 405, + "clocks_current_video": 405, + }, } -} -func TestParseLineBad(t *testing.T) { - line := "the quick brown fox jumped over the lazy dog" - _, _, err := parseLine(line) - if err == nil { - t.Fail() - } -} + acc := &testutil.Accumulator{} -func TestParseLineNotSupported(t *testing.T) { - line := "[Not Supported], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n" - _, fields, err := parseLine(line) - require.NoError(t, err) - require.Equal(t, nil, fields["fan_speed"]) -} + gatherNvidiaSMI(payload, acc) + fmt.Println() -func TestParseLineUnknownError(t *testing.T) { - line := "[Unknown Error], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n" - _, fields, err := parseLine(line) - require.NoError(t, err) - require.Equal(t, nil, fields["fan_speed"]) + require.Equal(t, 1, len(acc.Metrics)) + require.Equal(t, expectedMetric.fields, acc.Metrics[0].Fields) + require.Equal(t, expectedMetric.tags, acc.Metrics[0].Tags) }