Add additional nvidia-smi examples as testcases

This commit is contained in:
Daniel Nelson 2019-11-12 16:13:30 -08:00
parent 2cf5116d14
commit b71a387ca2
No known key found for this signature in database
GPG Key ID: CAAD59C9444F6155
2 changed files with 131 additions and 84 deletions

View File

@ -55,11 +55,20 @@ SELECT mean("temperature_gpu") FROM "nvidia_smi" WHERE time > now() - 5m GROUP B
### Troubleshooting
As the `telegraf` user run the following command. Adjust the path to `nvidia-smi` if customized.
Check the full output by running the `nvidia-smi` binary manually.
Linux:
```
/usr/bin/nvidia-smi --format=noheader,nounits,csv --query-gpu=fan.speed,memory.total,memory.used,memory.free,pstate,temperature.gpu,name,uuid,compute_mode,utilization.gpu,utilization.memory,index,power.draw,pcie.link.gen.current,pcie.link.width.current,encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,clocks.current.graphics,clocks.current.sm,clocks.current.memory,clocks.current.video
sudo -u telegraf -- /usr/bin/nvidia-smi -q -x
```
Windows:
```
"C:\Program Files\NVIDIA Corporation\NVSMI\nvidia-smi.exe" -q -x
```
Please include the output of this command if opening a GitHub issue.
### Example Output
```
nvidia_smi,compute_mode=Default,host=8218cf,index=0,name=GeForce\ GTX\ 1070,pstate=P2,uuid=GPU-823bc202-6279-6f2c-d729-868a30f14d96 fan_speed=100i,memory_free=7563i,memory_total=8112i,memory_used=549i,temperature_gpu=53i,utilization_gpu=100i,utilization_memory=90i 1523991122000000000

View File

@ -1,99 +1,137 @@
package nvidia_smi
import (
"fmt"
"io/ioutil"
"path/filepath"
"testing"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)
// payload is an inline XML fixture in the nvidia_smi_log format
// (nvsmi_device_v10.dtd) describing a single GPU, a GeForce GTX 1070 Ti.
// Note that power_draw is "N/A" in this sample, so no numeric power reading
// is available from it.
var payload = []byte(`<?xml version="1.0" ?>
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v10.dtd">
<nvidia_smi_log>
<gpu id="00000000:01:00.0">
<product_name>GeForce GTX 1070 Ti</product_name>
<uuid>GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665</uuid>
<pci>
<pci_gpu_link_info>
<pcie_gen>
<current_link_gen>1</current_link_gen>
</pcie_gen>
<link_widths>
<current_link_width>16x</current_link_width>
</link_widths>
</pci_gpu_link_info>
</pci>
<fan_speed>100 %</fan_speed>
<performance_state>P8</performance_state>
<fb_memory_usage>
<total>4096 MiB</total>
<used>42 MiB</used>
<free>4054 MiB</free>
</fb_memory_usage>
<compute_mode>Default</compute_mode>
<utilization>
<gpu_util>0 %</gpu_util>
<memory_util>0 %</memory_util>
</utilization>
<encoder_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0</average_latency>
</encoder_stats>
<temperature>
<gpu_temp>39 C</gpu_temp>
</temperature>
<power_readings>
<power_draw>N/A</power_draw>
</power_readings>
<clocks>
<graphics_clock>135 MHz</graphics_clock>
<sm_clock>135 MHz</sm_clock>
<mem_clock>405 MHz</mem_clock>
<video_clock>405 MHz</video_clock>
</clocks>
</gpu>
</nvidia_smi_log>`)
// TestGatherSMI passes the inline payload to gatherNvidiaSMI and requires
// exactly one accumulated metric whose fields and tags equal expectedMetric.
func TestGatherSMI(t *testing.T) {
var expectedMetric = struct {
tags map[string]string
fields map[string]interface{}
// TestGatherValidXML is a table-driven test. Each case names an XML fixture
// under the testdata directory, feeds its contents to gatherNvidiaSMI, and
// requires the accumulated metrics to equal the expected ones, ignoring
// timestamps.
func TestGatherValidXML(t *testing.T) {
tests := []struct {
name string
filename string
expected []telegraf.Metric
}{
tags: map[string]string{
"name": "GeForce GTX 1070 Ti",
"compute_mode": "Default",
"index": "0",
"pstate": "P8",
"uuid": "GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665",
{
name: "GeForce GTX 1070 Ti",
filename: "gtx-1070-ti.xml",
expected: []telegraf.Metric{
testutil.MustMetric(
"nvidia_smi",
map[string]string{
"name": "GeForce GTX 1070 Ti",
"compute_mode": "Default",
"index": "0",
"pstate": "P8",
"uuid": "GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665",
},
map[string]interface{}{
"clocks_current_graphics": 135,
"clocks_current_memory": 405,
"clocks_current_sm": 135,
"clocks_current_video": 405,
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
"encoder_stats_session_count": 0,
"fan_speed": 100,
"memory_free": 4054,
"memory_total": 4096,
"memory_used": 42,
"pcie_link_gen_current": 1,
"pcie_link_width_current": 16,
"temperature_gpu": 39,
"utilization_gpu": 0,
"utilization_memory": 0,
},
time.Unix(0, 0)),
},
},
fields: map[string]interface{}{
"fan_speed": 100,
"memory_free": 4054,
"memory_used": 42,
"memory_total": 4096,
"temperature_gpu": 39,
"utilization_gpu": 0,
"utilization_memory": 0,
"pcie_link_gen_current": 1,
"pcie_link_width_current": 16,
"encoder_stats_session_count": 0,
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
"clocks_current_graphics": 135,
"clocks_current_sm": 135,
"clocks_current_memory": 405,
"clocks_current_video": 405,
{
name: "GeForce GTX 1660 Ti",
filename: "gtx-1660-ti.xml",
expected: []telegraf.Metric{
testutil.MustMetric(
"nvidia_smi",
map[string]string{
"compute_mode": "Default",
"index": "0",
"name": "Graphics Device",
"pstate": "P8",
"uuid": "GPU-304a277d-3545-63b8-3a36-dfde3c992989",
},
map[string]interface{}{
"clocks_current_graphics": 300,
"clocks_current_memory": 405,
"clocks_current_sm": 300,
"clocks_current_video": 540,
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
"encoder_stats_session_count": 0,
"fan_speed": 0,
"memory_free": 5912,
"memory_total": 5912,
"memory_used": 0,
"pcie_link_gen_current": 1,
"pcie_link_width_current": 16,
"power_draw": 8.93,
"temperature_gpu": 40,
"utilization_gpu": 0,
"utilization_memory": 1,
},
time.Unix(0, 0)),
},
},
{
name: "Quadro P400",
filename: "quadro-p400.xml",
expected: []telegraf.Metric{
testutil.MustMetric(
"nvidia_smi",
map[string]string{
"compute_mode": "Default",
"index": "0",
"name": "Quadro P400",
"pstate": "P8",
"uuid": "GPU-8f750be4-dfbc-23b9-b33f-da729a536494",
},
map[string]interface{}{
"clocks_current_graphics": 139,
"clocks_current_memory": 405,
"clocks_current_sm": 139,
"clocks_current_video": 544,
"encoder_stats_average_fps": 0,
"encoder_stats_average_latency": 0,
"encoder_stats_session_count": 0,
"fan_speed": 34,
"memory_free": 1998,
"memory_total": 1998,
"memory_used": 0,
"pcie_link_gen_current": 1,
"pcie_link_width_current": 16,
"temperature_gpu": 33,
"utilization_gpu": 0,
"utilization_memory": 3,
},
time.Unix(0, 0)),
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var acc testutil.Accumulator
acc := &testutil.Accumulator{}
octets, err := ioutil.ReadFile(filepath.Join("testdata", tt.filename))
require.NoError(t, err)
gatherNvidiaSMI(payload, acc)
fmt.Println()
err = gatherNvidiaSMI(octets, &acc)
require.NoError(t, err)
require.Equal(t, 1, len(acc.Metrics))
require.Equal(t, expectedMetric.fields, acc.Metrics[0].Fields)
require.Equal(t, expectedMetric.tags, acc.Metrics[0].Tags)
testutil.RequireMetricsEqual(t, tt.expected, acc.GetTelegrafMetrics(), testutil.IgnoreTime())
})
}
}