From aa07b95e00e279cd61694984bc6586dda3d4c7f7 Mon Sep 17 00:00:00 2001 From: Aaron Wood Date: Tue, 9 Jul 2019 15:45:02 -0700 Subject: [PATCH] Handle unknown error in nvidia-smi output (#6073) --- plugins/inputs/nvidia_smi/README.md | 4 ++++ plugins/inputs/nvidia_smi/nvidia_smi.go | 11 ++++++++++- plugins/inputs/nvidia_smi/nvidia_smi_test.go | 7 +++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/plugins/inputs/nvidia_smi/README.md b/plugins/inputs/nvidia_smi/README.md index c3bac8da5..7fe0c077a 100644 --- a/plugins/inputs/nvidia_smi/README.md +++ b/plugins/inputs/nvidia_smi/README.md @@ -59,3 +59,7 @@ nvidia_smi,compute_mode=Default,host=8218cf,index=0,name=GeForce\ GTX\ 1070,psta nvidia_smi,compute_mode=Default,host=8218cf,index=1,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665 fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=50i,utilization_gpu=100i,utilization_memory=85i 1523991122000000000 nvidia_smi,compute_mode=Default,host=8218cf,index=2,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-d4cfc28d-0481-8d07-b81a-ddfc63d74adf fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=58i,utilization_gpu=100i,utilization_memory=86i 1523991122000000000 ``` + +### Limitations +Note that there seems to be an issue with getting current memory clock values when the memory is overclocked. +This may or may not apply to everyone but it's confirmed to be an issue on an EVGA 2080 Ti. diff --git a/plugins/inputs/nvidia_smi/nvidia_smi.go b/plugins/inputs/nvidia_smi/nvidia_smi.go index 37dde689a..e2ec19959 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi.go @@ -143,7 +143,16 @@ func parseLine(line string) (map[string]string, map[string]interface{}, error) { continue } - if strings.Contains(col, "[Not Supported]") { + // In some cases we may not be able to get data. + // One such case is when the memory is overclocked. + // nvidia-smi reads the max supported memory clock from the stock value. + // If the current memory clock is greater than the max detected memory clock then we receive [Unknown Error] as a value. + + // For example, the stock max memory clock speed on a 2080 Ti is 7000 MHz which nvidia-smi detects. + // The user has overclocked their memory using an offset of +1000 so under load the memory clock reaches 8000 MHz. + // Now when nvidia-smi tries to read the current memory clock it fails and spits back [Unknown Error] as the value. + // This value will break the parsing logic below unless it is accounted for here. + if strings.Contains(col, "[Not Supported]") || strings.Contains(col, "[Unknown Error]") { continue } diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go index 4e0cc8eac..a16447d69 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi_test.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go @@ -42,3 +42,10 @@ func TestParseLineNotSupported(t *testing.T) { require.NoError(t, err) require.Equal(t, nil, fields["fan_speed"]) } + +func TestParseLineUnknownError(t *testing.T) { + line := "[Unknown Error], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n" + _, fields, err := parseLine(line) + require.NoError(t, err) + require.Equal(t, nil, fields["fan_speed"]) +}