Handle unknown error in nvidia-smi output (#6073)

This commit is contained in:
Aaron Wood 2019-07-09 15:45:02 -07:00 committed by Daniel Nelson
parent 1e12006ad6
commit aa07b95e00
3 changed files with 21 additions and 1 deletions

View File

@ -59,3 +59,7 @@ nvidia_smi,compute_mode=Default,host=8218cf,index=0,name=GeForce\ GTX\ 1070,psta
nvidia_smi,compute_mode=Default,host=8218cf,index=1,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665 fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=50i,utilization_gpu=100i,utilization_memory=85i 1523991122000000000 nvidia_smi,compute_mode=Default,host=8218cf,index=1,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-f9ba66fc-a7f5-94c5-da19-019ef2f9c665 fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=50i,utilization_gpu=100i,utilization_memory=85i 1523991122000000000
nvidia_smi,compute_mode=Default,host=8218cf,index=2,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-d4cfc28d-0481-8d07-b81a-ddfc63d74adf fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=58i,utilization_gpu=100i,utilization_memory=86i 1523991122000000000 nvidia_smi,compute_mode=Default,host=8218cf,index=2,name=GeForce\ GTX\ 1080,pstate=P2,uuid=GPU-d4cfc28d-0481-8d07-b81a-ddfc63d74adf fan_speed=100i,memory_free=7557i,memory_total=8114i,memory_used=557i,temperature_gpu=58i,utilization_gpu=100i,utilization_memory=86i 1523991122000000000
``` ```
### Limitations
Note that reading the current memory clock can fail when the memory is overclocked; in that case nvidia-smi reports `[Unknown Error]` instead of a value.
This may not affect every card, but it has been confirmed on an EVGA 2080 Ti.

View File

@ -143,7 +143,16 @@ func parseLine(line string) (map[string]string, map[string]interface{}, error) {
continue continue
} }
if strings.Contains(col, "[Not Supported]") { // In some cases we may not be able to get data.
// One such case is when the memory is overclocked.
// nvidia-smi reads the max supported memory clock from the stock value.
// If the current memory clock is greater than the max detected memory clock then we receive [Unknown Error] as a value.
// For example, the stock max memory clock speed on a 2080 Ti is 7000 MHz which nvidia-smi detects.
// The user has overclocked their memory using an offset of +1000 so under load the memory clock reaches 8000 MHz.
// Now when nvidia-smi tries to read the current memory clock it fails and spits back [Unknown Error] as the value.
// This value will break the parsing logic below unless it is accounted for here.
if strings.Contains(col, "[Not Supported]") || strings.Contains(col, "[Unknown Error]") {
continue continue
} }

View File

@ -42,3 +42,10 @@ func TestParseLineNotSupported(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
require.Equal(t, nil, fields["fan_speed"]) require.Equal(t, nil, fields["fan_speed"])
} }
// TestParseLineUnknownError verifies that parseLine accepts a line in which a
// column holds "[Unknown Error]": parsing must succeed and no value may be
// reported under the "fan_speed" key.
func TestParseLineUnknownError(t *testing.T) {
	// The first column carries "[Unknown Error]" in place of a reading.
	const input = "[Unknown Error], 11264, 1074, 10190, P8, 32, GeForce RTX 2080 Ti, GPU-c97b7f88-c06d-650f-5339-f8dd0c1315c0, Default, 1, 4, 0, 24.33, 1, 16, 0, 0, 0, 300, 300, 405, 540\n"

	_, got, parseErr := parseLine(input)

	require.NoError(t, parseErr)
	require.Equal(t, nil, got["fan_speed"])
}