Update smart input plugin to support more drive types (#5765)

This commit is contained in:
Greg 2019-05-07 16:20:03 -06:00 committed by Daniel Nelson
parent 67394709a9
commit 0d66ed70f8
3 changed files with 735 additions and 171 deletions

View File

@ -31,29 +31,27 @@ smartctl -s on <device>
[[inputs.smart]]
## Optionally specify the path to the smartctl executable
# path = "/usr/bin/smartctl"
#
## On most platforms smartctl requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl.
## Sudo must be configured to allow the telegraf user to run smartctl
## with out password.
## without a password.
# use_sudo = false
#
## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stopped rotating.
## See --nockeck in the man pages for smartctl.
## See --nocheck in the man pages for smartctl.
## smartctl version 5.41 and 5.42 have faulty detection of
## power mode and might require changing this value to
## "never" depending on your storage device.
## "never" depending on your disks.
# nocheck = "standby"
#
## Gather detailed metrics for each SMART Attribute.
## Defaults to "false"
##
# attributes = false
#
## Optionally specify devices to exclude from reporting.
# excludes = [ "/dev/pass6" ]
#
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan) for S.M.A.R.T. devices will
## be done and all found will be included except for the

View File

@ -3,6 +3,7 @@ package smart
import (
"bufio"
"fmt"
"log"
"os/exec"
"path"
"regexp"
@ -18,31 +19,46 @@ import (
)
var (
execCommand = exec.Command // execCommand is used to mock commands in tests.
// Device Model: APPLE SSD SM256E
modelInInfo = regexp.MustCompile("^Device Model:\\s+(.*)$")
// Product: HUH721212AL5204
// Model Number: TS128GMTE850
modelInfo = regexp.MustCompile("^(Device Model|Product|Model Number):\\s+(.*)$")
// Serial Number: S0X5NZBC422720
serialInInfo = regexp.MustCompile("^Serial Number:\\s+(.*)$")
serialInfo = regexp.MustCompile("^Serial Number:\\s+(.*)$")
// LU WWN Device Id: 5 002538 655584d30
wwnInInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$")
wwnInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$")
// User Capacity: 251,000,193,024 bytes [251 GB]
usercapacityInInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$")
usercapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$")
// SMART support is: Enabled
smartEnabledInInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$")
smartEnabledInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$")
// SMART overall-health self-assessment test result: PASSED
// SMART Health Status: OK
// PASSED, FAILED, UNKNOWN
smartOverallHealth = regexp.MustCompile("^SMART overall-health self-assessment test result:\\s+(\\w+).*$")
smartOverallHealth = regexp.MustCompile("^(SMART overall-health self-assessment test result|SMART Health Status):\\s+(\\w+).*$")
// Accumulated start-stop cycles: 7
sasStartStopAttr = regexp.MustCompile("^Accumulated start-stop cycles:\\s+(.*)$")
// Accumulated load-unload cycles: 39
sasLoadCycleAttr = regexp.MustCompile("^Accumulated load-unload cycles:\\s+(.*)$")
// Current Drive Temperature: 34 C
sasTempAttr = regexp.MustCompile("^Current Drive Temperature:\\s+(.*)\\s+C(.*)$")
// Temperature: 38 Celsius
nvmeTempAttr = regexp.MustCompile("^Temperature:\\s+(.*)\\s+(.*)$")
// Power Cycles: 472
nvmePowerCycleAttr = regexp.MustCompile("^Power Cycles:\\s+(.*)$")
// Power On Hours: 6,038
nvmePowerOnAttr = regexp.MustCompile("^Power On Hours:\\s+(.*)$")
// ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
// 1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0
// 5 Reallocated_Sector_Ct PO--CK 100 100 000 - 0
// 192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716
attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$")
attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9-]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$")
deviceFieldIds = map[string]string{
"1": "read_error_rate",
"7": "seek_error_rate",
"190": "temp_c",
"194": "temp_c",
"199": "udma_crc_errors",
}
@ -60,13 +76,13 @@ type Smart struct {
var sampleConfig = `
## Optionally specify the path to the smartctl executable
# path = "/usr/bin/smartctl"
#
## On most platforms smartctl requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl.
## Sudo must be configured to to allow the telegraf user to run smartctl
## with out password.
## without a password.
# use_sudo = false
#
## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stoped rotating.
## See --nocheck in the man pages for smartctl.
@ -74,15 +90,13 @@ var sampleConfig = `
## power mode and might require changing this value to
## "never" depending on your disks.
# nocheck = "standby"
#
## Gather detailed metrics for each SMART Attribute.
## Defaults to "false"
##
# attributes = false
#
## Optionally specify devices to exclude from reporting.
# excludes = [ "/dev/pass6" ]
#
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan) for S.M.A.R.T. devices will
## done and all found will be included except for the
@ -111,34 +125,36 @@ func (m *Smart) Gather(acc telegraf.Accumulator) error {
return err
}
}
log.Printf("D! [inputs.smart] devices: %+#v", devices)
m.getAttributes(acc, devices)
return nil
}
// Wrap with sudo
func sudo(sudo bool, command string, args ...string) *exec.Cmd {
var runCmd = func(sudo bool, command string, args ...string) ([]byte, error) {
cmd := exec.Command(command, args...)
if sudo {
return execCommand("sudo", append([]string{"-n", command}, args...)...)
cmd = exec.Command("sudo", append([]string{"-n", command}, args...)...)
}
return execCommand(command, args...)
return internal.CombinedOutputTimeout(cmd, time.Second*5)
}
// Scan for S.M.A.R.T. devices
func (m *Smart) scan() ([]string, error) {
cmd := sudo(m.UseSudo, m.Path, "--scan")
out, err := internal.CombinedOutputTimeout(cmd, time.Second*5)
out, err := runCmd(m.UseSudo, m.Path, "--scan")
if err != nil {
return []string{}, fmt.Errorf("failed to run command %s: %s - %s", strings.Join(cmd.Args, " "), err, string(out))
return []string{}, fmt.Errorf("failed to run command '%s --scan': %s - %s", m.Path, err, string(out))
}
devices := []string{}
for _, line := range strings.Split(string(out), "\n") {
dev := strings.Split(line, " ")
if len(dev) > 1 && !excludedDev(m.Excludes, strings.TrimSpace(dev[0])) {
log.Printf("D! [inputs.smart] adding device: %+#v", dev)
devices = append(devices, strings.TrimSpace(dev[0]))
} else {
log.Printf("D! [inputs.smart] skipping device: %+#v", dev)
}
}
return devices, nil
@ -158,7 +174,6 @@ func excludedDev(excludes []string, deviceLine string) bool {
// Get info and attributes for each S.M.A.R.T. device
func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) {
var wg sync.WaitGroup
wg.Add(len(devices))
@ -180,79 +195,77 @@ func exitStatus(err error) (int, error) {
return 0, err
}
func gatherDisk(acc telegraf.Accumulator, usesudo, attributes bool, smartctl, nockeck, device string, wg *sync.WaitGroup) {
func gatherDisk(acc telegraf.Accumulator, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) {
defer wg.Done()
// smartctl 5.41 & 5.42 are broken regarding handling of --nocheck/-n
args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", nockeck, "--format=brief"}
args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", nocheck, "--format=brief"}
args = append(args, strings.Split(device, " ")...)
cmd := sudo(usesudo, smartctl, args...)
out, e := internal.CombinedOutputTimeout(cmd, time.Second*5)
out, e := runCmd(usesudo, smartctl, args...)
outStr := string(out)
// Ignore all exit statuses except if it is a command line parse error
exitStatus, er := exitStatus(e)
if er != nil {
acc.AddError(fmt.Errorf("failed to run command %s: %s - %s", strings.Join(cmd.Args, " "), e, outStr))
acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", smartctl, strings.Join(args, " "), e, outStr))
return
}
device_tags := map[string]string{}
device_node := strings.Split(device, " ")[0]
device_tags["device"] = path.Base(device_node)
device_fields := make(map[string]interface{})
device_fields["exit_status"] = exitStatus
deviceTags := map[string]string{}
deviceNode := strings.Split(device, " ")[0]
deviceTags["device"] = path.Base(deviceNode)
deviceFields := make(map[string]interface{})
deviceFields["exit_status"] = exitStatus
log.Printf("D! [inputs.smart] gatherDisk '%s'", deviceNode)
scanner := bufio.NewScanner(strings.NewReader(outStr))
for scanner.Scan() {
line := scanner.Text()
model := modelInInfo.FindStringSubmatch(line)
if len(model) > 1 {
device_tags["model"] = model[1]
model := modelInfo.FindStringSubmatch(line)
if len(model) > 2 {
deviceTags["model"] = model[2]
}
serial := serialInInfo.FindStringSubmatch(line)
serial := serialInfo.FindStringSubmatch(line)
if len(serial) > 1 {
device_tags["serial_no"] = serial[1]
deviceTags["serial_no"] = serial[1]
}
wwn := wwnInInfo.FindStringSubmatch(line)
wwn := wwnInfo.FindStringSubmatch(line)
if len(wwn) > 1 {
device_tags["wwn"] = strings.Replace(wwn[1], " ", "", -1)
deviceTags["wwn"] = strings.Replace(wwn[1], " ", "", -1)
}
capacity := usercapacityInInfo.FindStringSubmatch(line)
capacity := usercapacityInfo.FindStringSubmatch(line)
if len(capacity) > 1 {
device_tags["capacity"] = strings.Replace(capacity[1], ",", "", -1)
deviceTags["capacity"] = strings.Replace(capacity[1], ",", "", -1)
}
enabled := smartEnabledInInfo.FindStringSubmatch(line)
enabled := smartEnabledInfo.FindStringSubmatch(line)
if len(enabled) > 1 {
device_tags["enabled"] = enabled[1]
deviceTags["enabled"] = enabled[1]
}
health := smartOverallHealth.FindStringSubmatch(line)
if len(health) > 1 {
device_fields["health_ok"] = (health[1] == "PASSED")
if len(health) > 2 {
deviceFields["health_ok"] = (health[2] == "PASSED" || health[2] == "OK")
}
tags := map[string]string{}
fields := make(map[string]interface{})
attr := attribute.FindStringSubmatch(line)
if len(attr) > 1 {
if collectAttributes {
deviceNode := strings.Split(device, " ")[0]
tags["device"] = path.Base(deviceNode)
if attributes {
tags := map[string]string{}
fields := make(map[string]interface{})
device_node := strings.Split(device, " ")[0]
tags["device"] = path.Base(device_node)
if serial, ok := device_tags["serial_no"]; ok {
if serial, ok := deviceTags["serial_no"]; ok {
tags["serial_no"] = serial
}
if wwn, ok := device_tags["wwn"]; ok {
if wwn, ok := deviceTags["wwn"]; ok {
tags["wwn"] = wwn
}
tags["id"] = attr[1]
@ -282,16 +295,95 @@ func gatherDisk(acc telegraf.Accumulator, usesudo, attributes bool, smartctl, no
// save the raw value to a field.
if field, ok := deviceFieldIds[attr[1]]; ok {
if val, err := parseRawValue(attr[8]); err == nil {
device_fields[field] = val
deviceFields[field] = val
}
}
} else {
if collectAttributes {
if startStop := sasStartStopAttr.FindStringSubmatch(line); len(startStop) > 1 {
tags["id"] = "4"
tags["name"] = "Start_Stop_Count"
i, err := strconv.ParseInt(strings.Replace(startStop[1], ",", "", -1), 10, 64)
if err != nil {
continue
}
fields["raw_value"] = i
acc.AddFields("smart_attribute", fields, tags)
continue
}
if powerCycle := nvmePowerCycleAttr.FindStringSubmatch(line); len(powerCycle) > 1 {
tags["id"] = "12"
tags["name"] = "Power_Cycle_Count"
i, err := strconv.ParseInt(strings.Replace(powerCycle[1], ",", "", -1), 10, 64)
if err != nil {
continue
}
fields["raw_value"] = i
acc.AddFields("smart_attribute", fields, tags)
continue
}
if powerOn := nvmePowerOnAttr.FindStringSubmatch(line); len(powerOn) > 1 {
tags["id"] = "9"
tags["name"] = "Power_On_Hours"
i, err := strconv.ParseInt(strings.Replace(powerOn[1], ",", "", -1), 10, 64)
if err != nil {
continue
}
fields["raw_value"] = i
acc.AddFields("smart_attribute", fields, tags)
continue
}
if loadCycle := sasLoadCycleAttr.FindStringSubmatch(line); len(loadCycle) > 1 {
tags["id"] = "193"
tags["name"] = "Load_Cycle_Count"
i, err := strconv.ParseInt(strings.Replace(loadCycle[1], ",", "", -1), 10, 64)
if err != nil {
continue
}
fields["raw_value"] = i
acc.AddFields("smart_attribute", fields, tags)
continue
}
if temp := sasTempAttr.FindStringSubmatch(line); len(temp) > 1 {
tags["id"] = "194"
tags["name"] = "Temperature_Celsius"
tempC, err := strconv.ParseInt(temp[1], 10, 64)
if err != nil {
continue
}
fields["raw_value"] = tempC
deviceFields["temp_c"] = tempC
acc.AddFields("smart_attribute", fields, tags)
}
if temp := nvmeTempAttr.FindStringSubmatch(line); len(temp) > 1 {
tags["id"] = "194"
tags["name"] = "Temperature_Celsius"
tempC, err := strconv.ParseInt(temp[1], 10, 64)
if err != nil {
continue
}
fields["raw_value"] = tempC
deviceFields["temp_c"] = tempC
acc.AddFields("smart_attribute", fields, tags)
}
}
}
}
acc.AddFields("smart_device", device_fields, device_tags)
acc.AddFields("smart_device", deviceFields, deviceTags)
}
func parseRawValue(rawVal string) (int64, error) {
// Integer
if i, err := strconv.ParseInt(rawVal, 10, 64); err == nil {
return i, nil

View File

@ -1,9 +1,8 @@
package smart
import (
"fmt"
"os"
"os/exec"
"errors"
"sync"
"testing"
"github.com/influxdata/telegraf/testutil"
@ -11,66 +10,24 @@ import (
"github.com/stretchr/testify/require"
)
// Canned smartctl output used by the tests in place of running the real
// executable.
var (
// Output of: smartctl --scan
mockScanData = `/dev/ada0 -d atacam # /dev/ada0, ATA device
`
// Output of: smartctl --info --health --attributes --tolerance=verypermissive -n standby --format=brief [DEVICE]
mockInfoAttributeData = `smartctl 6.5 2016-05-07 r4318 [Darwin 16.4.0 x86_64] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
CHECK POWER MODE not implemented, ignoring -n option
=== START OF INFORMATION SECTION ===
Model Family: Apple SD/SM/TS...E/F SSDs
Device Model: APPLE SSD SM256E
Serial Number: S0X5NZBC422720
LU WWN Device Id: 5 002538 043584d30
Firmware Version: CXM09A1Q
User Capacity: 251,000,193,024 bytes [251 GB]
Sector Sizes: 512 bytes logical, 4096 bytes physical
Rotation Rate: Solid State Device
Device is: In smartctl database [for details use: -P show]
ATA Version is: ATA8-ACS T13/1699-D revision 4c
SATA Version is: SATA 3.0, 6.0 Gb/s (current: 6.0 Gb/s)
Local Time is: Thu Feb 9 16:48:45 2017 CET
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
=== START OF READ SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED
=== START OF READ SMART DATA SECTION ===
SMART Attributes Data Structure revision number: 1
Vendor Specific SMART Attributes with Thresholds:
ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0
5 Reallocated_Sector_Ct PO--CK 100 100 000 - 0
9 Power_On_Hours -O--CK 099 099 000 - 2988
12 Power_Cycle_Count -O--CK 085 085 000 - 14879
169 Unknown_Attribute PO--C- 253 253 010 - 2044932921600
173 Wear_Leveling_Count -O--CK 185 185 100 - 957808640337
190 Airflow_Temperature_Cel -O---K 055 040 045 Past 45 (Min/Max 43/57 #2689)
192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716
194 Temperature_Celsius -O---K 066 021 000 - 34 (Min/Max 14/79)
197 Current_Pending_Sector -O---K 100 100 000 - 0
199 UDMA_CRC_Error_Count -O-RC- 200 200 000 - 0
240 Head_Flying_Hours ------ 100 253 000 - 6585h+55m+23.234s
||||||_ K auto-keep
|||||__ C event count
||||___ R error rate
|||____ S speed/performance
||_____ O updated online
|______ P prefailure warning
`
)
func TestGatherAttributes(t *testing.T) {
s := &Smart{
Path: "smartctl",
Attributes: true,
}
// overwriting exec commands with mock commands
execCommand = fakeExecCommand
var acc testutil.Accumulator
runCmd = func(sudo bool, command string, args ...string) ([]byte, error) {
if len(args) > 0 {
if args[0] == "--scan" {
return []byte(mockScanData), nil
} else if args[0] == "--info" {
return []byte(mockInfoAttributeData), nil
}
}
return nil, errors.New("command not found")
}
err := s.Gather(&acc)
require.NoError(t, err)
@ -302,8 +259,6 @@ func TestGatherAttributes(t *testing.T) {
acc.AssertContainsTaggedFields(t, "smart_attribute", test.fields, test.tags)
}
// tags = map[string]string{}
var testsAda0Device = []struct {
fields map[string]interface{}
tags map[string]string
@ -330,7 +285,6 @@ func TestGatherAttributes(t *testing.T) {
for _, test := range testsAda0Device {
acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags)
}
}
func TestGatherNoAttributes(t *testing.T) {
@ -339,7 +293,6 @@ func TestGatherNoAttributes(t *testing.T) {
Attributes: false,
}
// overwriting exec commands with mock commands
execCommand = fakeExecCommand
var acc testutil.Accumulator
err := s.Gather(&acc)
@ -348,8 +301,6 @@ func TestGatherNoAttributes(t *testing.T) {
assert.Equal(t, 5, acc.NFields(), "Wrong number of fields gathered")
acc.AssertDoesNotContainMeasurement(t, "smart_attribute")
// tags = map[string]string{}
var testsAda0Device = []struct {
fields map[string]interface{}
tags map[string]string
@ -376,51 +327,574 @@ func TestGatherNoAttributes(t *testing.T) {
for _, test := range testsAda0Device {
acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags)
}
}
// TestExcludedDev checks excludedDev against three inputs: a device line
// whose device is in the exclude list, an empty exclude list, and an exclude
// list that names a different device.
func TestExcludedDev(t *testing.T) {
	cases := []struct {
		excludes   []string
		deviceLine string
		want       bool
		msg        string
	}{
		{[]string{"/dev/pass6"}, "/dev/pass6 -d atacam", true, "Should be excluded."},
		{[]string{}, "/dev/pass6 -d atacam", false, "Shouldn't be excluded."},
		{[]string{"/dev/pass6"}, "/dev/pass1 -d atacam", false, "Shouldn't be excluded."},
	}

	for _, tc := range cases {
		assert.Equal(t, tc.want, excludedDev(tc.excludes, tc.deviceLine), tc.msg)
	}
}
// fakeExecCommand is a test helper that stands in for exec.Command.
// Instead of launching the requested program it builds a command that
// re-runs the current test binary (os.Args[0]) limited to TestHelperProcess,
// forwarding the original command and its arguments after a "--" separator.
// The GO_WANT_HELPER_PROCESS=1 environment entry is the only one set on the
// returned command.
func fakeExecCommand(command string, args ...string) *exec.Cmd {
	helperArgs := append([]string{"-test.run=TestHelperProcess", "--", command}, args...)

	helper := exec.Command(os.Args[0], helperArgs...)
	helper.Env = []string{"GO_WANT_HELPER_PROCESS=1"}
	return helper
}
// TestHelperProcess isn't a real test. It's used to mock exec.Command
// For example, if you run:
// GO_WANT_HELPER_PROCESS=1 go test -test.run=TestHelperProcess -- --scan
// it returns below mockScanData.
func TestHelperProcess(t *testing.T) {
if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" {
return
func TestGatherSATAInfo(t *testing.T) {
runCmd = func(sudo bool, command string, args ...string) ([]byte, error) {
return []byte(hgstSATAInfoData), nil
}
args := os.Args
var (
acc = &testutil.Accumulator{}
wg = &sync.WaitGroup{}
)
// Previous arguments are tests stuff, that looks like :
// /tmp/go-build970079519/…/_test/integration.test -test.run=TestHelperProcess --
cmd, arg1, args := args[3], args[4], args[5:]
if cmd == "smartctl" {
if arg1 == "--scan" {
fmt.Fprint(os.Stdout, mockScanData)
}
if arg1 == "--info" {
fmt.Fprint(os.Stdout, mockInfoAttributeData)
}
} else {
fmt.Fprint(os.Stdout, "command not found")
os.Exit(1)
}
os.Exit(0)
wg.Add(1)
gatherDisk(acc, true, true, "", "", "", wg)
assert.Equal(t, 101, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, uint64(20), acc.NMetrics(), "Wrong number of metrics gathered")
}
// TestGatherSATAInfo65 stubs runCmd so that it returns the canned
// hgstSATAInfoData65 output, runs gatherDisk once with sudo and attribute
// collection both enabled, and checks how many fields and metrics were
// accumulated.
func TestGatherSATAInfo65(t *testing.T) {
	// Every command invocation yields the fixture, regardless of arguments.
	runCmd = func(_ bool, _ string, _ ...string) ([]byte, error) {
		return []byte(hgstSATAInfoData65), nil
	}

	acc := new(testutil.Accumulator)
	var wg sync.WaitGroup
	wg.Add(1)

	gatherDisk(acc, true, true, "", "", "", &wg)

	assert.Equal(t, 91, acc.NFields(), "Wrong number of fields gathered")
	assert.Equal(t, uint64(18), acc.NMetrics(), "Wrong number of metrics gathered")
}
// TestGatherHgstSAS stubs runCmd so that it returns the canned
// hgstSASInfoData output, runs gatherDisk once with sudo and attribute
// collection both enabled, and checks how many fields and metrics were
// accumulated.
func TestGatherHgstSAS(t *testing.T) {
	// Every command invocation yields the fixture, regardless of arguments.
	runCmd = func(_ bool, _ string, _ ...string) ([]byte, error) {
		return []byte(hgstSASInfoData), nil
	}

	acc := new(testutil.Accumulator)
	var wg sync.WaitGroup
	wg.Add(1)

	gatherDisk(acc, true, true, "", "", "", &wg)

	assert.Equal(t, 6, acc.NFields(), "Wrong number of fields gathered")
	assert.Equal(t, uint64(4), acc.NMetrics(), "Wrong number of metrics gathered")
}
// TestGatherHtSAS stubs runCmd so that it returns the canned htSASInfoData
// output, runs gatherDisk once with sudo and attribute collection both
// enabled, and checks how many fields and metrics were accumulated.
func TestGatherHtSAS(t *testing.T) {
	// Every command invocation yields the fixture, regardless of arguments.
	runCmd = func(_ bool, _ string, _ ...string) ([]byte, error) {
		return []byte(htSASInfoData), nil
	}

	acc := new(testutil.Accumulator)
	var wg sync.WaitGroup
	wg.Add(1)

	gatherDisk(acc, true, true, "", "", "", &wg)

	assert.Equal(t, 5, acc.NFields(), "Wrong number of fields gathered")
	assert.Equal(t, uint64(3), acc.NMetrics(), "Wrong number of metrics gathered")
}
// TestGatherSSD stubs runCmd so that it returns the canned ssdInfoData
// output, runs gatherDisk once with sudo and attribute collection both
// enabled, and checks how many fields and metrics were accumulated.
func TestGatherSSD(t *testing.T) {
	// Every command invocation yields the fixture, regardless of arguments.
	runCmd = func(_ bool, _ string, _ ...string) ([]byte, error) {
		return []byte(ssdInfoData), nil
	}

	acc := new(testutil.Accumulator)
	var wg sync.WaitGroup
	wg.Add(1)

	gatherDisk(acc, true, true, "", "", "", &wg)

	assert.Equal(t, 105, acc.NFields(), "Wrong number of fields gathered")
	assert.Equal(t, uint64(26), acc.NMetrics(), "Wrong number of metrics gathered")
}
// TestGatherSSDRaid stubs runCmd so that it returns the canned
// ssdRaidInfoData output, runs gatherDisk once with sudo and attribute
// collection both enabled, and checks how many fields and metrics were
// accumulated.
func TestGatherSSDRaid(t *testing.T) {
	// Every command invocation yields the fixture, regardless of arguments.
	runCmd = func(_ bool, _ string, _ ...string) ([]byte, error) {
		return []byte(ssdRaidInfoData), nil
	}

	acc := new(testutil.Accumulator)
	var wg sync.WaitGroup
	wg.Add(1)

	gatherDisk(acc, true, true, "", "", "", &wg)

	assert.Equal(t, 74, acc.NFields(), "Wrong number of fields gathered")
	assert.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered")
}
// TestGatherNvme stubs runCmd so that it returns the canned nvmeInfoData
// output, runs gatherDisk once with sudo and attribute collection both
// enabled, and checks how many fields and metrics were accumulated.
func TestGatherNvme(t *testing.T) {
	// Every command invocation yields the fixture, regardless of arguments.
	runCmd = func(_ bool, _ string, _ ...string) ([]byte, error) {
		return []byte(nvmeInfoData), nil
	}

	acc := new(testutil.Accumulator)
	var wg sync.WaitGroup
	wg.Add(1)

	gatherDisk(acc, true, true, "", "", "", &wg)

	assert.Equal(t, 6, acc.NFields(), "Wrong number of fields gathered")
	assert.Equal(t, uint64(4), acc.NMetrics(), "Wrong number of metrics gathered")
}
// smartctl output
var (
// smartctl --scan
mockScanData = `/dev/ada0 -d atacam # /dev/ada0, ATA device
`
// smartctl --info --health --attributes --tolerance=verypermissive -n standby --format=brief [DEVICE]
mockInfoAttributeData = `smartctl 6.5 2016-05-07 r4318 [Darwin 16.4.0 x86_64] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
CHECK POWER MODE not implemented, ignoring -n option
=== START OF INFORMATION SECTION ===
Model Family: Apple SD/SM/TS...E/F SSDs
Device Model: APPLE SSD SM256E
Serial Number: S0X5NZBC422720
LU WWN Device Id: 5 002538 043584d30
Firmware Version: CXM09A1Q
User Capacity: 251,000,193,024 bytes [251 GB]
Sector Sizes: 512 bytes logical, 4096 bytes physical
Rotation Rate: Solid State Device
Device is: In smartctl database [for details use: -P show]
ATA Version is: ATA8-ACS T13/1699-D revision 4c
SATA Version is: SATA 3.0, 6.0 Gb/s (current: 6.0 Gb/s)
Local Time is: Thu Feb 9 16:48:45 2017 CET
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
=== START OF READ SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED
=== START OF READ SMART DATA SECTION ===
SMART Attributes Data Structure revision number: 1
Vendor Specific SMART Attributes with Thresholds:
ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0
5 Reallocated_Sector_Ct PO--CK 100 100 000 - 0
9 Power_On_Hours -O--CK 099 099 000 - 2988
12 Power_Cycle_Count -O--CK 085 085 000 - 14879
169 Unknown_Attribute PO--C- 253 253 010 - 2044932921600
173 Wear_Leveling_Count -O--CK 185 185 100 - 957808640337
190 Airflow_Temperature_Cel -O---K 055 040 045 Past 45 (Min/Max 43/57 #2689)
192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716
194 Temperature_Celsius -O---K 066 021 000 - 34 (Min/Max 14/79)
197 Current_Pending_Sector -O---K 100 100 000 - 0
199 UDMA_CRC_Error_Count -O-RC- 200 200 000 - 0
240 Head_Flying_Hours ------ 100 253 000 - 6585h+55m+23.234s
||||||_ K auto-keep
|||||__ C event count
||||___ R error rate
|||____ S speed/performance
||_____ O updated online
|______ P prefailure warning
`
htSASInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.18-12-pve] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smar$montools.org
=== START OF INFORMATION SECTION ===
Vendor: HITACHI
Product: HUC103030CSS600
Revision: J350
Compliance: SPC-4
User Capacity: 300,$00,000,000 bytes [300 GB]
Logical block size: 512 bytes
Rotation Rate: 10020 rpm
Form Factor: 2.5 inches
Logical Unit id: 0x5000cca00a4bdbc8
Serial number: PDWAR9GE
Devicetype: disk
Transport protocol: SAS (SPL-3)
Local Time is: Wed Apr 17 15:01:28 2019 PDT
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
Temp$rature Warning: Disabled or Not Supported
=== START OF READ SMART DATA SECTION ===
SMART Health Status: OK
Current Drive Temperature: 36 C
Drive Trip Temperature: 85 C
Manufactured in $eek 52 of year 2009
Specified cycle count over device lifetime: 50000
Accumulated start-stop cycles: 47
Elements in grown defect list: 0
Vendor (Seagate) cache information
Blocks sent to initiator= 7270983270400000
`
hgstSASInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.0-46-generic] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
Vendor: HGST
Product: HUH721212AL5204
Revision: C3Q1
Compliance: SPC-4
User Capacity: 12,000,138,625,024 bytes [12.0 TB]
Logical block size: 512 bytes
Physical block size: 4096 bytes
LU is fully provisioned
Rotation Rate: 7200 rpm
Form Factor: 3.5 inches
Logical Unit id: 0x5000cca27076bfe8
Serial number: 8HJ39K3H
Device type: disk
Transport protocol: SAS (SPL-3)
Local Time is: Thu Apr 18 13:25:03 2019 MSK
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
Temperature Warning: Enabled
=== START OF READ SMART DATA SECTION ===
SMART Health Status: OK
Current Drive Temperature: 34 C
Drive Trip Temperature: 85 C
Manufactured in week 35 of year 2018
Specified cycle count over device lifetime: 50000
Accumulated start-stop cycles: 7
Specified load-unload count over device lifetime: 600000
Accumulated load-unload cycles: 39
Elements in grown defect list: 0
Vendor (Seagate) cache information
Blocks sent to initiator = 544135446528
`
hgstSATAInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.0-46-generic] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
Model Family: Hitachi/HGST Travelstar Z7K500
Device Model: HGST HTE725050A7E630
Serial Number: RCE50G20G81S9S
LU WWN Device Id: 5 000cca 90bc3a98b
Firmware Version: GS2OA3E0
User Capacity: 500,107,862,016 bytes [500 GB]
Sector Sizes: 512 bytes logical, 4096 bytes physical
Rotation Rate: 7200 rpm
Form Factor: 2.5 inches
Device is: In smartctl database [for details use: -P show]
ATA Version is: ATA8-ACS T13/1699-D revision 6
SATA Version is: SATA 2.6, 6.0 Gb/s (current: 6.0 Gb/s)
Local Time is: Thu Apr 18 13:27:51 2019 MSK
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
Power mode is: ACTIVE or IDLE
=== START OF READ SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED
SMART Attributes Data Structure revision number: 16
Vendor Specific SMART Attributes with Thresholds:
ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
1 Raw_Read_Error_Rate PO-R-- 100 100 062 - 0
2 Throughput_Performance P-S--- 100 100 040 - 0
3 Spin_Up_Time POS--- 100 100 033 - 1
4 Start_Stop_Count -O--C- 100 100 000 - 4
5 Reallocated_Sector_Ct PO--CK 100 100 005 - 0
7 Seek_Error_Rate PO-R-- 100 100 067 - 0
8 Seek_Time_Performance P-S--- 100 100 040 - 0
9 Power_On_Hours -O--C- 099 099 000 - 743
10 Spin_Retry_Count PO--C- 100 100 060 - 0
12 Power_Cycle_Count -O--CK 100 100 000 - 4
191 G-Sense_Error_Rate -O-R-- 100 100 000 - 0
192 Power-Off_Retract_Count -O--CK 100 100 000 - 2
193 Load_Cycle_Count -O--C- 100 100 000 - 13
194 Temperature_Celsius -O---- 250 250 000 - 24 (Min/Max 15/29)
196 Reallocated_Event_Count -O--CK 100 100 000 - 0
197 Current_Pending_Sector -O---K 100 100 000 - 0
198 Offline_Uncorrectable ---R-- 100 100 000 - 0
199 UDMA_CRC_Error_Count -O-R-- 200 200 000 - 0
223 Load_Retry_Count -O-R-- 100 100 000 - 0
||||||_ K auto-keep
|||||__ C event count
||||___ R error rate
|||____ S speed/performance
||_____ O updated online
|______ P prefailure warning
`
hgstSATAInfoData65 = `smartctl 6.5 2016-01-24 r4214 [x86_64-linux-4.4.0-145-generic] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
Model Family: HGST Deskstar NAS
Device Model: HGST HDN724040ALE640
Serial Number: PK1334PEK49SBS
LU WWN Device Id: 5 000cca 250ec3c9c
Firmware Version: MJAOA5E0
User Capacity: 4,000,787,030,016 bytes [4.00 TB]
Sector Sizes: 512 bytes logical, 4096 bytes physical
Rotation Rate: 7200 rpm
Form Factor: 3.5 inches
Device is: In smartctl database [for details use: -P show]
ATA Version is: ATA8-ACS T13/1699-D revision 4
SATA Version is: SATA 3.0, 6.0 Gb/s (current: 6.0 Gb/s)
Local Time is: Wed Apr 17 15:14:27 2019 PDT
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
Power mode is: ACTIVE or IDLE
=== START OF READ SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED
SMART Attributes Data Structure revision number: 16
Vendor Specific SMART Attributes with Thresholds:
ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
1 Raw_Read_Error_Rate PO-R-- 100 100 016 - 0
2 Throughput_Performance P-S--- 135 135 054 - 84
3 Spin_Up_Time POS--- 125 125 024 - 621 (Average 619)
4 Start_Stop_Count -O--C- 100 100 000 - 33
5 Reallocated_Sector_Ct PO--CK 100 100 005 - 0
7 Seek_Error_Rate PO-R-- 100 100 067 - 0
8 Seek_Time_Performance P-S--- 119 119 020 - 35
9 Power_On_Hours -O--C- 098 098 000 - 19371
10 Spin_Retry_Count PO--C- 100 100 060 - 0
12 Power_Cycle_Count -O--CK 100 100 000 - 33
192 Power-Off_Retract_Count -O--CK 100 100 000 - 764
193 Load_Cycle_Count -O--C- 100 100 000 - 764
194 Temperature_Celsius -O---- 176 176 000 - 34 (Min/Max 21/53)
196 Reallocated_Event_Count -O--CK 100 100 000 - 0
197 Current_Pending_Sector -O---K 100 100 000 - 0
198 Offline_Uncorrectable ---R-- 100 100 000 - 0
199 UDMA_CRC_Error_Count -O-R-- 200 200 000 - 0
||||||_ K auto-keep
|||||__ C event count
||||___ R error rate
|||____ S speed/performance
||_____ O updated online
|______ P prefailure warning
`
// ssdInfoData is sample smartctl 6.6 output (Linux build) for a SATA SSD that
// reports "Device Model: SanDisk Ultra II 240GB". It contains the information
// section, a PASSED overall-health result, and an ATA attribute table in which
// most THRESH columns are "---" and the Temperature_Celsius row carries a
// trailing "(Min/Max 19/65)" annotation after its raw value.
// NOTE(review): presumably consumed by tests in this file through a mocked
// command runner; the consumer is outside this chunk, so confirm there before
// editing any byte of the literal.
ssdInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.0-33-generic] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
Device Model: SanDisk Ultra II 240GB
Serial Number: XXXXXXXX
LU WWN Device Id: XXXXXXXX
Firmware Version: XXXXXXX
User Capacity: 240.057.409.536 bytes [240 GB]
Sector Size: 512 bytes logical/physical
Rotation Rate: Solid State Device
Form Factor: 2.5 inches
Device is: Not in smartctl database [for details use: -P showall]
ATA Version is: ACS-2 T13/2015-D revision 3
SATA Version is: SATA 3.2, 6.0 Gb/s (current: 6.0 Gb/s)
Local Time is: Mon Sep 17 13:22:19 2018 CEST
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
Power mode is: ACTIVE or IDLE
=== START OF READ SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED
SMART Attributes Data Structure revision number: 4
Vendor Specific SMART Attributes with Thresholds:
ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
5 Reallocated_Sector_Ct -O--CK 100 100 --- - 0
9 Power_On_Hours -O--CK 100 100 --- - 6383
12 Power_Cycle_Count -O--CK 100 100 --- - 19
165 Unknown_Attribute -O--CK 100 100 --- - 59310806
166 Unknown_Attribute -O--CK 100 100 --- - 1
167 Unknown_Attribute -O--CK 100 100 --- - 57
168 Unknown_Attribute -O--CK 100 100 --- - 43
169 Unknown_Attribute -O--CK 100 100 --- - 221
170 Unknown_Attribute -O--CK 100 100 --- - 0
171 Unknown_Attribute -O--CK 100 100 --- - 0
172 Unknown_Attribute -O--CK 100 100 --- - 0
173 Unknown_Attribute -O--CK 100 100 --- - 13
174 Unknown_Attribute -O--CK 100 100 --- - 4
184 End-to-End_Error -O--CK 100 100 --- - 0
187 Reported_Uncorrect -O--CK 100 100 --- - 0
188 Command_Timeout -O--CK 100 100 --- - 0
194 Temperature_Celsius -O---K 066 065 --- - 34 (Min/Max 19/65)
199 UDMA_CRC_Error_Count -O--CK 100 100 --- - 0
230 Unknown_SSD_Attribute -O--CK 100 100 --- - 2229110374919
232 Available_Reservd_Space PO--CK 100 100 004 - 100
233 Media_Wearout_Indicator -O--CK 100 100 --- - 3129
234 Unknown_Attribute -O--CK 100 100 --- - 7444
241 Total_LBAs_Written ----CK 253 253 --- - 4812
242 Total_LBAs_Read ----CK 253 253 --- - 671
244 Unknown_Attribute -O--CK 000 100 --- - 0
||||||_ K auto-keep
|||||__ C event count
||||___ R error rate
|||____ S speed/performance
||_____ O updated online
|______ P prefailure warning
`
// ssdRaidInfoData is sample smartctl 6.6 output (FreeBSD 11.1 build) for a
// device reporting "Device Model: Samsung SSD 850 PRO 256GB". Unlike a plain
// info dump, it has two "CHECK POWER MODE" warning lines before the
// information section and a "SMART Status not supported" line at the start
// of the SMART data section. After the PASSED result it also includes the
// "General SMART Values" block, the attribute table, the error log, the
// self-test log and the selective self-test log, so it exercises input with
// many lines that are neither info fields nor attribute rows.
// NOTE(review): the name suggests the drive sits behind a RAID controller;
// nothing in the captured text itself states that - confirm against the test
// that uses this fixture.
ssdRaidInfoData = `smartctl 6.6 2017-11-05 r4594 [FreeBSD 11.1-RELEASE-p13 amd64] (local build)
Copyright (C) 2002-17, Bruce Allen, Christian Franke, www.smartmontools.org
CHECK POWER MODE: incomplete response, ATA output registers missing
CHECK POWER MODE not implemented, ignoring -n option
=== START OF INFORMATION SECTION ===
Model Family: Samsung based SSDs
Device Model: Samsung SSD 850 PRO 256GB
Serial Number: S251NX0H869353L
LU WWN Device Id: 5 002538 84027f72f
Firmware Version: EXM02B6Q
User Capacity: 256 060 514 304 bytes [256 GB]
Sector Size: 512 bytes logical/physical
Rotation Rate: Solid State Device
Device is: In smartctl database [for details use: -P show]
ATA Version is: ACS-2, ATA8-ACS T13/1699-D revision 4c
SATA Version is: SATA 3.1, 6.0 Gb/s (current: 6.0 Gb/s)
Local Time is: Fri Sep 21 17:49:16 2018 CEST
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
=== START OF READ SMART DATA SECTION ===
SMART Status not supported: Incomplete response, ATA output registers missing
SMART overall-health self-assessment test result: PASSED
Warning: This result is based on an Attribute check.
General SMART Values:
Offline data collection status: (0x00) Offline data collection activity
was never started.
Auto Offline Data Collection: Disabled.
Self-test execution status: ( 0) The previous self-test routine completed
without error or no self-test has ever
been run.
Total time to complete Offline
data collection: ( 0) seconds.
Offline data collection
capabilities: (0x53) SMART execute Offline immediate.
Auto Offline data collection on/off support.
Suspend Offline collection upon new
command.
No Offline surface scan supported.
Self-test supported.
No Conveyance Self-test supported.
Selective Self-test supported.
SMART capabilities: (0x0003) Saves SMART data before entering
power-saving mode.
Supports SMART auto save timer.
Error logging capability: (0x01) Error logging supported.
General Purpose Logging supported.
Short self-test routine
recommended polling time: ( 2) minutes.
Extended self-test routine
recommended polling time: ( 136) minutes.
SCT capabilities: (0x003d) SCT Status supported.
SCT Error Recovery Control supported.
SCT Feature Control supported.
SCT Data Table supported.
SMART Attributes Data Structure revision number: 1
Vendor Specific SMART Attributes with Thresholds:
ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
5 Reallocated_Sector_Ct PO--CK 099 099 010 - 1
9 Power_On_Hours -O--CK 094 094 000 - 26732
12 Power_Cycle_Count -O--CK 099 099 000 - 51
177 Wear_Leveling_Count PO--C- 001 001 000 - 7282
179 Used_Rsvd_Blk_Cnt_Tot PO--C- 099 099 010 - 1
181 Program_Fail_Cnt_Total -O--CK 100 100 010 - 0
182 Erase_Fail_Count_Total -O--CK 099 099 010 - 1
183 Runtime_Bad_Block PO--C- 099 099 010 - 1
187 Uncorrectable_Error_Cnt -O--CK 100 100 000 - 0
190 Airflow_Temperature_Cel -O--CK 081 069 000 - 19
195 ECC_Error_Rate -O-RC- 200 200 000 - 0
199 CRC_Error_Count -OSRCK 100 100 000 - 0
235 POR_Recovery_Count -O--C- 099 099 000 - 50
241 Total_LBAs_Written -O--CK 099 099 000 - 61956393677
||||||_ K auto-keep
|||||__ C event count
||||___ R error rate
|||____ S speed/performance
||_____ O updated online
|______ P prefailure warning
SMART Error Log Version: 1
No Errors Logged
SMART Self-test log structure revision number 1
Num Test_Description Status Remaining LifeTime(hours) LBA_of_first_error
# 1 Short offline Completed without error 00% 26717 -
# 2 Short offline Completed without error 00% 26693 -
# 3 Short offline Completed without error 00% 26669 -
# 4 Short offline Completed without error 00% 26645 -
# 5 Short offline Completed without error 00% 26621 -
# 6 Short offline Completed without error 00% 26596 -
# 7 Extended offline Completed without error 00% 26574 -
# 8 Short offline Completed without error 00% 26572 -
# 9 Short offline Completed without error 00% 26548 -
#10 Short offline Completed without error 00% 26524 -
#11 Short offline Completed without error 00% 26500 -
#12 Short offline Completed without error 00% 26476 -
#13 Short offline Completed without error 00% 26452 -
#14 Short offline Completed without error 00% 26428 -
#15 Extended offline Completed without error 00% 26406 -
#16 Short offline Completed without error 00% 26404 -
#17 Short offline Completed without error 00% 26380 -
#18 Short offline Completed without error 00% 26356 -
#19 Short offline Completed without error 00% 26332 -
#20 Short offline Completed without error 00% 26308 -
SMART Selective self-test log data structure revision number 1
SPAN MIN_LBA MAX_LBA CURRENT_TEST_STATUS
1 0 0 Not_testing
2 0 0 Not_testing
3 0 0 Not_testing
4 0 0 Not_testing
5 0 0 Not_testing
Selective self-test flags (0x0):
After scanning selected spans, do NOT read-scan remainder of disk.
If Selective self-test is pending on power-up, resume after 0 minute delay.
`
// nvmeInfoData is sample smartctl 6.5 output (Linux build) for an NVMe device
// that identifies itself with "Model Number: TS128GMTE850" rather than a
// "Device Model:" line. In place of an ATA attribute table it has a
// "SMART/Health Information (NVMe Log 0x02, ...)" section of "Key: value"
// lines; numeric values there use comma thousands separators and may carry a
// unit or bracketed suffix (e.g. "38 Celsius", "100%", "11,836,935 [6.06 TB]").
// The serial number line ends in a literal "?" character, which is part of
// the fixture data and must be preserved.
nvmeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
Model Number: TS128GMTE850
Serial Number: D704940282?
Firmware Version: C2.3.13
PCI Vendor/Subsystem ID: 0x126f
IEEE OUI Identifier: 0x000000
Controller ID: 1
Number of Namespaces: 1
Namespace 1 Size/Capacity: 128,035,676,160 [128 GB]
Namespace 1 Formatted LBA Size: 512
Local Time is: Fri Jun 15 11:41:35 2018 UTC
=== START OF SMART DATA SECTION ===
SMART overall-health self-assessment test result: PASSED
SMART/Health Information (NVMe Log 0x02, NSID 0xffffffff)
Critical Warning: 0x00
Temperature: 38 Celsius
Available Spare: 100%
Available Spare Threshold: 10%
Percentage Used: 16%
Data Units Read: 11,836,935 [6.06 TB]
Data Units Written: 62,288,091 [31.8 TB]
Host Read Commands: 135,924,188
Host Write Commands: 7,715,573,429
Controller Busy Time: 4,042
Power Cycles: 472
Power On Hours: 6,038
Unsafe Shutdowns: 355
Media and Data Integrity Errors: 0
Error Information Log Entries: 119,699
Warning Comp. Temperature Time: 0
Critical Comp. Temperature Time: 0
`
)