From 0d66ed70f84e18266bf641749c9ee4964215a044 Mon Sep 17 00:00:00 2001 From: Greg <2653109+glinton@users.noreply.github.com> Date: Tue, 7 May 2019 16:20:03 -0600 Subject: [PATCH] Update smart input plugin to support more drive types (#5765) --- plugins/inputs/smart/README.md | 18 +- plugins/inputs/smart/smart.go | 218 +++++++--- plugins/inputs/smart/smart_test.go | 670 ++++++++++++++++++++++++----- 3 files changed, 735 insertions(+), 171 deletions(-) diff --git a/plugins/inputs/smart/README.md b/plugins/inputs/smart/README.md index c60e11e35..127397f1e 100644 --- a/plugins/inputs/smart/README.md +++ b/plugins/inputs/smart/README.md @@ -31,29 +31,27 @@ smartctl -s on [[inputs.smart]] ## Optionally specify the path to the smartctl executable # path = "/usr/bin/smartctl" - # + ## On most platforms smartctl requires root access. ## Setting 'use_sudo' to true will make use of sudo to run smartctl. ## Sudo must be configured to to allow the telegraf user to run smartctl - ## with out password. + ## without a password. # use_sudo = false - # + ## Skip checking disks in this power mode. Defaults to ## "standby" to not wake up disks that have stoped rotating. - ## See --nockeck in the man pages for smartctl. + ## See --nocheck in the man pages for smartctl. ## smartctl version 5.41 and 5.42 have faulty detection of ## power mode and might require changing this value to - ## "never" depending on your storage device. + ## "never" depending on your disks. # nocheck = "standby" - # + ## Gather detailed metrics for each SMART Attribute. - ## Defaults to "false" - ## # attributes = false - # + ## Optionally specify devices to exclude from reporting. # excludes = [ "/dev/pass6" ] - # + ## Optionally specify devices and device type, if unset ## a scan (smartctl --scan) for S.M.A.R.T. devices will ## done and all found will be included except for the diff --git a/plugins/inputs/smart/smart.go b/plugins/inputs/smart/smart.go index 46912d487..8bec2581f 100644 --- a/plugins/inputs/smart/smart.go +++ b/plugins/inputs/smart/smart.go @@ -3,6 +3,7 @@ package smart import ( "bufio" "fmt" + "log" "os/exec" "path" "regexp" @@ -18,31 +19,46 @@ import ( ) var ( - execCommand = exec.Command // execCommand is used to mock commands in tests. - // Device Model: APPLE SSD SM256E - modelInInfo = regexp.MustCompile("^Device Model:\\s+(.*)$") + // Product: HUH721212AL5204 + // Model Number: TS128GMTE850 + modelInfo = regexp.MustCompile("^(Device Model|Product|Model Number):\\s+(.*)$") // Serial Number: S0X5NZBC422720 - serialInInfo = regexp.MustCompile("^Serial Number:\\s+(.*)$") + serialInfo = regexp.MustCompile("^Serial Number:\\s+(.*)$") // LU WWN Device Id: 5 002538 655584d30 - wwnInInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$") + wwnInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$") // User Capacity: 251,000,193,024 bytes [251 GB] - usercapacityInInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$") + usercapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$") // SMART support is: Enabled - smartEnabledInInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$") + smartEnabledInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$") // SMART overall-health self-assessment test result: PASSED + // SMART Health Status: OK // PASSED, FAILED, UNKNOWN - smartOverallHealth = regexp.MustCompile("^SMART overall-health self-assessment test result:\\s+(\\w+).*$") + smartOverallHealth = regexp.MustCompile("^(SMART overall-health self-assessment test result|SMART Health Status):\\s+(\\w+).*$") + + // Accumulated start-stop cycles: 7 + sasStartStopAttr = regexp.MustCompile("^Accumulated start-stop cycles:\\s+(.*)$") + // Accumulated load-unload cycles: 39 + sasLoadCycleAttr = regexp.MustCompile("^Accumulated load-unload cycles:\\s+(.*)$") + // Current Drive Temperature: 34 C + sasTempAttr = regexp.MustCompile("^Current Drive Temperature:\\s+(.*)\\s+C(.*)$") + // Temperature: 38 Celsius + nvmeTempAttr = regexp.MustCompile("^Temperature:\\s+(.*)\\s+(.*)$") + // Power Cycles: 472 + nvmePowerCycleAttr = regexp.MustCompile("^Power Cycles:\\s+(.*)$") + // Power On Hours: 6,038 + nvmePowerOnAttr = regexp.MustCompile("^Power On Hours:\\s+(.*)$") // ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE // 1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0 // 5 Reallocated_Sector_Ct PO--CK 100 100 000 - 0 // 192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716 - attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$") + attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9-]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$") deviceFieldIds = map[string]string{ "1": "read_error_rate", "7": "seek_error_rate", + "190": "temp_c", "194": "temp_c", "199": "udma_crc_errors", } @@ -60,13 +76,13 @@ type Smart struct { var sampleConfig = ` ## Optionally specify the path to the smartctl executable # path = "/usr/bin/smartctl" - # + ## On most platforms smartctl requires root access. ## Setting 'use_sudo' to true will make use of sudo to run smartctl. ## Sudo must be configured to to allow the telegraf user to run smartctl - ## with out password. + ## without a password. # use_sudo = false - # + ## Skip checking disks in this power mode. Defaults to ## "standby" to not wake up disks that have stoped rotating. ## See --nocheck in the man pages for smartctl. @@ -74,15 +90,13 @@ var sampleConfig = ` ## power mode and might require changing this value to ## "never" depending on your disks. # nocheck = "standby" - # + ## Gather detailed metrics for each SMART Attribute. - ## Defaults to "false" - ## # attributes = false - # + ## Optionally specify devices to exclude from reporting. # excludes = [ "/dev/pass6" ] - # + ## Optionally specify devices and device type, if unset ## a scan (smartctl --scan) for S.M.A.R.T. devices will ## done and all found will be included except for the @@ -111,34 +125,36 @@ func (m *Smart) Gather(acc telegraf.Accumulator) error { return err } } + log.Printf("D! [inputs.smart] devices: %+#v", devices) m.getAttributes(acc, devices) return nil } // Wrap with sudo -func sudo(sudo bool, command string, args ...string) *exec.Cmd { +var runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + cmd := exec.Command(command, args...) if sudo { - return execCommand("sudo", append([]string{"-n", command}, args...)...) + cmd = exec.Command("sudo", append([]string{"-n", command}, args...)...) } - - return execCommand(command, args...) + return internal.CombinedOutputTimeout(cmd, time.Second*5) } // Scan for S.M.A.R.T. devices func (m *Smart) scan() ([]string, error) { - - cmd := sudo(m.UseSudo, m.Path, "--scan") - out, err := internal.CombinedOutputTimeout(cmd, time.Second*5) + out, err := runCmd(m.UseSudo, m.Path, "--scan") if err != nil { - return []string{}, fmt.Errorf("failed to run command %s: %s - %s", strings.Join(cmd.Args, " "), err, string(out)) + return []string{}, fmt.Errorf("failed to run command '%s --scan': %s - %s", m.Path, err, string(out)) } devices := []string{} for _, line := range strings.Split(string(out), "\n") { dev := strings.Split(line, " ") if len(dev) > 1 && !excludedDev(m.Excludes, strings.TrimSpace(dev[0])) { + log.Printf("D! [inputs.smart] adding device: %+#v", dev) devices = append(devices, strings.TrimSpace(dev[0])) + } else { + log.Printf("D! [inputs.smart] skipping device: %+#v", dev) } } return devices, nil @@ -158,7 +174,6 @@ func excludedDev(excludes []string, deviceLine string) bool { // Get info and attributes for each S.M.A.R.T. device func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) { - var wg sync.WaitGroup wg.Add(len(devices)) @@ -180,79 +195,77 @@ func exitStatus(err error) (int, error) { return 0, err } -func gatherDisk(acc telegraf.Accumulator, usesudo, attributes bool, smartctl, nockeck, device string, wg *sync.WaitGroup) { - +func gatherDisk(acc telegraf.Accumulator, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) { defer wg.Done() // smartctl 5.41 & 5.42 have are broken regarding handling of --nocheck/-n - args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", nockeck, "--format=brief"} + args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", nocheck, "--format=brief"} args = append(args, strings.Split(device, " ")...) - cmd := sudo(usesudo, smartctl, args...) - out, e := internal.CombinedOutputTimeout(cmd, time.Second*5) + out, e := runCmd(usesudo, smartctl, args...) outStr := string(out) // Ignore all exit statuses except if it is a command line parse error exitStatus, er := exitStatus(e) if er != nil { - acc.AddError(fmt.Errorf("failed to run command %s: %s - %s", strings.Join(cmd.Args, " "), e, outStr)) + acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", smartctl, strings.Join(args, " "), e, outStr)) return } - device_tags := map[string]string{} - device_node := strings.Split(device, " ")[0] - device_tags["device"] = path.Base(device_node) - device_fields := make(map[string]interface{}) - device_fields["exit_status"] = exitStatus + deviceTags := map[string]string{} + deviceNode := strings.Split(device, " ")[0] + deviceTags["device"] = path.Base(deviceNode) + deviceFields := make(map[string]interface{}) + deviceFields["exit_status"] = exitStatus + + log.Printf("D! [inputs.smart] gatherDisk '%s'", deviceNode) scanner := bufio.NewScanner(strings.NewReader(outStr)) for scanner.Scan() { line := scanner.Text() - model := modelInInfo.FindStringSubmatch(line) - if len(model) > 1 { - device_tags["model"] = model[1] + model := modelInfo.FindStringSubmatch(line) + if len(model) > 2 { + deviceTags["model"] = model[2] } - serial := serialInInfo.FindStringSubmatch(line) + serial := serialInfo.FindStringSubmatch(line) if len(serial) > 1 { - device_tags["serial_no"] = serial[1] + deviceTags["serial_no"] = serial[1] } - wwn := wwnInInfo.FindStringSubmatch(line) + wwn := wwnInfo.FindStringSubmatch(line) if len(wwn) > 1 { - device_tags["wwn"] = strings.Replace(wwn[1], " ", "", -1) + deviceTags["wwn"] = strings.Replace(wwn[1], " ", "", -1) } - capacity := usercapacityInInfo.FindStringSubmatch(line) + capacity := usercapacityInfo.FindStringSubmatch(line) if len(capacity) > 1 { - device_tags["capacity"] = strings.Replace(capacity[1], ",", "", -1) + deviceTags["capacity"] = strings.Replace(capacity[1], ",", "", -1) } - enabled := smartEnabledInInfo.FindStringSubmatch(line) + enabled := smartEnabledInfo.FindStringSubmatch(line) if len(enabled) > 1 { - device_tags["enabled"] = enabled[1] + deviceTags["enabled"] = enabled[1] } health := smartOverallHealth.FindStringSubmatch(line) - if len(health) > 1 { - device_fields["health_ok"] = (health[1] == "PASSED") + if len(health) > 2 { + deviceFields["health_ok"] = (health[2] == "PASSED" || health[2] == "OK") } + tags := map[string]string{} + fields := make(map[string]interface{}) + attr := attribute.FindStringSubmatch(line) - if len(attr) > 1 { + if collectAttributes { + deviceNode := strings.Split(device, " ")[0] + tags["device"] = path.Base(deviceNode) - if attributes { - tags := map[string]string{} - fields := make(map[string]interface{}) - - device_node := strings.Split(device, " ")[0] - tags["device"] = path.Base(device_node) - - if serial, ok := device_tags["serial_no"]; ok { + if serial, ok := deviceTags["serial_no"]; ok { tags["serial_no"] = serial } - if wwn, ok := device_tags["wwn"]; ok { + if wwn, ok := deviceTags["wwn"]; ok { tags["wwn"] = wwn } tags["id"] = attr[1] @@ -282,16 +295,95 @@ func gatherDisk(acc telegraf.Accumulator, usesudo, attributes bool, smartctl, no // save the raw value to a field. if field, ok := deviceFieldIds[attr[1]]; ok { if val, err := parseRawValue(attr[8]); err == nil { - device_fields[field] = val + deviceFields[field] = val + } + } + } else { + if collectAttributes { + if startStop := sasStartStopAttr.FindStringSubmatch(line); len(startStop) > 1 { + tags["id"] = "4" + tags["name"] = "Start_Stop_Count" + i, err := strconv.ParseInt(strings.Replace(startStop[1], ",", "", -1), 10, 64) + if err != nil { + continue + } + fields["raw_value"] = i + + acc.AddFields("smart_attribute", fields, tags) + continue + } + + if powerCycle := nvmePowerCycleAttr.FindStringSubmatch(line); len(powerCycle) > 1 { + tags["id"] = "12" + tags["name"] = "Power_Cycle_Count" + i, err := strconv.ParseInt(strings.Replace(powerCycle[1], ",", "", -1), 10, 64) + if err != nil { + continue + } + fields["raw_value"] = i + + acc.AddFields("smart_attribute", fields, tags) + continue + } + + if powerOn := nvmePowerOnAttr.FindStringSubmatch(line); len(powerOn) > 1 { + tags["id"] = "9" + tags["name"] = "Power_On_Hours" + i, err := strconv.ParseInt(strings.Replace(powerOn[1], ",", "", -1), 10, 64) + if err != nil { + continue + } + fields["raw_value"] = i + + acc.AddFields("smart_attribute", fields, tags) + continue + } + + if loadCycle := sasLoadCycleAttr.FindStringSubmatch(line); len(loadCycle) > 1 { + tags["id"] = "193" + tags["name"] = "Load_Cycle_Count" + i, err := strconv.ParseInt(strings.Replace(loadCycle[1], ",", "", -1), 10, 64) + if err != nil { + continue + } + fields["raw_value"] = i + + acc.AddFields("smart_attribute", fields, tags) + continue + } + + if temp := sasTempAttr.FindStringSubmatch(line); len(temp) > 1 { + tags["id"] = "194" + tags["name"] = "Temperature_Celsius" + tempC, err := strconv.ParseInt(temp[1], 10, 64) + if err != nil { + continue + } + fields["raw_value"] = tempC + deviceFields["temp_c"] = tempC + + acc.AddFields("smart_attribute", fields, tags) + } + + if temp := nvmeTempAttr.FindStringSubmatch(line); len(temp) > 1 { + tags["id"] = "194" + tags["name"] = "Temperature_Celsius" + tempC, err := strconv.ParseInt(temp[1], 10, 64) + if err != nil { + continue + } + fields["raw_value"] = tempC + deviceFields["temp_c"] = tempC + + acc.AddFields("smart_attribute", fields, tags) } } } } - acc.AddFields("smart_device", device_fields, device_tags) + acc.AddFields("smart_device", deviceFields, deviceTags) } func parseRawValue(rawVal string) (int64, error) { - // Integer if i, err := strconv.ParseInt(rawVal, 10, 64); err == nil { return i, nil diff --git a/plugins/inputs/smart/smart_test.go b/plugins/inputs/smart/smart_test.go index da658f5f9..525d99e3b 100644 --- a/plugins/inputs/smart/smart_test.go +++ b/plugins/inputs/smart/smart_test.go @@ -1,9 +1,8 @@ package smart import ( - "fmt" - "os" - "os/exec" + "errors" + "sync" "testing" "github.com/influxdata/telegraf/testutil" @@ -11,66 +10,24 @@ import ( "github.com/stretchr/testify/require" ) -var ( - mockScanData = `/dev/ada0 -d atacam # /dev/ada0, ATA device -` - mockInfoAttributeData = `smartctl 6.5 2016-05-07 r4318 [Darwin 16.4.0 x86_64] (local build) -Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org - -CHECK POWER MODE not implemented, ignoring -n option -=== START OF INFORMATION SECTION === -Model Family: Apple SD/SM/TS...E/F SSDs -Device Model: APPLE SSD SM256E -Serial Number: S0X5NZBC422720 -LU WWN Device Id: 5 002538 043584d30 -Firmware Version: CXM09A1Q -User Capacity: 251,000,193,024 bytes [251 GB] -Sector Sizes: 512 bytes logical, 4096 bytes physical -Rotation Rate: Solid State Device -Device is: In smartctl database [for details use: -P show] -ATA Version is: ATA8-ACS T13/1699-D revision 4c -SATA Version is: SATA 3.0, 6.0 Gb/s (current: 6.0 Gb/s) -Local Time is: Thu Feb 9 16:48:45 2017 CET -SMART support is: Available - device has SMART capability. -SMART support is: Enabled - -=== START OF READ SMART DATA SECTION === -SMART overall-health self-assessment test result: PASSED - -=== START OF READ SMART DATA SECTION === -SMART Attributes Data Structure revision number: 1 -Vendor Specific SMART Attributes with Thresholds: -ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE - 1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0 - 5 Reallocated_Sector_Ct PO--CK 100 100 000 - 0 - 9 Power_On_Hours -O--CK 099 099 000 - 2988 - 12 Power_Cycle_Count -O--CK 085 085 000 - 14879 -169 Unknown_Attribute PO--C- 253 253 010 - 2044932921600 -173 Wear_Leveling_Count -O--CK 185 185 100 - 957808640337 -190 Airflow_Temperature_Cel -O---K 055 040 045 Past 45 (Min/Max 43/57 #2689) -192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716 -194 Temperature_Celsius -O---K 066 021 000 - 34 (Min/Max 14/79) -197 Current_Pending_Sector -O---K 100 100 000 - 0 -199 UDMA_CRC_Error_Count -O-RC- 200 200 000 - 0 -240 Head_Flying_Hours ------ 100 253 000 - 6585h+55m+23.234s - ||||||_ K auto-keep - |||||__ C event count - ||||___ R error rate - |||____ S speed/performance - ||_____ O updated online - |______ P prefailure warning -` -) - func TestGatherAttributes(t *testing.T) { s := &Smart{ Path: "smartctl", Attributes: true, } - // overwriting exec commands with mock commands - execCommand = fakeExecCommand var acc testutil.Accumulator + runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + if len(args) > 0 { + if args[0] == "--scan" { + return []byte(mockScanData), nil + } else if args[0] == "--info" { + return []byte(mockInfoAttributeData), nil + } + } + return nil, errors.New("command not found") + } + err := s.Gather(&acc) require.NoError(t, err) @@ -302,8 +259,6 @@ func TestGatherAttributes(t *testing.T) { acc.AssertContainsTaggedFields(t, "smart_attribute", test.fields, test.tags) } - // tags = map[string]string{} - var testsAda0Device = []struct { fields map[string]interface{} tags map[string]string @@ -330,7 +285,6 @@ func TestGatherAttributes(t *testing.T) { for _, test := range testsAda0Device { acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags) } - } func TestGatherNoAttributes(t *testing.T) { @@ -339,7 +293,6 @@ func TestGatherNoAttributes(t *testing.T) { Attributes: false, } // overwriting exec commands with mock commands - execCommand = fakeExecCommand var acc testutil.Accumulator err := s.Gather(&acc) @@ -348,8 +301,6 @@ func TestGatherNoAttributes(t *testing.T) { assert.Equal(t, 5, acc.NFields(), "Wrong number of fields gathered") acc.AssertDoesNotContainMeasurement(t, "smart_attribute") - // tags = map[string]string{} - var testsAda0Device = []struct { fields map[string]interface{} tags map[string]string @@ -376,51 +327,574 @@ func TestGatherNoAttributes(t *testing.T) { for _, test := range testsAda0Device { acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags) } - } func TestExcludedDev(t *testing.T) { assert.Equal(t, true, excludedDev([]string{"/dev/pass6"}, "/dev/pass6 -d atacam"), "Should be excluded.") assert.Equal(t, false, excludedDev([]string{}, "/dev/pass6 -d atacam"), "Shouldn't be excluded.") assert.Equal(t, false, excludedDev([]string{"/dev/pass6"}, "/dev/pass1 -d atacam"), "Shouldn't be excluded.") - } -// fackeExecCommand is a helper function that mock -// the exec.Command call (and call the test binary) -func fakeExecCommand(command string, args ...string) *exec.Cmd { - cs := []string{"-test.run=TestHelperProcess", "--", command} - cs = append(cs, args...) - cmd := exec.Command(os.Args[0], cs...) - cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"} - return cmd -} - -// TestHelperProcess isn't a real test. It's used to mock exec.Command -// For example, if you run: -// GO_WANT_HELPER_PROCESS=1 go test -test.run=TestHelperProcess -- --scan -// it returns below mockScanData. -func TestHelperProcess(t *testing.T) { - if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { - return +func TestGatherSATAInfo(t *testing.T) { + runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + return []byte(hgstSATAInfoData), nil } - args := os.Args + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + ) - // Previous arguments are tests stuff, that looks like : - // /tmp/go-build970079519/…/_test/integration.test -test.run=TestHelperProcess -- - cmd, arg1, args := args[3], args[4], args[5:] - - if cmd == "smartctl" { - if arg1 == "--scan" { - fmt.Fprint(os.Stdout, mockScanData) - } - if arg1 == "--info" { - fmt.Fprint(os.Stdout, mockInfoAttributeData) - } - } else { - fmt.Fprint(os.Stdout, "command not found") - os.Exit(1) - } - os.Exit(0) + wg.Add(1) + gatherDisk(acc, true, true, "", "", "", wg) + assert.Equal(t, 101, acc.NFields(), "Wrong number of fields gathered") + assert.Equal(t, uint64(20), acc.NMetrics(), "Wrong number of metrics gathered") } + +func TestGatherSATAInfo65(t *testing.T) { + runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + return []byte(hgstSATAInfoData65), nil + } + + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + ) + + wg.Add(1) + gatherDisk(acc, true, true, "", "", "", wg) + assert.Equal(t, 91, acc.NFields(), "Wrong number of fields gathered") + assert.Equal(t, uint64(18), acc.NMetrics(), "Wrong number of metrics gathered") +} + +func TestGatherHgstSAS(t *testing.T) { + runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + return []byte(hgstSASInfoData), nil + } + + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + ) + + wg.Add(1) + gatherDisk(acc, true, true, "", "", "", wg) + assert.Equal(t, 6, acc.NFields(), "Wrong number of fields gathered") + assert.Equal(t, uint64(4), acc.NMetrics(), "Wrong number of metrics gathered") +} + +func TestGatherHtSAS(t *testing.T) { + runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + return []byte(htSASInfoData), nil + } + + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + ) + + wg.Add(1) + gatherDisk(acc, true, true, "", "", "", wg) + assert.Equal(t, 5, acc.NFields(), "Wrong number of fields gathered") + assert.Equal(t, uint64(3), acc.NMetrics(), "Wrong number of metrics gathered") +} + +func TestGatherSSD(t *testing.T) { + runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + return []byte(ssdInfoData), nil + } + + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + ) + + wg.Add(1) + gatherDisk(acc, true, true, "", "", "", wg) + assert.Equal(t, 105, acc.NFields(), "Wrong number of fields gathered") + assert.Equal(t, uint64(26), acc.NMetrics(), "Wrong number of metrics gathered") +} + +func TestGatherSSDRaid(t *testing.T) { + runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + return []byte(ssdRaidInfoData), nil + } + + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + ) + + wg.Add(1) + gatherDisk(acc, true, true, "", "", "", wg) + assert.Equal(t, 74, acc.NFields(), "Wrong number of fields gathered") + assert.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered") +} + +func TestGatherNvme(t *testing.T) { + runCmd = func(sudo bool, command string, args ...string) ([]byte, error) { + return []byte(nvmeInfoData), nil + } + + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + ) + + wg.Add(1) + gatherDisk(acc, true, true, "", "", "", wg) + assert.Equal(t, 6, acc.NFields(), "Wrong number of fields gathered") + assert.Equal(t, uint64(4), acc.NMetrics(), "Wrong number of metrics gathered") +} + +// smartctl output +var ( + // smartctl --scan + mockScanData = `/dev/ada0 -d atacam # /dev/ada0, ATA device +` + // smartctl --info --health --attributes --tolerance=verypermissive -n standby --format=brief [DEVICE] + mockInfoAttributeData = `smartctl 6.5 2016-05-07 r4318 [Darwin 16.4.0 x86_64] (local build) +Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org + +CHECK POWER MODE not implemented, ignoring -n option +=== START OF INFORMATION SECTION === +Model Family: Apple SD/SM/TS...E/F SSDs +Device Model: APPLE SSD SM256E +Serial Number: S0X5NZBC422720 +LU WWN Device Id: 5 002538 043584d30 +Firmware Version: CXM09A1Q +User Capacity: 251,000,193,024 bytes [251 GB] +Sector Sizes: 512 bytes logical, 4096 bytes physical +Rotation Rate: Solid State Device +Device is: In smartctl database [for details use: -P show] +ATA Version is: ATA8-ACS T13/1699-D revision 4c +SATA Version is: SATA 3.0, 6.0 Gb/s (current: 6.0 Gb/s) +Local Time is: Thu Feb 9 16:48:45 2017 CET +SMART support is: Available - device has SMART capability. +SMART support is: Enabled + +=== START OF READ SMART DATA SECTION === +SMART overall-health self-assessment test result: PASSED + +=== START OF READ SMART DATA SECTION === +SMART Attributes Data Structure revision number: 1 +Vendor Specific SMART Attributes with Thresholds: +ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE + 1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0 + 5 Reallocated_Sector_Ct PO--CK 100 100 000 - 0 + 9 Power_On_Hours -O--CK 099 099 000 - 2988 + 12 Power_Cycle_Count -O--CK 085 085 000 - 14879 +169 Unknown_Attribute PO--C- 253 253 010 - 2044932921600 +173 Wear_Leveling_Count -O--CK 185 185 100 - 957808640337 +190 Airflow_Temperature_Cel -O---K 055 040 045 Past 45 (Min/Max 43/57 #2689) +192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716 +194 Temperature_Celsius -O---K 066 021 000 - 34 (Min/Max 14/79) +197 Current_Pending_Sector -O---K 100 100 000 - 0 +199 UDMA_CRC_Error_Count -O-RC- 200 200 000 - 0 +240 Head_Flying_Hours ------ 100 253 000 - 6585h+55m+23.234s + ||||||_ K auto-keep + |||||__ C event count + ||||___ R error rate + |||____ S speed/performance + ||_____ O updated online + |______ P prefailure warning +` + + htSASInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.18-12-pve] (local build) +Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smar$montools.org + +=== START OF INFORMATION SECTION === +Vendor: HITACHI +Product: HUC103030CSS600 +Revision: J350 +Compliance: SPC-4 +User Capacity: 300,$00,000,000 bytes [300 GB] +Logical block size: 512 bytes +Rotation Rate: 10020 rpm +Form Factor: 2.5 inches +Logical Unit id: 0x5000cca00a4bdbc8 +Serial number: PDWAR9GE +Devicetype: disk +Transport protocol: SAS (SPL-3) +Local Time is: Wed Apr 17 15:01:28 2019 PDT +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Temp$rature Warning: Disabled or Not Supported + +=== START OF READ SMART DATA SECTION === +SMART Health Status: OK + +Current Drive Temperature: 36 C +Drive Trip Temperature: 85 C + +Manufactured in $eek 52 of year 2009 +Specified cycle count over device lifetime: 50000 +Accumulated start-stop cycles: 47 +Elements in grown defect list: 0 + +Vendor (Seagate) cache information + Blocks sent to initiator= 7270983270400000 +` + + hgstSASInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.0-46-generic] (local build) +Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org + +=== START OF INFORMATION SECTION === +Vendor: HGST +Product: HUH721212AL5204 +Revision: C3Q1 +Compliance: SPC-4 +User Capacity: 12,000,138,625,024 bytes [12.0 TB] +Logical block size: 512 bytes +Physical block size: 4096 bytes +LU is fully provisioned +Rotation Rate: 7200 rpm +Form Factor: 3.5 inches +Logical Unit id: 0x5000cca27076bfe8 +Serial number: 8HJ39K3H +Device type: disk +Transport protocol: SAS (SPL-3) +Local Time is: Thu Apr 18 13:25:03 2019 MSK +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Temperature Warning: Enabled + +=== START OF READ SMART DATA SECTION === +SMART Health Status: OK + +Current Drive Temperature: 34 C +Drive Trip Temperature: 85 C + +Manufactured in week 35 of year 2018 +Specified cycle count over device lifetime: 50000 +Accumulated start-stop cycles: 7 +Specified load-unload count over device lifetime: 600000 +Accumulated load-unload cycles: 39 +Elements in grown defect list: 0 + +Vendor (Seagate) cache information + Blocks sent to initiator = 544135446528 +` + + hgstSATAInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.0-46-generic] (local build) +Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org + +=== START OF INFORMATION SECTION === +Model Family: Hitachi/HGST Travelstar Z7K500 +Device Model: HGST HTE725050A7E630 +Serial Number: RCE50G20G81S9S +LU WWN Device Id: 5 000cca 90bc3a98b +Firmware Version: GS2OA3E0 +User Capacity: 500,107,862,016 bytes [500 GB] +Sector Sizes: 512 bytes logical, 4096 bytes physical +Rotation Rate: 7200 rpm +Form Factor: 2.5 inches +Device is: In smartctl database [for details use: -P show] +ATA Version is: ATA8-ACS T13/1699-D revision 6 +SATA Version is: SATA 2.6, 6.0 Gb/s (current: 6.0 Gb/s) +Local Time is: Thu Apr 18 13:27:51 2019 MSK +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Power mode is: ACTIVE or IDLE + +=== START OF READ SMART DATA SECTION === +SMART overall-health self-assessment test result: PASSED + +SMART Attributes Data Structure revision number: 16 +Vendor Specific SMART Attributes with Thresholds: +ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE + 1 Raw_Read_Error_Rate PO-R-- 100 100 062 - 0 + 2 Throughput_Performance P-S--- 100 100 040 - 0 + 3 Spin_Up_Time POS--- 100 100 033 - 1 + 4 Start_Stop_Count -O--C- 100 100 000 - 4 + 5 Reallocated_Sector_Ct PO--CK 100 100 005 - 0 + 7 Seek_Error_Rate PO-R-- 100 100 067 - 0 + 8 Seek_Time_Performance P-S--- 100 100 040 - 0 + 9 Power_On_Hours -O--C- 099 099 000 - 743 + 10 Spin_Retry_Count PO--C- 100 100 060 - 0 + 12 Power_Cycle_Count -O--CK 100 100 000 - 4 +191 G-Sense_Error_Rate -O-R-- 100 100 000 - 0 +192 Power-Off_Retract_Count -O--CK 100 100 000 - 2 +193 Load_Cycle_Count -O--C- 100 100 000 - 13 +194 Temperature_Celsius -O---- 250 250 000 - 24 (Min/Max 15/29) +196 Reallocated_Event_Count -O--CK 100 100 000 - 0 +197 Current_Pending_Sector -O---K 100 100 000 - 0 +198 Offline_Uncorrectable ---R-- 100 100 000 - 0 +199 UDMA_CRC_Error_Count -O-R-- 200 200 000 - 0 +223 Load_Retry_Count -O-R-- 100 100 000 - 0 + ||||||_ K auto-keep + |||||__ C event count + ||||___ R error rate + |||____ S speed/performance + ||_____ O updated online + |______ P prefailure warning +` + + hgstSATAInfoData65 = `smartctl 6.5 2016-01-24 r4214 [x86_64-linux-4.4.0-145-generic] (local build) +Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org + +=== START OF INFORMATION SECTION === +Model Family: HGST Deskstar NAS +Device Model: HGST HDN724040ALE640 +Serial Number: PK1334PEK49SBS +LU WWN Device Id: 5 000cca 250ec3c9c +Firmware Version: MJAOA5E0 +User Capacity: 4,000,787,030,016 bytes [4.00 TB] +Sector Sizes: 512 bytes logical, 4096 bytes physical +Rotation Rate: 7200 rpm +Form Factor: 3.5 inches +Device is: In smartctl database [for details use: -P show] +ATA Version is: ATA8-ACS T13/1699-D revision 4 +SATA Version is: SATA 3.0, 6.0 Gb/s (current: 6.0 Gb/s) +Local Time is: Wed Apr 17 15:14:27 2019 PDT +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Power mode is: ACTIVE or IDLE + +=== START OF READ SMART DATA SECTION === +SMART overall-health self-assessment test result: PASSED + +SMART Attributes Data Structure revision number: 16 +Vendor Specific SMART Attributes with Thresholds: +ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE + 1 Raw_Read_Error_Rate PO-R-- 100 100 016 - 0 + 2 Throughput_Performance P-S--- 135 135 054 - 84 + 3 Spin_Up_Time POS--- 125 125 024 - 621 (Average 619) + 4 Start_Stop_Count -O--C- 100 100 000 - 33 + 5 Reallocated_Sector_Ct PO--CK 100 100 005 - 0 + 7 Seek_Error_Rate PO-R-- 100 100 067 - 0 + 8 Seek_Time_Performance P-S--- 119 119 020 - 35 + 9 Power_On_Hours -O--C- 098 098 000 - 19371 + 10 Spin_Retry_Count PO--C- 100 100 060 - 0 + 12 Power_Cycle_Count -O--CK 100 100 000 - 33 +192 Power-Off_Retract_Count -O--CK 100 100 000 - 764 +193 Load_Cycle_Count -O--C- 100 100 000 - 764 +194 Temperature_Celsius -O---- 176 176 000 - 34 (Min/Max 21/53) +196 Reallocated_Event_Count -O--CK 100 100 000 - 0 +197 Current_Pending_Sector -O---K 100 100 000 - 0 +198 Offline_Uncorrectable ---R-- 100 100 000 - 0 +199 UDMA_CRC_Error_Count -O-R-- 200 200 000 - 0 + ||||||_ K auto-keep + |||||__ C event count + ||||___ R error rate + |||____ S speed/performance + ||_____ O updated online + |______ P prefailure warning +` + + ssdInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.0-33-generic] (local build) +Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org + +=== START OF INFORMATION SECTION === +Device Model: SanDisk Ultra II 240GB +Serial Number: XXXXXXXX +LU WWN Device Id: XXXXXXXX +Firmware Version: XXXXXXX +User Capacity: 240.057.409.536 bytes [240 GB] +Sector Size: 512 bytes logical/physical +Rotation Rate: Solid State Device +Form Factor: 2.5 inches +Device is: Not in smartctl database [for details use: -P showall] +ATA Version is: ACS-2 T13/2015-D revision 3 +SATA Version is: SATA 3.2, 6.0 Gb/s (current: 6.0 Gb/s) +Local Time is: Mon Sep 17 13:22:19 2018 CEST +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Power mode is: ACTIVE or IDLE + +=== START OF READ SMART DATA SECTION === +SMART overall-health self-assessment test result: PASSED + +SMART Attributes Data Structure revision number: 4 +Vendor Specific SMART Attributes with Thresholds: +ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE + 5 Reallocated_Sector_Ct -O--CK 100 100 --- - 0 + 9 Power_On_Hours -O--CK 100 100 --- - 6383 + 12 Power_Cycle_Count -O--CK 100 100 --- - 19 +165 Unknown_Attribute -O--CK 100 100 --- - 59310806 +166 Unknown_Attribute -O--CK 100 100 --- - 1 +167 Unknown_Attribute -O--CK 100 100 --- - 57 +168 Unknown_Attribute -O--CK 100 100 --- - 43 +169 Unknown_Attribute -O--CK 100 100 --- - 221 +170 Unknown_Attribute -O--CK 100 100 --- - 0 +171 Unknown_Attribute -O--CK 100 100 --- - 0 +172 Unknown_Attribute -O--CK 100 100 --- - 0 +173 Unknown_Attribute -O--CK 100 100 --- - 13 +174 Unknown_Attribute -O--CK 100 100 --- - 4 +184 End-to-End_Error -O--CK 100 100 --- - 0 +187 Reported_Uncorrect -O--CK 100 100 --- - 0 +188 Command_Timeout -O--CK 100 100 --- - 0 +194 Temperature_Celsius -O---K 066 065 --- - 34 (Min/Max 19/65) +199 UDMA_CRC_Error_Count -O--CK 100 100 --- - 0 +230 Unknown_SSD_Attribute -O--CK 100 100 --- - 2229110374919 +232 Available_Reservd_Space PO--CK 100 100 004 - 100 +233 Media_Wearout_Indicator -O--CK 100 100 --- - 3129 +234 Unknown_Attribute -O--CK 100 100 --- - 7444 +241 Total_LBAs_Written ----CK 253 253 --- - 4812 +242 Total_LBAs_Read ----CK 253 253 --- - 671 +244 Unknown_Attribute -O--CK 000 100 --- - 0 + ||||||_ K auto-keep + |||||__ C event count + ||||___ R error rate + |||____ S speed/performance + ||_____ O updated online + |______ P prefailure warning +` + ssdRaidInfoData = `smartctl 6.6 2017-11-05 r4594 [FreeBSD 11.1-RELEASE-p13 amd64] (local build) +Copyright (C) 2002-17, Bruce Allen, Christian Franke, www.smartmontools.org + +CHECK POWER MODE: incomplete response, ATA output registers missing +CHECK POWER MODE not implemented, ignoring -n option +=== START OF INFORMATION SECTION === +Model Family: Samsung based SSDs +Device Model: Samsung SSD 850 PRO 256GB +Serial Number: S251NX0H869353L +LU WWN Device Id: 5 002538 84027f72f +Firmware Version: EXM02B6Q +User Capacity: 256 060 514 304 bytes [256 GB] +Sector Size: 512 bytes logical/physical +Rotation Rate: Solid State Device +Device is: In smartctl database [for details use: -P show] +ATA Version is: ACS-2, ATA8-ACS T13/1699-D revision 4c +SATA Version is: SATA 3.1, 6.0 Gb/s (current: 6.0 Gb/s) +Local Time is: Fri Sep 21 17:49:16 2018 CEST +SMART support is: Available - device has SMART capability. +SMART support is: Enabled + +=== START OF READ SMART DATA SECTION === +SMART Status not supported: Incomplete response, ATA output registers missing +SMART overall-health self-assessment test result: PASSED +Warning: This result is based on an Attribute check. + +General SMART Values: +Offline data collection status: (0x00) Offline data collection activity + was never started. + Auto Offline Data Collection: Disabled. +Self-test execution status: ( 0) The previous self-test routine completed + without error or no self-test has ever + been run. +Total time to complete Offline +data collection: ( 0) seconds. +Offline data collection +capabilities: (0x53) SMART execute Offline immediate. + Auto Offline data collection on/off support. + Suspend Offline collection upon new + command. + No Offline surface scan supported. + Self-test supported. + No Conveyance Self-test supported. + Selective Self-test supported. +SMART capabilities: (0x0003) Saves SMART data before entering + power-saving mode. + Supports SMART auto save timer. +Error logging capability: (0x01) Error logging supported. + General Purpose Logging supported. +Short self-test routine +recommended polling time: ( 2) minutes. +Extended self-test routine +recommended polling time: ( 136) minutes. +SCT capabilities: (0x003d) SCT Status supported. + SCT Error Recovery Control supported. + SCT Feature Control supported. + SCT Data Table supported. + +SMART Attributes Data Structure revision number: 1 +Vendor Specific SMART Attributes with Thresholds: +ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE + 5 Reallocated_Sector_Ct PO--CK 099 099 010 - 1 + 9 Power_On_Hours -O--CK 094 094 000 - 26732 + 12 Power_Cycle_Count -O--CK 099 099 000 - 51 +177 Wear_Leveling_Count PO--C- 001 001 000 - 7282 +179 Used_Rsvd_Blk_Cnt_Tot PO--C- 099 099 010 - 1 +181 Program_Fail_Cnt_Total -O--CK 100 100 010 - 0 +182 Erase_Fail_Count_Total -O--CK 099 099 010 - 1 +183 Runtime_Bad_Block PO--C- 099 099 010 - 1 +187 Uncorrectable_Error_Cnt -O--CK 100 100 000 - 0 +190 Airflow_Temperature_Cel -O--CK 081 069 000 - 19 +195 ECC_Error_Rate -O-RC- 200 200 000 - 0 +199 CRC_Error_Count -OSRCK 100 100 000 - 0 +235 POR_Recovery_Count -O--C- 099 099 000 - 50 +241 Total_LBAs_Written -O--CK 099 099 000 - 61956393677 + ||||||_ K auto-keep + |||||__ C event count + ||||___ R error rate + |||____ S speed/performance + ||_____ O updated online + |______ P prefailure warning + +SMART Error Log Version: 1 +No Errors Logged + +SMART Self-test log structure revision number 1 +Num Test_Description Status Remaining LifeTime(hours) LBA_of_first_error +# 1 Short offline Completed without error 00% 26717 - +# 2 Short offline Completed without error 00% 26693 - +# 3 Short offline Completed without error 00% 26669 - +# 4 Short offline Completed without error 00% 26645 - +# 5 Short offline Completed without error 00% 26621 - +# 6 Short offline Completed without error 00% 26596 - +# 7 Extended offline Completed without error 00% 26574 - +# 8 Short offline Completed without error 00% 26572 - +# 9 Short offline Completed without error 00% 26548 - +#10 Short offline Completed without error 00% 26524 - +#11 Short offline Completed without error 00% 26500 - +#12 Short offline Completed without error 00% 26476 - +#13 Short offline Completed without error 00% 26452 - +#14 Short offline Completed without error 00% 26428 - +#15 Extended offline Completed without error 00% 26406 - +#16 Short offline Completed without error 00% 26404 - +#17 Short offline Completed without error 00% 26380 - +#18 Short offline Completed without error 00% 26356 - +#19 Short offline Completed without error 00% 26332 - +#20 Short offline Completed without error 00% 26308 - + +SMART Selective self-test log data structure revision number 1 + SPAN MIN_LBA MAX_LBA CURRENT_TEST_STATUS + 1 0 0 Not_testing + 2 0 0 Not_testing + 3 0 0 Not_testing + 4 0 0 Not_testing + 5 0 0 Not_testing +Selective self-test flags (0x0): + After scanning selected spans, do NOT read-scan remainder of disk. +If Selective self-test is pending on power-up, resume after 0 minute delay. +` + + nvmeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build) +Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org + +=== START OF INFORMATION SECTION === +Model Number: TS128GMTE850 +Serial Number: D704940282? +Firmware Version: C2.3.13 +PCI Vendor/Subsystem ID: 0x126f +IEEE OUI Identifier: 0x000000 +Controller ID: 1 +Number of Namespaces: 1 +Namespace 1 Size/Capacity: 128,035,676,160 [128 GB] +Namespace 1 Formatted LBA Size: 512 +Local Time is: Fri Jun 15 11:41:35 2018 UTC + +=== START OF SMART DATA SECTION === +SMART overall-health self-assessment test result: PASSED + +SMART/Health Information (NVMe Log 0x02, NSID 0xffffffff) +Critical Warning: 0x00 +Temperature: 38 Celsius +Available Spare: 100% +Available Spare Threshold: 10% +Percentage Used: 16% +Data Units Read: 11,836,935 [6.06 TB] +Data Units Written: 62,288,091 [31.8 TB] +Host Read Commands: 135,924,188 +Host Write Commands: 7,715,573,429 +Controller Busy Time: 4,042 +Power Cycles: 472 +Power On Hours: 6,038 +Unsafe Shutdowns: 355 +Media and Data Integrity Errors: 0 +Error Information Log Entries: 119,699 +Warning Comp. Temperature Time: 0 +Critical Comp. Temperature Time: 0 +` +)