feat(inputs.smart): Collect SSD endurance information where available in smartctl (#11391)

This commit is contained in:
Ben Tasker 2022-07-12 18:25:54 +01:00 committed by GitHub
parent 0ef5df50af
commit fa0c9c937e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 98 additions and 9 deletions

View File

@ -180,10 +180,13 @@ execute this script.
- fields:
- exit_status
- health_ok
- media_wearout_indicator
- percent_lifetime_remain
- read_error_rate
- seek_error
- temp_c
- udma_crc_errors
- wear_leveling_count
- smart_attribute:
- tags:

View File

@ -87,6 +87,14 @@ var (
"199": "udma_crc_errors",
}
// Some fields of interest are reported under vendor-specific attribute IDs,
// so they cannot be looked up by ID and are matched on the smartctl
// attribute name instead. Keys are attribute names; values are the device
// field names under which the parsed raw value is stored.
deviceFieldNames = map[string]string{
"Percent_Lifetime_Remain": "percent_lifetime_remain",
"Wear_Leveling_Count": "wear_leveling_count",
"Media_Wearout_Indicator": "media_wearout_indicator",
}
// to obtain metrics from smartctl
sasNVMeAttributes = map[string]struct {
ID string
@ -150,6 +158,10 @@ var (
Name: "Percentage_Used",
Parse: parsePercentageInt,
},
"Percentage used endurance indicator": {
Name: "Percentage_Used",
Parse: parsePercentageInt,
},
"Data Units Read": {
Name: "Data_Units_Read",
Parse: parseDataUnits,
@ -817,6 +829,16 @@ func (m *Smart) gatherDisk(acc telegraf.Accumulator, device string, wg *sync.Wai
deviceFields[field] = val
}
}
if len(attr) > 4 {
// If the attribute name matches one in deviceFieldNames
// save the value to a field
if field, ok := deviceFieldNames[attr[2]]; ok {
if val, err := parseRawValue(attr[4]); err == nil {
deviceFields[field] = val
}
}
}
} else {
// what was found is not a vendor attribute
if matches := sasNVMeAttr.FindStringSubmatch(line); len(matches) > 2 {

View File

@ -52,7 +52,7 @@ func TestGatherAttributes(t *testing.T) {
err := s.Gather(&acc)
require.NoError(t, err)
assert.Equal(t, 65, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, 66, acc.NFields(), "Wrong number of fields gathered")
for _, test := range testsAda0Attributes {
acc.AssertContainsTaggedFields(t, "smart_attribute", test.fields, test.tags)
@ -171,7 +171,7 @@ func TestGatherNoAttributes(t *testing.T) {
err := s.Gather(&acc)
require.NoError(t, err)
assert.Equal(t, 8, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, 9, acc.NFields(), "Wrong number of fields gathered")
acc.AssertDoesNotContainMeasurement(t, "smart_attribute")
for _, test := range testsAda0Device {
@ -264,6 +264,23 @@ func TestGatherHtSAS(t *testing.T) {
testutil.RequireMetricsEqual(t, testHtsasAtributtes, acc.GetTelegrafMetrics(), testutil.SortMetrics(), testutil.IgnoreTime())
}
// TestGatherLongFormEnduranceAttrib feeds gatherDisk the mockHGST smartctl
// output, which reports wear through the long-form
// "Percentage used endurance indicator" line, and checks the total number
// of fields and metrics that end up in the accumulator.
func TestGatherLongFormEnduranceAttrib(t *testing.T) {
	// Stub the command runner so it always returns the canned output.
	runCmd = func(_ config.Duration, _ bool, _ string, _ ...string) ([]byte, error) {
		return []byte(mockHGST), nil
	}

	var acc testutil.Accumulator
	var wg sync.WaitGroup

	wg.Add(1)
	sampleSmart.gatherDisk(&acc, "", &wg)

	assert.Equal(t, 7, acc.NFields(), "Wrong number of fields gathered")
	assert.Equal(t, uint64(5), acc.NMetrics(), "Wrong number of metrics gathered")
}
func TestGatherSSD(t *testing.T) {
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
return []byte(ssdInfoData), nil
@ -276,7 +293,7 @@ func TestGatherSSD(t *testing.T) {
wg.Add(1)
sampleSmart.gatherDisk(acc, "", wg)
assert.Equal(t, 105, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, 106, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, uint64(26), acc.NMetrics(), "Wrong number of metrics gathered")
}
@ -292,7 +309,7 @@ func TestGatherSSDRaid(t *testing.T) {
wg.Add(1)
sampleSmart.gatherDisk(acc, "", wg)
assert.Equal(t, 74, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, 75, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered")
}
@ -1394,11 +1411,12 @@ var (
}{
{
map[string]interface{}{
"exit_status": int(0),
"health_ok": bool(true),
"read_error_rate": int64(0),
"temp_c": int64(34),
"udma_crc_errors": int64(0),
"exit_status": int(0),
"health_ok": bool(true),
"read_error_rate": int64(0),
"temp_c": int64(34),
"udma_crc_errors": int64(0),
"wear_leveling_count": int64(185),
},
map[string]string{
"device": "ada0",
@ -1810,6 +1828,52 @@ ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
|||____ S speed/performance
||_____ O updated online
|______ P prefailure warning
`
// mockHGST is canned smartctl output for an HGST SAS solid state device.
// Its SMART data section reports endurance through the long-form
// "Percentage used endurance indicator" line.
mockHGST = `
smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.9.0-3-amd64] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
Vendor: HGST
Product: HUSMM1640ASS200
Revision: A360
Compliance: SPC-4
User Capacity: 400,088,457,216 bytes [400 GB]
Logical block size: 512 bytes
Physical block size: 4096 bytes
LU is resource provisioned, LBPRZ=1
Rotation Rate: Solid State Device
Form Factor: 2.5 inches
Logical Unit id: 0x5000cca04ec26364
Serial number: ZZZZZZZZZ
Device type: disk
Transport protocol: SAS (SPL-3)
Local Time is: Mon Nov 6 10:20:33 2017 CET
SMART support is: Available - device has SMART capability.
SMART support is: Enabled
Temperature Warning: Enabled
Read Cache is: Enabled
Writeback Cache is: Enabled
=== START OF READ SMART DATA SECTION ===
SMART Health Status: OK
Percentage used endurance indicator: 0%
Current Drive Temperature: 28 C
Drive Trip Temperature: 70 C
Manufactured in week 30 of year 2017
Specified cycle count over device lifetime: 0
Accumulated start-stop cycles: 0
Specified load-unload count over device lifetime: 0
Accumulated load-unload cycles: 0
defect list format 6 unknown
Elements in grown defect list: 0
Vendor (Seagate) cache information
Blocks sent to initiator = 3400674574336
`
htSASInfoData = `smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.15.18-12-pve] (local build)