feat: Add SMART plugin concurrency configuration option, nvme-cli v1.14+ support and lint fixes. (#10150)

This commit is contained in:
Kuba Trojan 2021-12-07 08:10:36 -08:00 committed by GitHub
parent 0e237774f1
commit d4475b7d08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 479 additions and 96 deletions

View File

@ -112,6 +112,14 @@ smartctl --scan -d nvme
## Timeout for the cli command to complete.
# timeout = "30s"
## Optionally call smartctl and nvme-cli with a specific concurrency policy.
## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes.
## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of
## SMART data - one individual array drive at the time. In such case please set this configuration option
## to "sequential" to get readings for all drives.
## valid options: concurrent, sequential
# read_method = "concurrent"
```
## Permissions
@ -235,13 +243,27 @@ the DEVICE (name of the device could be taken from the previous command):
smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE
```
If you try to gather vendor-specific metrics, please provide this command
and replace VENDOR and DEVICE to match your case:
```sh
nvme VENDOR smart-log-add DEVICE
```
If you have specified a devices array in the configuration file and Telegraf only shows data from one device, you should
change the plugin configuration to gather disk attributes sequentially instead of collecting them in separate threads
(goroutines). To do this, find `read_method` in the plugin configuration and change it to `sequential`:
```toml
## Optionally call smartctl and nvme-cli with a specific concurrency policy.
## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes.
## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of
## SMART data - one individual array drive at the time. In such case please set this configuration option
## to "sequential" to get readings for all drives.
## valid options: concurrent, sequential
read_method = "sequential"
```
## Example SMART Plugin Outputs
```shell

View File

@ -43,8 +43,8 @@ var (
// PASSED, FAILED, UNKNOWN
smartOverallHealth = regexp.MustCompile(`^(SMART overall-health self-assessment test result|SMART Health Status):\s+(\w+).*$`)
// sasNvmeAttr is a SAS or NVME SMART attribute
sasNvmeAttr = regexp.MustCompile(`^([^:]+):\s+(.+)$`)
// sasNVMeAttr is a SAS or NVMe SMART attribute
sasNVMeAttr = regexp.MustCompile(`^([^:]+):\s+(.+)$`)
// ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
// 1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0
@ -53,14 +53,26 @@ var (
attribute = regexp.MustCompile(`^\s*([0-9]+)\s(\S+)\s+([-P][-O][-S][-R][-C][-K])\s+([0-9]+)\s+([0-9]+)\s+([0-9-]+)\s+([-\w]+)\s+([\w\+\.]+).*$`)
// Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff
// nvme version 1.14+ metrics:
// ID KEY Normalized Raw
// 0xab program_fail_count 100 0
// nvme deprecated metric format:
// key normalized raw
// program_fail_count : 100% 0
intelExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\w\s]+)%(.+)`)
// This regex pattern supports both the deprecated metrics format (nvme-cli versions below 1.14) and the format used by nvme-cli 1.14 and above.
intelExpressionPattern = regexp.MustCompile(`^([A-Za-z0-9_\s]+)[:|\s]+(\d+)[%|\s]+(.+)`)
// vid : 0x8086
// sn : CFGT53260XSP8011P
nvmeIDCtrlExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\s\w]+)`)
// Format from nvme-cli 1.14 (and above) gives ID and KEY, this regex is for separating id from key.
// ID KEY
// 0xab program_fail_count
nvmeIDSeparatePattern = regexp.MustCompile(`^([A-Za-z0-9_]+)(.+)`)
deviceFieldIds = map[string]string{
"1": "read_error_rate",
"7": "seek_error_rate",
@ -70,7 +82,7 @@ var (
}
// to obtain metrics from smartctl
sasNvmeAttributes = map[string]struct {
sasNVMeAttributes = map[string]struct {
ID string
Name string
Parse func(fields, deviceFields map[string]interface{}, str string) error
@ -213,12 +225,51 @@ var (
Parse: parseTemperatureSensor,
},
}
// to obtain Intel specific metrics from nvme-cli
// To obtain Intel specific metrics from nvme-cli version 1.14 and above.
intelAttributes = map[string]struct {
ID string
Name string
Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error
}{
"program_fail_count": {
Name: "Program_Fail_Count",
},
"erase_fail_count": {
Name: "Erase_Fail_Count",
},
"wear_leveling_count": { // previously: "wear_leveling"
Name: "Wear_Leveling_Count",
},
"e2e_error_detect_count": { // previously: "end_to_end_error_detection_count"
Name: "End_To_End_Error_Detection_Count",
},
"crc_error_count": {
Name: "Crc_Error_Count",
},
"media_wear_percentage": { // previously: "timed_workload_media_wear"
Name: "Media_Wear_Percentage",
},
"host_reads": {
Name: "Host_Reads",
},
"timed_work_load": { // previously: "timed_workload_timer"
Name: "Timed_Workload_Timer",
},
"thermal_throttle_status": {
Name: "Thermal_Throttle_Status",
},
"retry_buff_overflow_count": { // previously: "retry_buffer_overflow_count"
Name: "Retry_Buffer_Overflow_Count",
},
"pll_lock_loss_counter": { // previously: "pll_lock_loss_count"
Name: "Pll_Lock_Loss_Count",
},
}
// To obtain Intel specific metrics from nvme-cli versions below 1.14 (deprecated output format).
intelAttributesDeprecatedFormat = map[string]struct {
ID string
Name string
Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error
}{
"program_fail_count": {
Name: "Program_Fail_Count",
@ -269,6 +320,8 @@ var (
Parse: parseBytesWritten,
},
}
knownReadMethods = []string{"concurrent", "sequential"}
)
// Smart plugin reads metrics from storage devices supporting S.M.A.R.T.
@ -283,6 +336,7 @@ type Smart struct {
Devices []string `toml:"devices"`
UseSudo bool `toml:"use_sudo"`
Timeout config.Duration `toml:"timeout"`
ReadMethod string `toml:"read_method"`
Log telegraf.Logger `toml:"-"`
}
@ -333,11 +387,20 @@ var sampleConfig = `
## Timeout for the cli command to complete.
# timeout = "30s"
## Optionally call smartctl and nvme-cli with a specific concurrency policy.
## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes.
## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of
## SMART data - one individual array drive at the time. In such case please set this configuration option
## to "sequential" to get readings for all drives.
## valid options: concurrent, sequential
# read_method = "concurrent"
`
// newSmart returns a Smart plugin instance populated with the default
// settings: a 30 second timeout for the CLI commands and the "concurrent"
// read method.
func newSmart() *Smart {
	return &Smart{
		Timeout:    config.Duration(time.Second * 30),
		ReadMethod: "concurrent",
	}
}
@ -368,6 +431,10 @@ func (m *Smart) Init() error {
m.PathNVMe, _ = exec.LookPath("nvme")
}
if !contains(knownReadMethods, m.ReadMethod) {
return fmt.Errorf("provided read method `%s` is not valid", m.ReadMethod)
}
err := validatePath(m.PathSmartctl)
if err != nil {
m.PathSmartctl = ""
@ -404,9 +471,9 @@ func (m *Smart) Gather(acc telegraf.Accumulator) error {
if err != nil {
return err
}
NVMeDevices := distinguishNVMeDevices(devicesFromConfig, scannedNVMeDevices)
nvmeDevices := distinguishNVMeDevices(devicesFromConfig, scannedNVMeDevices)
m.getVendorNVMeAttributes(acc, NVMeDevices)
m.getVendorNVMeAttributes(acc, nvmeDevices)
}
return nil
}
@ -434,28 +501,28 @@ func (m *Smart) scanAllDevices(ignoreExcludes bool) ([]string, []string, error)
}
// this will return only NVMe devices
NVMeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme")
nvmeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme")
if err != nil {
return nil, nil, err
}
// to handle all versions of smartctl this will return only non NVMe devices
nonNVMeDevices := difference(devices, NVMeDevices)
return NVMeDevices, nonNVMeDevices, nil
nonNVMeDevices := difference(devices, nvmeDevices)
return nvmeDevices, nonNVMeDevices, nil
}
// distinguishNVMeDevices returns the entries of userDevices that correspond to
// one of the devices listed in availableNVMeDevices.
//
// Matching is a substring check in both directions, so a user supplied "nvme0"
// is treated as the same device as an available "nvme0n1" and vice versa.
// Every user device is reported at most once, in the order it was given, even
// when it matches several available devices. The result is nil when nothing
// matches.
func distinguishNVMeDevices(userDevices []string, availableNVMeDevices []string) []string {
	var nvmeDevices []string

	for _, userDevice := range userDevices {
		for _, availableNVMeDevice := range availableNVMeDevices {
			// Double check. E.g. when nvme0 should be considered equal to nvme0n1,
			// verify that the "nvme0" part is present on either side.
			if strings.Contains(availableNVMeDevice, userDevice) || strings.Contains(userDevice, availableNVMeDevice) {
				nvmeDevices = append(nvmeDevices, userDevice)
				// One match is enough: appending the same user device again
				// would make it be gathered (and reported) more than once.
				break
			}
		}
	}
	return nvmeDevices
}
// Scan for S.M.A.R.T. devices from smartctl
@ -506,69 +573,86 @@ func excludedDev(excludes []string, deviceLine string) bool {
// getAttributes runs gatherDisk for every entry in devices and blocks until
// all of them have finished.
//
// With read method "concurrent" each device is handled in its own goroutine;
// with "sequential" the devices are read one after another on the calling
// goroutine.
func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) {
	var wg sync.WaitGroup
	wg.Add(len(devices))

	for _, device := range devices {
		switch m.ReadMethod {
		case "concurrent":
			go m.gatherDisk(acc, device, &wg)
		case "sequential":
			m.gatherDisk(acc, device, &wg)
		default:
			// Unknown read method: gatherDisk is not called for this device, so
			// release the slot reserved above to keep wg.Wait from blocking.
			wg.Done()
		}
	}

	wg.Wait()
}
func (m *Smart) getVendorNVMeAttributes(acc telegraf.Accumulator, devices []string) {
NVMeDevices := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo)
nvmeDevices := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo)
var wg sync.WaitGroup
for _, device := range NVMeDevices {
for _, device := range nvmeDevices {
if contains(m.EnableExtensions, "auto-on") {
// nolint:revive // one case switch on purpose to demonstrate potential extensions
switch device.vendorID {
case intelVID:
wg.Add(1)
go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
switch m.ReadMethod {
case "concurrent":
go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
case "sequential":
gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
default:
wg.Done()
}
}
} else if contains(m.EnableExtensions, "Intel") && device.vendorID == intelVID {
wg.Add(1)
go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
switch m.ReadMethod {
case "concurrent":
go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
case "sequential":
gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
default:
wg.Done()
}
}
}
wg.Wait()
}
// getDeviceInfoForNVMeDisks resolves device information for each entry in
// devices by calling gatherNVMeDeviceInfo with the given nvme-cli path,
// timeout and sudo setting.
//
// Devices whose information cannot be obtained are reported to acc as an error
// and left out of the result; the remaining devices are returned in input
// order. The result is nil when no device could be resolved.
func getDeviceInfoForNVMeDisks(acc telegraf.Accumulator, devices []string, nvme string, timeout config.Duration, useSudo bool) []nvmeDevice {
	var nvmeDevices []nvmeDevice

	for _, device := range devices {
		newDevice, err := gatherNVMeDeviceInfo(nvme, device, timeout, useSudo)
		if err != nil {
			// Keep going: one unreadable disk must not hide the others.
			acc.AddError(fmt.Errorf("cannot find device info for %s device", device))
			continue
		}
		nvmeDevices = append(nvmeDevices, newDevice)
	}

	return nvmeDevices
}
func gatherNVMeDeviceInfo(nvme, device string, timeout config.Duration, useSudo bool) (string, string, string, error) {
func gatherNVMeDeviceInfo(nvme, deviceName string, timeout config.Duration, useSudo bool) (device nvmeDevice, err error) {
args := []string{"id-ctrl"}
args = append(args, strings.Split(device, " ")...)
args = append(args, strings.Split(deviceName, " ")...)
out, err := runCmd(timeout, useSudo, nvme, args...)
if err != nil {
return "", "", "", err
return device, err
}
outStr := string(out)
vid, sn, mn, err := findNVMeDeviceInfo(outStr)
return vid, sn, mn, err
device, err = findNVMeDeviceInfo(outStr)
if err != nil {
return device, err
}
device.name = deviceName
return device, nil
}
func findNVMeDeviceInfo(output string) (string, string, string, error) {
func findNVMeDeviceInfo(output string) (nvmeDevice, error) {
scanner := bufio.NewScanner(strings.NewReader(output))
var vid, sn, mn string
@ -580,7 +664,7 @@ func findNVMeDeviceInfo(output string) (string, string, string, error) {
matches[2] = strings.TrimSpace(matches[2])
if matches[1] == "vid" {
if _, err := fmt.Sscanf(matches[2], "%s", &vid); err != nil {
return "", "", "", err
return nvmeDevice{}, err
}
}
if matches[1] == "sn" {
@ -591,7 +675,13 @@ func findNVMeDeviceInfo(output string) (string, string, string, error) {
}
}
}
return vid, sn, mn, nil
newDevice := nvmeDevice{
vendorID: vid,
model: mn,
serialNumber: sn,
}
return newDevice, nil
}
func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo bool, nvme string, device nvmeDevice, wg *sync.WaitGroup) {
@ -619,10 +709,31 @@ func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, uses
tags["model"] = device.model
tags["serial_no"] = device.serialNumber
if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 {
matches[1] = strings.TrimSpace(matches[1])
// Create struct to initialize later with intel attributes.
var (
attr = struct {
ID string
Name string
Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error
}{}
attrExists bool
)
if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 && len(matches[1]) > 1 {
// Check if nvme shows metrics in deprecated format or in format with ID.
// Based on that, an attribute map with metrics is chosen.
// If string has more than one character it means it has KEY there, otherwise it's empty string ("").
if separatedIDAndKey := nvmeIDSeparatePattern.FindStringSubmatch(matches[1]); len(strings.TrimSpace(separatedIDAndKey[2])) > 1 {
matches[1] = strings.TrimSpace(separatedIDAndKey[2])
attr, attrExists = intelAttributes[matches[1]]
} else {
matches[1] = strings.TrimSpace(matches[1])
attr, attrExists = intelAttributesDeprecatedFormat[matches[1]]
}
matches[3] = strings.TrimSpace(matches[3])
if attr, ok := intelAttributes[matches[1]]; ok {
if attrExists {
tags["name"] = attr.Name
if attr.ID != "" {
tags["id"] = attr.ID
@ -641,18 +752,18 @@ func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, uses
}
}
func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) {
func (m *Smart) gatherDisk(acc telegraf.Accumulator, device string, wg *sync.WaitGroup) {
defer wg.Done()
// smartctl 5.41 & 5.42 are broken regarding handling of --nocheck/-n
args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", nocheck, "--format=brief"}
args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", m.Nocheck, "--format=brief"}
args = append(args, strings.Split(device, " ")...)
out, e := runCmd(timeout, usesudo, smartctl, args...)
out, e := runCmd(m.Timeout, m.UseSudo, m.PathSmartctl, args...)
outStr := string(out)
// Ignore all exit statuses except if it is a command line parse error
exitStatus, er := exitStatus(e)
if er != nil {
acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", smartctl, strings.Join(args, " "), e, outStr))
acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", m.PathSmartctl, strings.Join(args, " "), e, outStr))
return
}
@ -712,7 +823,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll
tags := map[string]string{}
fields := make(map[string]interface{})
if collectAttributes {
if m.Attributes {
//add power mode
keys := [...]string{"device", "model", "serial_no", "wwn", "capacity", "enabled", "power"}
for _, key := range keys {
@ -724,8 +835,8 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll
attr := attribute.FindStringSubmatch(line)
if len(attr) > 1 {
// attribute has been found, add it only if collectAttributes is true
if collectAttributes {
// attribute has been found, add it only if m.Attributes is true
if m.Attributes {
tags["id"] = attr[1]
tags["name"] = attr[2]
tags["flags"] = attr[3]
@ -758,8 +869,8 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll
}
} else {
// what was found is not a vendor attribute
if matches := sasNvmeAttr.FindStringSubmatch(line); len(matches) > 2 {
if attr, ok := sasNvmeAttributes[matches[1]]; ok {
if matches := sasNVMeAttr.FindStringSubmatch(line); len(matches) > 2 {
if attr, ok := sasNVMeAttributes[matches[1]]; ok {
tags["name"] = attr.Name
if attr.ID != "" {
tags["id"] = attr.ID
@ -774,8 +885,8 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll
continue
}
// if the field is classified as an attribute, only add it
// if collectAttributes is true
if collectAttributes {
// if m.Attributes is true
if m.Attributes {
acc.AddFields("smart_attribute", fields, tags)
}
}
@ -972,13 +1083,13 @@ func parseTemperatureSensor(fields, _ map[string]interface{}, str string) error
return nil
}
// validatePath checks that filePath points at an existing regular file.
//
// It returns an error when the path does not exist, when it cannot be
// inspected at all (for example permission denied or a malformed path), or
// when it refers to something other than a regular file such as a directory.
// A nil result means the path exists and is a regular file.
func validatePath(filePath string) error {
	pathInfo, err := os.Stat(filePath)
	if os.IsNotExist(err) {
		return fmt.Errorf("provided path does not exist: [%s]", filePath)
	}
	if err != nil {
		// Any other Stat failure leaves pathInfo nil; report it here instead of
		// dereferencing a nil FileInfo below.
		return fmt.Errorf("provided path cannot be accessed: [%s]: %w", filePath, err)
	}
	if !pathInfo.Mode().IsRegular() {
		return fmt.Errorf("provided path does not point to a regular file: [%s]", filePath)
	}
	return nil
}

View File

@ -24,11 +24,11 @@ func TestGatherAttributes(t *testing.T) {
if args[0] == "--info" && args[7] == "/dev/ada0" {
return []byte(mockInfoAttributeData), nil
} else if args[0] == "--info" && args[7] == "/dev/nvme0" {
return []byte(smartctlNvmeInfoData), nil
return []byte(smartctlNVMeInfoData), nil
} else if args[0] == "--scan" && len(args) == 1 {
return []byte(mockScanData), nil
} else if args[0] == "--scan" && len(args) >= 2 && args[1] == "--device=nvme" {
return []byte(mockScanNvmeData), nil
return []byte(mockScanNVMeData), nil
}
}
return nil, errors.New("command not found")
@ -45,7 +45,7 @@ func TestGatherAttributes(t *testing.T) {
s.PathSmartctl = "smartctl"
s.PathNVMe = ""
t.Run("Only non nvme device", func(t *testing.T) {
t.Run("Only non NVMe device", func(t *testing.T) {
s.Devices = []string{"/dev/ada0"}
var acc testutil.Accumulator
@ -62,7 +62,7 @@ func TestGatherAttributes(t *testing.T) {
acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags)
}
})
t.Run("Only nvme device", func(t *testing.T) {
t.Run("Only NVMe device", func(t *testing.T) {
s.Devices = []string{"/dev/nvme0"}
var acc testutil.Accumulator
@ -71,12 +71,78 @@ func TestGatherAttributes(t *testing.T) {
require.NoError(t, err)
assert.Equal(t, 32, acc.NFields(), "Wrong number of fields gathered")
testutil.RequireMetricsEqual(t, testSmartctlNvmeAttributes, acc.GetTelegrafMetrics(),
testutil.RequireMetricsEqual(t, testSmartctlNVMeAttributes, acc.GetTelegrafMetrics(),
testutil.SortMetrics(), testutil.IgnoreTime())
})
})
}
// TestGatherInParallelMode runs Gather against a mocked runCmd with each read
// method. Both valid methods must produce the same Intel NVMe metrics, while
// an unknown method is rejected by Init and makes Gather produce no metrics.
func TestGatherInParallelMode(t *testing.T) {
	plugin := newSmart()
	plugin.Attributes = true
	plugin.PathSmartctl = "smartctl"
	plugin.PathNVMe = "nvmeIdentifyController"
	plugin.EnableExtensions = append(plugin.EnableExtensions, "auto-on")
	plugin.Devices = []string{"/dev/nvme0"}

	// Replace the command runner with canned outputs keyed on the arguments.
	runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
		if len(args) == 0 {
			return nil, errors.New("command not found")
		}
		switch {
		case args[0] == "--info" && args[7] == "/dev/ada0":
			return []byte(mockInfoAttributeData), nil
		case args[0] == "--info" && args[7] == "/dev/nvmeIdentifyController":
			return []byte(smartctlNVMeInfoData), nil
		case args[0] == "--scan" && len(args) == 1:
			return []byte(mockScanData), nil
		case args[0] == "--scan" && len(args) >= 2 && args[1] == "--device=nvme":
			return []byte(mockScanNVMeData), nil
		case args[0] == "intel" && args[1] == "smart-log-add":
			return []byte(nvmeIntelInfoDataMetricsFormat), nil
		case args[0] == "id-ctrl":
			return []byte(nvmeIdentifyController), nil
		}
		return nil, errors.New("command not found")
	}

	validMethods := []struct {
		name       string
		readMethod string
	}{
		{name: "Gather NVMe device info in goroutine", readMethod: "concurrent"},
		{name: "Gather NVMe device info sequentially", readMethod: "sequential"},
	}
	for _, tc := range validMethods {
		readMethod := tc.readMethod
		t.Run(tc.name, func(t *testing.T) {
			acc := &testutil.Accumulator{}
			plugin.ReadMethod = readMethod

			require.NoError(t, plugin.Gather(acc))

			testutil.RequireMetricsEqual(t, testIntelNVMeNewFormatAttributes, acc.GetTelegrafMetrics(),
				testutil.SortMetrics(), testutil.IgnoreTime())
		})
	}

	t.Run("Gather NVMe device info - not known read method", func(t *testing.T) {
		acc := &testutil.Accumulator{}
		plugin.ReadMethod = "horizontally"

		// Init must reject the value ...
		require.Error(t, plugin.Init())

		// ... and Gather, if called anyway, must neither fail nor emit metrics.
		require.NoError(t, plugin.Gather(acc))
		testutil.RequireMetricsEqual(t, []telegraf.Metric{}, acc.GetTelegrafMetrics())
	})
}
func TestGatherNoAttributes(t *testing.T) {
s := newSmart()
s.Attributes = false
@ -90,9 +156,9 @@ func TestGatherNoAttributes(t *testing.T) {
} else if args[0] == "--info" && args[7] == "/dev/ada0" {
return []byte(mockInfoAttributeData), nil
} else if args[0] == "--info" && args[7] == "/dev/nvme0" {
return []byte(smartctlNvmeInfoData), nil
return []byte(smartctlNVMeInfoData), nil
} else if args[0] == "--scan" && args[1] == "--device=nvme" {
return []byte(mockScanNvmeData), nil
return []byte(mockScanNVMeData), nil
}
}
return nil, errors.New("command not found")
@ -111,7 +177,7 @@ func TestGatherNoAttributes(t *testing.T) {
for _, test := range testsAda0Device {
acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags)
}
for _, test := range testNvmeDevice {
for _, test := range testNVMeDevice {
acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags)
}
})
@ -123,6 +189,16 @@ func TestExcludedDev(t *testing.T) {
assert.Equal(t, false, excludedDev([]string{"/dev/pass6"}, "/dev/pass1 -d atacam"), "Shouldn't be excluded.")
}
// sampleSmart is the plugin configuration the gatherDisk tests call into:
// attribute collection and sudo enabled, a 30 second command timeout, and the
// smartctl path and nocheck value left empty.
var sampleSmart = Smart{
	Timeout:      config.Duration(time.Second * 30),
	UseSudo:      true,
	Attributes:   true,
	Nocheck:      "",
	PathSmartctl: "",
}
func TestGatherSATAInfo(t *testing.T) {
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
return []byte(hgstSATAInfoData), nil
@ -134,7 +210,8 @@ func TestGatherSATAInfo(t *testing.T) {
)
wg.Add(1)
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg)
sampleSmart.gatherDisk(acc, "", wg)
assert.Equal(t, 101, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, uint64(20), acc.NMetrics(), "Wrong number of metrics gathered")
}
@ -150,7 +227,7 @@ func TestGatherSATAInfo65(t *testing.T) {
)
wg.Add(1)
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg)
sampleSmart.gatherDisk(acc, "", wg)
assert.Equal(t, 91, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, uint64(18), acc.NMetrics(), "Wrong number of metrics gathered")
}
@ -166,7 +243,7 @@ func TestGatherHgstSAS(t *testing.T) {
)
wg.Add(1)
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg)
sampleSmart.gatherDisk(acc, "", wg)
assert.Equal(t, 6, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, uint64(4), acc.NMetrics(), "Wrong number of metrics gathered")
}
@ -182,7 +259,7 @@ func TestGatherHtSAS(t *testing.T) {
)
wg.Add(1)
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg)
sampleSmart.gatherDisk(acc, "", wg)
testutil.RequireMetricsEqual(t, testHtsasAtributtes, acc.GetTelegrafMetrics(), testutil.SortMetrics(), testutil.IgnoreTime())
}
@ -198,7 +275,7 @@ func TestGatherSSD(t *testing.T) {
)
wg.Add(1)
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg)
sampleSmart.gatherDisk(acc, "", wg)
assert.Equal(t, 105, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, uint64(26), acc.NMetrics(), "Wrong number of metrics gathered")
}
@ -214,14 +291,14 @@ func TestGatherSSDRaid(t *testing.T) {
)
wg.Add(1)
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg)
sampleSmart.gatherDisk(acc, "", wg)
assert.Equal(t, 74, acc.NFields(), "Wrong number of fields gathered")
assert.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered")
}
func TestGatherNvme(t *testing.T) {
func TestGatherNVMe(t *testing.T) {
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
return []byte(smartctlNvmeInfoData), nil
return []byte(smartctlNVMeInfoData), nil
}
var (
@ -230,15 +307,15 @@ func TestGatherNvme(t *testing.T) {
)
wg.Add(1)
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "nvme0", wg)
sampleSmart.gatherDisk(acc, "nvme0", wg)
testutil.RequireMetricsEqual(t, testSmartctlNvmeAttributes, acc.GetTelegrafMetrics(),
testutil.RequireMetricsEqual(t, testSmartctlNVMeAttributes, acc.GetTelegrafMetrics(),
testutil.SortMetrics(), testutil.IgnoreTime())
}
func TestGatherIntelNvme(t *testing.T) {
func TestGatherIntelNVMeMetrics(t *testing.T) {
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
return []byte(nvmeIntelInfoData), nil
return []byte(nvmeIntelInfoDataMetricsFormat), nil
}
var (
@ -255,17 +332,40 @@ func TestGatherIntelNvme(t *testing.T) {
gatherIntelNVMeDisk(acc, config.Duration(time.Second*30), true, "", device, wg)
result := acc.GetTelegrafMetrics()
testutil.RequireMetricsEqual(t, testIntelInvmeAttributes, result,
testutil.RequireMetricsEqual(t, testIntelNVMeNewFormatAttributes, result,
testutil.SortMetrics(), testutil.IgnoreTime())
}
// TestGatherIntelNVMeDeprecatedFormatMetrics feeds gatherIntelNVMeDisk the
// deprecated nvme-cli output format and expects the metrics listed in
// testIntelNVMeAttributes.
func TestGatherIntelNVMeDeprecatedFormatMetrics(t *testing.T) {
	// Every command invocation returns the deprecated-format fixture.
	runCmd = func(_ config.Duration, _ bool, _ string, _ ...string) ([]byte, error) {
		return []byte(nvmeIntelInfoDataDeprecatedMetricsFormat), nil
	}

	disk := nvmeDevice{
		name:         "nvme0",
		model:        mockModel,
		serialNumber: mockSerial,
	}
	acc := &testutil.Accumulator{}
	wg := &sync.WaitGroup{}

	wg.Add(1)
	gatherIntelNVMeDisk(acc, config.Duration(time.Second*30), true, "", disk, wg)

	testutil.RequireMetricsEqual(t, testIntelNVMeAttributes, acc.GetTelegrafMetrics(),
		testutil.SortMetrics(), testutil.IgnoreTime())
}
// Test_findVIDFromNVMeOutput checks that findNVMeDeviceInfo extracts the
// vendor ID, serial number and model from the nvmeIdentifyController fixture.
func Test_findVIDFromNVMeOutput(t *testing.T) {
	device, err := findNVMeDeviceInfo(nvmeIdentifyController)

	assert.NoError(t, err)
	assert.Equal(t, "0x8086", device.vendorID)
	assert.Equal(t, "CVFT5123456789ABCD", device.serialNumber)
	assert.Equal(t, "INTEL SSDPEDABCDEFG", device.model)
}
func Test_checkForNVMeDevices(t *testing.T) {
@ -293,7 +393,7 @@ func Test_difference(t *testing.T) {
func Test_integerOverflow(t *testing.T) {
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
return []byte(smartctlNvmeInfoDataWithOverflow), nil
return []byte(smartctlNVMeInfoDataWithOverflow), nil
}
var (
@ -303,7 +403,8 @@ func Test_integerOverflow(t *testing.T) {
t.Run("If data raw_value is out of int64 range, there should be no metrics for that attribute", func(t *testing.T) {
wg.Add(1)
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "nvme0", wg)
sampleSmart.gatherDisk(acc, "nvme0", wg)
result := acc.GetTelegrafMetrics()
testutil.RequireMetricsEqual(t, testOverflowAttributes, result,
@ -656,7 +757,7 @@ var (
mockModel = "INTEL SSDPEDABCDEFG"
mockSerial = "CVFT5123456789ABCD"
testSmartctlNvmeAttributes = []telegraf.Metric{
testSmartctlNVMeAttributes = []telegraf.Metric{
testutil.MustMetric("smart_device",
map[string]string{
"device": "nvme0",
@ -1045,7 +1146,7 @@ var (
},
}
testNvmeDevice = []struct {
testNVMeDevice = []struct {
fields map[string]interface{}
tags map[string]string
}{
@ -1063,7 +1164,7 @@ var (
},
}
testIntelInvmeAttributes = []telegraf.Metric{
testIntelNVMeAttributes = []telegraf.Metric{
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
@ -1257,11 +1358,146 @@ var (
time.Now(),
),
}
testIntelNVMeNewFormatAttributes = []telegraf.Metric{
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Program_Fail_Count",
},
map[string]interface{}{
"raw_value": 0,
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Erase_Fail_Count",
},
map[string]interface{}{
"raw_value": 0,
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Wear_Leveling_Count",
},
map[string]interface{}{
"raw_value": int64(700090417315),
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "End_To_End_Error_Detection_Count",
},
map[string]interface{}{
"raw_value": 0,
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Crc_Error_Count",
},
map[string]interface{}{
"raw_value": 13,
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Media_Wear_Percentage",
},
map[string]interface{}{
"raw_value": 552,
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Host_Reads",
},
map[string]interface{}{
"raw_value": 73,
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Timed_Workload_Timer",
},
map[string]interface{}{
"raw_value": int64(2343038),
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Thermal_Throttle_Status",
},
map[string]interface{}{
"raw_value": 0,
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Retry_Buffer_Overflow_Count",
},
map[string]interface{}{
"raw_value": 0,
},
time.Now(),
),
testutil.MustMetric("smart_attribute",
map[string]string{
"device": "nvme0",
"serial_no": mockSerial,
"model": mockModel,
"name": "Pll_Lock_Loss_Count",
},
map[string]interface{}{
"raw_value": 0,
},
time.Now(),
),
}
// smartctl --scan
mockScanData = `/dev/ada0 -d atacam # /dev/ada0, ATA device`
// smartctl --scan -d nvme
mockScanNvmeData = `/dev/nvme0 -d nvme # /dev/nvme0, NVMe device`
mockScanNVMeData = `/dev/nvme0 -d nvme # /dev/nvme0, NVMe device`
// smartctl --info --health --attributes --tolerance=verypermissive -n standby --format=brief [DEVICE]
mockInfoAttributeData = `smartctl 6.5 2016-05-07 r4318 [Darwin 16.4.0 x86_64] (local build)
@ -1670,7 +1906,7 @@ Selective self-test flags (0x0):
After scanning selected spans, do NOT read-scan remainder of disk.
If Selective self-test is pending on power-up, resume after 0 minute delay.
`
smartctlNvmeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build)
smartctlNVMeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
=== START OF INFORMATION SECTION ===
@ -1720,14 +1956,14 @@ Temperature Sensor 7: 44 C
Temperature Sensor 8: 43 C
`
smartctlNvmeInfoDataWithOverflow = `
smartctlNVMeInfoDataWithOverflow = `
Temperature Sensor 1: 9223372036854775808 C
Temperature Sensor 2: -9223372036854775809 C
Temperature Sensor 3: 9223372036854775807 C
Temperature Sensor 4: -9223372036854775808 C
`
nvmeIntelInfoData = `Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff
nvmeIntelInfoDataDeprecatedMetricsFormat = `Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff
key normalized raw
program_fail_count : 100% 0
erase_fail_count : 100% 0
@ -1742,6 +1978,20 @@ retry_buffer_overflow_count : 100% 0
pll_lock_loss_count : 100% 0
nand_bytes_written : 0% sectors: 0
host_bytes_written : 0% sectors: 0
`
nvmeIntelInfoDataMetricsFormat = `Additional Smart Log for NVME device:nvme0n1 namespace-id:ffffffff
ID KEY Normalized Raw
0xab program_fail_count 100 0
0xac erase_fail_count 100 0
0xad wear_leveling_count 100 700090417315
0xb8 e2e_error_detect_count 100 0
0xc7 crc_error_count 100 13
0xe2 media_wear_percentage 100 552
0xe3 host_reads 100 73
0xe4 timed_work_load 100 2343038
0xea thermal_throttle_status 100 0
0xf0 retry_buff_overflow_count 100 0
0xf3 pll_lock_loss_counter 100 0
`
nvmeIdentifyController = `NVME Identify Controller: