Add more missing NVMe attributes to smart plugin (#8113)

This commit is contained in:
Paweł Żak 2020-09-28 17:16:49 +02:00 committed by GitHub
parent ca7252c641
commit e1cb269a35
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1550 additions and 394 deletions

View File

@ -1,6 +1,6 @@
# S.M.A.R.T. Input Plugin # S.M.A.R.T. Input Plugin
Get metrics using the command line utility `smartctl` for S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage devices. SMART is a monitoring system included in computer hard disk drives (HDDs) and solid-state drives (SSDs)[1] that detects and reports on various indicators of drive reliability, with the intent of enabling the anticipation of hardware failures. Get metrics using the command line utility `smartctl` for S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage devices. SMART is a monitoring system included in computer hard disk drives (HDDs) and solid-state drives (SSDs) that detects and reports on various indicators of drive reliability, with the intent of enabling the anticipation of hardware failures.
See smartmontools (https://www.smartmontools.org/). See smartmontools (https://www.smartmontools.org/).
SMART information is separated between different measurements: `smart_device` is used for general information, while `smart_attribute` stores the detailed attribute information if `attributes = true` is enabled in the plugin configuration. SMART information is separated between different measurements: `smart_device` is used for general information, while `smart_attribute` stores the detailed attribute information if `attributes = true` is enabled in the plugin configuration.
@ -19,57 +19,98 @@ smartctl --info --attributes --health -n <nocheck> --format=brief <device>
This plugin supports _smartmontools_ version 5.41 and above, but v. 5.41 and v. 5.42 This plugin supports _smartmontools_ version 5.41 and above, but v. 5.41 and v. 5.42
might require setting `nocheck`, see the comment in the sample configuration. might require setting `nocheck`, see the comment in the sample configuration.
Also, NVMe capabilities were introduced in version 6.5.
To enable SMART on a storage device run: To enable SMART on a storage device run:
``` ```
smartctl -s on <device> smartctl -s on <device>
``` ```
## NVMe vendor specific attributes
### Configuration For NVMe disk type, plugin can use command line utility `nvme-cli`. It has a feature
to easily access vendor specific attributes.
This plugin supports nvme-cli version 1.5 and above (https://github.com/linux-nvme/nvme-cli).
If `nvme-cli` is absent, NVMe vendor specific metrics will not be obtained.
Vendor specific SMART metrics for NVMe disks may be reported from the following `nvme` command:
```
nvme <vendor> smart-log-add <device>
```
Note that vendor plugins for `nvme-cli` could require a different naming convention and report format.
To see the installed plugin extensions, which depend on the nvme-cli version, look at the bottom of:
```
nvme help
```
To gather disk vendor id (vid) `id-ctrl` could be used:
```
nvme id-ctrl <device>
```
The association between a vid and a company can be found here: https://pcisig.com/membership/member-companies.
Whether a device is NVMe or non NVMe is determined using:
```
smartctl --scan
```
and:
```
smartctl --scan -d nvme
```
## Configuration
```toml ```toml
# Read metrics from storage devices supporting S.M.A.R.T. # Read metrics from storage devices supporting S.M.A.R.T.
[[inputs.smart]] [[inputs.smart]]
## Optionally specify the path to the smartctl executable ## Optionally specify the path to the smartctl executable
# path = "/usr/bin/smartctl" # path_smartctl = "/usr/bin/smartctl"
## On most platforms smartctl requires root access. ## Optionally specify the path to the nvme-cli executable
## Setting 'use_sudo' to true will make use of sudo to run smartctl. # path_nvme = "/usr/bin/nvme"
## Sudo must be configured to to allow the telegraf user to run smartctl
## without a password. ## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
# use_sudo = false ## ["auto-on"] - automatically find and enable additional vendor specific disk info
## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
## Skip checking disks in this power mode. Defaults to # enable_extensions = ["auto-on"]
## "standby" to not wake up disks that have stoped rotating.
## See --nocheck in the man pages for smartctl. ## On most platforms used cli utilities requires root access.
## smartctl version 5.41 and 5.42 have faulty detection of ## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
## power mode and might require changing this value to ## Sudo must be configured to allow the telegraf user to run smartctl or nvme-cli
## "never" depending on your disks. ## without a password.
# nocheck = "standby" # use_sudo = false
## Gather all returned S.M.A.R.T. attribute metrics and the detailed ## Skip checking disks in this power mode. Defaults to
## information from each drive into the `smart_attribute` measurement. ## "standby" to not wake up disks that have stopped rotating.
# attributes = false ## See --nocheck in the man pages for smartctl.
## smartctl version 5.41 and 5.42 have faulty detection of
## Optionally specify devices to exclude from reporting. ## power mode and might require changing this value to
# excludes = [ "/dev/pass6" ] ## "never" depending on your disks.
# nocheck = "standby"
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan) for S.M.A.R.T. devices will ## Gather all returned S.M.A.R.T. attribute metrics and the detailed
## done and all found will be included except for the ## information from each drive into the 'smart_attribute' measurement.
## excluded in excludes. # attributes = false
# devices = [ "/dev/ada0 -d atacam" ]
## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
## Timeout for the smartctl command to complete. # excludes = [ "/dev/pass6" ]
# timeout = "30s"
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
## and all found will be included except for the excluded in excludes.
# devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]
## Timeout for the cli command to complete.
# timeout = "30s"
``` ```
### Permissions ## Permissions
It's important to note that this plugin references smartctl, which may require additional permissions to execute successfully.
Depending on the user/group permissions of the telegraf user executing this plugin, you may need to use sudo.
It's important to note that this plugin references smartctl and nvme-cli, which may require additional permissions to execute successfully.
Depending on the user/group permissions of the telegraf user executing this plugin, you may need to use sudo.
You will need the following in your telegraf config: You will need the following in your telegraf config:
```toml ```toml
@ -80,13 +121,20 @@ You will need the following in your telegraf config:
You will also need to update your sudoers file: You will also need to update your sudoers file:
```bash ```bash
$ visudo $ visudo
# Add the following line: # For smartctl add the following lines:
Cmnd_Alias SMARTCTL = /usr/bin/smartctl Cmnd_Alias SMARTCTL = /usr/bin/smartctl
telegraf ALL=(ALL) NOPASSWD: SMARTCTL telegraf ALL=(ALL) NOPASSWD: SMARTCTL
Defaults!SMARTCTL !logfile, !syslog, !pam_session Defaults!SMARTCTL !logfile, !syslog, !pam_session
```
### Metrics # For nvme-cli add the following lines:
Cmnd_Alias NVME = /path/to/nvme
telegraf ALL=(ALL) NOPASSWD: NVME
Defaults!NVME !logfile, !syslog, !pam_session
```
To run smartctl or nvme with `sudo`, a wrapper script can be created. `path_smartctl` or
`path_nvme` in the configuration should be set to execute this script.
## Metrics
- smart_device: - smart_device:
- tags: - tags:
@ -135,37 +183,44 @@ The interpretation of the tag `flags` is:
#### Exit Status #### Exit Status
The `exit_status` field captures the exit status of the smartctl command which The `exit_status` field captures the exit status of the used cli utilities command which
is defined by a bitmask. For the interpretation of the bitmask see the man page for is defined by a bitmask. For the interpretation of the bitmask see the man page for
smartctl. smartctl or nvme-cli.
#### Device Names
## Device Names
Device names, e.g., `/dev/sda`, are *not persistent*, and may be Device names, e.g., `/dev/sda`, are *not persistent*, and may be
subject to change across reboots or system changes. Instead, you can the subject to change across reboots or system changes. Instead, you can use the
*World Wide Name* (WWN) or serial number to identify devices. On Linux block *World Wide Name* (WWN) or serial number to identify devices. On Linux block
devices can be referenced by the WWN in the following location: devices can be referenced by the WWN in the following location:
`/dev/disk/by-id/`. `/dev/disk/by-id/`.
## Troubleshooting
To run `smartctl` with `sudo` create a wrapper script and use `path` in If you expect to see more SMART metrics than this plugin shows, be sure to use a proper version
the configuration to execute that. of smartctl or nvme-cli utility which has the functionality to gather desired data. Also, check
your device capabilities, because not all SMART metrics are mandatory.
### Troubleshooting For example the number of temperature sensors depends on the device specification.
If this plugin is not working as expected for your SMART enabled device, If this plugin is not working as expected for your SMART enabled device,
please run these commands and include the output in a bug report: please run these commands and include the output in a bug report:
For non NVMe devices (from smartctl version >= 7.0 this will also return NVMe devices by default):
``` ```
smartctl --scan smartctl --scan
``` ```
For NVMe devices:
```
smartctl --scan -d nvme
```
Run the following command replacing your configuration setting for NOCHECK and Run the following command replacing your configuration setting for NOCHECK and
the DEVICE from the previous command: the DEVICE (name of the device could be taken from the previous command):
``` ```
smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE
``` ```
If you try to gather vendor specific metrics, please provide this command
### Example Output and replace vendor and device to match your case:
```
nvme VENDOR smart-log-add DEVICE
```
## Example SMART Plugin Outputs
``` ```
smart_device,enabled=Enabled,host=mbpro.local,device=rdisk0,model=APPLE\ SSD\ SM0512F,serial_no=S1K5NYCD964433,wwn=5002538655584d30,capacity=500277790720 udma_crc_errors=0i,exit_status=0i,health_ok=true,read_error_rate=0i,temp_c=40i 1502536854000000000 smart_device,enabled=Enabled,host=mbpro.local,device=rdisk0,model=APPLE\ SSD\ SM0512F,serial_no=S1K5NYCD964433,wwn=5002538655584d30,capacity=500277790720 udma_crc_errors=0i,exit_status=0i,health_ok=true,read_error_rate=0i,temp_c=40i 1502536854000000000
smart_attribute,capacity=500277790720,device=rdisk0,enabled=Enabled,fail=-,flags=-O-RC-,host=mbpro.local,id=199,model=APPLE\ SSD\ SM0512F,name=UDMA_CRC_Error_Count,serial_no=S1K5NYCD964433,wwn=5002538655584d30 exit_status=0i,raw_value=0i,threshold=0i,value=200i,worst=200i 1502536854000000000 smart_attribute,capacity=500277790720,device=rdisk0,enabled=Enabled,fail=-,flags=-O-RC-,host=mbpro.local,id=199,model=APPLE\ SSD\ SM0512F,name=UDMA_CRC_Error_Count,serial_no=S1K5NYCD964433,wwn=5002538655584d30 exit_status=0i,raw_value=0i,threshold=0i,value=200i,worst=200i 1502536854000000000

View File

@ -3,6 +3,7 @@ package smart
import ( import (
"bufio" "bufio"
"fmt" "fmt"
"os"
"os/exec" "os/exec"
"path" "path"
"regexp" "regexp"
@ -11,12 +12,15 @@ import (
"sync" "sync"
"syscall" "syscall"
"time" "time"
"unicode"
"github.com/influxdata/telegraf" "github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs" "github.com/influxdata/telegraf/plugins/inputs"
) )
const IntelVID = "0x8086"
var ( var (
// Device Model: APPLE SSD SM256E // Device Model: APPLE SSD SM256E
// Product: HUH721212AL5204 // Product: HUH721212AL5204
@ -27,7 +31,7 @@ var (
// LU WWN Device Id: 5 002538 655584d30 // LU WWN Device Id: 5 002538 655584d30
wwnInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$") wwnInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$")
// User Capacity: 251,000,193,024 bytes [251 GB] // User Capacity: 251,000,193,024 bytes [251 GB]
usercapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$") userCapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$")
// SMART support is: Enabled // SMART support is: Enabled
smartEnabledInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$") smartEnabledInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$")
// SMART overall-health self-assessment test result: PASSED // SMART overall-health self-assessment test result: PASSED
@ -44,6 +48,15 @@ var (
// 192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716 // 192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716
attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9-]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$") attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9-]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$")
// Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff
// key normalized raw
// program_fail_count : 100% 0
intelExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\w\s]+)%(.+)`)
// vid : 0x8086
// sn : CFGT53260XSP8011P
nvmeIdCtrlExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\s\w]+)`)
deviceFieldIds = map[string]string{ deviceFieldIds = map[string]string{
"1": "read_error_rate", "1": "read_error_rate",
"7": "seek_error_rate", "7": "seek_error_rate",
@ -52,6 +65,7 @@ var (
"199": "udma_crc_errors", "199": "udma_crc_errors",
} }
// to obtain metrics from smartctl
sasNvmeAttributes = map[string]struct { sasNvmeAttributes = map[string]struct {
ID string ID string
Name string Name string
@ -146,31 +160,154 @@ var (
Name: "Critical_Temperature_Time", Name: "Critical_Temperature_Time",
Parse: parseCommaSeparatedInt, Parse: parseCommaSeparatedInt,
}, },
"Thermal Temp. 1 Transition Count": {
Name: "Thermal_Management_T1_Trans_Count",
Parse: parseCommaSeparatedInt,
},
"Thermal Temp. 2 Transition Count": {
Name: "Thermal_Management_T2_Trans_Count",
Parse: parseCommaSeparatedInt,
},
"Thermal Temp. 1 Total Time": {
Name: "Thermal_Management_T1_Total_Time",
Parse: parseCommaSeparatedInt,
},
"Thermal Temp. 2 Total Time": {
Name: "Thermal_Management_T2_Total_Time",
Parse: parseCommaSeparatedInt,
},
"Temperature Sensor 1": {
Name: "Temperature_Sensor_1",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 2": {
Name: "Temperature_Sensor_2",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 3": {
Name: "Temperature_Sensor_3",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 4": {
Name: "Temperature_Sensor_4",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 5": {
Name: "Temperature_Sensor_5",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 6": {
Name: "Temperature_Sensor_6",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 7": {
Name: "Temperature_Sensor_7",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 8": {
Name: "Temperature_Sensor_8",
Parse: parseTemperatureSensor,
},
}
// to obtain Intel specific metrics from nvme-cli
intelAttributes = map[string]struct {
ID string
Name string
Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error
}{
"program_fail_count": {
Name: "Program_Fail_Count",
},
"erase_fail_count": {
Name: "Erase_Fail_Count",
},
"end_to_end_error_detection_count": {
Name: "End_To_End_Error_Detection_Count",
},
"crc_error_count": {
Name: "Crc_Error_Count",
},
"retry_buffer_overflow_count": {
Name: "Retry_Buffer_Overflow_Count",
},
"wear_leveling": {
Name: "Wear_Leveling",
Parse: parseWearLeveling,
},
"timed_workload_media_wear": {
Name: "Timed_Workload_Media_Wear",
Parse: parseTimedWorkload,
},
"timed_workload_host_reads": {
Name: "Timed_Workload_Host_Reads",
Parse: parseTimedWorkload,
},
"timed_workload_timer": {
Name: "Timed_Workload_Timer",
Parse: func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
return parseCommaSeparatedIntWithAccumulator(acc, fields, tags, strings.TrimSuffix(str, " min"))
},
},
"thermal_throttle_status": {
Name: "Thermal_Throttle_Status",
Parse: parseThermalThrottle,
},
"pll_lock_loss_count": {
Name: "Pll_Lock_Loss_Count",
},
"nand_bytes_written": {
Name: "Nand_Bytes_Written",
Parse: parseBytesWritten,
},
"host_bytes_written": {
Name: "Host_Bytes_Written",
Parse: parseBytesWritten,
},
} }
) )
// NVMeDevice describes a single NVMe disk for which vendor specific
// attributes may be gathered through the nvme command line utility.
type NVMeDevice struct {
// name is the device string handed to the nvme command; it is split on
// spaces into arguments and its base name is used as the "device" tag.
name string
// vendorID is the vendor id string that is compared against IntelVID.
vendorID string
// model is reported as the "model" tag.
model string
// serialNumber is reported as the "serial_no" tag.
serialNumber string
}
type Smart struct { type Smart struct {
Path string Path string `toml:"path"` //deprecated - to keep backward compatibility
Nocheck string PathSmartctl string `toml:"path_smartctl"`
Attributes bool PathNVMe string `toml:"path_nvme"`
Excludes []string Nocheck string `toml:"nocheck"`
Devices []string EnableExtensions []string `toml:"enable_extensions"`
UseSudo bool Attributes bool `toml:"attributes"`
Timeout internal.Duration Excludes []string `toml:"excludes"`
Devices []string `toml:"devices"`
UseSudo bool `toml:"use_sudo"`
Timeout internal.Duration `toml:"timeout"`
Log telegraf.Logger `toml:"-"`
} }
var sampleConfig = ` var sampleConfig = `
## Optionally specify the path to the smartctl executable ## Optionally specify the path to the smartctl executable
# path = "/usr/bin/smartctl" # path_smartctl = "/usr/bin/smartctl"
## On most platforms smartctl requires root access. ## Optionally specify the path to the nvme-cli executable
## Setting 'use_sudo' to true will make use of sudo to run smartctl. # path_nvme = "/usr/bin/nvme"
## Sudo must be configured to to allow the telegraf user to run smartctl
## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
## ["auto-on"] - automatically find and enable additional vendor specific disk info
## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
# enable_extensions = ["auto-on"]
## On most platforms used cli utilities requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
## Sudo must be configured to allow the telegraf user to run smartctl or nvme-cli
## without a password. ## without a password.
# use_sudo = false # use_sudo = false
## Skip checking disks in this power mode. Defaults to ## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stoped rotating. ## "standby" to not wake up disks that have stopped rotating.
## See --nocheck in the man pages for smartctl. ## See --nocheck in the man pages for smartctl.
## smartctl version 5.41 and 5.42 have faulty detection of ## smartctl version 5.41 and 5.42 have faulty detection of
## power mode and might require changing this value to ## power mode and might require changing this value to
@ -181,16 +318,15 @@ var sampleConfig = `
## information from each drive into the 'smart_attribute' measurement. ## information from each drive into the 'smart_attribute' measurement.
# attributes = false # attributes = false
## Optionally specify devices to exclude from reporting. ## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
# excludes = [ "/dev/pass6" ] # excludes = [ "/dev/pass6" ]
## Optionally specify devices and device type, if unset ## Optionally specify devices and device type, if unset
## a scan (smartctl --scan) for S.M.A.R.T. devices will ## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
## done and all found will be included except for the ## and all found will be included except for the excluded in excludes.
## excluded in excludes. # devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]
# devices = [ "/dev/ada0 -d atacam" ]
## Timeout for the smartctl command to complete. ## Timeout for the cli command to complete.
# timeout = "30s" # timeout = "30s"
` `
@ -208,22 +344,159 @@ func (m *Smart) Description() string {
return "Read metrics from storage devices supporting S.M.A.R.T." return "Read metrics from storage devices supporting S.M.A.R.T."
} }
func (m *Smart) Gather(acc telegraf.Accumulator) error { func (m *Smart) Init() error {
if len(m.Path) == 0 { //if deprecated `path` (to smartctl binary) is provided in config and `path_smartctl` override does not exist
return fmt.Errorf("smartctl not found: verify that smartctl is installed and that smartctl is in your PATH") if len(m.Path) > 0 && len(m.PathSmartctl) == 0 {
m.PathSmartctl = m.Path
} }
devices := m.Devices //if `path_smartctl` is not provided in config, try to find smartctl binary in PATH
if len(devices) == 0 { if len(m.PathSmartctl) == 0 {
var err error m.PathSmartctl, _ = exec.LookPath("smartctl")
devices, err = m.scan() }
if err != nil {
return err //if `path_nvme` is not provided in config, try to find nvme binary in PATH
if len(m.PathNVMe) == 0 {
m.PathNVMe, _ = exec.LookPath("nvme")
}
err := validatePath(m.PathSmartctl)
if err != nil {
m.PathSmartctl = ""
//without smartctl, plugin will not be able to gather basic metrics
return fmt.Errorf("smartctl not found: verify that smartctl is installed and it is in your PATH (or specified in config): %s", err.Error())
}
err = validatePath(m.PathNVMe)
if err != nil {
m.PathNVMe = ""
//without nvme, plugin will not be able to gather vendor specific attributes (but it can work without it)
m.Log.Warnf("nvme not found: verify that nvme is installed and it is in your PATH (or specified in config) to gather vendor specific attributes: %s", err.Error())
}
return nil
}
// Gather collects S.M.A.R.T. metrics into acc.
//
// When devices are listed in the configuration, only those (after dropping
// invalid names) are queried; otherwise the devices found by scanning are
// used. Vendor specific NVMe attributes are gathered in addition, but only
// when a path to the nvme binary is set and at least one extension is enabled.
// An error is returned only when a device scan fails.
func (m *Smart) Gather(acc telegraf.Accumulator) error {
	// Vendor attributes need both the nvme binary and an enabled extension.
	gatherVendor := len(m.EnableExtensions) != 0 && len(m.PathNVMe) != 0

	if len(m.Devices) == 0 {
		// No explicit device list: discover devices, honouring excludes.
		nvmeFound, otherFound, err := m.scanAllDevices(false)
		if err != nil {
			return err
		}

		var discovered []string
		discovered = append(discovered, nvmeFound...)
		discovered = append(discovered, otherFound...)

		m.getAttributes(acc, discovered)
		if gatherVendor {
			m.getVendorNVMeAttributes(acc, nvmeFound)
		}
		return nil
	}

	configured := excludeWrongDeviceNames(m.Devices)
	m.getAttributes(acc, configured)
	if !gatherVendor {
		return nil
	}

	// A scan (ignoring excludes) is still needed to learn which of the
	// configured devices are NVMe disks.
	nvmeFound, _, err := m.scanAllDevices(true)
	if err != nil {
		return err
	}
	m.getVendorNVMeAttributes(acc, distinguishNVMeDevices(configured, nvmeFound))
	return nil
}
// validate and exclude not correct config device names to avoid unwanted behaviours
func excludeWrongDeviceNames(devices []string) []string {
validSigns := map[string]struct{}{
" ": {},
"/": {},
"\\": {},
"-": {},
",": {},
}
var wrongDevices []string
for _, device := range devices {
for _, char := range device {
if unicode.IsLetter(char) || unicode.IsNumber(char) {
continue
}
if _, exist := validSigns[string(char)]; exist {
continue
}
wrongDevices = append(wrongDevices, device)
} }
} }
return difference(devices, wrongDevices)
}
m.getAttributes(acc, devices) func (m *Smart) scanAllDevices(ignoreExcludes bool) ([]string, []string, error) {
return nil // this will return all devices (including NVMe devices) for smartctl version >= 7.0
// for older versions this will return non NVMe devices
devices, err := m.scanDevices(ignoreExcludes, "--scan")
if err != nil {
return nil, nil, err
}
// this will return only NVMe devices
NVMeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme")
if err != nil {
return nil, nil, err
}
// to handle all versions of smartctl this will return only non NVMe devices
nonNVMeDevices := difference(devices, NVMeDevices)
return NVMeDevices, nonNVMeDevices, nil
}
// distinguishNVMeDevices returns the entries of userDevices that correspond to
// one of availableNVMeDevices, preserving the order of userDevices.
//
// A user device corresponds to a scanned device when either string contains
// the other, so that e.g. "nvme0" and "nvme0n1", or a device followed by extra
// options, are treated as the same disk. Each user device is returned at most
// once even if it matches several scanned devices (e.g. "/dev/nvme1" matching
// both "/dev/nvme1" and "/dev/nvme10"); otherwise its vendor attributes would
// be gathered multiple times.
func distinguishNVMeDevices(userDevices []string, availableNVMeDevices []string) []string {
	var matched []string

	for _, userDevice := range userDevices {
		for _, available := range availableNVMeDevices {
			// double check. E.g. in case when nvme0 is equal nvme0n1, will check if "nvme0" part is present.
			if strings.Contains(available, userDevice) || strings.Contains(userDevice, available) {
				matched = append(matched, userDevice)
				// One match is enough; continuing would add duplicates.
				break
			}
		}
	}
	return matched
}
// Scan for S.M.A.R.T. devices from smartctl
func (m *Smart) scanDevices(ignoreExcludes bool, scanArgs ...string) ([]string, error) {
out, err := runCmd(m.Timeout, m.UseSudo, m.PathSmartctl, scanArgs...)
if err != nil {
return []string{}, fmt.Errorf("failed to run command '%s %s': %s - %s", m.PathSmartctl, scanArgs, err, string(out))
}
var devices []string
for _, line := range strings.Split(string(out), "\n") {
dev := strings.Split(line, " ")
if len(dev) <= 1 {
continue
}
if !ignoreExcludes {
if !excludedDev(m.Excludes, strings.TrimSpace(dev[0])) {
devices = append(devices, strings.TrimSpace(dev[0]))
}
} else {
devices = append(devices, strings.TrimSpace(dev[0]))
}
}
return devices, nil
} }
// Wrap with sudo // Wrap with sudo
@ -235,23 +508,6 @@ var runCmd = func(timeout internal.Duration, sudo bool, command string, args ...
return internal.CombinedOutputTimeout(cmd, timeout.Duration) return internal.CombinedOutputTimeout(cmd, timeout.Duration)
} }
// Scan for S.M.A.R.T. devices
func (m *Smart) scan() ([]string, error) {
out, err := runCmd(m.Timeout, m.UseSudo, m.Path, "--scan")
if err != nil {
return []string{}, fmt.Errorf("failed to run command '%s --scan': %s - %s", m.Path, err, string(out))
}
devices := []string{}
for _, line := range strings.Split(string(out), "\n") {
dev := strings.Split(line, " ")
if len(dev) > 1 && !excludedDev(m.Excludes, strings.TrimSpace(dev[0])) {
devices = append(devices, strings.TrimSpace(dev[0]))
}
}
return devices, nil
}
func excludedDev(excludes []string, deviceLine string) bool { func excludedDev(excludes []string, deviceLine string) bool {
device := strings.Split(deviceLine, " ") device := strings.Split(deviceLine, " ")
if len(device) != 0 { if len(device) != 0 {
@ -270,21 +526,137 @@ func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) {
wg.Add(len(devices)) wg.Add(len(devices))
for _, device := range devices { for _, device := range devices {
go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.Path, m.Nocheck, device, &wg) go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.PathSmartctl, m.Nocheck, device, &wg)
} }
wg.Wait() wg.Wait()
} }
// Command line parse errors are denoted by the exit code having the 0 bit set. func (m *Smart) getVendorNVMeAttributes(acc telegraf.Accumulator, devices []string) {
// All other errors are drive/communication errors and should be ignored. NVMeDevices := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo)
func exitStatus(err error) (int, error) {
if exiterr, ok := err.(*exec.ExitError); ok { var wg sync.WaitGroup
if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
return status.ExitStatus(), nil for _, device := range NVMeDevices {
if contains(m.EnableExtensions, "auto-on") {
switch device.vendorID {
case IntelVID:
wg.Add(1)
go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
}
} else if contains(m.EnableExtensions, "Intel") && device.vendorID == IntelVID {
wg.Add(1)
go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
}
}
wg.Wait()
}
// getDeviceInfoForNVMeDisks queries every device in devices with the nvme
// binary and returns one NVMeDevice (name, vendor id, model, serial number)
// per device that could be identified.
//
// A device whose information cannot be obtained is skipped and an error,
// including the underlying cause, is added to acc.
func getDeviceInfoForNVMeDisks(acc telegraf.Accumulator, devices []string, nvme string, timeout internal.Duration, useSudo bool) []NVMeDevice {
	var NVMeDevices []NVMeDevice

	for _, device := range devices {
		vid, sn, mn, err := gatherNVMeDeviceInfo(nvme, device, timeout, useSudo)
		if err != nil {
			// Keep the cause so that a failing nvme call can be diagnosed.
			acc.AddError(fmt.Errorf("cannot find device info for %s device: %s", device, err))
			continue
		}
		NVMeDevices = append(NVMeDevices, NVMeDevice{
			name:         device,
			vendorID:     vid,
			model:        mn,
			serialNumber: sn,
		})
	}
	return NVMeDevices
}
// gatherNVMeDeviceInfo runs "<nvme> id-ctrl <device...>" (the device string is
// split on spaces into separate arguments) and parses its output.
// It returns the vendor id, serial number and model number, in that order,
// or the error from running the command or parsing its output.
func gatherNVMeDeviceInfo(nvme, device string, timeout internal.Duration, useSudo bool) (string, string, string, error) {
	cmdArgs := append([]string{"id-ctrl"}, strings.Split(device, " ")...)

	raw, err := runCmd(timeout, useSudo, nvme, cmdArgs...)
	if err != nil {
		return "", "", "", err
	}
	return findNVMeDeviceInfo(string(raw))
}
// findNVMeDeviceInfo scans output line by line for "key : value" pairs and
// returns the values of the "vid", "sn" and "mn" keys, in that order.
// Keys that do not occur yield empty strings.
//
// An error is returned when the vid value cannot be read or when scanning the
// output itself fails; in that case all returned strings are empty.
func findNVMeDeviceInfo(output string) (string, string, string, error) {
	var vid, sn, mn string

	scanner := bufio.NewScanner(strings.NewReader(output))
	for scanner.Scan() {
		matches := nvmeIdCtrlExpressionPattern.FindStringSubmatch(scanner.Text())
		if len(matches) <= 2 {
			continue
		}
		key := strings.TrimSpace(matches[1])
		value := strings.TrimSpace(matches[2])

		switch key {
		case "vid":
			// Only the first whitespace-delimited token is taken as the vendor id.
			if _, err := fmt.Sscanf(value, "%s", &vid); err != nil {
				return "", "", "", err
			}
		case "sn":
			sn = value
		case "mn":
			mn = value
		}
	}
	// Scan stops silently on failure; surface it instead of returning
	// partially parsed data as if it were complete.
	if err := scanner.Err(); err != nil {
		return "", "", "", err
	}
	return vid, sn, mn, nil
}
func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo bool, nvme string, device NVMeDevice, wg *sync.WaitGroup) {
defer wg.Done()
args := []string{"intel", "smart-log-add"}
args = append(args, strings.Split(device.name, " ")...)
out, e := runCmd(timeout, usesudo, nvme, args...)
outStr := string(out)
_, er := exitStatus(e)
if er != nil {
acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", nvme, strings.Join(args, " "), e, outStr))
return
}
scanner := bufio.NewScanner(strings.NewReader(outStr))
for scanner.Scan() {
line := scanner.Text()
tags := map[string]string{}
fields := make(map[string]interface{})
tags["device"] = path.Base(device.name)
tags["model"] = device.model
tags["serial_no"] = device.serialNumber
if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 {
matches[1] = strings.TrimSpace(matches[1])
matches[3] = strings.TrimSpace(matches[3])
if attr, ok := intelAttributes[matches[1]]; ok {
tags["name"] = attr.Name
if attr.ID != "" {
tags["id"] = attr.ID
}
parse := parseCommaSeparatedIntWithAccumulator
if attr.Parse != nil {
parse = attr.Parse
}
if err := parse(acc, fields, tags, matches[3]); err != nil {
continue
}
}
} }
} }
return 0, err
} }
func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) { func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) {
@ -328,7 +700,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
deviceTags["wwn"] = strings.Replace(wwn[1], " ", "", -1) deviceTags["wwn"] = strings.Replace(wwn[1], " ", "", -1)
} }
capacity := usercapacityInfo.FindStringSubmatch(line) capacity := userCapacityInfo.FindStringSubmatch(line)
if len(capacity) > 1 { if len(capacity) > 1 {
deviceTags["capacity"] = strings.Replace(capacity[1], ",", "", -1) deviceTags["capacity"] = strings.Replace(capacity[1], ",", "", -1)
} }
@ -340,7 +712,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
health := smartOverallHealth.FindStringSubmatch(line) health := smartOverallHealth.FindStringSubmatch(line)
if len(health) > 2 { if len(health) > 2 {
deviceFields["health_ok"] = (health[2] == "PASSED" || health[2] == "OK") deviceFields["health_ok"] = health[2] == "PASSED" || health[2] == "OK"
} }
tags := map[string]string{} tags := map[string]string{}
@ -418,6 +790,40 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
acc.AddFields("smart_device", deviceFields, deviceTags) acc.AddFields("smart_device", deviceFields, deviceTags)
} }
// exitStatus extracts the process exit code from an error returned by running
// a command.
//
// When err is an *exec.ExitError whose system-dependent state is a
// syscall.WaitStatus, the exit code is returned together with a nil error. In
// every other case (including a nil err) the code is 0 and err is handed back
// unchanged.
//
// Command line parse errors are denoted by the exit code having the 0 bit set.
// All other errors are drive/communication errors and should be ignored.
func exitStatus(err error) (int, error) {
	exitErr, isExitErr := err.(*exec.ExitError)
	if !isExitErr {
		return 0, err
	}

	waitStatus, hasStatus := exitErr.Sys().(syscall.WaitStatus)
	if !hasStatus {
		return 0, err
	}

	return waitStatus.ExitStatus(), nil
}
// contains reports whether element is equal to at least one entry of args.
// An empty or nil args never contains anything.
func contains(args []string, element string) bool {
	for i := 0; i < len(args); i++ {
		if args[i] != element {
			continue
		}
		return true
	}
	return false
}
// difference returns the entries of a that do not occur in b, in their
// original order. Duplicates in a are kept. The result is nil when nothing
// remains.
func difference(a, b []string) []string {
	excluded := make(map[string]struct{}, len(b))
	for i := range b {
		excluded[b[i]] = struct{}{}
	}

	var remaining []string
	for _, candidate := range a {
		if _, skip := excluded[candidate]; skip {
			continue
		}
		remaining = append(remaining, candidate)
	}
	return remaining
}
func parseRawValue(rawVal string) (int64, error) { func parseRawValue(rawVal string) (int64, error) {
// Integer // Integer
if i, err := strconv.ParseInt(rawVal, 10, 64); err == nil { if i, err := strconv.ParseInt(rawVal, 10, 64); err == nil {
@ -428,7 +834,7 @@ func parseRawValue(rawVal string) (int64, error) {
unit := regexp.MustCompile("^(.*)([hms])$") unit := regexp.MustCompile("^(.*)([hms])$")
parts := strings.Split(rawVal, "+") parts := strings.Split(rawVal, "+")
if len(parts) == 0 { if len(parts) == 0 {
return 0, fmt.Errorf("Couldn't parse RAW_VALUE '%s'", rawVal) return 0, fmt.Errorf("couldn't parse RAW_VALUE '%s'", rawVal)
} }
duration := int64(0) duration := int64(0)
@ -452,6 +858,63 @@ func parseRawValue(rawVal string) (int64, error) {
return duration, nil return duration, nil
} }
// parseBytesWritten parses a value of the form "sectors: <int>", stores the
// integer in fields["raw_value"] and adds a "smart_attribute" metric built
// from fields and tags to acc. It returns the scan error, and adds nothing,
// when str does not have that form.
func parseBytesWritten(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var sectors int64
	_, err := fmt.Sscanf(str, "sectors: %d", &sectors)
	if err != nil {
		return err
	}

	fields["raw_value"] = sectors
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
// parseThermalThrottle parses a value of the form "<float>%, cnt: <int>" and
// adds two "smart_attribute" metrics to acc: first the percentage under the
// name "Thermal_Throttle_Status_Prc", then the count under the name
// "Thermal_Throttle_Status_Cnt". Both fields["raw_value"] and tags["name"]
// are overwritten for each metric. It returns the scan error, and adds
// nothing, when str does not have that form.
func parseThermalThrottle(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var (
		throttlePercent float64
		throttleCount   int64
	)
	_, err := fmt.Sscanf(str, "%f%%, cnt: %d", &throttlePercent, &throttleCount)
	if err != nil {
		return err
	}

	// emit reuses the caller's maps, overwriting the value and the name.
	emit := func(name string, value interface{}) {
		fields["raw_value"] = value
		tags["name"] = name
		acc.AddFields("smart_attribute", fields, tags)
	}
	emit("Thermal_Throttle_Status_Prc", throttlePercent)
	emit("Thermal_Throttle_Status_Cnt", throttleCount)
	return nil
}
// parseWearLeveling parses a value of the form
// "min: <int>, max: <int>, avg: <int>" and adds three "smart_attribute"
// metrics to acc, named "Wear_Leveling_Min", "Wear_Leveling_Max" and
// "Wear_Leveling_Avg", in that order. Both fields["raw_value"] and
// tags["name"] are overwritten for each metric. It returns the scan error,
// and adds nothing, when str does not have that form.
func parseWearLeveling(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var lowest, highest, average int64
	_, err := fmt.Sscanf(str, "min: %d, max: %d, avg: %d", &lowest, &highest, &average)
	if err != nil {
		return err
	}

	suffixes := [...]string{"Min", "Max", "Avg"}
	for idx, reading := range [...]int64{lowest, highest, average} {
		fields["raw_value"] = reading
		tags["name"] = fmt.Sprintf("Wear_Leveling_%s", suffixes[idx])
		acc.AddFields("smart_attribute", fields, tags)
	}
	return nil
}
// parseTimedWorkload parses the leading floating point number of str, stores
// it in fields["raw_value"] and adds a "smart_attribute" metric built from
// fields and tags to acc. It returns the scan error, and adds nothing, when
// no number can be read.
func parseTimedWorkload(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var workload float64
	_, err := fmt.Sscanf(str, "%f", &workload)
	if err != nil {
		return err
	}

	fields["raw_value"] = workload
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
func parseInt(str string) int64 { func parseInt(str string) int64 {
if i, err := strconv.ParseInt(str, 10, 64); err == nil { if i, err := strconv.ParseInt(str, 10, 64); err == nil {
return i return i
@ -460,6 +923,7 @@ func parseInt(str string) int64 {
} }
func parseCommaSeparatedInt(fields, _ map[string]interface{}, str string) error { func parseCommaSeparatedInt(fields, _ map[string]interface{}, str string) error {
str = strings.Join(strings.Fields(str), "")
i, err := strconv.ParseInt(strings.Replace(str, ",", "", -1), 10, 64) i, err := strconv.ParseInt(strings.Replace(str, ",", "", -1), 10, 64)
if err != nil { if err != nil {
return err return err
@ -479,6 +943,17 @@ func parseDataUnits(fields, deviceFields map[string]interface{}, str string) err
return parseCommaSeparatedInt(fields, deviceFields, units) return parseCommaSeparatedInt(fields, deviceFields, units)
} }
// parseCommaSeparatedIntWithAccumulator parses an integer that may contain
// thousands separators, stores it in fields["raw_value"] and adds a
// "smart_attribute" metric built from fields and tags to acc.
//
// Commas and any whitespace inside str are removed before parsing, so
// "1,234", "1 234" and " 1234 " all yield 1234. Whitespace is stripped the
// same way parseCommaSeparatedInt does it, so both parsers accept the same
// inputs. It returns the strconv error, and adds nothing, when the remaining
// text is not a base-10 int64.
func parseCommaSeparatedIntWithAccumulator(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	// strings.Fields splits on every run of whitespace; joining the pieces
	// back together drops all of it.
	digits := strings.Join(strings.Fields(str), "")
	digits = strings.Replace(digits, ",", "", -1)

	i, err := strconv.ParseInt(digits, 10, 64)
	if err != nil {
		return err
	}

	fields["raw_value"] = i
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
func parseTemperature(fields, deviceFields map[string]interface{}, str string) error { func parseTemperature(fields, deviceFields map[string]interface{}, str string) error {
var temp int64 var temp int64
if _, err := fmt.Sscanf(str, "%d C", &temp); err != nil { if _, err := fmt.Sscanf(str, "%d C", &temp); err != nil {
@ -491,13 +966,34 @@ func parseTemperature(fields, deviceFields map[string]interface{}, str string) e
return nil return nil
} }
// parseTemperatureSensor parses a value of the form "<int> C" and stores the
// integer in fields["raw_value"]. On a scan error fields is left untouched
// and the error is returned.
//
// deviceFields is not used here; presumably it is kept so the signature
// matches the other parsers in this file (confirm against the caller).
func parseTemperatureSensor(fields, deviceFields map[string]interface{}, str string) error {
	var celsius int64
	_, err := fmt.Sscanf(str, "%d C", &celsius)
	if err == nil {
		fields["raw_value"] = celsius
	}
	return err
}
// validatePath checks that path names an existing regular file.
//
// It returns an error when the path does not exist, when it cannot be
// inspected for any other reason (for example missing permissions or an
// invalid name), or when it points to something other than a regular file
// such as a directory. It returns nil otherwise.
func validatePath(path string) error {
	pathInfo, err := os.Stat(path)
	if os.IsNotExist(err) {
		return fmt.Errorf("provided path does not exist: [%s]", path)
	}
	// Any other Stat failure leaves pathInfo nil; report it instead of
	// dereferencing it below.
	if err != nil {
		return fmt.Errorf("cannot get file information for provided path [%s]: %s", path, err)
	}
	if mode := pathInfo.Mode(); !mode.IsRegular() {
		return fmt.Errorf("provided path does not point to a regular file: [%s]", path)
	}
	return nil
}
func init() { func init() {
// Set LC_NUMERIC to uniform numeric output from cli tools
_ = os.Setenv("LC_NUMERIC", "en_US.UTF-8")
inputs.Add("smart", func() telegraf.Input { inputs.Add("smart", func() telegraf.Input {
m := NewSmart() m := NewSmart()
path, _ := exec.LookPath("smartctl")
if len(path) > 0 {
m.Path = path
}
m.Nocheck = "standby" m.Nocheck = "standby"
return m return m
}) })

File diff suppressed because it is too large Load Diff