Add more missing NVMe attributes to smart plugin (#8113)

This commit is contained in:
Paweł Żak 2020-09-28 17:16:49 +02:00 committed by GitHub
parent ca7252c641
commit e1cb269a35
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1550 additions and 394 deletions

View File

@ -1,6 +1,6 @@
# S.M.A.R.T. Input Plugin
Get metrics using the command line utility `smartctl` for S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage devices. SMART is a monitoring system included in computer hard disk drives (HDDs) and solid-state drives (SSDs)[1] that detects and reports on various indicators of drive reliability, with the intent of enabling the anticipation of hardware failures.
Get metrics using the command line utility `smartctl` for S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage devices. SMART is a monitoring system included in computer hard disk drives (HDDs) and solid-state drives (SSDs) that detects and reports on various indicators of drive reliability, with the intent of enabling the anticipation of hardware failures.
See smartmontools (https://www.smartmontools.org/).
SMART information is separated between different measurements: `smart_device` is used for general information, while `smart_attribute` stores the detailed attribute information if `attributes = true` is enabled in the plugin configuration.
@ -19,29 +19,72 @@ smartctl --info --attributes --health -n <nocheck> --format=brief <device>
This plugin supports _smartmontools_ version 5.41 and above, but v. 5.41 and v. 5.42
might require setting `nocheck`, see the comment in the sample configuration.
Also, NVMe capabilities were introduced in version 6.5.
To enable SMART on a storage device run:
```
smartctl -s on <device>
```
## NVMe vendor specific attributes
### Configuration
For NVMe disk type, plugin can use command line utility `nvme-cli`. It has a feature
to easily access vendor specific attributes.
This plugin supports nvme-cli version 1.5 and above (https://github.com/linux-nvme/nvme-cli).
In case of `nvme-cli` absence NVMe vendor specific metrics will not be obtained.
Vendor specific SMART metrics for NVMe disks may be reported from the following `nvme` command:
```
nvme <vendor> smart-log-add <device>
```
Note that vendor plugins for `nvme-cli` could require different naming convention and report format.
To see the installed plugin extensions, which depend on the nvme-cli version, look at the bottom of:
```
nvme help
```
To gather disk vendor id (vid) `id-ctrl` could be used:
```
nvme id-ctrl <device>
```
The association between a vid and a company can be found here: https://pcisig.com/membership/member-companies.
Whether a device is NVMe or non-NVMe is determined using:
```
smartctl --scan
```
and:
```
smartctl --scan -d nvme
```
## Configuration
```toml
# Read metrics from storage devices supporting S.M.A.R.T.
[[inputs.smart]]
## Optionally specify the path to the smartctl executable
# path = "/usr/bin/smartctl"
# path_smartctl = "/usr/bin/smartctl"
## On most platforms smartctl requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl.
## Sudo must be configured to to allow the telegraf user to run smartctl
## Optionally specify the path to the nvme-cli executable
# path_nvme = "/usr/bin/nvme"
## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
## ["auto-on"] - automatically find and enable additional vendor specific disk info
## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
# enable_extensions = ["auto-on"]
## On most platforms used cli utilities requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
## Sudo must be configured to allow the telegraf user to run smartctl or nvme-cli
## without a password.
# use_sudo = false
## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stoped rotating.
## "standby" to not wake up disks that have stopped rotating.
## See --nocheck in the man pages for smartctl.
## smartctl version 5.41 and 5.42 have faulty detection of
## power mode and might require changing this value to
@ -49,28 +92,26 @@ smartctl -s on <device>
# nocheck = "standby"
## Gather all returned S.M.A.R.T. attribute metrics and the detailed
## information from each drive into the `smart_attribute` measurement.
## information from each drive into the 'smart_attribute' measurement.
# attributes = false
## Optionally specify devices to exclude from reporting.
## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
# excludes = [ "/dev/pass6" ]
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan) for S.M.A.R.T. devices will
## done and all found will be included except for the
## excluded in excludes.
# devices = [ "/dev/ada0 -d atacam" ]
## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
## and all found will be included except for the excluded in excludes.
# devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]
## Timeout for the smartctl command to complete.
## Timeout for the cli command to complete.
# timeout = "30s"
```
### Permissions
## Permissions
It's important to note that this plugin references smartctl, which may require additional permissions to execute successfully.
It's important to note that this plugin references smartctl and nvme-cli, which may require additional permissions to execute successfully.
Depending on the user/group permissions of the telegraf user executing this plugin, you may need to use sudo.
You will need the following in your telegraf config:
```toml
[[inputs.smart]]
@ -80,13 +121,20 @@ You will need the following in your telegraf config:
You will also need to update your sudoers file:
```bash
$ visudo
# Add the following line:
# For smartctl add the following lines:
Cmnd_Alias SMARTCTL = /usr/bin/smartctl
telegraf ALL=(ALL) NOPASSWD: SMARTCTL
Defaults!SMARTCTL !logfile, !syslog, !pam_session
```
### Metrics
# For nvme-cli add the following lines:
Cmnd_Alias NVME = /path/to/nvme
telegraf ALL=(ALL) NOPASSWD: NVME
Defaults!NVME !logfile, !syslog, !pam_session
```
To run smartctl or nvme with `sudo`, a wrapper script can be created. `path_smartctl` or
`path_nvme` in the configuration should be set to execute this script.
## Metrics
- smart_device:
- tags:
@ -135,37 +183,44 @@ The interpretation of the tag `flags` is:
#### Exit Status
The `exit_status` field captures the exit status of the smartctl command which
The `exit_status` field captures the exit status of the used cli utilities command which
is defined by a bitmask. For the interpretation of the bitmask see the man page for
smartctl.
#### Device Names
smartctl or nvme-cli.
## Device Names
Device names, e.g., `/dev/sda`, are *not persistent*, and may be
subject to change across reboots or system changes. Instead, you can the
subject to change across reboots or system changes. Instead, you can use the
*World Wide Name* (WWN) or serial number to identify devices. On Linux block
devices can be referenced by the WWN in the following location:
`/dev/disk/by-id/`.
To run `smartctl` with `sudo` create a wrapper script and use `path` in
the configuration to execute that.
### Troubleshooting
## Troubleshooting
If you expect to see more SMART metrics than this plugin shows, be sure to use a proper version
of smartctl or nvme-cli utility which has the functionality to gather desired data. Also, check
your device capability because not all SMART metrics are mandatory.
For example the number of temperature sensors depends on the device specification.
If this plugin is not working as expected for your SMART enabled device,
please run these commands and include the output in a bug report:
For non NVMe devices (from smartctl version >= 7.0 this will also return NVMe devices by default):
```
smartctl --scan
```
For NVMe devices:
```
smartctl --scan -d nvme
```
Run the following command replacing your configuration setting for NOCHECK and
the DEVICE from the previous command:
the DEVICE (name of the device could be taken from the previous command):
```
smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE
```
### Example Output
If you try to gather vendor specific metrics, please provide this command
and replace vendor and device to match your case:
```
nvme VENDOR smart-log-add DEVICE
```
## Example SMART Plugin Outputs
```
smart_device,enabled=Enabled,host=mbpro.local,device=rdisk0,model=APPLE\ SSD\ SM0512F,serial_no=S1K5NYCD964433,wwn=5002538655584d30,capacity=500277790720 udma_crc_errors=0i,exit_status=0i,health_ok=true,read_error_rate=0i,temp_c=40i 1502536854000000000
smart_attribute,capacity=500277790720,device=rdisk0,enabled=Enabled,fail=-,flags=-O-RC-,host=mbpro.local,id=199,model=APPLE\ SSD\ SM0512F,name=UDMA_CRC_Error_Count,serial_no=S1K5NYCD964433,wwn=5002538655584d30 exit_status=0i,raw_value=0i,threshold=0i,value=200i,worst=200i 1502536854000000000

View File

@ -3,6 +3,7 @@ package smart
import (
"bufio"
"fmt"
"os"
"os/exec"
"path"
"regexp"
@ -11,12 +12,15 @@ import (
"sync"
"syscall"
"time"
"unicode"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs"
)
// IntelVID is the vendor ID that marks an NVMe device as an Intel disk; it is
// compared with the "vid" value parsed from `nvme id-ctrl` output.
const IntelVID = "0x8086"
var (
// Device Model: APPLE SSD SM256E
// Product: HUH721212AL5204
@ -27,7 +31,7 @@ var (
// LU WWN Device Id: 5 002538 655584d30
wwnInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$")
// User Capacity: 251,000,193,024 bytes [251 GB]
usercapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$")
userCapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$")
// SMART support is: Enabled
smartEnabledInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$")
// SMART overall-health self-assessment test result: PASSED
@ -44,6 +48,15 @@ var (
// 192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716
attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9-]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$")
// Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff
// key normalized raw
// program_fail_count : 100% 0
intelExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\w\s]+)%(.+)`)
// vid : 0x8086
// sn : CFGT53260XSP8011P
nvmeIdCtrlExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\s\w]+)`)
deviceFieldIds = map[string]string{
"1": "read_error_rate",
"7": "seek_error_rate",
@ -52,6 +65,7 @@ var (
"199": "udma_crc_errors",
}
// to obtain metrics from smartctl
sasNvmeAttributes = map[string]struct {
ID string
Name string
@ -146,31 +160,154 @@ var (
Name: "Critical_Temperature_Time",
Parse: parseCommaSeparatedInt,
},
"Thermal Temp. 1 Transition Count": {
Name: "Thermal_Management_T1_Trans_Count",
Parse: parseCommaSeparatedInt,
},
"Thermal Temp. 2 Transition Count": {
Name: "Thermal_Management_T2_Trans_Count",
Parse: parseCommaSeparatedInt,
},
"Thermal Temp. 1 Total Time": {
Name: "Thermal_Management_T1_Total_Time",
Parse: parseCommaSeparatedInt,
},
"Thermal Temp. 2 Total Time": {
Name: "Thermal_Management_T2_Total_Time",
Parse: parseCommaSeparatedInt,
},
"Temperature Sensor 1": {
Name: "Temperature_Sensor_1",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 2": {
Name: "Temperature_Sensor_2",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 3": {
Name: "Temperature_Sensor_3",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 4": {
Name: "Temperature_Sensor_4",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 5": {
Name: "Temperature_Sensor_5",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 6": {
Name: "Temperature_Sensor_6",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 7": {
Name: "Temperature_Sensor_7",
Parse: parseTemperatureSensor,
},
"Temperature Sensor 8": {
Name: "Temperature_Sensor_8",
Parse: parseTemperatureSensor,
},
}
// to obtain Intel specific metrics from nvme-cli
intelAttributes = map[string]struct {
ID string
Name string
Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error
}{
"program_fail_count": {
Name: "Program_Fail_Count",
},
"erase_fail_count": {
Name: "Erase_Fail_Count",
},
"end_to_end_error_detection_count": {
Name: "End_To_End_Error_Detection_Count",
},
"crc_error_count": {
Name: "Crc_Error_Count",
},
"retry_buffer_overflow_count": {
Name: "Retry_Buffer_Overflow_Count",
},
"wear_leveling": {
Name: "Wear_Leveling",
Parse: parseWearLeveling,
},
"timed_workload_media_wear": {
Name: "Timed_Workload_Media_Wear",
Parse: parseTimedWorkload,
},
"timed_workload_host_reads": {
Name: "Timed_Workload_Host_Reads",
Parse: parseTimedWorkload,
},
"timed_workload_timer": {
Name: "Timed_Workload_Timer",
Parse: func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
return parseCommaSeparatedIntWithAccumulator(acc, fields, tags, strings.TrimSuffix(str, " min"))
},
},
"thermal_throttle_status": {
Name: "Thermal_Throttle_Status",
Parse: parseThermalThrottle,
},
"pll_lock_loss_count": {
Name: "Pll_Lock_Loss_Count",
},
"nand_bytes_written": {
Name: "Nand_Bytes_Written",
Parse: parseBytesWritten,
},
"host_bytes_written": {
Name: "Host_Bytes_Written",
Parse: parseBytesWritten,
},
}
)
// NVMeDevice bundles the identity of a single NVMe disk as collected from
// `nvme id-ctrl`, so vendor specific gatherers can tag their metrics.
type NVMeDevice struct {
	// name is the device string the id-ctrl query was issued for.
	name string
	// vendorID is the "vid" value reported by id-ctrl.
	vendorID string
	// model is the "mn" value reported by id-ctrl.
	model string
	// serialNumber is the "sn" value reported by id-ctrl.
	serialNumber string
}
type Smart struct {
Path string
Nocheck string
Attributes bool
Excludes []string
Devices []string
UseSudo bool
Timeout internal.Duration
Path string `toml:"path"` //deprecated - to keep backward compatibility
PathSmartctl string `toml:"path_smartctl"`
PathNVMe string `toml:"path_nvme"`
Nocheck string `toml:"nocheck"`
EnableExtensions []string `toml:"enable_extensions"`
Attributes bool `toml:"attributes"`
Excludes []string `toml:"excludes"`
Devices []string `toml:"devices"`
UseSudo bool `toml:"use_sudo"`
Timeout internal.Duration `toml:"timeout"`
Log telegraf.Logger `toml:"-"`
}
var sampleConfig = `
## Optionally specify the path to the smartctl executable
# path = "/usr/bin/smartctl"
# path_smartctl = "/usr/bin/smartctl"
## On most platforms smartctl requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl.
## Sudo must be configured to to allow the telegraf user to run smartctl
## Optionally specify the path to the nvme-cli executable
# path_nvme = "/usr/bin/nvme"
## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
## ["auto-on"] - automatically find and enable additional vendor specific disk info
## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
# enable_extensions = ["auto-on"]
## On most platforms used cli utilities requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
## Sudo must be configured to allow the telegraf user to run smartctl or nvme-cli
## without a password.
# use_sudo = false
## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stoped rotating.
## "standby" to not wake up disks that have stopped rotating.
## See --nocheck in the man pages for smartctl.
## smartctl version 5.41 and 5.42 have faulty detection of
## power mode and might require changing this value to
@ -181,16 +318,15 @@ var sampleConfig = `
## information from each drive into the 'smart_attribute' measurement.
# attributes = false
## Optionally specify devices to exclude from reporting.
## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
# excludes = [ "/dev/pass6" ]
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan) for S.M.A.R.T. devices will
## done and all found will be included except for the
## excluded in excludes.
# devices = [ "/dev/ada0 -d atacam" ]
## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
## and all found will be included except for the excluded in excludes.
# devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]
## Timeout for the smartctl command to complete.
## Timeout for the cli command to complete.
# timeout = "30s"
`
@ -208,22 +344,159 @@ func (m *Smart) Description() string {
return "Read metrics from storage devices supporting S.M.A.R.T."
}
func (m *Smart) Gather(acc telegraf.Accumulator) error {
if len(m.Path) == 0 {
return fmt.Errorf("smartctl not found: verify that smartctl is installed and that smartctl is in your PATH")
func (m *Smart) Init() error {
//if deprecated `path` (to smartctl binary) is provided in config and `path_smartctl` override does not exist
if len(m.Path) > 0 && len(m.PathSmartctl) == 0 {
m.PathSmartctl = m.Path
}
devices := m.Devices
if len(devices) == 0 {
//if `path_smartctl` is not provided in config, try to find smartctl binary in PATH
if len(m.PathSmartctl) == 0 {
m.PathSmartctl, _ = exec.LookPath("smartctl")
}
//if `path_nvme` is not provided in config, try to find nvme binary in PATH
if len(m.PathNVMe) == 0 {
m.PathNVMe, _ = exec.LookPath("nvme")
}
err := validatePath(m.PathSmartctl)
if err != nil {
m.PathSmartctl = ""
//without smartctl, plugin will not be able to gather basic metrics
return fmt.Errorf("smartctl not found: verify that smartctl is installed and it is in your PATH (or specified in config): %s", err.Error())
}
err = validatePath(m.PathNVMe)
if err != nil {
m.PathNVMe = ""
//without nvme, plugin will not be able to gather vendor specific attributes (but it can work without it)
m.Log.Warnf("nvme not found: verify that nvme is installed and it is in your PATH (or specified in config) to gather vendor specific attributes: %s", err.Error())
}
return nil
}
func (m *Smart) Gather(acc telegraf.Accumulator) error {
var err error
devices, err = m.scan()
var scannedNVMeDevices []string
var scannedNonNVMeDevices []string
devicesFromConfig := m.Devices
isNVMe := len(m.PathNVMe) != 0
isVendorExtension := len(m.EnableExtensions) != 0
if len(m.Devices) != 0 {
devicesFromConfig = excludeWrongDeviceNames(devicesFromConfig)
m.getAttributes(acc, devicesFromConfig)
// if nvme-cli is present, vendor specific attributes can be gathered
if isVendorExtension && isNVMe {
scannedNVMeDevices, scannedNonNVMeDevices, err = m.scanAllDevices(true)
if err != nil {
return err
}
NVMeDevices := distinguishNVMeDevices(devicesFromConfig, scannedNVMeDevices)
m.getVendorNVMeAttributes(acc, NVMeDevices)
}
return nil
}
scannedNVMeDevices, scannedNonNVMeDevices, err = m.scanAllDevices(false)
if err != nil {
return err
}
var devicesFromScan []string
devicesFromScan = append(devicesFromScan, scannedNVMeDevices...)
devicesFromScan = append(devicesFromScan, scannedNonNVMeDevices...)
m.getAttributes(acc, devicesFromScan)
if isVendorExtension && isNVMe {
m.getVendorNVMeAttributes(acc, scannedNVMeDevices)
}
return nil
}
// excludeWrongDeviceNames filters the device entries taken from the
// configuration and keeps only those made up exclusively of letters, digits
// and the separators ' ', '/', '\', '-' and ','. Entries containing any other
// character are dropped so that unexpected input never reaches the command
// line of the external tools. The relative order of the kept entries is
// preserved and nil is returned when nothing is left.
func excludeWrongDeviceNames(devices []string) []string {
	var valid []string
	for _, device := range devices {
		if isValidDeviceName(device) {
			valid = append(valid, device)
		}
	}
	return valid
}

// isValidDeviceName reports whether every rune of device is a letter, a number
// or one of the allowed separator characters. It stops at the first offending
// rune instead of scanning the rest of the name.
func isValidDeviceName(device string) bool {
	for _, char := range device {
		if unicode.IsLetter(char) || unicode.IsNumber(char) {
			continue
		}
		switch char {
		case ' ', '/', '\\', '-', ',':
			continue
		}
		return false
	}
	return true
}
func (m *Smart) scanAllDevices(ignoreExcludes bool) ([]string, []string, error) {
// this will return all devices (including NVMe devices) for smartctl version >= 7.0
// for older versions this will return non NVMe devices
devices, err := m.scanDevices(ignoreExcludes, "--scan")
if err != nil {
return nil, nil, err
}
m.getAttributes(acc, devices)
return nil
// this will return only NVMe devices
NVMeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme")
if err != nil {
return nil, nil, err
}
// to handle all versions of smartctl this will return only non NVMe devices
nonNVMeDevices := difference(devices, NVMeDevices)
return NVMeDevices, nonNVMeDevices, nil
}
// distinguishNVMeDevices returns the entries of userDevices that correspond to
// one of availableNVMeDevices. Two names are considered to correspond when
// either one contains the other, which covers cases such as "nvme0" versus
// "nvme0n1" or a configured entry that carries extra arguments
// ("/dev/nvme0 -d nvme").
//
// Each user device is reported at most once, even if it matches several
// scanned names (e.g. "/dev/nvme1" also matches "/dev/nvme10"); otherwise the
// vendor specific metrics of that disk would be gathered multiple times.
func distinguishNVMeDevices(userDevices []string, availableNVMeDevices []string) []string {
	var NVMeDevices []string

	for _, userDevice := range userDevices {
		for _, NVMeDevice := range availableNVMeDevices {
			if strings.Contains(NVMeDevice, userDevice) || strings.Contains(userDevice, NVMeDevice) {
				NVMeDevices = append(NVMeDevices, userDevice)
				// one match is enough, do not report the same device again
				break
			}
		}
	}
	return NVMeDevices
}
// scanDevices runs smartctl with the given scan arguments and returns the
// device name found in the first space separated column of every output line.
// Lines without at least two columns are ignored. Unless ignoreExcludes is
// set, devices listed in m.Excludes are left out of the result.
func (m *Smart) scanDevices(ignoreExcludes bool, scanArgs ...string) ([]string, error) {
	output, err := runCmd(m.Timeout, m.UseSudo, m.PathSmartctl, scanArgs...)
	if err != nil {
		return []string{}, fmt.Errorf("failed to run command '%s %s': %s - %s", m.PathSmartctl, scanArgs, err, string(output))
	}

	var found []string
	for _, row := range strings.Split(string(output), "\n") {
		columns := strings.Split(row, " ")
		if len(columns) < 2 {
			continue
		}

		name := strings.TrimSpace(columns[0])
		// the exclude list is only consulted when it must be honoured
		if ignoreExcludes || !excludedDev(m.Excludes, name) {
			found = append(found, name)
		}
	}
	return found, nil
}
// Wrap with sudo
@ -235,23 +508,6 @@ var runCmd = func(timeout internal.Duration, sudo bool, command string, args ...
return internal.CombinedOutputTimeout(cmd, timeout.Duration)
}
// Scan for S.M.A.R.T. devices
func (m *Smart) scan() ([]string, error) {
out, err := runCmd(m.Timeout, m.UseSudo, m.Path, "--scan")
if err != nil {
return []string{}, fmt.Errorf("failed to run command '%s --scan': %s - %s", m.Path, err, string(out))
}
devices := []string{}
for _, line := range strings.Split(string(out), "\n") {
dev := strings.Split(line, " ")
if len(dev) > 1 && !excludedDev(m.Excludes, strings.TrimSpace(dev[0])) {
devices = append(devices, strings.TrimSpace(dev[0]))
}
}
return devices, nil
}
func excludedDev(excludes []string, deviceLine string) bool {
device := strings.Split(deviceLine, " ")
if len(device) != 0 {
@ -270,21 +526,137 @@ func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) {
wg.Add(len(devices))
for _, device := range devices {
go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.Path, m.Nocheck, device, &wg)
go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.PathSmartctl, m.Nocheck, device, &wg)
}
wg.Wait()
}
// Command line parse errors are denoted by the exit code having the 0 bit set.
// All other errors are drive/communication errors and should be ignored.
func exitStatus(err error) (int, error) {
if exiterr, ok := err.(*exec.ExitError); ok {
if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
return status.ExitStatus(), nil
// getVendorNVMeAttributes collects vendor specific attributes for the given
// NVMe devices. Device information is resolved first; then, for every disk
// whose vendor ID equals IntelVID, the Intel gatherer is started in its own
// goroutine, provided that the "auto-on" or the "Intel" extension is enabled.
// The call returns once all started gatherers have finished.
func (m *Smart) getVendorNVMeAttributes(acc telegraf.Accumulator, devices []string) {
	disks := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo)

	// Intel is the only vendor handled here, so both switches lead to the same gatherer
	intelEnabled := contains(m.EnableExtensions, "auto-on") || contains(m.EnableExtensions, "Intel")

	var wg sync.WaitGroup
	for _, disk := range disks {
		if !intelEnabled || disk.vendorID != IntelVID {
			continue
		}
		wg.Add(1)
		go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, disk, &wg)
	}
	wg.Wait()
}
// getDeviceInfoForNVMeDisks queries vendor ID, serial number and model for
// each of the given devices using the nvme binary and returns one NVMeDevice
// per successfully queried disk. A device whose query fails is skipped and the
// failure, including its underlying cause, is reported to the accumulator.
func getDeviceInfoForNVMeDisks(acc telegraf.Accumulator, devices []string, nvme string, timeout internal.Duration, useSudo bool) []NVMeDevice {
	var NVMeDevices []NVMeDevice

	for _, device := range devices {
		vid, sn, mn, err := gatherNVMeDeviceInfo(nvme, device, timeout, useSudo)
		if err != nil {
			// keep the cause so the reason for the failure is not lost
			acc.AddError(fmt.Errorf("cannot find device info for %s device: %w", device, err))
			continue
		}
		NVMeDevices = append(NVMeDevices, NVMeDevice{
			name:         device,
			vendorID:     vid,
			model:        mn,
			serialNumber: sn,
		})
	}
	return NVMeDevices
}
// gatherNVMeDeviceInfo runs `<nvme> id-ctrl <device...>` (the device string is
// split on spaces into separate arguments) and returns the vendor ID, serial
// number and model number extracted from its output, in that order.
func gatherNVMeDeviceInfo(nvme, device string, timeout internal.Duration, useSudo bool) (string, string, string, error) {
	cmdArgs := append([]string{"id-ctrl"}, strings.Split(device, " ")...)

	output, err := runCmd(timeout, useSudo, nvme, cmdArgs...)
	if err != nil {
		return "", "", "", err
	}
	return findNVMeDeviceInfo(string(output))
}
// findNVMeDeviceInfo scans the given id-ctrl output line by line and picks the
// values of the "vid", "sn" and "mn" keys. It returns them as vendor ID,
// serial number and model number; keys missing from the output yield empty
// strings. An error is returned only when the vendor ID value cannot be read.
func findNVMeDeviceInfo(output string) (string, string, string, error) {
	var vendorID, serial, modelNumber string

	lines := bufio.NewScanner(strings.NewReader(output))
	for lines.Scan() {
		groups := nvmeIdCtrlExpressionPattern.FindStringSubmatch(lines.Text())
		if len(groups) <= 2 {
			continue
		}

		key := strings.TrimSpace(groups[1])
		value := strings.TrimSpace(groups[2])
		switch key {
		case "vid":
			if _, err := fmt.Sscanf(value, "%s", &vendorID); err != nil {
				return "", "", "", err
			}
		case "sn":
			serial = value
		case "mn":
			modelNumber = value
		}
	}
	return vendorID, serial, modelNumber, nil
}
func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo bool, nvme string, device NVMeDevice, wg *sync.WaitGroup) {
defer wg.Done()
args := []string{"intel", "smart-log-add"}
args = append(args, strings.Split(device.name, " ")...)
out, e := runCmd(timeout, usesudo, nvme, args...)
outStr := string(out)
_, er := exitStatus(e)
if er != nil {
acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", nvme, strings.Join(args, " "), e, outStr))
return
}
scanner := bufio.NewScanner(strings.NewReader(outStr))
for scanner.Scan() {
line := scanner.Text()
tags := map[string]string{}
fields := make(map[string]interface{})
tags["device"] = path.Base(device.name)
tags["model"] = device.model
tags["serial_no"] = device.serialNumber
if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 {
matches[1] = strings.TrimSpace(matches[1])
matches[3] = strings.TrimSpace(matches[3])
if attr, ok := intelAttributes[matches[1]]; ok {
tags["name"] = attr.Name
if attr.ID != "" {
tags["id"] = attr.ID
}
parse := parseCommaSeparatedIntWithAccumulator
if attr.Parse != nil {
parse = attr.Parse
}
if err := parse(acc, fields, tags, matches[3]); err != nil {
continue
}
}
}
}
return 0, err
}
func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) {
@ -328,7 +700,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
deviceTags["wwn"] = strings.Replace(wwn[1], " ", "", -1)
}
capacity := usercapacityInfo.FindStringSubmatch(line)
capacity := userCapacityInfo.FindStringSubmatch(line)
if len(capacity) > 1 {
deviceTags["capacity"] = strings.Replace(capacity[1], ",", "", -1)
}
@ -340,7 +712,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
health := smartOverallHealth.FindStringSubmatch(line)
if len(health) > 2 {
deviceFields["health_ok"] = (health[2] == "PASSED" || health[2] == "OK")
deviceFields["health_ok"] = health[2] == "PASSED" || health[2] == "OK"
}
tags := map[string]string{}
@ -418,6 +790,40 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
acc.AddFields("smart_device", deviceFields, deviceTags)
}
// exitStatus extracts the process exit code from err.
// When err is an *exec.ExitError carrying a syscall.WaitStatus, the exit code
// is returned together with a nil error; in every other case 0 and the
// original err are returned unchanged.
//
// Command line parse errors are denoted by the exit code having the 0 bit set.
// All other errors are drive/communication errors and should be ignored.
func exitStatus(err error) (int, error) {
	exitErr, isExit := err.(*exec.ExitError)
	if !isExit {
		return 0, err
	}

	waitStatus, hasStatus := exitErr.Sys().(syscall.WaitStatus)
	if !hasStatus {
		return 0, err
	}
	return waitStatus.ExitStatus(), nil
}
// contains reports whether element is present in args.
func contains(args []string, element string) bool {
	for i := 0; i < len(args); i++ {
		if args[i] == element {
			return true
		}
	}
	return false
}
// difference returns the elements of a that do not occur in b, keeping the
// order (and any repetitions) they have in a. The result is nil when no
// element remains.
func difference(a, b []string) []string {
	excluded := make(map[string]struct{}, len(b))
	for i := range b {
		excluded[b[i]] = struct{}{}
	}

	var remaining []string
	for _, candidate := range a {
		if _, skip := excluded[candidate]; skip {
			continue
		}
		remaining = append(remaining, candidate)
	}
	return remaining
}
func parseRawValue(rawVal string) (int64, error) {
// Integer
if i, err := strconv.ParseInt(rawVal, 10, 64); err == nil {
@ -428,7 +834,7 @@ func parseRawValue(rawVal string) (int64, error) {
unit := regexp.MustCompile("^(.*)([hms])$")
parts := strings.Split(rawVal, "+")
if len(parts) == 0 {
return 0, fmt.Errorf("Couldn't parse RAW_VALUE '%s'", rawVal)
return 0, fmt.Errorf("couldn't parse RAW_VALUE '%s'", rawVal)
}
duration := int64(0)
@ -452,6 +858,63 @@ func parseRawValue(rawVal string) (int64, error) {
return duration, nil
}
// parseBytesWritten reads an integer from a value of the form "sectors: <n>",
// stores it in fields under "raw_value" and emits a "smart_attribute" metric
// with the given fields and tags. It returns an error when str does not match
// that form, in which case nothing is emitted.
func parseBytesWritten(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var sectors int64

	_, err := fmt.Sscanf(str, "sectors: %d", &sectors)
	if err != nil {
		return err
	}

	fields["raw_value"] = sectors
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
// parseThermalThrottle reads a value of the form "<percent>%, cnt: <count>"
// and emits two "smart_attribute" metrics from it: first the percentage under
// the name "Thermal_Throttle_Status_Prc", then the count under
// "Thermal_Throttle_Status_Cnt". The passed fields and tags maps are reused
// and overwritten for both metrics. It returns an error, emitting nothing,
// when str does not match the expected form.
func parseThermalThrottle(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var (
		throttlePercent float64
		throttleCount   int64
	)
	if _, err := fmt.Sscanf(str, "%f%%, cnt: %d", &throttlePercent, &throttleCount); err != nil {
		return err
	}

	submetrics := []struct {
		name  string
		value interface{}
	}{
		{"Thermal_Throttle_Status_Prc", throttlePercent},
		{"Thermal_Throttle_Status_Cnt", throttleCount},
	}
	for _, sub := range submetrics {
		fields["raw_value"] = sub.value
		tags["name"] = sub.name
		acc.AddFields("smart_attribute", fields, tags)
	}
	return nil
}
// parseWearLeveling reads a value of the form "min: <a>, max: <b>, avg: <c>"
// and emits one "smart_attribute" metric per number, named
// "Wear_Leveling_Min", "Wear_Leveling_Max" and "Wear_Leveling_Avg" in that
// order. The passed fields and tags maps are reused and overwritten for each
// metric. It returns an error, emitting nothing, when str does not match.
func parseWearLeveling(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var lowest, highest, average int64
	if _, err := fmt.Sscanf(str, "min: %d, max: %d, avg: %d", &lowest, &highest, &average); err != nil {
		return err
	}

	submetrics := []struct {
		suffix string
		value  int64
	}{
		{"Min", lowest},
		{"Max", highest},
		{"Avg", average},
	}
	for _, sub := range submetrics {
		fields["raw_value"] = sub.value
		tags["name"] = "Wear_Leveling_" + sub.suffix
		acc.AddFields("smart_attribute", fields, tags)
	}
	return nil
}
// parseTimedWorkload reads a leading floating point number from str, stores it
// in fields under "raw_value" and emits a "smart_attribute" metric with the
// given fields and tags. It returns an error, emitting nothing, when no number
// can be read.
func parseTimedWorkload(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var workload float64

	_, err := fmt.Sscanf(str, "%f", &workload)
	if err != nil {
		return err
	}

	fields["raw_value"] = workload
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
func parseInt(str string) int64 {
if i, err := strconv.ParseInt(str, 10, 64); err == nil {
return i
@ -460,6 +923,7 @@ func parseInt(str string) int64 {
}
func parseCommaSeparatedInt(fields, _ map[string]interface{}, str string) error {
str = strings.Join(strings.Fields(str), "")
i, err := strconv.ParseInt(strings.Replace(str, ",", "", -1), 10, 64)
if err != nil {
return err
@ -479,6 +943,17 @@ func parseDataUnits(fields, deviceFields map[string]interface{}, str string) err
return parseCommaSeparatedInt(fields, deviceFields, units)
}
// parseCommaSeparatedIntWithAccumulator removes all commas from str, parses the
// remainder as a base-10 64-bit integer, stores it in fields under "raw_value"
// and emits a "smart_attribute" metric with the given fields and tags. It
// returns the parse error, emitting nothing, when the remainder is not a valid
// integer.
func parseCommaSeparatedIntWithAccumulator(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	digits := strings.Replace(str, ",", "", -1)

	parsed, err := strconv.ParseInt(digits, 10, 64)
	if err != nil {
		return err
	}

	fields["raw_value"] = parsed
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
func parseTemperature(fields, deviceFields map[string]interface{}, str string) error {
var temp int64
if _, err := fmt.Sscanf(str, "%d C", &temp); err != nil {
@ -491,13 +966,34 @@ func parseTemperature(fields, deviceFields map[string]interface{}, str string) e
return nil
}
// parseTemperatureSensor reads a temperature of the form "<n> C" from str and
// stores the integer in fields under "raw_value". The deviceFields argument is
// not used. It returns an error, leaving fields untouched, when str does not
// match the expected form.
func parseTemperatureSensor(fields, deviceFields map[string]interface{}, str string) error {
	var celsius int64

	_, err := fmt.Sscanf(str, "%d C", &celsius)
	if err != nil {
		return err
	}

	fields["raw_value"] = celsius
	return nil
}
// validatePath checks that path points to an existing regular file.
// It returns an error when the path does not exist, when it cannot be
// inspected for any other reason (e.g. a path component is not a directory or
// access is denied), or when it refers to something other than a regular file
// such as a directory. It returns nil otherwise.
func validatePath(path string) error {
	pathInfo, err := os.Stat(path)
	if os.IsNotExist(err) {
		return fmt.Errorf("provided path does not exist: [%s]", path)
	}
	// Any other stat failure leaves pathInfo nil, so it must be handled before
	// the file mode is inspected.
	if err != nil {
		return fmt.Errorf("cannot access provided path [%s]: %w", path, err)
	}
	if mode := pathInfo.Mode(); !mode.IsRegular() {
		return fmt.Errorf("provided path does not point to a regular file: [%s]", path)
	}
	return nil
}
func init() {
// Set LC_NUMERIC to uniform numeric output from cli tools
_ = os.Setenv("LC_NUMERIC", "en_US.UTF-8")
inputs.Add("smart", func() telegraf.Input {
m := NewSmart()
path, _ := exec.LookPath("smartctl")
if len(path) > 0 {
m.Path = path
}
m.Nocheck = "standby"
return m
})

File diff suppressed because it is too large Load Diff