Add more missing NVMe attributes to smart plugin (#8113)
This commit is contained in:
parent
ca7252c641
commit
e1cb269a35
|
|
@ -1,6 +1,6 @@
|
|||
# S.M.A.R.T. Input Plugin
|
||||
|
||||
Get metrics using the command line utility `smartctl` for S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage devices. SMART is a monitoring system included in computer hard disk drives (HDDs) and solid-state drives (SSDs)[1] that detects and reports on various indicators of drive reliability, with the intent of enabling the anticipation of hardware failures.
|
||||
Get metrics using the command line utility `smartctl` for S.M.A.R.T. (Self-Monitoring, Analysis and Reporting Technology) storage devices. SMART is a monitoring system included in computer hard disk drives (HDDs) and solid-state drives (SSDs) that detects and reports on various indicators of drive reliability, with the intent of enabling the anticipation of hardware failures.
|
||||
See smartmontools (https://www.smartmontools.org/).
|
||||
|
||||
SMART information is separated between different measurements: `smart_device` is used for general information, while `smart_attribute` stores the detailed attribute information if `attributes = true` is enabled in the plugin configuration.
|
||||
|
|
@ -19,29 +19,72 @@ smartctl --info --attributes --health -n <nocheck> --format=brief <device>
|
|||
|
||||
This plugin supports _smartmontools_ version 5.41 and above, but v. 5.41 and v. 5.42
|
||||
might require setting `nocheck`, see the comment in the sample configuration.
|
||||
Also, NVMe capabilities were introduced in version 6.5.
|
||||
|
||||
To enable SMART on a storage device run:
|
||||
|
||||
```
|
||||
smartctl -s on <device>
|
||||
```
|
||||
## NVMe vendor specific attributes
|
||||
|
||||
### Configuration
|
||||
For the NVMe disk type, the plugin can use the command line utility `nvme-cli`. It has a feature
|
||||
to easily access vendor specific attributes.
|
||||
This plugin supports nvme-cli version 1.5 and above (https://github.com/linux-nvme/nvme-cli).
|
||||
If `nvme-cli` is absent, NVMe vendor specific metrics will not be obtained.
|
||||
|
||||
Vendor specific SMART metrics for NVMe disks may be reported from the following `nvme` command:
|
||||
|
||||
```
|
||||
nvme <vendor> smart-log-add <device>
|
||||
```
|
||||
|
||||
Note that vendor plugins for `nvme-cli` may require a different naming convention and report format.
|
||||
|
||||
To see the installed plugin extensions, which depend on the nvme-cli version, look at the bottom of the output of:
|
||||
```
|
||||
nvme help
|
||||
```
|
||||
|
||||
To gather the disk vendor id (vid), `id-ctrl` can be used:
|
||||
```
|
||||
nvme id-ctrl <device>
|
||||
```
|
||||
The association between a vid and a company can be found here: https://pcisig.com/membership/member-companies.
|
||||
|
||||
Whether a device is NVMe or non-NVMe is determined using:
|
||||
```
|
||||
smartctl --scan
|
||||
```
|
||||
and:
|
||||
```
|
||||
smartctl --scan -d nvme
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
```toml
|
||||
# Read metrics from storage devices supporting S.M.A.R.T.
|
||||
[[inputs.smart]]
|
||||
## Optionally specify the path to the smartctl executable
|
||||
# path = "/usr/bin/smartctl"
|
||||
# path_smartctl = "/usr/bin/smartctl"
|
||||
|
||||
## On most platforms smartctl requires root access.
|
||||
## Setting 'use_sudo' to true will make use of sudo to run smartctl.
|
||||
## Sudo must be configured to to allow the telegraf user to run smartctl
|
||||
## Optionally specify the path to the nvme-cli executable
|
||||
# path_nvme = "/usr/bin/nvme"
|
||||
|
||||
## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
|
||||
## ["auto-on"] - automatically find and enable additional vendor specific disk info
|
||||
## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
|
||||
# enable_extensions = ["auto-on"]
|
||||
|
||||
## On most platforms used cli utilities requires root access.
|
||||
## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
|
||||
## Sudo must be configured to allow the telegraf user to run smartctl or nvme-cli
|
||||
## without a password.
|
||||
# use_sudo = false
|
||||
|
||||
## Skip checking disks in this power mode. Defaults to
|
||||
## "standby" to not wake up disks that have stoped rotating.
|
||||
## "standby" to not wake up disks that have stopped rotating.
|
||||
## See --nocheck in the man pages for smartctl.
|
||||
## smartctl version 5.41 and 5.42 have faulty detection of
|
||||
## power mode and might require changing this value to
|
||||
|
|
@ -49,28 +92,26 @@ smartctl -s on <device>
|
|||
# nocheck = "standby"
|
||||
|
||||
## Gather all returned S.M.A.R.T. attribute metrics and the detailed
|
||||
## information from each drive into the `smart_attribute` measurement.
|
||||
## information from each drive into the 'smart_attribute' measurement.
|
||||
# attributes = false
|
||||
|
||||
## Optionally specify devices to exclude from reporting.
|
||||
## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
|
||||
# excludes = [ "/dev/pass6" ]
|
||||
|
||||
## Optionally specify devices and device type, if unset
|
||||
## a scan (smartctl --scan) for S.M.A.R.T. devices will
|
||||
## done and all found will be included except for the
|
||||
## excluded in excludes.
|
||||
# devices = [ "/dev/ada0 -d atacam" ]
|
||||
## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
|
||||
## and all found will be included except for the excluded in excludes.
|
||||
# devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]
|
||||
|
||||
## Timeout for the smartctl command to complete.
|
||||
## Timeout for the cli command to complete.
|
||||
# timeout = "30s"
|
||||
```
|
||||
|
||||
### Permissions
|
||||
## Permissions
|
||||
|
||||
It's important to note that this plugin references smartctl, which may require additional permissions to execute successfully.
|
||||
It's important to note that this plugin references smartctl and nvme-cli, which may require additional permissions to execute successfully.
|
||||
Depending on the user/group permissions of the telegraf user executing this plugin, you may need to use sudo.
|
||||
|
||||
|
||||
You will need the following in your telegraf config:
|
||||
```toml
|
||||
[[inputs.smart]]
|
||||
|
|
@ -80,13 +121,20 @@ You will need the following in your telegraf config:
|
|||
You will also need to update your sudoers file:
|
||||
```bash
|
||||
$ visudo
|
||||
# Add the following line:
|
||||
# For smartctl add the following lines:
|
||||
Cmnd_Alias SMARTCTL = /usr/bin/smartctl
|
||||
telegraf ALL=(ALL) NOPASSWD: SMARTCTL
|
||||
Defaults!SMARTCTL !logfile, !syslog, !pam_session
|
||||
```
|
||||
|
||||
### Metrics
|
||||
# For nvme-cli add the following lines:
|
||||
Cmnd_Alias NVME = /path/to/nvme
|
||||
telegraf ALL=(ALL) NOPASSWD: NVME
|
||||
Defaults!NVME !logfile, !syslog, !pam_session
|
||||
```
|
||||
To run smartctl or nvme with `sudo`, a wrapper script can be created. `path_smartctl` or
|
||||
`path_nvme` in the configuration should be set to execute this script.
|
||||
|
||||
## Metrics
|
||||
|
||||
- smart_device:
|
||||
- tags:
|
||||
|
|
@ -135,37 +183,44 @@ The interpretation of the tag `flags` is:
|
|||
|
||||
#### Exit Status
|
||||
|
||||
The `exit_status` field captures the exit status of the smartctl command which
|
||||
The `exit_status` field captures the exit status of the used cli utilities command which
|
||||
is defined by a bitmask. For the interpretation of the bitmask see the man page for
|
||||
smartctl.
|
||||
|
||||
#### Device Names
|
||||
smartctl or nvme-cli.
|
||||
|
||||
## Device Names
|
||||
Device names, e.g., `/dev/sda`, are *not persistent*, and may be
|
||||
subject to change across reboots or system changes. Instead, you can the
|
||||
subject to change across reboots or system changes. Instead, you can use the
|
||||
*World Wide Name* (WWN) or serial number to identify devices. On Linux block
|
||||
devices can be referenced by the WWN in the following location:
|
||||
`/dev/disk/by-id/`.
|
||||
|
||||
To run `smartctl` with `sudo` create a wrapper script and use `path` in
|
||||
the configuration to execute that.
|
||||
|
||||
### Troubleshooting
|
||||
## Troubleshooting
|
||||
If you expect to see more SMART metrics than this plugin shows, be sure to use a proper version
|
||||
of smartctl or nvme-cli utility which has the functionality to gather desired data. Also, check
|
||||
your device capability because not all SMART metrics are mandatory.
|
||||
For example the number of temperature sensors depends on the device specification.
|
||||
|
||||
If this plugin is not working as expected for your SMART enabled device,
|
||||
please run these commands and include the output in a bug report:
|
||||
|
||||
For non NVMe devices (from smartctl version >= 7.0 this will also return NVMe devices by default):
|
||||
```
|
||||
smartctl --scan
|
||||
```
|
||||
|
||||
For NVMe devices:
|
||||
```
|
||||
smartctl --scan -d nvme
|
||||
```
|
||||
Run the following command replacing your configuration setting for NOCHECK and
|
||||
the DEVICE from the previous command:
|
||||
the DEVICE (name of the device could be taken from the previous command):
|
||||
```
|
||||
smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE
|
||||
```
|
||||
|
||||
### Example Output
|
||||
|
||||
If you try to gather vendor specific metrics, please provide this command
|
||||
and replace vendor and device to match your case:
|
||||
```
|
||||
nvme VENDOR smart-log-add DEVICE
|
||||
```
|
||||
## Example SMART Plugin Outputs
|
||||
```
|
||||
smart_device,enabled=Enabled,host=mbpro.local,device=rdisk0,model=APPLE\ SSD\ SM0512F,serial_no=S1K5NYCD964433,wwn=5002538655584d30,capacity=500277790720 udma_crc_errors=0i,exit_status=0i,health_ok=true,read_error_rate=0i,temp_c=40i 1502536854000000000
|
||||
smart_attribute,capacity=500277790720,device=rdisk0,enabled=Enabled,fail=-,flags=-O-RC-,host=mbpro.local,id=199,model=APPLE\ SSD\ SM0512F,name=UDMA_CRC_Error_Count,serial_no=S1K5NYCD964433,wwn=5002538655584d30 exit_status=0i,raw_value=0i,threshold=0i,value=200i,worst=200i 1502536854000000000
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package smart
|
|||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"regexp"
|
||||
|
|
@ -11,12 +12,15 @@ import (
|
|||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
"unicode"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/internal"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
||||
const IntelVID = "0x8086"
|
||||
|
||||
var (
|
||||
// Device Model: APPLE SSD SM256E
|
||||
// Product: HUH721212AL5204
|
||||
|
|
@ -27,7 +31,7 @@ var (
|
|||
// LU WWN Device Id: 5 002538 655584d30
|
||||
wwnInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$")
|
||||
// User Capacity: 251,000,193,024 bytes [251 GB]
|
||||
usercapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$")
|
||||
userCapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$")
|
||||
// SMART support is: Enabled
|
||||
smartEnabledInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$")
|
||||
// SMART overall-health self-assessment test result: PASSED
|
||||
|
|
@ -44,6 +48,15 @@ var (
|
|||
// 192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716
|
||||
attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9-]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$")
|
||||
|
||||
// Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff
|
||||
// key normalized raw
|
||||
// program_fail_count : 100% 0
|
||||
intelExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\w\s]+)%(.+)`)
|
||||
|
||||
// vid : 0x8086
|
||||
// sn : CFGT53260XSP8011P
|
||||
nvmeIdCtrlExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\s\w]+)`)
|
||||
|
||||
deviceFieldIds = map[string]string{
|
||||
"1": "read_error_rate",
|
||||
"7": "seek_error_rate",
|
||||
|
|
@ -52,6 +65,7 @@ var (
|
|||
"199": "udma_crc_errors",
|
||||
}
|
||||
|
||||
// to obtain metrics from smartctl
|
||||
sasNvmeAttributes = map[string]struct {
|
||||
ID string
|
||||
Name string
|
||||
|
|
@ -146,31 +160,154 @@ var (
|
|||
Name: "Critical_Temperature_Time",
|
||||
Parse: parseCommaSeparatedInt,
|
||||
},
|
||||
"Thermal Temp. 1 Transition Count": {
|
||||
Name: "Thermal_Management_T1_Trans_Count",
|
||||
Parse: parseCommaSeparatedInt,
|
||||
},
|
||||
"Thermal Temp. 2 Transition Count": {
|
||||
Name: "Thermal_Management_T2_Trans_Count",
|
||||
Parse: parseCommaSeparatedInt,
|
||||
},
|
||||
"Thermal Temp. 1 Total Time": {
|
||||
Name: "Thermal_Management_T1_Total_Time",
|
||||
Parse: parseCommaSeparatedInt,
|
||||
},
|
||||
"Thermal Temp. 2 Total Time": {
|
||||
Name: "Thermal_Management_T2_Total_Time",
|
||||
Parse: parseCommaSeparatedInt,
|
||||
},
|
||||
"Temperature Sensor 1": {
|
||||
Name: "Temperature_Sensor_1",
|
||||
Parse: parseTemperatureSensor,
|
||||
},
|
||||
"Temperature Sensor 2": {
|
||||
Name: "Temperature_Sensor_2",
|
||||
Parse: parseTemperatureSensor,
|
||||
},
|
||||
"Temperature Sensor 3": {
|
||||
Name: "Temperature_Sensor_3",
|
||||
Parse: parseTemperatureSensor,
|
||||
},
|
||||
"Temperature Sensor 4": {
|
||||
Name: "Temperature_Sensor_4",
|
||||
Parse: parseTemperatureSensor,
|
||||
},
|
||||
"Temperature Sensor 5": {
|
||||
Name: "Temperature_Sensor_5",
|
||||
Parse: parseTemperatureSensor,
|
||||
},
|
||||
"Temperature Sensor 6": {
|
||||
Name: "Temperature_Sensor_6",
|
||||
Parse: parseTemperatureSensor,
|
||||
},
|
||||
"Temperature Sensor 7": {
|
||||
Name: "Temperature_Sensor_7",
|
||||
Parse: parseTemperatureSensor,
|
||||
},
|
||||
"Temperature Sensor 8": {
|
||||
Name: "Temperature_Sensor_8",
|
||||
Parse: parseTemperatureSensor,
|
||||
},
|
||||
}
|
||||
|
||||
// to obtain Intel specific metrics from nvme-cli
|
||||
intelAttributes = map[string]struct {
|
||||
ID string
|
||||
Name string
|
||||
Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error
|
||||
}{
|
||||
"program_fail_count": {
|
||||
Name: "Program_Fail_Count",
|
||||
},
|
||||
"erase_fail_count": {
|
||||
Name: "Erase_Fail_Count",
|
||||
},
|
||||
"end_to_end_error_detection_count": {
|
||||
Name: "End_To_End_Error_Detection_Count",
|
||||
},
|
||||
"crc_error_count": {
|
||||
Name: "Crc_Error_Count",
|
||||
},
|
||||
"retry_buffer_overflow_count": {
|
||||
Name: "Retry_Buffer_Overflow_Count",
|
||||
},
|
||||
"wear_leveling": {
|
||||
Name: "Wear_Leveling",
|
||||
Parse: parseWearLeveling,
|
||||
},
|
||||
"timed_workload_media_wear": {
|
||||
Name: "Timed_Workload_Media_Wear",
|
||||
Parse: parseTimedWorkload,
|
||||
},
|
||||
"timed_workload_host_reads": {
|
||||
Name: "Timed_Workload_Host_Reads",
|
||||
Parse: parseTimedWorkload,
|
||||
},
|
||||
"timed_workload_timer": {
|
||||
Name: "Timed_Workload_Timer",
|
||||
Parse: func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
|
||||
return parseCommaSeparatedIntWithAccumulator(acc, fields, tags, strings.TrimSuffix(str, " min"))
|
||||
},
|
||||
},
|
||||
"thermal_throttle_status": {
|
||||
Name: "Thermal_Throttle_Status",
|
||||
Parse: parseThermalThrottle,
|
||||
},
|
||||
"pll_lock_loss_count": {
|
||||
Name: "Pll_Lock_Loss_Count",
|
||||
},
|
||||
"nand_bytes_written": {
|
||||
Name: "Nand_Bytes_Written",
|
||||
Parse: parseBytesWritten,
|
||||
},
|
||||
"host_bytes_written": {
|
||||
Name: "Host_Bytes_Written",
|
||||
Parse: parseBytesWritten,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
// NVMeDevice holds the identifying details of a single NVMe disk. Values are
// filled in by getDeviceInfoForNVMeDisks from the output of `nvme id-ctrl`.
type NVMeDevice struct {
	// name is the device string the disk was scanned or configured with.
	name string
	// vendorID is the "vid" value reported by id-ctrl (e.g. "0x8086").
	vendorID string
	// model is the "mn" value reported by id-ctrl.
	model string
	// serialNumber is the "sn" value reported by id-ctrl.
	serialNumber string
}
|
||||
|
||||
type Smart struct {
|
||||
Path string
|
||||
Nocheck string
|
||||
Attributes bool
|
||||
Excludes []string
|
||||
Devices []string
|
||||
UseSudo bool
|
||||
Timeout internal.Duration
|
||||
Path string `toml:"path"` //deprecated - to keep backward compatibility
|
||||
PathSmartctl string `toml:"path_smartctl"`
|
||||
PathNVMe string `toml:"path_nvme"`
|
||||
Nocheck string `toml:"nocheck"`
|
||||
EnableExtensions []string `toml:"enable_extensions"`
|
||||
Attributes bool `toml:"attributes"`
|
||||
Excludes []string `toml:"excludes"`
|
||||
Devices []string `toml:"devices"`
|
||||
UseSudo bool `toml:"use_sudo"`
|
||||
Timeout internal.Duration `toml:"timeout"`
|
||||
Log telegraf.Logger `toml:"-"`
|
||||
}
|
||||
|
||||
var sampleConfig = `
|
||||
## Optionally specify the path to the smartctl executable
|
||||
# path = "/usr/bin/smartctl"
|
||||
# path_smartctl = "/usr/bin/smartctl"
|
||||
|
||||
## On most platforms smartctl requires root access.
|
||||
## Setting 'use_sudo' to true will make use of sudo to run smartctl.
|
||||
## Sudo must be configured to to allow the telegraf user to run smartctl
|
||||
## Optionally specify the path to the nvme-cli executable
|
||||
# path_nvme = "/usr/bin/nvme"
|
||||
|
||||
## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
|
||||
## ["auto-on"] - automatically find and enable additional vendor specific disk info
|
||||
## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
|
||||
# enable_extensions = ["auto-on"]
|
||||
|
||||
## On most platforms used cli utilities requires root access.
|
||||
## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
|
||||
## Sudo must be configured to allow the telegraf user to run smartctl or nvme-cli
|
||||
## without a password.
|
||||
# use_sudo = false
|
||||
|
||||
## Skip checking disks in this power mode. Defaults to
|
||||
## "standby" to not wake up disks that have stoped rotating.
|
||||
## "standby" to not wake up disks that have stopped rotating.
|
||||
## See --nocheck in the man pages for smartctl.
|
||||
## smartctl version 5.41 and 5.42 have faulty detection of
|
||||
## power mode and might require changing this value to
|
||||
|
|
@ -181,16 +318,15 @@ var sampleConfig = `
|
|||
## information from each drive into the 'smart_attribute' measurement.
|
||||
# attributes = false
|
||||
|
||||
## Optionally specify devices to exclude from reporting.
|
||||
## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
|
||||
# excludes = [ "/dev/pass6" ]
|
||||
|
||||
## Optionally specify devices and device type, if unset
|
||||
## a scan (smartctl --scan) for S.M.A.R.T. devices will
|
||||
## done and all found will be included except for the
|
||||
## excluded in excludes.
|
||||
# devices = [ "/dev/ada0 -d atacam" ]
|
||||
## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
|
||||
## and all found will be included except for the excluded in excludes.
|
||||
# devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]
|
||||
|
||||
## Timeout for the smartctl command to complete.
|
||||
## Timeout for the cli command to complete.
|
||||
# timeout = "30s"
|
||||
`
|
||||
|
||||
|
|
@ -208,22 +344,159 @@ func (m *Smart) Description() string {
|
|||
return "Read metrics from storage devices supporting S.M.A.R.T."
|
||||
}
|
||||
|
||||
func (m *Smart) Gather(acc telegraf.Accumulator) error {
|
||||
if len(m.Path) == 0 {
|
||||
return fmt.Errorf("smartctl not found: verify that smartctl is installed and that smartctl is in your PATH")
|
||||
func (m *Smart) Init() error {
|
||||
//if deprecated `path` (to smartctl binary) is provided in config and `path_smartctl` override does not exist
|
||||
if len(m.Path) > 0 && len(m.PathSmartctl) == 0 {
|
||||
m.PathSmartctl = m.Path
|
||||
}
|
||||
|
||||
devices := m.Devices
|
||||
if len(devices) == 0 {
|
||||
//if `path_smartctl` is not provided in config, try to find smartctl binary in PATH
|
||||
if len(m.PathSmartctl) == 0 {
|
||||
m.PathSmartctl, _ = exec.LookPath("smartctl")
|
||||
}
|
||||
|
||||
//if `path_nvme` is not provided in config, try to find nvme binary in PATH
|
||||
if len(m.PathNVMe) == 0 {
|
||||
m.PathNVMe, _ = exec.LookPath("nvme")
|
||||
}
|
||||
|
||||
err := validatePath(m.PathSmartctl)
|
||||
if err != nil {
|
||||
m.PathSmartctl = ""
|
||||
//without smartctl, plugin will not be able to gather basic metrics
|
||||
return fmt.Errorf("smartctl not found: verify that smartctl is installed and it is in your PATH (or specified in config): %s", err.Error())
|
||||
}
|
||||
|
||||
err = validatePath(m.PathNVMe)
|
||||
if err != nil {
|
||||
m.PathNVMe = ""
|
||||
//without nvme, plugin will not be able to gather vendor specific attributes (but it can work without it)
|
||||
m.Log.Warnf("nvme not found: verify that nvme is installed and it is in your PATH (or specified in config) to gather vendor specific attributes: %s", err.Error())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Smart) Gather(acc telegraf.Accumulator) error {
|
||||
var err error
|
||||
devices, err = m.scan()
|
||||
var scannedNVMeDevices []string
|
||||
var scannedNonNVMeDevices []string
|
||||
|
||||
devicesFromConfig := m.Devices
|
||||
isNVMe := len(m.PathNVMe) != 0
|
||||
isVendorExtension := len(m.EnableExtensions) != 0
|
||||
|
||||
if len(m.Devices) != 0 {
|
||||
devicesFromConfig = excludeWrongDeviceNames(devicesFromConfig)
|
||||
|
||||
m.getAttributes(acc, devicesFromConfig)
|
||||
|
||||
// if nvme-cli is present, vendor specific attributes can be gathered
|
||||
if isVendorExtension && isNVMe {
|
||||
scannedNVMeDevices, scannedNonNVMeDevices, err = m.scanAllDevices(true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
NVMeDevices := distinguishNVMeDevices(devicesFromConfig, scannedNVMeDevices)
|
||||
|
||||
m.getVendorNVMeAttributes(acc, NVMeDevices)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
scannedNVMeDevices, scannedNonNVMeDevices, err = m.scanAllDevices(false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var devicesFromScan []string
|
||||
devicesFromScan = append(devicesFromScan, scannedNVMeDevices...)
|
||||
devicesFromScan = append(devicesFromScan, scannedNonNVMeDevices...)
|
||||
|
||||
m.getAttributes(acc, devicesFromScan)
|
||||
if isVendorExtension && isNVMe {
|
||||
m.getVendorNVMeAttributes(acc, scannedNVMeDevices)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
m.getAttributes(acc, devices)
|
||||
return nil
|
||||
// excludeWrongDeviceNames validates the device entries taken from the
// configuration and returns only the well-formed ones, in their original
// order, to avoid unwanted behaviours.
//
// An entry is kept when every rune in it is a Unicode letter, a Unicode
// number, or one of: space, '/', '\', '-', ','. Any other rune rejects the
// whole entry. The result is nil when no entry is valid.
func excludeWrongDeviceNames(devices []string) []string {
	var valid []string
	for _, device := range devices {
		if isValidDeviceName(device) {
			valid = append(valid, device)
		}
	}
	return valid
}

// isValidDeviceName reports whether device consists only of permitted runes.
// It stops at the first offending rune instead of scanning the whole string.
func isValidDeviceName(device string) bool {
	for _, r := range device {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			continue
		}
		switch r {
		case ' ', '/', '\\', '-', ',':
			// Permitted separators used in device paths and smartctl arguments.
			continue
		}
		return false
	}
	return true
}
|
||||
|
||||
func (m *Smart) scanAllDevices(ignoreExcludes bool) ([]string, []string, error) {
|
||||
// this will return all devices (including NVMe devices) for smartctl version >= 7.0
|
||||
// for older versions this will return non NVMe devices
|
||||
devices, err := m.scanDevices(ignoreExcludes, "--scan")
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// this will return only NVMe devices
|
||||
NVMeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme")
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// to handle all versions of smartctl this will return only non NVMe devices
|
||||
nonNVMeDevices := difference(devices, NVMeDevices)
|
||||
return NVMeDevices, nonNVMeDevices, nil
|
||||
}
|
||||
|
||||
// distinguishNVMeDevices returns the entries of userDevices that correspond to
// one of the availableNVMeDevices, preserving the order of userDevices.
//
// An entry corresponds to an NVMe device when either string contains the
// other, so a configured entry with extra arguments, or a name such as
// "nvme0" versus "nvme0n1", still matches. Each user device is reported at
// most once, even if it matches several available NVMe devices. The result
// is nil when nothing matches.
func distinguishNVMeDevices(userDevices []string, availableNVMeDevices []string) []string {
	var nvmeDevices []string

	for _, userDevice := range userDevices {
		for _, available := range availableNVMeDevices {
			// double check. E.g. in case when nvme0 is equal nvme0n1, will check if "nvme0" part is present.
			if strings.Contains(available, userDevice) || strings.Contains(userDevice, available) {
				nvmeDevices = append(nvmeDevices, userDevice)
				// Stop at the first match so the same device is not queued twice.
				break
			}
		}
	}
	return nvmeDevices
}
|
||||
|
||||
// Scan for S.M.A.R.T. devices from smartctl
|
||||
func (m *Smart) scanDevices(ignoreExcludes bool, scanArgs ...string) ([]string, error) {
|
||||
out, err := runCmd(m.Timeout, m.UseSudo, m.PathSmartctl, scanArgs...)
|
||||
if err != nil {
|
||||
return []string{}, fmt.Errorf("failed to run command '%s %s': %s - %s", m.PathSmartctl, scanArgs, err, string(out))
|
||||
}
|
||||
var devices []string
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
dev := strings.Split(line, " ")
|
||||
if len(dev) <= 1 {
|
||||
continue
|
||||
}
|
||||
if !ignoreExcludes {
|
||||
if !excludedDev(m.Excludes, strings.TrimSpace(dev[0])) {
|
||||
devices = append(devices, strings.TrimSpace(dev[0]))
|
||||
}
|
||||
} else {
|
||||
devices = append(devices, strings.TrimSpace(dev[0]))
|
||||
}
|
||||
}
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// Wrap with sudo
|
||||
|
|
@ -235,23 +508,6 @@ var runCmd = func(timeout internal.Duration, sudo bool, command string, args ...
|
|||
return internal.CombinedOutputTimeout(cmd, timeout.Duration)
|
||||
}
|
||||
|
||||
// Scan for S.M.A.R.T. devices
|
||||
func (m *Smart) scan() ([]string, error) {
|
||||
out, err := runCmd(m.Timeout, m.UseSudo, m.Path, "--scan")
|
||||
if err != nil {
|
||||
return []string{}, fmt.Errorf("failed to run command '%s --scan': %s - %s", m.Path, err, string(out))
|
||||
}
|
||||
|
||||
devices := []string{}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
dev := strings.Split(line, " ")
|
||||
if len(dev) > 1 && !excludedDev(m.Excludes, strings.TrimSpace(dev[0])) {
|
||||
devices = append(devices, strings.TrimSpace(dev[0]))
|
||||
}
|
||||
}
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
func excludedDev(excludes []string, deviceLine string) bool {
|
||||
device := strings.Split(deviceLine, " ")
|
||||
if len(device) != 0 {
|
||||
|
|
@ -270,21 +526,137 @@ func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) {
|
|||
wg.Add(len(devices))
|
||||
|
||||
for _, device := range devices {
|
||||
go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.Path, m.Nocheck, device, &wg)
|
||||
go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.PathSmartctl, m.Nocheck, device, &wg)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// Command line parse errors are denoted by the exit code having the 0 bit set.
|
||||
// All other errors are drive/communication errors and should be ignored.
|
||||
func exitStatus(err error) (int, error) {
|
||||
if exiterr, ok := err.(*exec.ExitError); ok {
|
||||
if status, ok := exiterr.Sys().(syscall.WaitStatus); ok {
|
||||
return status.ExitStatus(), nil
|
||||
func (m *Smart) getVendorNVMeAttributes(acc telegraf.Accumulator, devices []string) {
|
||||
NVMeDevices := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for _, device := range NVMeDevices {
|
||||
if contains(m.EnableExtensions, "auto-on") {
|
||||
switch device.vendorID {
|
||||
case IntelVID:
|
||||
wg.Add(1)
|
||||
go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
|
||||
}
|
||||
} else if contains(m.EnableExtensions, "Intel") && device.vendorID == IntelVID {
|
||||
wg.Add(1)
|
||||
go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg)
|
||||
}
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func getDeviceInfoForNVMeDisks(acc telegraf.Accumulator, devices []string, nvme string, timeout internal.Duration, useSudo bool) []NVMeDevice {
|
||||
var NVMeDevices []NVMeDevice
|
||||
|
||||
for _, device := range devices {
|
||||
vid, sn, mn, err := gatherNVMeDeviceInfo(nvme, device, timeout, useSudo)
|
||||
if err != nil {
|
||||
acc.AddError(fmt.Errorf("cannot find device info for %s device", device))
|
||||
continue
|
||||
}
|
||||
newDevice := NVMeDevice{
|
||||
name: device,
|
||||
vendorID: vid,
|
||||
model: mn,
|
||||
serialNumber: sn,
|
||||
}
|
||||
NVMeDevices = append(NVMeDevices, newDevice)
|
||||
}
|
||||
return NVMeDevices
|
||||
}
|
||||
|
||||
func gatherNVMeDeviceInfo(nvme, device string, timeout internal.Duration, useSudo bool) (string, string, string, error) {
|
||||
args := []string{"id-ctrl"}
|
||||
args = append(args, strings.Split(device, " ")...)
|
||||
out, err := runCmd(timeout, useSudo, nvme, args...)
|
||||
if err != nil {
|
||||
return "", "", "", err
|
||||
}
|
||||
outStr := string(out)
|
||||
|
||||
vid, sn, mn, err := findNVMeDeviceInfo(outStr)
|
||||
|
||||
return vid, sn, mn, err
|
||||
}
|
||||
|
||||
func findNVMeDeviceInfo(output string) (string, string, string, error) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(output))
|
||||
var vid, sn, mn string
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if matches := nvmeIdCtrlExpressionPattern.FindStringSubmatch(line); len(matches) > 2 {
|
||||
matches[1] = strings.TrimSpace(matches[1])
|
||||
matches[2] = strings.TrimSpace(matches[2])
|
||||
if matches[1] == "vid" {
|
||||
if _, err := fmt.Sscanf(matches[2], "%s", &vid); err != nil {
|
||||
return "", "", "", err
|
||||
}
|
||||
}
|
||||
if matches[1] == "sn" {
|
||||
sn = matches[2]
|
||||
}
|
||||
if matches[1] == "mn" {
|
||||
mn = matches[2]
|
||||
}
|
||||
}
|
||||
}
|
||||
return vid, sn, mn, nil
|
||||
}
|
||||
|
||||
func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo bool, nvme string, device NVMeDevice, wg *sync.WaitGroup) {
|
||||
defer wg.Done()
|
||||
|
||||
args := []string{"intel", "smart-log-add"}
|
||||
args = append(args, strings.Split(device.name, " ")...)
|
||||
out, e := runCmd(timeout, usesudo, nvme, args...)
|
||||
outStr := string(out)
|
||||
|
||||
_, er := exitStatus(e)
|
||||
if er != nil {
|
||||
acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", nvme, strings.Join(args, " "), e, outStr))
|
||||
return
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(strings.NewReader(outStr))
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
tags := map[string]string{}
|
||||
fields := make(map[string]interface{})
|
||||
|
||||
tags["device"] = path.Base(device.name)
|
||||
tags["model"] = device.model
|
||||
tags["serial_no"] = device.serialNumber
|
||||
|
||||
if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 {
|
||||
matches[1] = strings.TrimSpace(matches[1])
|
||||
matches[3] = strings.TrimSpace(matches[3])
|
||||
if attr, ok := intelAttributes[matches[1]]; ok {
|
||||
tags["name"] = attr.Name
|
||||
if attr.ID != "" {
|
||||
tags["id"] = attr.ID
|
||||
}
|
||||
|
||||
parse := parseCommaSeparatedIntWithAccumulator
|
||||
if attr.Parse != nil {
|
||||
parse = attr.Parse
|
||||
}
|
||||
|
||||
if err := parse(acc, fields, tags, matches[3]); err != nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, err
|
||||
}
|
||||
|
||||
func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) {
|
||||
|
|
@ -328,7 +700,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
|
|||
deviceTags["wwn"] = strings.Replace(wwn[1], " ", "", -1)
|
||||
}
|
||||
|
||||
capacity := usercapacityInfo.FindStringSubmatch(line)
|
||||
capacity := userCapacityInfo.FindStringSubmatch(line)
|
||||
if len(capacity) > 1 {
|
||||
deviceTags["capacity"] = strings.Replace(capacity[1], ",", "", -1)
|
||||
}
|
||||
|
|
@ -340,7 +712,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
|
|||
|
||||
health := smartOverallHealth.FindStringSubmatch(line)
|
||||
if len(health) > 2 {
|
||||
deviceFields["health_ok"] = (health[2] == "PASSED" || health[2] == "OK")
|
||||
deviceFields["health_ok"] = health[2] == "PASSED" || health[2] == "OK"
|
||||
}
|
||||
|
||||
tags := map[string]string{}
|
||||
|
|
@ -418,6 +790,40 @@ func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, co
|
|||
acc.AddFields("smart_device", deviceFields, deviceTags)
|
||||
}
|
||||
|
||||
// exitStatus extracts a process exit code from err.
//
// When err is an *exec.ExitError whose Sys() value is a syscall.WaitStatus,
// the exit status is returned together with a nil error. For every other
// input, including a nil err, it returns 0 and err unchanged.
//
// Command line parse errors are denoted by the exit code having the 0 bit set.
// All other errors are drive/communication errors and should be ignored.
func exitStatus(err error) (int, error) {
	exitErr, isExitErr := err.(*exec.ExitError)
	if !isExitErr {
		return 0, err
	}

	waitStatus, isWaitStatus := exitErr.Sys().(syscall.WaitStatus)
	if !isWaitStatus {
		return 0, err
	}

	return waitStatus.ExitStatus(), nil
}
|
||||
|
||||
// contains reports whether element occurs anywhere in args.
func contains(args []string, element string) bool {
	for i := 0; i < len(args); i++ {
		if args[i] == element {
			return true
		}
	}
	return false
}
|
||||
|
||||
// difference returns the elements of a that do not occur in b, keeping the
// order (and any duplicates) they have in a. The result is nil when no
// element of a survives.
func difference(a, b []string) []string {
	excluded := make(map[string]struct{}, len(b))
	for i := range b {
		excluded[b[i]] = struct{}{}
	}

	var remaining []string
	for i := range a {
		if _, skip := excluded[a[i]]; skip {
			continue
		}
		remaining = append(remaining, a[i])
	}
	return remaining
}
|
||||
|
||||
func parseRawValue(rawVal string) (int64, error) {
|
||||
// Integer
|
||||
if i, err := strconv.ParseInt(rawVal, 10, 64); err == nil {
|
||||
|
|
@ -428,7 +834,7 @@ func parseRawValue(rawVal string) (int64, error) {
|
|||
unit := regexp.MustCompile("^(.*)([hms])$")
|
||||
parts := strings.Split(rawVal, "+")
|
||||
if len(parts) == 0 {
|
||||
return 0, fmt.Errorf("Couldn't parse RAW_VALUE '%s'", rawVal)
|
||||
return 0, fmt.Errorf("couldn't parse RAW_VALUE '%s'", rawVal)
|
||||
}
|
||||
|
||||
duration := int64(0)
|
||||
|
|
@ -452,6 +858,63 @@ func parseRawValue(rawVal string) (int64, error) {
|
|||
return duration, nil
|
||||
}
|
||||
|
||||
func parseBytesWritten(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
|
||||
var value int64
|
||||
|
||||
if _, err := fmt.Sscanf(str, "sectors: %d", &value); err != nil {
|
||||
return err
|
||||
}
|
||||
fields["raw_value"] = value
|
||||
acc.AddFields("smart_attribute", fields, tags)
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseThermalThrottle(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
|
||||
var percentage float64
|
||||
var count int64
|
||||
|
||||
if _, err := fmt.Sscanf(str, "%f%%, cnt: %d", &percentage, &count); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fields["raw_value"] = percentage
|
||||
tags["name"] = "Thermal_Throttle_Status_Prc"
|
||||
acc.AddFields("smart_attribute", fields, tags)
|
||||
|
||||
fields["raw_value"] = count
|
||||
tags["name"] = "Thermal_Throttle_Status_Cnt"
|
||||
acc.AddFields("smart_attribute", fields, tags)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseWearLeveling(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
|
||||
var min, max, avg int64
|
||||
|
||||
if _, err := fmt.Sscanf(str, "min: %d, max: %d, avg: %d", &min, &max, &avg); err != nil {
|
||||
return err
|
||||
}
|
||||
values := []int64{min, max, avg}
|
||||
for i, submetricName := range []string{"Min", "Max", "Avg"} {
|
||||
fields["raw_value"] = values[i]
|
||||
tags["name"] = fmt.Sprintf("Wear_Leveling_%s", submetricName)
|
||||
acc.AddFields("smart_attribute", fields, tags)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseTimedWorkload(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
|
||||
var value float64
|
||||
|
||||
if _, err := fmt.Sscanf(str, "%f", &value); err != nil {
|
||||
return err
|
||||
}
|
||||
fields["raw_value"] = value
|
||||
acc.AddFields("smart_attribute", fields, tags)
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseInt(str string) int64 {
|
||||
if i, err := strconv.ParseInt(str, 10, 64); err == nil {
|
||||
return i
|
||||
|
|
@ -460,6 +923,7 @@ func parseInt(str string) int64 {
|
|||
}
|
||||
|
||||
func parseCommaSeparatedInt(fields, _ map[string]interface{}, str string) error {
|
||||
str = strings.Join(strings.Fields(str), "")
|
||||
i, err := strconv.ParseInt(strings.Replace(str, ",", "", -1), 10, 64)
|
||||
if err != nil {
|
||||
return err
|
||||
|
|
@ -479,6 +943,17 @@ func parseDataUnits(fields, deviceFields map[string]interface{}, str string) err
|
|||
return parseCommaSeparatedInt(fields, deviceFields, units)
|
||||
}
|
||||
|
||||
func parseCommaSeparatedIntWithAccumulator(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
|
||||
i, err := strconv.ParseInt(strings.Replace(str, ",", "", -1), 10, 64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fields["raw_value"] = i
|
||||
acc.AddFields("smart_attribute", fields, tags)
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseTemperature(fields, deviceFields map[string]interface{}, str string) error {
|
||||
var temp int64
|
||||
if _, err := fmt.Sscanf(str, "%d C", &temp); err != nil {
|
||||
|
|
@ -491,13 +966,34 @@ func parseTemperature(fields, deviceFields map[string]interface{}, str string) e
|
|||
return nil
|
||||
}
|
||||
|
||||
// parseTemperatureSensor scans a "<n> C" reading out of str and stores n, as
// an int64, in fields["raw_value"]. deviceFields is not used by this function.
// The scan error is returned, and fields is left untouched, when str does not
// match that layout.
func parseTemperatureSensor(fields, deviceFields map[string]interface{}, str string) error {
	var celsius int64

	_, err := fmt.Sscanf(str, "%d C", &celsius)
	if err == nil {
		fields["raw_value"] = celsius
	}
	return err
}
|
||||
|
||||
// validatePath checks that path refers to an existing regular file.
//
// It returns an error when the path does not exist, when it cannot be
// inspected at all (for example because of insufficient permissions or an
// invalid path), or when it points to something other than a regular file,
// such as a directory. It returns nil otherwise.
func validatePath(path string) error {
	pathInfo, err := os.Stat(path)
	if os.IsNotExist(err) {
		return fmt.Errorf("provided path does not exist: [%s]", path)
	}
	// Any other Stat failure leaves pathInfo nil; report it instead of
	// dereferencing a nil FileInfo below.
	if err != nil {
		return fmt.Errorf("cannot get file information for provided path [%s]: %w", path, err)
	}
	if mode := pathInfo.Mode(); !mode.IsRegular() {
		return fmt.Errorf("provided path does not point to a regular file: [%s]", path)
	}
	return nil
}
|
||||
|
||||
func init() {
|
||||
// Set LC_NUMERIC to uniform numeric output from cli tools
|
||||
_ = os.Setenv("LC_NUMERIC", "en_US.UTF-8")
|
||||
|
||||
inputs.Add("smart", func() telegraf.Input {
|
||||
m := NewSmart()
|
||||
path, _ := exec.LookPath("smartctl")
|
||||
if len(path) > 0 {
|
||||
m.Path = path
|
||||
}
|
||||
m.Nocheck = "standby"
|
||||
return m
|
||||
})
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue