2017-10-05 06:15:58 +08:00
package smart
import (
2018-04-03 04:55:10 +08:00
"bufio"
2017-10-05 06:15:58 +08:00
"fmt"
2020-09-28 23:16:49 +08:00
"os"
2017-10-05 06:15:58 +08:00
"os/exec"
2017-12-09 05:22:41 +08:00
"path"
2017-10-05 06:15:58 +08:00
"regexp"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/influxdata/telegraf"
2021-04-10 01:15:04 +08:00
"github.com/influxdata/telegraf/config"
2017-10-05 06:15:58 +08:00
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs"
)
2020-11-13 00:04:52 +08:00
// intelVID is the vendor ID string (the "vid" value read from the
// `nvme id-ctrl` output) that selects Intel specific attribute gathering.
const intelVID = "0x8086"
2020-09-28 23:16:49 +08:00
2017-10-05 06:15:58 +08:00
var (
// Device Model: APPLE SSD SM256E
2019-05-08 06:20:03 +08:00
// Product: HUH721212AL5204
// Model Number: TS128GMTE850
2021-06-11 04:51:33 +08:00
modelInfo = regexp . MustCompile ( ` ^(Device Model|Product|Model Number):\s+(.*)$ ` )
2017-10-05 06:15:58 +08:00
// Serial Number: S0X5NZBC422720
2021-06-11 04:51:33 +08:00
serialInfo = regexp . MustCompile ( ` (?i)^Serial Number:\s+(.*)$ ` )
2017-10-05 06:15:58 +08:00
// LU WWN Device Id: 5 002538 655584d30
2021-06-11 04:51:33 +08:00
wwnInfo = regexp . MustCompile ( ` ^LU WWN Device Id:\s+(.*)$ ` )
2017-10-05 06:15:58 +08:00
// User Capacity: 251,000,193,024 bytes [251 GB]
2021-06-11 04:51:33 +08:00
userCapacityInfo = regexp . MustCompile ( ` ^User Capacity:\s+([0-9,]+)\s+bytes.*$ ` )
2017-10-05 06:15:58 +08:00
// SMART support is: Enabled
2021-06-11 04:51:33 +08:00
smartEnabledInfo = regexp . MustCompile ( ` ^SMART support is:\s+(\w+)$ ` )
// Power mode is: ACTIVE or IDLE or Power mode was: STANDBY
powermodeInfo = regexp . MustCompile ( ` ^Power mode \w+:\s+(\w+) ` )
// Device is in STANDBY mode
standbyInfo = regexp . MustCompile ( ` ^Device is in\s+(\w+) ` )
2017-10-05 06:15:58 +08:00
// SMART overall-health self-assessment test result: PASSED
2019-05-08 06:20:03 +08:00
// SMART Health Status: OK
2017-10-05 06:15:58 +08:00
// PASSED, FAILED, UNKNOWN
2021-06-11 04:51:33 +08:00
smartOverallHealth = regexp . MustCompile ( ` ^(SMART overall-health self-assessment test result|SMART Health Status):\s+(\w+).*$ ` )
2019-05-08 06:20:03 +08:00
2019-07-13 05:25:45 +08:00
// sasNvmeAttr is a SAS or NVME SMART attribute
sasNvmeAttr = regexp . MustCompile ( ` ^([^:]+):\s+(.+)$ ` )
2017-10-05 06:15:58 +08:00
// ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
// 1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0
// 5 Reallocated_Sector_Ct PO--CK 100 100 000 - 0
// 192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716
2021-06-11 04:51:33 +08:00
attribute = regexp . MustCompile ( ` ^\s*([0-9]+)\s(\S+)\s+([-P][-O][-S][-R][-C][-K])\s+([0-9]+)\s+([0-9]+)\s+([0-9-]+)\s+([-\w]+)\s+([\w\+\.]+).*$ ` )
2017-10-05 06:15:58 +08:00
2020-09-28 23:16:49 +08:00
// Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff
// key normalized raw
// program_fail_count : 100% 0
intelExpressionPattern = regexp . MustCompile ( ` ^([\w\s]+):([\w\s]+)%(.+) ` )
// vid : 0x8086
// sn : CFGT53260XSP8011P
2020-11-13 00:04:52 +08:00
nvmeIDCtrlExpressionPattern = regexp . MustCompile ( ` ^([\w\s]+):([\s\w]+) ` )
2020-09-28 23:16:49 +08:00
2017-10-05 06:15:58 +08:00
deviceFieldIds = map [ string ] string {
"1" : "read_error_rate" ,
"7" : "seek_error_rate" ,
2019-05-08 06:20:03 +08:00
"190" : "temp_c" ,
2017-10-05 06:15:58 +08:00
"194" : "temp_c" ,
"199" : "udma_crc_errors" ,
}
2019-07-13 05:25:45 +08:00
2020-09-28 23:16:49 +08:00
// to obtain metrics from smartctl
2019-07-13 05:25:45 +08:00
sasNvmeAttributes = map [ string ] struct {
ID string
Name string
Parse func ( fields , deviceFields map [ string ] interface { } , str string ) error
} {
"Accumulated start-stop cycles" : {
ID : "4" ,
Name : "Start_Stop_Count" ,
} ,
"Accumulated load-unload cycles" : {
ID : "193" ,
Name : "Load_Cycle_Count" ,
} ,
"Current Drive Temperature" : {
ID : "194" ,
Name : "Temperature_Celsius" ,
Parse : parseTemperature ,
} ,
"Temperature" : {
ID : "194" ,
Name : "Temperature_Celsius" ,
Parse : parseTemperature ,
} ,
"Power Cycles" : {
ID : "12" ,
Name : "Power_Cycle_Count" ,
} ,
"Power On Hours" : {
ID : "9" ,
Name : "Power_On_Hours" ,
} ,
"Media and Data Integrity Errors" : {
Name : "Media_and_Data_Integrity_Errors" ,
} ,
"Error Information Log Entries" : {
Name : "Error_Information_Log_Entries" ,
} ,
"Critical Warning" : {
Name : "Critical_Warning" ,
Parse : func ( fields , _ map [ string ] interface { } , str string ) error {
var value int64
if _ , err := fmt . Sscanf ( str , "0x%x" , & value ) ; err != nil {
return err
}
fields [ "raw_value" ] = value
return nil
} ,
} ,
"Available Spare" : {
2020-06-24 02:55:32 +08:00
Name : "Available_Spare" ,
Parse : parsePercentageInt ,
} ,
"Available Spare Threshold" : {
Name : "Available_Spare_Threshold" ,
Parse : parsePercentageInt ,
} ,
"Percentage Used" : {
Name : "Percentage_Used" ,
Parse : parsePercentageInt ,
} ,
"Data Units Read" : {
Name : "Data_Units_Read" ,
Parse : parseDataUnits ,
} ,
"Data Units Written" : {
Name : "Data_Units_Written" ,
Parse : parseDataUnits ,
} ,
"Host Read Commands" : {
Name : "Host_Read_Commands" ,
Parse : parseCommaSeparatedInt ,
} ,
"Host Write Commands" : {
Name : "Host_Write_Commands" ,
Parse : parseCommaSeparatedInt ,
} ,
"Controller Busy Time" : {
Name : "Controller_Busy_Time" ,
Parse : parseCommaSeparatedInt ,
} ,
"Unsafe Shutdowns" : {
Name : "Unsafe_Shutdowns" ,
Parse : parseCommaSeparatedInt ,
} ,
"Warning Comp. Temperature Time" : {
Name : "Warning_Temperature_Time" ,
Parse : parseCommaSeparatedInt ,
} ,
"Critical Comp. Temperature Time" : {
Name : "Critical_Temperature_Time" ,
Parse : parseCommaSeparatedInt ,
2019-07-13 05:25:45 +08:00
} ,
2020-09-28 23:16:49 +08:00
"Thermal Temp. 1 Transition Count" : {
Name : "Thermal_Management_T1_Trans_Count" ,
Parse : parseCommaSeparatedInt ,
} ,
"Thermal Temp. 2 Transition Count" : {
Name : "Thermal_Management_T2_Trans_Count" ,
Parse : parseCommaSeparatedInt ,
} ,
"Thermal Temp. 1 Total Time" : {
Name : "Thermal_Management_T1_Total_Time" ,
Parse : parseCommaSeparatedInt ,
} ,
"Thermal Temp. 2 Total Time" : {
Name : "Thermal_Management_T2_Total_Time" ,
Parse : parseCommaSeparatedInt ,
} ,
"Temperature Sensor 1" : {
Name : "Temperature_Sensor_1" ,
Parse : parseTemperatureSensor ,
} ,
"Temperature Sensor 2" : {
Name : "Temperature_Sensor_2" ,
Parse : parseTemperatureSensor ,
} ,
"Temperature Sensor 3" : {
Name : "Temperature_Sensor_3" ,
Parse : parseTemperatureSensor ,
} ,
"Temperature Sensor 4" : {
Name : "Temperature_Sensor_4" ,
Parse : parseTemperatureSensor ,
} ,
"Temperature Sensor 5" : {
Name : "Temperature_Sensor_5" ,
Parse : parseTemperatureSensor ,
} ,
"Temperature Sensor 6" : {
Name : "Temperature_Sensor_6" ,
Parse : parseTemperatureSensor ,
} ,
"Temperature Sensor 7" : {
Name : "Temperature_Sensor_7" ,
Parse : parseTemperatureSensor ,
} ,
"Temperature Sensor 8" : {
Name : "Temperature_Sensor_8" ,
Parse : parseTemperatureSensor ,
} ,
}
// to obtain Intel specific metrics from nvme-cli
intelAttributes = map [ string ] struct {
ID string
Name string
Parse func ( acc telegraf . Accumulator , fields map [ string ] interface { } , tags map [ string ] string , str string ) error
} {
"program_fail_count" : {
Name : "Program_Fail_Count" ,
} ,
"erase_fail_count" : {
Name : "Erase_Fail_Count" ,
} ,
"end_to_end_error_detection_count" : {
Name : "End_To_End_Error_Detection_Count" ,
} ,
"crc_error_count" : {
Name : "Crc_Error_Count" ,
} ,
"retry_buffer_overflow_count" : {
Name : "Retry_Buffer_Overflow_Count" ,
} ,
"wear_leveling" : {
Name : "Wear_Leveling" ,
Parse : parseWearLeveling ,
} ,
"timed_workload_media_wear" : {
Name : "Timed_Workload_Media_Wear" ,
Parse : parseTimedWorkload ,
} ,
"timed_workload_host_reads" : {
Name : "Timed_Workload_Host_Reads" ,
Parse : parseTimedWorkload ,
} ,
"timed_workload_timer" : {
Name : "Timed_Workload_Timer" ,
Parse : func ( acc telegraf . Accumulator , fields map [ string ] interface { } , tags map [ string ] string , str string ) error {
return parseCommaSeparatedIntWithAccumulator ( acc , fields , tags , strings . TrimSuffix ( str , " min" ) )
} ,
} ,
"thermal_throttle_status" : {
Name : "Thermal_Throttle_Status" ,
Parse : parseThermalThrottle ,
} ,
"pll_lock_loss_count" : {
Name : "Pll_Lock_Loss_Count" ,
} ,
"nand_bytes_written" : {
Name : "Nand_Bytes_Written" ,
Parse : parseBytesWritten ,
} ,
"host_bytes_written" : {
Name : "Host_Bytes_Written" ,
Parse : parseBytesWritten ,
} ,
2019-07-13 05:25:45 +08:00
}
2017-10-05 06:15:58 +08:00
)
2020-11-13 00:04:52 +08:00
// Smart plugin reads metrics from storage devices supporting S.M.A.R.T.
2017-10-05 06:15:58 +08:00
type Smart struct {
2021-04-10 01:15:04 +08:00
Path string ` toml:"path" ` //deprecated - to keep backward compatibility
PathSmartctl string ` toml:"path_smartctl" `
PathNVMe string ` toml:"path_nvme" `
Nocheck string ` toml:"nocheck" `
EnableExtensions [ ] string ` toml:"enable_extensions" `
Attributes bool ` toml:"attributes" `
Excludes [ ] string ` toml:"excludes" `
Devices [ ] string ` toml:"devices" `
UseSudo bool ` toml:"use_sudo" `
Timeout config . Duration ` toml:"timeout" `
Log telegraf . Logger ` toml:"-" `
2017-10-05 06:15:58 +08:00
}
2020-11-13 00:04:52 +08:00
// nvmeDevice holds the identity of one NVMe disk as assembled by
// getDeviceInfoForNVMeDisks.
type nvmeDevice struct {
	// name is the device string the info was queried for.
	name string
	// vendorID is the "vid" value found in the `nvme id-ctrl` output.
	vendorID string
	// model is the "mn" value found in the `nvme id-ctrl` output.
	model string
	// serialNumber is the "sn" value found in the `nvme id-ctrl` output.
	serialNumber string
}
2017-10-05 06:15:58 +08:00
// sampleConfig is the commented sample configuration returned by SampleConfig.
// Every non-blank line is a TOML comment, so the text can be pasted into a
// configuration file as-is.
var sampleConfig = `
  ## Optionally specify the path to the smartctl executable
  # path_smartctl = "/usr/bin/smartctl"

  ## Optionally specify the path to the nvme-cli executable
  # path_nvme = "/usr/bin/nvme"

  ## Optionally specify if vendor specific attributes should be propagated for NVMe disk case
  ## ["auto-on"] - automatically find and enable additional vendor specific disk info
  ## ["vendor1", "vendor2", ...] - e.g. "Intel" enable additional Intel specific disk info
  # enable_extensions = ["auto-on"]

  ## On most platforms used cli utilities requires root access.
  ## Setting 'use_sudo' to true will make use of sudo to run smartctl or nvme-cli.
  ## Sudo must be configured to allow the telegraf user to run smartctl or nvme-cli
  ## without a password.
  # use_sudo = false

  ## Skip checking disks in this power mode. Defaults to
  ## "standby" to not wake up disks that have stopped rotating.
  ## See --nocheck in the man pages for smartctl.
  ## smartctl version 5.41 and 5.42 have faulty detection of
  ## power mode and might require changing this value to
  ## "never" depending on your disks.
  # nocheck = "standby"

  ## Gather all returned S.M.A.R.T. attribute metrics and the detailed
  ## information from each drive into the 'smart_attribute' measurement.
  # attributes = false

  ## Optionally specify devices to exclude from reporting if disks auto-discovery is performed.
  # excludes = [ "/dev/pass6" ]

  ## Optionally specify devices and device type, if unset
  ## a scan (smartctl --scan and smartctl --scan -d nvme) for S.M.A.R.T. devices will be done
  ## and all found will be included except for the excluded in excludes.
  # devices = [ "/dev/ada0 -d atacam", "/dev/nvme0"]

  ## Timeout for the cli command to complete.
  # timeout = "30s"
`
2020-11-13 00:04:52 +08:00
func newSmart ( ) * Smart {
2019-08-14 01:24:44 +08:00
return & Smart {
2021-04-10 01:15:04 +08:00
Timeout : config . Duration ( time . Second * 30 ) ,
2019-08-14 01:24:44 +08:00
}
}
2020-11-13 00:04:52 +08:00
// SampleConfig returns sample configuration for this plugin.
2017-10-05 06:15:58 +08:00
func ( m * Smart ) SampleConfig ( ) string {
return sampleConfig
}
2020-11-13 00:04:52 +08:00
// Description returns the plugin description.
2017-10-05 06:15:58 +08:00
func ( m * Smart ) Description ( ) string {
return "Read metrics from storage devices supporting S.M.A.R.T."
}
2020-11-13 00:04:52 +08:00
// Init performs one time setup of the plugin and returns an error if the configuration is invalid.
2020-09-28 23:16:49 +08:00
func ( m * Smart ) Init ( ) error {
//if deprecated `path` (to smartctl binary) is provided in config and `path_smartctl` override does not exist
if len ( m . Path ) > 0 && len ( m . PathSmartctl ) == 0 {
m . PathSmartctl = m . Path
2017-10-05 06:15:58 +08:00
}
2020-09-28 23:16:49 +08:00
//if `path_smartctl` is not provided in config, try to find smartctl binary in PATH
if len ( m . PathSmartctl ) == 0 {
m . PathSmartctl , _ = exec . LookPath ( "smartctl" )
}
//if `path_nvme` is not provided in config, try to find nvme binary in PATH
if len ( m . PathNVMe ) == 0 {
m . PathNVMe , _ = exec . LookPath ( "nvme" )
}
err := validatePath ( m . PathSmartctl )
if err != nil {
m . PathSmartctl = ""
//without smartctl, plugin will not be able to gather basic metrics
return fmt . Errorf ( "smartctl not found: verify that smartctl is installed and it is in your PATH (or specified in config): %s" , err . Error ( ) )
}
err = validatePath ( m . PathNVMe )
if err != nil {
m . PathNVMe = ""
//without nvme, plugin will not be able to gather vendor specific attributes (but it can work without it)
m . Log . Warnf ( "nvme not found: verify that nvme is installed and it is in your PATH (or specified in config) to gather vendor specific attributes: %s" , err . Error ( ) )
}
return nil
}
2020-11-13 00:04:52 +08:00
// Gather takes in an accumulator and adds the metrics that the SMART tools gather.
2020-09-28 23:16:49 +08:00
func ( m * Smart ) Gather ( acc telegraf . Accumulator ) error {
var err error
var scannedNVMeDevices [ ] string
var scannedNonNVMeDevices [ ] string
devicesFromConfig := m . Devices
isNVMe := len ( m . PathNVMe ) != 0
isVendorExtension := len ( m . EnableExtensions ) != 0
if len ( m . Devices ) != 0 {
m . getAttributes ( acc , devicesFromConfig )
// if nvme-cli is present, vendor specific attributes can be gathered
if isVendorExtension && isNVMe {
2020-10-08 23:20:35 +08:00
scannedNVMeDevices , _ , err = m . scanAllDevices ( true )
2020-09-28 23:16:49 +08:00
if err != nil {
return err
}
NVMeDevices := distinguishNVMeDevices ( devicesFromConfig , scannedNVMeDevices )
m . getVendorNVMeAttributes ( acc , NVMeDevices )
2017-10-05 06:15:58 +08:00
}
2020-09-28 23:16:49 +08:00
return nil
}
scannedNVMeDevices , scannedNonNVMeDevices , err = m . scanAllDevices ( false )
if err != nil {
return err
2017-10-05 06:15:58 +08:00
}
2020-09-28 23:16:49 +08:00
var devicesFromScan [ ] string
devicesFromScan = append ( devicesFromScan , scannedNVMeDevices ... )
devicesFromScan = append ( devicesFromScan , scannedNonNVMeDevices ... )
2017-10-05 06:15:58 +08:00
2020-09-28 23:16:49 +08:00
m . getAttributes ( acc , devicesFromScan )
if isVendorExtension && isNVMe {
m . getVendorNVMeAttributes ( acc , scannedNVMeDevices )
}
2017-10-05 06:15:58 +08:00
return nil
}
2020-09-28 23:16:49 +08:00
func ( m * Smart ) scanAllDevices ( ignoreExcludes bool ) ( [ ] string , [ ] string , error ) {
// this will return all devices (including NVMe devices) for smartctl version >= 7.0
// for older versions this will return non NVMe devices
devices , err := m . scanDevices ( ignoreExcludes , "--scan" )
if err != nil {
return nil , nil , err
}
// this will return only NVMe devices
NVMeDevices , err := m . scanDevices ( ignoreExcludes , "--scan" , "--device=nvme" )
2017-10-05 06:15:58 +08:00
if err != nil {
2020-09-28 23:16:49 +08:00
return nil , nil , err
2017-10-05 06:15:58 +08:00
}
2020-09-28 23:16:49 +08:00
// to handle all versions of smartctl this will return only non NVMe devices
nonNVMeDevices := difference ( devices , NVMeDevices )
return NVMeDevices , nonNVMeDevices , nil
}
// distinguishNVMeDevices returns the entries of userDevices that correspond to
// one of availableNVMeDevices. Each user device is returned at most once, in
// its original order and with its original spelling (including any options).
func distinguishNVMeDevices(userDevices []string, availableNVMeDevices []string) []string {
	var nvmeDevices []string

	for _, userDevice := range userDevices {
		for _, available := range availableNVMeDevices {
			// double check. E.g. in case when nvme0 is equal nvme0n1, will check if "nvme0" part is present.
			if strings.Contains(available, userDevice) || strings.Contains(userDevice, available) {
				nvmeDevices = append(nvmeDevices, userDevice)
				// One match is enough: without this, a device such as
				// /dev/nvme1 would be added again for /dev/nvme10 and its
				// metrics would be gathered twice.
				break
			}
		}
	}
	return nvmeDevices
}
// Scan for S.M.A.R.T. devices from smartctl
func ( m * Smart ) scanDevices ( ignoreExcludes bool , scanArgs ... string ) ( [ ] string , error ) {
out , err := runCmd ( m . Timeout , m . UseSudo , m . PathSmartctl , scanArgs ... )
if err != nil {
return [ ] string { } , fmt . Errorf ( "failed to run command '%s %s': %s - %s" , m . PathSmartctl , scanArgs , err , string ( out ) )
}
var devices [ ] string
2017-10-05 06:15:58 +08:00
for _ , line := range strings . Split ( string ( out ) , "\n" ) {
2017-12-09 10:03:12 +08:00
dev := strings . Split ( line , " " )
2020-09-28 23:16:49 +08:00
if len ( dev ) <= 1 {
continue
}
if ! ignoreExcludes {
if ! excludedDev ( m . Excludes , strings . TrimSpace ( dev [ 0 ] ) ) {
devices = append ( devices , strings . TrimSpace ( dev [ 0 ] ) )
}
} else {
2017-10-05 06:15:58 +08:00
devices = append ( devices , strings . TrimSpace ( dev [ 0 ] ) )
}
}
return devices , nil
}
2020-09-28 23:16:49 +08:00
// Wrap with sudo
2021-04-10 01:15:04 +08:00
var runCmd = func ( timeout config . Duration , sudo bool , command string , args ... string ) ( [ ] byte , error ) {
2020-09-28 23:16:49 +08:00
cmd := exec . Command ( command , args ... )
if sudo {
cmd = exec . Command ( "sudo" , append ( [ ] string { "-n" , command } , args ... ) ... )
}
2021-04-10 01:15:04 +08:00
return internal . CombinedOutputTimeout ( cmd , time . Duration ( timeout ) )
2020-09-28 23:16:49 +08:00
}
2017-10-05 06:15:58 +08:00
// excludedDev reports whether the device named at the start of deviceLine
// (the text before the first space, or the whole line when there is none)
// is one of excludes.
func excludedDev(excludes []string, deviceLine string) bool {
	name := deviceLine
	if cut := strings.Index(deviceLine, " "); cut >= 0 {
		name = deviceLine[:cut]
	}

	for i := range excludes {
		if excludes[i] == name {
			return true
		}
	}
	return false
}
// Get info and attributes for each S.M.A.R.T. device
func ( m * Smart ) getAttributes ( acc telegraf . Accumulator , devices [ ] string ) {
var wg sync . WaitGroup
wg . Add ( len ( devices ) )
for _ , device := range devices {
2020-09-28 23:16:49 +08:00
go gatherDisk ( acc , m . Timeout , m . UseSudo , m . Attributes , m . PathSmartctl , m . Nocheck , device , & wg )
2017-10-05 06:15:58 +08:00
}
wg . Wait ( )
}
2020-09-28 23:16:49 +08:00
func ( m * Smart ) getVendorNVMeAttributes ( acc telegraf . Accumulator , devices [ ] string ) {
NVMeDevices := getDeviceInfoForNVMeDisks ( acc , devices , m . PathNVMe , m . Timeout , m . UseSudo )
var wg sync . WaitGroup
for _ , device := range NVMeDevices {
if contains ( m . EnableExtensions , "auto-on" ) {
switch device . vendorID {
2020-11-13 00:04:52 +08:00
case intelVID :
2020-09-28 23:16:49 +08:00
wg . Add ( 1 )
go gatherIntelNVMeDisk ( acc , m . Timeout , m . UseSudo , m . PathNVMe , device , & wg )
}
2020-11-13 00:04:52 +08:00
} else if contains ( m . EnableExtensions , "Intel" ) && device . vendorID == intelVID {
2020-09-28 23:16:49 +08:00
wg . Add ( 1 )
go gatherIntelNVMeDisk ( acc , m . Timeout , m . UseSudo , m . PathNVMe , device , & wg )
}
}
wg . Wait ( )
}
2021-04-10 01:15:04 +08:00
func getDeviceInfoForNVMeDisks ( acc telegraf . Accumulator , devices [ ] string , nvme string , timeout config . Duration , useSudo bool ) [ ] nvmeDevice {
2020-11-13 00:04:52 +08:00
var NVMeDevices [ ] nvmeDevice
2020-09-28 23:16:49 +08:00
for _ , device := range devices {
vid , sn , mn , err := gatherNVMeDeviceInfo ( nvme , device , timeout , useSudo )
if err != nil {
acc . AddError ( fmt . Errorf ( "cannot find device info for %s device" , device ) )
continue
}
2020-11-13 00:04:52 +08:00
newDevice := nvmeDevice {
2020-09-28 23:16:49 +08:00
name : device ,
vendorID : vid ,
model : mn ,
serialNumber : sn ,
}
NVMeDevices = append ( NVMeDevices , newDevice )
}
return NVMeDevices
}
2021-04-10 01:15:04 +08:00
func gatherNVMeDeviceInfo ( nvme , device string , timeout config . Duration , useSudo bool ) ( string , string , string , error ) {
2020-09-28 23:16:49 +08:00
args := [ ] string { "id-ctrl" }
args = append ( args , strings . Split ( device , " " ) ... )
out , err := runCmd ( timeout , useSudo , nvme , args ... )
if err != nil {
return "" , "" , "" , err
}
outStr := string ( out )
vid , sn , mn , err := findNVMeDeviceInfo ( outStr )
return vid , sn , mn , err
}
func findNVMeDeviceInfo ( output string ) ( string , string , string , error ) {
scanner := bufio . NewScanner ( strings . NewReader ( output ) )
var vid , sn , mn string
for scanner . Scan ( ) {
line := scanner . Text ( )
2020-11-13 00:04:52 +08:00
if matches := nvmeIDCtrlExpressionPattern . FindStringSubmatch ( line ) ; len ( matches ) > 2 {
2020-09-28 23:16:49 +08:00
matches [ 1 ] = strings . TrimSpace ( matches [ 1 ] )
matches [ 2 ] = strings . TrimSpace ( matches [ 2 ] )
if matches [ 1 ] == "vid" {
if _ , err := fmt . Sscanf ( matches [ 2 ] , "%s" , & vid ) ; err != nil {
return "" , "" , "" , err
}
}
if matches [ 1 ] == "sn" {
sn = matches [ 2 ]
}
if matches [ 1 ] == "mn" {
mn = matches [ 2 ]
}
}
}
return vid , sn , mn , nil
}
2021-04-10 01:15:04 +08:00
func gatherIntelNVMeDisk ( acc telegraf . Accumulator , timeout config . Duration , usesudo bool , nvme string , device nvmeDevice , wg * sync . WaitGroup ) {
2020-09-28 23:16:49 +08:00
defer wg . Done ( )
args := [ ] string { "intel" , "smart-log-add" }
args = append ( args , strings . Split ( device . name , " " ) ... )
out , e := runCmd ( timeout , usesudo , nvme , args ... )
outStr := string ( out )
_ , er := exitStatus ( e )
if er != nil {
acc . AddError ( fmt . Errorf ( "failed to run command '%s %s': %s - %s" , nvme , strings . Join ( args , " " ) , e , outStr ) )
return
}
scanner := bufio . NewScanner ( strings . NewReader ( outStr ) )
for scanner . Scan ( ) {
line := scanner . Text ( )
tags := map [ string ] string { }
fields := make ( map [ string ] interface { } )
tags [ "device" ] = path . Base ( device . name )
tags [ "model" ] = device . model
tags [ "serial_no" ] = device . serialNumber
if matches := intelExpressionPattern . FindStringSubmatch ( line ) ; len ( matches ) > 3 {
matches [ 1 ] = strings . TrimSpace ( matches [ 1 ] )
matches [ 3 ] = strings . TrimSpace ( matches [ 3 ] )
if attr , ok := intelAttributes [ matches [ 1 ] ] ; ok {
tags [ "name" ] = attr . Name
if attr . ID != "" {
tags [ "id" ] = attr . ID
}
parse := parseCommaSeparatedIntWithAccumulator
if attr . Parse != nil {
parse = attr . Parse
}
if err := parse ( acc , fields , tags , matches [ 3 ] ) ; err != nil {
continue
}
}
2017-10-05 06:15:58 +08:00
}
}
}
2021-04-10 01:15:04 +08:00
func gatherDisk ( acc telegraf . Accumulator , timeout config . Duration , usesudo , collectAttributes bool , smartctl , nocheck , device string , wg * sync . WaitGroup ) {
2017-10-05 06:15:58 +08:00
defer wg . Done ( )
// smartctl 5.41 & 5.42 have are broken regarding handling of --nocheck/-n
2019-05-08 06:20:03 +08:00
args := [ ] string { "--info" , "--health" , "--attributes" , "--tolerance=verypermissive" , "-n" , nocheck , "--format=brief" }
2017-10-05 06:15:58 +08:00
args = append ( args , strings . Split ( device , " " ) ... )
2019-08-14 01:24:44 +08:00
out , e := runCmd ( timeout , usesudo , smartctl , args ... )
2017-10-05 06:15:58 +08:00
outStr := string ( out )
// Ignore all exit statuses except if it is a command line parse error
exitStatus , er := exitStatus ( e )
if er != nil {
2019-05-08 06:20:03 +08:00
acc . AddError ( fmt . Errorf ( "failed to run command '%s %s': %s - %s" , smartctl , strings . Join ( args , " " ) , e , outStr ) )
2017-10-05 06:15:58 +08:00
return
}
2019-05-08 06:20:03 +08:00
deviceTags := map [ string ] string { }
deviceNode := strings . Split ( device , " " ) [ 0 ]
deviceTags [ "device" ] = path . Base ( deviceNode )
deviceFields := make ( map [ string ] interface { } )
deviceFields [ "exit_status" ] = exitStatus
2018-04-03 04:55:10 +08:00
scanner := bufio . NewScanner ( strings . NewReader ( outStr ) )
for scanner . Scan ( ) {
line := scanner . Text ( )
2017-10-05 06:15:58 +08:00
2019-05-08 06:20:03 +08:00
model := modelInfo . FindStringSubmatch ( line )
if len ( model ) > 2 {
deviceTags [ "model" ] = model [ 2 ]
2017-10-05 06:15:58 +08:00
}
2019-05-08 06:20:03 +08:00
serial := serialInfo . FindStringSubmatch ( line )
2017-10-05 06:15:58 +08:00
if len ( serial ) > 1 {
2019-05-08 06:20:03 +08:00
deviceTags [ "serial_no" ] = serial [ 1 ]
2017-10-05 06:15:58 +08:00
}
2019-05-08 06:20:03 +08:00
wwn := wwnInfo . FindStringSubmatch ( line )
2017-10-05 06:15:58 +08:00
if len ( wwn ) > 1 {
2019-05-08 06:20:03 +08:00
deviceTags [ "wwn" ] = strings . Replace ( wwn [ 1 ] , " " , "" , - 1 )
2017-10-05 06:15:58 +08:00
}
2020-09-28 23:16:49 +08:00
capacity := userCapacityInfo . FindStringSubmatch ( line )
2017-10-05 06:15:58 +08:00
if len ( capacity ) > 1 {
2019-05-08 06:20:03 +08:00
deviceTags [ "capacity" ] = strings . Replace ( capacity [ 1 ] , "," , "" , - 1 )
2017-10-05 06:15:58 +08:00
}
2019-05-08 06:20:03 +08:00
enabled := smartEnabledInfo . FindStringSubmatch ( line )
2017-10-05 06:15:58 +08:00
if len ( enabled ) > 1 {
2019-05-08 06:20:03 +08:00
deviceTags [ "enabled" ] = enabled [ 1 ]
2017-10-05 06:15:58 +08:00
}
health := smartOverallHealth . FindStringSubmatch ( line )
2019-05-08 06:20:03 +08:00
if len ( health ) > 2 {
2020-09-28 23:16:49 +08:00
deviceFields [ "health_ok" ] = health [ 2 ] == "PASSED" || health [ 2 ] == "OK"
2017-10-05 06:15:58 +08:00
}
2021-06-11 04:51:33 +08:00
// checks to see if there is a power mode to print to user
// if not look for Device is in STANDBY which happens when
// nocheck is set to standby (will exit to not spin up the disk)
// otherwise nothing is found so nothing is printed (NVMe does not show power)
if power := powermodeInfo . FindStringSubmatch ( line ) ; len ( power ) > 1 {
deviceTags [ "power" ] = power [ 1 ]
} else {
if power := standbyInfo . FindStringSubmatch ( line ) ; len ( power ) > 1 {
deviceTags [ "power" ] = power [ 1 ]
}
}
2019-05-08 06:20:03 +08:00
tags := map [ string ] string { }
fields := make ( map [ string ] interface { } )
2017-10-05 06:15:58 +08:00
2019-06-26 02:51:51 +08:00
if collectAttributes {
2021-06-11 04:51:33 +08:00
//add power mode
keys := [ ... ] string { "device" , "model" , "serial_no" , "wwn" , "capacity" , "enabled" , "power" }
2019-08-06 08:36:34 +08:00
for _ , key := range keys {
if value , ok := deviceTags [ key ] ; ok {
tags [ key ] = value
}
2019-06-26 02:51:51 +08:00
}
}
2019-05-08 06:20:03 +08:00
attr := attribute . FindStringSubmatch ( line )
2017-10-05 06:15:58 +08:00
if len ( attr ) > 1 {
2020-01-15 09:05:28 +08:00
// attribute has been found, add it only if collectAttributes is true
2019-05-08 06:20:03 +08:00
if collectAttributes {
2017-10-05 06:15:58 +08:00
tags [ "id" ] = attr [ 1 ]
tags [ "name" ] = attr [ 2 ]
tags [ "flags" ] = attr [ 3 ]
fields [ "exit_status" ] = exitStatus
if i , err := strconv . ParseInt ( attr [ 4 ] , 10 , 64 ) ; err == nil {
fields [ "value" ] = i
}
if i , err := strconv . ParseInt ( attr [ 5 ] , 10 , 64 ) ; err == nil {
fields [ "worst" ] = i
}
if i , err := strconv . ParseInt ( attr [ 6 ] , 10 , 64 ) ; err == nil {
fields [ "threshold" ] = i
}
tags [ "fail" ] = attr [ 7 ]
if val , err := parseRawValue ( attr [ 8 ] ) ; err == nil {
fields [ "raw_value" ] = val
}
acc . AddFields ( "smart_attribute" , fields , tags )
}
// If the attribute matches on the one in deviceFieldIds
// save the raw value to a field.
if field , ok := deviceFieldIds [ attr [ 1 ] ] ; ok {
if val , err := parseRawValue ( attr [ 8 ] ) ; err == nil {
2019-05-08 06:20:03 +08:00
deviceFields [ field ] = val
}
}
} else {
2020-01-15 09:05:28 +08:00
// what was found is not a vendor attribute
if matches := sasNvmeAttr . FindStringSubmatch ( line ) ; len ( matches ) > 2 {
if attr , ok := sasNvmeAttributes [ matches [ 1 ] ] ; ok {
tags [ "name" ] = attr . Name
if attr . ID != "" {
tags [ "id" ] = attr . ID
}
2019-07-13 05:25:45 +08:00
2020-05-16 06:43:32 +08:00
parse := parseCommaSeparatedInt
2020-01-15 09:05:28 +08:00
if attr . Parse != nil {
parse = attr . Parse
}
if err := parse ( fields , deviceFields , matches [ 2 ] ) ; err != nil {
continue
}
// if the field is classified as an attribute, only add it
// if collectAttributes is true
if collectAttributes {
2019-07-13 05:25:45 +08:00
acc . AddFields ( "smart_attribute" , fields , tags )
2019-05-08 06:20:03 +08:00
}
2017-10-05 06:15:58 +08:00
}
}
}
}
2019-05-08 06:20:03 +08:00
acc . AddFields ( "smart_device" , deviceFields , deviceTags )
2017-10-05 06:15:58 +08:00
}
2020-09-28 23:16:49 +08:00
// exitStatus extracts the process exit code from err.
//
// When err is an *exec.ExitError whose system status is a syscall.WaitStatus,
// the exit code is returned together with a nil error. For every other err,
// nil included, it returns 0 and err unchanged.
//
// Command line parse errors are denoted by the exit code having the 0 bit set.
// All other errors are drive/communication errors and should be ignored.
func exitStatus(err error) (int, error) {
	exitErr, isExit := err.(*exec.ExitError)
	if !isExit {
		return 0, err
	}

	waitStatus, isWait := exitErr.Sys().(syscall.WaitStatus)
	if !isWait {
		return 0, err
	}
	return waitStatus.ExitStatus(), nil
}
// contains reports whether element is one of args.
func contains(args []string, element string) bool {
	for i := 0; i < len(args); i++ {
		if args[i] == element {
			return true
		}
	}
	return false
}
// difference returns the elements of a that do not occur in b, keeping the
// order (and any duplicates) of a. The result is nil when nothing remains.
func difference(a, b []string) []string {
	excluded := make(map[string]struct{}, len(b))
	for i := range b {
		excluded[b[i]] = struct{}{}
	}

	var remaining []string
	for i := range a {
		if _, skip := excluded[a[i]]; skip {
			continue
		}
		remaining = append(remaining, a[i])
	}
	return remaining
}
2017-10-05 06:15:58 +08:00
func parseRawValue ( rawVal string ) ( int64 , error ) {
// Integer
if i , err := strconv . ParseInt ( rawVal , 10 , 64 ) ; err == nil {
return i , nil
}
// Duration: 65h+33m+09.259s
unit := regexp . MustCompile ( "^(.*)([hms])$" )
parts := strings . Split ( rawVal , "+" )
if len ( parts ) == 0 {
2020-09-28 23:16:49 +08:00
return 0 , fmt . Errorf ( "couldn't parse RAW_VALUE '%s'" , rawVal )
2017-10-05 06:15:58 +08:00
}
duration := int64 ( 0 )
for _ , part := range parts {
timePart := unit . FindStringSubmatch ( part )
if len ( timePart ) == 0 {
continue
}
switch timePart [ 2 ] {
case "h" :
duration += parseInt ( timePart [ 1 ] ) * int64 ( 3600 )
case "m" :
duration += parseInt ( timePart [ 1 ] ) * int64 ( 60 )
case "s" :
// drop fractions of seconds
duration += parseInt ( strings . Split ( timePart [ 1 ] , "." ) [ 0 ] )
default :
// Unknown, ignore
}
}
return duration , nil
}
2020-09-28 23:16:49 +08:00
// parseBytesWritten reads the number from a value of the form "sectors: N",
// stores it as fields["raw_value"] and adds a smart_attribute metric built
// from fields and tags. Nothing is added when the value cannot be read.
func parseBytesWritten(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var sectors int64
	_, err := fmt.Sscanf(str, "sectors: %d", &sectors)
	if err != nil {
		return err
	}

	fields["raw_value"] = sectors
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
// parseThermalThrottle reads a value of the form "P%, cnt: N" and adds two
// smart_attribute metrics: Thermal_Throttle_Status_Prc carrying the
// percentage and Thermal_Throttle_Status_Cnt carrying the count. The caller's
// fields and tags maps are reused and overwritten for both metrics.
func parseThermalThrottle(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var (
		throttlePercent float64
		throttleCount   int64
	)
	_, err := fmt.Sscanf(str, "%f%%, cnt: %d", &throttlePercent, &throttleCount)
	if err != nil {
		return err
	}

	tags["name"] = "Thermal_Throttle_Status_Prc"
	fields["raw_value"] = throttlePercent
	acc.AddFields("smart_attribute", fields, tags)

	tags["name"] = "Thermal_Throttle_Status_Cnt"
	fields["raw_value"] = throttleCount
	acc.AddFields("smart_attribute", fields, tags)

	return nil
}
// parseWearLeveling reads a value of the form "min: A, max: B, avg: C" and
// adds three smart_attribute metrics named Wear_Leveling_Min,
// Wear_Leveling_Max and Wear_Leveling_Avg, in that order. The caller's fields
// and tags maps are reused and overwritten for each metric.
func parseWearLeveling(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var lowest, highest, average int64
	_, err := fmt.Sscanf(str, "min: %d, max: %d, avg: %d", &lowest, &highest, &average)
	if err != nil {
		return err
	}

	submetrics := []struct {
		suffix string
		value  int64
	}{
		{"Min", lowest},
		{"Max", highest},
		{"Avg", average},
	}
	for _, submetric := range submetrics {
		fields["raw_value"] = submetric.value
		tags["name"] = "Wear_Leveling_" + submetric.suffix
		acc.AddFields("smart_attribute", fields, tags)
	}
	return nil
}
// parseTimedWorkload reads a leading floating point number from str, stores it
// as fields["raw_value"] and adds a smart_attribute metric built from fields
// and tags. Nothing is added when no number can be read.
func parseTimedWorkload(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	var workload float64
	_, err := fmt.Sscanf(str, "%f", &workload)
	if err != nil {
		return err
	}

	fields["raw_value"] = workload
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
2017-10-05 06:15:58 +08:00
// parseInt converts str to a base-10 int64, yielding 0 whenever str is not a
// valid integer or does not fit into 64 bits.
func parseInt(str string) int64 {
	value, err := strconv.ParseInt(str, 10, 64)
	if err != nil {
		return 0
	}
	return value
}
2020-05-16 06:43:32 +08:00
// parseCommaSeparatedInt parses an integer that may contain whitespace and
// commas as digit group separators (for example "1,234,567") and stores it as
// fields["raw_value"]. The second argument is unused; it is there so that the
// function fits the Parse signature of sasNvmeAttributes. fields is left
// untouched when str is not a valid integer.
func parseCommaSeparatedInt(fields, _ map[string]interface{}, str string) error {
	// Remove every run of whitespace first, then the commas.
	digits := strings.ReplaceAll(strings.Join(strings.Fields(str), ""), ",", "")

	value, err := strconv.ParseInt(digits, 10, 64)
	if err != nil {
		return err
	}

	fields["raw_value"] = value
	return nil
}
2020-06-24 02:55:32 +08:00
// parsePercentageInt parses an integer percentage such as "100%": a single
// trailing percent sign is removed and the rest is handed to
// parseCommaSeparatedInt.
func parsePercentageInt(fields, deviceFields map[string]interface{}, str string) error {
	withoutSign := strings.TrimSuffix(str, "%")
	return parseCommaSeparatedInt(fields, deviceFields, withoutSign)
}
// parseDataUnits parses a data units value such as "1,234,567 [632 GB]": the
// first whitespace-separated token is handed to parseCommaSeparatedInt and
// anything after it is ignored. An error is returned when str holds no token
// at all.
func parseDataUnits(fields, deviceFields map[string]interface{}, str string) error {
	tokens := strings.Fields(str)
	if len(tokens) == 0 {
		// Indexing tokens[0] would panic on an empty or blank value.
		return fmt.Errorf("no data units value found in %q", str)
	}
	return parseCommaSeparatedInt(fields, deviceFields, tokens[0])
}
2020-09-28 23:16:49 +08:00
// parseCommaSeparatedIntWithAccumulator parses an integer that may use commas
// as digit group separators, stores it as fields["raw_value"] and adds a
// smart_attribute metric built from fields and tags. Nothing is added when
// str is not a valid integer.
func parseCommaSeparatedIntWithAccumulator(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error {
	digits := strings.Replace(str, ",", "", -1)

	value, err := strconv.ParseInt(digits, 10, 64)
	if err != nil {
		return err
	}

	fields["raw_value"] = value
	acc.AddFields("smart_attribute", fields, tags)
	return nil
}
2019-07-13 05:25:45 +08:00
// parseTemperature reads a temperature written as "<number> C" and stores the
// number both as fields["raw_value"] and as deviceFields["temp_c"]. Neither
// map is touched when str does not have that form.
func parseTemperature(fields, deviceFields map[string]interface{}, str string) error {
	var celsius int64
	_, err := fmt.Sscanf(str, "%d C", &celsius)
	if err != nil {
		return err
	}

	fields["raw_value"] = celsius
	deviceFields["temp_c"] = celsius
	return nil
}
2020-11-13 00:04:52 +08:00
// parseTemperatureSensor reads a temperature written as "<number> C" and
// stores the number as fields["raw_value"]. Unlike parseTemperature it does
// not touch the device fields, which is why the second argument is unused.
// fields is left untouched when str does not have that form.
func parseTemperatureSensor(fields, _ map[string]interface{}, str string) error {
	var celsius int64
	if _, err := fmt.Sscanf(str, "%d C", &celsius); err != nil {
		return err
	}

	fields["raw_value"] = celsius
	return nil
}
// validatePath checks that path names an existing regular file. It returns an
// error when the path does not exist, when it cannot be inspected for any
// other reason, or when it refers to something other than a regular file
// (for example a directory).
func validatePath(path string) error {
	pathInfo, err := os.Stat(path)
	if os.IsNotExist(err) {
		return fmt.Errorf("provided path does not exist: [%s]", path)
	}
	if err != nil {
		// Stat can fail for other reasons too (permissions, a path component
		// that is not a directory, ...). pathInfo is nil in that case, so it
		// must not be dereferenced below.
		return fmt.Errorf("provided path cannot be accessed: [%s]: %w", path, err)
	}
	if mode := pathInfo.Mode(); !mode.IsRegular() {
		return fmt.Errorf("provided path does not point to a regular file: [%s]", path)
	}
	return nil
}
2017-10-05 06:15:58 +08:00
func init ( ) {
2020-09-28 23:16:49 +08:00
// Set LC_NUMERIC to uniform numeric output from cli tools
_ = os . Setenv ( "LC_NUMERIC" , "en_US.UTF-8" )
2017-10-05 06:15:58 +08:00
inputs . Add ( "smart" , func ( ) telegraf . Input {
2020-11-13 00:04:52 +08:00
m := newSmart ( )
2019-08-14 01:24:44 +08:00
m . Nocheck = "standby"
return m
2017-10-05 06:15:58 +08:00
} )
}