feat(inputs.smart): Add a "device_type" tag to differentiate disks behind a RAID controller (#14613)
Co-authored-by: Thomas Delbende <thomas.delbende@bleemeo.com>
This commit is contained in:
parent
9878eba241
commit
439df813ec
|
|
@ -111,6 +111,10 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
||||||
## without a password.
|
## without a password.
|
||||||
# use_sudo = false
|
# use_sudo = false
|
||||||
|
|
||||||
|
## Adds an extra tag "device_type", which can be used to differentiate
|
||||||
|
## multiple disks behind the same controller (e.g., MegaRAID).
|
||||||
|
# tag_with_device_type = false
|
||||||
|
|
||||||
## Skip checking disks in this power mode. Defaults to
|
## Skip checking disks in this power mode. Defaults to
|
||||||
## "standby" to not wake up disks that have stopped rotating.
|
## "standby" to not wake up disks that have stopped rotating.
|
||||||
## See --nocheck in the man pages for smartctl.
|
## See --nocheck in the man pages for smartctl.
|
||||||
|
|
@ -182,6 +186,7 @@ execute this script.
|
||||||
- tags:
|
- tags:
|
||||||
- capacity
|
- capacity
|
||||||
- device
|
- device
|
||||||
|
- device_type (only emitted if `tag_with_device_type` is set to `true`)
|
||||||
- enabled
|
- enabled
|
||||||
- model
|
- model
|
||||||
- serial_no
|
- serial_no
|
||||||
|
|
@ -201,6 +206,7 @@ execute this script.
|
||||||
- tags:
|
- tags:
|
||||||
- capacity
|
- capacity
|
||||||
- device
|
- device
|
||||||
|
- device_type (only emitted if `tag_with_device_type` is set to `true`)
|
||||||
- enabled
|
- enabled
|
||||||
- fail
|
- fail
|
||||||
- flags
|
- flags
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,10 @@
|
||||||
## without a password.
|
## without a password.
|
||||||
# use_sudo = false
|
# use_sudo = false
|
||||||
|
|
||||||
|
## Adds an extra tag "device_type", which can be used to differentiate
|
||||||
|
## multiple disks behind the same controller (e.g., MegaRAID).
|
||||||
|
# tag_with_device_type = false
|
||||||
|
|
||||||
## Skip checking disks in this power mode. Defaults to
|
## Skip checking disks in this power mode. Defaults to
|
||||||
## "standby" to not wake up disks that have stopped rotating.
|
## "standby" to not wake up disks that have stopped rotating.
|
||||||
## See --nocheck in the man pages for smartctl.
|
## See --nocheck in the man pages for smartctl.
|
||||||
|
|
|
||||||
|
|
@ -353,18 +353,19 @@ var (
|
||||||
|
|
||||||
// Smart plugin reads metrics from storage devices supporting S.M.A.R.T.
|
// Smart plugin reads metrics from storage devices supporting S.M.A.R.T.
|
||||||
type Smart struct {
|
type Smart struct {
|
||||||
Path string `toml:"path" deprecated:"1.16.0;use 'path_smartctl' instead"`
|
Path string `toml:"path" deprecated:"1.16.0;use 'path_smartctl' instead"`
|
||||||
PathSmartctl string `toml:"path_smartctl"`
|
PathSmartctl string `toml:"path_smartctl"`
|
||||||
PathNVMe string `toml:"path_nvme"`
|
PathNVMe string `toml:"path_nvme"`
|
||||||
Nocheck string `toml:"nocheck"`
|
Nocheck string `toml:"nocheck"`
|
||||||
EnableExtensions []string `toml:"enable_extensions"`
|
EnableExtensions []string `toml:"enable_extensions"`
|
||||||
Attributes bool `toml:"attributes"`
|
Attributes bool `toml:"attributes"`
|
||||||
Excludes []string `toml:"excludes"`
|
Excludes []string `toml:"excludes"`
|
||||||
Devices []string `toml:"devices"`
|
Devices []string `toml:"devices"`
|
||||||
UseSudo bool `toml:"use_sudo"`
|
UseSudo bool `toml:"use_sudo"`
|
||||||
Timeout config.Duration `toml:"timeout"`
|
// TagWithDeviceType, when true, makes gatherDisk add a "device_type" tag holding the text after the device node (with a leading "-d " trimmed).
TagWithDeviceType bool `toml:"tag_with_device_type"`
|
||||||
ReadMethod string `toml:"read_method"`
|
Timeout config.Duration `toml:"timeout"`
|
||||||
Log telegraf.Logger `toml:"-"`
|
ReadMethod string `toml:"read_method"`
|
||||||
|
Log telegraf.Logger `toml:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type nvmeDevice struct {
|
type nvmeDevice struct {
|
||||||
|
|
@ -741,8 +742,16 @@ func (m *Smart) gatherDisk(acc telegraf.Accumulator, device string, wg *sync.Wai
|
||||||
}
|
}
|
||||||
|
|
||||||
deviceTags := map[string]string{}
|
deviceTags := map[string]string{}
|
||||||
deviceNode := strings.Split(device, " ")[0]
|
if m.TagWithDeviceType {
|
||||||
deviceTags["device"] = path.Base(deviceNode)
|
deviceNode := strings.SplitN(device, " ", 2)
|
||||||
|
deviceTags["device"] = path.Base(deviceNode[0])
|
||||||
|
if len(deviceNode) == 2 && deviceNode[1] != "" {
|
||||||
|
deviceTags["device_type"] = strings.TrimPrefix(deviceNode[1], "-d ")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
deviceNode := strings.Split(device, " ")[0]
|
||||||
|
deviceTags["device"] = path.Base(deviceNode)
|
||||||
|
}
|
||||||
deviceFields := make(map[string]interface{})
|
deviceFields := make(map[string]interface{})
|
||||||
deviceFields["exit_status"] = exitStatus
|
deviceFields["exit_status"] = exitStatus
|
||||||
|
|
||||||
|
|
@ -798,7 +807,7 @@ func (m *Smart) gatherDisk(acc telegraf.Accumulator, device string, wg *sync.Wai
|
||||||
|
|
||||||
if m.Attributes {
|
if m.Attributes {
|
||||||
//add power mode
|
//add power mode
|
||||||
keys := [...]string{"device", "model", "serial_no", "wwn", "capacity", "enabled", "power"}
|
keys := [...]string{"device", "device_type", "model", "serial_no", "wwn", "capacity", "enabled", "power"}
|
||||||
for _, key := range keys {
|
for _, key := range keys {
|
||||||
if value, ok := deviceTags[key]; ok {
|
if value, ok := deviceTags[key]; ok {
|
||||||
tags[key] = value
|
tags[key] = value
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package smart
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"sync"
|
"sync"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
@ -313,6 +314,39 @@ func TestGatherSSDRaid(t *testing.T) {
|
||||||
require.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered")
|
require.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestGatherDeviceTypeTag checks that, with TagWithDeviceType enabled, two
// configured devices sharing the node /dev/bus/0 but differing in device type
// produce the metrics listed in testSmartctlDeviceTypeTag.
func TestGatherDeviceTypeTag(t *testing.T) {
|
||||||
|
// Replace the package-level command runner with a mock that serves canned smartctl output.
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
|
||||||
|
switch args[0] {
|
||||||
|
case "--scan":
|
||||||
|
return nil, errors.New("scan command should not be run, since devices are provided in config")
|
||||||
|
case "--info":
|
||||||
|
// The mock keys off the last argument, which is expected to be the device type from the configured device string.
switch args[len(args)-1] {
|
||||||
|
case "megaraid,0":
|
||||||
|
return []byte(smartctlMegaraidInfo1), nil
|
||||||
|
case "megaraid,1":
|
||||||
|
return []byte(smartctlMegaraidInfo2), nil
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unexpected device type %q", args[len(args)-1])
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unexpected command %q", args[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|

||||||
|
s := newSmart()
|
||||||
|
s.Devices = []string{"/dev/bus/0 -d megaraid,0", "/dev/bus/0 -d megaraid,1"}
|
||||||
|
s.TagWithDeviceType = true
|
||||||
|

||||||
|
acc := testutil.Accumulator{}
|
||||||
|

||||||
|
err := s.Gather(&acc)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NoError(t, errors.Join(acc.Errors...))
|
||||||
|

||||||
|
result := acc.GetTelegrafMetrics()
|
||||||
|
testutil.RequireMetricsEqual(t, testSmartctlDeviceTypeTag, result, testutil.SortMetrics(), testutil.IgnoreTime())
|
||||||
|
}
|
||||||
|
|
||||||
func TestGatherNVMe(t *testing.T) {
|
func TestGatherNVMe(t *testing.T) {
|
||||||
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
|
runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) {
|
||||||
return []byte(smartctlNVMeInfoData), nil
|
return []byte(smartctlNVMeInfoData), nil
|
||||||
|
|
@ -792,6 +826,45 @@ var (
|
||||||
mockModel = "INTEL SSDPEDABCDEFG"
|
mockModel = "INTEL SSDPEDABCDEFG"
|
||||||
mockSerial = "CVFT5123456789ABCD"
|
mockSerial = "CVFT5123456789ABCD"
|
||||||
|
|
||||||
|
testSmartctlDeviceTypeTag = []telegraf.Metric{
|
||||||
|
testutil.MustMetric(
|
||||||
|
"smart_device",
|
||||||
|
map[string]string{
|
||||||
|
"capacity": "600000000000",
|
||||||
|
"device": "0",
|
||||||
|
"device_type": "megaraid,0",
|
||||||
|
"enabled": "Enabled",
|
||||||
|
"model": "ST3450857SS",
|
||||||
|
"power": "ACTIVE",
|
||||||
|
"serial_no": "xxx",
|
||||||
|
},
|
||||||
|
map[string]any{
|
||||||
|
"exit_status": int64(0),
|
||||||
|
"health_ok": true,
|
||||||
|
"temp_c": int64(37),
|
||||||
|
},
|
||||||
|
time.Unix(0, 0),
|
||||||
|
),
|
||||||
|
testutil.MustMetric(
|
||||||
|
"smart_device",
|
||||||
|
map[string]string{
|
||||||
|
"capacity": "600000000000",
|
||||||
|
"device": "0",
|
||||||
|
"device_type": "megaraid,1",
|
||||||
|
"enabled": "Enabled",
|
||||||
|
"model": "ST3450857SS",
|
||||||
|
"power": "ACTIVE",
|
||||||
|
"serial_no": "xxx",
|
||||||
|
},
|
||||||
|
map[string]any{
|
||||||
|
"exit_status": int64(0),
|
||||||
|
"health_ok": true,
|
||||||
|
"temp_c": int64(47),
|
||||||
|
},
|
||||||
|
time.Unix(0, 0),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
testSmartctlNVMeAttributes = []telegraf.Metric{
|
testSmartctlNVMeAttributes = []telegraf.Metric{
|
||||||
testutil.MustMetric("smart_device",
|
testutil.MustMetric("smart_device",
|
||||||
map[string]string{
|
map[string]string{
|
||||||
|
|
@ -2237,6 +2310,93 @@ Selective self-test flags (0x0):
|
||||||
After scanning selected spans, do NOT read-scan remainder of disk.
|
After scanning selected spans, do NOT read-scan remainder of disk.
|
||||||
If Selective self-test is pending on power-up, resume after 0 minute delay.
|
If Selective self-test is pending on power-up, resume after 0 minute delay.
|
||||||
`
|
`
|
||||||
|
|
||||||
|
smartctlMegaraidInfo1 = `smartctl 7.3 2022-02-28 r5338 [x86_64-linux-6.2.16-12-pve] (local build)
|
||||||
|
Copyright (C) 2002-22, Bruce Allen, Christian Franke, www.smartmontools.org
|
||||||
|
|
||||||
|
=== START OF INFORMATION SECTION ===
|
||||||
|
Vendor: SEAGATE
|
||||||
|
Product: ST3450857SS
|
||||||
|
Revision: ES12
|
||||||
|
Compliance: SPC-3
|
||||||
|
User Capacity: 600,000,000,000 bytes [600 GB]
|
||||||
|
Logical block size: 512 bytes
|
||||||
|
Rotation Rate: 15000 rpm
|
||||||
|
Form Factor: 3.5 inches
|
||||||
|
Logical Unit id: 0x6000c60641d10397
|
||||||
|
Serial number: xxx
|
||||||
|
Device type: disk
|
||||||
|
Transport protocol: SAS (SPL-4)
|
||||||
|
Local Time is: Fri Jan 12 11:43:49 2024 CET
|
||||||
|
SMART support is: Available - device has SMART capability.
|
||||||
|
SMART support is: Enabled
|
||||||
|
Temperature Warning: Disabled or Not Supported
|
||||||
|
Power mode is: ACTIVE
|
||||||
|
|
||||||
|
=== START OF READ SMART DATA SECTION ===
|
||||||
|
SMART Health Status: OK
|
||||||
|
|
||||||
|
Current Drive Temperature: 37 C
|
||||||
|
Drive Trip Temperature: 63 C
|
||||||
|
|
||||||
|
Accumulated power on time, hours:minutes 16003:18
|
||||||
|
Elements in grown defect list: 0
|
||||||
|
|
||||||
|
Vendor (Seagate Cache) information
|
||||||
|
Blocks sent to initiator = 3000000000
|
||||||
|
Blocks received from initiator = 3000000000
|
||||||
|
Blocks read from cache and sent to initiator = 3000000000
|
||||||
|
Number of read and write commands whose size <= segment size = 3000000000
|
||||||
|
Number of read and write commands whose size > segment size = 300
|
||||||
|
|
||||||
|
Vendor (Seagate/Hitachi) factory information
|
||||||
|
number of hours powered up = 30000.30
|
||||||
|
number of minutes until next internal SMART test = 7
|
||||||
|
`
|
||||||
|
|
||||||
|
smartctlMegaraidInfo2 = `smartctl 7.3 2022-02-28 r5338 [x86_64-linux-6.2.16-12-pve] (local build)
|
||||||
|
Copyright (C) 2002-22, Bruce Allen, Christian Franke, www.smartmontools.org
|
||||||
|
|
||||||
|
=== START OF INFORMATION SECTION ===
|
||||||
|
Vendor: SEAGATE
|
||||||
|
Product: ST3450857SS
|
||||||
|
Revision: ES12
|
||||||
|
Compliance: SPC-3
|
||||||
|
User Capacity: 600,000,000,000 bytes [600 GB]
|
||||||
|
Logical block size: 512 bytes
|
||||||
|
Rotation Rate: 15000 rpm
|
||||||
|
Form Factor: 3.5 inches
|
||||||
|
Logical Unit id: 0x6000c60641d10497
|
||||||
|
Serial number: xxx
|
||||||
|
Device type: disk
|
||||||
|
Transport protocol: SAS (SPL-4)
|
||||||
|
Local Time is: Fri Jan 12 11:44:49 2024 CET
|
||||||
|
SMART support is: Available - device has SMART capability.
|
||||||
|
SMART support is: Enabled
|
||||||
|
Temperature Warning: Disabled or Not Supported
|
||||||
|
Power mode is: ACTIVE
|
||||||
|
|
||||||
|
=== START OF READ SMART DATA SECTION ===
|
||||||
|
SMART Health Status: OK
|
||||||
|
|
||||||
|
Current Drive Temperature: 47 C
|
||||||
|
Drive Trip Temperature: 64 C
|
||||||
|
|
||||||
|
Accumulated power on time, hours:minutes 16004:18
|
||||||
|
Elements in grown defect list: 0
|
||||||
|
|
||||||
|
Vendor (Seagate Cache) information
|
||||||
|
Blocks sent to initiator = 4000000000
|
||||||
|
Blocks received from initiator = 4000000000
|
||||||
|
Blocks read from cache and sent to initiator = 4000000000
|
||||||
|
Number of read and write commands whose size <= segment size = 4000000000
|
||||||
|
Number of read and write commands whose size > segment size = 400
|
||||||
|
|
||||||
|
Vendor (Seagate/Hitachi) factory information
|
||||||
|
number of hours powered up = 30000.30
|
||||||
|
number of minutes until next internal SMART test = 7
|
||||||
|
`
|
||||||
|
|
||||||
smartctlNVMeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build)
|
smartctlNVMeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build)
|
||||||
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
|
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue