feat: Add additional stats to bond collector (#10137)
This commit is contained in:
parent
0b96d40b60
commit
3bbd6be2fa
|
|
@ -12,10 +12,18 @@ The plugin collects these metrics from `/proc/net/bonding/*` files.
|
|||
## If not specified, then default is /proc
|
||||
# host_proc = "/proc"
|
||||
|
||||
## Sets 'sys' directory path
|
||||
## If not specified, then default is /sys
|
||||
# host_sys = "/sys"
|
||||
|
||||
## By default, telegraf gathers stats for all bond interfaces
|
||||
## Setting interfaces will restrict the stats to the specified
|
||||
## bond interfaces.
|
||||
# bond_interfaces = ["bond0"]
|
||||
|
||||
## Tries to collect additional bond details from /sys/class/net/{bond}
|
||||
## currently only useful for LACP (mode 4) bonds
|
||||
# collect_sys_details = false
|
||||
```
|
||||
|
||||
## Measurements & Fields
|
||||
|
|
@ -28,22 +36,30 @@ The plugin collects these metrics from `/proc/net/bonding/*` files.
|
|||
- failures
|
||||
- status
|
||||
- count
|
||||
- actor_churned (for LACP bonds)
|
||||
- partner_churned (for LACP bonds)
|
||||
- total_churned (for LACP bonds)
|
||||
|
||||
- bond_sys
|
||||
- slave_count
|
||||
- ad_port_count
|
||||
|
||||
## Description
|
||||
|
||||
```shell
|
||||
active_slave
|
||||
Currently active slave interface for active-backup mode.
|
||||
|
||||
status
|
||||
Status of bond interface or bond's slave interface (down = 0, up = 1).
|
||||
|
||||
failures
|
||||
Amount of failures for bond's slave interface.
|
||||
|
||||
count
|
||||
Number of slaves attached to bond
|
||||
```
|
||||
- active_slave
|
||||
- Currently active slave interface for active-backup mode.
|
||||
- status
|
||||
- Status of bond interface or bond's slave interface (down = 0, up = 1).
|
||||
- failures
|
||||
- Amount of failures for bond's slave interface.
|
||||
- count
|
||||
- Number of slaves attached to bond
|
||||
- actor_churned
|
||||
- number of times local end of LACP bond flapped
|
||||
- partner_churned
|
||||
- number of times remote end of LACP bond flapped
|
||||
- total_churned
|
||||
- full count of all churn events
|
||||
|
||||
## Tags
|
||||
|
||||
|
|
@ -54,6 +70,10 @@ count
|
|||
- bond
|
||||
- interface
|
||||
|
||||
- bond_sys
|
||||
- bond
|
||||
- mode
|
||||
|
||||
## Example output
|
||||
|
||||
Configuration:
|
||||
|
|
@ -72,12 +92,14 @@ Configuration:
|
|||
|
||||
Run:
|
||||
|
||||
```bash
|
||||
```shell
|
||||
telegraf --config telegraf.conf --input-filter bond --test
|
||||
```
|
||||
|
||||
Output:
|
||||
|
||||
```bash
|
||||
```shell
|
||||
* Plugin: inputs.bond, Collection 1
|
||||
> bond,bond=bond1,host=local active_slave="eth0",status=1i 1509704525000000000
|
||||
|
|
|
|||
|
|
@ -14,24 +14,44 @@ import (
|
|||
|
||||
// default host proc path
|
||||
const defaultHostProc = "/proc"
|
||||
const defaultHostSys = "/sys"
|
||||
|
||||
// env host proc variable name
|
||||
const envProc = "HOST_PROC"
|
||||
const envSys = "HOST_SYS"
|
||||
|
||||
// Bond is the bond input plugin. It holds the user configuration plus the
// bonding mode remembered from the most recently parsed /proc bonding file.
type Bond struct {
|
||||
// HostProc is the proc directory path (config key "host_proc"). When left
// empty, loadPaths fills it from the HOST_PROC environment variable or the
// "/proc" default.
HostProc string `toml:"host_proc"`
|
||||
// HostSys is the sys directory path (config key "host_sys"). When left
// empty, loadPaths fills it from the HOST_SYS environment variable or the
// "/sys" default.
HostSys string `toml:"host_sys"`
|
||||
// SysDetails enables the additional collection from
// <HostSys>/class/net/<bond> (config key "collect_sys_details").
SysDetails bool `toml:"collect_sys_details"`
|
||||
// BondInterfaces restricts gathering to the listed bond interfaces
// (config key "bond_interfaces").
BondInterfaces []string `toml:"bond_interfaces"`
|
||||
// BondType is the value of the "Bonding Mode" line, stored by gatherBondPart
// while parsing a /proc bonding file. readSysFiles and gatherSlavePart compare
// it against "IEEE 802.3ad Dynamic link aggregation" to detect LACP bonds.
// It carries no toml tag.
BondType string
|
||||
}
|
||||
|
||||
var sampleConfig = `
|
||||
// sysFiles carries the whitespace-trimmed, unparsed contents of the sysfs
// bonding files that readSysFiles loads for one bond interface.
type sysFiles struct {
|
||||
// ModeFile is the content of bonding/mode.
ModeFile string
|
||||
// SlaveFile is the content of bonding/slaves.
SlaveFile string
|
||||
// ADPortsFile is the content of bonding/ad_num_ports; readSysFiles only
// fills it for 802.3ad bonds, otherwise it stays empty.
ADPortsFile string
|
||||
}
|
||||
|
||||
const sampleConfig = `
|
||||
## Sets 'proc' directory path
|
||||
## If not specified, then default is /proc
|
||||
# host_proc = "/proc"
|
||||
|
||||
## Sets 'sys' directory path
|
||||
## If not specified, then default is /sys
|
||||
# host_sys = "/sys"
|
||||
|
||||
## By default, telegraf gather stats for all bond interfaces
|
||||
## Setting interfaces will restrict the stats to the specified
|
||||
## bond interfaces.
|
||||
# bond_interfaces = ["bond0"]
|
||||
|
||||
## Tries to collect additional bond details from /sys/class/net/{bond}
|
||||
## currently only useful for LACP (mode 4) bonds
|
||||
# collect_sys_details = false
|
||||
|
||||
`
|
||||
|
||||
func (bond *Bond) Description() string {
|
||||
|
|
@ -44,7 +64,7 @@ func (bond *Bond) SampleConfig() string {
|
|||
|
||||
func (bond *Bond) Gather(acc telegraf.Accumulator) error {
|
||||
// load proc path, get default value if config value and env variable are empty
|
||||
bond.loadPath()
|
||||
bond.loadPaths()
|
||||
// list bond interfaces from bonding directory or gather all interfaces.
|
||||
bondNames, err := bond.listInterfaces()
|
||||
if err != nil {
|
||||
|
|
@ -54,13 +74,25 @@ func (bond *Bond) Gather(acc telegraf.Accumulator) error {
|
|||
bondAbsPath := bond.HostProc + "/net/bonding/" + bondName
|
||||
file, err := os.ReadFile(bondAbsPath)
|
||||
if err != nil {
|
||||
acc.AddError(fmt.Errorf("error inspecting '%s' interface: %v", bondAbsPath, err))
|
||||
acc.AddError(fmt.Errorf("error inspecting %q interface: %v", bondAbsPath, err))
|
||||
continue
|
||||
}
|
||||
rawFile := strings.TrimSpace(string(file))
|
||||
err = bond.gatherBondInterface(bondName, rawFile, acc)
|
||||
rawProcFile := strings.TrimSpace(string(file))
|
||||
err = bond.gatherBondInterface(bondName, rawProcFile, acc)
|
||||
if err != nil {
|
||||
acc.AddError(fmt.Errorf("error inspecting '%s' interface: %v", bondName, err))
|
||||
acc.AddError(fmt.Errorf("error inspecting %q interface: %v", bondName, err))
|
||||
}
|
||||
|
||||
/*
|
||||
Some details about bonds only exist in /sys/class/net/
|
||||
In particular, LACP bonds track upstream port state here
|
||||
*/
|
||||
if bond.SysDetails {
|
||||
files, err := bond.readSysFiles(bond.HostSys + "/class/net/" + bondName)
|
||||
if err != nil {
|
||||
acc.AddError(err)
|
||||
}
|
||||
bond.gatherSysDetails(bondName, files, acc)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
|
@ -90,8 +122,14 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A
|
|||
tags := map[string]string{
|
||||
"bond": bondName,
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(strings.NewReader(rawFile))
|
||||
/*
|
||||
/proc/bond/... files are formatted in a way that is difficult
|
||||
to use regexes to parse. Because of that, we scan through
|
||||
the file one line at a time and rely on specific lines to
|
||||
mark "ends" of blocks. It's a hack that should be resolved,
|
||||
but for now, it works.
|
||||
*/
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
stats := strings.Split(line, ":")
|
||||
|
|
@ -100,6 +138,9 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A
|
|||
}
|
||||
name := strings.TrimSpace(stats[0])
|
||||
value := strings.TrimSpace(stats[1])
|
||||
if name == "Bonding Mode" {
|
||||
bond.BondType = value
|
||||
}
|
||||
if strings.Contains(name, "Currently Active Slave") {
|
||||
fields["active_slave"] = value
|
||||
}
|
||||
|
|
@ -118,10 +159,86 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A
|
|||
return fmt.Errorf("Couldn't find status info for '%s' ", bondName)
|
||||
}
|
||||
|
||||
func (bond *Bond) readSysFiles(bondDir string) (sysFiles, error) {
|
||||
/*
|
||||
Files we may need
|
||||
bonding/mode
|
||||
bonding/slaves
|
||||
bonding/ad_num_ports
|
||||
|
||||
We load files here first to allow for easier testing
|
||||
*/
|
||||
var output sysFiles
|
||||
|
||||
file, err := os.ReadFile(bondDir + "/bonding/mode")
|
||||
if err != nil {
|
||||
return sysFiles{}, fmt.Errorf("error inspecting %q interface: %v", bondDir+"/bonding/mode", err)
|
||||
}
|
||||
output.ModeFile = strings.TrimSpace(string(file))
|
||||
file, err = os.ReadFile(bondDir + "/bonding/slaves")
|
||||
if err != nil {
|
||||
return sysFiles{}, fmt.Errorf("error inspecting %q interface: %v", bondDir+"/bonding/slaves", err)
|
||||
}
|
||||
output.SlaveFile = strings.TrimSpace(string(file))
|
||||
if bond.BondType == "IEEE 802.3ad Dynamic link aggregation" {
|
||||
file, err = os.ReadFile(bondDir + "/bonding/ad_num_ports")
|
||||
if err != nil {
|
||||
return sysFiles{}, fmt.Errorf("error inspecting %q interface: %v", bondDir+"/bonding/ad_num_ports", err)
|
||||
}
|
||||
output.ADPortsFile = strings.TrimSpace(string(file))
|
||||
}
|
||||
return output, nil
|
||||
}
|
||||
|
||||
func (bond *Bond) gatherSysDetails(bondName string, files sysFiles, acc telegraf.Accumulator) {
|
||||
var slaves []string
|
||||
var adPortCount int
|
||||
|
||||
// To start with, we get the bond operating mode
|
||||
mode := strings.TrimSpace(strings.Split(files.ModeFile, " ")[0])
|
||||
|
||||
tags := map[string]string{
|
||||
"bond": bondName,
|
||||
"mode": mode,
|
||||
}
|
||||
|
||||
// Next we collect the number of bond slaves the system expects
|
||||
slavesTmp := strings.Split(files.SlaveFile, " ")
|
||||
for _, slave := range slavesTmp {
|
||||
if slave != "" {
|
||||
slaves = append(slaves, slave)
|
||||
}
|
||||
}
|
||||
if mode == "802.3ad" {
|
||||
/*
|
||||
If we're in LACP mode, we should check on how the bond ports are
|
||||
interacting with the upstream switch ports
|
||||
a failed conversion can be treated as 0 ports
|
||||
*/
|
||||
adPortCount, _ = strconv.Atoi(strings.TrimSpace(files.ADPortsFile))
|
||||
} else {
|
||||
adPortCount = len(slaves)
|
||||
}
|
||||
|
||||
fields := map[string]interface{}{
|
||||
"slave_count": len(slaves),
|
||||
"ad_port_count": adPortCount,
|
||||
}
|
||||
acc.AddFields("bond_sys", fields, tags)
|
||||
}
|
||||
|
||||
func (bond *Bond) gatherSlavePart(bondName string, rawFile string, acc telegraf.Accumulator) error {
|
||||
var slave string
|
||||
var status int
|
||||
var slaveCount int
|
||||
tags := map[string]string{
|
||||
"bond": bondName,
|
||||
}
|
||||
fields := map[string]interface{}{
|
||||
"status": 0,
|
||||
}
|
||||
var scanPast bool
|
||||
if bond.BondType == "IEEE 802.3ad Dynamic link aggregation" {
|
||||
scanPast = true
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(strings.NewReader(rawFile))
|
||||
for scanner.Scan() {
|
||||
|
|
@ -133,48 +250,59 @@ func (bond *Bond) gatherSlavePart(bondName string, rawFile string, acc telegraf.
|
|||
name := strings.TrimSpace(stats[0])
|
||||
value := strings.TrimSpace(stats[1])
|
||||
if strings.Contains(name, "Slave Interface") {
|
||||
slave = value
|
||||
tags["interface"] = value
|
||||
slaveCount++
|
||||
}
|
||||
if strings.Contains(name, "MII Status") {
|
||||
status = 0
|
||||
if value == "up" {
|
||||
status = 1
|
||||
}
|
||||
if strings.Contains(name, "MII Status") && value == "up" {
|
||||
fields["status"] = 1
|
||||
}
|
||||
if strings.Contains(name, "Link Failure Count") {
|
||||
count, err := strconv.Atoi(value)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fields := map[string]interface{}{
|
||||
"status": status,
|
||||
"failures": count,
|
||||
fields["failures"] = count
|
||||
if !scanPast {
|
||||
acc.AddFields("bond_slave", fields, tags)
|
||||
}
|
||||
tags := map[string]string{
|
||||
"bond": bondName,
|
||||
"interface": slave,
|
||||
}
|
||||
if strings.Contains(name, "Actor Churned Count") {
|
||||
count, err := strconv.Atoi(value)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fields["actor_churned"] = count
|
||||
}
|
||||
if strings.Contains(name, "Partner Churned Count") {
|
||||
count, err := strconv.Atoi(value)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fields["partner_churned"] = count
|
||||
fields["total_churned"] = fields["actor_churned"].(int) + fields["partner_churned"].(int)
|
||||
acc.AddFields("bond_slave", fields, tags)
|
||||
slaveCount++
|
||||
}
|
||||
}
|
||||
fields := map[string]interface{}{
|
||||
"count": slaveCount,
|
||||
}
|
||||
tags := map[string]string{
|
||||
tags = map[string]string{
|
||||
"bond": bondName,
|
||||
}
|
||||
fields = map[string]interface{}{
|
||||
"count": slaveCount,
|
||||
}
|
||||
acc.AddFields("bond_slave", fields, tags)
|
||||
|
||||
return scanner.Err()
|
||||
}
|
||||
|
||||
// loadPath can be used to read path firstly from config
|
||||
// loadPaths keeps any path that was set in the config; for each path left
|
||||
// empty it tries the environment variable, falling back to the default
|
||||
func (bond *Bond) loadPath() {
|
||||
func (bond *Bond) loadPaths() {
|
||||
if bond.HostProc == "" {
|
||||
bond.HostProc = proc(envProc, defaultHostProc)
|
||||
}
|
||||
if bond.HostSys == "" {
|
||||
bond.HostSys = proc(envSys, defaultHostSys)
|
||||
}
|
||||
}
|
||||
|
||||
// proc can be used to read file paths from env
|
||||
|
|
|
|||
|
|
@ -7,35 +7,7 @@ import (
|
|||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
var sampleTest802 = `
|
||||
Ethernet Channel Bonding Driver: v3.5.0 (November 4, 2008)
|
||||
|
||||
Bonding Mode: IEEE 802.3ad Dynamic link aggregation
|
||||
Transmit Hash Policy: layer2 (0)
|
||||
MII Status: up
|
||||
MII Polling Interval (ms): 100
|
||||
Up Delay (ms): 0
|
||||
Down Delay (ms): 0
|
||||
|
||||
802.3ad info
|
||||
LACP rate: fast
|
||||
Aggregator selection policy (ad_select): stable
|
||||
bond bond0 has no active aggregator
|
||||
|
||||
Slave Interface: eth1
|
||||
MII Status: up
|
||||
Link Failure Count: 0
|
||||
Permanent HW addr: 00:0c:29:f5:b7:11
|
||||
Aggregator ID: N/A
|
||||
|
||||
Slave Interface: eth2
|
||||
MII Status: up
|
||||
Link Failure Count: 3
|
||||
Permanent HW addr: 00:0c:29:f5:b7:1b
|
||||
Aggregator ID: N/A
|
||||
`
|
||||
|
||||
var sampleTestAB = `
|
||||
const sampleTestAB = `
|
||||
Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009)
|
||||
|
||||
Bonding Mode: fault-tolerance (active-backup)
|
||||
|
|
@ -62,18 +34,68 @@ Link Failure Count: 0
|
|||
Permanent HW addr:
|
||||
`
|
||||
|
||||
const sampleTestLACP = `
|
||||
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
|
||||
|
||||
Bonding Mode: IEEE 802.3ad Dynamic link aggregation
|
||||
Transmit Hash Policy: layer2 (0)
|
||||
MII Status: up
|
||||
MII Polling Interval (ms): 100
|
||||
Up Delay (ms): 0
|
||||
Down Delay (ms): 0
|
||||
|
||||
802.3ad info
|
||||
LACP rate: fast
|
||||
Min links: 0
|
||||
Aggregator selection policy (ad_select): stable
|
||||
|
||||
Slave Interface: eth0
|
||||
MII Status: up
|
||||
Speed: 10000 Mbps
|
||||
Duplex: full
|
||||
Link Failure Count: 2
|
||||
Permanent HW addr: 3c:ec:ef:5e:71:58
|
||||
Slave queue ID: 0
|
||||
Aggregator ID: 2
|
||||
Actor Churn State: none
|
||||
Partner Churn State: none
|
||||
Actor Churned Count: 2
|
||||
Partner Churned Count: 0
|
||||
|
||||
Slave Interface: eth1
|
||||
MII Status: up
|
||||
Speed: 10000 Mbps
|
||||
Duplex: full
|
||||
Link Failure Count: 1
|
||||
Permanent HW addr: 3c:ec:ef:5e:71:59
|
||||
Slave queue ID: 0
|
||||
Aggregator ID: 2
|
||||
Actor Churn State: none
|
||||
Partner Churn State: none
|
||||
Actor Churned Count: 0
|
||||
Partner Churned Count: 0
|
||||
`
|
||||
|
||||
const sampleSysMode = "802.3ad 5"
|
||||
const sampleSysSlaves = "eth0 eth1 "
|
||||
const sampleSysAdPorts = " 2 "
|
||||
|
||||
func TestGatherBondInterface(t *testing.T) {
|
||||
var acc testutil.Accumulator
|
||||
bond := &Bond{}
|
||||
|
||||
require.NoError(t, bond.gatherBondInterface("bond802", sampleTest802, &acc))
|
||||
acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"status": 1}, map[string]string{"bond": "bond802"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 0, "status": 1}, map[string]string{"bond": "bond802", "interface": "eth1"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 3, "status": 1}, map[string]string{"bond": "bond802", "interface": "eth2"})
|
||||
|
||||
require.NoError(t, bond.gatherBondInterface("bondAB", sampleTestAB, &acc))
|
||||
acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"active_slave": "eth2", "status": 1}, map[string]string{"bond": "bondAB"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 2, "status": 0}, map[string]string{"bond": "bondAB", "interface": "eth3"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 0, "status": 1}, map[string]string{"bond": "bondAB", "interface": "eth2"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"count": 2}, map[string]string{"bond": "bondAB"})
|
||||
|
||||
acc = testutil.Accumulator{}
|
||||
require.NoError(t, bond.gatherBondInterface("bondLACP", sampleTestLACP, &acc))
|
||||
bond.gatherSysDetails("bondLACP", sysFiles{ModeFile: sampleSysMode, SlaveFile: sampleSysSlaves, ADPortsFile: sampleSysAdPorts}, &acc)
|
||||
acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"status": 1}, map[string]string{"bond": "bondLACP"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 2, "status": 1, "actor_churned": 2, "partner_churned": 0, "total_churned": 2}, map[string]string{"bond": "bondLACP", "interface": "eth0"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 1, "status": 1, "actor_churned": 0, "partner_churned": 0, "total_churned": 0}, map[string]string{"bond": "bondLACP", "interface": "eth1"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"count": 2}, map[string]string{"bond": "bondLACP"})
|
||||
acc.AssertContainsTaggedFields(t, "bond_sys", map[string]interface{}{"slave_count": 2, "ad_port_count": 2}, map[string]string{"bond": "bondLACP", "mode": "802.3ad"})
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue