feat: Add additional stats to bond collector (#10137)

This commit is contained in:
John Seekins 2022-01-06 15:08:17 -07:00 committed by GitHub
parent 0b96d40b60
commit 3bbd6be2fa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 247 additions and 75 deletions

View File

@ -12,10 +12,18 @@ The plugin collects these metrics from `/proc/net/bonding/*` files.
## If not specified, then default is /proc
# host_proc = "/proc"
## Sets 'sys' directory path
## If not specified, then default is /sys
# host_sys = "/sys"
## By default, telegraf gathers stats for all bond interfaces
## Setting interfaces will restrict the stats to the specified
## bond interfaces.
# bond_interfaces = ["bond0"]
## Tries to collect additional bond details from /sys/class/net/{bond}
## currently only useful for LACP (mode 4) bonds
# collect_sys_details = false
```
## Measurements & Fields
@ -28,22 +36,30 @@ The plugin collects these metrics from `/proc/net/bonding/*` files.
- failures
- status
- count
- actor_churned (for LACP bonds)
- partner_churned (for LACP bonds)
- total_churned (for LACP bonds)
- bond_sys
- slave_count
- ad_port_count
## Description
```shell
active_slave
Currently active slave interface for active-backup mode.
status
Status of bond interface or bonds's slave interface (down = 0, up = 1).
failures
Amount of failures for bond's slave interface.
count
Number of slaves attached to bond
```
- active_slave
- Currently active slave interface for active-backup mode.
- status
- Status of bond interface or bond's slave interface (down = 0, up = 1).
- failures
- Number of failures for bond's slave interface.
- count
- Number of slaves attached to bond
- actor_churned
- number of times local end of LACP bond flapped
- partner_churned
- number of times remote end of LACP bond flapped
- total_churned
- full count of all churn events
## Tags
@ -54,6 +70,10 @@ count
- bond
- interface
- bond_sys
- bond
- mode
## Example output
Configuration:
@ -72,12 +92,14 @@ Configuration:
Run:
```bash
```shell
telegraf --config telegraf.conf --input-filter bond --test
```
Output:
```bash
```shell
* Plugin: inputs.bond, Collection 1
> bond,bond=bond1,host=local active_slave="eth0",status=1i 1509704525000000000

View File

@ -14,24 +14,44 @@ import (
// default host proc path
const defaultHostProc = "/proc"
const defaultHostSys = "/sys"
// env host proc variable name
const envProc = "HOST_PROC"
const envSys = "HOST_SYS"
// Bond is the plugin state for the bond input: the configured paths and
// options, plus the bonding mode seen while parsing the current bond.
type Bond struct {
	// HostProc is the proc directory; bond files are read from
	// HostProc + "/net/bonding/<bond>".
	HostProc string `toml:"host_proc"`

	// HostSys is the sys directory; extra bond details are read from
	// HostSys + "/class/net/<bond>".
	HostSys string `toml:"host_sys"`

	// SysDetails turns on the collection of the additional details
	// found under the sys directory.
	SysDetails bool `toml:"collect_sys_details"`

	// BondInterfaces restricts the stats to the listed bond interfaces.
	BondInterfaces []string `toml:"bond_interfaces"`

	// BondType is the value of the "Bonding Mode" line of the proc file.
	// It is set while parsing and has no toml tag.
	BondType string
}
var sampleConfig = `
// sysFiles carries the whitespace-trimmed contents of the files read
// from a bond's "bonding" directory under sys.
type sysFiles struct {
	// ModeFile is the content of bonding/mode.
	ModeFile string

	// SlaveFile is the content of bonding/slaves.
	SlaveFile string

	// ADPortsFile is the content of bonding/ad_num_ports.
	ADPortsFile string
}
const sampleConfig = `
## Sets 'proc' directory path
## If not specified, then default is /proc
# host_proc = "/proc"
## Sets 'sys' directory path
## If not specified, then default is /sys
# host_sys = "/sys"
## By default, telegraf gather stats for all bond interfaces
## Setting interfaces will restrict the stats to the specified
## bond interfaces.
# bond_interfaces = ["bond0"]
## Tries to collect additional bond details from /sys/class/net/{bond}
## currently only useful for LACP (mode 4) bonds
# collect_sys_details = false
`
func (bond *Bond) Description() string {
@ -44,7 +64,7 @@ func (bond *Bond) SampleConfig() string {
func (bond *Bond) Gather(acc telegraf.Accumulator) error {
// load proc path, get default value if config value and env variable are empty
bond.loadPath()
bond.loadPaths()
// list bond interfaces from bonding directory or gather all interfaces.
bondNames, err := bond.listInterfaces()
if err != nil {
@ -54,13 +74,25 @@ func (bond *Bond) Gather(acc telegraf.Accumulator) error {
bondAbsPath := bond.HostProc + "/net/bonding/" + bondName
file, err := os.ReadFile(bondAbsPath)
if err != nil {
acc.AddError(fmt.Errorf("error inspecting '%s' interface: %v", bondAbsPath, err))
acc.AddError(fmt.Errorf("error inspecting %q interface: %v", bondAbsPath, err))
continue
}
rawFile := strings.TrimSpace(string(file))
err = bond.gatherBondInterface(bondName, rawFile, acc)
rawProcFile := strings.TrimSpace(string(file))
err = bond.gatherBondInterface(bondName, rawProcFile, acc)
if err != nil {
acc.AddError(fmt.Errorf("error inspecting '%s' interface: %v", bondName, err))
acc.AddError(fmt.Errorf("error inspecting %q interface: %v", bondName, err))
}
/*
Some details about bonds only exist in /sys/class/net/
In particular, LACP bonds track upstream port state here
*/
if bond.SysDetails {
files, err := bond.readSysFiles(bond.HostSys + "/class/net/" + bondName)
if err != nil {
acc.AddError(err)
}
bond.gatherSysDetails(bondName, files, acc)
}
}
return nil
@ -90,8 +122,14 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A
tags := map[string]string{
"bond": bondName,
}
scanner := bufio.NewScanner(strings.NewReader(rawFile))
/*
/proc/net/bonding/... files are formatted in a way that is difficult
to use regexes to parse. Because of that, we scan through
the file one line at a time and rely on specific lines to
mark "ends" of blocks. It's a hack that should be resolved,
but for now, it works.
*/
for scanner.Scan() {
line := scanner.Text()
stats := strings.Split(line, ":")
@ -100,6 +138,9 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A
}
name := strings.TrimSpace(stats[0])
value := strings.TrimSpace(stats[1])
if name == "Bonding Mode" {
bond.BondType = value
}
if strings.Contains(name, "Currently Active Slave") {
fields["active_slave"] = value
}
@ -118,10 +159,86 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A
return fmt.Errorf("Couldn't find status info for '%s' ", bondName)
}
// readSysFiles loads the bonding files found under bondDir and returns
// their whitespace-trimmed contents.
//
// bonding/mode and bonding/slaves are always read. bonding/ad_num_ports is
// read only when bond.BondType is "IEEE 802.3ad Dynamic link aggregation",
// so the result depends on the mode recorded before this call.
//
// On the first read failure an empty sysFiles is returned together with an
// error that names the offending file and wraps the underlying error.
func (bond *Bond) readSysFiles(bondDir string) (sysFiles, error) {
	/*
		Files we may need
		bonding/mode
		bonding/slaves
		bonding/ad_num_ports
		We load files here first to allow for easier testing
	*/

	// readTrimmed reads one file of the bonding directory and strips the
	// surrounding whitespace. The error is wrapped with %w so callers can
	// still inspect the cause with errors.Is / errors.As.
	readTrimmed := func(name string) (string, error) {
		fileName := bondDir + "/bonding/" + name
		raw, err := os.ReadFile(fileName)
		if err != nil {
			return "", fmt.Errorf("error inspecting %q interface: %w", fileName, err)
		}
		return strings.TrimSpace(string(raw)), nil
	}

	mode, err := readTrimmed("mode")
	if err != nil {
		return sysFiles{}, err
	}

	slaves, err := readTrimmed("slaves")
	if err != nil {
		return sysFiles{}, err
	}

	output := sysFiles{ModeFile: mode, SlaveFile: slaves}

	// The aggregator port count is only requested for LACP bonds.
	if bond.BondType == "IEEE 802.3ad Dynamic link aggregation" {
		adPorts, err := readTrimmed("ad_num_ports")
		if err != nil {
			return sysFiles{}, err
		}
		output.ADPortsFile = adPorts
	}

	return output, nil
}
// gatherSysDetails turns the contents of the bond's sys files into one
// "bond_sys" metric, tagged with the bond name and its operating mode.
//
// Fields:
//   - slave_count: number of non-empty, space-separated entries in
//     files.SlaveFile
//   - ad_port_count: integer parsed from files.ADPortsFile when the mode is
//     "802.3ad", otherwise the same value as slave_count
func (bond *Bond) gatherSysDetails(bondName string, files sysFiles, acc telegraf.Accumulator) {
	// The operating mode is whatever precedes the first space of the mode
	// file (the test sample looks like "802.3ad 5").
	modeName := files.ModeFile
	if cut := strings.Index(modeName, " "); cut >= 0 {
		modeName = modeName[:cut]
	}
	modeName = strings.TrimSpace(modeName)

	// Count the slaves the system expects, skipping the empty pieces that
	// repeated or trailing spaces produce.
	slaveCount := 0
	for _, entry := range strings.Split(files.SlaveFile, " ") {
		if entry == "" {
			continue
		}
		slaveCount++
	}

	// Outside of LACP mode the port count simply mirrors the slave count.
	portCount := slaveCount
	if modeName == "802.3ad" {
		// In LACP mode the count comes from the ad_num_ports file; the
		// conversion error is deliberately ignored (a failed conversion
		// is meant to be treated as 0 ports).
		portCount, _ = strconv.Atoi(strings.TrimSpace(files.ADPortsFile))
	}

	acc.AddFields(
		"bond_sys",
		map[string]interface{}{
			"slave_count":   slaveCount,
			"ad_port_count": portCount,
		},
		map[string]string{
			"bond": bondName,
			"mode": modeName,
		},
	)
}
func (bond *Bond) gatherSlavePart(bondName string, rawFile string, acc telegraf.Accumulator) error {
var slave string
var status int
var slaveCount int
tags := map[string]string{
"bond": bondName,
}
fields := map[string]interface{}{
"status": 0,
}
var scanPast bool
if bond.BondType == "IEEE 802.3ad Dynamic link aggregation" {
scanPast = true
}
scanner := bufio.NewScanner(strings.NewReader(rawFile))
for scanner.Scan() {
@ -133,48 +250,59 @@ func (bond *Bond) gatherSlavePart(bondName string, rawFile string, acc telegraf.
name := strings.TrimSpace(stats[0])
value := strings.TrimSpace(stats[1])
if strings.Contains(name, "Slave Interface") {
slave = value
tags["interface"] = value
slaveCount++
}
if strings.Contains(name, "MII Status") {
status = 0
if value == "up" {
status = 1
}
if strings.Contains(name, "MII Status") && value == "up" {
fields["status"] = 1
}
if strings.Contains(name, "Link Failure Count") {
count, err := strconv.Atoi(value)
if err != nil {
return err
}
fields := map[string]interface{}{
"status": status,
"failures": count,
fields["failures"] = count
if !scanPast {
acc.AddFields("bond_slave", fields, tags)
}
tags := map[string]string{
"bond": bondName,
"interface": slave,
}
if strings.Contains(name, "Actor Churned Count") {
count, err := strconv.Atoi(value)
if err != nil {
return err
}
fields["actor_churned"] = count
}
if strings.Contains(name, "Partner Churned Count") {
count, err := strconv.Atoi(value)
if err != nil {
return err
}
fields["partner_churned"] = count
fields["total_churned"] = fields["actor_churned"].(int) + fields["partner_churned"].(int)
acc.AddFields("bond_slave", fields, tags)
slaveCount++
}
}
fields := map[string]interface{}{
"count": slaveCount,
}
tags := map[string]string{
tags = map[string]string{
"bond": bondName,
}
fields = map[string]interface{}{
"count": slaveCount,
}
acc.AddFields("bond_slave", fields, tags)
return scanner.Err()
}
// loadPath can be used to read path firstly from config
// loadPaths fills in the proc and sys paths: a value set in the config is kept,
// and if it is empty the path is read from the matching env variable instead
func (bond *Bond) loadPath() {
func (bond *Bond) loadPaths() {
if bond.HostProc == "" {
bond.HostProc = proc(envProc, defaultHostProc)
}
if bond.HostSys == "" {
bond.HostSys = proc(envSys, defaultHostSys)
}
}
// proc can be used to read file paths from env

View File

@ -7,35 +7,7 @@ import (
"github.com/stretchr/testify/require"
)
var sampleTest802 = `
Ethernet Channel Bonding Driver: v3.5.0 (November 4, 2008)
Bonding Mode: IEEE 802.3ad Dynamic link aggregation
Transmit Hash Policy: layer2 (0)
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 0
Down Delay (ms): 0
802.3ad info
LACP rate: fast
Aggregator selection policy (ad_select): stable
bond bond0 has no active aggregator
Slave Interface: eth1
MII Status: up
Link Failure Count: 0
Permanent HW addr: 00:0c:29:f5:b7:11
Aggregator ID: N/A
Slave Interface: eth2
MII Status: up
Link Failure Count: 3
Permanent HW addr: 00:0c:29:f5:b7:1b
Aggregator ID: N/A
`
var sampleTestAB = `
const sampleTestAB = `
Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009)
Bonding Mode: fault-tolerance (active-backup)
@ -62,18 +34,68 @@ Link Failure Count: 0
Permanent HW addr:
`
const sampleTestLACP = `
Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
Bonding Mode: IEEE 802.3ad Dynamic link aggregation
Transmit Hash Policy: layer2 (0)
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 0
Down Delay (ms): 0
802.3ad info
LACP rate: fast
Min links: 0
Aggregator selection policy (ad_select): stable
Slave Interface: eth0
MII Status: up
Speed: 10000 Mbps
Duplex: full
Link Failure Count: 2
Permanent HW addr: 3c:ec:ef:5e:71:58
Slave queue ID: 0
Aggregator ID: 2
Actor Churn State: none
Partner Churn State: none
Actor Churned Count: 2
Partner Churned Count: 0
Slave Interface: eth1
MII Status: up
Speed: 10000 Mbps
Duplex: full
Link Failure Count: 1
Permanent HW addr: 3c:ec:ef:5e:71:59
Slave queue ID: 0
Aggregator ID: 2
Actor Churn State: none
Partner Churn State: none
Actor Churned Count: 0
Partner Churned Count: 0
`
const sampleSysMode = "802.3ad 5"
const sampleSysSlaves = "eth0 eth1 "
const sampleSysAdPorts = " 2 "
func TestGatherBondInterface(t *testing.T) {
var acc testutil.Accumulator
bond := &Bond{}
require.NoError(t, bond.gatherBondInterface("bond802", sampleTest802, &acc))
acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"status": 1}, map[string]string{"bond": "bond802"})
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 0, "status": 1}, map[string]string{"bond": "bond802", "interface": "eth1"})
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 3, "status": 1}, map[string]string{"bond": "bond802", "interface": "eth2"})
require.NoError(t, bond.gatherBondInterface("bondAB", sampleTestAB, &acc))
acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"active_slave": "eth2", "status": 1}, map[string]string{"bond": "bondAB"})
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 2, "status": 0}, map[string]string{"bond": "bondAB", "interface": "eth3"})
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 0, "status": 1}, map[string]string{"bond": "bondAB", "interface": "eth2"})
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"count": 2}, map[string]string{"bond": "bondAB"})
acc = testutil.Accumulator{}
require.NoError(t, bond.gatherBondInterface("bondLACP", sampleTestLACP, &acc))
bond.gatherSysDetails("bondLACP", sysFiles{ModeFile: sampleSysMode, SlaveFile: sampleSysSlaves, ADPortsFile: sampleSysAdPorts}, &acc)
acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"status": 1}, map[string]string{"bond": "bondLACP"})
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 2, "status": 1, "actor_churned": 2, "partner_churned": 0, "total_churned": 2}, map[string]string{"bond": "bondLACP", "interface": "eth0"})
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 1, "status": 1, "actor_churned": 0, "partner_churned": 0, "total_churned": 0}, map[string]string{"bond": "bondLACP", "interface": "eth1"})
acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"count": 2}, map[string]string{"bond": "bondLACP"})
acc.AssertContainsTaggedFields(t, "bond_sys", map[string]interface{}{"slave_count": 2, "ad_port_count": 2}, map[string]string{"bond": "bondLACP", "mode": "802.3ad"})
}