diff --git a/plugins/inputs/bond/README.md b/plugins/inputs/bond/README.md index 9227df2ba..517671f7e 100644 --- a/plugins/inputs/bond/README.md +++ b/plugins/inputs/bond/README.md @@ -12,10 +12,18 @@ The plugin collects these metrics from `/proc/net/bonding/*` files. ## If not specified, then default is /proc # host_proc = "/proc" + ## Sets 'sys' directory path + ## If not specified, then default is /sys + # host_sys = "/sys" + ## By default, telegraf gather stats for all bond interfaces ## Setting interfaces will restrict the stats to the specified ## bond interfaces. # bond_interfaces = ["bond0"] + + ## Tries to collect additional bond details from /sys/class/net/{bond} + ## currently only useful for LACP (mode 4) bonds + # collect_sys_details = false ``` ## Measurements & Fields @@ -28,22 +36,30 @@ The plugin collects these metrics from `/proc/net/bonding/*` files. - failures - status - count + - actor_churned (for LACP bonds) + - partner_churned (for LACP bonds) + - total_churned (for LACP bonds) + +- bond_sys + - slave_count + - ad_port_count ## Description -```shell -active_slave - Currently active slave interface for active-backup mode. - -status - Status of bond interface or bonds's slave interface (down = 0, up = 1). - -failures - Amount of failures for bond's slave interface. - -count - Number of slaves attached to bond -``` +- active_slave + - Currently active slave interface for active-backup mode. +- status + - Status of bond interface or bond's slave interface (down = 0, up = 1). +- failures + - Amount of failures for bond's slave interface.
+- count + - Number of slaves attached to bond +- actor_churned + - number of times local end of LACP bond flapped +- partner_churned + - number of times remote end of LACP bond flapped +- total_churned + - full count of all churn events ## Tags @@ -54,6 +70,10 @@ count - bond - interface +- bond_sys + - bond + - mode + ## Example output Configuration: @@ -72,12 +92,14 @@ Configuration: Run: +```bash ```shell telegraf --config telegraf.conf --input-filter bond --test ``` Output: +```bash ```shell * Plugin: inputs.bond, Collection 1 > bond,bond=bond1,host=local active_slave="eth0",status=1i 1509704525000000000 diff --git a/plugins/inputs/bond/bond.go b/plugins/inputs/bond/bond.go index 4f30a20e3..de6f3602b 100644 --- a/plugins/inputs/bond/bond.go +++ b/plugins/inputs/bond/bond.go @@ -14,24 +14,44 @@ import ( // default host proc path const defaultHostProc = "/proc" +const defaultHostSys = "/sys" // env host proc variable name const envProc = "HOST_PROC" +const envSys = "HOST_SYS" type Bond struct { HostProc string `toml:"host_proc"` + HostSys string `toml:"host_sys"` + SysDetails bool `toml:"collect_sys_details"` BondInterfaces []string `toml:"bond_interfaces"` + BondType string } -var sampleConfig = ` +type sysFiles struct { + ModeFile string + SlaveFile string + ADPortsFile string +} + +const sampleConfig = ` ## Sets 'proc' directory path ## If not specified, then default is /proc # host_proc = "/proc" + ## Sets 'sys' directory path + ## If not specified, then default is /sys + # host_sys = "/sys" + ## By default, telegraf gather stats for all bond interfaces ## Setting interfaces will restrict the stats to the specified ## bond interfaces. 
# bond_interfaces = ["bond0"] + + ## Tries to collect additional bond details from /sys/class/net/{bond} + ## currently only useful for LACP (mode 4) bonds + # collect_sys_details = false + ` func (bond *Bond) Description() string { @@ -44,7 +64,7 @@ func (bond *Bond) SampleConfig() string { func (bond *Bond) Gather(acc telegraf.Accumulator) error { // load proc path, get default value if config value and env variable are empty - bond.loadPath() + bond.loadPaths() // list bond interfaces from bonding directory or gather all interfaces. bondNames, err := bond.listInterfaces() if err != nil { @@ -54,13 +74,25 @@ func (bond *Bond) Gather(acc telegraf.Accumulator) error { bondAbsPath := bond.HostProc + "/net/bonding/" + bondName file, err := os.ReadFile(bondAbsPath) if err != nil { - acc.AddError(fmt.Errorf("error inspecting '%s' interface: %v", bondAbsPath, err)) + acc.AddError(fmt.Errorf("error inspecting %q interface: %v", bondAbsPath, err)) continue } - rawFile := strings.TrimSpace(string(file)) - err = bond.gatherBondInterface(bondName, rawFile, acc) + rawProcFile := strings.TrimSpace(string(file)) + err = bond.gatherBondInterface(bondName, rawProcFile, acc) if err != nil { - acc.AddError(fmt.Errorf("error inspecting '%s' interface: %v", bondName, err)) + acc.AddError(fmt.Errorf("error inspecting %q interface: %v", bondName, err)) + } + + /* + Some details about bonds only exist in /sys/class/net/ + In particular, LACP bonds track upstream port state here + */ + if bond.SysDetails { + files, err := bond.readSysFiles(bond.HostSys + "/class/net/" + bondName) + if err != nil { + acc.AddError(err) + } + bond.gatherSysDetails(bondName, files, acc) } } return nil @@ -90,8 +122,14 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A tags := map[string]string{ "bond": bondName, } - scanner := bufio.NewScanner(strings.NewReader(rawFile)) + /* + /proc/net/bonding/... files are formatted in a way that is difficult + to use regexes to parse. 
Because of that, we scan through + the file one line at a time and rely on specific lines to + mark "ends" of blocks. It's a hack that should be resolved, + but for now, it works. + */ for scanner.Scan() { line := scanner.Text() stats := strings.Split(line, ":") @@ -100,6 +138,9 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A } name := strings.TrimSpace(stats[0]) value := strings.TrimSpace(stats[1]) + if name == "Bonding Mode" { + bond.BondType = value + } if strings.Contains(name, "Currently Active Slave") { fields["active_slave"] = value } @@ -118,10 +159,86 @@ func (bond *Bond) gatherBondPart(bondName string, rawFile string, acc telegraf.A return fmt.Errorf("Couldn't find status info for '%s' ", bondName) } +func (bond *Bond) readSysFiles(bondDir string) (sysFiles, error) { + /* + Files we may need + bonding/mode + bonding/slaves + bonding/ad_num_ports + + We load files here first to allow for easier testing + */ + var output sysFiles + + file, err := os.ReadFile(bondDir + "/bonding/mode") + if err != nil { + return sysFiles{}, fmt.Errorf("error inspecting %q interface: %v", bondDir+"/bonding/mode", err) + } + output.ModeFile = strings.TrimSpace(string(file)) + file, err = os.ReadFile(bondDir + "/bonding/slaves") + if err != nil { + return sysFiles{}, fmt.Errorf("error inspecting %q interface: %v", bondDir+"/bonding/slaves", err) + } + output.SlaveFile = strings.TrimSpace(string(file)) + if bond.BondType == "IEEE 802.3ad Dynamic link aggregation" { + file, err = os.ReadFile(bondDir + "/bonding/ad_num_ports") + if err != nil { + return sysFiles{}, fmt.Errorf("error inspecting %q interface: %v", bondDir+"/bonding/ad_num_ports", err) + } + output.ADPortsFile = strings.TrimSpace(string(file)) + } + return output, nil +} + +func (bond *Bond) gatherSysDetails(bondName string, files sysFiles, acc telegraf.Accumulator) { + var slaves []string + var adPortCount int + + // To start with, we get the bond operating mode + mode := 
strings.TrimSpace(strings.Split(files.ModeFile, " ")[0]) + + tags := map[string]string{ + "bond": bondName, + "mode": mode, + } + + // Next we collect the number of bond slaves the system expects + slavesTmp := strings.Split(files.SlaveFile, " ") + for _, slave := range slavesTmp { + if slave != "" { + slaves = append(slaves, slave) + } + } + if mode == "802.3ad" { + /* + If we're in LACP mode, we should check on how the bond ports are + interacting with the upstream switch ports + a failed conversion can be treated as 0 ports + */ + adPortCount, _ = strconv.Atoi(strings.TrimSpace(files.ADPortsFile)) + } else { + adPortCount = len(slaves) + } + + fields := map[string]interface{}{ + "slave_count": len(slaves), + "ad_port_count": adPortCount, + } + acc.AddFields("bond_sys", fields, tags) +} + func (bond *Bond) gatherSlavePart(bondName string, rawFile string, acc telegraf.Accumulator) error { - var slave string - var status int var slaveCount int + tags := map[string]string{ + "bond": bondName, + } + fields := map[string]interface{}{ + "status": 0, + } + var scanPast bool + if bond.BondType == "IEEE 802.3ad Dynamic link aggregation" { + scanPast = true + } scanner := bufio.NewScanner(strings.NewReader(rawFile)) for scanner.Scan() { @@ -133,48 +250,59 @@ func (bond *Bond) gatherSlavePart(bondName string, rawFile string, acc telegraf. 
name := strings.TrimSpace(stats[0]) value := strings.TrimSpace(stats[1]) if strings.Contains(name, "Slave Interface") { - slave = value + tags["interface"] = value + slaveCount++ } - if strings.Contains(name, "MII Status") { - status = 0 - if value == "up" { - status = 1 - } + if strings.Contains(name, "MII Status") && value == "up" { + fields["status"] = 1 } if strings.Contains(name, "Link Failure Count") { count, err := strconv.Atoi(value) if err != nil { return err } - fields := map[string]interface{}{ - "status": status, - "failures": count, + fields["failures"] = count + if !scanPast { + acc.AddFields("bond_slave", fields, tags) } - tags := map[string]string{ - "bond": bondName, - "interface": slave, + } + if strings.Contains(name, "Actor Churned Count") { + count, err := strconv.Atoi(value) + if err != nil { + return err } + fields["actor_churned"] = count + } + if strings.Contains(name, "Partner Churned Count") { + count, err := strconv.Atoi(value) + if err != nil { + return err + } + fields["partner_churned"] = count + fields["total_churned"] = fields["actor_churned"].(int) + fields["partner_churned"].(int) acc.AddFields("bond_slave", fields, tags) - slaveCount++ } } - fields := map[string]interface{}{ - "count": slaveCount, - } - tags := map[string]string{ + tags = map[string]string{ "bond": bondName, } + fields = map[string]interface{}{ + "count": slaveCount, + } acc.AddFields("bond_slave", fields, tags) return scanner.Err() } -// loadPath can be used to read path firstly from config +// loadPaths can be used to read path firstly from config // if it is empty then try read from env variable -func (bond *Bond) loadPath() { +func (bond *Bond) loadPaths() { if bond.HostProc == "" { bond.HostProc = proc(envProc, defaultHostProc) } + if bond.HostSys == "" { + bond.HostSys = proc(envSys, defaultHostSys) + } } // proc can be used to read file paths from env diff --git a/plugins/inputs/bond/bond_test.go b/plugins/inputs/bond/bond_test.go index 
8dc24f4ca..838f4c465 100644 --- a/plugins/inputs/bond/bond_test.go +++ b/plugins/inputs/bond/bond_test.go @@ -7,35 +7,7 @@ import ( "github.com/stretchr/testify/require" ) -var sampleTest802 = ` -Ethernet Channel Bonding Driver: v3.5.0 (November 4, 2008) - -Bonding Mode: IEEE 802.3ad Dynamic link aggregation -Transmit Hash Policy: layer2 (0) -MII Status: up -MII Polling Interval (ms): 100 -Up Delay (ms): 0 -Down Delay (ms): 0 - -802.3ad info -LACP rate: fast -Aggregator selection policy (ad_select): stable -bond bond0 has no active aggregator - -Slave Interface: eth1 -MII Status: up -Link Failure Count: 0 -Permanent HW addr: 00:0c:29:f5:b7:11 -Aggregator ID: N/A - -Slave Interface: eth2 -MII Status: up -Link Failure Count: 3 -Permanent HW addr: 00:0c:29:f5:b7:1b -Aggregator ID: N/A -` - -var sampleTestAB = ` +const sampleTestAB = ` Ethernet Channel Bonding Driver: v3.6.0 (September 26, 2009) Bonding Mode: fault-tolerance (active-backup) @@ -62,18 +34,68 @@ Link Failure Count: 0 Permanent HW addr: ` +const sampleTestLACP = ` +Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011) + +Bonding Mode: IEEE 802.3ad Dynamic link aggregation +Transmit Hash Policy: layer2 (0) +MII Status: up +MII Polling Interval (ms): 100 +Up Delay (ms): 0 +Down Delay (ms): 0 + +802.3ad info +LACP rate: fast +Min links: 0 +Aggregator selection policy (ad_select): stable + +Slave Interface: eth0 +MII Status: up +Speed: 10000 Mbps +Duplex: full +Link Failure Count: 2 +Permanent HW addr: 3c:ec:ef:5e:71:58 +Slave queue ID: 0 +Aggregator ID: 2 +Actor Churn State: none +Partner Churn State: none +Actor Churned Count: 2 +Partner Churned Count: 0 + +Slave Interface: eth1 +MII Status: up +Speed: 10000 Mbps +Duplex: full +Link Failure Count: 1 +Permanent HW addr: 3c:ec:ef:5e:71:59 +Slave queue ID: 0 +Aggregator ID: 2 +Actor Churn State: none +Partner Churn State: none +Actor Churned Count: 0 +Partner Churned Count: 0 +` + +const sampleSysMode = "802.3ad 5" +const sampleSysSlaves = "eth0 eth1 " 
+const sampleSysAdPorts = " 2 " + func TestGatherBondInterface(t *testing.T) { var acc testutil.Accumulator bond := &Bond{} - require.NoError(t, bond.gatherBondInterface("bond802", sampleTest802, &acc)) - acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"status": 1}, map[string]string{"bond": "bond802"}) - acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 0, "status": 1}, map[string]string{"bond": "bond802", "interface": "eth1"}) - acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 3, "status": 1}, map[string]string{"bond": "bond802", "interface": "eth2"}) - require.NoError(t, bond.gatherBondInterface("bondAB", sampleTestAB, &acc)) acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"active_slave": "eth2", "status": 1}, map[string]string{"bond": "bondAB"}) acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 2, "status": 0}, map[string]string{"bond": "bondAB", "interface": "eth3"}) acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 0, "status": 1}, map[string]string{"bond": "bondAB", "interface": "eth2"}) acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"count": 2}, map[string]string{"bond": "bondAB"}) + + acc = testutil.Accumulator{} + require.NoError(t, bond.gatherBondInterface("bondLACP", sampleTestLACP, &acc)) + bond.gatherSysDetails("bondLACP", sysFiles{ModeFile: sampleSysMode, SlaveFile: sampleSysSlaves, ADPortsFile: sampleSysAdPorts}, &acc) + acc.AssertContainsTaggedFields(t, "bond", map[string]interface{}{"status": 1}, map[string]string{"bond": "bondLACP"}) + acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 2, "status": 1, "actor_churned": 2, "partner_churned": 0, "total_churned": 2}, map[string]string{"bond": "bondLACP", "interface": "eth0"}) + acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"failures": 1, "status": 1, 
"actor_churned": 0, "partner_churned": 0, "total_churned": 0}, map[string]string{"bond": "bondLACP", "interface": "eth1"}) + acc.AssertContainsTaggedFields(t, "bond_slave", map[string]interface{}{"count": 2}, map[string]string{"bond": "bondLACP"}) + acc.AssertContainsTaggedFields(t, "bond_sys", map[string]interface{}{"slave_count": 2, "ad_port_count": 2}, map[string]string{"bond": "bondLACP", "mode": "802.3ad"}) }