feat(inputs.infiniband): Add support for RDMA counters (#16336)

Co-authored-by: zekrii <ishay.zekri@dell.com>
This commit is contained in:
Ishay Zekri 2025-02-04 17:58:22 +02:00 committed by GitHub
parent 7f1c5d31aa
commit 3df52afe39
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 225 additions and 5 deletions

View File

@ -3,6 +3,8 @@
This plugin gathers statistics for all InfiniBand devices and ports on the This plugin gathers statistics for all InfiniBand devices and ports on the
system. These are the counters that can be found in system. These are the counters that can be found in
`/sys/class/infiniband/<dev>/ports/<port>/counters/` `/sys/class/infiniband/<dev>/ports/<port>/counters/`
and RDMA counters can be found in
`/sys/class/infiniband/<dev>/ports/<port>/hw_counters/`
**Supported Platforms**: Linux **Supported Platforms**: Linux
@ -22,6 +24,9 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
# This plugin ONLY supports Linux # This plugin ONLY supports Linux
[[inputs.infiniband]] [[inputs.infiniband]]
# no configuration # no configuration
## Collect RDMA counters
# gather_rdma = false
``` ```
## Metrics ## Metrics
@ -29,15 +34,20 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
Actual metrics depend on the InfiniBand devices, the plugin uses a simple Actual metrics depend on the InfiniBand devices, the plugin uses a simple
mapping from counter -> counter value. mapping from counter -> counter value.
[Information about the counters][counters] collected is provided by Mellanox. [Information about the counters][counters] collected is provided by Nvidia.
[counters]: https://community.mellanox.com/s/article/understanding-mlx5-linux-counters-and-status-parameters [counters]: https://enterprise-support.nvidia.com/s/article/understanding-mlx5-linux-counters-and-status-parameters
The following fields are emitted by the plugin; the RDMA counters are only collected when `gather_rdma` is enabled:
- infiniband - infiniband
- tags: - tags:
- device - device
- port - port
- fields: - fields:
### Infiniband Counters
- excessive_buffer_overrun_errors (integer) - excessive_buffer_overrun_errors (integer)
- link_downed (integer) - link_downed (integer)
- link_error_recovery (integer) - link_error_recovery (integer)
@ -60,8 +70,41 @@ mapping from counter -> counter value.
- unicast_xmit_packets (integer) - unicast_xmit_packets (integer)
- VL15_dropped (integer) - VL15_dropped (integer)
### Infiniband RDMA Counters
- duplicate_request (integer)
- implied_nak_seq_err (integer)
- lifespan (integer)
- local_ack_timeout_err (integer)
- np_cnp_sent (integer)
- np_ecn_marked_roce_packets (integer)
- out_of_buffer (integer)
- out_of_sequence (integer)
- packet_seq_err (integer)
- req_cqe_error (integer)
- req_cqe_flush_error (integer)
- req_remote_access_errors (integer)
- req_remote_invalid_request (integer)
- resp_cqe_error (integer)
- resp_cqe_flush_error (integer)
- resp_local_length_error (integer)
- resp_remote_access_errors (integer)
- rnr_nak_retry_err (integer)
- roce_adp_retrans (integer)
- roce_adp_retrans_to (integer)
- roce_slow_restart (integer)
- roce_slow_restart_cnps (integer)
- roce_slow_restart_trans (integer)
- rp_cnp_handled (integer)
- rp_cnp_ignored (integer)
- rx_atomic_requests (integer)
- rx_icrc_encapsulated (integer)
- rx_read_requests (integer)
- rx_write_requests (integer)
## Example Output ## Example Output
```text ```text
infiniband,device=mlx5_0,port=1 VL15_dropped=0i,excessive_buffer_overrun_errors=0i,link_downed=0i,link_error_recovery=0i,local_link_integrity_errors=0i,multicast_rcv_packets=0i,multicast_xmit_packets=0i,port_rcv_constraint_errors=0i,port_rcv_data=237159415345822i,port_rcv_errors=0i,port_rcv_packets=801977655075i,port_rcv_remote_physical_errors=0i,port_rcv_switch_relay_errors=0i,port_xmit_constraint_errors=0i,port_xmit_data=238334949937759i,port_xmit_discards=0i,port_xmit_packets=803162651391i,port_xmit_wait=4294967295i,symbol_error=0i,unicast_rcv_packets=801977655075i,unicast_xmit_packets=803162651391i 1573125558000000000 infiniband,device=mlx5_bond_0,host=hop-r640-12,port=1 port_xmit_data=85378896588i,VL15_dropped=0i,port_rcv_packets=34914071i,port_rcv_data=34600185253i,port_xmit_discards=0i,link_downed=0i,local_link_integrity_errors=0i,symbol_error=0i,link_error_recovery=0i,multicast_rcv_packets=0i,multicast_xmit_packets=0i,unicast_xmit_packets=82002535i,excessive_buffer_overrun_errors=0i,port_rcv_switch_relay_errors=0i,unicast_rcv_packets=34914071i,port_xmit_constraint_errors=0i,port_rcv_errors=0i,port_xmit_wait=0i,port_rcv_remote_physical_errors=0i,port_rcv_constraint_errors=0i,port_xmit_packets=82002535i 1737652060000000000
infiniband,device=mlx5_bond_0,host=hop-r640-12,port=1 local_ack_timeout_err=0i,lifespan=10i,out_of_buffer=0i,resp_remote_access_errors=0i,resp_local_length_error=0i,np_cnp_sent=0i,roce_slow_restart=0i,rx_read_requests=6000i,duplicate_request=0i,resp_cqe_error=0i,rx_write_requests=19000i,roce_slow_restart_cnps=0i,rx_icrc_encapsulated=0i,rnr_nak_retry_err=0i,roce_adp_retrans=0i,out_of_sequence=0i,req_remote_access_errors=0i,roce_slow_restart_trans=0i,req_remote_invalid_request=0i,req_cqe_error=0i,resp_cqe_flush_error=0i,packet_seq_err=0i,roce_adp_retrans_to=0i,np_ecn_marked_roce_packets=0i,rp_cnp_handled=0i,implied_nak_seq_err=0i,rp_cnp_ignored=0i,req_cqe_flush_error=0i,rx_atomic_requests=0i 1737652060000000000
``` ```

View File

@ -12,7 +12,8 @@ import (
var sampleConfig string var sampleConfig string
type Infiniband struct { type Infiniband struct {
Log telegraf.Logger `toml:"-"` RDMA bool `toml:"gather_rdma"`
Log telegraf.Logger `toml:"-"`
} }
func (*Infiniband) SampleConfig() string { func (*Infiniband) SampleConfig() string {

View File

@ -12,7 +12,7 @@ import (
) )
// Gather statistics from our infiniband cards // Gather statistics from our infiniband cards
func (*Infiniband) Gather(acc telegraf.Accumulator) error { func (ib *Infiniband) Gather(acc telegraf.Accumulator) error {
rdmaDevices := rdmamap.GetRdmaDeviceList() rdmaDevices := rdmamap.GetRdmaDeviceList()
if len(rdmaDevices) == 0 { if len(rdmaDevices) == 0 {
@ -33,6 +33,15 @@ func (*Infiniband) Gather(acc telegraf.Accumulator) error {
} }
addStats(dev, port, stats, acc) addStats(dev, port, stats, acc)
if ib.RDMA {
stats, err := rdmamap.GetRdmaSysfsHwStats(dev, portInt)
if err != nil {
continue
}
addStats(dev, port, stats, acc)
}
} }
} }

View File

@ -133,3 +133,167 @@ func TestInfiniband(t *testing.T) {
acc.AssertContainsTaggedFields(t, "infiniband", fields, tags) acc.AssertContainsTaggedFields(t, "infiniband", fields, tags)
} }
// TestInfinibandRDMA passes a fixed list of RDMA hardware counter entries to
// addStats for device "m1x5_0", port "1", and checks that the accumulator
// holds an "infiniband" metric carrying those counters as fields together
// with the matching device/port tags.
func TestInfinibandRDMA(t *testing.T) {
	// Input fixture: one entry per RDMA hardware counter.
	hwStats := []rdmamap.RdmaStatEntry{
		{Name: "duplicate_request", Value: uint64(0)},
		{Name: "implied_nak_seq_err", Value: uint64(0)},
		{Name: "lifespan", Value: uint64(10)},
		{Name: "local_ack_timeout_err", Value: uint64(38)},
		{Name: "np_cnp_sent", Value: uint64(10284520)},
		{Name: "np_ecn_marked_roce_packets", Value: uint64(286733949)},
		{Name: "out_of_buffer", Value: uint64(1149772)},
		{Name: "out_of_sequence", Value: uint64(44)},
		{Name: "packet_seq_err", Value: uint64(1)},
		{Name: "req_cqe_error", Value: uint64(10776)},
		{Name: "req_cqe_flush_error", Value: uint64(2173)},
		{Name: "req_remote_access_errors", Value: uint64(0)},
		{Name: "req_remote_invalid_request", Value: uint64(0)},
		{Name: "resp_cqe_error", Value: uint64(759)},
		{Name: "resp_cqe_flush_error", Value: uint64(759)},
		{Name: "resp_local_length_error", Value: uint64(0)},
		{Name: "resp_remote_access_errors", Value: uint64(0)},
		{Name: "rnr_nak_retry_err", Value: uint64(0)},
		{Name: "roce_adp_retrans", Value: uint64(0)},
		{Name: "roce_adp_retrans_to", Value: uint64(0)},
		{Name: "roce_slow_restart", Value: uint64(0)},
		{Name: "roce_slow_restart_cnps", Value: uint64(0)},
		{Name: "roce_slow_restart_trans", Value: uint64(0)},
		{Name: "rp_cnp_handled", Value: uint64(1)},
		{Name: "rp_cnp_ignored", Value: uint64(0)},
		{Name: "rx_atomic_requests", Value: uint64(0)},
		{Name: "rx_icrc_encapsulated", Value: uint64(0)},
		{Name: "rx_read_requests", Value: uint64(488228)},
		{Name: "rx_write_requests", Value: uint64(3928699)},
	}

	// Expected tags and fields, spelled out independently of the fixture so
	// the assertion does not simply echo its own input.
	expectedTags := map[string]string{"device": "m1x5_0", "port": "1"}
	expectedFields := map[string]interface{}{
		"duplicate_request":          uint64(0),
		"implied_nak_seq_err":        uint64(0),
		"lifespan":                   uint64(10),
		"local_ack_timeout_err":      uint64(38),
		"np_cnp_sent":                uint64(10284520),
		"np_ecn_marked_roce_packets": uint64(286733949),
		"out_of_buffer":              uint64(1149772),
		"out_of_sequence":            uint64(44),
		"packet_seq_err":             uint64(1),
		"req_cqe_error":              uint64(10776),
		"req_cqe_flush_error":        uint64(2173),
		"req_remote_access_errors":   uint64(0),
		"req_remote_invalid_request": uint64(0),
		"resp_cqe_error":             uint64(759),
		"resp_cqe_flush_error":       uint64(759),
		"resp_local_length_error":    uint64(0),
		"resp_remote_access_errors":  uint64(0),
		"rnr_nak_retry_err":          uint64(0),
		"roce_adp_retrans":           uint64(0),
		"roce_adp_retrans_to":        uint64(0),
		"roce_slow_restart":          uint64(0),
		"roce_slow_restart_cnps":     uint64(0),
		"roce_slow_restart_trans":    uint64(0),
		"rp_cnp_handled":             uint64(1),
		"rp_cnp_ignored":             uint64(0),
		"rx_atomic_requests":         uint64(0),
		"rx_icrc_encapsulated":       uint64(0),
		"rx_read_requests":           uint64(488228),
		"rx_write_requests":          uint64(3928699),
	}

	var acc testutil.Accumulator
	addStats("m1x5_0", "1", hwStats, &acc)

	acc.AssertContainsTaggedFields(t, "infiniband", expectedFields, expectedTags)
}

View File

@ -2,3 +2,6 @@
# This plugin ONLY supports Linux # This plugin ONLY supports Linux
[[inputs.infiniband]] [[inputs.infiniband]]
# no configuration # no configuration
## Collect RDMA counters
# gather_rdma = false