diff --git a/plugins/inputs/infiniband/README.md b/plugins/inputs/infiniband/README.md index 3c009abd9..7deb92521 100644 --- a/plugins/inputs/infiniband/README.md +++ b/plugins/inputs/infiniband/README.md @@ -3,6 +3,8 @@ This plugin gathers statistics for all InfiniBand devices and ports on the system. These are the counters that can be found in `/sys/class/infiniband/<dev>/ports/<port>/counters/` +and RDMA counters can be found in +`/sys/class/infiniband/<dev>/ports/<port>/hw_counters/` **Supported Platforms**: Linux @@ -22,6 +24,9 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. # This plugin ONLY supports Linux [[inputs.infiniband]] # no configuration + + ## Collect RDMA counters + # gather_rdma = false ``` ## Metrics @@ -29,15 +34,20 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. Actual metrics depend on the InfiniBand devices, the plugin uses a simple mapping from counter -> counter value. -[Information about the counters][counters] collected is provided by Mellanox. +[Information about the counters][counters] collected is provided by Nvidia. -[counters]: https://community.mellanox.com/s/article/understanding-mlx5-linux-counters-and-status-parameters +[counters]: https://enterprise-support.nvidia.com/s/article/understanding-mlx5-linux-counters-and-status-parameters + +The following fields are emitted by the plugin when selecting `counters`: - infiniband - tags: - device - port - fields: + + ### Infiniband Counters + - excessive_buffer_overrun_errors (integer) - link_downed (integer) - link_error_recovery (integer) @@ -60,8 +70,41 @@ mapping from counter -> counter value. 
- unicast_xmit_packets (integer) - VL15_dropped (integer) + ### Infiniband RDMA counters + + - duplicate_request (integer) + - implied_nak_seq_err (integer) + - lifespan (integer) + - local_ack_timeout_err (integer) + - np_cnp_sent (integer) + - np_ecn_marked_roce_packets (integer) + - out_of_buffer (integer) + - out_of_sequence (integer) + - packet_seq_err (integer) + - req_cqe_error (integer) + - req_cqe_flush_error (integer) + - req_remote_access_errors (integer) + - req_remote_invalid_request (integer) + - resp_cqe_error (integer) + - resp_cqe_flush_error (integer) + - resp_local_length_error (integer) + - resp_remote_access_errors (integer) + - rnr_nak_retry_err (integer) + - roce_adp_retrans (integer) + - roce_adp_retrans_to (integer) + - roce_slow_restart (integer) + - roce_slow_restart_cnps (integer) + - roce_slow_restart_trans (integer) + - rp_cnp_handled (integer) + - rp_cnp_ignored (integer) + - rx_atomic_requests (integer) + - rx_icrc_encapsulated (integer) + - rx_read_requests (integer) + - rx_write_requests (integer) + ## Example Output ```text -infiniband,device=mlx5_0,port=1 VL15_dropped=0i,excessive_buffer_overrun_errors=0i,link_downed=0i,link_error_recovery=0i,local_link_integrity_errors=0i,multicast_rcv_packets=0i,multicast_xmit_packets=0i,port_rcv_constraint_errors=0i,port_rcv_data=237159415345822i,port_rcv_errors=0i,port_rcv_packets=801977655075i,port_rcv_remote_physical_errors=0i,port_rcv_switch_relay_errors=0i,port_xmit_constraint_errors=0i,port_xmit_data=238334949937759i,port_xmit_discards=0i,port_xmit_packets=803162651391i,port_xmit_wait=4294967295i,symbol_error=0i,unicast_rcv_packets=801977655075i,unicast_xmit_packets=803162651391i 1573125558000000000 +infiniband,device=mlx5_bond_0,host=hop-r640-12,port=1 
port_xmit_data=85378896588i,VL15_dropped=0i,port_rcv_packets=34914071i,port_rcv_data=34600185253i,port_xmit_discards=0i,link_downed=0i,local_link_integrity_errors=0i,symbol_error=0i,link_error_recovery=0i,multicast_rcv_packets=0i,multicast_xmit_packets=0i,unicast_xmit_packets=82002535i,excessive_buffer_overrun_errors=0i,port_rcv_switch_relay_errors=0i,unicast_rcv_packets=34914071i,port_xmit_constraint_errors=0i,port_rcv_errors=0i,port_xmit_wait=0i,port_rcv_remote_physical_errors=0i,port_rcv_constraint_errors=0i,port_xmit_packets=82002535i 1737652060000000000 +infiniband,device=mlx5_bond_0,host=hop-r640-12,port=1 local_ack_timeout_err=0i,lifespan=10i,out_of_buffer=0i,resp_remote_access_errors=0i,resp_local_length_error=0i,np_cnp_sent=0i,roce_slow_restart=0i,rx_read_requests=6000i,duplicate_request=0i,resp_cqe_error=0i,rx_write_requests=19000i,roce_slow_restart_cnps=0i,rx_icrc_encapsulated=0i,rnr_nak_retry_err=0i,roce_adp_retrans=0i,out_of_sequence=0i,req_remote_access_errors=0i,roce_slow_restart_trans=0i,req_remote_invalid_request=0i,req_cqe_error=0i,resp_cqe_flush_error=0i,packet_seq_err=0i,roce_adp_retrans_to=0i,np_ecn_marked_roce_packets=0i,rp_cnp_handled=0i,implied_nak_seq_err=0i,rp_cnp_ignored=0i,req_cqe_flush_error=0i,rx_atomic_requests=0i 1737652060000000000 ``` diff --git a/plugins/inputs/infiniband/infiniband.go b/plugins/inputs/infiniband/infiniband.go index 5f1194bac..48103b793 100644 --- a/plugins/inputs/infiniband/infiniband.go +++ b/plugins/inputs/infiniband/infiniband.go @@ -12,7 +12,8 @@ import ( var sampleConfig string type Infiniband struct { - Log telegraf.Logger `toml:"-"` + RDMA bool `toml:"gather_rdma"` + Log telegraf.Logger `toml:"-"` } func (*Infiniband) SampleConfig() string { diff --git a/plugins/inputs/infiniband/infiniband_linux.go b/plugins/inputs/infiniband/infiniband_linux.go index 214ba8e08..586f60ccb 100644 --- a/plugins/inputs/infiniband/infiniband_linux.go +++ b/plugins/inputs/infiniband/infiniband_linux.go @@ -12,7 +12,7 @@ import 
( ) // Gather statistics from our infiniband cards -func (*Infiniband) Gather(acc telegraf.Accumulator) error { +func (ib *Infiniband) Gather(acc telegraf.Accumulator) error { rdmaDevices := rdmamap.GetRdmaDeviceList() if len(rdmaDevices) == 0 { @@ -33,6 +33,15 @@ func (*Infiniband) Gather(acc telegraf.Accumulator) error { } addStats(dev, port, stats, acc) + + if ib.RDMA { + stats, err := rdmamap.GetRdmaSysfsHwStats(dev, portInt) + if err != nil { + continue + } + + addStats(dev, port, stats, acc) + } } } diff --git a/plugins/inputs/infiniband/infiniband_test.go b/plugins/inputs/infiniband/infiniband_test.go index bb9d5de3c..281c8ad78 100644 --- a/plugins/inputs/infiniband/infiniband_test.go +++ b/plugins/inputs/infiniband/infiniband_test.go @@ -133,3 +133,167 @@ func TestInfiniband(t *testing.T) { acc.AssertContainsTaggedFields(t, "infiniband", fields, tags) } + +func TestInfinibandRDMA(t *testing.T) { + fields := map[string]interface{}{ + "duplicate_request": uint64(0), + "implied_nak_seq_err": uint64(0), + "lifespan": uint64(10), + "local_ack_timeout_err": uint64(38), + "np_cnp_sent": uint64(10284520), + "np_ecn_marked_roce_packets": uint64(286733949), + "out_of_buffer": uint64(1149772), + "out_of_sequence": uint64(44), + "packet_seq_err": uint64(1), + "req_cqe_error": uint64(10776), + "req_cqe_flush_error": uint64(2173), + "req_remote_access_errors": uint64(0), + "req_remote_invalid_request": uint64(0), + "resp_cqe_error": uint64(759), + "resp_cqe_flush_error": uint64(759), + "resp_local_length_error": uint64(0), + "resp_remote_access_errors": uint64(0), + "rnr_nak_retry_err": uint64(0), + "roce_adp_retrans": uint64(0), + "roce_adp_retrans_to": uint64(0), + "roce_slow_restart": uint64(0), + "roce_slow_restart_cnps": uint64(0), + "roce_slow_restart_trans": uint64(0), + "rp_cnp_handled": uint64(1), + "rp_cnp_ignored": uint64(0), + "rx_atomic_requests": uint64(0), + "rx_icrc_encapsulated": uint64(0), + "rx_read_requests": uint64(488228), + "rx_write_requests": 
uint64(3928699), + } + + tags := map[string]string{ + "device": "m1x5_0", + "port": "1", + } + + sampleRdmaHwStatsEntries := []rdmamap.RdmaStatEntry{ + { + Name: "duplicate_request", + Value: uint64(0), + }, + { + Name: "implied_nak_seq_err", + Value: uint64(0), + }, + { + Name: "lifespan", + Value: uint64(10), + }, + { + Name: "local_ack_timeout_err", + Value: uint64(38), + }, + { + Name: "np_cnp_sent", + Value: uint64(10284520), + }, + { + Name: "np_ecn_marked_roce_packets", + Value: uint64(286733949), + }, + { + Name: "out_of_buffer", + Value: uint64(1149772), + }, + { + Name: "out_of_sequence", + Value: uint64(44), + }, + { + Name: "packet_seq_err", + Value: uint64(1), + }, + { + Name: "req_cqe_error", + Value: uint64(10776), + }, + { + Name: "req_cqe_flush_error", + Value: uint64(2173), + }, + { + Name: "req_remote_access_errors", + Value: uint64(0), + }, + { + Name: "req_remote_invalid_request", + Value: uint64(0), + }, + { + Name: "resp_cqe_error", + Value: uint64(759), + }, + { + Name: "resp_cqe_flush_error", + Value: uint64(759), + }, + { + Name: "resp_local_length_error", + Value: uint64(0), + }, + { + Name: "resp_remote_access_errors", + Value: uint64(0), + }, + { + Name: "rnr_nak_retry_err", + Value: uint64(0), + }, + { + Name: "roce_adp_retrans", + Value: uint64(0), + }, + { + Name: "roce_adp_retrans_to", + Value: uint64(0), + }, + { + Name: "roce_slow_restart", + Value: uint64(0), + }, + { + Name: "roce_slow_restart_cnps", + Value: uint64(0), + }, + { + Name: "roce_slow_restart_trans", + Value: uint64(0), + }, + { + Name: "rp_cnp_handled", + Value: uint64(1), + }, + { + Name: "rp_cnp_ignored", + Value: uint64(0), + }, + { + Name: "rx_atomic_requests", + Value: uint64(0), + }, + { + Name: "rx_icrc_encapsulated", + Value: uint64(0), + }, + { + Name: "rx_read_requests", + Value: uint64(488228), + }, + { + Name: "rx_write_requests", + Value: uint64(3928699), + }, + } + + var acc testutil.Accumulator + + addStats("m1x5_0", "1", sampleRdmaHwStatsEntries, 
&acc) + + acc.AssertContainsTaggedFields(t, "infiniband", fields, tags) +} diff --git a/plugins/inputs/infiniband/sample.conf b/plugins/inputs/infiniband/sample.conf index 560330377..807a75daf 100644 --- a/plugins/inputs/infiniband/sample.conf +++ b/plugins/inputs/infiniband/sample.conf @@ -2,3 +2,6 @@ # This plugin ONLY supports Linux [[inputs.infiniband]] # no configuration + + ## Collect RDMA counters + # gather_rdma = false