From 46f059ebfd57d64c673f10d4caf5904d3a52eb8a Mon Sep 17 00:00:00 2001 From: Sven Burkard Date: Wed, 20 Jul 2022 21:02:38 +0200 Subject: [PATCH] feat: added metrics for member and replica-set avg health of MongoDB (#11516) --- plugins/inputs/mongodb/README.md | 2 ++ plugins/inputs/mongodb/mongodb_data.go | 2 ++ plugins/inputs/mongodb/mongodb_data_test.go | 2 ++ plugins/inputs/mongodb/mongostat.go | 25 +++++++++++++++++++++ 4 files changed, 31 insertions(+) diff --git a/plugins/inputs/mongodb/README.md b/plugins/inputs/mongodb/README.md index 545511c92..1d011ffc4 100644 --- a/plugins/inputs/mongodb/README.md +++ b/plugins/inputs/mongodb/README.md @@ -159,6 +159,8 @@ by running Telegraf with the `--debug` argument. - repl_updates (integer) - repl_oplog_window_sec (integer) - repl_state (integer) + - repl_member_health (integer) + - repl_health_avg (float) - resident_megabytes (integer) - state (string) - storage_freelist_search_bucket_exhausted (integer) diff --git a/plugins/inputs/mongodb/mongodb_data.go b/plugins/inputs/mongodb/mongodb_data.go index f8d10a08d..ab735fc00 100644 --- a/plugins/inputs/mongodb/mongodb_data.go +++ b/plugins/inputs/mongodb/mongodb_data.go @@ -149,6 +149,8 @@ var defaultReplStats = map[string]string{ "member_status": "NodeType", "state": "NodeState", "repl_state": "NodeStateInt", + "repl_member_health": "NodeHealthInt", + "repl_health_avg": "ReplHealthAvg", "repl_lag": "ReplLag", "repl_network_bytes": "ReplNetworkBytes", "repl_network_getmores_num": "ReplNetworkGetmoresNum", diff --git a/plugins/inputs/mongodb/mongodb_data_test.go b/plugins/inputs/mongodb/mongodb_data_test.go index 94b8f6f66..9d1ef471b 100644 --- a/plugins/inputs/mongodb/mongodb_data_test.go +++ b/plugins/inputs/mongodb/mongodb_data_test.go @@ -447,6 +447,8 @@ func TestStateTag(t *testing.T) { "repl_updates": int64(0), "repl_updates_per_sec": int64(0), "repl_state": int64(0), + "repl_member_health": int64(0), + "repl_health_avg": float64(0), "resident_megabytes": int64(0), 
"state": "PRIMARY", "storage_freelist_search_bucket_exhausted": int64(0), diff --git a/plugins/inputs/mongodb/mongostat.go b/plugins/inputs/mongodb/mongostat.go index c6906dfb2..e0bf1f4f4 100644 --- a/plugins/inputs/mongodb/mongostat.go +++ b/plugins/inputs/mongodb/mongostat.go @@ -139,6 +139,7 @@ type OplogStats struct { // ReplSetMember stores information related to a replica set member type ReplSetMember struct { Name string `bson:"name"` + Health int64 `bson:"health"` State int64 `bson:"state"` StateStr string `bson:"stateStr"` OptimeDate time.Time `bson:"optimeDate"` @@ -783,9 +784,11 @@ type StatLine struct { NetOut, NetOutCnt int64 NumConnections int64 ReplSetName string + ReplHealthAvg float64 NodeType string NodeState string NodeStateInt int64 + NodeHealthInt int64 // Replicated Metrics fields ReplNetworkBytes int64 @@ -1332,6 +1335,8 @@ func NewStatLine(oldMongo, newMongo MongoStatus, key string, all bool, sampleSec returnVal.NodeState = member.StateStr // Store my state integer returnVal.NodeStateInt = member.State + // Store my health integer + returnVal.NodeHealthInt = member.Health if member.State == 1 { // I'm the master @@ -1356,6 +1361,26 @@ func NewStatLine(oldMongo, newMongo MongoStatus, key string, all bool, sampleSec returnVal.ReplLag = lag } } + + // Preparations for the average health state of the replica-set + replMemberCount := len(newReplStat.Members) + replMemberHealthyCount := 0 + + // Second for-loop is needed, because of break-construct above + for _, member := range newReplStat.Members { + // Count only healthy members for the average health state of the replica-set + if member.Health == 1 { + replMemberHealthyCount++ + } + } + + // Calculate the average health state of the replica-set (For precise monitoring alerts) + // To detect if a member is unhealthy from the perspective of another member and also how bad the replica-set health is + if replMemberCount > 0 { + returnVal.ReplHealthAvg = float64(replMemberHealthyCount) / 
float64(replMemberCount) + } else { + returnVal.ReplHealthAvg = 0.00 + } } }