feat: added metrics for member and replica-set avg health of MongoDB (#11516)
This commit is contained in:
parent
ba36cfe676
commit
46f059ebfd
|
|
@ -159,6 +159,8 @@ by running Telegraf with the `--debug` argument.
|
||||||
- repl_updates (integer)
|
- repl_updates (integer)
|
||||||
- repl_oplog_window_sec (integer)
|
- repl_oplog_window_sec (integer)
|
||||||
- repl_state (integer)
|
- repl_state (integer)
|
||||||
|
- repl_member_health (integer)
|
||||||
|
- repl_health_avg (float)
|
||||||
- resident_megabytes (integer)
|
- resident_megabytes (integer)
|
||||||
- state (string)
|
- state (string)
|
||||||
- storage_freelist_search_bucket_exhausted (integer)
|
- storage_freelist_search_bucket_exhausted (integer)
|
||||||
|
|
|
||||||
|
|
@ -149,6 +149,8 @@ var defaultReplStats = map[string]string{
|
||||||
"member_status": "NodeType",
|
"member_status": "NodeType",
|
||||||
"state": "NodeState",
|
"state": "NodeState",
|
||||||
"repl_state": "NodeStateInt",
|
"repl_state": "NodeStateInt",
|
||||||
|
"repl_member_health": "NodeHealthInt",
|
||||||
|
"repl_health_avg": "ReplHealthAvg",
|
||||||
"repl_lag": "ReplLag",
|
"repl_lag": "ReplLag",
|
||||||
"repl_network_bytes": "ReplNetworkBytes",
|
"repl_network_bytes": "ReplNetworkBytes",
|
||||||
"repl_network_getmores_num": "ReplNetworkGetmoresNum",
|
"repl_network_getmores_num": "ReplNetworkGetmoresNum",
|
||||||
|
|
|
||||||
|
|
@ -447,6 +447,8 @@ func TestStateTag(t *testing.T) {
|
||||||
"repl_updates": int64(0),
|
"repl_updates": int64(0),
|
||||||
"repl_updates_per_sec": int64(0),
|
"repl_updates_per_sec": int64(0),
|
||||||
"repl_state": int64(0),
|
"repl_state": int64(0),
|
||||||
|
"repl_member_health": int64(0),
|
||||||
|
"repl_health_avg": float64(0),
|
||||||
"resident_megabytes": int64(0),
|
"resident_megabytes": int64(0),
|
||||||
"state": "PRIMARY",
|
"state": "PRIMARY",
|
||||||
"storage_freelist_search_bucket_exhausted": int64(0),
|
"storage_freelist_search_bucket_exhausted": int64(0),
|
||||||
|
|
|
||||||
|
|
@ -139,6 +139,7 @@ type OplogStats struct {
|
||||||
// ReplSetMember stores information related to a replica set member
|
// ReplSetMember stores information related to a replica set member
|
||||||
type ReplSetMember struct {
|
type ReplSetMember struct {
|
||||||
Name string `bson:"name"`
|
Name string `bson:"name"`
|
||||||
|
Health int64 `bson:"health"`
|
||||||
State int64 `bson:"state"`
|
State int64 `bson:"state"`
|
||||||
StateStr string `bson:"stateStr"`
|
StateStr string `bson:"stateStr"`
|
||||||
OptimeDate time.Time `bson:"optimeDate"`
|
OptimeDate time.Time `bson:"optimeDate"`
|
||||||
|
|
@ -783,9 +784,11 @@ type StatLine struct {
|
||||||
NetOut, NetOutCnt int64
|
NetOut, NetOutCnt int64
|
||||||
NumConnections int64
|
NumConnections int64
|
||||||
ReplSetName string
|
ReplSetName string
|
||||||
|
ReplHealthAvg float64
|
||||||
NodeType string
|
NodeType string
|
||||||
NodeState string
|
NodeState string
|
||||||
NodeStateInt int64
|
NodeStateInt int64
|
||||||
|
NodeHealthInt int64
|
||||||
|
|
||||||
// Replicated Metrics fields
|
// Replicated Metrics fields
|
||||||
ReplNetworkBytes int64
|
ReplNetworkBytes int64
|
||||||
|
|
@ -1332,6 +1335,8 @@ func NewStatLine(oldMongo, newMongo MongoStatus, key string, all bool, sampleSec
|
||||||
returnVal.NodeState = member.StateStr
|
returnVal.NodeState = member.StateStr
|
||||||
// Store my state integer
|
// Store my state integer
|
||||||
returnVal.NodeStateInt = member.State
|
returnVal.NodeStateInt = member.State
|
||||||
|
// Store my health integer
|
||||||
|
returnVal.NodeHealthInt = member.Health
|
||||||
|
|
||||||
if member.State == 1 {
|
if member.State == 1 {
|
||||||
// I'm the master
|
// I'm the master
|
||||||
|
|
@ -1356,6 +1361,26 @@ func NewStatLine(oldMongo, newMongo MongoStatus, key string, all bool, sampleSec
|
||||||
returnVal.ReplLag = lag
|
returnVal.ReplLag = lag
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Preparations for the average health state of the replica-set
|
||||||
|
replMemberCount := len(newReplStat.Members)
|
||||||
|
replMemberHealthyCount := 0
|
||||||
|
|
||||||
|
// Second for-loop is needed, because of break-construct above
|
||||||
|
for _, member := range newReplStat.Members {
|
||||||
|
// Count only healthy members for the average health state of the replica-set
|
||||||
|
if member.Health == 1 {
|
||||||
|
replMemberHealthyCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate the average health state of the replica-set (For precise monitoring alerts)
|
||||||
|
// This makes it possible to detect whether a member is unhealthy from the perspective of another member, and how degraded the replica-set health is overall
|
||||||
|
if replMemberCount > 0 {
|
||||||
|
returnVal.ReplHealthAvg = float64(replMemberHealthyCount) / float64(replMemberCount)
|
||||||
|
} else {
|
||||||
|
returnVal.ReplHealthAvg = 0.00
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue