Fix missing metrics when a pod has only pending containers (#8472)

Also adds the pod `phase` tag and `phase_reason` field. Fixes #8347.

Co-authored-by: Vyacheslav-Stepanov <Vyacheslav_Stepanov@epam.com>
This commit is contained in:
parent
a063f9d7f7
commit
0fe2386494
plugins/inputs/kube_inventory/README.md

```diff
@@ -224,12 +224,14 @@ subjects:
     - node_name
     - pod_name
     - node_selector (\*varies)
+    - phase
     - state
     - readiness
   - fields:
     - restarts_total
     - state_code
    - state_reason
+    - phase_reason
     - terminated_reason (string, deprecated in 1.15: use `state_reason` instead)
     - resource_requests_millicpu_units
     - resource_requests_memory_bytes
```
````diff
@@ -301,7 +303,7 @@ kubernetes_persistentvolume,phase=Released,pv_name=pvc-aaaaaaaa-bbbb-cccc-1111-2
 kubernetes_persistentvolumeclaim,namespace=default,phase=Bound,pvc_name=data-etcd-0,selector_select1=s1,storageclass=ebs-1-retain phase_type=0i 1547597615000000000
 kubernetes_pod,namespace=default,node_name=ip-172-17-0-2.internal,pod_name=tick1 last_transition_time=1547578322000000000i,ready="false" 1547597616000000000
 kubernetes_service,cluster_ip=172.29.61.80,namespace=redis-cache-0001,port_name=redis,port_protocol=TCP,selector_app=myapp,selector_io.kompose.service=redis,selector_role=slave,service_name=redis-slave created=1588690034000000000i,generation=0i,port=6379i,target_port=0i 1547597616000000000
-kubernetes_pod_container,container_name=telegraf,namespace=default,node_name=ip-172-17-0-2.internal,node_selector_node-role.kubernetes.io/compute=true,pod_name=tick1,state=running,readiness=ready resource_requests_cpu_units=0.1,resource_limits_memory_bytes=524288000,resource_limits_cpu_units=0.5,restarts_total=0i,state_code=0i,state_reason="",resource_requests_memory_bytes=524288000 1547597616000000000
+kubernetes_pod_container,container_name=telegraf,namespace=default,node_name=ip-172-17-0-2.internal,node_selector_node-role.kubernetes.io/compute=true,pod_name=tick1,phase=Running,state=running,readiness=ready resource_requests_cpu_units=0.1,resource_limits_memory_bytes=524288000,resource_limits_cpu_units=0.5,restarts_total=0i,state_code=0i,state_reason="",phase_reason="",resource_requests_memory_bytes=524288000 1547597616000000000
 kubernetes_statefulset,namespace=default,selector_select1=s1,statefulset_name=etcd replicas_updated=3i,spec_replicas=3i,observed_generation=1i,created=1544101669000000000i,generation=1i,replicas=3i,replicas_current=3i,replicas_ready=3i 1547597616000000000
 ```
````
plugins/inputs/kube_inventory/pod.go

```diff
@@ -27,8 +27,16 @@ func (ki *KubernetesInventory) gatherPod(p v1.Pod, acc telegraf.Accumulator) err
 		return nil
 	}
 
-	for i, cs := range p.Status.ContainerStatuses {
-		c := p.Spec.Containers[i]
+	containerList := map[string]*v1.ContainerStatus{}
+	for _, v := range p.Status.ContainerStatuses {
+		containerList[*v.Name] = v
+	}
+
+	for _, c := range p.Spec.Containers {
+		cs, ok := containerList[*c.Name]
+		if !ok {
+			cs = &v1.ContainerStatus{}
+		}
 		gatherPodContainer(*p.Spec.NodeName, ki, p, *cs, *c, acc)
 	}
 
```
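The root cause: for a pod in `Pending`, the kubelet has not reported any `ContainerStatuses` yet, so the old loop over `p.Status.ContainerStatuses` iterated zero times and the pod's containers produced no metrics at all; it also relied on the status and spec slices lining up index-for-index. Keying the statuses by container name and iterating `Spec.Containers` with a zero-value fallback guarantees one metric per declared container. A minimal standalone sketch of the pattern, using simplified stand-in types (`container`, `containerStatus`) rather than the plugin's actual pointer-heavy k8s structs:

```go
package main

import "fmt"

// Illustrative stand-ins for the k8s types used above.
type container struct{ name string }
type containerStatus struct {
	name     string
	restarts int32
}

func main() {
	// A Pending pod: two declared containers, no statuses reported yet.
	spec := []container{{name: "waiting"}, {name: "terminated"}}
	statuses := []containerStatus{} // empty while the pod is Pending

	// Old approach: ranging over statuses emits nothing for this pod.
	for range statuses {
		fmt.Println("metric emitted") // never reached
	}

	// New approach: index statuses by name, then range over the spec and
	// fall back to a zero-value status, so every container is reported.
	byName := map[string]containerStatus{}
	for _, s := range statuses {
		byName[s.name] = s
	}
	for _, c := range spec {
		cs := byName[c.name] // zero value when the status is missing
		fmt.Printf("container=%s restarts=%d\n", c.name, cs.restarts)
	}
}
```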
```diff
@@ -39,41 +47,53 @@ func gatherPodContainer(nodeName string, ki *KubernetesInventory, p v1.Pod, cs v
 	stateCode := 3
 	stateReason := ""
 	state := "unknown"
+	readiness := "unready"
 
-	switch {
-	case cs.State.Running != nil:
-		stateCode = 0
-		state = "running"
-	case cs.State.Terminated != nil:
-		stateCode = 1
-		state = "terminated"
-		stateReason = cs.State.Terminated.GetReason()
-	case cs.State.Waiting != nil:
-		stateCode = 2
-		state = "waiting"
-		stateReason = cs.State.Waiting.GetReason()
+	if cs.State != nil {
+		switch {
+		case cs.State.Running != nil:
+			stateCode = 0
+			state = "running"
+		case cs.State.Terminated != nil:
+			stateCode = 1
+			state = "terminated"
+			stateReason = cs.State.Terminated.GetReason()
+		case cs.State.Waiting != nil:
+			stateCode = 2
+			state = "waiting"
+			stateReason = cs.State.Waiting.GetReason()
+		}
 	}
 
-	readiness := "unready"
 	if cs.GetReady() {
 		readiness = "ready"
 	}
 
 	fields := map[string]interface{}{
-		"restarts_total":    cs.GetRestartCount(),
-		"state_code":        stateCode,
-		"terminated_reason": cs.State.Terminated.GetReason(),
+		"restarts_total": cs.GetRestartCount(),
+		"state_code":     stateCode,
+	}
+
+	// deprecated in 1.15: use `state_reason` instead
+	if state == "terminated" {
+		fields["terminated_reason"] = stateReason
 	}
 
 	if stateReason != "" {
 		fields["state_reason"] = stateReason
 	}
 
+	phaseReason := p.Status.GetReason()
+	if phaseReason != "" {
+		fields["phase_reason"] = phaseReason
+	}
+
 	tags := map[string]string{
 		"container_name": *c.Name,
 		"namespace":      *p.Metadata.Namespace,
 		"node_name":      *p.Spec.NodeName,
 		"pod_name":       *p.Metadata.Name,
+		"phase":          *p.Status.Phase,
 		"state":          state,
 		"readiness":      readiness,
 	}
```
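With the zero-value `v1.ContainerStatus` that `gatherPod` now passes in for a pending container, `cs.State` is nil, so the switch is skipped and the container is still reported, with `state=unknown` and `state_code=3`, instead of silently disappearing. For a pod stuck in `Pending` with reason `NetworkNotReady`, the emitted series would look roughly like this (an illustrative line patterned on the new test's expectations, not captured plugin output):

```
kubernetes_pod_container,container_name=waiting,namespace=ns1,node_name=node1,pod_name=pod1,phase=Pending,state=unknown,readiness=unready restarts_total=0i,state_code=3i,phase_reason="NetworkNotReady" 1547597616000000000
```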
plugins/inputs/kube_inventory/pod_test.go

```diff
@@ -225,6 +225,7 @@ func TestPod(t *testing.T) {
 							"container_name":        "running",
 							"node_name":             "node1",
 							"pod_name":              "pod1",
+							"phase":                 "Running",
 							"state":                 "running",
 							"readiness":             "ready",
 							"node_selector_select1": "s1",
```
```diff
@@ -245,6 +246,7 @@ func TestPod(t *testing.T) {
 							"container_name": "completed",
 							"node_name":      "node1",
 							"pod_name":       "pod1",
+							"phase":          "Running",
 							"state":          "terminated",
 							"readiness":      "unready",
 						},
```
```diff
@@ -263,6 +265,7 @@ func TestPod(t *testing.T) {
 							"container_name": "waiting",
 							"node_name":      "node1",
 							"pod_name":       "pod1",
+							"phase":          "Running",
 							"state":          "waiting",
 							"readiness":      "unready",
 						},
```
```diff
@@ -551,3 +554,220 @@ func TestPodSelectorFilter(t *testing.T) {
 		}
 	}
 }
+
+func TestPodPendingContainers(t *testing.T) {
+	cli := &client{}
+	selectInclude := []string{}
+	selectExclude := []string{}
+	now := time.Now()
+	started := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-1, 1, 36, 0, now.Location())
+	created := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-2, 1, 36, 0, now.Location())
+	cond1 := time.Date(now.Year(), 7, 5, 7, 53, 29, 0, now.Location())
+	cond2 := time.Date(now.Year(), 7, 5, 7, 53, 31, 0, now.Location())
+
+	tests := []struct {
+		name     string
+		handler  *mockHandler
+		output   *testutil.Accumulator
+		hasError bool
+	}{
+		{
+			name: "collect pods",
+			handler: &mockHandler{
+				responseMap: map[string]interface{}{
+					"/pods/": &v1.PodList{
+						Items: []*v1.Pod{
+							{
+								Spec: &v1.PodSpec{
+									NodeName: toStrPtr("node1"),
+									Containers: []*v1.Container{
+										{
+											Name:  toStrPtr("waiting"),
+											Image: toStrPtr("image1"),
+											Ports: []*v1.ContainerPort{
+												{
+													ContainerPort: toInt32Ptr(8080),
+													Protocol:      toStrPtr("TCP"),
+												},
+											},
+											Resources: &v1.ResourceRequirements{
+												Limits: map[string]*resource.Quantity{
+													"cpu": {String_: toStrPtr("100m")},
+												},
+												Requests: map[string]*resource.Quantity{
+													"cpu": {String_: toStrPtr("100m")},
+												},
+											},
+										},
+										{
+											Name:  toStrPtr("terminated"),
+											Image: toStrPtr("image1"),
+											Ports: []*v1.ContainerPort{
+												{
+													ContainerPort: toInt32Ptr(8080),
+													Protocol:      toStrPtr("TCP"),
+												},
+											},
+											Resources: &v1.ResourceRequirements{
+												Limits: map[string]*resource.Quantity{
+													"cpu": {String_: toStrPtr("100m")},
+												},
+												Requests: map[string]*resource.Quantity{
+													"cpu": {String_: toStrPtr("100m")},
+												},
+											},
+										},
+									},
+									Volumes: []*v1.Volume{
+										{
+											Name: toStrPtr("vol1"),
+											VolumeSource: &v1.VolumeSource{
+												PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
+													ClaimName: toStrPtr("pc1"),
+													ReadOnly:  toBoolPtr(true),
+												},
+											},
+										},
+										{
+											Name: toStrPtr("vol2"),
+										},
+									},
+									NodeSelector: map[string]string{
+										"select1": "s1",
+										"select2": "s2",
+									},
+								},
+								Status: &v1.PodStatus{
+									Phase:     toStrPtr("Pending"),
+									Reason:    toStrPtr("NetworkNotReady"),
+									HostIP:    toStrPtr("180.12.10.18"),
+									PodIP:     toStrPtr("10.244.2.15"),
+									StartTime: &metav1.Time{Seconds: toInt64Ptr(started.Unix())},
+									Conditions: []*v1.PodCondition{
+										{
+											Type:               toStrPtr("Initialized"),
+											Status:             toStrPtr("True"),
+											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond1.Unix())},
+										},
+										{
+											Type:               toStrPtr("Ready"),
+											Status:             toStrPtr("True"),
+											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond2.Unix())},
+										},
+										{
+											Type:               toStrPtr("Scheduled"),
+											Status:             toStrPtr("True"),
+											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond1.Unix())},
+										},
+									},
+									ContainerStatuses: []*v1.ContainerStatus{},
+								},
+								Metadata: &metav1.ObjectMeta{
+									OwnerReferences: []*metav1.OwnerReference{
+										{
+											ApiVersion: toStrPtr("apps/v1"),
+											Kind:       toStrPtr("DaemonSet"),
+											Name:       toStrPtr("forwarder"),
+											Controller: toBoolPtr(true),
+										},
+									},
+									Generation: toInt64Ptr(11232),
+									Namespace:  toStrPtr("ns1"),
+									Name:       toStrPtr("pod1"),
+									Labels: map[string]string{
+										"lab1": "v1",
+										"lab2": "v2",
+									},
+									CreationTimestamp: &metav1.Time{Seconds: toInt64Ptr(created.Unix())},
+								},
+							},
+						},
+					},
+				},
+			},
+			output: &testutil.Accumulator{
+				Metrics: []*testutil.Metric{
+					{
+						Measurement: podContainerMeasurement,
+						Fields: map[string]interface{}{
+							"phase_reason":                     "NetworkNotReady",
+							"restarts_total":                   int32(0),
+							"state_code":                       3,
+							"resource_requests_millicpu_units": int64(100),
+							"resource_limits_millicpu_units":   int64(100),
+						},
+						Tags: map[string]string{
+							"namespace":             "ns1",
+							"container_name":        "waiting",
+							"node_name":             "node1",
+							"pod_name":              "pod1",
+							"phase":                 "Pending",
+							"state":                 "unknown",
+							"readiness":             "unready",
+							"node_selector_select1": "s1",
+							"node_selector_select2": "s2",
+						},
+					},
+					{
+						Measurement: podContainerMeasurement,
+						Fields: map[string]interface{}{
+							"phase_reason":                     "NetworkNotReady",
+							"restarts_total":                   int32(0),
+							"state_code":                       3,
+							"resource_requests_millicpu_units": int64(100),
+							"resource_limits_millicpu_units":   int64(100),
+						},
+						Tags: map[string]string{
+							"namespace":      "ns1",
+							"container_name": "terminated",
+							"node_name":      "node1",
+							"pod_name":       "pod1",
+							"phase":          "Pending",
+							"state":          "unknown",
+							"readiness":      "unready",
+						},
+					},
+				},
+			},
+			hasError: false,
+		},
+	}
+	for _, v := range tests {
+		ks := &KubernetesInventory{
+			client:          cli,
+			SelectorInclude: selectInclude,
+			SelectorExclude: selectExclude,
+		}
+		ks.createSelectorFilters()
+		acc := new(testutil.Accumulator)
+		for _, pod := range ((v.handler.responseMap["/pods/"]).(*v1.PodList)).Items {
+			err := ks.gatherPod(*pod, acc)
+			if err != nil {
+				t.Errorf("Failed to gather pod - %s", err.Error())
+			}
+		}
+
+		err := acc.FirstError()
+		if err == nil && v.hasError {
+			t.Fatalf("%s failed, should have error", v.name)
+		} else if err != nil && !v.hasError {
+			t.Fatalf("%s failed, err: %v", v.name, err)
+		}
+		if v.output == nil && len(acc.Metrics) > 0 {
+			t.Fatalf("%s: collected extra data", v.name)
+		} else if v.output != nil && len(v.output.Metrics) > 0 {
+			for i := range v.output.Metrics {
+				for k, m := range v.output.Metrics[i].Tags {
+					if acc.Metrics[i].Tags[k] != m {
+						t.Fatalf("%s: tag %s metrics unmatch Expected %s, got %s, i %d\n", v.name, k, m, acc.Metrics[i].Tags[k], i)
+					}
+				}
+				for k, m := range v.output.Metrics[i].Fields {
+					if acc.Metrics[i].Fields[k] != m {
+						t.Fatalf("%s: field %s metrics unmatch Expected %v(%T), got %v(%T), i %d\n", v.name, k, m, m, acc.Metrics[i].Fields[k], acc.Metrics[i].Fields[k], i)
+					}
+				}
+			}
+		}
+	}
+}
```
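The new test exercises exactly the pending-pod path: two containers in the spec, an empty `ContainerStatuses` slice, and expected metrics with `state=unknown`, `state_code=3`, and `phase_reason="NetworkNotReady"`. To run just this case locally, a typical invocation (assuming the standard Telegraf repository layout) is:

```
go test -run TestPodPendingContainers ./plugins/inputs/kube_inventory/
```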