From 0fe238649478343a695e1436efe298096758c6d9 Mon Sep 17 00:00:00 2001
From: Vyacheslav Stepanov
Date: Fri, 11 Dec 2020 16:08:30 +0200
Subject: [PATCH] Fixing issue with missing metrics when pod has only pending
 containers (#8472)

Also added Pod Phase and Pod Reason fields

fixes #8347

Co-authored-by: Vyacheslav-Stepanov
---
 plugins/inputs/kube_inventory/README.md   |   4 +-
 plugins/inputs/kube_inventory/pod.go      |  56 ++++--
 plugins/inputs/kube_inventory/pod_test.go | 220 ++++++++++++++++++++++
 3 files changed, 261 insertions(+), 19 deletions(-)

diff --git a/plugins/inputs/kube_inventory/README.md b/plugins/inputs/kube_inventory/README.md
index 276a90110..06c84a92e 100644
--- a/plugins/inputs/kube_inventory/README.md
+++ b/plugins/inputs/kube_inventory/README.md
@@ -224,12 +224,14 @@ subjects:
     - node_name
     - pod_name
     - node_selector (\*varies)
+    - phase
     - state
     - readiness
   - fields:
     - restarts_total
     - state_code
     - state_reason
+    - phase_reason
     - terminated_reason (string, deprecated in 1.15: use `state_reason` instead)
     - resource_requests_millicpu_units
     - resource_requests_memory_bytes
@@ -301,7 +303,7 @@ kubernetes_persistentvolume,phase=Released,pv_name=pvc-aaaaaaaa-bbbb-cccc-1111-2
 kubernetes_persistentvolumeclaim,namespace=default,phase=Bound,pvc_name=data-etcd-0,selector_select1=s1,storageclass=ebs-1-retain phase_type=0i 1547597615000000000
 kubernetes_pod,namespace=default,node_name=ip-172-17-0-2.internal,pod_name=tick1 last_transition_time=1547578322000000000i,ready="false" 1547597616000000000
 kubernetes_service,cluster_ip=172.29.61.80,namespace=redis-cache-0001,port_name=redis,port_protocol=TCP,selector_app=myapp,selector_io.kompose.service=redis,selector_role=slave,service_name=redis-slave created=1588690034000000000i,generation=0i,port=6379i,target_port=0i 1547597616000000000
-kubernetes_pod_container,container_name=telegraf,namespace=default,node_name=ip-172-17-0-2.internal,node_selector_node-role.kubernetes.io/compute=true,pod_name=tick1,state=running,readiness=ready resource_requests_cpu_units=0.1,resource_limits_memory_bytes=524288000,resource_limits_cpu_units=0.5,restarts_total=0i,state_code=0i,state_reason="",resource_requests_memory_bytes=524288000 1547597616000000000
+kubernetes_pod_container,container_name=telegraf,namespace=default,node_name=ip-172-17-0-2.internal,node_selector_node-role.kubernetes.io/compute=true,pod_name=tick1,phase=Running,state=running,readiness=ready resource_requests_cpu_units=0.1,resource_limits_memory_bytes=524288000,resource_limits_cpu_units=0.5,restarts_total=0i,state_code=0i,state_reason="",phase_reason="",resource_requests_memory_bytes=524288000 1547597616000000000
 kubernetes_statefulset,namespace=default,selector_select1=s1,statefulset_name=etcd replicas_updated=3i,spec_replicas=3i,observed_generation=1i,created=1544101669000000000i,generation=1i,replicas=3i,replicas_current=3i,replicas_ready=3i 1547597616000000000
 ```

diff --git a/plugins/inputs/kube_inventory/pod.go b/plugins/inputs/kube_inventory/pod.go
index 2f17f690d..c75f133ba 100644
--- a/plugins/inputs/kube_inventory/pod.go
+++ b/plugins/inputs/kube_inventory/pod.go
@@ -27,8 +27,16 @@ func (ki *KubernetesInventory) gatherPod(p v1.Pod, acc telegraf.Accumulator) err
 		return nil
 	}

-	for i, cs := range p.Status.ContainerStatuses {
-		c := p.Spec.Containers[i]
+	containerList := map[string]*v1.ContainerStatus{}
+	for _, v := range p.Status.ContainerStatuses {
+		containerList[*v.Name] = v
+	}
+
+	for _, c := range p.Spec.Containers {
+		cs, ok := containerList[*c.Name]
+		if !ok {
+			cs = &v1.ContainerStatus{}
+		}
 		gatherPodContainer(*p.Spec.NodeName, ki, p, *cs, *c, acc)
 	}

@@ -39,41 +47,53 @@ func gatherPodContainer(nodeName string, ki *KubernetesInventory, p v1.Pod, cs v
 	stateCode := 3
 	stateReason := ""
 	state := "unknown"
+	readiness := "unready"

-	switch {
-	case cs.State.Running != nil:
-		stateCode = 0
-		state = "running"
-	case cs.State.Terminated != nil:
-		stateCode = 1
-		state = "terminated"
-		stateReason = cs.State.Terminated.GetReason()
-	case cs.State.Waiting != nil:
-		stateCode = 2
-		state = "waiting"
-		stateReason = cs.State.Waiting.GetReason()
+	if cs.State != nil {
+		switch {
+		case cs.State.Running != nil:
+			stateCode = 0
+			state = "running"
+		case cs.State.Terminated != nil:
+			stateCode = 1
+			state = "terminated"
+			stateReason = cs.State.Terminated.GetReason()
+		case cs.State.Waiting != nil:
+			stateCode = 2
+			state = "waiting"
+			stateReason = cs.State.Waiting.GetReason()
+		}
 	}

-	readiness := "unready"
 	if cs.GetReady() {
 		readiness = "ready"
 	}

 	fields := map[string]interface{}{
-		"restarts_total":    cs.GetRestartCount(),
-		"state_code":        stateCode,
-		"terminated_reason": cs.State.Terminated.GetReason(),
+		"restarts_total": cs.GetRestartCount(),
+		"state_code":     stateCode,
+	}
+
+	// deprecated in 1.15: use `state_reason` instead
+	if state == "terminated" {
+		fields["terminated_reason"] = stateReason
 	}

 	if stateReason != "" {
 		fields["state_reason"] = stateReason
 	}

+	phaseReason := p.Status.GetReason()
+	if phaseReason != "" {
+		fields["phase_reason"] = phaseReason
+	}
+
 	tags := map[string]string{
 		"container_name": *c.Name,
 		"namespace":      *p.Metadata.Namespace,
 		"node_name":      *p.Spec.NodeName,
 		"pod_name":       *p.Metadata.Name,
+		"phase":          *p.Status.Phase,
 		"state":          state,
 		"readiness":      readiness,
 	}
diff --git a/plugins/inputs/kube_inventory/pod_test.go b/plugins/inputs/kube_inventory/pod_test.go
index d9b322165..230fbbef9 100644
--- a/plugins/inputs/kube_inventory/pod_test.go
+++ b/plugins/inputs/kube_inventory/pod_test.go
@@ -225,6 +225,7 @@ func TestPod(t *testing.T) {
 							"container_name":        "running",
 							"node_name":             "node1",
 							"pod_name":              "pod1",
+							"phase":                 "Running",
 							"state":                 "running",
 							"readiness":             "ready",
 							"node_selector_select1": "s1",
@@ -245,6 +246,7 @@ func TestPod(t *testing.T) {
 							"container_name": "completed",
 							"node_name":      "node1",
 							"pod_name":       "pod1",
+							"phase":          "Running",
 							"state":          "terminated",
 							"readiness":      "unready",
 						},
@@ -263,6 +265,7 @@ func TestPod(t *testing.T) {
 							"container_name": "waiting",
 							"node_name":      "node1",
 							"pod_name":       "pod1",
+							"phase":          "Running",
 							"state":          "waiting",
 							"readiness":      "unready",
 						},
@@ -551,3 +554,220 @@ func TestPodSelectorFilter(t *testing.T) {
 		}
 	}
 }
+
+func TestPodPendingContainers(t *testing.T) {
+	cli := &client{}
+	selectInclude := []string{}
+	selectExclude := []string{}
+	now := time.Now()
+	started := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-1, 1, 36, 0, now.Location())
+	created := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-2, 1, 36, 0, now.Location())
+	cond1 := time.Date(now.Year(), 7, 5, 7, 53, 29, 0, now.Location())
+	cond2 := time.Date(now.Year(), 7, 5, 7, 53, 31, 0, now.Location())
+
+	tests := []struct {
+		name     string
+		handler  *mockHandler
+		output   *testutil.Accumulator
+		hasError bool
+	}{
+		{
+			name: "collect pods",
+			handler: &mockHandler{
+				responseMap: map[string]interface{}{
+					"/pods/": &v1.PodList{
+						Items: []*v1.Pod{
+							{
+								Spec: &v1.PodSpec{
+									NodeName: toStrPtr("node1"),
+									Containers: []*v1.Container{
+										{
+											Name:  toStrPtr("waiting"),
+											Image: toStrPtr("image1"),
+											Ports: []*v1.ContainerPort{
+												{
+													ContainerPort: toInt32Ptr(8080),
+													Protocol:      toStrPtr("TCP"),
+												},
+											},
+											Resources: &v1.ResourceRequirements{
+												Limits: map[string]*resource.Quantity{
+													"cpu": {String_: toStrPtr("100m")},
+												},
+												Requests: map[string]*resource.Quantity{
+													"cpu": {String_: toStrPtr("100m")},
+												},
+											},
+										},
+										{
+											Name:  toStrPtr("terminated"),
+											Image: toStrPtr("image1"),
+											Ports: []*v1.ContainerPort{
+												{
+													ContainerPort: toInt32Ptr(8080),
+													Protocol:      toStrPtr("TCP"),
+												},
+											},
+											Resources: &v1.ResourceRequirements{
+												Limits: map[string]*resource.Quantity{
+													"cpu": {String_: toStrPtr("100m")},
+												},
+												Requests: map[string]*resource.Quantity{
+													"cpu": {String_: toStrPtr("100m")},
+												},
+											},
+										},
+									},
+									Volumes: []*v1.Volume{
+										{
+											Name: toStrPtr("vol1"),
+											VolumeSource: &v1.VolumeSource{
+												PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
+													ClaimName: toStrPtr("pc1"),
+													ReadOnly:  toBoolPtr(true),
+												},
+											},
+										},
+										{
+											Name: toStrPtr("vol2"),
+										},
+									},
+									NodeSelector: map[string]string{
+										"select1": "s1",
+										"select2": "s2",
+									},
+								},
+								Status: &v1.PodStatus{
+									Phase:     toStrPtr("Pending"),
+									Reason:    toStrPtr("NetworkNotReady"),
+									HostIP:    toStrPtr("180.12.10.18"),
+									PodIP:     toStrPtr("10.244.2.15"),
+									StartTime: &metav1.Time{Seconds: toInt64Ptr(started.Unix())},
+									Conditions: []*v1.PodCondition{
+										{
+											Type:               toStrPtr("Initialized"),
+											Status:             toStrPtr("True"),
+											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond1.Unix())},
+										},
+										{
+											Type:               toStrPtr("Ready"),
+											Status:             toStrPtr("True"),
+											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond2.Unix())},
+										},
+										{
+											Type:               toStrPtr("Scheduled"),
+											Status:             toStrPtr("True"),
+											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond1.Unix())},
+										},
+									},
+									ContainerStatuses: []*v1.ContainerStatus{},
+								},
+								Metadata: &metav1.ObjectMeta{
+									OwnerReferences: []*metav1.OwnerReference{
+										{
+											ApiVersion: toStrPtr("apps/v1"),
+											Kind:       toStrPtr("DaemonSet"),
+											Name:       toStrPtr("forwarder"),
+											Controller: toBoolPtr(true),
+										},
+									},
+									Generation: toInt64Ptr(11232),
+									Namespace:  toStrPtr("ns1"),
+									Name:       toStrPtr("pod1"),
+									Labels: map[string]string{
+										"lab1": "v1",
+										"lab2": "v2",
+									},
+									CreationTimestamp: &metav1.Time{Seconds: toInt64Ptr(created.Unix())},
+								},
+							},
+						},
+					},
+				},
+			},
+			output: &testutil.Accumulator{
+				Metrics: []*testutil.Metric{
+					{
+						Measurement: podContainerMeasurement,
+						Fields: map[string]interface{}{
+							"phase_reason":                     "NetworkNotReady",
+							"restarts_total":                   int32(0),
+							"state_code":                       3,
+							"resource_requests_millicpu_units": int64(100),
+							"resource_limits_millicpu_units":   int64(100),
+						},
+						Tags: map[string]string{
+							"namespace":             "ns1",
+							"container_name":        "waiting",
+							"node_name":             "node1",
+							"pod_name":              "pod1",
+							"phase":                 "Pending",
+							"state":                 "unknown",
+							"readiness":             "unready",
+							"node_selector_select1": "s1",
+							"node_selector_select2": "s2",
+						},
+					},
+					{
+						Measurement: podContainerMeasurement,
+						Fields: map[string]interface{}{
+							"phase_reason":                     "NetworkNotReady",
+							"restarts_total":                   int32(0),
+							"state_code":                       3,
+							"resource_requests_millicpu_units": int64(100),
+							"resource_limits_millicpu_units":   int64(100),
+						},
+						Tags: map[string]string{
+							"namespace":      "ns1",
+							"container_name": "terminated",
+							"node_name":      "node1",
+							"pod_name":       "pod1",
+							"phase":          "Pending",
+							"state":          "unknown",
+							"readiness":      "unready",
+						},
+					},
+				},
+			},
+			hasError: false,
+		},
+	}
+	for _, v := range tests {
+		ks := &KubernetesInventory{
+			client:          cli,
+			SelectorInclude: selectInclude,
+			SelectorExclude: selectExclude,
+		}
+		ks.createSelectorFilters()
+		acc := new(testutil.Accumulator)
+		for _, pod := range ((v.handler.responseMap["/pods/"]).(*v1.PodList)).Items {
+			err := ks.gatherPod(*pod, acc)
+			if err != nil {
+				t.Errorf("Failed to gather pod - %s", err.Error())
+			}
+		}
+
+		err := acc.FirstError()
+		if err == nil && v.hasError {
+			t.Fatalf("%s failed, should have error", v.name)
+		} else if err != nil && !v.hasError {
+			t.Fatalf("%s failed, err: %v", v.name, err)
+		}
+		if v.output == nil && len(acc.Metrics) > 0 {
+			t.Fatalf("%s: collected extra data", v.name)
+		} else if v.output != nil && len(v.output.Metrics) > 0 {
+			for i := range v.output.Metrics {
+				for k, m := range v.output.Metrics[i].Tags {
+					if acc.Metrics[i].Tags[k] != m {
+						t.Fatalf("%s: tag %s metrics unmatch Expected %s, got %s, i %d\n", v.name, k, m, acc.Metrics[i].Tags[k], i)
+					}
+				}
+				for k, m := range v.output.Metrics[i].Fields {
+					if acc.Metrics[i].Fields[k] != m {
+						t.Fatalf("%s: field %s metrics unmatch Expected %v(%T), got %v(%T), i %d\n", v.name, k, m, m, acc.Metrics[i].Fields[k], acc.Metrics[i].Fields[k], i)
+					}
+				}
+			}
+		}
+	}
+}
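
For reference, a rough line-protocol sample of what `kubernetes_pod_container` looks like after this change for a pod stuck in `Pending` with no container statuses, assembled from the test fixtures above (tag ordering and the timestamp are illustrative only, not captured output):

```
kubernetes_pod_container,container_name=waiting,namespace=ns1,node_name=node1,node_selector_select1=s1,node_selector_select2=s2,pod_name=pod1,phase=Pending,state=unknown,readiness=unready restarts_total=0i,state_code=3i,phase_reason="NetworkNotReady",resource_requests_millicpu_units=100i,resource_limits_millicpu_units=100i 1547597616000000000
```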