Fixing issue with missing metrics when pod has only pending containers (#8472)

Also added Pod Phase and Pod Reason fields
fixes #8347

Co-authored-by: Vyacheslav-Stepanov <Vyacheslav_Stepanov@epam.com>
This commit is contained in:
Vyacheslav Stepanov 2020-12-11 16:08:30 +02:00 committed by GitHub
parent a063f9d7f7
commit 0fe2386494
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 261 additions and 19 deletions

View File

@ -224,12 +224,14 @@ subjects:
- node_name
- pod_name
- node_selector (\*varies)
- phase
- state
- readiness
- fields:
- restarts_total
- state_code
- state_reason
- phase_reason
- terminated_reason (string, deprecated in 1.15: use `state_reason` instead)
- resource_requests_millicpu_units
- resource_requests_memory_bytes
@ -301,7 +303,7 @@ kubernetes_persistentvolume,phase=Released,pv_name=pvc-aaaaaaaa-bbbb-cccc-1111-2
kubernetes_persistentvolumeclaim,namespace=default,phase=Bound,pvc_name=data-etcd-0,selector_select1=s1,storageclass=ebs-1-retain phase_type=0i 1547597615000000000
kubernetes_pod,namespace=default,node_name=ip-172-17-0-2.internal,pod_name=tick1 last_transition_time=1547578322000000000i,ready="false" 1547597616000000000
kubernetes_service,cluster_ip=172.29.61.80,namespace=redis-cache-0001,port_name=redis,port_protocol=TCP,selector_app=myapp,selector_io.kompose.service=redis,selector_role=slave,service_name=redis-slave created=1588690034000000000i,generation=0i,port=6379i,target_port=0i 1547597616000000000
kubernetes_pod_container,container_name=telegraf,namespace=default,node_name=ip-172-17-0-2.internal,node_selector_node-role.kubernetes.io/compute=true,pod_name=tick1,state=running,readiness=ready resource_requests_cpu_units=0.1,resource_limits_memory_bytes=524288000,resource_limits_cpu_units=0.5,restarts_total=0i,state_code=0i,state_reason="",resource_requests_memory_bytes=524288000 1547597616000000000
kubernetes_pod_container,container_name=telegraf,namespace=default,node_name=ip-172-17-0-2.internal,node_selector_node-role.kubernetes.io/compute=true,pod_name=tick1,phase=Running,state=running,readiness=ready resource_requests_cpu_units=0.1,resource_limits_memory_bytes=524288000,resource_limits_cpu_units=0.5,restarts_total=0i,state_code=0i,state_reason="",phase_reason="",resource_requests_memory_bytes=524288000 1547597616000000000
kubernetes_statefulset,namespace=default,selector_select1=s1,statefulset_name=etcd replicas_updated=3i,spec_replicas=3i,observed_generation=1i,created=1544101669000000000i,generation=1i,replicas=3i,replicas_current=3i,replicas_ready=3i 1547597616000000000
```

View File

@ -27,8 +27,16 @@ func (ki *KubernetesInventory) gatherPod(p v1.Pod, acc telegraf.Accumulator) err
return nil
}
for i, cs := range p.Status.ContainerStatuses {
c := p.Spec.Containers[i]
containerList := map[string]*v1.ContainerStatus{}
for _, v := range p.Status.ContainerStatuses {
containerList[*v.Name] = v
}
for _, c := range p.Spec.Containers {
cs, ok := containerList[*c.Name]
if !ok {
cs = &v1.ContainerStatus{}
}
gatherPodContainer(*p.Spec.NodeName, ki, p, *cs, *c, acc)
}
@ -39,41 +47,53 @@ func gatherPodContainer(nodeName string, ki *KubernetesInventory, p v1.Pod, cs v
stateCode := 3
stateReason := ""
state := "unknown"
readiness := "unready"
switch {
case cs.State.Running != nil:
stateCode = 0
state = "running"
case cs.State.Terminated != nil:
stateCode = 1
state = "terminated"
stateReason = cs.State.Terminated.GetReason()
case cs.State.Waiting != nil:
stateCode = 2
state = "waiting"
stateReason = cs.State.Waiting.GetReason()
if cs.State != nil {
switch {
case cs.State.Running != nil:
stateCode = 0
state = "running"
case cs.State.Terminated != nil:
stateCode = 1
state = "terminated"
stateReason = cs.State.Terminated.GetReason()
case cs.State.Waiting != nil:
stateCode = 2
state = "waiting"
stateReason = cs.State.Waiting.GetReason()
}
}
readiness := "unready"
if cs.GetReady() {
readiness = "ready"
}
fields := map[string]interface{}{
"restarts_total": cs.GetRestartCount(),
"state_code": stateCode,
"terminated_reason": cs.State.Terminated.GetReason(),
"restarts_total": cs.GetRestartCount(),
"state_code": stateCode,
}
// deprecated in 1.15: use `state_reason` instead
if state == "terminated" {
fields["terminated_reason"] = stateReason
}
if stateReason != "" {
fields["state_reason"] = stateReason
}
phaseReason := p.Status.GetReason()
if phaseReason != "" {
fields["phase_reason"] = phaseReason
}
tags := map[string]string{
"container_name": *c.Name,
"namespace": *p.Metadata.Namespace,
"node_name": *p.Spec.NodeName,
"pod_name": *p.Metadata.Name,
"phase": *p.Status.Phase,
"state": state,
"readiness": readiness,
}

View File

@ -225,6 +225,7 @@ func TestPod(t *testing.T) {
"container_name": "running",
"node_name": "node1",
"pod_name": "pod1",
"phase": "Running",
"state": "running",
"readiness": "ready",
"node_selector_select1": "s1",
@ -245,6 +246,7 @@ func TestPod(t *testing.T) {
"container_name": "completed",
"node_name": "node1",
"pod_name": "pod1",
"phase": "Running",
"state": "terminated",
"readiness": "unready",
},
@ -263,6 +265,7 @@ func TestPod(t *testing.T) {
"container_name": "waiting",
"node_name": "node1",
"pod_name": "pod1",
"phase": "Running",
"state": "waiting",
"readiness": "unready",
},
@ -551,3 +554,220 @@ func TestPodSelectorFilter(t *testing.T) {
}
}
}
// TestPodPendingContainers verifies that containers declared in the pod spec
// still produce per-container metrics when the pod status carries no
// container statuses yet (a Pending pod whose containers have not started).
// Each such container is expected to report state "unknown" (state_code 3)
// and the pod-level phase/phase_reason.
func TestPodPendingContainers(t *testing.T) {
	cli := &client{}
	selectInclude := []string{}
	selectExclude := []string{}
	now := time.Now()
	started := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-1, 1, 36, 0, now.Location())
	created := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-2, 1, 36, 0, now.Location())
	cond1 := time.Date(now.Year(), 7, 5, 7, 53, 29, 0, now.Location())
	cond2 := time.Date(now.Year(), 7, 5, 7, 53, 31, 0, now.Location())
	tests := []struct {
		name     string
		handler  *mockHandler
		output   *testutil.Accumulator
		hasError bool
	}{
		{
			name: "collect pods",
			handler: &mockHandler{
				responseMap: map[string]interface{}{
					"/pods/": &v1.PodList{
						Items: []*v1.Pod{
							{
								Spec: &v1.PodSpec{
									NodeName: toStrPtr("node1"),
									// Two containers in the spec, but ContainerStatuses below
									// is empty — both must still be reported.
									Containers: []*v1.Container{
										{
											Name:  toStrPtr("waiting"),
											Image: toStrPtr("image1"),
											Ports: []*v1.ContainerPort{
												{
													ContainerPort: toInt32Ptr(8080),
													Protocol:      toStrPtr("TCP"),
												},
											},
											Resources: &v1.ResourceRequirements{
												Limits: map[string]*resource.Quantity{
													"cpu": {String_: toStrPtr("100m")},
												},
												Requests: map[string]*resource.Quantity{
													"cpu": {String_: toStrPtr("100m")},
												},
											},
										},
										{
											Name:  toStrPtr("terminated"),
											Image: toStrPtr("image1"),
											Ports: []*v1.ContainerPort{
												{
													ContainerPort: toInt32Ptr(8080),
													Protocol:      toStrPtr("TCP"),
												},
											},
											Resources: &v1.ResourceRequirements{
												Limits: map[string]*resource.Quantity{
													"cpu": {String_: toStrPtr("100m")},
												},
												Requests: map[string]*resource.Quantity{
													"cpu": {String_: toStrPtr("100m")},
												},
											},
										},
									},
									Volumes: []*v1.Volume{
										{
											Name: toStrPtr("vol1"),
											VolumeSource: &v1.VolumeSource{
												PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
													ClaimName: toStrPtr("pc1"),
													ReadOnly:  toBoolPtr(true),
												},
											},
										},
										{
											Name: toStrPtr("vol2"),
										},
									},
									NodeSelector: map[string]string{
										"select1": "s1",
										"select2": "s2",
									},
								},
								Status: &v1.PodStatus{
									Phase:     toStrPtr("Pending"),
									Reason:    toStrPtr("NetworkNotReady"),
									HostIP:    toStrPtr("180.12.10.18"),
									PodIP:     toStrPtr("10.244.2.15"),
									StartTime: &metav1.Time{Seconds: toInt64Ptr(started.Unix())},
									Conditions: []*v1.PodCondition{
										{
											Type:               toStrPtr("Initialized"),
											Status:             toStrPtr("True"),
											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond1.Unix())},
										},
										{
											Type:               toStrPtr("Ready"),
											Status:             toStrPtr("True"),
											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond2.Unix())},
										},
										{
											Type:               toStrPtr("Scheduled"),
											Status:             toStrPtr("True"),
											LastTransitionTime: &metav1.Time{Seconds: toInt64Ptr(cond1.Unix())},
										},
									},
									// Deliberately empty: no container has started yet.
									ContainerStatuses: []*v1.ContainerStatus{},
								},
								Metadata: &metav1.ObjectMeta{
									OwnerReferences: []*metav1.OwnerReference{
										{
											ApiVersion: toStrPtr("apps/v1"),
											Kind:       toStrPtr("DaemonSet"),
											Name:       toStrPtr("forwarder"),
											Controller: toBoolPtr(true),
										},
									},
									Generation: toInt64Ptr(11232),
									Namespace:  toStrPtr("ns1"),
									Name:       toStrPtr("pod1"),
									Labels: map[string]string{
										"lab1": "v1",
										"lab2": "v2",
									},
									CreationTimestamp: &metav1.Time{Seconds: toInt64Ptr(created.Unix())},
								},
							},
						},
					},
				},
			},
			output: &testutil.Accumulator{
				Metrics: []*testutil.Metric{
					{
						Measurement: podContainerMeasurement,
						Fields: map[string]interface{}{
							"phase_reason":                     "NetworkNotReady",
							"restarts_total":                   int32(0),
							"state_code":                       3,
							"resource_requests_millicpu_units": int64(100),
							"resource_limits_millicpu_units":   int64(100),
						},
						Tags: map[string]string{
							"namespace":             "ns1",
							"container_name":        "waiting",
							"node_name":             "node1",
							"pod_name":              "pod1",
							"phase":                 "Pending",
							"state":                 "unknown",
							"readiness":             "unready",
							"node_selector_select1": "s1",
							"node_selector_select2": "s2",
						},
					},
					{
						Measurement: podContainerMeasurement,
						Fields: map[string]interface{}{
							"phase_reason":                     "NetworkNotReady",
							"restarts_total":                   int32(0),
							"state_code":                       3,
							"resource_requests_millicpu_units": int64(100),
							"resource_limits_millicpu_units":   int64(100),
						},
						Tags: map[string]string{
							"namespace":      "ns1",
							"container_name": "terminated",
							"node_name":      "node1",
							"pod_name":       "pod1",
							"phase":          "Pending",
							"state":          "unknown",
							"readiness":      "unready",
						},
					},
				},
			},
			hasError: false,
		},
	}
	for _, v := range tests {
		ks := &KubernetesInventory{
			client:          cli,
			SelectorInclude: selectInclude,
			SelectorExclude: selectExclude,
		}
		ks.createSelectorFilters()
		acc := new(testutil.Accumulator)
		for _, pod := range ((v.handler.responseMap["/pods/"]).(*v1.PodList)).Items {
			err := ks.gatherPod(*pod, acc)
			if err != nil {
				t.Errorf("Failed to gather pod - %s", err.Error())
			}
		}
		err := acc.FirstError()
		if err == nil && v.hasError {
			t.Fatalf("%s failed, should have error", v.name)
		} else if err != nil && !v.hasError {
			t.Fatalf("%s failed, err: %v", v.name, err)
		}
		if v.output == nil && len(acc.Metrics) > 0 {
			t.Fatalf("%s: collected extra data", v.name)
		} else if v.output != nil && len(v.output.Metrics) > 0 {
			// Fail cleanly (instead of panicking with index out of range)
			// when fewer metrics were collected than expected.
			if len(acc.Metrics) < len(v.output.Metrics) {
				t.Fatalf("%s: expected %d metrics, got %d", v.name, len(v.output.Metrics), len(acc.Metrics))
			}
			for i := range v.output.Metrics {
				for k, m := range v.output.Metrics[i].Tags {
					if acc.Metrics[i].Tags[k] != m {
						t.Fatalf("%s: tag %s metrics mismatch Expected %s, got %s, i %d\n", v.name, k, m, acc.Metrics[i].Tags[k], i)
					}
				}
				for k, m := range v.output.Metrics[i].Fields {
					if acc.Metrics[i].Fields[k] != m {
						t.Fatalf("%s: field %s metrics mismatch Expected %v(%T), got %v(%T), i %d\n", v.name, k, m, m, acc.Metrics[i].Fields[k], acc.Metrics[i].Fields[k], i)
					}
				}
			}
		}
	}
}