feat(inputs.kernel): Add Pressure Stall Information (#14507)

This commit is contained in:
iBug 2024-01-06 05:33:00 +08:00 committed by GitHub
parent 9e88381085
commit 0052fc3634
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 308 additions and 11 deletions

View File

@ -5,13 +5,14 @@ This plugin is only available on Linux.
The kernel plugin gathers info about the kernel that doesn't fit into other
plugins. In general, it is the statistics available in `/proc/stat` that are not
covered by other plugins as well as the value of
`/proc/sys/kernel/random/entropy_avail` and optionally, Kernel Samepage Merging.
`/proc/sys/kernel/random/entropy_avail` and optionally, Kernel Samepage Merging
and Pressure Stall Information.
The metrics are documented in `man proc` under the `/proc/stat` section.
The metrics are documented in `man 4 random` under the `/proc/stat` section.
The metrics are documented in `man 5 proc` under the `/proc/stat` section, as
well as `man 4 random` under the `/proc interfaces` section
(for `entropy_avail`).
```text
/proc/sys/kernel/random/entropy_avail
Contains the value of available entropy
@ -40,10 +41,28 @@ Number of forks since boot.
```
Kernel Samepage Merging is generally documented in [kernel documentation][1] and
the available metrics exposed via sysfs are documented in [admin guide][2]
the available metrics exposed via sysfs are documented in [admin guide][2].
Pressure Stall Information is exposed through `/proc/pressure` and is documented
in [kernel documentation][3]. Kernel version 4.20 or later is required.
Examples of PSI:
```shell
# /proc/pressure/cpu
some avg10=1.53 avg60=1.87 avg300=1.73 total=1088168194
# /proc/pressure/memory
some avg10=0.00 avg60=0.00 avg300=0.00 total=3463792
full avg10=0.00 avg60=0.00 avg300=0.00 total=1429641
# /proc/pressure/io
some avg10=0.00 avg60=0.00 avg300=0.00 total=68568296
full avg10=0.00 avg60=0.00 avg300=0.00 total=54982338
```
[1]: https://www.kernel.org/doc/html/latest/mm/ksm.html
[2]: https://www.kernel.org/doc/html/latest/admin-guide/mm/ksm.html#ksm-daemon-sysfs-interface
[3]: https://www.kernel.org/doc/html/latest/accounting/psi.html
## Global configuration options <!-- @/docs/includes/plugin_config.md -->
@ -63,6 +82,7 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## Additional gather options
## Possible options include:
## * ksm - kernel same-page merging
## * psi - pressure stall information
# collect = []
```
@ -91,11 +111,41 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
- ksm_stable_node_dups (integer, number of duplicated KSM pages, `stable_node_dups`)
- ksm_use_zero_pages (integer, whether empty pages should be treated specially, `use_zero_pages`)
- pressure (if `psi` is included in `collect`)
- tags:
- resource: cpu, memory, or io
- type: some or full
- floating-point fields: avg10, avg60, avg300
- integer fields: total
## Example Output
Default config:
```text
kernel boot_time=1690487872i,context_switches=321398652i,entropy_avail=256i,interrupts=141868628i,processes_forked=946492i 1691339564000000000
kernel boot_time=1690487872i,context_switches=321252729i,entropy_avail=256i,interrupts=141783427i,ksm_full_scans=0i,ksm_max_page_sharing=256i,ksm_merge_across_nodes=1i,ksm_pages_shared=0i,ksm_pages_sharing=0i,ksm_pages_to_scan=100i,ksm_pages_unshared=0i,ksm_pages_volatile=0i,ksm_run=0i,ksm_sleep_millisecs=20i,ksm_stable_node_chains=0i,ksm_stable_node_chains_prune_millisecs=2000i,ksm_stable_node_dups=0i,ksm_use_zero_pages=0i,processes_forked=946467i 1691339522000000000
```
If `ksm` is included in `collect`:
```text
kernel boot_time=1690487872i,context_switches=321252729i,entropy_avail=256i,interrupts=141783427i,ksm_full_scans=0i,ksm_max_page_sharing=256i,ksm_merge_across_nodes=1i,ksm_pages_shared=0i,ksm_pages_sharing=0i,ksm_pages_to_scan=100i,ksm_pages_unshared=0i,ksm_pages_volatile=0i,ksm_run=0i,ksm_sleep_millisecs=20i,ksm_stable_node_chains=0i,ksm_stable_node_chains_prune_millisecs=2000i,ksm_stable_node_dups=0i,ksm_use_zero_pages=0i,processes_forked=946467i 1691339522000000000
```
If `psi` is included in `collect`:
```text
pressure,resource=cpu,type=some avg10=1.53,avg60=1.87,avg300=1.73 1700000000000000000
pressure,resource=memory,type=some avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000
pressure,resource=memory,type=full avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000
pressure,resource=io,type=some avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000
pressure,resource=io,type=full avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000
pressure,resource=cpu,type=some total=1088168194i 1700000000000000000
pressure,resource=memory,type=some total=3463792i 1700000000000000000
pressure,resource=memory,type=full total=1429641i 1700000000000000000
pressure,resource=io,type=some total=68568296i 1700000000000000000
pressure,resource=io,type=full total=54982338i 1700000000000000000
```
Note that the combination for `resource=cpu,type=full` is omitted because it is
always zero.

View File

@ -11,6 +11,8 @@ import (
"path/filepath"
"strconv"
"github.com/prometheus/procfs"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs"
)
@ -34,6 +36,8 @@ type Kernel struct {
statFile string
entropyStatFile string
ksmStatsDir string
psiDir string
procfs procfs.FS
}
func (k *Kernel) Init() error {
@ -45,7 +49,15 @@ func (k *Kernel) Init() error {
if k.optCollect["ksm"] {
if _, err := os.Stat(k.ksmStatsDir); os.IsNotExist(err) {
// ksm probably not enabled in the kernel, bail out early
return fmt.Errorf("directory %q does not exist. Is KSM enabled in this kernel?", k.ksmStatsDir)
return fmt.Errorf("directory %q does not exist. KSM is not enabled in this kernel", k.ksmStatsDir)
}
}
if k.optCollect["psi"] {
procdir := filepath.Dir(k.psiDir)
var err error
if k.procfs, err = procfs.NewFS(procdir); err != nil {
// psi probably not supported in the kernel, bail out early
return fmt.Errorf("failed to initialize procfs on %s: %w", procdir, err)
}
}
return nil
@ -145,12 +157,18 @@ func (k *Kernel) Gather(acc telegraf.Accumulator) error {
}
acc.AddCounter("kernel", fields, map[string]string{})
if k.optCollect["psi"] {
if err := k.gatherPressure(acc); err != nil {
return err
}
}
return nil
}
func (k *Kernel) getProcValueBytes(path string) ([]byte, error) {
if _, err := os.Stat(path); os.IsNotExist(err) {
return nil, fmt.Errorf("Path %q does not exist", path)
return nil, fmt.Errorf("path %q does not exist", path)
} else if err != nil {
return nil, err
}
@ -183,6 +201,7 @@ func init() {
statFile: "/proc/stat",
entropyStatFile: "/proc/sys/kernel/random/entropy_avail",
ksmStatsDir: "/sys/kernel/mm/ksm",
psiDir: "/proc/pressure",
}
})
}

View File

@ -200,7 +200,7 @@ func TestKSMEnabledWrongDir(t *testing.T) {
ConfigCollect: []string{"ksm"},
}
require.ErrorContains(t, k.Init(), "Is KSM enabled in this kernel?")
require.ErrorContains(t, k.Init(), "KSM is not enabled in this kernel")
}
func TestKSMDisabledNoKSMTags(t *testing.T) {

View File

@ -0,0 +1,51 @@
//go:build linux
package kernel
import (
"fmt"
"time"
"github.com/prometheus/procfs"
"github.com/influxdata/telegraf"
)
// gatherPressure reads Pressure Stall Information (PSI) for the "cpu",
// "memory" and "io" resources through k.procfs and adds it to acc under the
// "pressure" measurement, tagged with "resource" and "type" ("some" or "full").
//
// For every reported resource/type pair the "total" field is added as a
// counter and the "avg10", "avg60" and "avg300" fields are added as a gauge.
// Both share a timestamp taken just before the resource is read.
//
// The function returns a wrapped error as soon as one resource cannot be
// read; metrics already added for earlier resources remain in acc.
func (k *Kernel) gatherPressure(acc telegraf.Accumulator) error {
	for _, resource := range []string{"cpu", "memory", "io"} {
		now := time.Now()
		psiStats, err := k.procfs.PSIStatsForResource(resource)
		if err != nil {
			return fmt.Errorf("failed to read %s pressure: %w", resource, err)
		}

		stats := map[string]*procfs.PSILine{
			"some": psiStats.Some,
			"full": psiStats.Full,
		}

		for _, typ := range []string{"some", "full"} {
			if resource == "cpu" && typ == "full" {
				// resource=cpu,type=full is omitted because it is always zero
				continue
			}

			stat := stats[typ]
			if stat == nil {
				// Some and Full are pointers; presumably they stay nil when
				// the pressure file has no such line. Skip the pair instead
				// of dereferencing a nil pointer and crashing the gather.
				continue
			}

			tags := map[string]string{
				"resource": resource,
				"type":     typ,
			}
			acc.AddCounter("pressure", map[string]interface{}{
				"total": stat.Total,
			}, tags, now)
			acc.AddGauge("pressure", map[string]interface{}{
				"avg10":  stat.Avg10,
				"avg60":  stat.Avg60,
				"avg300": stat.Avg300,
			}, tags, now)
		}
	}
	return nil
}

View File

@ -0,0 +1,170 @@
//go:build linux
package kernel
import (
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/testutil"
)
// TestPSIEnabledWrongDir verifies that Init fails with a procfs
// initialization error when "psi" is collected and psiDir points below a
// directory that does not exist.
func TestPSIEnabledWrongDir(t *testing.T) {
	plugin := Kernel{
		ConfigCollect: []string{"psi"},
		psiDir:        "testdata/this_directory_does_not_exist/stub",
	}

	err := plugin.Init()
	require.ErrorContains(t, err, "failed to initialize procfs on ")
}
// TestPSIStats runs gatherPressure against the fixture files under
// testdata/pressure and checks that one gauge (avg10/avg60/avg300) and one
// counter (total) are emitted for every resource/type pair except
// resource=cpu,type=full, which the plugin omits.
func TestPSIStats(t *testing.T) {
	plugin := Kernel{
		ConfigCollect: []string{"psi"},
		psiDir:        "testdata/pressure",
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.gatherPressure(&acc))

	// gauge builds the expected averages metric for one resource/type pair.
	gauge := func(resource, typ string, avg10, avg60, avg300 float64) telegraf.Metric {
		return metric.New(
			"pressure",
			map[string]string{
				"resource": resource,
				"type":     typ,
			},
			map[string]interface{}{
				"avg10":  avg10,
				"avg60":  avg60,
				"avg300": avg300,
			},
			time.Unix(0, 0),
			telegraf.Gauge,
		)
	}

	// counter builds the expected total metric for one resource/type pair.
	counter := func(resource, typ string, total uint64) telegraf.Metric {
		return metric.New(
			"pressure",
			map[string]string{
				"resource": resource,
				"type":     typ,
			},
			map[string]interface{}{
				"total": total,
			},
			time.Unix(0, 0),
			telegraf.Counter,
		)
	}

	expected := []telegraf.Metric{
		gauge("cpu", "some", 10, 60, 300),
		counter("cpu", "some", 114514),
		gauge("memory", "some", 10, 60, 300),
		counter("memory", "some", 114514),
		gauge("io", "some", 10, 60, 300),
		counter("io", "some", 114514),
		gauge("memory", "full", 1, 6, 30),
		counter("memory", "full", 11451),
		gauge("io", "full", 1, 6, 30),
		counter("io", "full", 11451),
	}

	// Timestamps come from time.Now() in the plugin, so they are ignored and
	// the metrics are sorted before comparison.
	testutil.RequireMetricsEqual(
		t,
		expected,
		acc.GetTelegrafMetrics(),
		testutil.IgnoreTime(),
		testutil.SortMetrics(),
	)
}

View File

@ -4,4 +4,5 @@
## Additional gather options
## Possible options include:
## * ksm - kernel same-page merging
## * psi - pressure stall information
# collect = []

View File

@ -0,0 +1,2 @@
some avg10=10.00 avg60=60.00 avg300=300.00 total=114514
full avg10=0.00 avg60=0.00 avg300=0.00 total=0

View File

@ -0,0 +1,2 @@
some avg10=10.00 avg60=60.00 avg300=300.00 total=114514
full avg10=1.00 avg60=6.00 avg300=30.00 total=11451

View File

@ -0,0 +1,2 @@
some avg10=10.00 avg60=60.00 avg300=300.00 total=114514
full avg10=1.00 avg60=6.00 avg300=30.00 total=11451