From 0052fc36343f0090a4205ed14d25622c0a924321 Mon Sep 17 00:00:00 2001 From: iBug Date: Sat, 6 Jan 2024 05:33:00 +0800 Subject: [PATCH] feat(inputs.kernel): Add Pressure Stall Information (#14507) --- plugins/inputs/kernel/README.md | 66 ++++++- plugins/inputs/kernel/kernel.go | 23 ++- plugins/inputs/kernel/kernel_test.go | 2 +- plugins/inputs/kernel/psi.go | 51 ++++++ plugins/inputs/kernel/psi_test.go | 170 ++++++++++++++++++ plugins/inputs/kernel/sample.conf | 1 + plugins/inputs/kernel/testdata/pressure/cpu | 2 + plugins/inputs/kernel/testdata/pressure/io | 2 + .../inputs/kernel/testdata/pressure/memory | 2 + 9 files changed, 308 insertions(+), 11 deletions(-) create mode 100644 plugins/inputs/kernel/psi.go create mode 100644 plugins/inputs/kernel/psi_test.go create mode 100644 plugins/inputs/kernel/testdata/pressure/cpu create mode 100644 plugins/inputs/kernel/testdata/pressure/io create mode 100644 plugins/inputs/kernel/testdata/pressure/memory diff --git a/plugins/inputs/kernel/README.md b/plugins/inputs/kernel/README.md index 2ea010bc2..8b53efa3f 100644 --- a/plugins/inputs/kernel/README.md +++ b/plugins/inputs/kernel/README.md @@ -5,13 +5,14 @@ This plugin is only available on Linux. The kernel plugin gathers info about the kernel that doesn't fit into other plugins. In general, it is the statistics available in `/proc/stat` that are not covered by other plugins as well as the value of -`/proc/sys/kernel/random/entropy_avail` and optionally, Kernel Samepage Merging. +`/proc/sys/kernel/random/entropy_avail` and optionally, Kernel Samepage Merging +and Pressure Stall Information. -The metrics are documented in `man proc` under the `/proc/stat` section. -The metrics are documented in `man 4 random` under the `/proc/stat` section. +The metrics are documented in `man 5 proc` under the `/proc/stat` section, as +well as `man 4 random` under the `/proc interfaces` section +(for `entropy_avail`). ```text - /proc/sys/kernel/random/entropy_avail Contains the value of available entropy @@ -40,10 +41,28 @@ Number of forks since boot. ``` Kernel Samepage Merging is generally documented in [kernel documentation][1] and -the available metrics exposed via sysfs are documented in [admin guide][2] +the available metrics exposed via sysfs are documented in [admin guide][2]. + +Pressure Stall Information is exposed through `/proc/pressure` and is documented +in [kernel documentation][3]. Kernel version 4.20 or later is required. +Examples of PSI: + +```shell +# /proc/pressure/cpu +some avg10=1.53 avg60=1.87 avg300=1.73 total=1088168194 + +# /proc/pressure/memory +some avg10=0.00 avg60=0.00 avg300=0.00 total=3463792 +full avg10=0.00 avg60=0.00 avg300=0.00 total=1429641 + +# /proc/pressure/io +some avg10=0.00 avg60=0.00 avg300=0.00 total=68568296 +full avg10=0.00 avg60=0.00 avg300=0.00 total=54982338 +``` [1]: https://www.kernel.org/doc/html/latest/mm/ksm.html [2]: https://www.kernel.org/doc/html/latest/admin-guide/mm/ksm.html#ksm-daemon-sysfs-interface +[3]: https://www.kernel.org/doc/html/latest/accounting/psi.html ## Global configuration options @@ -63,6 +82,7 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. ## Additional gather options ## Possible options include: ## * ksm - kernel same-page merging + ## * psi - pressure stall information # collect = [] ``` @@ -91,11 +111,41 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. - ksm_stable_node_dups (integer, number of duplicated KSM pages, `stable_node_dups`) - ksm_use_zero_pages (integer, whether empty pages should be treated specially, `use_zero_pages`) +- pressure (if `psi` is included in `collect`) + - tags: + - resource: cpu, memory, or io + - type: some or full + - floating-point fields: avg10, avg60, avg300 + - integer fields: total + ## Example Output +Default config: + ```text kernel boot_time=1690487872i,context_switches=321398652i,entropy_avail=256i,interrupts=141868628i,processes_forked=946492i 1691339564000000000 - -kernel boot_time=1690487872i,context_switches=321252729i,entropy_avail=256i,interrupts=141783427i,ksm_full_scans=0i,ksm_max_page_sharing=256i,ksm_merge_across_nodes=1i,ksm_pages_shared=0i,ksm_pages_sharing=0i,ksm_pages_to_scan=100i,ksm_pages_unshared=0i,ksm_pages_volatile=0i,ksm_run=0i,ksm_sleep_millisecs=20i,ksm_stable_node_chains=0i,ksm_stable_node_chains_prune_millisecs=2000i,ksm_stable_node_dups=0i,ksm_use_zero_pages=0i,processes_forked=946467i 1691339522000000000 - ``` + +If `ksm` is included in `collect`: + +```text +kernel boot_time=1690487872i,context_switches=321252729i,entropy_avail=256i,interrupts=141783427i,ksm_full_scans=0i,ksm_max_page_sharing=256i,ksm_merge_across_nodes=1i,ksm_pages_shared=0i,ksm_pages_sharing=0i,ksm_pages_to_scan=100i,ksm_pages_unshared=0i,ksm_pages_volatile=0i,ksm_run=0i,ksm_sleep_millisecs=20i,ksm_stable_node_chains=0i,ksm_stable_node_chains_prune_millisecs=2000i,ksm_stable_node_dups=0i,ksm_use_zero_pages=0i,processes_forked=946467i 1691339522000000000 +``` + +If `psi` is included in `collect`: + +```text +pressure,resource=cpu,type=some avg10=1.53,avg60=1.87,avg300=1.73 1700000000000000000 +pressure,resource=memory,type=some avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000 +pressure,resource=memory,type=full avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000 +pressure,resource=io,type=some avg10=0.0,avg60=0.0,avg300=0.0 1700000000000000000 +pressure,resource=io,type=full avg10=0.0,avg60=0.0,avg300=0.0 1700000000000000000 +pressure,resource=cpu,type=some total=1088168194i 1700000000000000000 +pressure,resource=memory,type=some total=3463792i 1700000000000000000 +pressure,resource=memory,type=full total=1429641i 1700000000000000000 +pressure,resource=io,type=some total=68568296i 1700000000000000000 +pressure,resource=io,type=full total=54982338i 1700000000000000000 +``` + +Note that the combination for `resource=cpu,type=full` is omitted because it is +always zero. diff --git a/plugins/inputs/kernel/kernel.go b/plugins/inputs/kernel/kernel.go index 9e98bc6e0..58e3af044 100644 --- a/plugins/inputs/kernel/kernel.go +++ b/plugins/inputs/kernel/kernel.go @@ -11,6 +11,8 @@ import ( "path/filepath" "strconv" + "github.com/prometheus/procfs" + "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/plugins/inputs" ) @@ -34,6 +36,8 @@ type Kernel struct { statFile string entropyStatFile string ksmStatsDir string + psiDir string + procfs procfs.FS } func (k *Kernel) Init() error { @@ -45,7 +49,15 @@ func (k *Kernel) Init() error { if k.optCollect["ksm"] { if _, err := os.Stat(k.ksmStatsDir); os.IsNotExist(err) { // ksm probably not enabled in the kernel, bail out early - return fmt.Errorf("directory %q does not exist. Is KSM enabled in this kernel?", k.ksmStatsDir) + return fmt.Errorf("directory %q does not exist. KSM is not enabled in this kernel", k.ksmStatsDir) + } + } + if k.optCollect["psi"] { + procdir := filepath.Dir(k.psiDir) + var err error + if k.procfs, err = procfs.NewFS(procdir); err != nil { + // psi probably not supported in the kernel, bail out early + return fmt.Errorf("failed to initialize procfs on %s: %w", procdir, err) } } return nil @@ -145,12 +157,18 @@ func (k *Kernel) Gather(acc telegraf.Accumulator) error { } acc.AddCounter("kernel", fields, map[string]string{}) + if k.optCollect["psi"] { + if err := k.gatherPressure(acc); err != nil { + return err + } + } + return nil } func (k *Kernel) getProcValueBytes(path string) ([]byte, error) { if _, err := os.Stat(path); os.IsNotExist(err) { - return nil, fmt.Errorf("Path %q does not exist", path) + return nil, fmt.Errorf("path %q does not exist", path) } else if err != nil { return nil, err } @@ -183,6 +201,7 @@ func init() { statFile: "/proc/stat", entropyStatFile: "/proc/sys/kernel/random/entropy_avail", ksmStatsDir: "/sys/kernel/mm/ksm", + psiDir: "/proc/pressure", } }) } diff --git a/plugins/inputs/kernel/kernel_test.go b/plugins/inputs/kernel/kernel_test.go index d249e63c8..26fa7dcbe 100644 --- a/plugins/inputs/kernel/kernel_test.go +++ b/plugins/inputs/kernel/kernel_test.go @@ -200,7 +200,7 @@ func TestKSMEnabledWrongDir(t *testing.T) { ConfigCollect: []string{"ksm"}, } - require.ErrorContains(t, k.Init(), "Is KSM enabled in this kernel?") + require.ErrorContains(t, k.Init(), "KSM is not enabled in this kernel") } func TestKSMDisabledNoKSMTags(t *testing.T) { diff --git a/plugins/inputs/kernel/psi.go b/plugins/inputs/kernel/psi.go new file mode 100644 index 000000000..c85e2cafe --- /dev/null +++ b/plugins/inputs/kernel/psi.go @@ -0,0 +1,51 @@ +//go:build linux + +package kernel + +import ( + "fmt" + "time" + + "github.com/prometheus/procfs" + + "github.com/influxdata/telegraf" +) + +// Gather PSI metrics +func (k *Kernel) gatherPressure(acc telegraf.Accumulator) error { + for _, resource := range []string{"cpu", "memory", "io"} { + now := time.Now() + psiStats, err := k.procfs.PSIStatsForResource(resource) + if err != nil { + return fmt.Errorf("failed to read %s pressure: %w", resource, err) + } + + stats := map[string]*procfs.PSILine{ + "some": psiStats.Some, + "full": psiStats.Full, + } + + for _, typ := range []string{"some", "full"} { + if resource == "cpu" && typ == "full" { + // resource=cpu,type=full is omitted because it is always zero + continue + } + + tags := map[string]string{ + "resource": resource, + "type": typ, + } + stat := stats[typ] + + acc.AddCounter("pressure", map[string]interface{}{ + "total": stat.Total, + }, tags, now) + acc.AddGauge("pressure", map[string]interface{}{ + "avg10": stat.Avg10, + "avg60": stat.Avg60, + "avg300": stat.Avg300, + }, tags, now) + } + } + return nil +} diff --git a/plugins/inputs/kernel/psi_test.go b/plugins/inputs/kernel/psi_test.go new file mode 100644 index 000000000..6552dfe98 --- /dev/null +++ b/plugins/inputs/kernel/psi_test.go @@ -0,0 +1,170 @@ +//go:build linux + +package kernel + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/metric" + "github.com/influxdata/telegraf/testutil" +) + +func TestPSIEnabledWrongDir(t *testing.T) { + k := Kernel{ + psiDir: "testdata/this_directory_does_not_exist/stub", + ConfigCollect: []string{"psi"}, + } + + require.ErrorContains(t, k.Init(), "failed to initialize procfs on ") +} + +func TestPSIStats(t *testing.T) { + k := Kernel{ + psiDir: "testdata/pressure", + ConfigCollect: []string{"psi"}, + } + require.NoError(t, k.Init()) + + var acc testutil.Accumulator + require.NoError(t, k.gatherPressure(&acc)) + + expected := []telegraf.Metric{ + metric.New( + "pressure", + map[string]string{ + "resource": "cpu", + "type": "some", + }, + map[string]interface{}{ + "avg10": float64(10), + "avg60": float64(60), + "avg300": float64(300), + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "cpu", + "type": "some", + }, + map[string]interface{}{ + "total": uint64(114514), + }, + time.Unix(0, 0), + telegraf.Counter, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "memory", + "type": "some", + }, + map[string]interface{}{ + "avg10": float64(10), + "avg60": float64(60), + "avg300": float64(300), + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "memory", + "type": "some", + }, + map[string]interface{}{ + "total": uint64(114514), + }, + time.Unix(0, 0), + telegraf.Counter, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "io", + "type": "some", + }, + map[string]interface{}{ + "avg10": float64(10), + "avg60": float64(60), + "avg300": float64(300), + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "io", + "type": "some", + }, + map[string]interface{}{ + "total": uint64(114514), + }, + time.Unix(0, 0), + telegraf.Counter, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "memory", + "type": "full", + }, + map[string]interface{}{ + "avg10": float64(1), + "avg60": float64(6), + "avg300": float64(30), + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "memory", + "type": "full", + }, + map[string]interface{}{ + "total": uint64(11451), + }, + time.Unix(0, 0), + telegraf.Counter, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "io", + "type": "full", + }, + map[string]interface{}{ + "avg10": float64(1), + "avg60": float64(6), + "avg300": float64(30), + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + metric.New( + "pressure", + map[string]string{ + "resource": "io", + "type": "full", + }, + map[string]interface{}{ + "total": uint64(11451), + }, + time.Unix(0, 0), + telegraf.Counter, + ), + } + + actual := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, expected, actual, testutil.IgnoreTime(), testutil.SortMetrics()) +} diff --git a/plugins/inputs/kernel/sample.conf b/plugins/inputs/kernel/sample.conf index bdc3fc053..d84802436 100644 --- a/plugins/inputs/kernel/sample.conf +++ b/plugins/inputs/kernel/sample.conf @@ -4,4 +4,5 @@ ## Additional gather options ## Possible options include: ## * ksm - kernel same-page merging + ## * psi - pressure stall information # collect = [] diff --git a/plugins/inputs/kernel/testdata/pressure/cpu b/plugins/inputs/kernel/testdata/pressure/cpu new file mode 100644 index 000000000..109330472 --- /dev/null +++ b/plugins/inputs/kernel/testdata/pressure/cpu @@ -0,0 +1,2 @@ +some avg10=10.00 avg60=60.00 avg300=300.00 total=114514 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 diff --git a/plugins/inputs/kernel/testdata/pressure/io b/plugins/inputs/kernel/testdata/pressure/io new file mode 100644 index 000000000..613432c83 --- /dev/null +++ b/plugins/inputs/kernel/testdata/pressure/io @@ -0,0 +1,2 @@ +some avg10=10.00 avg60=60.00 avg300=300.00 total=114514 +full avg10=1.00 avg60=6.00 avg300=30.00 total=11451 diff --git a/plugins/inputs/kernel/testdata/pressure/memory b/plugins/inputs/kernel/testdata/pressure/memory new file mode 100644 index 000000000..613432c83 --- /dev/null +++ b/plugins/inputs/kernel/testdata/pressure/memory @@ -0,0 +1,2 @@ +some avg10=10.00 avg60=60.00 avg300=300.00 total=114514 +full avg10=1.00 avg60=6.00 avg300=30.00 total=11451