feat(inputs.kernel): Add Pressure Stall Information (#14507)
This commit is contained in:
parent
9e88381085
commit
0052fc3634
|
|
@ -5,13 +5,14 @@ This plugin is only available on Linux.
|
|||
The kernel plugin gathers info about the kernel that doesn't fit into other
|
||||
plugins. In general, it is the statistics available in `/proc/stat` that are not
|
||||
covered by other plugins as well as the value of
|
||||
`/proc/sys/kernel/random/entropy_avail` and optionally, Kernel Samepage Merging.
|
||||
`/proc/sys/kernel/random/entropy_avail` and optionally, Kernel Samepage Merging
|
||||
and Pressure Stall Information.
|
||||
|
||||
The metrics are documented in `man proc` under the `/proc/stat` section.
|
||||
The metrics are documented in `man 4 random` under the `/proc/stat` section.
|
||||
The metrics are documented in `man 5 proc` under the `/proc/stat` section, as
|
||||
well as `man 4 random` under the `/proc interfaces` section
|
||||
(for `entropy_avail`).
|
||||
|
||||
```text
|
||||
|
||||
/proc/sys/kernel/random/entropy_avail
|
||||
Contains the value of available entropy
|
||||
|
||||
|
|
@ -40,10 +41,28 @@ Number of forks since boot.
|
|||
```
|
||||
|
||||
Kernel Samepage Merging is generally documented in [kernel documentation][1] and
|
||||
the available metrics exposed via sysfs are documented in [admin guide][2]
|
||||
the available metrics exposed via sysfs are documented in [admin guide][2].
|
||||
|
||||
Pressure Stall Information is exposed through `/proc/pressure` and is documented
|
||||
in [kernel documentation][3]. Kernel version 4.20 or later is required.
|
||||
Examples of PSI:
|
||||
|
||||
```shell
|
||||
# /proc/pressure/cpu
|
||||
some avg10=1.53 avg60=1.87 avg300=1.73 total=1088168194
|
||||
|
||||
# /proc/pressure/memory
|
||||
some avg10=0.00 avg60=0.00 avg300=0.00 total=3463792
|
||||
full avg10=0.00 avg60=0.00 avg300=0.00 total=1429641
|
||||
|
||||
# /proc/pressure/io
|
||||
some avg10=0.00 avg60=0.00 avg300=0.00 total=68568296
|
||||
full avg10=0.00 avg60=0.00 avg300=0.00 total=54982338
|
||||
```
|
||||
|
||||
[1]: https://www.kernel.org/doc/html/latest/mm/ksm.html
|
||||
[2]: https://www.kernel.org/doc/html/latest/admin-guide/mm/ksm.html#ksm-daemon-sysfs-interface
|
||||
[3]: https://www.kernel.org/doc/html/latest/accounting/psi.html
|
||||
|
||||
## Global configuration options <!-- @/docs/includes/plugin_config.md -->
|
||||
|
||||
|
|
@ -63,6 +82,7 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
|||
## Additional gather options
|
||||
## Possible options include:
|
||||
## * ksm - kernel same-page merging
|
||||
## * psi - pressure stall information
|
||||
# collect = []
|
||||
```
|
||||
|
||||
|
|
@ -91,11 +111,41 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
|||
- ksm_stable_node_dups (integer, number of duplicated KSM pages, `stable_node_dups`)
|
||||
- ksm_use_zero_pages (integer, whether empty pages should be treated specially, `use_zero_pages`)
|
||||
|
||||
- pressure (if `psi` is included in `collect`)
|
||||
- tags:
|
||||
- resource: cpu, memory, or io
|
||||
- type: some or full
|
||||
- floating-point fields: avg10, avg60, avg300
|
||||
- integer fields: total
|
||||
|
||||
## Example Output
|
||||
|
||||
Default config:
|
||||
|
||||
```text
|
||||
kernel boot_time=1690487872i,context_switches=321398652i,entropy_avail=256i,interrupts=141868628i,processes_forked=946492i 1691339564000000000
|
||||
|
||||
kernel boot_time=1690487872i,context_switches=321252729i,entropy_avail=256i,interrupts=141783427i,ksm_full_scans=0i,ksm_max_page_sharing=256i,ksm_merge_across_nodes=1i,ksm_pages_shared=0i,ksm_pages_sharing=0i,ksm_pages_to_scan=100i,ksm_pages_unshared=0i,ksm_pages_volatile=0i,ksm_run=0i,ksm_sleep_millisecs=20i,ksm_stable_node_chains=0i,ksm_stable_node_chains_prune_millisecs=2000i,ksm_stable_node_dups=0i,ksm_use_zero_pages=0i,processes_forked=946467i 1691339522000000000
|
||||
|
||||
```
|
||||
|
||||
If `ksm` is included in `collect`:
|
||||
|
||||
```text
|
||||
kernel boot_time=1690487872i,context_switches=321252729i,entropy_avail=256i,interrupts=141783427i,ksm_full_scans=0i,ksm_max_page_sharing=256i,ksm_merge_across_nodes=1i,ksm_pages_shared=0i,ksm_pages_sharing=0i,ksm_pages_to_scan=100i,ksm_pages_unshared=0i,ksm_pages_volatile=0i,ksm_run=0i,ksm_sleep_millisecs=20i,ksm_stable_node_chains=0i,ksm_stable_node_chains_prune_millisecs=2000i,ksm_stable_node_dups=0i,ksm_use_zero_pages=0i,processes_forked=946467i 1691339522000000000
|
||||
```
|
||||
|
||||
If `psi` is included in `collect`:
|
||||
|
||||
```text
|
||||
pressure,resource=cpu,type=some avg10=1.53,avg60=1.87,avg300=1.73 1700000000000000000
|
||||
pressure,resource=memory,type=some avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000
|
||||
pressure,resource=memory,type=full avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000
|
||||
pressure,resource=io,type=some avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000
|
||||
pressure,resource=io,type=full avg10=0.00,avg60=0.00,avg300=0.00 1700000000000000000
|
||||
pressure,resource=cpu,type=some total=1088168194i 1700000000000000000
|
||||
pressure,resource=memory,type=some total=3463792i 1700000000000000000
|
||||
pressure,resource=memory,type=full total=1429641i 1700000000000000000
|
||||
pressure,resource=io,type=some total=68568296i 1700000000000000000
|
||||
pressure,resource=io,type=full total=54982338i 1700000000000000000
|
||||
```
|
||||
|
||||
Note that the combination for `resource=cpu,type=full` is omitted because it is
|
||||
always zero.
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@ import (
|
|||
"path/filepath"
|
||||
"strconv"
|
||||
|
||||
"github.com/prometheus/procfs"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
|
@ -34,6 +36,8 @@ type Kernel struct {
|
|||
statFile string
|
||||
entropyStatFile string
|
||||
ksmStatsDir string
|
||||
psiDir string
|
||||
procfs procfs.FS
|
||||
}
|
||||
|
||||
func (k *Kernel) Init() error {
|
||||
|
|
@ -45,7 +49,15 @@ func (k *Kernel) Init() error {
|
|||
if k.optCollect["ksm"] {
|
||||
if _, err := os.Stat(k.ksmStatsDir); os.IsNotExist(err) {
|
||||
// ksm probably not enabled in the kernel, bail out early
|
||||
return fmt.Errorf("directory %q does not exist. Is KSM enabled in this kernel?", k.ksmStatsDir)
|
||||
return fmt.Errorf("directory %q does not exist. KSM is not enabled in this kernel", k.ksmStatsDir)
|
||||
}
|
||||
}
|
||||
if k.optCollect["psi"] {
|
||||
procdir := filepath.Dir(k.psiDir)
|
||||
var err error
|
||||
if k.procfs, err = procfs.NewFS(procdir); err != nil {
|
||||
// the procfs mount point (parent of psiDir) could not be opened, bail out early
|
||||
return fmt.Errorf("failed to initialize procfs on %s: %w", procdir, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
|
@ -145,12 +157,18 @@ func (k *Kernel) Gather(acc telegraf.Accumulator) error {
|
|||
}
|
||||
acc.AddCounter("kernel", fields, map[string]string{})
|
||||
|
||||
if k.optCollect["psi"] {
|
||||
if err := k.gatherPressure(acc); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (k *Kernel) getProcValueBytes(path string) ([]byte, error) {
|
||||
if _, err := os.Stat(path); os.IsNotExist(err) {
|
||||
return nil, fmt.Errorf("Path %q does not exist", path)
|
||||
return nil, fmt.Errorf("path %q does not exist", path)
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
@ -183,6 +201,7 @@ func init() {
|
|||
statFile: "/proc/stat",
|
||||
entropyStatFile: "/proc/sys/kernel/random/entropy_avail",
|
||||
ksmStatsDir: "/sys/kernel/mm/ksm",
|
||||
psiDir: "/proc/pressure",
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -200,7 +200,7 @@ func TestKSMEnabledWrongDir(t *testing.T) {
|
|||
ConfigCollect: []string{"ksm"},
|
||||
}
|
||||
|
||||
require.ErrorContains(t, k.Init(), "Is KSM enabled in this kernel?")
|
||||
require.ErrorContains(t, k.Init(), "KSM is not enabled in this kernel")
|
||||
}
|
||||
|
||||
func TestKSMDisabledNoKSMTags(t *testing.T) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,51 @@
|
|||
//go:build linux
|
||||
|
||||
package kernel
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/procfs"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
)
|
||||
|
||||
// Gather PSI metrics
|
||||
func (k *Kernel) gatherPressure(acc telegraf.Accumulator) error {
|
||||
for _, resource := range []string{"cpu", "memory", "io"} {
|
||||
now := time.Now()
|
||||
psiStats, err := k.procfs.PSIStatsForResource(resource)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read %s pressure: %w", resource, err)
|
||||
}
|
||||
|
||||
stats := map[string]*procfs.PSILine{
|
||||
"some": psiStats.Some,
|
||||
"full": psiStats.Full,
|
||||
}
|
||||
|
||||
for _, typ := range []string{"some", "full"} {
|
||||
if resource == "cpu" && typ == "full" {
|
||||
// resource=cpu,type=full is omitted because it is always zero
|
||||
continue
|
||||
}
|
||||
|
||||
tags := map[string]string{
|
||||
"resource": resource,
|
||||
"type": typ,
|
||||
}
|
||||
stat := stats[typ]
|
||||
|
||||
acc.AddCounter("pressure", map[string]interface{}{
|
||||
"total": stat.Total,
|
||||
}, tags, now)
|
||||
acc.AddGauge("pressure", map[string]interface{}{
|
||||
"avg10": stat.Avg10,
|
||||
"avg60": stat.Avg60,
|
||||
"avg300": stat.Avg300,
|
||||
}, tags, now)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
@ -0,0 +1,170 @@
|
|||
//go:build linux
|
||||
|
||||
package kernel
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/metric"
|
||||
"github.com/influxdata/telegraf/testutil"
|
||||
)
|
||||
|
||||
func TestPSIEnabledWrongDir(t *testing.T) {
|
||||
k := Kernel{
|
||||
psiDir: "testdata/this_directory_does_not_exist/stub",
|
||||
ConfigCollect: []string{"psi"},
|
||||
}
|
||||
|
||||
require.ErrorContains(t, k.Init(), "failed to initialize procfs on ")
|
||||
}
|
||||
|
||||
func TestPSIStats(t *testing.T) {
|
||||
k := Kernel{
|
||||
psiDir: "testdata/pressure",
|
||||
ConfigCollect: []string{"psi"},
|
||||
}
|
||||
require.NoError(t, k.Init())
|
||||
|
||||
var acc testutil.Accumulator
|
||||
require.NoError(t, k.gatherPressure(&acc))
|
||||
|
||||
expected := []telegraf.Metric{
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "cpu",
|
||||
"type": "some",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"avg10": float64(10),
|
||||
"avg60": float64(60),
|
||||
"avg300": float64(300),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Gauge,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "cpu",
|
||||
"type": "some",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"total": uint64(114514),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Counter,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "memory",
|
||||
"type": "some",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"avg10": float64(10),
|
||||
"avg60": float64(60),
|
||||
"avg300": float64(300),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Gauge,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "memory",
|
||||
"type": "some",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"total": uint64(114514),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Counter,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "io",
|
||||
"type": "some",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"avg10": float64(10),
|
||||
"avg60": float64(60),
|
||||
"avg300": float64(300),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Gauge,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "io",
|
||||
"type": "some",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"total": uint64(114514),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Counter,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "memory",
|
||||
"type": "full",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"avg10": float64(1),
|
||||
"avg60": float64(6),
|
||||
"avg300": float64(30),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Gauge,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "memory",
|
||||
"type": "full",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"total": uint64(11451),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Counter,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "io",
|
||||
"type": "full",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"avg10": float64(1),
|
||||
"avg60": float64(6),
|
||||
"avg300": float64(30),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Gauge,
|
||||
),
|
||||
metric.New(
|
||||
"pressure",
|
||||
map[string]string{
|
||||
"resource": "io",
|
||||
"type": "full",
|
||||
},
|
||||
map[string]interface{}{
|
||||
"total": uint64(11451),
|
||||
},
|
||||
time.Unix(0, 0),
|
||||
telegraf.Counter,
|
||||
),
|
||||
}
|
||||
|
||||
actual := acc.GetTelegrafMetrics()
|
||||
testutil.RequireMetricsEqual(t, expected, actual, testutil.IgnoreTime(), testutil.SortMetrics())
|
||||
}
|
||||
|
|
@ -4,4 +4,5 @@
|
|||
## Additional gather options
|
||||
## Possible options include:
|
||||
## * ksm - kernel same-page merging
|
||||
## * psi - pressure stall information
|
||||
# collect = []
|
||||
|
|
|
|||
|
|
@ -0,0 +1,2 @@
|
|||
some avg10=10.00 avg60=60.00 avg300=300.00 total=114514
|
||||
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
some avg10=10.00 avg60=60.00 avg300=300.00 total=114514
|
||||
full avg10=1.00 avg60=6.00 avg300=30.00 total=11451
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
some avg10=10.00 avg60=60.00 avg300=300.00 total=114514
|
||||
full avg10=1.00 avg60=6.00 avg300=30.00 total=11451
|
||||
Loading…
Reference in New Issue