feat(inputs.kernel): Collect KSM metrics (#13728)

This commit is contained in:
Costas Drogos 2023-08-09 23:43:14 +03:00 committed by GitHub
parent 8b032b73ee
commit 0cf7d23090
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
45 changed files with 291 additions and 31 deletions

View File

@ -5,14 +5,13 @@ This plugin is only available on Linux.
The kernel plugin gathers info about the kernel that doesn't fit into other
plugins. In general, it is the statistics available in `/proc/stat` that are not
covered by other plugins as well as the value of
`/proc/sys/kernel/random/entropy_avail`
`/proc/sys/kernel/random/entropy_avail` and optionally, Kernel Samepage Merging.
The metrics are documented in `man proc` under the `/proc/stat` section.
The metrics are documented in `man 4 random` under the `/proc/stat` section.
```text
/proc/sys/kernel/random/entropy_avail
Contains the value of available entropy
@ -40,6 +39,12 @@ processes 86031
Number of forks since boot.
```
Kernel Samepage Merging is generally documented in [kernel documentation][1] and
the available metrics exposed via sysfs are documented in [admin guide][2].
[1]: https://www.kernel.org/doc/html/latest/mm/ksm.html
[2]: https://www.kernel.org/doc/html/latest/admin-guide/mm/ksm.html#ksm-daemon-sysfs-interface
## Global configuration options <!-- @/docs/includes/plugin_config.md -->
In addition to the plugin-specific configuration settings, plugins support
@ -52,10 +57,13 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## Configuration
```toml @sample.conf
# Get kernel statistics from /proc/stat
# Plugin to collect various Linux kernel statistics.
# This plugin ONLY supports Linux
[[inputs.kernel]]
# no configuration
## Additional gather options
## Possible options include:
## * ksm - kernel same-page merging
# collect = []
```
## Metrics
@ -68,9 +76,26 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
- interrupts (integer, `intr`)
- processes_forked (integer, `processes`)
- entropy_avail (integer, `entropy_available`)
- ksm_full_scans (integer, how many times all mergeable areas have been scanned, `full_scans`)
- ksm_max_page_sharing (integer, maximum sharing allowed for each KSM page, `max_page_sharing`)
- ksm_merge_across_nodes (integer, whether pages should be merged across NUMA nodes, `merge_across_nodes`)
- ksm_pages_shared (integer, how many shared pages are being used, `pages_shared`)
- ksm_pages_sharing (integer, how many more sites are sharing them, `pages_sharing`)
- ksm_pages_to_scan (integer, how many pages to scan before ksmd goes to sleep, `pages_to_scan`)
- ksm_pages_unshared (integer, how many pages unique but repeatedly checked for merging, `pages_unshared`)
- ksm_pages_volatile (integer, how many pages changing too fast to be placed in a tree, `pages_volatile`)
- ksm_run (integer, whether ksm is running or not, `run`)
- ksm_sleep_millisecs (integer, how many milliseconds ksmd should sleep between scans, `sleep_millisecs`)
- ksm_stable_node_chains (integer, the number of KSM pages that hit the max_page_sharing limit, `stable_node_chains`)
- ksm_stable_node_chains_prune_millisecs (integer, how frequently KSM checks the metadata of the pages that hit the deduplication limit, `stable_node_chains_prune_millisecs`)
- ksm_stable_node_dups (integer, number of duplicated KSM pages, `stable_node_dups`)
- ksm_use_zero_pages (integer, whether empty pages should be treated specially, `use_zero_pages`)
## Example Output
```text
kernel entropy_available=2469i,boot_time=1457505775i,context_switches=2626618i,disk_pages_in=5741i,disk_pages_out=1808i,interrupts=1472736i,processes_forked=10673i 1457613402960879816
kernel boot_time=1690487872i,context_switches=321398652i,entropy_avail=256i,interrupts=141868628i,processes_forked=946492i 1691339564000000000
kernel boot_time=1690487872i,context_switches=321252729i,entropy_avail=256i,interrupts=141783427i,ksm_full_scans=0i,ksm_max_page_sharing=256i,ksm_merge_across_nodes=1i,ksm_pages_shared=0i,ksm_pages_sharing=0i,ksm_pages_to_scan=100i,ksm_pages_unshared=0i,ksm_pages_volatile=0i,ksm_run=0i,ksm_sleep_millisecs=20i,ksm_stable_node_chains=0i,ksm_stable_node_chains_prune_millisecs=2000i,ksm_stable_node_dups=0i,ksm_use_zero_pages=0i,processes_forked=946467i 1691339522000000000
```

View File

@ -8,8 +8,8 @@ import (
_ "embed"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs"
@ -28,8 +28,27 @@ var (
)
// Kernel is the input plugin state: the user-facing "collect" option plus the
// filesystem locations the plugin reads its statistics from.
type Kernel struct {
// ConfigCollect lists the optional collectors enabled via the TOML key
// "collect" (for example "ksm").
ConfigCollect []string `toml:"collect"`
// optCollect is the lookup set built from ConfigCollect by Init.
optCollect map[string]bool
// statFile is the path of the kernel statistics file read by Gather.
statFile string
// entropyStatFile is the path of the file holding the available entropy value.
entropyStatFile string
// ksmStatsDir is the directory holding one file per KSM statistic.
ksmStatsDir string
}
// Init builds the lookup set of optional collectors from the "collect"
// setting and checks the prerequisites of the ones that are enabled.
//
// When "ksm" is enabled, the KSM statistics directory must be accessible. A
// missing directory, or any other failure to stat it, is returned as an
// error. Option names are not validated here.
func (k *Kernel) Init() error {
	k.optCollect = make(map[string]bool, len(k.ConfigCollect))
	for _, v := range k.ConfigCollect {
		k.optCollect[v] = true
	}

	if k.optCollect["ksm"] {
		if _, err := os.Stat(k.ksmStatsDir); os.IsNotExist(err) {
			// ksm probably not enabled in the kernel, bail out early
			return fmt.Errorf("directory %q does not exist. Is KSM enabled in this kernel?", k.ksmStatsDir)
		} else if err != nil {
			// Any other stat failure (e.g. permission denied) must not be
			// silently treated as success.
			return fmt.Errorf("checking KSM directory %q: %w", k.ksmStatsDir, err)
		}
	}

	return nil
}
func (*Kernel) SampleConfig() string {
@ -37,18 +56,12 @@ func (*Kernel) SampleConfig() string {
}
func (k *Kernel) Gather(acc telegraf.Accumulator) error {
data, err := k.getProcStat()
data, err := k.getProcValueBytes(k.statFile)
if err != nil {
return err
}
entropyData, err := os.ReadFile(k.entropyStatFile)
if err != nil {
return err
}
entropyString := string(entropyData)
entropyValue, err := strconv.ParseInt(strings.TrimSpace(entropyString), 10, 64)
entropyValue, err := k.getProcValueInt(k.entropyStatFile)
if err != nil {
return err
}
@ -98,31 +111,78 @@ func (k *Kernel) Gather(acc telegraf.Accumulator) error {
}
}
if k.optCollect["ksm"] {
stats := []string{
"full_scans", "max_page_sharing",
"merge_across_nodes", "pages_shared",
"pages_sharing", "pages_to_scan",
"pages_unshared", "pages_volatile",
"run", "sleep_millisecs",
"stable_node_chains", "stable_node_chains_prune_millisecs",
"stable_node_dups", "use_zero_pages",
}
// these exist in very recent Linux versions only, but useful to include if there.
extraStats := []string{"general_profit"}
for _, f := range stats {
m, err := k.getProcValueInt(filepath.Join(k.ksmStatsDir, f))
if err != nil {
return err
}
fields["ksm_"+f] = m
}
for _, f := range extraStats {
m, err := k.getProcValueInt(filepath.Join(k.ksmStatsDir, f))
if err != nil {
// if an extraStats metric doesn't exist in our kernel version, ignore it.
continue
}
fields["ksm_"+f] = m
}
}
acc.AddCounter("kernel", fields, map[string]string{})
return nil
}
func (k *Kernel) getProcStat() ([]byte, error) {
if _, err := os.Stat(k.statFile); os.IsNotExist(err) {
return nil, fmt.Errorf("kernel: %s does not exist", k.statFile)
func (k *Kernel) getProcValueBytes(path string) ([]byte, error) {
if _, err := os.Stat(path); os.IsNotExist(err) {
return nil, fmt.Errorf("Path %q does not exist", path)
} else if err != nil {
return nil, err
}
data, err := os.ReadFile(k.statFile)
data, err := os.ReadFile(path)
if err != nil {
return nil, err
return nil, fmt.Errorf("failed to read from %q: %w", path, err)
}
return data, nil
}
// getProcValueInt reads the file at path and interprets its contents, with
// surrounding whitespace removed, as a base-10 signed 64-bit integer.
// It returns -1 together with the error when reading or parsing fails.
func (k *Kernel) getProcValueInt(path string) (int64, error) {
	raw, readErr := k.getProcValueBytes(path)
	if readErr != nil {
		return -1, readErr
	}

	text := string(bytes.TrimSpace(raw))
	value, parseErr := strconv.ParseInt(text, 10, 64)
	if parseErr != nil {
		// The untrimmed file contents are reported to ease debugging.
		return -1, fmt.Errorf("failed to parse %q as an integer: %w", raw, parseErr)
	}

	return value, nil
}
func init() {
inputs.Add("kernel", func() telegraf.Input {
return &Kernel{
statFile: "/proc/stat",
entropyStatFile: "/proc/sys/kernel/random/entropy_avail",
ksmStatsDir: "/sys/kernel/mm/ksm",
}
})
}

View File

@ -4,12 +4,37 @@ package kernel
import (
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/testutil"
)
// TestGetProcValueInt checks that getProcValueInt parses the integer stored in
// the entropy fixture file.
func TestGetProcValueInt(t *testing.T) {
	k := Kernel{
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
	}

	d, err := k.getProcValueInt(k.entropyStatFile)
	require.NoError(t, err)
	// The return type is statically int64, so a type assertion proves
	// nothing; check the parsed value itself.
	require.Equal(t, int64(1024), d)
}
// TestGetProcValueByte checks that getProcValueBytes returns the contents of
// the entropy fixture file.
func TestGetProcValueByte(t *testing.T) {
	k := Kernel{
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
	}

	d, err := k.getProcValueBytes(k.entropyStatFile)
	require.NoError(t, err)
	// The return type is statically []byte, so check the data instead of the
	// type. The fixture holds the entropy value 1024.
	require.NotEmpty(t, d)
	require.Contains(t, string(d), "1024")
}
func TestFullProcFile(t *testing.T) {
k := Kernel{
statFile: "testdata/stat_file_full",
@ -19,16 +44,24 @@ func TestFullProcFile(t *testing.T) {
acc := testutil.Accumulator{}
require.NoError(t, k.Gather(&acc))
fields := map[string]interface{}{
"boot_time": int64(1457505775),
"context_switches": int64(2626618),
"disk_pages_in": int64(5741),
"disk_pages_out": int64(1808),
"interrupts": int64(1472736),
"processes_forked": int64(10673),
"entropy_avail": int64(1024),
expected := []telegraf.Metric{
metric.New(
"kernel",
map[string]string{},
map[string]interface{}{
"boot_time": int64(1457505775),
"context_switches": int64(2626618),
"disk_pages_in": int64(5741),
"disk_pages_out": int64(1808),
"interrupts": int64(1472736),
"processes_forked": int64(10673),
"entropy_avail": int64(1024),
},
time.Unix(0, 0),
1,
),
}
acc.AssertContainsFields(t, "kernel", fields)
testutil.RequireMetricsEqual(t, expected, acc.GetTelegrafMetrics(), testutil.IgnoreTime())
}
func TestPartialProcFile(t *testing.T) {
@ -73,7 +106,7 @@ func TestInvalidProcFile2(t *testing.T) {
acc := testutil.Accumulator{}
err := k.Gather(&acc)
require.Error(t, err)
require.Contains(t, err.Error(), "no such file")
require.Contains(t, err.Error(), "does not exist")
}
func TestNoProcFile(t *testing.T) {
@ -86,3 +119,101 @@ func TestNoProcFile(t *testing.T) {
require.Error(t, err)
require.Contains(t, err.Error(), "does not exist")
}
// TestInvalidCollectOption verifies that an unrecognised "collect" entry makes
// neither Init nor Gather fail.
func TestInvalidCollectOption(t *testing.T) {
	plugin := &Kernel{
		ConfigCollect:   []string{"invalidOption"},
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
	}

	err := plugin.Init()
	require.NoError(t, err)

	var acc testutil.Accumulator
	err = plugin.Gather(&acc)
	require.NoError(t, err)
}
// TestKsmEnabledValidKsmDirectory verifies that, with "ksm" enabled and a
// complete KSM fixture directory, Gather emits the base kernel fields together
// with every ksm_* field.
func TestKsmEnabledValidKsmDirectory(t *testing.T) {
	plugin := &Kernel{
		ConfigCollect:   []string{"ksm"},
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
		ksmStatsDir:     "testdata/ksm/valid",
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	// Fields expected from the stat and entropy fixtures plus the KSM files.
	wantFields := map[string]interface{}{
		"boot_time":                              int64(1457505775),
		"context_switches":                       int64(2626618),
		"disk_pages_in":                          int64(5741),
		"disk_pages_out":                         int64(1808),
		"interrupts":                             int64(1472736),
		"processes_forked":                       int64(10673),
		"entropy_avail":                          int64(1024),
		"ksm_full_scans":                         int64(123),
		"ksm_max_page_sharing":                   int64(10000),
		"ksm_merge_across_nodes":                 int64(1),
		"ksm_pages_shared":                       int64(12922),
		"ksm_pages_sharing":                      int64(28384),
		"ksm_pages_to_scan":                      int64(12928),
		"ksm_pages_unshared":                     int64(92847),
		"ksm_pages_volatile":                     int64(2824171),
		"ksm_run":                                int64(1),
		"ksm_sleep_millisecs":                    int64(1000),
		"ksm_stable_node_chains":                 int64(0),
		"ksm_stable_node_chains_prune_millisecs": int64(0),
		"ksm_stable_node_dups":                   int64(0),
		"ksm_use_zero_pages":                     int64(1),
	}
	want := []telegraf.Metric{
		metric.New("kernel", map[string]string{}, wantFields, time.Unix(0, 0), 1),
	}

	got := acc.GetTelegrafMetrics()
	testutil.RequireMetricsEqual(t, want, got, testutil.IgnoreTime())
}
// TestKSMEnabledMissingFile verifies that Gather fails with a "does not exist"
// error when "ksm" is enabled but a required KSM stat file is absent from the
// KSM directory.
func TestKSMEnabledMissingFile(t *testing.T) {
	k := Kernel{
		// Use fixtures rather than the host's /proc files so the test is
		// hermetic and the expected error can only come from the KSM
		// directory, not from a missing host file.
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
		ksmStatsDir:     "testdata/ksm/missing",
		ConfigCollect:   []string{"ksm"},
	}

	require.NoError(t, k.Init())

	acc := testutil.Accumulator{}
	require.ErrorContains(t, k.Gather(&acc), "does not exist")
}
// TestKSMEnabledWrongDir verifies that Init rejects a configuration that
// enables "ksm" while pointing at a KSM directory that is not present.
func TestKSMEnabledWrongDir(t *testing.T) {
	plugin := &Kernel{
		ConfigCollect: []string{"ksm"},
		ksmStatsDir:   "testdata/this_file_does_not_exist",
	}

	err := plugin.Init()
	require.ErrorContains(t, err, "Is KSM enabled in this kernel?")
}
// TestKSMDisabledNoKSMTags verifies that with an empty "collect" list the
// plugin initialises and gathers successfully, ignoring the nonexistent KSM
// directory, and reports no ksm_run field.
func TestKSMDisabledNoKSMTags(t *testing.T) {
	plugin := &Kernel{
		ConfigCollect:   []string{},
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
		ksmStatsDir:     "testdata/this_file_does_not_exist",
	}

	err := plugin.Init()
	require.NoError(t, err)

	var acc testutil.Accumulator
	err = plugin.Gather(&acc)
	require.NoError(t, err)

	hasKSMField := acc.HasField("kernel", "ksm_run")
	require.False(t, hasKSMField)
}

View File

@ -1,4 +1,7 @@
# Get kernel statistics from /proc/stat
# Plugin to collect various Linux kernel statistics.
# This plugin ONLY supports Linux
[[inputs.kernel]]
# no configuration
## Additional gather options
## Possible options include:
## * ksm - kernel same-page merging
# collect = []

View File

@ -0,0 +1 @@
123A

View File

@ -0,0 +1 @@
10000

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
12922

View File

@ -0,0 +1 @@
28384

View File

@ -0,0 +1 @@
12928

View File

@ -0,0 +1 @@
92847

View File

@ -0,0 +1 @@
2824171

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
1000

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
123

View File

@ -0,0 +1 @@
10000

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
12922

View File

@ -0,0 +1 @@
28384

View File

@ -0,0 +1 @@
12928

View File

@ -0,0 +1 @@
92847

View File

@ -0,0 +1 @@
2824171

View File

@ -0,0 +1 @@
1000

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
123

View File

@ -0,0 +1 @@
10000

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
12922

View File

@ -0,0 +1 @@
28384

View File

@ -0,0 +1 @@
12928

View File

@ -0,0 +1 @@
92847

View File

@ -0,0 +1 @@
2824171

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
1000

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
1