feat(inputs.kernel): Collect KSM metrics (#13728)

This commit is contained in:
Costas Drogos 2023-08-09 23:43:14 +03:00 committed by GitHub
parent 8b032b73ee
commit 0cf7d23090
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
45 changed files with 291 additions and 31 deletions

View File

@ -5,14 +5,13 @@ This plugin is only available on Linux.
The kernel plugin gathers info about the kernel that doesn't fit into other The kernel plugin gathers info about the kernel that doesn't fit into other
plugins. In general, it is the statistics available in `/proc/stat` that are not plugins. In general, it is the statistics available in `/proc/stat` that are not
covered by other plugins as well as the value of covered by other plugins as well as the value of
`/proc/sys/kernel/random/entropy_avail` `/proc/sys/kernel/random/entropy_avail` and optionally, Kernel Samepage Merging.
The metrics are documented in `man proc` under the `/proc/stat` section. The metrics are documented in `man proc` under the `/proc/stat` section.
The metrics are documented in `man 4 random` under the `/proc/stat` section. The metrics are documented in `man 4 random` under the `/proc/stat` section.
```text ```text
/proc/sys/kernel/random/entropy_avail /proc/sys/kernel/random/entropy_avail
Contains the value of available entropy Contains the value of available entropy
@ -40,6 +39,12 @@ processes 86031
Number of forks since boot. Number of forks since boot.
``` ```
Kernel Samepage Merging is generally documented in the [kernel documentation][1] and
the available metrics exposed via sysfs are documented in the [admin guide][2].
[1]: https://www.kernel.org/doc/html/latest/mm/ksm.html
[2]: https://www.kernel.org/doc/html/latest/admin-guide/mm/ksm.html#ksm-daemon-sysfs-interface
## Global configuration options <!-- @/docs/includes/plugin_config.md --> ## Global configuration options <!-- @/docs/includes/plugin_config.md -->
In addition to the plugin-specific configuration settings, plugins support In addition to the plugin-specific configuration settings, plugins support
@ -52,10 +57,13 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
## Configuration ## Configuration
```toml @sample.conf ```toml @sample.conf
# Get kernel statistics from /proc/stat # Plugin to collect various Linux kernel statistics.
# This plugin ONLY supports Linux # This plugin ONLY supports Linux
[[inputs.kernel]] [[inputs.kernel]]
# no configuration ## Additional gather options
## Possible options include:
## * ksm - kernel same-page merging
# collect = []
``` ```
## Metrics ## Metrics
@ -68,9 +76,26 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
- interrupts (integer, `intr`) - interrupts (integer, `intr`)
- processes_forked (integer, `processes`) - processes_forked (integer, `processes`)
- entropy_avail (integer, `entropy_available`) - entropy_avail (integer, `entropy_available`)
- ksm_full_scans (integer, how many times all mergeable areas have been scanned, `full_scans`)
- ksm_max_page_sharing (integer, maximum sharing allowed for each KSM page, `max_page_sharing`)
- ksm_merge_across_nodes (integer, whether pages should be merged across NUMA nodes, `merge_across_nodes`)
- ksm_pages_shared (integer, how many shared pages are being used, `pages_shared`)
- ksm_pages_sharing (integer, how many more sites are sharing them, `pages_sharing`)
- ksm_pages_to_scan (integer, how many pages to scan before ksmd goes to sleep, `pages_to_scan`)
- ksm_pages_unshared (integer, how many pages unique but repeatedly checked for merging, `pages_unshared`)
- ksm_pages_volatile (integer, how many pages changing too fast to be placed in a tree, `pages_volatile`)
- ksm_run (integer, whether ksm is running or not, `run`)
- ksm_sleep_millisecs (integer, how many milliseconds ksmd should sleep between scans, `sleep_millisecs`)
- ksm_stable_node_chains (integer, the number of KSM pages that hit the max_page_sharing limit, `stable_node_chains`)
- ksm_stable_node_chains_prune_millisecs (integer, how frequently KSM checks the metadata of the pages that hit the deduplication limit, `stable_node_chains_prune_millisecs`)
- ksm_stable_node_dups (integer, number of duplicated KSM pages, `stable_node_dups`)
- ksm_use_zero_pages (integer, whether empty pages should be treated specially, `use_zero_pages`)
## Example Output ## Example Output
```text ```text
kernel entropy_available=2469i,boot_time=1457505775i,context_switches=2626618i,disk_pages_in=5741i,disk_pages_out=1808i,interrupts=1472736i,processes_forked=10673i 1457613402960879816 kernel boot_time=1690487872i,context_switches=321398652i,entropy_avail=256i,interrupts=141868628i,processes_forked=946492i 1691339564000000000
kernel boot_time=1690487872i,context_switches=321252729i,entropy_avail=256i,interrupts=141783427i,ksm_full_scans=0i,ksm_max_page_sharing=256i,ksm_merge_across_nodes=1i,ksm_pages_shared=0i,ksm_pages_sharing=0i,ksm_pages_to_scan=100i,ksm_pages_unshared=0i,ksm_pages_volatile=0i,ksm_run=0i,ksm_sleep_millisecs=20i,ksm_stable_node_chains=0i,ksm_stable_node_chains_prune_millisecs=2000i,ksm_stable_node_dups=0i,ksm_use_zero_pages=0i,processes_forked=946467i 1691339522000000000
``` ```

View File

@ -8,8 +8,8 @@ import (
_ "embed" _ "embed"
"fmt" "fmt"
"os" "os"
"path/filepath"
"strconv" "strconv"
"strings"
"github.com/influxdata/telegraf" "github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs" "github.com/influxdata/telegraf/plugins/inputs"
@ -28,8 +28,27 @@ var (
) )
type Kernel struct { type Kernel struct {
ConfigCollect []string `toml:"collect"`
optCollect map[string]bool
statFile string statFile string
entropyStatFile string entropyStatFile string
ksmStatsDir string
}
func (k *Kernel) Init() error {
k.optCollect = make(map[string]bool, len(k.ConfigCollect))
for _, v := range k.ConfigCollect {
k.optCollect[v] = true
}
if k.optCollect["ksm"] {
if _, err := os.Stat(k.ksmStatsDir); os.IsNotExist(err) {
// ksm probably not enabled in the kernel, bail out early
return fmt.Errorf("directory %q does not exist. Is KSM enabled in this kernel?", k.ksmStatsDir)
}
}
return nil
} }
func (*Kernel) SampleConfig() string { func (*Kernel) SampleConfig() string {
@ -37,18 +56,12 @@ func (*Kernel) SampleConfig() string {
} }
func (k *Kernel) Gather(acc telegraf.Accumulator) error { func (k *Kernel) Gather(acc telegraf.Accumulator) error {
data, err := k.getProcStat() data, err := k.getProcValueBytes(k.statFile)
if err != nil { if err != nil {
return err return err
} }
entropyData, err := os.ReadFile(k.entropyStatFile) entropyValue, err := k.getProcValueInt(k.entropyStatFile)
if err != nil {
return err
}
entropyString := string(entropyData)
entropyValue, err := strconv.ParseInt(strings.TrimSpace(entropyString), 10, 64)
if err != nil { if err != nil {
return err return err
} }
@ -98,31 +111,78 @@ func (k *Kernel) Gather(acc telegraf.Accumulator) error {
} }
} }
if k.optCollect["ksm"] {
stats := []string{
"full_scans", "max_page_sharing",
"merge_across_nodes", "pages_shared",
"pages_sharing", "pages_to_scan",
"pages_unshared", "pages_volatile",
"run", "sleep_millisecs",
"stable_node_chains", "stable_node_chains_prune_millisecs",
"stable_node_dups", "use_zero_pages",
}
// these exist in very recent Linux versions only, but useful to include if there.
extraStats := []string{"general_profit"}
for _, f := range stats {
m, err := k.getProcValueInt(filepath.Join(k.ksmStatsDir, f))
if err != nil {
return err
}
fields["ksm_"+f] = m
}
for _, f := range extraStats {
m, err := k.getProcValueInt(filepath.Join(k.ksmStatsDir, f))
if err != nil {
// if an extraStats metric doesn't exist in our kernel version, ignore it.
continue
}
fields["ksm_"+f] = m
}
}
acc.AddCounter("kernel", fields, map[string]string{}) acc.AddCounter("kernel", fields, map[string]string{})
return nil return nil
} }
func (k *Kernel) getProcStat() ([]byte, error) { func (k *Kernel) getProcValueBytes(path string) ([]byte, error) {
if _, err := os.Stat(k.statFile); os.IsNotExist(err) { if _, err := os.Stat(path); os.IsNotExist(err) {
return nil, fmt.Errorf("kernel: %s does not exist", k.statFile) return nil, fmt.Errorf("Path %q does not exist", path)
} else if err != nil { } else if err != nil {
return nil, err return nil, err
} }
data, err := os.ReadFile(k.statFile) data, err := os.ReadFile(path)
if err != nil { if err != nil {
return nil, err return nil, fmt.Errorf("failed to read from %q: %w", path, err)
} }
return data, nil return data, nil
} }
// getProcValueInt reads the file at path via getProcValueBytes and parses its
// whitespace-trimmed content as a base-10 signed 64-bit integer.
//
// On failure it returns -1 and an error: either the error returned by
// getProcValueBytes unchanged, or a parse error that names both the offending
// content and the file it was read from.
func (k *Kernel) getProcValueInt(path string) (int64, error) {
	data, err := k.getProcValueBytes(path)
	if err != nil {
		return -1, err
	}

	m, err := strconv.ParseInt(string(bytes.TrimSpace(data)), 10, 64)
	if err != nil {
		// Include the path so the malformed file can be identified when
		// many files are read in a loop.
		return -1, fmt.Errorf("failed to parse %q as an integer (read from %q): %w", data, path, err)
	}

	return m, nil
}
func init() { func init() {
inputs.Add("kernel", func() telegraf.Input { inputs.Add("kernel", func() telegraf.Input {
return &Kernel{ return &Kernel{
statFile: "/proc/stat", statFile: "/proc/stat",
entropyStatFile: "/proc/sys/kernel/random/entropy_avail", entropyStatFile: "/proc/sys/kernel/random/entropy_avail",
ksmStatsDir: "/sys/kernel/mm/ksm",
} }
}) })
} }

View File

@ -4,12 +4,37 @@ package kernel
import ( import (
"testing" "testing"
"time"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/testutil" "github.com/influxdata/telegraf/testutil"
) )
// TestGetProcValueInt checks that getProcValueInt parses the entropy fixture
// into the integer stored in it.
func TestGetProcValueInt(t *testing.T) {
	k := Kernel{
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
	}

	d, err := k.getProcValueInt(k.entropyStatFile)
	require.NoError(t, err)
	// The return type is fixed at compile time, so assert the parsed value.
	// 1024 is the entropy_avail value the Gather tests in this file expect
	// from the same fixture.
	require.Equal(t, int64(1024), d)
}
// TestGetProcValueByte checks that getProcValueBytes returns the raw content
// of the entropy fixture.
func TestGetProcValueByte(t *testing.T) {
	k := Kernel{
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
	}

	d, err := k.getProcValueBytes(k.entropyStatFile)
	require.NoError(t, err)
	// The return type is fixed at compile time, so assert the content.
	// The fixture holds the value the Gather tests in this file expect for
	// entropy_avail; surrounding whitespace is tolerated.
	require.Contains(t, string(d), "1024")
}
func TestFullProcFile(t *testing.T) { func TestFullProcFile(t *testing.T) {
k := Kernel{ k := Kernel{
statFile: "testdata/stat_file_full", statFile: "testdata/stat_file_full",
@ -19,7 +44,11 @@ func TestFullProcFile(t *testing.T) {
acc := testutil.Accumulator{} acc := testutil.Accumulator{}
require.NoError(t, k.Gather(&acc)) require.NoError(t, k.Gather(&acc))
fields := map[string]interface{}{ expected := []telegraf.Metric{
metric.New(
"kernel",
map[string]string{},
map[string]interface{}{
"boot_time": int64(1457505775), "boot_time": int64(1457505775),
"context_switches": int64(2626618), "context_switches": int64(2626618),
"disk_pages_in": int64(5741), "disk_pages_in": int64(5741),
@ -27,8 +56,12 @@ func TestFullProcFile(t *testing.T) {
"interrupts": int64(1472736), "interrupts": int64(1472736),
"processes_forked": int64(10673), "processes_forked": int64(10673),
"entropy_avail": int64(1024), "entropy_avail": int64(1024),
},
time.Unix(0, 0),
1,
),
} }
acc.AssertContainsFields(t, "kernel", fields) testutil.RequireMetricsEqual(t, expected, acc.GetTelegrafMetrics(), testutil.IgnoreTime())
} }
func TestPartialProcFile(t *testing.T) { func TestPartialProcFile(t *testing.T) {
@ -73,7 +106,7 @@ func TestInvalidProcFile2(t *testing.T) {
acc := testutil.Accumulator{} acc := testutil.Accumulator{}
err := k.Gather(&acc) err := k.Gather(&acc)
require.Error(t, err) require.Error(t, err)
require.Contains(t, err.Error(), "no such file") require.Contains(t, err.Error(), "does not exist")
} }
func TestNoProcFile(t *testing.T) { func TestNoProcFile(t *testing.T) {
@ -86,3 +119,101 @@ func TestNoProcFile(t *testing.T) {
require.Error(t, err) require.Error(t, err)
require.Contains(t, err.Error(), "does not exist") require.Contains(t, err.Error(), "does not exist")
} }
// TestInvalidCollectOption verifies that an unrecognised entry in the collect
// list makes neither Init nor Gather fail.
func TestInvalidCollectOption(t *testing.T) {
	plugin := Kernel{
		ConfigCollect:   []string{"invalidOption"},
		entropyStatFile: "testdata/entropy_stat_file_full",
		statFile:        "testdata/stat_file_full",
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))
}
// TestKsmEnabledValidKsmDirectory gathers with the "ksm" option enabled
// against a fully populated KSM fixture directory and expects a single
// "kernel" metric carrying both the /proc fields and every ksm_* field.
func TestKsmEnabledValidKsmDirectory(t *testing.T) {
	plugin := Kernel{
		ConfigCollect:   []string{"ksm"},
		ksmStatsDir:     "testdata/ksm/valid",
		entropyStatFile: "testdata/entropy_stat_file_full",
		statFile:        "testdata/stat_file_full",
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	// Fields read from the stat and entropy fixtures, followed by the KSM ones.
	wantFields := map[string]interface{}{
		"boot_time":        int64(1457505775),
		"context_switches": int64(2626618),
		"disk_pages_in":    int64(5741),
		"disk_pages_out":   int64(1808),
		"interrupts":       int64(1472736),
		"processes_forked": int64(10673),
		"entropy_avail":    int64(1024),

		"ksm_full_scans":                         int64(123),
		"ksm_max_page_sharing":                   int64(10000),
		"ksm_merge_across_nodes":                 int64(1),
		"ksm_pages_shared":                       int64(12922),
		"ksm_pages_sharing":                      int64(28384),
		"ksm_pages_to_scan":                      int64(12928),
		"ksm_pages_unshared":                     int64(92847),
		"ksm_pages_volatile":                     int64(2824171),
		"ksm_run":                                int64(1),
		"ksm_sleep_millisecs":                    int64(1000),
		"ksm_stable_node_chains":                 int64(0),
		"ksm_stable_node_chains_prune_millisecs": int64(0),
		"ksm_stable_node_dups":                   int64(0),
		"ksm_use_zero_pages":                     int64(1),
	}
	want := []telegraf.Metric{
		metric.New("kernel", map[string]string{}, wantFields, time.Unix(0, 0), 1),
	}

	got := acc.GetTelegrafMetrics()
	testutil.RequireMetricsEqual(t, want, got, testutil.IgnoreTime())
}
// TestKSMEnabledMissingFile expects Gather to fail with a "does not exist"
// error when the KSM directory is present but lacks one of the statistics
// files.
func TestKSMEnabledMissingFile(t *testing.T) {
	k := Kernel{
		// Use the repository fixtures rather than the host's /proc files so
		// that the only possible source of the "does not exist" error is the
		// incomplete KSM directory.
		statFile:        "testdata/stat_file_full",
		entropyStatFile: "testdata/entropy_stat_file_full",
		ksmStatsDir:     "testdata/ksm/missing",
		ConfigCollect:   []string{"ksm"},
	}

	require.NoError(t, k.Init())

	acc := testutil.Accumulator{}
	require.ErrorContains(t, k.Gather(&acc), "does not exist")
}
// TestKSMEnabledWrongDir expects Init to reject a configuration that enables
// "ksm" while pointing at a KSM directory that is not there.
func TestKSMEnabledWrongDir(t *testing.T) {
	plugin := Kernel{
		ConfigCollect: []string{"ksm"},
		ksmStatsDir:   "testdata/this_file_does_not_exist",
	}

	err := plugin.Init()
	require.ErrorContains(t, err, "Is KSM enabled in this kernel?")
}
// TestKSMDisabledNoKSMTags confirms that with an empty collect list a missing
// KSM directory is not an error and no ksm_* field is emitted.
func TestKSMDisabledNoKSMTags(t *testing.T) {
	plugin := Kernel{
		ConfigCollect:   []string{},
		ksmStatsDir:     "testdata/this_file_does_not_exist",
		entropyStatFile: "testdata/entropy_stat_file_full",
		statFile:        "testdata/stat_file_full",
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	hasKSM := acc.HasField("kernel", "ksm_run")
	require.False(t, hasKSM)
}

View File

@ -1,4 +1,7 @@
# Get kernel statistics from /proc/stat # Plugin to collect various Linux kernel statistics.
# This plugin ONLY supports Linux # This plugin ONLY supports Linux
[[inputs.kernel]] [[inputs.kernel]]
# no configuration ## Additional gather options
## Possible options include:
## * ksm - kernel same-page merging
# collect = []

View File

@ -0,0 +1 @@
123A

View File

@ -0,0 +1 @@
10000

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
12922

View File

@ -0,0 +1 @@
28384

View File

@ -0,0 +1 @@
12928

View File

@ -0,0 +1 @@
92847

View File

@ -0,0 +1 @@
2824171

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
1000

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
123

View File

@ -0,0 +1 @@
10000

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
12922

View File

@ -0,0 +1 @@
28384

View File

@ -0,0 +1 @@
12928

View File

@ -0,0 +1 @@
92847

View File

@ -0,0 +1 @@
2824171

View File

@ -0,0 +1 @@
1000

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
123

View File

@ -0,0 +1 @@
10000

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
12922

View File

@ -0,0 +1 @@
28384

View File

@ -0,0 +1 @@
12928

View File

@ -0,0 +1 @@
92847

View File

@ -0,0 +1 @@
2824171

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
1000

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
0

View File

@ -0,0 +1 @@
1