feat(inputs.linux_cpu): Add plugin to collect CPU metrics on Linux (#8988)

This commit is contained in:
Fabian Mastenbroek 2022-08-24 21:10:45 +02:00 committed by GitHub
parent 45abba836f
commit 7f3395f148
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 481 additions and 0 deletions

View File

@ -0,0 +1,5 @@
//go:build !custom || inputs || inputs.linux_cpu
package all
import _ "github.com/influxdata/telegraf/plugins/inputs/linux_cpu" // register plugin

View File

@ -0,0 +1,64 @@
# Linux CPU Input Plugin
The `linux_cpu` plugin gathers CPU metrics exposed on Linux-based systems.
## Configuration
```toml @sample.conf
# Provides Linux CPU metrics
[[inputs.linux_cpu]]
## Path for sysfs filesystem.
## See https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt
## Defaults:
# host_sys = "/sys"
## CPU metrics collected by the plugin.
## Supported options:
## "cpufreq", "thermal"
## Defaults:
# metrics = ["cpufreq"]
```
## Metrics
The following tags are emitted by the plugin under the name `linux_cpu`:
| Tag | Description |
|-------|-----------------------|
| `cpu` | Identifier of the CPU |
The following fields are emitted by the plugin when selecting `cpufreq`:
| Metric name (field) | Description | Units |
|---------------------|------------------------------------------------------------|-------|
| `scaling_cur_freq` | Current frequency of the CPU as determined by CPUFreq | kHz |
| `scaling_min_freq` | Minimum frequency the governor can scale to | kHz |
| `scaling_max_freq` | Maximum frequency the governor can scale to | kHz |
| `cpuinfo_cur_freq` | Current frequency of the CPU as determined by the hardware | kHz |
| `cpuinfo_min_freq` | Minimum operating frequency of the CPU | kHz |
| `cpuinfo_max_freq` | Maximum operating frequency of the CPU | kHz |
The following fields are emitted by the plugin when selecting `thermal`:
| Metric name (field) | Description | Units |
|-----------------------|-------------------------------------------------------------|-------|
| `throttle_count` | Number of thermal throttle events reported by the CPU | |
| `throttle_max_time` | Maximum amount of time CPU was in throttled state | ms |
| `throttle_total_time` | Cumulative time during which the CPU was in throttled state | ms |
## Example Output
```shell
> linux_cpu,cpu=0,host=go scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i,throttle_count=0i,throttle_max_time=0i,throttle_total_time=0i,scaling_cur_freq=803157i,scaling_min_freq=400000i 1617621150000000000
> linux_cpu,cpu=1,host=go throttle_total_time=0i,scaling_cur_freq=802939i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i,throttle_count=0i,throttle_max_time=0i 1617621150000000000
> linux_cpu,cpu=10,host=go throttle_max_time=0i,throttle_total_time=0i,scaling_cur_freq=838343i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i,throttle_count=0i 1617621150000000000
> linux_cpu,cpu=11,host=go cpuinfo_max_freq=4700000i,throttle_count=0i,throttle_max_time=0i,throttle_total_time=0i,scaling_cur_freq=800054i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i 1617621150000000000
> linux_cpu,cpu=2,host=go throttle_total_time=0i,scaling_cur_freq=800404i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i,throttle_count=0i,throttle_max_time=0i 1617621150000000000
> linux_cpu,cpu=3,host=go throttle_total_time=0i,scaling_cur_freq=800126i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i,throttle_count=0i,throttle_max_time=0i 1617621150000000000
> linux_cpu,cpu=4,host=go cpuinfo_max_freq=4700000i,throttle_count=0i,throttle_max_time=0i,throttle_total_time=0i,scaling_cur_freq=800359i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i 1617621150000000000
> linux_cpu,cpu=5,host=go throttle_max_time=0i,throttle_total_time=0i,scaling_cur_freq=800093i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i,throttle_count=0i 1617621150000000000
> linux_cpu,cpu=6,host=go cpuinfo_max_freq=4700000i,throttle_count=0i,throttle_max_time=0i,throttle_total_time=0i,scaling_cur_freq=741646i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i 1617621150000000000
> linux_cpu,cpu=7,host=go scaling_cur_freq=700006i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i,throttle_count=0i,throttle_max_time=0i,throttle_total_time=0i 1617621150000000000
> linux_cpu,cpu=8,host=go throttle_max_time=0i,throttle_total_time=0i,scaling_cur_freq=700046i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i,throttle_count=0i 1617621150000000000
> linux_cpu,cpu=9,host=go throttle_count=0i,throttle_max_time=0i,throttle_total_time=0i,scaling_cur_freq=700075i,scaling_min_freq=400000i,scaling_max_freq=4700000i,cpuinfo_min_freq=400000i,cpuinfo_max_freq=4700000i 1617621150000000000
```

View File

@ -0,0 +1,214 @@
//go:build linux
package linux_cpu
import (
_ "embed"
"fmt"
"io"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal/choice"
"github.com/influxdata/telegraf/plugins/inputs"
)
const (
// defaultHostSys is the sysfs root used by Init when host_sys is left empty.
defaultHostSys = "/sys"
// cpufreq is the value of the "metrics" option that selects the cpufreq/* properties.
cpufreq = "cpufreq"
// thermal is the value of the "metrics" option that selects the thermal_throttle/* properties.
thermal = "thermal"
)
// LinuxCPU is the linux_cpu input plugin. It reads per-CPU property files
// below the configured sysfs root and reports them as "linux_cpu" metrics.
type LinuxCPU struct {
// Log is the plugin logger; it is excluded from the TOML configuration.
Log telegraf.Logger `toml:"-"`
// PathSysfs is the sysfs root ("host_sys" option); Init falls back to defaultHostSys when it is empty.
PathSysfs string `toml:"host_sys"`
// Metrics lists the selected property groups ("metrics" option), e.g. "cpufreq" and/or "thermal".
Metrics []string `toml:"metrics"`
// cpus holds the CPUs found by discoverCpus during Init; Gather iterates over it.
cpus []cpu
}
// cpu describes one discovered CPU and the property files to read for it.
type cpu struct {
// id is the CPU directory name with the "cpu" prefix removed; it is emitted as the "cpu" tag.
id string
// path is the CPU's directory as matched by the discovery glob.
path string
// props maps a field name to the full path of the file the value is read from.
props map[string]string
}
// prop describes a single property that may be collected for a CPU.
type prop struct {
// name is the field name under which the value is reported.
name string
// path is the property file location relative to the CPU directory.
path string
// optional marks properties whose file may fail validation without causing the CPU to be dropped.
optional bool
}
// sampleConfig holds the content of sample.conf, embedded at build time.
//
//go:embed sample.conf
var sampleConfig string
// SampleConfig returns the embedded sample configuration of the plugin.
func (g *LinuxCPU) SampleConfig() string {
return sampleConfig
}
// Init applies the default sysfs root, checks that at least one metric group
// is selected and discovers the CPUs whose properties Gather will read.
//
// It returns an error when no metrics are configured, when discovery itself
// fails, or when none of the discovered CPUs turned out to be usable.
func (g *LinuxCPU) Init() error {
	if g.PathSysfs == "" {
		g.PathSysfs = defaultHostSys
	}

	// Without a selected metric group there is nothing to collect.
	if len(g.Metrics) == 0 {
		return fmt.Errorf("no metrics selected")
	}

	found, err := g.discoverCpus()
	if err != nil {
		return err
	}

	// CPU directories exist, but every one of them was rejected during
	// discovery because the requested properties could not be loaded.
	if len(found) == 0 {
		return fmt.Errorf("no CPUs detected to track")
	}

	g.cpus = found
	return nil
}
// Gather reads every tracked property of every discovered CPU and emits one
// "linux_cpu" metric per CPU, tagged with the CPU identifier.
//
// When any property of a CPU cannot be read, the error is reported to the
// accumulator and no metric is emitted for that CPU in this round. Gather
// itself always returns nil.
func (g *LinuxCPU) Gather(acc telegraf.Accumulator) error {
nextCPU:
	for i := range g.cpus {
		current := &g.cpus[i]

		values := make(map[string]interface{})
		for field, file := range current.props {
			value, err := readUintFromFile(file)
			if err != nil {
				// A partially filled metric is not reported: drop this CPU and move on.
				acc.AddError(err)
				continue nextCPU
			}
			values[field] = value
		}

		acc.AddFields("linux_cpu", values, map[string]string{"cpu": current.id})
	}

	return nil
}
// discoverCpus looks for cpu<N> directories below <host_sys>/devices/system/cpu
// and returns one entry per CPU whose selected properties passed validation.
//
// A CPU is dropped with a warning when a non-optional property fails
// validation or when no property at all could be loaded for it. Optional
// properties that fail validation are skipped silently. An error is returned
// only when globbing fails or matches no directory.
func (g *LinuxCPU) discoverCpus() ([]cpu, error) {
	pattern := path.Join(g.PathSysfs, "devices/system/cpu/cpu[0-9]*")
	matches, err := filepath.Glob(pattern)
	if err != nil {
		return nil, err
	}
	if len(matches) == 0 {
		return nil, fmt.Errorf("no CPUs detected at: %s", pattern)
	}

	// The wanted properties depend only on the configuration, so the list is
	// assembled once and shared by all CPUs.
	var wanted []prop
	if choice.Contains(cpufreq, g.Metrics) {
		wanted = append(wanted,
			prop{name: "scaling_cur_freq", path: "cpufreq/scaling_cur_freq"},
			prop{name: "scaling_min_freq", path: "cpufreq/scaling_min_freq"},
			prop{name: "scaling_max_freq", path: "cpufreq/scaling_max_freq"},
			prop{name: "cpuinfo_cur_freq", path: "cpufreq/cpuinfo_cur_freq", optional: true},
			prop{name: "cpuinfo_min_freq", path: "cpufreq/cpuinfo_min_freq", optional: true},
			prop{name: "cpuinfo_max_freq", path: "cpufreq/cpuinfo_max_freq", optional: true},
		)
	}
	if choice.Contains(thermal, g.Metrics) {
		wanted = append(wanted,
			prop{name: "throttle_count", path: "thermal_throttle/core_throttle_count"},
			prop{name: "throttle_max_time", path: "thermal_throttle/core_throttle_max_time_ms"},
			prop{name: "throttle_total_time", path: "thermal_throttle/core_throttle_total_time_ms"},
		)
	}

	var usable []cpu
	for _, dir := range matches {
		_, dirName := filepath.Split(dir)
		id := strings.TrimPrefix(dirName, "cpu")
		entry := cpu{
			id:    id,
			path:  dir,
			props: make(map[string]string),
		}

		keep := true
		for _, p := range wanted {
			file := filepath.Join(dir, p.path)
			verr := validatePath(file)
			if verr == nil {
				entry.props[p.name] = file
				continue
			}
			if p.optional {
				continue
			}
			// A required property is unavailable: give up on this CPU.
			g.Log.Warnf("Failed to load property %s: %v", file, verr)
			keep = false
			break
		}

		// Checked even after a failure above, so both warnings are logged when
		// the very first required property is missing.
		if len(entry.props) == 0 {
			g.Log.Warnf("No properties enabled/loaded for CPU %s", id)
			keep = false
		}

		if keep {
			usable = append(usable, entry)
		}
	}

	return usable, nil
}
func init() {
inputs.Add("linux_cpu", func() telegraf.Input {
return &LinuxCPU{
Metrics: []string{"cpufreq"},
}
})
}
// validatePath checks that the property file at propPath can be opened.
//
// It returns nil when the file opens successfully, a "does not exist" error
// when the path is missing, and a generic error wrapping the cause for any
// other failure to open it.
func validatePath(propPath string) error {
	handle, err := os.Open(propPath)
	switch {
	case err == nil:
		// The file was only opened for reading, so a Close error carries no information.
		_ = handle.Close()
		return nil
	case os.IsNotExist(err):
		return fmt.Errorf("CPU property does not exist: [%s]", propPath)
	default:
		return fmt.Errorf("cannot get system information for CPU property: [%s] - %v", propPath, err)
	}
}
// readUintFromFile reads the property file at propPath and parses its content
// as an unsigned base-10 integer.
//
// Surrounding whitespace, including a trailing newline, is stripped before
// parsing, so files with and without a final newline are both accepted. An
// error is returned when the file cannot be opened or read, is empty, or does
// not contain a valid uint64.
func readUintFromFile(propPath string) (uint64, error) {
	f, err := os.Open(propPath)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	// The largest uint64 has 20 digits; the remaining bytes leave room for a line terminator.
	buffer := make([]byte, 22)
	n, err := f.Read(buffer)
	if err != nil && err != io.EOF {
		return 0, fmt.Errorf("error on reading file, err: %v", err)
	} else if n == 0 {
		return 0, fmt.Errorf("error on reading file, file is empty")
	}

	// Trim rather than unconditionally dropping the last byte: the final
	// character is only a newline when the file actually ends with one, and
	// cutting a digit would silently report a wrong value.
	return strconv.ParseUint(strings.TrimSpace(string(buffer[:n])), 10, 64)
}

View File

@ -0,0 +1,3 @@
//go:build !linux
package linux_cpu

View File

@ -0,0 +1,183 @@
//go:build linux
package linux_cpu
import (
"github.com/influxdata/telegraf/testutil"
"os"
"testing"
"github.com/stretchr/testify/require"
)
// TestNoMetrics checks that Init rejects a plugin without any selected metric group.
func TestNoMetrics(t *testing.T) {
	var plugin LinuxCPU

	err := plugin.Init()
	require.Error(t, err)
}
// TestNoCPUs checks that Init fails when the sysfs root contains no CPU directories.
func TestNoCPUs(t *testing.T) {
	sysfs := t.TempDir()

	plugin := &LinuxCPU{
		Log:       testutil.Logger{Name: "LinuxCPUPluginTest"},
		Metrics:   []string{"cpufreq"},
		PathSysfs: sysfs,
	}

	err := plugin.Init()
	require.Error(t, err)
}
// TestNoCPUMetrics checks that Init fails when a CPU directory exists but
// holds none of the requested property files.
func TestNoCPUMetrics(t *testing.T) {
	sysfs := t.TempDir()

	// Only the (empty) cpufreq directory is created for cpu0.
	cpu0 := sysfs + "/devices/system/cpu/cpu0/cpufreq"
	require.NoError(t, os.MkdirAll(cpu0, os.ModePerm))

	plugin := &LinuxCPU{
		Log:       testutil.Logger{Name: "LinuxCPUPluginTest"},
		Metrics:   []string{"cpufreq"},
		PathSysfs: sysfs,
	}

	err := plugin.Init()
	require.Error(t, err)
}
// TestGatherCPUFreq checks that the required cpufreq properties of two CPUs
// are read and reported as one tagged metric per CPU.
func TestGatherCPUFreq(t *testing.T) {
	sysfs := t.TempDir()

	// cpu0 fixture; each value is newline-terminated.
	cpu0 := sysfs + "/devices/system/cpu/cpu0/cpufreq"
	require.NoError(t, os.MkdirAll(cpu0, os.ModePerm))
	require.NoError(t, os.WriteFile(cpu0+"/scaling_cur_freq", []byte("250\n"), 0644))
	require.NoError(t, os.WriteFile(cpu0+"/scaling_min_freq", []byte("100\n"), 0644))
	require.NoError(t, os.WriteFile(cpu0+"/scaling_max_freq", []byte("255\n"), 0644))

	// cpu1 fixture.
	cpu1 := sysfs + "/devices/system/cpu/cpu1/cpufreq"
	require.NoError(t, os.MkdirAll(cpu1, os.ModePerm))
	require.NoError(t, os.WriteFile(cpu1+"/scaling_cur_freq", []byte("123\n"), 0644))
	require.NoError(t, os.WriteFile(cpu1+"/scaling_min_freq", []byte("80\n"), 0644))
	require.NoError(t, os.WriteFile(cpu1+"/scaling_max_freq", []byte("230\n"), 0644))

	plugin := &LinuxCPU{
		Log:       testutil.Logger{Name: "LinuxCPUPluginTest"},
		Metrics:   []string{"cpufreq"},
		PathSysfs: sysfs,
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	acc.AssertContainsTaggedFields(t, "linux_cpu",
		map[string]interface{}{
			"scaling_cur_freq": uint64(250),
			"scaling_min_freq": uint64(100),
			"scaling_max_freq": uint64(255),
		},
		map[string]string{"cpu": "0"},
	)
	acc.AssertContainsTaggedFields(t, "linux_cpu",
		map[string]interface{}{
			"scaling_cur_freq": uint64(123),
			"scaling_min_freq": uint64(80),
			"scaling_max_freq": uint64(230),
		},
		map[string]string{"cpu": "1"},
	)
}
// TestGatherThermal checks that the thermal_throttle properties of a CPU are
// read and reported under the throttle_* field names.
func TestGatherThermal(t *testing.T) {
	sysfs := t.TempDir()

	throttle := sysfs + "/devices/system/cpu/cpu0/thermal_throttle"
	require.NoError(t, os.MkdirAll(throttle, os.ModePerm))
	require.NoError(t, os.WriteFile(throttle+"/core_throttle_count", []byte("250\n"), 0644))
	require.NoError(t, os.WriteFile(throttle+"/core_throttle_max_time_ms", []byte("100\n"), 0644))
	require.NoError(t, os.WriteFile(throttle+"/core_throttle_total_time_ms", []byte("255\n"), 0644))

	plugin := &LinuxCPU{
		Log:       testutil.Logger{Name: "LinuxCPUPluginTest"},
		Metrics:   []string{"thermal"},
		PathSysfs: sysfs,
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	expected := map[string]interface{}{
		"throttle_count":      uint64(250),
		"throttle_max_time":   uint64(100),
		"throttle_total_time": uint64(255),
	}
	acc.AssertContainsFields(t, "linux_cpu", expected)
}
// TestGatherPropertyRemoved checks that a property file disappearing after
// Init makes Gather report an error and emit no metric for that CPU.
func TestGatherPropertyRemoved(t *testing.T) {
	sysfs := t.TempDir()

	cpu0 := sysfs + "/devices/system/cpu/cpu0/cpufreq"
	require.NoError(t, os.MkdirAll(cpu0, os.ModePerm))
	require.NoError(t, os.WriteFile(cpu0+"/scaling_cur_freq", []byte("250\n"), 0644))
	require.NoError(t, os.WriteFile(cpu0+"/scaling_min_freq", []byte("100\n"), 0644))
	require.NoError(t, os.WriteFile(cpu0+"/scaling_max_freq", []byte("255\n"), 0644))

	plugin := &LinuxCPU{
		Log:       testutil.Logger{Name: "LinuxCPUPluginTest"},
		Metrics:   []string{"cpufreq"},
		PathSysfs: sysfs,
	}
	require.NoError(t, plugin.Init())

	// Take one property away between discovery and collection.
	require.NoError(t, os.RemoveAll(cpu0+"/scaling_max_freq"))

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	unexpected := map[string]interface{}{
		"scaling_cur_freq": uint64(250),
		"scaling_min_freq": uint64(100),
		"scaling_max_freq": uint64(255),
	}
	acc.AssertDoesNotContainsTaggedFields(t, "linux_cpu", unexpected, map[string]string{"cpu": "0"})
	require.NotEmpty(t, acc.Errors)
}
// TestGatherPropertyInvalid checks that a property file with non-numeric
// content makes Gather report an error and emit no metric for that CPU.
func TestGatherPropertyInvalid(t *testing.T) {
	sysfs := t.TempDir()

	cpu0 := sysfs + "/devices/system/cpu/cpu0/cpufreq"
	require.NoError(t, os.MkdirAll(cpu0, os.ModePerm))
	// scaling_cur_freq deliberately holds a value that cannot be parsed as a number.
	require.NoError(t, os.WriteFile(cpu0+"/scaling_cur_freq", []byte("ABC\n"), 0644))
	require.NoError(t, os.WriteFile(cpu0+"/scaling_min_freq", []byte("100\n"), 0644))
	require.NoError(t, os.WriteFile(cpu0+"/scaling_max_freq", []byte("255\n"), 0644))

	plugin := &LinuxCPU{
		Log:       testutil.Logger{Name: "LinuxCPUPluginTest"},
		Metrics:   []string{"cpufreq"},
		PathSysfs: sysfs,
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	unexpected := map[string]interface{}{
		"scaling_cur_freq": uint64(250),
		"scaling_min_freq": uint64(100),
		"scaling_max_freq": uint64(255),
	}
	acc.AssertDoesNotContainsTaggedFields(t, "linux_cpu", unexpected, map[string]string{"cpu": "0"})
	require.NotEmpty(t, acc.Errors)
}

View File

@ -0,0 +1,12 @@
# Provides Linux CPU metrics
[[inputs.linux_cpu]]
## Path for sysfs filesystem.
## See https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt
## Defaults:
# host_sys = "/sys"
## CPU metrics collected by the plugin.
## Supported options:
## "cpufreq", "thermal"
## Defaults:
# metrics = ["cpufreq"]