feat(intel_powerstat): add uncore frequency metrics (#11254)
This commit is contained in:
parent
d3ee1b74fd
commit
6d829c199d
|
|
@ -1,10 +1,13 @@
|
|||
# Intel PowerStat Input Plugin
|
||||
|
||||
This input plugin monitors power statistics on Intel-based platforms and assumes presence of Linux based OS.
|
||||
This input plugin monitors power statistics on Intel-based platforms and assumes
|
||||
presence of Linux based OS.
|
||||
|
||||
Main use cases are power saving and workload migration. Telemetry frameworks allow users to monitor critical platform level metrics.
|
||||
Key source of platform telemetry is power domain that is beneficial for MANO/Monitoring&Analytics systems
|
||||
to take preventive/corrective actions based on platform busyness, CPU temperature, actual CPU utilization and power statistics.
|
||||
Main use cases are power saving and workload migration. Telemetry frameworks
|
||||
allow users to monitor critical platform level metrics. Key source of platform
|
||||
telemetry is power domain that is beneficial for MANO/Monitoring&Analytics
|
||||
systems to take preventive/corrective actions based on platform busyness, CPU
|
||||
temperature, actual CPU utilization and power statistics.
|
||||
|
||||
## Configuration
|
||||
|
||||
|
|
@ -16,7 +19,7 @@ to take preventive/corrective actions based on platform busyness, CPU temperatur
|
|||
## - Setting this value to an empty array means no package metrics will be collected
|
||||
## - Finally, a user can specify individual metrics to capture from the supported options list
|
||||
## Supported options:
|
||||
## "current_power_consumption", "current_dram_power_consumption", "thermal_design_power", "max_turbo_frequency"
|
||||
## "current_power_consumption", "current_dram_power_consumption", "thermal_design_power", "max_turbo_frequency", "uncore_frequency"
|
||||
# package_metrics = ["current_power_consumption", "current_dram_power_consumption", "thermal_design_power"]
|
||||
|
||||
## The user can choose which per-CPU metrics are monitored by the plugin in cpu_metrics array.
|
||||
|
|
@ -29,7 +32,8 @@ to take preventive/corrective actions based on platform busyness, CPU temperatur
|
|||
|
||||
## Example: Configuration with no per-CPU telemetry
|
||||
|
||||
This configuration allows getting default processor package specific metrics, no per-CPU metrics are collected:
|
||||
This configuration allows getting default processor package specific metrics, no
|
||||
per-CPU metrics are collected:
|
||||
|
||||
```toml
|
||||
[[inputs.intel_powerstat]]
|
||||
|
|
@ -38,7 +42,8 @@ This configuration allows getting default processor package specific metrics, no
|
|||
|
||||
## Example: Configuration with no per-CPU telemetry - equivalent case
|
||||
|
||||
This configuration allows getting default processor package specific metrics, no per-CPU metrics are collected:
|
||||
This configuration allows getting default processor package specific metrics, no
|
||||
per-CPU metrics are collected:
|
||||
|
||||
```toml
|
||||
[[inputs.intel_powerstat]]
|
||||
|
|
@ -46,7 +51,8 @@ This configuration allows getting default processor package specific metrics, no
|
|||
|
||||
## Example: Configuration for CPU Temperature and CPU Frequency
|
||||
|
||||
This configuration allows getting default processor package specific metrics, plus subset of per-CPU metrics (CPU Temperature and CPU Frequency):
|
||||
This configuration allows getting default processor package specific metrics,
|
||||
plus subset of per-CPU metrics (CPU Temperature and CPU Frequency):
|
||||
|
||||
```toml
|
||||
[[inputs.intel_powerstat]]
|
||||
|
|
@ -55,7 +61,8 @@ This configuration allows getting default processor package specific metrics, pl
|
|||
|
||||
## Example: Configuration for CPU Temperature and CPU Frequency without default package metrics
|
||||
|
||||
This configuration allows getting only a subset of per-CPU metrics (CPU Temperature and CPU Frequency):
|
||||
This configuration allows getting only a subset of per-CPU metrics (CPU
|
||||
Temperature and CPU Frequency):
|
||||
|
||||
```toml
|
||||
[[inputs.intel_powerstat]]
|
||||
|
|
@ -65,27 +72,33 @@ This configuration allows getting only a subset of per-CPU metrics (CPU Temperat
|
|||
|
||||
## Example: Configuration with all available metrics
|
||||
|
||||
This configuration allows getting all processor package specific metrics and all per-CPU metrics:
|
||||
This configuration allows getting all processor package specific metrics and all
|
||||
per-CPU metrics:
|
||||
|
||||
```toml
|
||||
[[inputs.intel_powerstat]]
|
||||
package_metrics = ["current_power_consumption", "current_dram_power_consumption", "thermal_design_power", "max_turbo_frequency"]
|
||||
package_metrics = ["current_power_consumption", "current_dram_power_consumption", "thermal_design_power", "max_turbo_frequency", "uncore_frequency"]
|
||||
cpu_metrics = ["cpu_frequency", "cpu_busy_frequency", "cpu_temperature", "cpu_c0_state_residency", "cpu_c1_state_residency", "cpu_c6_state_residency"]
|
||||
```
|
||||
|
||||
## SW Dependencies
|
||||
|
||||
Plugin is based on Linux Kernel modules that expose specific metrics over `sysfs` or `devfs` interfaces.
|
||||
The following dependencies are expected by plugin:
|
||||
Plugin is based on Linux Kernel modules that expose specific metrics over
|
||||
`sysfs` or `devfs` interfaces. The following dependencies are expected by
|
||||
plugin:
|
||||
|
||||
- _intel-rapl_ module which exposes Intel Runtime Power Limiting metrics over `sysfs` (`/sys/devices/virtual/powercap/intel-rapl`),
|
||||
- _msr_ kernel module that provides access to processor model specific registers over `devfs` (`/dev/cpu/cpu%d/msr`),
|
||||
- _cpufreq_ kernel module - which exposes per-CPU Frequency over `sysfs` (`/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq`).
|
||||
- _intel-uncore-frequency_ module - which exposes Intel uncore frequency metrics over `sysfs` (`/sys/devices/system/cpu/intel_uncore_frequency`).
|
||||
|
||||
Minimum kernel version required is 3.13 to satisfy all requirements.
|
||||
Minimum kernel version required is 3.13 to satisfy most of the requirements,
|
||||
for `uncore_frequency` metrics `intel-uncore-frequency` module is required
|
||||
(available since kernel 5.6).
|
||||
|
||||
Please make sure that kernel modules are loaded and running (cpufreq is integrated in kernel). Modules might have to be manually enabled by using `modprobe`.
|
||||
Depending on the kernel version, run commands:
|
||||
Please make sure that kernel modules are loaded and running (cpufreq is
|
||||
integrated in kernel). Modules might have to be manually enabled by using
|
||||
`modprobe`. Depending on the kernel version, run commands:
|
||||
|
||||
```sh
|
||||
# kernel 5.x.x:
|
||||
|
|
@ -94,13 +107,17 @@ subo modprobe msr
|
|||
sudo modprobe intel_rapl_common
|
||||
sudo modprobe intel_rapl_msr
|
||||
|
||||
# also for kernel >= 5.6.0
|
||||
sudo modprobe intel-uncore-frequency
|
||||
|
||||
# kernel 4.x.x:
|
||||
sudo modprobe msr
|
||||
sudo modprobe intel_rapl
|
||||
```
|
||||
|
||||
**Telegraf with Intel PowerStat plugin enabled may require root access to read model specific registers (MSRs)**
|
||||
to retrieve data for calculation of most critical per-CPU specific metrics:
|
||||
**Telegraf with Intel PowerStat plugin enabled may require root access to read
|
||||
model specific registers (MSRs)** to retrieve data for calculation of most
|
||||
critical per-CPU specific metrics:
|
||||
|
||||
- `cpu_busy_frequency_mhz`
|
||||
- `cpu_temperature_celsius`
|
||||
|
|
@ -111,17 +128,20 @@ to retrieve data for calculation of most critical per-CPU specific metrics:
|
|||
and to retrieve data for calculation per-package specific metric:
|
||||
|
||||
- `max_turbo_frequency_mhz`
|
||||
- `uncore_frequency_mhz_cur`
|
||||
|
||||
To expose other Intel PowerStat metrics root access may or may not be required (depending on OS type or configuration).
|
||||
To expose other Intel PowerStat metrics root access may or may not be required
|
||||
(depending on OS type or configuration).
|
||||
|
||||
## HW Dependencies
|
||||
|
||||
Specific metrics require certain processor features to be present, otherwise Intel PowerStat plugin won't be able to
|
||||
read them. When using Linux Kernel based OS, user can detect supported processor features reading `/proc/cpuinfo` file.
|
||||
Specific metrics require certain processor features to be present, otherwise
|
||||
Intel PowerStat plugin won't be able to read them. When using Linux Kernel based
|
||||
OS, user can detect supported processor features reading `/proc/cpuinfo` file.
|
||||
Plugin assumes crucial properties are the same for all CPU cores in the system.
|
||||
The following processor properties are examined in more detail in this section:
|
||||
processor _cpu family_, _model_ and _flags_.
|
||||
The following processor properties are required by the plugin:
|
||||
processor _cpu family_, _model_ and _flags_. The following processor properties
|
||||
are required by the plugin:
|
||||
|
||||
- Processor _cpu family_ must be Intel (0x6) - since data used by the plugin assumes Intel specific
|
||||
model specific registers for all features
|
||||
|
|
@ -186,9 +206,11 @@ and _powerstat\_core.cpu\_c6\_state\_residency_ metrics:
|
|||
|
||||
## Metrics
|
||||
|
||||
All metrics collected by Intel PowerStat plugin are collected in fixed intervals.
|
||||
Metrics that reports processor C-state residency or power are calculated over elapsed intervals.
|
||||
When starting to measure metrics, plugin skips first iteration of metrics if they are based on deltas with previous value.
|
||||
All metrics collected by Intel PowerStat plugin are collected in fixed
|
||||
intervals. Metrics that report processor C-state residency or power are
|
||||
calculated over elapsed intervals. When starting to measure metrics, plugin
|
||||
skips first iteration of metrics if they are based on deltas with previous
|
||||
value.
|
||||
|
||||
**The following measurements are supported by Intel PowerStat plugin:**
|
||||
|
||||
|
|
@ -225,6 +247,8 @@ When starting to measure metrics, plugin skips first iteration of metrics if the
|
|||
|-----|-------------|
|
||||
| `package_id` | ID of platform package/socket |
|
||||
| `active_cores`| Specific tag for `max_turbo_frequency_mhz` metric. The maximum number of activated cores for reachable turbo frequency
|
||||
| `die`| Specific tag for all `uncore_frequency` metrics. ID of the die
|
||||
| `type`| Specific tag for all `uncore_frequency` metrics. Type of uncore frequency (current or initial)
|
||||
|
||||
Measurement powerstat_package metrics are collected per processor package -_package_id_ tag indicates which package metric refers to.
|
||||
|
||||
|
|
@ -232,25 +256,32 @@ When starting to measure metrics, plugin skips first iteration of metrics if the
|
|||
|
||||
| Metric name (field) | Description | Units |
|
||||
|-----|-------------|-----|
|
||||
| `thermal_design_power_watts` | Maximum Thermal Design Power (TDP) available for processor package | Watts |
|
||||
| `thermal_design_power_watts` | Maximum Thermal Design Power (TDP) available for processor package | Watts |
|
||||
| `current_power_consumption_watts` | Current power consumption of processor package | Watts |
|
||||
| `current_dram_power_consumption_watts` | Current power consumption of processor package DRAM subsystem | Watts |
|
||||
| `max_turbo_frequency_mhz`| Maximum reachable turbo frequency for number of cores active | MHz
|
||||
| `uncore_frequency_limit_mhz_min`| Minimum uncore frequency limit for die in processor package | MHz
|
||||
| `uncore_frequency_limit_mhz_max`| Maximum uncore frequency limit for die in processor package | MHz
|
||||
| `uncore_frequency_mhz_cur`| Current uncore frequency for die in processor package. Available only with tag `current`. Since this value is not yet available from `intel-uncore-frequency` module it needs to be accessed via MSR. In case of lack of loaded msr, only `uncore_frequency_limit_mhz_min` and `uncore_frequency_limit_mhz_max` metrics will be collected | MHz
|
||||
|
||||
### Known issues
|
||||
|
||||
From linux kernel version v5.4.77 with [this kernel change](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=v5.4.77&id=19f6d91bdad42200aac557a683c17b1f65ee6c94)
|
||||
resources like `/sys/class/powercap/intel-rapl*/*/energy_uj` are readable only by root for security reasons, so this plugin needs root privileges to work properly.
|
||||
From linux kernel version v5.4.77 with [this kernel change][19f6d91b] resources
|
||||
like `/sys/class/powercap/intel-rapl*/*/energy_uj` are readable only by root for
|
||||
security reasons, so this plugin needs root privileges to work properly.
|
||||
|
||||
If such strict security restrictions are not relevant, reading permissions to files in `/sys/devices/virtual/powercap/intel-rapl/`
|
||||
directory can be manually changed for example with `chmod` command with custom parameters.
|
||||
For example to give all users permission to all files in `intel-rapl` directory:
|
||||
If such strict security restrictions are not relevant, reading permissions to
|
||||
files in `/sys/devices/virtual/powercap/intel-rapl/` directory can be manually
|
||||
changed for example with `chmod` command with custom parameters. For example to
|
||||
give all users permission to all files in `intel-rapl` directory:
|
||||
|
||||
```bash
|
||||
sudo chmod -R a+rx /sys/devices/virtual/powercap/intel-rapl/
|
||||
```
|
||||
|
||||
### Example Output
|
||||
[19f6d91b]: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=v5.4.77&id=19f6d91bdad42200aac557a683c17b1f65ee6c94
|
||||
|
||||
## Example Output
|
||||
|
||||
```shell
|
||||
powerstat_package,host=ubuntu,package_id=0 thermal_design_power_watts=160 1606494744000000000
|
||||
|
|
@ -258,6 +289,8 @@ powerstat_package,host=ubuntu,package_id=0 current_power_consumption_watts=35 16
|
|||
powerstat_package,host=ubuntu,package_id=0 current_dram_power_consumption_watts=13.94 1606494744000000000
|
||||
powerstat_package,host=ubuntu,package_id=0,active_cores=0 max_turbo_frequency_mhz=3000i 1606494744000000000
|
||||
powerstat_package,host=ubuntu,package_id=0,active_cores=1 max_turbo_frequency_mhz=2800i 1606494744000000000
|
||||
powerstat_package,die=0,host=ubuntu,package_id=0,type=initial uncore_frequency_limit_mhz_min=800,uncore_frequency_limit_mhz_max=2400 1606494744000000000
|
||||
powerstat_package,die=0,host=ubuntu,package_id=0,type=current uncore_frequency_mhz_cur=800i,uncore_frequency_limit_mhz_min=800,uncore_frequency_limit_mhz_max=2400 1606494744000000000
|
||||
powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_frequency_mhz=1200.29 1606494744000000000
|
||||
powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_temperature_celsius=34i 1606494744000000000
|
||||
powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_c6_state_residency_percent=92.52 1606494744000000000
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
// Code generated by mockery v2.10.0. DO NOT EDIT.
|
||||
// Code generated by mockery v2.12.3. DO NOT EDIT.
|
||||
|
||||
package intel_powerstat
|
||||
|
||||
|
|
@ -130,3 +130,18 @@ func (_m *mockFileService) readFileToFloat64(reader io.Reader) (float64, int64,
|
|||
|
||||
return r0, r1, r2
|
||||
}
|
||||
|
||||
type newmockFileServiceT interface {
|
||||
mock.TestingT
|
||||
Cleanup(func())
|
||||
}
|
||||
|
||||
// newmockFileService creates a new instance of mockFileService. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
|
||||
func newmockFileService(t newmockFileServiceT) *mockFileService {
|
||||
mock := &mockFileService{}
|
||||
mock.Mock.Test(t)
|
||||
|
||||
t.Cleanup(func() { mock.AssertExpectations(t) })
|
||||
|
||||
return mock
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ package intel_powerstat
|
|||
|
||||
import (
|
||||
_ "embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/big"
|
||||
"strconv"
|
||||
|
|
@ -33,6 +34,7 @@ const (
|
|||
packageCurrentDramPowerConsumption = "current_dram_power_consumption"
|
||||
packageThermalDesignPower = "thermal_design_power"
|
||||
packageTurboLimit = "max_turbo_frequency"
|
||||
packageUncoreFrequency = "uncore_frequency"
|
||||
percentageMultiplier = 100
|
||||
)
|
||||
|
||||
|
|
@ -57,6 +59,7 @@ type PowerStat struct {
|
|||
packageCurrentPowerConsumption bool
|
||||
packageCurrentDramPowerConsumption bool
|
||||
packageThermalDesignPower bool
|
||||
packageUncoreFrequency bool
|
||||
cpuInfo map[string]*cpuInfo
|
||||
skipFirstIteration bool
|
||||
logOnce map[string]error
|
||||
|
|
@ -76,10 +79,10 @@ func (p *PowerStat) Init() error {
|
|||
}
|
||||
// Initialize MSR service only when there is at least one metric enabled
|
||||
if p.cpuFrequency || p.cpuBusyFrequency || p.cpuTemperature || p.cpuC0StateResidency || p.cpuC1StateResidency ||
|
||||
p.cpuC6StateResidency || p.cpuBusyCycles || p.packageTurboLimit {
|
||||
p.cpuC6StateResidency || p.cpuBusyCycles || p.packageTurboLimit || p.packageUncoreFrequency {
|
||||
p.msr = newMsrServiceWithFs(p.Log, p.fs)
|
||||
}
|
||||
if p.packageCurrentPowerConsumption || p.packageCurrentDramPowerConsumption || p.packageThermalDesignPower || p.packageTurboLimit {
|
||||
if p.packageCurrentPowerConsumption || p.packageCurrentDramPowerConsumption || p.packageThermalDesignPower || p.packageTurboLimit || p.packageUncoreFrequency {
|
||||
p.rapl = newRaplServiceWithFs(p.Log, p.fs)
|
||||
}
|
||||
|
||||
|
|
@ -97,7 +100,17 @@ func (p *PowerStat) Gather(acc telegraf.Accumulator) error {
|
|||
}
|
||||
|
||||
if p.areCoreMetricsEnabled() {
|
||||
p.addPerCoreMetrics(acc)
|
||||
if p.msr.isMsrLoaded() {
|
||||
p.logOnce["msr"] = nil
|
||||
p.addPerCoreMetrics(acc)
|
||||
} else {
|
||||
err := errors.New("error while trying to read MSR (probably msr module was not loaded)")
|
||||
if val := p.logOnce["msr"]; val == nil || val.Error() != err.Error() {
|
||||
p.Log.Errorf("%v", err)
|
||||
// Remember that specific error occurs to omit logging next time
|
||||
p.logOnce["msr"] = err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gathering the first iteration of metrics was skipped for most of them because they are based on delta calculations
|
||||
|
|
@ -109,25 +122,31 @@ func (p *PowerStat) Gather(acc telegraf.Accumulator) error {
|
|||
func (p *PowerStat) addGlobalMetrics(acc telegraf.Accumulator) {
|
||||
// Prepare RAPL data each gather because there is a possibility to disable rapl kernel module
|
||||
p.rapl.initializeRaplData()
|
||||
|
||||
for socketID := range p.rapl.getRaplData() {
|
||||
if p.packageTurboLimit {
|
||||
p.addTurboRatioLimit(socketID, acc)
|
||||
}
|
||||
|
||||
if p.packageUncoreFrequency {
|
||||
die := maxDiePerSocket(socketID)
|
||||
for actualDie := 0; actualDie < die; actualDie++ {
|
||||
p.addUncoreFreq(socketID, strconv.Itoa(actualDie), acc)
|
||||
}
|
||||
}
|
||||
|
||||
err := p.rapl.retrieveAndCalculateData(socketID)
|
||||
if err != nil {
|
||||
// In case of an error skip calculating metrics for this socket
|
||||
if val := p.logOnce[socketID]; val == nil || val.Error() != err.Error() {
|
||||
if val := p.logOnce[socketID+"rapl"]; val == nil || val.Error() != err.Error() {
|
||||
p.Log.Errorf("error fetching rapl data for socket %s, err: %v", socketID, err)
|
||||
// Remember that specific error occurs for socketID to omit logging next time
|
||||
p.logOnce[socketID] = err
|
||||
p.logOnce[socketID+"rapl"] = err
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// If error stops occurring, clear logOnce indicator
|
||||
p.logOnce[socketID] = nil
|
||||
p.logOnce[socketID+"rapl"] = nil
|
||||
if p.packageThermalDesignPower {
|
||||
p.addThermalDesignPowerMetric(socketID, acc)
|
||||
}
|
||||
|
|
@ -143,6 +162,84 @@ func (p *PowerStat) addGlobalMetrics(acc telegraf.Accumulator) {
|
|||
}
|
||||
}
|
||||
}
|
||||
// maxDiePerSocket returns the number of dies assumed for a socket. The socket
// ID argument is currently ignored and the result is always 1.
//
// TODO: At the moment, linux does not distinguish between more dies per socket.
// This piece of code will need to be upgraded in the future.
// https://github.com/torvalds/linux/blob/v5.17/arch/x86/include/asm/topology.h#L153
func maxDiePerSocket(_ string) int {
	const diesPerSocket = 1
	return diesPerSocket
}
|
||||
|
||||
func (p *PowerStat) addUncoreFreq(socketID string, die string, acc telegraf.Accumulator) {
|
||||
err := checkFile("/sys/devices/system/cpu/intel_uncore_frequency")
|
||||
if err != nil {
|
||||
err := fmt.Errorf("error while checking existing intel_uncore_frequency (probably intel-uncore-frequency module was not loaded)")
|
||||
if val := p.logOnce["intel_uncore_frequency"]; val == nil || val.Error() != err.Error() {
|
||||
p.Log.Errorf("%v", err)
|
||||
// Remember that specific error occurs to omit logging next time
|
||||
p.logOnce["intel_uncore_frequency"] = err
|
||||
}
|
||||
return
|
||||
}
|
||||
p.logOnce["intel_uncore_frequency"] = nil
|
||||
p.readUncoreFreq("initial", socketID, die, acc)
|
||||
p.readUncoreFreq("current", socketID, die, acc)
|
||||
}
|
||||
|
||||
func (p *PowerStat) readUncoreFreq(typeFreq string, socketID string, die string, acc telegraf.Accumulator) {
|
||||
fields := map[string]interface{}{}
|
||||
cpuID := ""
|
||||
if typeFreq == "current" {
|
||||
if p.areCoreMetricsEnabled() && p.msr.isMsrLoaded() {
|
||||
p.logOnce[socketID+"msr"] = nil
|
||||
for _, v := range p.cpuInfo {
|
||||
if v.physicalID == socketID {
|
||||
cpuID = v.cpuID
|
||||
}
|
||||
}
|
||||
if cpuID == "" {
|
||||
p.Log.Debugf("error while reading socket ID")
|
||||
return
|
||||
}
|
||||
actualUncoreFreq, err := p.msr.readSingleMsr(cpuID, "MSR_UNCORE_PERF_STATUS")
|
||||
if err != nil {
|
||||
p.Log.Debugf("error while reading MSR_UNCORE_PERF_STATUS: %v", err)
|
||||
return
|
||||
}
|
||||
actualUncoreFreq = (actualUncoreFreq & 0x3F) * 100
|
||||
fields["uncore_frequency_mhz_cur"] = actualUncoreFreq
|
||||
} else {
|
||||
err := errors.New("error while trying to read MSR (probably msr module was not loaded), uncore_frequency_mhz_cur metric will not be collected")
|
||||
if val := p.logOnce[socketID+"msr"]; val == nil || val.Error() != err.Error() {
|
||||
p.Log.Errorf("%v", err)
|
||||
// Remember that specific error occurs for socketID to omit logging next time
|
||||
p.logOnce[socketID+"msr"] = err
|
||||
}
|
||||
}
|
||||
}
|
||||
initMinFreq, err := p.msr.retrieveUncoreFrequency(socketID, typeFreq, "min", die)
|
||||
if err != nil {
|
||||
p.Log.Errorf("error while retrieving minimum uncore frequency of the socket %s, err: %v", socketID, err)
|
||||
return
|
||||
}
|
||||
initMaxFreq, err := p.msr.retrieveUncoreFrequency(socketID, typeFreq, "max", die)
|
||||
if err != nil {
|
||||
p.Log.Errorf("error while retrieving maximum uncore frequency of the socket %s, err: %v", socketID, err)
|
||||
return
|
||||
}
|
||||
|
||||
tags := map[string]string{
|
||||
"package_id": socketID,
|
||||
"type": typeFreq,
|
||||
"die": die,
|
||||
}
|
||||
fields["uncore_frequency_limit_mhz_min"] = initMinFreq
|
||||
fields["uncore_frequency_limit_mhz_max"] = initMaxFreq
|
||||
|
||||
acc.AddGauge("powerstat_package", fields, tags)
|
||||
}
|
||||
|
||||
func (p *PowerStat) addThermalDesignPowerMetric(socketID string, acc telegraf.Accumulator) {
|
||||
maxPower, err := p.rapl.getConstraintMaxPowerWatts(socketID)
|
||||
|
|
@ -579,6 +676,9 @@ func (p *PowerStat) parsePackageMetricsConfig() {
|
|||
if contains(p.PackageMetrics, packageThermalDesignPower) {
|
||||
p.packageThermalDesignPower = true
|
||||
}
|
||||
if contains(p.PackageMetrics, packageUncoreFrequency) {
|
||||
p.packageUncoreFrequency = true
|
||||
}
|
||||
}
|
||||
|
||||
func (p *PowerStat) parseCPUMetricsConfig() {
|
||||
|
|
@ -693,6 +793,7 @@ func newPowerStat(fs fileService) *PowerStat {
|
|||
cpuTemperature: false,
|
||||
cpuBusyFrequency: false,
|
||||
packageTurboLimit: false,
|
||||
packageUncoreFrequency: false,
|
||||
packageCurrentPowerConsumption: false,
|
||||
packageCurrentDramPowerConsumption: false,
|
||||
packageThermalDesignPower: false,
|
||||
|
|
|
|||
|
|
@ -119,6 +119,7 @@ func TestGather(t *testing.T) {
|
|||
On("retrieveAndCalculateData", mock.Anything).Return(nil).Times(len(raplDataMap)).
|
||||
On("getConstraintMaxPowerWatts", mock.Anything).Return(546783852.3, nil)
|
||||
mockServices.msr.On("getCPUCoresData").Return(preparedCPUData).
|
||||
On("isMsrLoaded", mock.Anything).Return(true).
|
||||
On("openAndReadMsr", mock.Anything).Return(nil).
|
||||
On("retrieveCPUFrequencyForCore", mock.Anything).Return(1200000.2, nil)
|
||||
|
||||
|
|
@ -227,6 +228,43 @@ func TestAddCPUFrequencyMetric(t *testing.T) {
|
|||
acc.AssertContainsTaggedFields(t, "powerstat_core", expectedMetric.fields, expectedMetric.tags)
|
||||
}
|
||||
|
||||
func TestReadUncoreFreq(t *testing.T) {
|
||||
var acc testutil.Accumulator
|
||||
cpuID := "0"
|
||||
coreID := "0"
|
||||
packageID := "0"
|
||||
die := "0"
|
||||
power, mockServices := getPowerWithMockedServices()
|
||||
prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID)
|
||||
preparedData := getPreparedCPUData([]string{cpuID})
|
||||
|
||||
mockServices.msr.On("getCPUCoresData").Return(preparedData)
|
||||
|
||||
mockServices.msr.On("isMsrLoaded").Return(true)
|
||||
|
||||
mockServices.msr.On("readSingleMsr", "0", "MSR_UNCORE_PERF_STATUS").Return(uint64(10), nil)
|
||||
|
||||
mockServices.msr.On("retrieveUncoreFrequency", "0", "initial", "min", "0").
|
||||
Return(float64(500), nil)
|
||||
mockServices.msr.On("retrieveUncoreFrequency", "0", "initial", "max", "0").
|
||||
Return(float64(1200), nil)
|
||||
mockServices.msr.On("retrieveUncoreFrequency", "0", "current", "min", "0").
|
||||
Return(float64(600), nil)
|
||||
mockServices.msr.On("retrieveUncoreFrequency", "0", "current", "max", "0").
|
||||
Return(float64(1100), nil)
|
||||
|
||||
power.readUncoreFreq("current", packageID, die, &acc)
|
||||
power.readUncoreFreq("initial", packageID, die, &acc)
|
||||
|
||||
require.Equal(t, 2, len(acc.GetTelegrafMetrics()))
|
||||
|
||||
expectedMetric := getPowerUncoreFreqMetric("initial", float64(500), float64(1200), nil, packageID, die)
|
||||
acc.AssertContainsTaggedFields(t, "powerstat_package", expectedMetric.fields, expectedMetric.tags)
|
||||
|
||||
expectedMetric = getPowerUncoreFreqMetric("current", float64(600), float64(1100), uint64(1000), packageID, die)
|
||||
acc.AssertContainsTaggedFields(t, "powerstat_package", expectedMetric.fields, expectedMetric.tags)
|
||||
}
|
||||
|
||||
func TestAddCoreCPUTemperatureMetric(t *testing.T) {
|
||||
var acc testutil.Accumulator
|
||||
cpuID := "0"
|
||||
|
|
@ -496,6 +534,27 @@ func getPowerGlobalMetric(name string, value interface{}, socketID string) struc
|
|||
return getPowerMetric(name, value, map[string]string{"package_id": socketID})
|
||||
}
|
||||
|
||||
// getPowerUncoreFreqMetric builds the expected fields and tags of an uncore
// frequency metric. The tags are package_id, die and type; the fields are the
// min and max limits, plus "uncore_frequency_mhz_cur" only when typeFreq is
// "current".
func getPowerUncoreFreqMetric(typeFreq string, limitMin interface{}, limitMax interface{}, current interface{}, socketID string, die string) struct {
	fields map[string]interface{}
	tags   map[string]string
} {
	metricFields := map[string]interface{}{
		"uncore_frequency_limit_mhz_min": limitMin,
		"uncore_frequency_limit_mhz_max": limitMax,
	}
	if typeFreq == "current" {
		metricFields["uncore_frequency_mhz_cur"] = current
	}

	return struct {
		fields map[string]interface{}
		tags   map[string]string
	}{
		fields: metricFields,
		tags: map[string]string{
			"package_id": socketID,
			"die":        die,
			"type":       typeFreq,
		},
	}
}
|
||||
|
||||
func getPowerMetric(name string, value interface{}, tags map[string]string) struct {
|
||||
fields map[string]interface{}
|
||||
tags map[string]string
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ const (
|
|||
systemCPUPath = "/sys/devices/system/cpu/"
|
||||
cpuCurrentFreqPartialPath = "/sys/devices/system/cpu/cpu%s/cpufreq/scaling_cur_freq"
|
||||
msrPartialPath = "/dev/cpu/%s/msr"
|
||||
uncoreFreqPath = "/sys/devices/system/cpu/intel_uncore_frequency/package_%s_die_%s/%s%s_freq_khz"
|
||||
c3StateResidencyLocation = 0x3FC
|
||||
c6StateResidencyLocation = 0x3FD
|
||||
c7StateResidencyLocation = 0x3FE
|
||||
|
|
@ -32,14 +33,17 @@ const (
|
|||
turboRatioLimit1Location = 0x1AE
|
||||
turboRatioLimit2Location = 0x1AF
|
||||
atomCoreTurboRatiosLocation = 0x66C
|
||||
uncorePerfStatusLocation = 0x621
|
||||
)
|
||||
|
||||
// msrService is responsible for interactions with MSR.
|
||||
type msrService interface {
|
||||
getCPUCoresData() map[string]*msrData
|
||||
retrieveCPUFrequencyForCore(core string) (float64, error)
|
||||
retrieveUncoreFrequency(socketID string, typeFreq string, kind string, die string) (float64, error)
|
||||
openAndReadMsr(core string) error
|
||||
readSingleMsr(core string, msr string) (uint64, error)
|
||||
isMsrLoaded() bool
|
||||
}
|
||||
|
||||
type msrServiceImpl struct {
|
||||
|
|
@ -53,6 +57,15 @@ func (m *msrServiceImpl) getCPUCoresData() map[string]*msrData {
|
|||
return m.cpuCoresData
|
||||
}
|
||||
|
||||
func (m *msrServiceImpl) isMsrLoaded() bool {
|
||||
for cpuID := range m.getCPUCoresData() {
|
||||
err := m.openAndReadMsr(cpuID)
|
||||
if err == nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
func (m *msrServiceImpl) retrieveCPUFrequencyForCore(core string) (float64, error) {
|
||||
cpuFreqPath := fmt.Sprintf(cpuCurrentFreqPartialPath, core)
|
||||
err := checkFile(cpuFreqPath)
|
||||
|
|
@ -69,6 +82,49 @@ func (m *msrServiceImpl) retrieveCPUFrequencyForCore(core string) (float64, erro
|
|||
return convertKiloHertzToMegaHertz(cpuFreq), err
|
||||
}
|
||||
|
||||
func (m *msrServiceImpl) retrieveUncoreFrequency(socketID string, typeFreq string, kind string, die string) (float64, error) {
|
||||
uncoreFreqPath, err := createUncoreFreqPath(socketID, typeFreq, kind, die)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("unable to create uncore freq read path for socketID %s, and frequency type %s err: %v", socketID, typeFreq, err)
|
||||
}
|
||||
err = checkFile(uncoreFreqPath)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
uncoreFreqFile, err := os.Open(uncoreFreqPath)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("error opening uncore frequncy file on %s, err: %v", uncoreFreqPath, err)
|
||||
}
|
||||
defer uncoreFreqFile.Close()
|
||||
|
||||
uncoreFreq, _, err := m.fs.readFileToFloat64(uncoreFreqFile)
|
||||
return convertKiloHertzToMegaHertz(uncoreFreq), err
|
||||
}
|
||||
|
||||
func createUncoreFreqPath(socketID string, typeFreq string, kind string, die string) (string, error) {
|
||||
if socketID >= "0" && socketID <= "9" {
|
||||
socketID = fmt.Sprintf("0%s", socketID)
|
||||
}
|
||||
if die >= "0" && die <= "9" {
|
||||
die = fmt.Sprintf("0%s", die)
|
||||
}
|
||||
var prefix string
|
||||
|
||||
switch typeFreq {
|
||||
case "initial":
|
||||
prefix = "initial_"
|
||||
case "current":
|
||||
prefix = ""
|
||||
default:
|
||||
return "", fmt.Errorf("unknown frequency type %s, only 'initial' and 'current' are supported", typeFreq)
|
||||
}
|
||||
|
||||
if kind != "min" && kind != "max" {
|
||||
return "", fmt.Errorf("unknown frequency type %s, only 'min' and 'max' are supported", kind)
|
||||
}
|
||||
return fmt.Sprintf(uncoreFreqPath, socketID, die, prefix, kind), nil
|
||||
}
|
||||
|
||||
func (m *msrServiceImpl) openAndReadMsr(core string) error {
|
||||
path := fmt.Sprintf(msrPartialPath, core)
|
||||
err := checkFile(path)
|
||||
|
|
@ -110,6 +166,8 @@ func (m *msrServiceImpl) readSingleMsr(core string, msr string) (uint64, error)
|
|||
msrAddress = turboRatioLimit2Location
|
||||
case "MSR_ATOM_CORE_TURBO_RATIOS":
|
||||
msrAddress = atomCoreTurboRatiosLocation
|
||||
case "MSR_UNCORE_PERF_STATUS":
|
||||
msrAddress = uncorePerfStatusLocation
|
||||
default:
|
||||
return 0, fmt.Errorf("incorect name of MSR %s", msr)
|
||||
}
|
||||
|
|
@ -250,6 +308,5 @@ func newMsrServiceWithFs(logger telegraf.Logger, fs fileService) *msrServiceImpl
|
|||
msrService.msrOffsets = []int64{c3StateResidencyLocation, c6StateResidencyLocation, c7StateResidencyLocation,
|
||||
maximumFrequencyClockCountLocation, actualFrequencyClockCountLocation, timestampCounterLocation,
|
||||
throttleTemperatureLocation, temperatureLocation}
|
||||
|
||||
return msrService
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
// Code generated by mockery v2.10.0. DO NOT EDIT.
|
||||
// Code generated by mockery v2.12.3. DO NOT EDIT.
|
||||
|
||||
package intel_powerstat
|
||||
|
||||
|
|
@ -9,6 +9,20 @@ type mockMsrService struct {
|
|||
mock.Mock
|
||||
}
|
||||
|
||||
// isMsrLoaded provides a mock function with given fields:
|
||||
func (_m *mockMsrService) isMsrLoaded() bool {
|
||||
ret := _m.Called()
|
||||
|
||||
var r0 bool
|
||||
if rf, ok := ret.Get(0).(func() bool); ok {
|
||||
r0 = rf()
|
||||
} else {
|
||||
r0 = ret.Get(0).(bool)
|
||||
}
|
||||
|
||||
return r0
|
||||
}
|
||||
|
||||
// getCPUCoresData provides a mock function with given fields:
|
||||
func (_m *mockMsrService) getCPUCoresData() map[string]*msrData {
|
||||
ret := _m.Called()
|
||||
|
|
@ -80,3 +94,39 @@ func (_m *mockMsrService) retrieveCPUFrequencyForCore(core string) (float64, err
|
|||
|
||||
return r0, r1
|
||||
}
|
||||
|
||||
// retrieveUncoreFrequency provides a mock function with given fields: socketID, typeFreq, kind, die
|
||||
func (_m *mockMsrService) retrieveUncoreFrequency(socketID string, typeFreq string, kind string, die string) (float64, error) {
|
||||
ret := _m.Called(socketID, typeFreq, kind, die)
|
||||
|
||||
var r0 float64
|
||||
if rf, ok := ret.Get(0).(func(string, string, string, string) float64); ok {
|
||||
r0 = rf(socketID, typeFreq, kind, die)
|
||||
} else {
|
||||
r0 = ret.Get(0).(float64)
|
||||
}
|
||||
|
||||
var r1 error
|
||||
if rf, ok := ret.Get(1).(func(string, string, string, string) error); ok {
|
||||
r1 = rf(socketID, typeFreq, kind, die)
|
||||
} else {
|
||||
r1 = ret.Error(1)
|
||||
}
|
||||
|
||||
return r0, r1
|
||||
}
|
||||
|
||||
type newmockMsrServiceT interface {
|
||||
mock.TestingT
|
||||
Cleanup(func())
|
||||
}
|
||||
|
||||
// newmockMsrService creates a new instance of mockMsrService. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
|
||||
func newmockMsrService(t newmockMsrServiceT) *mockMsrService {
|
||||
mock := &mockMsrService{}
|
||||
mock.Mock.Test(t)
|
||||
|
||||
t.Cleanup(func() { mock.AssertExpectations(t) })
|
||||
|
||||
return mock
|
||||
}
|
||||
|
|
|
|||
|
|
@ -93,6 +93,60 @@ func TestReadValueFromFileAtOffset(t *testing.T) {
|
|||
require.Equal(t, zero, <-testChannel)
|
||||
}
|
||||
|
||||
func TestCreateUncoreFreqPath(t *testing.T) {
|
||||
path, err := createUncoreFreqPath("0", "initial", "min", "0")
|
||||
expectedPath := "/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/initial_min_freq_khz"
|
||||
require.Equal(t, nil, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("0", "initial", "max", "0")
|
||||
expectedPath = "/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/initial_max_freq_khz"
|
||||
require.Equal(t, nil, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("0", "current", "min", "0")
|
||||
expectedPath = "/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/min_freq_khz"
|
||||
require.Equal(t, nil, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("0", "current", "max", "0")
|
||||
expectedPath = "/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/max_freq_khz"
|
||||
require.Equal(t, nil, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("9", "current", "max", "0")
|
||||
expectedPath = "/sys/devices/system/cpu/intel_uncore_frequency/package_09_die_00/max_freq_khz"
|
||||
require.Equal(t, nil, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("99", "current", "max", "0")
|
||||
expectedPath = "/sys/devices/system/cpu/intel_uncore_frequency/package_99_die_00/max_freq_khz"
|
||||
require.Equal(t, nil, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("0", "current", "max", "9")
|
||||
expectedPath = "/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_09/max_freq_khz"
|
||||
require.Equal(t, nil, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("0", "current", "max", "99")
|
||||
expectedPath = "/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_99/max_freq_khz"
|
||||
require.Equal(t, nil, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("0", "foo", "max", "0")
|
||||
expectedPath = ""
|
||||
expectedError := errors.New("unknown frequency type foo, only 'initial' and 'current' are supported")
|
||||
require.Equal(t, expectedError, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
|
||||
path, err = createUncoreFreqPath("0", "current", "bar", "0")
|
||||
expectedPath = ""
|
||||
expectedError = errors.New("unknown frequency type bar, only 'min' and 'max' are supported")
|
||||
require.Equal(t, expectedError, err)
|
||||
require.Equal(t, expectedPath, path)
|
||||
}
|
||||
|
||||
func prepareTestData(fsMock *mockFileService, cores []string, msr *msrServiceImpl, t *testing.T) {
|
||||
// Prepare MSR offsets and CPUCoresData for test.
|
||||
fsMock.On("getStringsMatchingPatternOnPath", mock.Anything).
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
// Code generated by mockery v2.10.0. DO NOT EDIT.
|
||||
// Code generated by mockery v2.12.3. DO NOT EDIT.
|
||||
|
||||
package intel_powerstat
|
||||
|
||||
|
|
@ -64,3 +64,18 @@ func (_m *mockRaplService) retrieveAndCalculateData(socketID string) error {
|
|||
|
||||
return r0
|
||||
}
|
||||
|
||||
type newmockRaplServiceT interface {
|
||||
mock.TestingT
|
||||
Cleanup(func())
|
||||
}
|
||||
|
||||
// newmockRaplService creates a new instance of mockRaplService. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
|
||||
func newmockRaplService(t newmockRaplServiceT) *mockRaplService {
|
||||
mock := &mockRaplService{}
|
||||
mock.Mock.Test(t)
|
||||
|
||||
t.Cleanup(func() { mock.AssertExpectations(t) })
|
||||
|
||||
return mock
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
## - Setting this value to an empty array means no package metrics will be collected
|
||||
## - Finally, a user can specify individual metrics to capture from the supported options list
|
||||
## Supported options:
|
||||
## "current_power_consumption", "current_dram_power_consumption", "thermal_design_power", "max_turbo_frequency"
|
||||
## "current_power_consumption", "current_dram_power_consumption", "thermal_design_power", "max_turbo_frequency", "uncore_frequency"
|
||||
# package_metrics = ["current_power_consumption", "current_dram_power_consumption", "thermal_design_power"]
|
||||
|
||||
## The user can choose which per-CPU metrics are monitored by the plugin in cpu_metrics array.
|
||||
|
|
|
|||
Loading…
Reference in New Issue