diff --git a/plugins/inputs/intel_powerstat/README.md b/plugins/inputs/intel_powerstat/README.md index 4b0b88ab7..0b1e5cb5d 100644 --- a/plugins/inputs/intel_powerstat/README.md +++ b/plugins/inputs/intel_powerstat/README.md @@ -11,18 +11,25 @@ to take preventive/corrective actions based on platform busyness, CPU temperatur ```toml # Intel PowerStat plugin enables monitoring of platform metrics (power, TDP) and per-CPU metrics like temperature, power and utilization. [[inputs.intel_powerstat]] - ## All global metrics are always collected by Intel PowerStat plugin. - ## User can choose which per-CPU metrics are monitored by the plugin in cpu_metrics array. - ## Empty array means no per-CPU specific metrics will be collected by the plugin - in this case only platform level - ## telemetry will be exposed by Intel PowerStat plugin. + ## The user can choose which package metrics are monitored by the plugin with the package_metrics setting: + ## - The default, will collect "current_power_consumption", "current_dram_power_consumption" and "thermal_design_power" + ## - Setting this value to an empty array means no package metrics will be collected + ## - Finally, a user can specify individual metrics to capture from the supported options list ## Supported options: - ## "cpu_frequency", "cpu_busy_frequency", "cpu_temperature", "cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles" + ## "current_power_consumption", "current_dram_power_consumption", "thermal_design_power", "max_turbo_frequency" + # package_metrics = ["current_power_consumption", "current_dram_power_consumption", "thermal_design_power"] + + ## The user can choose which per-CPU metrics are monitored by the plugin in cpu_metrics array. + ## Empty or missing array means no per-CPU specific metrics will be collected by the plugin. 
+ ## Supported options: + ## "cpu_frequency", "cpu_c0_state_residency", "cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles", "cpu_temperature", "cpu_busy_frequency" + ## ATTENTION: cpu_busy_cycles option is DEPRECATED - superseded by cpu_c0_state_residency # cpu_metrics = [] ``` ## Example: Configuration with no per-CPU telemetry -This configuration allows getting global metrics (processor package specific), no per-CPU metrics are collected: +This configuration allows getting default processor package specific metrics, no per-CPU metrics are collected: ```toml [[inputs.intel_powerstat]] @@ -31,28 +38,39 @@ This configuration allows getting global metrics (processor package specific), n ## Example: Configuration with no per-CPU telemetry - equivalent case -This configuration allows getting global metrics (processor package specific), no per-CPU metrics are collected: +This configuration allows getting default processor package specific metrics, no per-CPU metrics are collected: ```toml [[inputs.intel_powerstat]] ``` -## Example: Configuration for CPU Temperature and Frequency only +## Example: Configuration for CPU Temperature and CPU Frequency -This configuration allows getting global metrics plus subset of per-CPU metrics (CPU Temperature and Current Frequency): +This configuration allows getting default processor package specific metrics, plus subset of per-CPU metrics (CPU Temperature and CPU Frequency): ```toml [[inputs.intel_powerstat]] cpu_metrics = ["cpu_frequency", "cpu_temperature"] ``` -## Example: Configuration with all available metrics +## Example: Configuration for CPU Temperature and CPU Frequency without default package metrics -This configuration allows getting global metrics and all per-CPU metrics: +This configuration allows getting only a subset of per-CPU metrics (CPU Temperature and CPU Frequency): ```toml [[inputs.intel_powerstat]] - cpu_metrics = ["cpu_frequency", "cpu_busy_frequency", "cpu_temperature", 
"cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles"] + package_metrics = [] + cpu_metrics = ["cpu_frequency", "cpu_temperature"] +``` + +## Example: Configuration with all available metrics + +This configuration allows getting all processor package specific metrics and all per-CPU metrics: + +```toml +[[inputs.intel_powerstat]] + package_metrics = ["current_power_consumption", "current_dram_power_consumption", "thermal_design_power", "max_turbo_frequency"] + cpu_metrics = ["cpu_frequency", "cpu_busy_frequency", "cpu_temperature", "cpu_c0_state_residency", "cpu_c1_state_residency", "cpu_c6_state_residency"] ``` ## SW Dependencies @@ -66,11 +84,17 @@ The following dependencies are expected by plugin: Minimum kernel version required is 3.13 to satisfy all requirements. -Please make sure that kernel modules are loaded and running. You might have to manually enable them by using `modprobe`. -Exact commands to be executed are: +Please make sure that kernel modules are loaded and running (cpufreq is integrated in kernel). Modules might have to be manually enabled by using `modprobe`. +Depending on the kernel version, run commands: ```sh -sudo modprobe cpufreq-stats +# kernel 5.x.x: +sudo modprobe rapl +sudo modprobe msr +sudo modprobe intel_rapl_common +sudo modprobe intel_rapl_msr + +# kernel 4.x.x: sudo modprobe msr sudo modprobe intel_rapl ``` @@ -80,9 +104,13 @@ to retrieve data for calculation of most critical per-CPU specific metrics: - `cpu_busy_frequency_mhz` - `cpu_temperature_celsius` +- `cpu_c0_state_residency_percent` - `cpu_c1_state_residency_percent` - `cpu_c6_state_residency_percent` -- `cpu_busy_cycles_percent` + +and to retrieve data for calculation per-package specific metric: + +- `max_turbo_frequency_mhz` To expose other Intel PowerStat metrics root access may or may not be required (depending on OS type or configuration).
@@ -99,13 +127,13 @@ The following processor properties are required by the plugin: model specific registers for all features - The following processor flags shall be present: - "_msr_" shall be present for plugin to read platform data from processor model specific registers and collect - the following metrics: _powerstat_core.cpu_temperature_, _powerstat_core.cpu_busy_frequency_, - _powerstat_core.cpu_busy_cycles_, _powerstat_core.cpu_c1_state_residency_, _powerstat_core._cpu_c6_state_residency_ - - "_aperfmperf_" shall be present to collect the following metrics: _powerstat_core.cpu_busy_frequency_, - _powerstat_core.cpu_busy_cycles_, _powerstat_core.cpu_c1_state_residency_ - - "_dts_" shall be present to collect _powerstat_core.cpu_temperature_ -- Processor _Model number_ must be one of the following values for plugin to read _powerstat_core.cpu_c1_state_residency_ -and _powerstat_core.cpu_c6_state_residency_ metrics: + the following metrics: _powerstat\_core.cpu\_temperature_, _powerstat\_core.cpu\_busy\_frequency_, + _powerstat\_core.cpu\_c0\_state\_residency_, _powerstat\_core.cpu\_c1\_state\_residency_, _powerstat\_core.cpu\_c6\_state\_residency_ + - "_aperfmperf_" shall be present to collect the following metrics: _powerstat\_core.cpu\_busy\_frequency_, + _powerstat\_core.cpu\_c0\_state\_residency_, _powerstat\_core.cpu\_c1\_state\_residency_ + - "_dts_" shall be present to collect _powerstat\_core.cpu\_temperature_ +- Processor _Model number_ must be one of the following values for plugin to read _powerstat\_core.cpu\_c1\_state\_residency_ +and _powerstat\_core.cpu\_c6\_state\_residency_ metrics: | Model number | Processor name | |-----|-------------| @@ -168,50 +196,59 @@ When starting to measure metrics, plugin skips first iteration of metrics if the - The following Tags are returned by plugin with powerstat_core measurements: - ```text - | Tag | Description | - |-----|-------------| - | `package_id` | ID of platform package/socket | - | `core_id` | ID 
of physical processor core | - | `cpu_id` | ID of logical processor core | + | Tag | Description | + |--------------|-------------------------------| + | `package_id` | ID of platform package/socket | + | `core_id` | ID of physical processor core | + | `cpu_id` | ID of logical processor core | + Measurement powerstat_core metrics are collected per-CPU (cpu_id is the key) while core_id and package_id tags are additional topology information. - ``` - Available metrics for powerstat_core measurement - ```text - | Metric name (field) | Description | Units | - |-----|-------------|-----| - | `cpu_frequency_mhz` | Current operational frequency of CPU Core | MHz | - | `cpu_busy_frequency_mhz` | CPU Core Busy Frequency measured as frequency adjusted to CPU Core busy cycles | MHz | - | `cpu_temperature_celsius` | Current temperature of CPU Core | Celsius degrees | - | `cpu_c1_state_residency_percent` | Percentage of time that CPU Core spent in C1 Core residency state | % | - | `cpu_c6_state_residency_percent` | Percentage of time that CPU Core spent in C6 Core residency state | % | - | `cpu_busy_cycles_percent` | CPU Core Busy cycles as a ratio of Cycles spent in C0 state residency to all cycles executed by CPU Core | % | - ``` + | Metric name (field) | Description | Units | + |---------------------|-------------|-------| + | `cpu_frequency_mhz` | Current operational frequency of CPU Core | MHz | + | `cpu_busy_frequency_mhz` | CPU Core Busy Frequency measured as frequency adjusted to CPU Core busy cycles | MHz | + | `cpu_temperature_celsius` | Current temperature of CPU Core | Celsius degrees | + | `cpu_c0_state_residency_percent` | Percentage of time that CPU Core spent in C0 Core residency state | % | + | `cpu_c1_state_residency_percent` | Percentage of time that CPU Core spent in C1 Core residency state | % | + | `cpu_c6_state_residency_percent` | Percentage of time that CPU Core spent in C6 Core residency state | % | + | `cpu_busy_cycles_percent` | (**DEPRECATED** - 
superseded by cpu_c0_state_residency_percent) CPU Core Busy cycles as a ratio of Cycles spent in C0 state residency to all cycles executed by CPU Core | % | - powerstat_package - The following Tags are returned by plugin with powerstat_package measurements: - ```text - | Tag | Description | - |-----|-------------| - | `package_id` | ID of platform package/socket | - Measurement powerstat_package metrics are collected per processor package -_package_id_ tag indicates which - package metric refers to. - ``` + | Tag | Description | + |-----|-------------| + | `package_id` | ID of platform package/socket | + | `active_cores`| Specific tag for `max_turbo_frequency_mhz` metric. The maximum number of activated cores for reachable turbo frequency + + Measurement powerstat_package metrics are collected per processor package -_package_id_ tag indicates which package metric refers to. - Available metrics for powerstat_package measurement - ```text - | Metric name (field) | Description | Units | - |-----|-------------|-----| - | `thermal_design_power_watts` | Maximum Thermal Design Power (TDP) available for processor package | Watts | - | `current_power_consumption_watts` | Current power consumption of processor package | Watts | - | `current_dram_power_consumption_watts` | Current power consumption of processor package DRAM subsystem | Watts | - ``` + | Metric name (field) | Description | Units | + |-----|-------------|-----| + | `thermal_design_power_watts` | Maximum Thermal Design Power (TDP) available for processor package | Watts | + | `current_power_consumption_watts` | Current power consumption of processor package | Watts | + | `current_dram_power_consumption_watts` | Current power consumption of processor package DRAM subsystem | Watts | + | `max_turbo_frequency_mhz`| Maximum reachable turbo frequency for number of cores active | MHz + +### Known issues + +From linux kernel version v5.4.77 with [this kernel 
change](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=v5.4.77&id=19f6d91bdad42200aac557a683c17b1f65ee6c94) +resources like `/sys/class/powercap/intel-rapl*/*/energy_uj` are readable only by root for security reasons, so this plugin needs root privileges to work properly. + +If such strict security restrictions are not relevant, reading permissions to files in `/sys/devices/virtual/powercap/intel-rapl/` +directory can be manually changed for example with `chmod` command with custom parameters. +For example to give all users permission to all files in `intel-rapl` directory: + +```bash +sudo chmod -R a+rx /sys/devices/virtual/powercap/intel-rapl/ +``` ### Example Output @@ -219,10 +256,12 @@ When starting to measure metrics, plugin skips first iteration of metrics if the powerstat_package,host=ubuntu,package_id=0 thermal_design_power_watts=160 1606494744000000000 powerstat_package,host=ubuntu,package_id=0 current_power_consumption_watts=35 1606494744000000000 powerstat_package,host=ubuntu,package_id=0 current_dram_power_consumption_watts=13.94 1606494744000000000 +powerstat_package,host=ubuntu,package_id=0,active_cores=0 max_turbo_frequency_mhz=3000i 1606494744000000000 +powerstat_package,host=ubuntu,package_id=0,active_cores=1 max_turbo_frequency_mhz=2800i 1606494744000000000 powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_frequency_mhz=1200.29 1606494744000000000 powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_temperature_celsius=34i 1606494744000000000 powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_c6_state_residency_percent=92.52 1606494744000000000 -powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_busy_cycles_percent=0.8 1606494744000000000 powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_c1_state_residency_percent=6.68 1606494744000000000 +powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_c0_state_residency_percent=0.8 1606494744000000000 
powerstat_core,core_id=0,cpu_id=0,host=ubuntu,package_id=0 cpu_busy_frequency_mhz=1213.24 1606494744000000000 ``` diff --git a/plugins/inputs/intel_powerstat/dto.go b/plugins/inputs/intel_powerstat/dto.go index eb3da0bc2..71fc10f50 100644 --- a/plugins/inputs/intel_powerstat/dto.go +++ b/plugins/inputs/intel_powerstat/dto.go @@ -7,8 +7,8 @@ type msrData struct { c3 uint64 c6 uint64 c7 uint64 - throttleTemp uint64 - temp uint64 + throttleTemp int64 + temp int64 mperfDelta uint64 aperfDelta uint64 timeStampCounterDelta uint64 diff --git a/plugins/inputs/intel_powerstat/file.go b/plugins/inputs/intel_powerstat/file.go index c69dea89f..349228bd8 100644 --- a/plugins/inputs/intel_powerstat/file.go +++ b/plugins/inputs/intel_powerstat/file.go @@ -152,3 +152,22 @@ func (fs *fileServiceImpl) readFileAtOffsetToUint64(reader io.ReaderAt, offset i func newFileService() *fileServiceImpl { return &fileServiceImpl{} } + +func checkFile(path string) error { + if path == "" { + return fmt.Errorf("empty path given") + } + + lInfo, err := os.Lstat(path) + if err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("file `%s` doesn't exist", path) + } + return fmt.Errorf("cannot obtain file info of `%s`: %v", path, err) + } + mode := lInfo.Mode() + if mode&os.ModeSymlink != 0 { + return fmt.Errorf("file `%s` is a symlink", path) + } + return nil +} diff --git a/plugins/inputs/intel_powerstat/file_mock_test.go b/plugins/inputs/intel_powerstat/file_mock_test.go index ab4bd8c57..ccf67b601 100644 --- a/plugins/inputs/intel_powerstat/file_mock_test.go +++ b/plugins/inputs/intel_powerstat/file_mock_test.go @@ -1,4 +1,4 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.10.0. DO NOT EDIT. 
package intel_powerstat @@ -8,7 +8,7 @@ import ( mock "github.com/stretchr/testify/mock" ) -// mockFileService is an autogenerated mock type for the fileService type +// mockFileService is an autogenerated mock type for the mockFileService type type mockFileService struct { mock.Mock } diff --git a/plugins/inputs/intel_powerstat/intel_powerstat.go b/plugins/inputs/intel_powerstat/intel_powerstat.go index 983909e8c..78336bbdf 100644 --- a/plugins/inputs/intel_powerstat/intel_powerstat.go +++ b/plugins/inputs/intel_powerstat/intel_powerstat.go @@ -6,6 +6,7 @@ package intel_powerstat import ( "fmt" "math/big" + "strconv" "strings" "sync" "time" @@ -15,60 +16,81 @@ import ( ) const ( - cpuFrequency = "cpu_frequency" - cpuBusyFrequency = "cpu_busy_frequency" - cpuTemperature = "cpu_temperature" - cpuC1StateResidency = "cpu_c1_state_residency" - cpuC6StateResidency = "cpu_c6_state_residency" - cpuBusyCycles = "cpu_busy_cycles" - percentageMultiplier = 100 + cpuFrequency = "cpu_frequency" + cpuBusyFrequency = "cpu_busy_frequency" + cpuTemperature = "cpu_temperature" + cpuC0StateResidency = "cpu_c0_state_residency" + cpuC1StateResidency = "cpu_c1_state_residency" + cpuC6StateResidency = "cpu_c6_state_residency" + cpuBusyCycles = "cpu_busy_cycles" + packageCurrentPowerConsumption = "current_power_consumption" + packageCurrentDramPowerConsumption = "current_dram_power_consumption" + packageThermalDesignPower = "thermal_design_power" + packageTurboLimit = "max_turbo_frequency" + percentageMultiplier = 100 ) // PowerStat plugin enables monitoring of platform metrics (power, TDP) and Core metrics like temperature, power and utilization. 
type PowerStat struct { - CPUMetrics []string `toml:"cpu_metrics"` - Log telegraf.Logger `toml:"-"` + CPUMetrics []string `toml:"cpu_metrics"` + PackageMetrics []string `toml:"package_metrics"` + Log telegraf.Logger `toml:"-"` fs fileService rapl raplService msr msrService - cpuFrequency bool - cpuBusyFrequency bool - cpuTemperature bool - cpuC1StateResidency bool - cpuC6StateResidency bool - cpuBusyCycles bool - cpuInfo map[string]*cpuInfo - skipFirstIteration bool + cpuFrequency bool + cpuBusyFrequency bool + cpuTemperature bool + cpuC0StateResidency bool + cpuC1StateResidency bool + cpuC6StateResidency bool + cpuBusyCycles bool + packageTurboLimit bool + packageCurrentPowerConsumption bool + packageCurrentDramPowerConsumption bool + packageThermalDesignPower bool + cpuInfo map[string]*cpuInfo + skipFirstIteration bool + logOnce map[string]error } -// Init performs one time setup of the plugin. +// Init performs one time setup of the plugin func (p *PowerStat) Init() error { + p.parsePackageMetricsConfig() p.parseCPUMetricsConfig() err := p.verifyProcessor() if err != nil { return err } - // Initialize MSR service only when there is at least one core metric enabled. - if p.cpuFrequency || p.cpuBusyFrequency || p.cpuTemperature || p.cpuC1StateResidency || - p.cpuC6StateResidency || p.cpuBusyCycles { + // Initialize MSR service only when there is at least one metric enabled + if p.cpuFrequency || p.cpuBusyFrequency || p.cpuTemperature || p.cpuC0StateResidency || p.cpuC1StateResidency || + p.cpuC6StateResidency || p.cpuBusyCycles || p.packageTurboLimit { p.msr = newMsrServiceWithFs(p.Log, p.fs) } - p.rapl = newRaplServiceWithFs(p.Log, p.fs) + if p.packageCurrentPowerConsumption || p.packageCurrentDramPowerConsumption || p.packageThermalDesignPower || p.packageTurboLimit { + p.rapl = newRaplServiceWithFs(p.Log, p.fs) + } + + if !p.areCoreMetricsEnabled() && !p.areGlobalMetricsEnabled() { + return fmt.Errorf("all configuration options are empty or invalid. 
Did not find anything to gather") + } return nil } -// Gather takes in an accumulator and adds the metrics that the Input gathers. +// Gather takes in an accumulator and adds the metrics that the Input gathers func (p *PowerStat) Gather(acc telegraf.Accumulator) error { - p.addGlobalMetrics(acc) + if p.areGlobalMetricsEnabled() { + p.addGlobalMetrics(acc) + } if p.areCoreMetricsEnabled() { p.addPerCoreMetrics(acc) } - // Gathering the first iteration of metrics was skipped for most of them because they are based on delta calculations. + // Gathering the first iteration of metrics was skipped for most of them because they are based on delta calculations p.skipFirstIteration = false return nil @@ -79,18 +101,36 @@ func (p *PowerStat) addGlobalMetrics(acc telegraf.Accumulator) { p.rapl.initializeRaplData() for socketID := range p.rapl.getRaplData() { + if p.packageTurboLimit { + p.addTurboRatioLimit(socketID, acc) + } + err := p.rapl.retrieveAndCalculateData(socketID) if err != nil { // In case of an error skip calculating metrics for this socket - p.Log.Errorf("error fetching rapl data for socket %s, err: %v", socketID, err) + if val := p.logOnce[socketID]; val == nil || val.Error() != err.Error() { + p.Log.Errorf("error fetching rapl data for socket %s, err: %v", socketID, err) + // Remember that specific error occurs for socketID to omit logging next time + p.logOnce[socketID] = err + } continue } - p.addThermalDesignPowerMetric(socketID, acc) + + // If error stops occurring, clear logOnce indicator + p.logOnce[socketID] = nil + if p.packageThermalDesignPower { + p.addThermalDesignPowerMetric(socketID, acc) + } + if p.skipFirstIteration { continue } - p.addCurrentSocketPowerConsumption(socketID, acc) - p.addCurrentDramPowerConsumption(socketID, acc) + if p.packageCurrentPowerConsumption { + p.addCurrentSocketPowerConsumption(socketID, acc) + } + if p.packageCurrentDramPowerConsumption { + p.addCurrentDramPowerConsumption(socketID, acc) + } } } @@ -155,11 +195,10 @@ 
func (p *PowerStat) addMetricsForSingleCore(cpuID string, acc telegraf.Accumulat } // Read data from MSR only if required - if p.cpuC1StateResidency || p.cpuC6StateResidency || p.cpuBusyCycles || p.cpuTemperature || - p.cpuBusyFrequency { + if p.cpuC0StateResidency || p.cpuC1StateResidency || p.cpuC6StateResidency || p.cpuBusyCycles || p.cpuTemperature || p.cpuBusyFrequency { err := p.msr.openAndReadMsr(cpuID) if err != nil { - // In case of an error exit the function. All metrics past this point are dependant on MSR. + // In case of an error exit the function. All metrics past this point are dependent on MSR p.Log.Debugf("error while reading msr: %v", err) return } @@ -169,12 +208,16 @@ func (p *PowerStat) addMetricsForSingleCore(cpuID string, acc telegraf.Accumulat p.addCPUTemperatureMetric(cpuID, acc) } - // cpuBusyFrequency metric does some calculations inside that are required in another plugin cycle. + // cpuBusyFrequency metric does some calculations inside that are required in another plugin cycle if p.cpuBusyFrequency { p.addCPUBusyFrequencyMetric(cpuID, acc) } if !p.skipFirstIteration { + if p.cpuC0StateResidency || p.cpuBusyCycles { + p.addCPUC0StateResidencyMetric(cpuID, acc) + } + if p.cpuC1StateResidency { p.addCPUC1StateResidencyMetric(cpuID, acc) } @@ -182,10 +225,6 @@ func (p *PowerStat) addMetricsForSingleCore(cpuID string, acc telegraf.Accumulat if p.cpuC6StateResidency { p.addCPUC6StateResidencyMetric(cpuID, acc) } - - if p.cpuBusyCycles { - p.addCPUBusyCyclesMetric(cpuID, acc) - } } } @@ -229,6 +268,153 @@ func (p *PowerStat) addCPUTemperatureMetric(cpuID string, acc telegraf.Accumulat acc.AddGauge("powerstat_core", fields, tags) } +func calculateTurboRatioGroup(coreCounts uint64, msr uint64, group map[int]uint64) { + from := coreCounts & 0xFF // value of number of active cores of bucket 1 is written in the first 8 bits. 
The next buckets values are saved on the following 8-bit sides + for i := 0; i < 8; i++ { + to := (coreCounts >> (i * 8)) & 0xFF + if to == 0 { + break + } + value := (msr >> (i * 8)) & 0xFF + // value of freq ratio is stored in 8-bit blocks, and their real value is obtained after multiplication by 100 + if value != 0 && to != 0 { + for ; from <= to; from++ { + group[int(from)] = value * 100 + } + } + from = to + 1 + } +} + +func (p *PowerStat) addTurboRatioLimit(socketID string, acc telegraf.Accumulator) { + var err error + turboRatioLimitGroups := make(map[int]uint64) + + var cpuID = "" + var model = "" + for _, v := range p.cpuInfo { + if v.physicalID == socketID { + cpuID = v.cpuID + model = v.model + } + } + if cpuID == "" || model == "" { + p.Log.Debugf("error while reading socket ID") + return + } + // dump_hsw_turbo_ratio_limit + if model == strconv.FormatInt(0x3F, 10) { // INTEL_FAM6_HASWELL_X + coreCounts := uint64(0x1211) // counting the number of active cores 17 and 18 + msrTurboRatioLimit2, err := p.msr.readSingleMsr(cpuID, "MSR_TURBO_RATIO_LIMIT2") + if err != nil { + p.Log.Debugf("error while reading MSR_TURBO_RATIO_LIMIT2: %v", err) + return + } + + calculateTurboRatioGroup(coreCounts, msrTurboRatioLimit2, turboRatioLimitGroups) + } + + // dump_ivt_turbo_ratio_limit + if (model == strconv.FormatInt(0x3E, 10)) || // INTEL_FAM6_IVYBRIDGE_X + (model == strconv.FormatInt(0x3F, 10)) { // INTEL_FAM6_HASWELL_X + coreCounts := uint64(0x100F0E0D0C0B0A09) // counting the number of active cores 9 to 16 + msrTurboRatioLimit1, err := p.msr.readSingleMsr(cpuID, "MSR_TURBO_RATIO_LIMIT1") + if err != nil { + p.Log.Debugf("error while reading MSR_TURBO_RATIO_LIMIT1: %v", err) + return + } + calculateTurboRatioGroup(coreCounts, msrTurboRatioLimit1, turboRatioLimitGroups) + } + + if (model != strconv.FormatInt(0x37, 10)) && // INTEL_FAM6_ATOM_SILVERMONT + (model != strconv.FormatInt(0x4A, 10)) && // INTEL_FAM6_ATOM_SILVERMONT_MID: + (model != strconv.FormatInt(0x5A, 
10)) && // INTEL_FAM6_ATOM_AIRMONT_MID: + (model != strconv.FormatInt(0x2E, 10)) && // INTEL_FAM6_NEHALEM_EX + (model != strconv.FormatInt(0x2F, 10)) && // INTEL_FAM6_WESTMERE_EX + (model != strconv.FormatInt(0x57, 10)) && // INTEL_FAM6_XEON_PHI_KNL + (model != strconv.FormatInt(0x85, 10)) { // INTEL_FAM6_XEON_PHI_KNM + coreCounts := uint64(0x0807060504030201) // default value (counting the number of active cores 1 to 8). May be changed in "if" segment below + if (model == strconv.FormatInt(0x5C, 10)) || // INTEL_FAM6_ATOM_GOLDMONT + (model == strconv.FormatInt(0x55, 10)) || // INTEL_FAM6_SKYLAKE_X + (model == strconv.FormatInt(0x6C, 10) || model == strconv.FormatInt(0x8F, 10) || model == strconv.FormatInt(0x6A, 10)) || // INTEL_FAM6_ICELAKE_X + (model == strconv.FormatInt(0x5F, 10)) || // INTEL_FAM6_ATOM_GOLDMONT_D + (model == strconv.FormatInt(0x86, 10)) { // INTEL_FAM6_ATOM_TREMONT_D + coreCounts, err = p.msr.readSingleMsr(cpuID, "MSR_TURBO_RATIO_LIMIT1") + + if err != nil { + p.Log.Debugf("error while reading MSR_TURBO_RATIO_LIMIT1: %v", err) + return + } + } + + msrTurboRatioLimit, err := p.msr.readSingleMsr(cpuID, "MSR_TURBO_RATIO_LIMIT") + if err != nil { + p.Log.Debugf("error while reading MSR_TURBO_RATIO_LIMIT: %v", err) + return + } + calculateTurboRatioGroup(coreCounts, msrTurboRatioLimit, turboRatioLimitGroups) + } + // dump_atom_turbo_ratio_limits + if model == strconv.FormatInt(0x37, 10) || // INTEL_FAM6_ATOM_SILVERMONT + model == strconv.FormatInt(0x4A, 10) || // INTEL_FAM6_ATOM_SILVERMONT_MID: + model == strconv.FormatInt(0x5A, 10) { // INTEL_FAM6_ATOM_AIRMONT_MID + coreCounts := uint64(0x04030201) // counting the number of active cores 1 to 4 + msrTurboRatioLimit, err := p.msr.readSingleMsr(cpuID, "MSR_ATOM_CORE_TURBO_RATIOS") + + if err != nil { + p.Log.Debugf("error while reading MSR_ATOM_CORE_TURBO_RATIOS: %v", err) + return + } + value := uint64(0) + newValue := uint64(0) + + for i := 0; i < 4; i++ { // value "4" is specific for this group of 
processors + newValue = (msrTurboRatioLimit >> (8 * (i))) & 0x3F // value of freq ratio is stored in 6-bit blocks, saved every 8 bits + value = value + (newValue << (i * 8)) // now value of freq ratio is stored in 8-bit blocks, saved every 8 bits + } + + calculateTurboRatioGroup(coreCounts, value, turboRatioLimitGroups) + } + // dump_knl_turbo_ratio_limits + if model == strconv.FormatInt(0x57, 10) { // INTEL_FAM6_XEON_PHI_KNL + msrTurboRatioLimit, err := p.msr.readSingleMsr(cpuID, "MSR_TURBO_RATIO_LIMIT") + if err != nil { + p.Log.Debugf("error while reading MSR_TURBO_RATIO_LIMIT: %v", err) + return + } + + // value of freq ratio of bucket 1 is saved in bits 15 to 8. + // each next value is calculated as the previous value - delta. Delta is stored in 3-bit blocks every 8 bits (start at 21 (2*8+5)) + value := (msrTurboRatioLimit >> 8) & 0xFF + newValue := value + for i := 2; i < 8; i++ { + newValue = newValue - (msrTurboRatioLimit>>(8*i+5))&0x7 + value = value + (newValue << ((i - 1) * 8)) + } + + // value of number of active cores of bucket 1 is saved in bits 1 to 7. + // each next value is calculated as the previous value + delta. 
Delta is stored in 5-bit blocks every 8 bits (start at 16 (2*8)) + coreCounts := (msrTurboRatioLimit & 0xFF) >> 1 + newBucket := coreCounts + for i := 2; i < 8; i++ { + newBucket = newBucket + (msrTurboRatioLimit>>(8*i))&0x1F + coreCounts = coreCounts + (newBucket << ((i - 1) * 8)) + } + calculateTurboRatioGroup(coreCounts, value, turboRatioLimitGroups) + } + + for key, val := range turboRatioLimitGroups { + tags := map[string]string{ + "package_id": socketID, + "active_cores": strconv.Itoa(key), + } + fields := map[string]interface{}{ + "max_turbo_frequency_mhz": val, + } + acc.AddGauge("powerstat_package", fields, tags) + } +} + func (p *PowerStat) addCPUBusyFrequencyMetric(cpuID string, acc telegraf.Accumulator) { coresData := p.msr.getCPUCoresData() mperfDelta := coresData[cpuID].mperfDelta @@ -331,7 +517,7 @@ func (p *PowerStat) addCPUC6StateResidencyMetric(cpuID string, acc telegraf.Accu acc.AddGauge("powerstat_core", fields, tags) } -func (p *PowerStat) addCPUBusyCyclesMetric(cpuID string, acc telegraf.Accumulator) { +func (p *PowerStat) addCPUC0StateResidencyMetric(cpuID string, acc telegraf.Accumulator) { coresData := p.msr.getCPUCoresData() // Avoid division by 0 if coresData[cpuID].timeStampCounterDelta == 0 { @@ -339,7 +525,7 @@ func (p *PowerStat) addCPUBusyCyclesMetric(cpuID string, acc telegraf.Accumulato timestampCounterLocation, cpuID) return } - busyCyclesValue := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * + c0Value := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * float64(coresData[cpuID].mperfDelta) / float64(coresData[cpuID].timeStampCounterDelta)) cpu := p.cpuInfo[cpuID] tags := map[string]string{ @@ -347,11 +533,42 @@ func (p *PowerStat) addCPUBusyCyclesMetric(cpuID string, acc telegraf.Accumulato "core_id": cpu.coreID, "cpu_id": cpu.cpuID, } - fields := map[string]interface{}{ - "cpu_busy_cycles_percent": busyCyclesValue, + if p.cpuC0StateResidency { + fields := map[string]interface{}{ + 
"cpu_c0_state_residency_percent": c0Value, + } + acc.AddGauge("powerstat_core", fields, tags) + } + if p.cpuBusyCycles { + deprecatedFields := map[string]interface{}{ + "cpu_busy_cycles_percent": c0Value, + } + acc.AddGauge("powerstat_core", deprecatedFields, tags) + } +} + +func (p *PowerStat) parsePackageMetricsConfig() { + if p.PackageMetrics == nil { + // if Package Metric config is empty, use the default settings. + p.packageCurrentPowerConsumption = true + p.packageCurrentDramPowerConsumption = true + p.packageThermalDesignPower = true + return } - acc.AddGauge("powerstat_core", fields, tags) + if contains(p.PackageMetrics, packageTurboLimit) { + p.packageTurboLimit = true + } + if contains(p.PackageMetrics, packageCurrentPowerConsumption) { + p.packageCurrentPowerConsumption = true + } + + if contains(p.PackageMetrics, packageCurrentDramPowerConsumption) { + p.packageCurrentDramPowerConsumption = true + } + if contains(p.PackageMetrics, packageThermalDesignPower) { + p.packageThermalDesignPower = true + } } func (p *PowerStat) parseCPUMetricsConfig() { @@ -363,6 +580,10 @@ func (p *PowerStat) parseCPUMetricsConfig() { p.cpuFrequency = true } + if contains(p.CPUMetrics, cpuC0StateResidency) { + p.cpuC0StateResidency = true + } + if contains(p.CPUMetrics, cpuC1StateResidency) { p.cpuC1StateResidency = true } @@ -396,7 +617,7 @@ func (p *PowerStat) verifyProcessor() error { p.cpuInfo = stats - // First CPU is sufficient for verification. 
+ // First CPU is sufficient for verification firstCPU := p.cpuInfo["0"] if firstCPU == nil { return fmt.Errorf("first core not found while parsing /proc/cpuinfo") @@ -414,14 +635,16 @@ func (p *PowerStat) verifyProcessor() error { if !strings.Contains(firstCPU.flags, "msr") { p.cpuTemperature = false p.cpuC6StateResidency = false + p.cpuC0StateResidency = false p.cpuBusyCycles = false p.cpuBusyFrequency = false p.cpuC1StateResidency = false } if !strings.Contains(firstCPU.flags, "aperfmperf") { - p.cpuBusyFrequency = false p.cpuBusyCycles = false + p.cpuBusyFrequency = false + p.cpuC0StateResidency = false p.cpuC1StateResidency = false } @@ -438,7 +661,6 @@ func contains(slice []string, str string) bool { return true } } - return false } @@ -446,17 +668,27 @@ func (p *PowerStat) areCoreMetricsEnabled() bool { return p.msr != nil && len(p.msr.getCPUCoresData()) > 0 } -// newPowerStat creates and returns PowerStat struct. +func (p *PowerStat) areGlobalMetricsEnabled() bool { + return p.rapl != nil +} + +// newPowerStat creates and returns PowerStat struct func newPowerStat(fs fileService) *PowerStat { p := &PowerStat{ - cpuFrequency: false, - cpuC1StateResidency: false, - cpuC6StateResidency: false, - cpuBusyCycles: false, - cpuTemperature: false, - cpuBusyFrequency: false, - skipFirstIteration: true, - fs: fs, + cpuFrequency: false, + cpuC0StateResidency: false, + cpuC1StateResidency: false, + cpuC6StateResidency: false, + cpuBusyCycles: false, + cpuTemperature: false, + cpuBusyFrequency: false, + packageTurboLimit: false, + packageCurrentPowerConsumption: false, + packageCurrentDramPowerConsumption: false, + packageThermalDesignPower: false, + skipFirstIteration: true, + fs: fs, + logOnce: make(map[string]error), } return p diff --git a/plugins/inputs/intel_powerstat/intel_powerstat_test.go b/plugins/inputs/intel_powerstat/intel_powerstat_test.go index ce01e7799..5dd24087a 100644 --- a/plugins/inputs/intel_powerstat/intel_powerstat_test.go +++ 
b/plugins/inputs/intel_powerstat/intel_powerstat_test.go @@ -15,26 +15,32 @@ import ( "github.com/influxdata/telegraf/testutil" ) +type MockServices struct { + fs *mockFileService + msr *mockMsrService + rapl *mockRaplService +} + func TestInitPlugin(t *testing.T) { cores := []string{"cpu0", "cpu1", "cpu2", "cpu3"} - power, fsMock, _, _ := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() - fsMock.On("getCPUInfoStats", mock.Anything). + mockServices.fs.On("getCPUInfoStats", mock.Anything). Return(nil, errors.New("error getting cpu stats")).Once() require.Error(t, power.Init()) - fsMock.On("getCPUInfoStats", mock.Anything). + mockServices.fs.On("getCPUInfoStats", mock.Anything). Return(make(map[string]*cpuInfo), nil).Once() require.Error(t, power.Init()) - fsMock.On("getCPUInfoStats", mock.Anything). + mockServices.fs.On("getCPUInfoStats", mock.Anything). Return(map[string]*cpuInfo{"0": { vendorID: "GenuineIntel", cpuFamily: "test", }}, nil).Once() require.Error(t, power.Init()) - fsMock.On("getStringsMatchingPatternOnPath", mock.Anything). + mockServices.fs.On("getStringsMatchingPatternOnPath", mock.Anything). Return(cores, nil).Once(). On("getCPUInfoStats", mock.Anything). Return(map[string]*cpuInfo{"0": { @@ -44,24 +50,24 @@ func TestInitPlugin(t *testing.T) { // Verify MSR service initialization. power.cpuFrequency = true require.NoError(t, power.Init()) - fsMock.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) + mockServices.fs.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) require.Equal(t, len(cores), len(power.msr.getCPUCoresData())) - fsMock.On("getStringsMatchingPatternOnPath", mock.Anything). + mockServices.fs.On("getStringsMatchingPatternOnPath", mock.Anything). Return(nil, errors.New("error during getStringsMatchingPatternOnPath")).Once() // In case of an error when fetching cpu cores plugin should proceed with execution. 
require.NoError(t, power.Init()) - fsMock.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) + mockServices.fs.AssertCalled(t, "getStringsMatchingPatternOnPath", mock.Anything) require.Equal(t, 0, len(power.msr.getCPUCoresData())) } func TestParseCPUMetricsConfig(t *testing.T) { - power, _, _, _ := getPowerWithMockedServices() + power, _ := getPowerWithMockedServices() disableCoreMetrics(power) power.CPUMetrics = []string{ - "cpu_frequency", "cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles", "cpu_temperature", + "cpu_frequency", "cpu_c0_state_residency", "cpu_c1_state_residency", "cpu_c6_state_residency", "cpu_busy_cycles", "cpu_temperature", "cpu_busy_frequency", } power.parseCPUMetricsConfig() @@ -88,6 +94,7 @@ func verifyCoreMetrics(t *testing.T, power *PowerStat, enabled bool) { require.Equal(t, enabled, power.cpuFrequency) require.Equal(t, enabled, power.cpuC1StateResidency) require.Equal(t, enabled, power.cpuC6StateResidency) + require.Equal(t, enabled, power.cpuC0StateResidency) require.Equal(t, enabled, power.cpuBusyCycles) require.Equal(t, enabled, power.cpuBusyFrequency) require.Equal(t, enabled, power.cpuTemperature) @@ -102,23 +109,23 @@ func TestGather(t *testing.T) { preparedCPUData := getPreparedCPUData(coreIDs) raplDataMap := prepareRaplDataMap(packageIDs, socketCurrentEnergy, dramCurrentEnergy) - power, _, raplMock, msrMock := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() prepareCPUInfo(power, coreIDs, packageIDs) enableCoreMetrics(power) power.skipFirstIteration = false - raplMock.On("initializeRaplData", mock.Anything). + mockServices.rapl.On("initializeRaplData", mock.Anything). On("getRaplData").Return(raplDataMap). On("retrieveAndCalculateData", mock.Anything).Return(nil).Times(len(raplDataMap)). On("getConstraintMaxPowerWatts", mock.Anything).Return(546783852.3, nil) - msrMock.On("getCPUCoresData").Return(preparedCPUData). 
+ mockServices.msr.On("getCPUCoresData").Return(preparedCPUData). On("openAndReadMsr", mock.Anything).Return(nil). On("retrieveCPUFrequencyForCore", mock.Anything).Return(1200000.2, nil) require.NoError(t, power.Gather(&acc)) // Number of global metrics : 3 - // Number of per core metrics : 6 - require.Equal(t, 3*len(packageIDs)+6*len(coreIDs), len(acc.GetTelegrafMetrics())) + // Number of per core metrics : 7 + require.Equal(t, 3*len(packageIDs)+7*len(coreIDs), len(acc.GetTelegrafMetrics())) } func TestAddGlobalMetricsNegative(t *testing.T) { @@ -126,24 +133,24 @@ func TestAddGlobalMetricsNegative(t *testing.T) { socketCurrentEnergy := 13213852.2 dramCurrentEnergy := 784552.0 raplDataMap := prepareRaplDataMap([]string{"0", "1"}, socketCurrentEnergy, dramCurrentEnergy) - power, _, raplMock, _ := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() power.skipFirstIteration = false - raplMock.On("initializeRaplData", mock.Anything).Once(). + mockServices.rapl.On("initializeRaplData", mock.Anything).Once(). On("getRaplData").Return(raplDataMap).Once(). On("retrieveAndCalculateData", mock.Anything).Return(errors.New("error while calculating data")).Times(len(raplDataMap)) power.addGlobalMetrics(&acc) require.Equal(t, 0, len(acc.GetTelegrafMetrics())) - raplMock.AssertNumberOfCalls(t, "retrieveAndCalculateData", len(raplDataMap)) + mockServices.rapl.AssertNumberOfCalls(t, "retrieveAndCalculateData", len(raplDataMap)) - raplMock.On("initializeRaplData", mock.Anything).Once(). + mockServices.rapl.On("initializeRaplData", mock.Anything).Once(). On("getRaplData").Return(make(map[string]*raplData)).Once() power.addGlobalMetrics(&acc) require.Equal(t, 0, len(acc.GetTelegrafMetrics())) - raplMock.AssertNotCalled(t, "retrieveAndCalculateData") + mockServices.rapl.AssertNotCalled(t, "retrieveAndCalculateData") - raplMock.On("initializeRaplData", mock.Anything).Once(). + mockServices.rapl.On("initializeRaplData", mock.Anything).Once(). 
On("getRaplData").Return(raplDataMap). On("retrieveAndCalculateData", mock.Anything).Return(nil).Once(). On("retrieveAndCalculateData", mock.Anything).Return(errors.New("error while calculating data")).Once(). @@ -159,10 +166,10 @@ func TestAddGlobalMetricsPositive(t *testing.T) { dramCurrentEnergy := 124234872.5 raplDataMap := prepareRaplDataMap([]string{"0", "1"}, socketCurrentEnergy, dramCurrentEnergy) maxPower := 546783852.9 - power, _, raplMock, _ := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() power.skipFirstIteration = false - raplMock.On("initializeRaplData", mock.Anything). + mockServices.rapl.On("initializeRaplData", mock.Anything). On("getRaplData").Return(raplDataMap). On("retrieveAndCalculateData", mock.Anything).Return(nil).Times(len(raplDataMap)). On("getConstraintMaxPowerWatts", mock.Anything).Return(maxPower, nil).Twice(). @@ -181,9 +188,9 @@ func TestAddMetricsForSingleCoreNegative(t *testing.T) { var wg sync.WaitGroup var acc testutil.Accumulator core := "0" - power, _, _, msrMock := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() - msrMock.On("openAndReadMsr", core).Return(errors.New("error reading MSR file")).Once() + mockServices.msr.On("openAndReadMsr", core).Return(errors.New("error reading MSR file")).Once() // Skip generating metric for CPU frequency. power.cpuFrequency = false @@ -201,16 +208,16 @@ func TestAddCPUFrequencyMetric(t *testing.T) { coreID := "3" packageID := "0" frequency := 1200000.2 - power, _, _, msrMock := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) - msrMock.On("retrieveCPUFrequencyForCore", mock.Anything). + mockServices.msr.On("retrieveCPUFrequencyForCore", mock.Anything). 
Return(float64(0), errors.New("error on reading file")).Once() power.addCPUFrequencyMetric(cpuID, &acc) require.Equal(t, 0, len(acc.GetTelegrafMetrics())) - msrMock.On("retrieveCPUFrequencyForCore", mock.Anything).Return(frequency, nil).Once() + mockServices.msr.On("retrieveCPUFrequencyForCore", mock.Anything).Return(frequency, nil).Once() power.addCPUFrequencyMetric(cpuID, &acc) require.Equal(t, 1, len(acc.GetTelegrafMetrics())) @@ -225,12 +232,12 @@ func TestAddCoreCPUTemperatureMetric(t *testing.T) { cpuID := "0" coreID := "2" packageID := "1" - power, _, _, msrMock := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() preparedData := getPreparedCPUData([]string{cpuID}) expectedTemp := preparedData[cpuID].throttleTemp - preparedData[cpuID].temp prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) - msrMock.On("getCPUCoresData").Return(preparedData).Once() + mockServices.msr.On("getCPUCoresData").Return(preparedData).Once() power.addCPUTemperatureMetric(cpuID, &acc) require.Equal(t, 1, len(acc.GetTelegrafMetrics())) @@ -243,13 +250,13 @@ func TestAddC6StateResidencyMetric(t *testing.T) { cpuID := "0" coreID := "2" packageID := "1" - power, _, _, msrMock := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) preparedData := getPreparedCPUData([]string{cpuID}) expectedC6 := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * float64(preparedData[cpuID].c6Delta) / float64(preparedData[cpuID].timeStampCounterDelta)) - msrMock.On("getCPUCoresData").Return(preparedData).Twice() + mockServices.msr.On("getCPUCoresData").Return(preparedData).Twice() power.addCPUC6StateResidencyMetric(cpuID, &acc) require.Equal(t, 1, len(acc.GetTelegrafMetrics())) @@ -263,27 +270,32 @@ func TestAddC6StateResidencyMetric(t *testing.T) { require.Equal(t, 0, len(acc.GetTelegrafMetrics())) } -func TestAddProcessorBusyCyclesMetric(t *testing.T) { +func 
TestAddC0StateResidencyMetric(t *testing.T) { var acc testutil.Accumulator cpuID := "0" coreID := "2" packageID := "1" - power, _, _, msrMock := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) preparedData := getPreparedCPUData([]string{cpuID}) expectedBusyCycles := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * float64(preparedData[cpuID].mperfDelta) / float64(preparedData[cpuID].timeStampCounterDelta)) - msrMock.On("getCPUCoresData").Return(preparedData).Twice() - power.addCPUBusyCyclesMetric(cpuID, &acc) - require.Equal(t, 1, len(acc.GetTelegrafMetrics())) + mockServices.msr.On("getCPUCoresData").Return(preparedData).Twice() + power.cpuBusyCycles, power.cpuC0StateResidency = true, true + power.addCPUC0StateResidencyMetric(cpuID, &acc) + require.Equal(t, 2, len(acc.GetTelegrafMetrics())) - expectedMetric := getPowerCoreMetric("cpu_busy_cycles_percent", expectedBusyCycles, coreID, packageID, cpuID) + expectedMetric := getPowerCoreMetric("cpu_c0_state_residency_percent", expectedBusyCycles, coreID, packageID, cpuID) + acc.AssertContainsTaggedFields(t, "powerstat_core", expectedMetric.fields, expectedMetric.tags) + + // Deprecated + expectedMetric = getPowerCoreMetric("cpu_busy_cycles_percent", expectedBusyCycles, coreID, packageID, cpuID) acc.AssertContainsTaggedFields(t, "powerstat_core", expectedMetric.fields, expectedMetric.tags) acc.ClearMetrics() preparedData[cpuID].timeStampCounterDelta = 0 - power.addCPUBusyCyclesMetric(cpuID, &acc) + power.addCPUC0StateResidencyMetric(cpuID, &acc) require.Equal(t, 0, len(acc.GetTelegrafMetrics())) } @@ -292,12 +304,12 @@ func TestAddProcessorBusyFrequencyMetric(t *testing.T) { cpuID := "0" coreID := "2" packageID := "1" - power, _, _, msrMock := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) preparedData := 
getPreparedCPUData([]string{cpuID}) power.skipFirstIteration = false - msrMock.On("getCPUCoresData").Return(preparedData).Twice() + mockServices.msr.On("getCPUCoresData").Return(preparedData).Twice() power.addCPUBusyFrequencyMetric(cpuID, &acc) require.Equal(t, 1, len(acc.GetTelegrafMetrics())) @@ -312,14 +324,14 @@ func TestAddC1StateResidencyMetric(t *testing.T) { cpuID := "0" coreID := "2" packageID := "1" - power, _, _, msrMock := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() prepareCPUInfoForSingleCPU(power, cpuID, coreID, packageID) preparedData := getPreparedCPUData([]string{cpuID}) c1 := preparedData[cpuID].timeStampCounterDelta - preparedData[cpuID].mperfDelta - preparedData[cpuID].c3Delta - preparedData[cpuID].c6Delta - preparedData[cpuID].c7Delta expectedC1 := roundFloatToNearestTwoDecimalPlaces(percentageMultiplier * float64(c1) / float64(preparedData[cpuID].timeStampCounterDelta)) - msrMock.On("getCPUCoresData").Return(preparedData).Twice() + mockServices.msr.On("getCPUCoresData").Return(preparedData).Twice() power.addCPUC1StateResidencyMetric(cpuID, &acc) require.Equal(t, 1, len(acc.GetTelegrafMetrics())) @@ -337,9 +349,9 @@ func TestAddThermalDesignPowerMetric(t *testing.T) { var acc testutil.Accumulator sockets := []string{"0"} maxPower := 195720672.1 - power, _, raplMock, _ := getPowerWithMockedServices() + power, mockServices := getPowerWithMockedServices() - raplMock.On("getConstraintMaxPowerWatts", mock.Anything). + mockServices.rapl.On("getConstraintMaxPowerWatts", mock.Anything). Return(float64(0), errors.New("getConstraintMaxPowerWatts error")).Once(). 
On("getConstraintMaxPowerWatts", mock.Anything).Return(maxPower, nil).Once() @@ -354,6 +366,80 @@ func TestAddThermalDesignPowerMetric(t *testing.T) { acc.AssertContainsTaggedFields(t, "powerstat_package", expectedMetric.fields, expectedMetric.tags) } +func TestCalculateTurboRatioGroup(t *testing.T) { + coreCounts := uint64(0x0807060504030201) + msr := uint64(0x0807060504030201) + turboRatioLimitGroups := make(map[int]uint64) + + calculateTurboRatioGroup(coreCounts, msr, turboRatioLimitGroups) + require.Equal(t, 8, len(turboRatioLimitGroups)) + require.Equal(t, uint64(100), turboRatioLimitGroups[1]) + require.Equal(t, uint64(200), turboRatioLimitGroups[2]) + require.Equal(t, uint64(300), turboRatioLimitGroups[3]) + require.Equal(t, uint64(400), turboRatioLimitGroups[4]) + require.Equal(t, uint64(500), turboRatioLimitGroups[5]) + require.Equal(t, uint64(600), turboRatioLimitGroups[6]) + require.Equal(t, uint64(700), turboRatioLimitGroups[7]) + require.Equal(t, uint64(800), turboRatioLimitGroups[8]) + + coreCounts = uint64(0x100e0c0a08060402) + calculateTurboRatioGroup(coreCounts, msr, turboRatioLimitGroups) + require.Equal(t, 16, len(turboRatioLimitGroups)) + require.Equal(t, uint64(100), turboRatioLimitGroups[1]) + require.Equal(t, uint64(100), turboRatioLimitGroups[2]) + require.Equal(t, uint64(200), turboRatioLimitGroups[3]) + require.Equal(t, uint64(200), turboRatioLimitGroups[4]) + require.Equal(t, uint64(300), turboRatioLimitGroups[5]) + require.Equal(t, uint64(300), turboRatioLimitGroups[6]) + require.Equal(t, uint64(400), turboRatioLimitGroups[7]) + require.Equal(t, uint64(400), turboRatioLimitGroups[8]) + require.Equal(t, uint64(500), turboRatioLimitGroups[9]) + require.Equal(t, uint64(500), turboRatioLimitGroups[10]) + require.Equal(t, uint64(600), turboRatioLimitGroups[11]) + require.Equal(t, uint64(600), turboRatioLimitGroups[12]) + require.Equal(t, uint64(700), turboRatioLimitGroups[13]) + require.Equal(t, uint64(700), turboRatioLimitGroups[14]) + 
require.Equal(t, uint64(800), turboRatioLimitGroups[15]) + require.Equal(t, uint64(800), turboRatioLimitGroups[16]) + coreCounts = uint64(0x1211) + msr = uint64(0xfffe) + calculateTurboRatioGroup(coreCounts, msr, turboRatioLimitGroups) + require.Equal(t, 18, len(turboRatioLimitGroups)) + require.Equal(t, uint64(25400), turboRatioLimitGroups[17]) + require.Equal(t, uint64(25500), turboRatioLimitGroups[18]) + + coreCounts = uint64(0x1201) + msr = uint64(0x0202) + calculateTurboRatioGroup(coreCounts, msr, turboRatioLimitGroups) + require.Equal(t, 18, len(turboRatioLimitGroups)) + require.Equal(t, uint64(200), turboRatioLimitGroups[1]) + require.Equal(t, uint64(200), turboRatioLimitGroups[2]) + require.Equal(t, uint64(200), turboRatioLimitGroups[3]) + require.Equal(t, uint64(200), turboRatioLimitGroups[4]) + require.Equal(t, uint64(200), turboRatioLimitGroups[5]) + require.Equal(t, uint64(200), turboRatioLimitGroups[6]) + require.Equal(t, uint64(200), turboRatioLimitGroups[7]) + require.Equal(t, uint64(200), turboRatioLimitGroups[8]) + require.Equal(t, uint64(200), turboRatioLimitGroups[9]) + require.Equal(t, uint64(200), turboRatioLimitGroups[10]) + require.Equal(t, uint64(200), turboRatioLimitGroups[11]) + require.Equal(t, uint64(200), turboRatioLimitGroups[12]) + require.Equal(t, uint64(200), turboRatioLimitGroups[13]) + require.Equal(t, uint64(200), turboRatioLimitGroups[14]) + require.Equal(t, uint64(200), turboRatioLimitGroups[15]) + require.Equal(t, uint64(200), turboRatioLimitGroups[16]) + require.Equal(t, uint64(200), turboRatioLimitGroups[17]) + require.Equal(t, uint64(200), turboRatioLimitGroups[18]) + + coreCounts = uint64(0x1211) + msr = uint64(0xfffe) + turboRatioLimitGroups = make(map[int]uint64) + calculateTurboRatioGroup(coreCounts, msr, turboRatioLimitGroups) + require.Equal(t, 2, len(turboRatioLimitGroups)) + require.Equal(t, uint64(25400), turboRatioLimitGroups[17]) + require.Equal(t, uint64(25500), turboRatioLimitGroups[18]) +} + func 
getPreparedCPUData(cores []string) map[string]*msrData { msrDataMap := make(map[string]*msrData) @@ -451,6 +537,7 @@ func prepareCPUInfo(power *PowerStat, coreIDs []string, packageIDs []string) { } func enableCoreMetrics(power *PowerStat) { + power.cpuC0StateResidency = true power.cpuC1StateResidency = true power.cpuC6StateResidency = true power.cpuTemperature = true @@ -460,12 +547,13 @@ func enableCoreMetrics(power *PowerStat) { } func disableCoreMetrics(power *PowerStat) { + power.cpuC0StateResidency = false power.cpuC1StateResidency = false power.cpuC6StateResidency = false + power.cpuBusyCycles = false power.cpuTemperature = false power.cpuBusyFrequency = false power.cpuFrequency = false - power.cpuBusyCycles = false } func prepareRaplDataMap(socketIDs []string, socketCurrentEnergy float64, dramCurrentEnergy float64) map[string]*raplData { @@ -480,16 +568,18 @@ func prepareRaplDataMap(socketIDs []string, socketCurrentEnergy float64, dramCur return raplDataMap } -func getPowerWithMockedServices() (*PowerStat, *mockFileService, *mockRaplService, *mockMsrService) { - fsMock := &mockFileService{} - msrMock := &mockMsrService{} - raplMock := &mockRaplService{} - logger := testutil.Logger{Name: "PowerPluginTest"} - p := newPowerStat(fsMock) - p.Log = logger - p.fs = fsMock - p.rapl = raplMock - p.msr = msrMock +func getPowerWithMockedServices() (*PowerStat, *MockServices) { + var mockServices MockServices + mockServices.fs = &mockFileService{} + mockServices.msr = &mockMsrService{} + mockServices.rapl = &mockRaplService{} + p := newPowerStat(mockServices.fs) + p.Log = testutil.Logger{Name: "PowerPluginTest"} + p.rapl = mockServices.rapl + p.msr = mockServices.msr + p.packageCurrentPowerConsumption = true + p.packageCurrentDramPowerConsumption = true + p.packageThermalDesignPower = true - return p, fsMock, raplMock, msrMock + return p, &mockServices } diff --git a/plugins/inputs/intel_powerstat/msr.go b/plugins/inputs/intel_powerstat/msr.go index 6c19b56eb..46eacdc08 
100644 --- a/plugins/inputs/intel_powerstat/msr.go +++ b/plugins/inputs/intel_powerstat/msr.go @@ -28,6 +28,10 @@ const ( throttleTemperatureLocation = 0x1A2 temperatureLocation = 0x19C timestampCounterLocation = 0x10 + turboRatioLimitLocation = 0x1AD + turboRatioLimit1Location = 0x1AE + turboRatioLimit2Location = 0x1AF + atomCoreTurboRatiosLocation = 0x66C ) // msrService is responsible for interactions with MSR. @@ -35,6 +39,7 @@ type msrService interface { getCPUCoresData() map[string]*msrData retrieveCPUFrequencyForCore(core string) (float64, error) openAndReadMsr(core string) error + readSingleMsr(core string, msr string) (uint64, error) } type msrServiceImpl struct { @@ -50,6 +55,10 @@ func (m *msrServiceImpl) getCPUCoresData() map[string]*msrData { func (m *msrServiceImpl) retrieveCPUFrequencyForCore(core string) (float64, error) { cpuFreqPath := fmt.Sprintf(cpuCurrentFreqPartialPath, core) + err := checkFile(cpuFreqPath) + if err != nil { + return 0, err + } cpuFreqFile, err := os.Open(cpuFreqPath) if err != nil { return 0, fmt.Errorf("error opening scaling_cur_freq file on path %s, err: %v", cpuFreqPath, err) @@ -62,6 +71,10 @@ func (m *msrServiceImpl) retrieveCPUFrequencyForCore(core string) (float64, erro func (m *msrServiceImpl) openAndReadMsr(core string) error { path := fmt.Sprintf(msrPartialPath, core) + err := checkFile(path) + if err != nil { + return err + } msrFile, err := os.Open(path) if err != nil { return fmt.Errorf("error opening MSR file on path %s, err: %v", path, err) @@ -75,6 +88,40 @@ func (m *msrServiceImpl) openAndReadMsr(core string) error { return nil } +func (m *msrServiceImpl) readSingleMsr(core string, msr string) (uint64, error) { + path := fmt.Sprintf(msrPartialPath, core) + err := checkFile(path) + if err != nil { + return 0, err + } + msrFile, err := os.Open(path) + if err != nil { + return 0, fmt.Errorf("error opening MSR file on path %s, err: %v", path, err) + } + defer msrFile.Close() + + var msrAddress int64 + switch msr 
{ + case "MSR_TURBO_RATIO_LIMIT": + msrAddress = turboRatioLimitLocation + case "MSR_TURBO_RATIO_LIMIT1": + msrAddress = turboRatioLimit1Location + case "MSR_TURBO_RATIO_LIMIT2": + msrAddress = turboRatioLimit2Location + case "MSR_ATOM_CORE_TURBO_RATIOS": + msrAddress = atomCoreTurboRatiosLocation + default: + return 0, fmt.Errorf("incorect name of MSR %s", msr) + } + + value, err := m.fs.readFileAtOffsetToUint64(msrFile, msrAddress) + if err != nil { + return 0, err + } + + return value, nil +} + func (m *msrServiceImpl) readDataFromMsr(core string, reader io.ReaderAt) error { g, ctx := errgroup.WithContext(context.Background()) @@ -128,9 +175,9 @@ func (m *msrServiceImpl) readDataFromMsr(core string, reader io.ReaderAt) error m.cpuCoresData[core].aperf = newAperf m.cpuCoresData[core].timeStampCounter = newTsc // MSR (1A2h) IA32_TEMPERATURE_TARGET bits 23:16. - m.cpuCoresData[core].throttleTemp = (newThrottleTemp >> 16) & 0xFF + m.cpuCoresData[core].throttleTemp = int64((newThrottleTemp >> 16) & 0xFF) // MSR (19Ch) IA32_THERM_STATUS bits 22:16. - m.cpuCoresData[core].temp = (newTemp >> 16) & 0x7F + m.cpuCoresData[core].temp = int64((newTemp >> 16) & 0x7F) return nil } diff --git a/plugins/inputs/intel_powerstat/msr_mock_test.go b/plugins/inputs/intel_powerstat/msr_mock_test.go index 4ca80e8a8..67aebf230 100644 --- a/plugins/inputs/intel_powerstat/msr_mock_test.go +++ b/plugins/inputs/intel_powerstat/msr_mock_test.go @@ -1,10 +1,10 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.10.0. DO NOT EDIT. 
package intel_powerstat import mock "github.com/stretchr/testify/mock" -// mockMsrService is an autogenerated mock type for the msrService type +// mockMsrService is an autogenerated mock type for the mockMsrService type type mockMsrService struct { mock.Mock } @@ -39,6 +39,27 @@ func (_m *mockMsrService) openAndReadMsr(core string) error { return r0 } +// readSingleMsr provides a mock function with given fields: core, msr +func (_m *mockMsrService) readSingleMsr(core string, msr string) (uint64, error) { + ret := _m.Called(core, msr) + + var r0 uint64 + if rf, ok := ret.Get(0).(func(string, string) uint64); ok { + r0 = rf(core, msr) + } else { + r0 = ret.Get(0).(uint64) + } + + var r1 error + if rf, ok := ret.Get(1).(func(string, string) error); ok { + r1 = rf(core, msr) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + // retrieveCPUFrequencyForCore provides a mock function with given fields: core func (_m *mockMsrService) retrieveCPUFrequencyForCore(core string) (float64, error) { ret := _m.Called(core) diff --git a/plugins/inputs/intel_powerstat/msr_test.go b/plugins/inputs/intel_powerstat/msr_test.go index b03d2b009..0fc59314b 100644 --- a/plugins/inputs/intel_powerstat/msr_test.go +++ b/plugins/inputs/intel_powerstat/msr_test.go @@ -109,8 +109,8 @@ func verifyCPUCoresData(cores []string, t *testing.T, msr *msrServiceImpl, expec require.Equal(t, expectedValue, msr.cpuCoresData[core].mperf) require.Equal(t, expectedValue, msr.cpuCoresData[core].aperf) require.Equal(t, expectedValue, msr.cpuCoresData[core].timeStampCounter) - require.Equal(t, (expectedValue>>16)&0xFF, msr.cpuCoresData[core].throttleTemp) - require.Equal(t, (expectedValue>>16)&0x7F, msr.cpuCoresData[core].temp) + require.Equal(t, int64((expectedValue>>16)&0xFF), msr.cpuCoresData[core].throttleTemp) + require.Equal(t, int64((expectedValue>>16)&0x7F), msr.cpuCoresData[core].temp) if verifyDelta { require.Equal(t, delta, msr.cpuCoresData[core].c3Delta) diff --git 
a/plugins/inputs/intel_powerstat/rapl.go b/plugins/inputs/intel_powerstat/rapl.go index 32d60ac89..e89b04d50 100644 --- a/plugins/inputs/intel_powerstat/rapl.go +++ b/plugins/inputs/intel_powerstat/rapl.go @@ -36,6 +36,7 @@ type raplServiceImpl struct { data map[string]*raplData dramFolders map[string]string fs fileService + logOnce map[string]error } // initializeRaplData looks for RAPL folders and initializes data map with fetched information. @@ -51,6 +52,10 @@ func (r *raplServiceImpl) getRaplData() map[string]*raplData { func (r *raplServiceImpl) retrieveAndCalculateData(socketID string) error { socketRaplPath := fmt.Sprintf(intelRaplSocketPartialPath, intelRaplPath, socketID) socketEnergyUjPath := fmt.Sprintf(energyUjPartialPath, socketRaplPath) + err := checkFile(socketEnergyUjPath) + if err != nil { + return err + } socketEnergyUjFile, err := os.Open(socketEnergyUjPath) if err != nil { return fmt.Errorf("error opening socket energy_uj file on path %s, err: %v", socketEnergyUjPath, err) @@ -59,6 +64,10 @@ func (r *raplServiceImpl) retrieveAndCalculateData(socketID string) error { dramRaplPath := fmt.Sprintf(intelRaplDramPartialPath, intelRaplPath, socketID, r.dramFolders[socketID]) dramEnergyUjPath := fmt.Sprintf(energyUjPartialPath, dramRaplPath) + err = checkFile(dramEnergyUjPath) + if err != nil { + return err + } dramEnergyUjFile, err := os.Open(dramEnergyUjPath) if err != nil { return fmt.Errorf("error opening dram energy_uj file on path %s, err: %v", dramEnergyUjPath, err) @@ -66,6 +75,10 @@ func (r *raplServiceImpl) retrieveAndCalculateData(socketID string) error { defer dramEnergyUjFile.Close() socketMaxEnergyUjPath := fmt.Sprintf(maxEnergyRangeUjPartialPath, socketRaplPath) + err = checkFile(socketMaxEnergyUjPath) + if err != nil { + return err + } socketMaxEnergyUjFile, err := os.Open(socketMaxEnergyUjPath) if err != nil { return fmt.Errorf("error opening socket max_energy_range_uj file on path %s, err: %v", socketMaxEnergyUjPath, err) @@ -73,6 
+86,10 @@ func (r *raplServiceImpl) retrieveAndCalculateData(socketID string) error { defer socketMaxEnergyUjFile.Close() dramMaxEnergyUjPath := fmt.Sprintf(maxEnergyRangeUjPartialPath, dramRaplPath) + err = checkFile(dramMaxEnergyUjPath) + if err != nil { + return err + } dramMaxEnergyUjFile, err := os.Open(dramMaxEnergyUjPath) if err != nil { return fmt.Errorf("error opening dram max_energy_range_uj file on path %s, err: %v", dramMaxEnergyUjPath, err) @@ -85,6 +102,10 @@ func (r *raplServiceImpl) retrieveAndCalculateData(socketID string) error { func (r *raplServiceImpl) getConstraintMaxPowerWatts(socketID string) (float64, error) { socketRaplPath := fmt.Sprintf(intelRaplSocketPartialPath, intelRaplPath, socketID) socketMaxPowerPath := fmt.Sprintf(maxPowerUwPartialPath, socketRaplPath) + err := checkFile(socketMaxPowerPath) + if err != nil { + return 0, err + } socketMaxPowerFile, err := os.Open(socketMaxPowerPath) if err != nil { return 0, fmt.Errorf("error opening constraint_0_max_power_uw file on path %s, err: %v", socketMaxPowerPath, err) @@ -156,15 +177,22 @@ func (r *raplServiceImpl) findDramFolders() { } func (r *raplServiceImpl) findDramFolder(raplFolders []string, socketID string) { + if r.logOnce == nil { + r.logOnce = make(map[string]error) + } + for _, raplFolder := range raplFolders { potentialDramPath := fmt.Sprintf(intelRaplDramPartialPath, intelRaplPath, socketID, raplFolder) nameFilePath := fmt.Sprintf(intelRaplDramNamePartialPath, potentialDramPath) read, err := r.fs.readFile(nameFilePath) if err != nil { - r.log.Errorf("error reading file on path: %s, err: %v", nameFilePath, err) + if val := r.logOnce[nameFilePath]; val == nil || val.Error() != err.Error() { + r.log.Errorf("error reading file on path: %s, err: %v", nameFilePath, err) + r.logOnce[nameFilePath] = err + } continue } - + r.logOnce[nameFilePath] = nil // Remove new line character trimmedString := strings.TrimRight(string(read), "\n") if trimmedString == "dram" { @@ -194,7 +222,7 @@ 
func (r *raplServiceImpl) calculateData(socketID string, socketEnergyUjFile io.R return fmt.Errorf("interval between last two Telegraf cycles is 0") } - if newSocketEnergy > r.data[socketID].socketEnergy { + if newSocketEnergy >= r.data[socketID].socketEnergy { r.data[socketID].socketCurrentEnergy = (newSocketEnergy - r.data[socketID].socketEnergy) / interval } else { socketMaxEnergy, _, err := r.readEnergyInJoules(socketMaxEnergyUjFile) @@ -206,7 +234,7 @@ func (r *raplServiceImpl) calculateData(socketID string, socketEnergyUjFile io.R r.data[socketID].socketCurrentEnergy = (socketMaxEnergy - r.data[socketID].socketEnergy + newSocketEnergy) / interval } - if newDramEnergy > r.data[socketID].dramEnergy { + if newDramEnergy >= r.data[socketID].dramEnergy { r.data[socketID].dramCurrentEnergy = (newDramEnergy - r.data[socketID].dramEnergy) / interval } else { dramMaxEnergy, _, err := r.readEnergyInJoules(dramMaxEnergyUjFile) diff --git a/plugins/inputs/intel_powerstat/rapl_mock_test.go b/plugins/inputs/intel_powerstat/rapl_mock_test.go index 7742db140..0cf86ce13 100644 --- a/plugins/inputs/intel_powerstat/rapl_mock_test.go +++ b/plugins/inputs/intel_powerstat/rapl_mock_test.go @@ -1,10 +1,10 @@ -// Code generated by mockery v0.0.0-dev. DO NOT EDIT. +// Code generated by mockery v2.10.0. DO NOT EDIT. package intel_powerstat import mock "github.com/stretchr/testify/mock" -// mockRaplService is an autogenerated mock type for the raplService type +// mockRaplService is an autogenerated mock type for the mockRaplService type type mockRaplService struct { mock.Mock }