New input plugin for RAS (Reliability, Availability and Serviceability) (#8114)
This commit is contained in:
parent
5f02c69da5
commit
fa32975d0f
|
|
@ -286,6 +286,7 @@ For documentation on the latest development code see the [documentation index][d
|
|||
* [puppetagent](./plugins/inputs/puppetagent)
|
||||
* [rabbitmq](./plugins/inputs/rabbitmq)
|
||||
* [raindrops](./plugins/inputs/raindrops)
|
||||
* [ras](./plugins/inputs/ras)
|
||||
* [redfish](./plugins/inputs/redfish)
|
||||
* [redis](./plugins/inputs/redis)
|
||||
* [rethinkdb](./plugins/inputs/rethinkdb)
|
||||
|
|
|
|||
|
|
@ -94,6 +94,7 @@ following works:
|
|||
- github.com/kubernetes/apimachinery [Apache License 2.0](https://github.com/kubernetes/apimachinery/blob/master/LICENSE)
|
||||
- github.com/leodido/ragel-machinery [MIT License](https://github.com/leodido/ragel-machinery/blob/develop/LICENSE)
|
||||
- github.com/mailru/easyjson [MIT License](https://github.com/mailru/easyjson/blob/master/LICENSE)
|
||||
- github.com/mattn/go-sqlite3 [MIT License](https://github.com/mattn/go-sqlite3/blob/master/LICENSE)
|
||||
- github.com/matttproud/golang_protobuf_extensions [Apache License 2.0](https://github.com/matttproud/golang_protobuf_extensions/blob/master/LICENSE)
|
||||
- github.com/mdlayher/apcupsd [MIT License](https://github.com/mdlayher/apcupsd/blob/master/LICENSE.md)
|
||||
- github.com/mdlayher/genetlink [MIT License](https://github.com/mdlayher/genetlink/blob/master/LICENSE.md)
|
||||
|
|
|
|||
1
go.mod
1
go.mod
|
|
@ -88,6 +88,7 @@ require (
|
|||
github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353 // indirect
|
||||
github.com/lib/pq v1.3.0 // indirect
|
||||
github.com/mailru/easyjson v0.0.0-20180717111219-efc7eb8984d6 // indirect
|
||||
github.com/mattn/go-sqlite3 v1.14.0
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.1
|
||||
github.com/mdlayher/apcupsd v0.0.0-20190314144147-eb3dd99a75fe
|
||||
github.com/miekg/dns v1.0.14
|
||||
|
|
|
|||
7
go.sum
7
go.sum
|
|
@ -83,6 +83,7 @@ github.com/Microsoft/ApplicationInsights-Go v0.4.2/go.mod h1:CukZ/G66zxXtI+h/VcV
|
|||
github.com/Microsoft/go-winio v0.4.9 h1:3RbgqgGVqmcpbOiwrjbVtDHLlJBGF6aE+yHmNtBNsFQ=
|
||||
github.com/Microsoft/go-winio v0.4.9/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA=
|
||||
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
|
||||
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
|
||||
github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
|
||||
github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
|
||||
github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo=
|
||||
|
|
@ -103,6 +104,7 @@ github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4 h1:Hs82Z41s6SdL1C
|
|||
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
|
||||
github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9 h1:FXrPTd8Rdlc94dKccl7KPmdmIbVh/OjelJ8/vgMRzcQ=
|
||||
github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9/go.mod h1:eliMa/PW+RDr2QLWRmLH1R1ZA4RInpmvOzDDXtaIZkc=
|
||||
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
|
||||
github.com/apache/thrift v0.12.0 h1:pODnxUFNcjP9UTLZGTdeh+j16A8lJbRvD3rOtrk/7bs=
|
||||
github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=
|
||||
github.com/aristanetworks/glog v0.0.0-20191112221043-67e8567f59f3 h1:Bmjk+DjIi3tTAU0wxGaFbfjGUqlxxSXARq9A96Kgoos=
|
||||
|
|
@ -403,6 +405,8 @@ github.com/lib/pq v1.3.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
|
|||
github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||
github.com/mailru/easyjson v0.0.0-20180717111219-efc7eb8984d6 h1:8/+Y8SKf0xCZ8cCTfnrMdY7HNzlEjPAt3bPjalNb6CA=
|
||||
github.com/mailru/easyjson v0.0.0-20180717111219-efc7eb8984d6/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
|
||||
github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA=
|
||||
github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
|
||||
github.com/mdlayher/apcupsd v0.0.0-20190314144147-eb3dd99a75fe h1:yMrL+YorbzaBpj/h3BbLMP+qeslPZYMbzcpHFBNy1Yk=
|
||||
|
|
@ -643,6 +647,7 @@ golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzB
|
|||
golang.org/x/mod v0.2.0 h1:KU7oHjnv3XNWfa5COkzUifxZmxp1TyI7ImMXqFxLwvQ=
|
||||
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/net v0.0.0-20170114055629-f2499483f923/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
|
||||
|
|
@ -671,6 +676,8 @@ golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLL
|
|||
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
|
||||
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k=
|
||||
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
|
||||
golang.org/x/net v0.0.0-20200707034311-ab3426394381 h1:VXak5I6aEWmAXeQjA+QSZzlgNrpq9mjcfDemuexIKsU=
|
||||
golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
|
||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||
|
|
|
|||
|
|
@ -141,6 +141,7 @@ import (
|
|||
_ "github.com/influxdata/telegraf/plugins/inputs/puppetagent"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/rabbitmq"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/raindrops"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/ras"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/redfish"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/redis"
|
||||
_ "github.com/influxdata/telegraf/plugins/inputs/rethinkdb"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,58 @@
|
|||
# RAS Input Plugin
|
||||
|
||||
The `RAS` plugin gathers and counts errors provided by [RASDaemon](https://github.com/mchehab/rasdaemon).
|
||||
|
||||
### Configuration
|
||||
|
||||
```toml
|
||||
[[inputs.ras]]
|
||||
## Optional path to RASDaemon sqlite3 database.
|
||||
## Default: /var/lib/rasdaemon/ras-mc_event.db
|
||||
# db_path = ""
|
||||
```
|
||||
|
||||
In addition, `RASDaemon` runs with the `--enable-sqlite3` flag by default. In case of problems with the SQLite3 database, please verify that this is still the default option.
|
||||
|
||||
### Metrics
|
||||
|
||||
- ras
|
||||
- tags:
|
||||
- socket_id
|
||||
- fields:
|
||||
- memory_read_corrected_errors
|
||||
- memory_read_uncorrectable_errors
|
||||
- memory_write_corrected_errors
|
||||
- memory_write_uncorrectable_errors
|
||||
- cache_l0_l1_errors
|
||||
- tlb_instruction_errors
|
||||
- cache_l2_errors
|
||||
- upi_errors
|
||||
- processor_base_errors
|
||||
- processor_bus_errors
|
||||
- internal_timer_errors
|
||||
- smm_handler_code_access_violation_errors
|
||||
- internal_parity_errors
|
||||
- frc_errors
|
||||
- external_mce_errors
|
||||
- microcode_rom_parity_errors
|
||||
- unclassified_mce_errors
|
||||
|
||||
Please note that `processor_base_errors` is an aggregate counter measuring the following MCE events:
|
||||
- internal_timer_errors
|
||||
- smm_handler_code_access_violation_errors
|
||||
- internal_parity_errors
|
||||
- frc_errors
|
||||
- external_mce_errors
|
||||
- microcode_rom_parity_errors
|
||||
- unclassified_mce_errors
|
||||
|
||||
### Permissions
|
||||
|
||||
This plugin requires access to the SQLite3 database from `RASDaemon`. Please make sure that the user has the required permissions to this database.
|
||||
|
||||
### Example Output
|
||||
|
||||
```
|
||||
ras,host=ubuntu,socket_id=0 cache_l0_l1_errors=7i,external_mce_errors=1i,frc_errors=1i,internal_parity_errors=1i,internal_timer_errors=1i,memory_read_corrected_errors=25i,memory_read_uncorrectable_errors=0i,memory_write_corrected_errors=5i,memory_write_uncorrectable_errors=0i,microcode_rom_parity_errors=1i,processor_base_errors=7i,processor_bus_errors=1i,smm_handler_code_access_violation_errors=1i,tlb_instruction_errors=5i,unclassified_mce_errors=1i 1598867393000000000
|
||||
ras,host=ubuntu cache_l2_errors=0i,upi_errors=0i 1598867393000000000
|
||||
```
|
||||
|
|
@ -0,0 +1,294 @@
|
|||
// +build !windows
|
||||
|
||||
package ras
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
||||
type Ras struct {
|
||||
DbPath string
|
||||
latestTimestamp time.Time
|
||||
cpuSocketCounters map[int]metricCounters
|
||||
serverCounters metricCounters
|
||||
}
|
||||
|
||||
// machineCheckError carries the columns read from one mce_record row.
type machineCheckError struct {
	// Id is the value of the id column.
	Id int
	// Timestamp is the raw timestamp text; it is parsed with dateLayout.
	Timestamp string
	// SocketId is the value of the socketid column.
	SocketId int
	// ErrorMsg is the error_msg column, matched against known substrings to
	// classify the error.
	ErrorMsg string
	// MciStatusMsg is the mcistatus_msg column, used to tell corrected from
	// uncorrected memory errors.
	MciStatusMsg string
}
|
||||
|
||||
// metricCounters maps a metric field name to its accumulated error count.
type metricCounters map[string]int64
|
||||
|
||||
// Query text, defaults and the metric field names reported by the plugin.
const (
|
||||
// mceQuery selects the records whose timestamp is greater than the bound parameter.
mceQuery = `
|
||||
SELECT
|
||||
id, timestamp, error_msg, mcistatus_msg, socketid
|
||||
FROM mce_record
|
||||
WHERE timestamp > ?
|
||||
`
|
||||
// defaultDbPath is the database location used when db_path is not configured.
defaultDbPath = "/var/lib/rasdaemon/ras-mc_event.db"
|
||||
// dateLayout is the time.Parse layout applied to record timestamps.
dateLayout = "2006-01-02 15:04:05 -0700"
|
||||
// Metric field names.
memoryReadCorrected = "memory_read_corrected_errors"
|
||||
memoryReadUncorrected = "memory_read_uncorrectable_errors"
|
||||
memoryWriteCorrected = "memory_write_corrected_errors"
|
||||
memoryWriteUncorrected = "memory_write_uncorrectable_errors"
|
||||
instructionCache = "cache_l0_l1_errors"
|
||||
instructionTLB = "tlb_instruction_errors"
|
||||
levelTwoCache = "cache_l2_errors"
|
||||
upi = "upi_errors"
|
||||
processorBase = "processor_base_errors"
|
||||
processorBus = "processor_bus_errors"
|
||||
internalTimer = "internal_timer_errors"
|
||||
smmHandlerCode = "smm_handler_code_access_violation_errors"
|
||||
internalParity = "internal_parity_errors"
|
||||
frc = "frc_errors"
|
||||
externalMCEBase = "external_mce_errors"
|
||||
microcodeROMParity = "microcode_rom_parity_errors"
|
||||
unclassifiedMCEBase = "unclassified_mce_errors"
|
||||
)
|
||||
|
||||
// SampleConfig returns the sample configuration text for the plugin.
func (r *Ras) SampleConfig() string {
|
||||
return `
|
||||
## Optional path to RASDaemon sqlite3 database.
|
||||
## Default: /var/lib/rasdaemon/ras-mc_event.db
|
||||
# db_path = ""
|
||||
`
|
||||
}
|
||||
|
||||
func (r *Ras) Description() string {
|
||||
return "RAS plugin exposes counter metrics for Machine Check Errors provided by RASDaemon (sqlite3 output is required)."
|
||||
}
|
||||
|
||||
func (r *Ras) Gather(acc telegraf.Accumulator) error {
|
||||
db, err := connectToDB(r.DbPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
rows, err := db.Query(mceQuery, r.latestTimestamp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
mcError, err := fetchMachineCheckError(rows)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tsErr := r.updateLatestTimestamp(mcError.Timestamp)
|
||||
if tsErr != nil {
|
||||
return err
|
||||
}
|
||||
r.updateCounters(mcError)
|
||||
}
|
||||
|
||||
addCpuSocketMetrics(acc, r.cpuSocketCounters)
|
||||
addServerMetrics(acc, r.serverCounters)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Ras) updateLatestTimestamp(timestamp string) error {
|
||||
ts, err := parseDate(timestamp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if ts.After(r.latestTimestamp) {
|
||||
r.latestTimestamp = ts
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Ras) updateCounters(mcError *machineCheckError) {
|
||||
if strings.Contains(mcError.ErrorMsg, "No Error") {
|
||||
return
|
||||
}
|
||||
|
||||
r.initializeCpuMetricDataIfRequired(mcError.SocketId)
|
||||
r.updateSocketCounters(mcError)
|
||||
r.updateServerCounters(mcError)
|
||||
}
|
||||
|
||||
func newMetricCounters() *metricCounters {
|
||||
return &metricCounters{
|
||||
memoryReadCorrected: 0,
|
||||
memoryReadUncorrected: 0,
|
||||
memoryWriteCorrected: 0,
|
||||
memoryWriteUncorrected: 0,
|
||||
instructionCache: 0,
|
||||
instructionTLB: 0,
|
||||
processorBase: 0,
|
||||
processorBus: 0,
|
||||
internalTimer: 0,
|
||||
smmHandlerCode: 0,
|
||||
internalParity: 0,
|
||||
frc: 0,
|
||||
externalMCEBase: 0,
|
||||
microcodeROMParity: 0,
|
||||
unclassifiedMCEBase: 0,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateServerCounters(mcError *machineCheckError) {
|
||||
if strings.Contains(mcError.ErrorMsg, "CACHE Level-2") && strings.Contains(mcError.ErrorMsg, "Error") {
|
||||
r.serverCounters[levelTwoCache] += 1
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "UPI:") {
|
||||
r.serverCounters[upi] += 1
|
||||
}
|
||||
}
|
||||
|
||||
// connectToDB opens a database handle for server using the "sqlite3" driver.
func connectToDB(server string) (*sql.DB, error) {
	const driver = "sqlite3"
	db, err := sql.Open(driver, server)
	return db, err
}
|
||||
|
||||
func (r *Ras) initializeCpuMetricDataIfRequired(socketId int) {
|
||||
if _, ok := r.cpuSocketCounters[socketId]; !ok {
|
||||
r.cpuSocketCounters[socketId] = *newMetricCounters()
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateSocketCounters(mcError *machineCheckError) {
|
||||
r.updateMemoryCounters(mcError)
|
||||
r.updateProcessorBaseCounters(mcError)
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "Instruction TLB") && strings.Contains(mcError.ErrorMsg, "Error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][instructionTLB] += 1
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "BUS") && strings.Contains(mcError.ErrorMsg, "Error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][processorBus] += 1
|
||||
}
|
||||
|
||||
if (strings.Contains(mcError.ErrorMsg, "CACHE Level-0") ||
|
||||
strings.Contains(mcError.ErrorMsg, "CACHE Level-1")) &&
|
||||
strings.Contains(mcError.ErrorMsg, "Error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][instructionCache] += 1
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateProcessorBaseCounters(mcError *machineCheckError) {
|
||||
if strings.Contains(mcError.ErrorMsg, "Internal Timer error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][internalTimer] += 1
|
||||
r.cpuSocketCounters[mcError.SocketId][processorBase] += 1
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "SMM Handler Code Access Violation") {
|
||||
r.cpuSocketCounters[mcError.SocketId][smmHandlerCode] += 1
|
||||
r.cpuSocketCounters[mcError.SocketId][processorBase] += 1
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "Internal parity error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][internalParity] += 1
|
||||
r.cpuSocketCounters[mcError.SocketId][processorBase] += 1
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "FRC error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][frc] += 1
|
||||
r.cpuSocketCounters[mcError.SocketId][processorBase] += 1
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "External error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][externalMCEBase] += 1
|
||||
r.cpuSocketCounters[mcError.SocketId][processorBase] += 1
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "Microcode ROM parity error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][microcodeROMParity] += 1
|
||||
r.cpuSocketCounters[mcError.SocketId][processorBase] += 1
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.ErrorMsg, "Unclassified") || strings.Contains(mcError.ErrorMsg, "Internal unclassified") {
|
||||
r.cpuSocketCounters[mcError.SocketId][unclassifiedMCEBase] += 1
|
||||
r.cpuSocketCounters[mcError.SocketId][processorBase] += 1
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateMemoryCounters(mcError *machineCheckError) {
|
||||
if strings.Contains(mcError.ErrorMsg, "Memory read error") {
|
||||
if strings.Contains(mcError.MciStatusMsg, "Corrected_error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][memoryReadCorrected] += 1
|
||||
} else {
|
||||
r.cpuSocketCounters[mcError.SocketId][memoryReadUncorrected] += 1
|
||||
}
|
||||
}
|
||||
if strings.Contains(mcError.ErrorMsg, "Memory write error") {
|
||||
if strings.Contains(mcError.MciStatusMsg, "Corrected_error") {
|
||||
r.cpuSocketCounters[mcError.SocketId][memoryWriteCorrected] += 1
|
||||
} else {
|
||||
r.cpuSocketCounters[mcError.SocketId][memoryWriteUncorrected] += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func addCpuSocketMetrics(acc telegraf.Accumulator, cpuSocketCounters map[int]metricCounters) {
|
||||
for socketId, data := range cpuSocketCounters {
|
||||
tags := map[string]string{
|
||||
"socket_id": strconv.Itoa(socketId),
|
||||
}
|
||||
fields := make(map[string]interface{})
|
||||
|
||||
for errorName, count := range data {
|
||||
fields[errorName] = count
|
||||
}
|
||||
|
||||
acc.AddCounter("ras", fields, tags)
|
||||
}
|
||||
}
|
||||
|
||||
func addServerMetrics(acc telegraf.Accumulator, counters map[string]int64) {
|
||||
fields := make(map[string]interface{})
|
||||
for errorName, count := range counters {
|
||||
fields[errorName] = count
|
||||
}
|
||||
|
||||
acc.AddCounter("ras", fields, map[string]string{})
|
||||
}
|
||||
|
||||
func fetchMachineCheckError(rows *sql.Rows) (*machineCheckError, error) {
|
||||
mcError := &machineCheckError{}
|
||||
err := rows.Scan(&mcError.Id, &mcError.Timestamp, &mcError.ErrorMsg, &mcError.MciStatusMsg, &mcError.SocketId)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return mcError, nil
|
||||
}
|
||||
|
||||
func parseDate(date string) (time.Time, error) {
|
||||
return time.Parse(dateLayout, date)
|
||||
}
|
||||
|
||||
func init() {
|
||||
inputs.Add("ras", func() telegraf.Input {
|
||||
defaultTimestamp, _ := parseDate("1970-01-01 00:00:01 -0700")
|
||||
return &Ras{
|
||||
DbPath: defaultDbPath,
|
||||
latestTimestamp: defaultTimestamp,
|
||||
cpuSocketCounters: map[int]metricCounters{
|
||||
0: *newMetricCounters(),
|
||||
},
|
||||
serverCounters: map[string]int64{
|
||||
levelTwoCache: 0,
|
||||
upi: 0,
|
||||
},
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
@ -0,0 +1,254 @@
|
|||
// +build !windows
|
||||
|
||||
package ras
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/influxdata/telegraf/testutil"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestUpdateCounters(t *testing.T) {
|
||||
ras := newRas()
|
||||
for _, mce := range testData {
|
||||
ras.updateCounters(&mce)
|
||||
}
|
||||
|
||||
assert.Equal(t, 1, len(ras.cpuSocketCounters), "Should contain counters only for single socket")
|
||||
|
||||
for metric, value := range ras.cpuSocketCounters[0] {
|
||||
if metric == processorBase {
|
||||
// processor_base_errors is sum of other seven errors: internal_timer_errors, smm_handler_code_access_violation_errors,
|
||||
// internal_parity_errors, frc_errors, external_mce_errors, microcode_rom_parity_errors and unclassified_mce_errors
|
||||
assert.Equal(t, int64(7), value, fmt.Sprintf("%s should have value of 7", processorBase))
|
||||
} else {
|
||||
assert.Equal(t, int64(1), value, fmt.Sprintf("%s should have value of 1", metric))
|
||||
}
|
||||
}
|
||||
|
||||
for metric, value := range ras.serverCounters {
|
||||
assert.Equal(t, int64(1), value, fmt.Sprintf("%s should have value of 1", metric))
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateLatestTimestamp(t *testing.T) {
|
||||
ras := newRas()
|
||||
ts := "2020-08-01 15:13:27 +0200"
|
||||
testData = append(testData, []machineCheckError{
|
||||
{
|
||||
Timestamp: "2019-05-20 08:25:55 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "",
|
||||
MciStatusMsg: "",
|
||||
},
|
||||
{
|
||||
Timestamp: "2018-02-21 12:27:22 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "",
|
||||
MciStatusMsg: "",
|
||||
},
|
||||
{
|
||||
Timestamp: ts,
|
||||
SocketId: 0,
|
||||
ErrorMsg: "",
|
||||
MciStatusMsg: "",
|
||||
},
|
||||
}...)
|
||||
for _, mce := range testData {
|
||||
err := ras.updateLatestTimestamp(mce.Timestamp)
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
assert.Equal(t, ts, ras.latestTimestamp.Format(dateLayout))
|
||||
}
|
||||
|
||||
func TestMultipleSockets(t *testing.T) {
|
||||
ras := newRas()
|
||||
cacheL2 := "Instruction CACHE Level-2 Generic Error"
|
||||
overflow := "Error_overflow Corrected_error"
|
||||
testData = []machineCheckError{
|
||||
{
|
||||
Timestamp: "2019-05-20 08:25:55 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: cacheL2,
|
||||
MciStatusMsg: overflow,
|
||||
},
|
||||
{
|
||||
Timestamp: "2018-02-21 12:27:22 +0200",
|
||||
SocketId: 1,
|
||||
ErrorMsg: cacheL2,
|
||||
MciStatusMsg: overflow,
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-03-21 14:17:28 +0200",
|
||||
SocketId: 2,
|
||||
ErrorMsg: cacheL2,
|
||||
MciStatusMsg: overflow,
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-03-21 17:24:18 +0200",
|
||||
SocketId: 3,
|
||||
ErrorMsg: cacheL2,
|
||||
MciStatusMsg: overflow,
|
||||
},
|
||||
}
|
||||
for _, mce := range testData {
|
||||
ras.updateCounters(&mce)
|
||||
}
|
||||
assert.Equal(t, 4, len(ras.cpuSocketCounters), "Should contain counters for four sockets")
|
||||
|
||||
for _, metricData := range ras.cpuSocketCounters {
|
||||
for metric, value := range metricData {
|
||||
if metric == levelTwoCache {
|
||||
assert.Equal(t, int64(1), value, fmt.Sprintf("%s should have value of 1", levelTwoCache))
|
||||
} else {
|
||||
assert.Equal(t, int64(0), value, fmt.Sprintf("%s should have value of 0", metric))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingDatabase(t *testing.T) {
|
||||
var acc testutil.Accumulator
|
||||
ras := newRas()
|
||||
ras.DbPath = "/tmp/test.db"
|
||||
err := ras.Gather(&acc)
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestEmptyDatabase(t *testing.T) {
|
||||
ras := newRas()
|
||||
|
||||
assert.Equal(t, 1, len(ras.cpuSocketCounters), "Should contain default counters for one socket")
|
||||
assert.Equal(t, 2, len(ras.serverCounters), "Should contain default counters for server")
|
||||
|
||||
for metric, value := range ras.cpuSocketCounters[0] {
|
||||
assert.Equal(t, int64(0), value, fmt.Sprintf("%s should have value of 0", metric))
|
||||
}
|
||||
|
||||
for metric, value := range ras.serverCounters {
|
||||
assert.Equal(t, int64(0), value, fmt.Sprintf("%s should have value of 0", metric))
|
||||
}
|
||||
}
|
||||
|
||||
func newRas() *Ras {
|
||||
defaultTimestamp, _ := parseDate("1970-01-01 00:00:01 -0700")
|
||||
return &Ras{
|
||||
DbPath: defaultDbPath,
|
||||
latestTimestamp: defaultTimestamp,
|
||||
cpuSocketCounters: map[int]metricCounters{
|
||||
0: *newMetricCounters(),
|
||||
},
|
||||
serverCounters: map[string]int64{
|
||||
levelTwoCache: 0,
|
||||
upi: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
var testData = []machineCheckError{
|
||||
{
|
||||
Timestamp: "2020-05-20 07:34:53 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 07:35:11 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error",
|
||||
MciStatusMsg: "Uncorrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 07:37:50 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "MEMORY CONTROLLER RD_CHANNEL2_ERR Transaction: Memory write error",
|
||||
MciStatusMsg: "Uncorrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:14:51 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "MEMORY CONTROLLER WR_CHANNEL2_ERR Transaction: Memory write error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:15:31 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "corrected filtering (some unreported errors in same region) Instruction CACHE Level-0 Read Error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:16:32 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "Instruction TLB Level-0 Error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:16:56 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "No Error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:17:24 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "Unclassified",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:17:41 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "Microcode ROM parity error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:17:48 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "FRC error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:18:18 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "Internal parity error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:18:34 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "SMM Handler Code Access Violation",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:18:54 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "Internal Timer error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:21:23 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "BUS Level-3 Generic Generic IO Request-did-not-timeout Error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:23:23 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "External error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:25:31 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "UPI: COR LL Rx detected CRC error - successful LLR without Phy Reinit",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
Timestamp: "2020-05-20 08:25:55 +0200",
|
||||
SocketId: 0,
|
||||
ErrorMsg: "Instruction CACHE Level-2 Generic Error",
|
||||
MciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
}
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
// +build windows
|
||||
|
||||
package ras
|
||||
Loading…
Reference in New Issue