From fa32975d0fd2e70ab1c7f2a1e9ce007b21d7d968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20=C5=BBak?= Date: Thu, 1 Oct 2020 22:54:51 +0200 Subject: [PATCH] New input plugin for RAS (Reliability, Availability and Serviceability) (#8114) --- README.md | 1 + docs/LICENSE_OF_DEPENDENCIES.md | 1 + go.mod | 1 + go.sum | 7 + plugins/inputs/all/all.go | 1 + plugins/inputs/ras/README.md | 58 ++++++ plugins/inputs/ras/ras.go | 294 ++++++++++++++++++++++++++++++ plugins/inputs/ras/ras_test.go | 254 ++++++++++++++++++++++++++ plugins/inputs/ras/ras_windows.go | 3 + 9 files changed, 620 insertions(+) create mode 100644 plugins/inputs/ras/README.md create mode 100644 plugins/inputs/ras/ras.go create mode 100644 plugins/inputs/ras/ras_test.go create mode 100644 plugins/inputs/ras/ras_windows.go diff --git a/README.md b/README.md index 7bcbd9111..9374645a7 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,7 @@ For documentation on the latest development code see the [documentation index][d * [puppetagent](./plugins/inputs/puppetagent) * [rabbitmq](./plugins/inputs/rabbitmq) * [raindrops](./plugins/inputs/raindrops) +* [ras](./plugins/inputs/ras) * [redfish](./plugins/inputs/redfish) * [redis](./plugins/inputs/redis) * [rethinkdb](./plugins/inputs/rethinkdb) diff --git a/docs/LICENSE_OF_DEPENDENCIES.md b/docs/LICENSE_OF_DEPENDENCIES.md index 677c8046a..d8a942e63 100644 --- a/docs/LICENSE_OF_DEPENDENCIES.md +++ b/docs/LICENSE_OF_DEPENDENCIES.md @@ -94,6 +94,7 @@ following works: - github.com/kubernetes/apimachinery [Apache License 2.0](https://github.com/kubernetes/apimachinery/blob/master/LICENSE) - github.com/leodido/ragel-machinery [MIT License](https://github.com/leodido/ragel-machinery/blob/develop/LICENSE) - github.com/mailru/easyjson [MIT License](https://github.com/mailru/easyjson/blob/master/LICENSE) +- github.com/mattn/go-sqlite3 [MIT License](https://github.com/mattn/go-sqlite3/blob/master/LICENSE) - github.com/matttproud/golang_protobuf_extensions [Apache 
License 2.0](https://github.com/matttproud/golang_protobuf_extensions/blob/master/LICENSE) - github.com/mdlayher/apcupsd [MIT License](https://github.com/mdlayher/apcupsd/blob/master/LICENSE.md) - github.com/mdlayher/genetlink [MIT License](https://github.com/mdlayher/genetlink/blob/master/LICENSE.md) diff --git a/go.mod b/go.mod index 2e340a635..c3baca06a 100644 --- a/go.mod +++ b/go.mod @@ -88,6 +88,7 @@ require ( github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353 // indirect github.com/lib/pq v1.3.0 // indirect github.com/mailru/easyjson v0.0.0-20180717111219-efc7eb8984d6 // indirect + github.com/mattn/go-sqlite3 v1.14.0 github.com/matttproud/golang_protobuf_extensions v1.0.1 github.com/mdlayher/apcupsd v0.0.0-20190314144147-eb3dd99a75fe github.com/miekg/dns v1.0.14 diff --git a/go.sum b/go.sum index f9193c8e4..5973e475a 100644 --- a/go.sum +++ b/go.sum @@ -83,6 +83,7 @@ github.com/Microsoft/ApplicationInsights-Go v0.4.2/go.mod h1:CukZ/G66zxXtI+h/VcV github.com/Microsoft/go-winio v0.4.9 h1:3RbgqgGVqmcpbOiwrjbVtDHLlJBGF6aE+yHmNtBNsFQ= github.com/Microsoft/go-winio v0.4.9/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= +github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo= @@ -103,6 +104,7 @@ github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4 h1:Hs82Z41s6SdL1C github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9 h1:FXrPTd8Rdlc94dKccl7KPmdmIbVh/OjelJ8/vgMRzcQ= 
github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9/go.mod h1:eliMa/PW+RDr2QLWRmLH1R1ZA4RInpmvOzDDXtaIZkc= +github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/apache/thrift v0.12.0 h1:pODnxUFNcjP9UTLZGTdeh+j16A8lJbRvD3rOtrk/7bs= github.com/apache/thrift v0.12.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= github.com/aristanetworks/glog v0.0.0-20191112221043-67e8567f59f3 h1:Bmjk+DjIi3tTAU0wxGaFbfjGUqlxxSXARq9A96Kgoos= @@ -403,6 +405,8 @@ github.com/lib/pq v1.3.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20180717111219-efc7eb8984d6 h1:8/+Y8SKf0xCZ8cCTfnrMdY7HNzlEjPAt3bPjalNb6CA= github.com/mailru/easyjson v0.0.0-20180717111219-efc7eb8984d6/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA= +github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus= github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/mdlayher/apcupsd v0.0.0-20190314144147-eb3dd99a75fe h1:yMrL+YorbzaBpj/h3BbLMP+qeslPZYMbzcpHFBNy1Yk= @@ -643,6 +647,7 @@ golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzB golang.org/x/mod v0.2.0 h1:KU7oHjnv3XNWfa5COkzUifxZmxp1TyI7ImMXqFxLwvQ= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20170114055629-f2499483f923/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod 
h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -671,6 +676,8 @@ golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200707034311-ab3426394381 h1:VXak5I6aEWmAXeQjA+QSZzlgNrpq9mjcfDemuexIKsU= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= diff --git a/plugins/inputs/all/all.go b/plugins/inputs/all/all.go index d25d329d4..1d1b8eb58 100644 --- a/plugins/inputs/all/all.go +++ b/plugins/inputs/all/all.go @@ -141,6 +141,7 @@ import ( _ "github.com/influxdata/telegraf/plugins/inputs/puppetagent" _ "github.com/influxdata/telegraf/plugins/inputs/rabbitmq" _ "github.com/influxdata/telegraf/plugins/inputs/raindrops" + _ "github.com/influxdata/telegraf/plugins/inputs/ras" _ "github.com/influxdata/telegraf/plugins/inputs/redfish" _ "github.com/influxdata/telegraf/plugins/inputs/redis" _ "github.com/influxdata/telegraf/plugins/inputs/rethinkdb" diff --git a/plugins/inputs/ras/README.md b/plugins/inputs/ras/README.md new file mode 100644 index 000000000..641d1f488 --- /dev/null +++ b/plugins/inputs/ras/README.md @@ -0,0 +1,58 @@ +# 
RAS Input Plugin
+
+The `RAS` plugin gathers and counts errors provided by [RASDaemon](https://github.com/mchehab/rasdaemon).
+
+### Configuration
+
+```toml
+[[inputs.ras]]
+  ## Optional path to RASDaemon sqlite3 database.
+  ## Default: /var/lib/rasdaemon/ras-mc_event.db
+  # db_path = ""
+```
+
+In addition, `RASDaemon` runs with the `--enable-sqlite3` flag by default. If you have problems with the SQLite3 database, please verify that this is still the default option.
+
+### Metrics
+
+- ras
+  - tags:
+    - socket_id
+  - fields:
+    - memory_read_corrected_errors
+    - memory_read_uncorrectable_errors
+    - memory_write_corrected_errors
+    - memory_write_uncorrectable_errors
+    - cache_l0_l1_errors
+    - tlb_instruction_errors
+    - cache_l2_errors
+    - upi_errors
+    - processor_base_errors
+    - processor_bus_errors
+    - internal_timer_errors
+    - smm_handler_code_access_violation_errors
+    - internal_parity_errors
+    - frc_errors
+    - external_mce_errors
+    - microcode_rom_parity_errors
+    - unclassified_mce_errors
+
+Please note that `processor_base_errors` is an aggregate counter measuring the following MCE events:
+- internal_timer_errors
+- smm_handler_code_access_violation_errors
+- internal_parity_errors
+- frc_errors
+- external_mce_errors
+- microcode_rom_parity_errors
+- unclassified_mce_errors
+
+### Permissions
+
+This plugin requires access to the SQLite3 database from `RASDaemon`. Please make sure that the user has the required permissions to this database.
+
+### Example Output
+
+```
+ras,host=ubuntu,socket_id=0 cache_l0_l1_errors=7i,external_mce_errors=1i,frc_errors=1i,internal_parity_errors=1i,internal_timer_errors=1i,memory_read_corrected_errors=25i,memory_read_uncorrectable_errors=0i,memory_write_corrected_errors=5i,memory_write_uncorrectable_errors=0i,microcode_rom_parity_errors=1i,processor_base_errors=7i,processor_bus_errors=1i,smm_handler_code_access_violation_errors=1i,tlb_instruction_errors=5i,unclassified_mce_errors=1i 1598867393000000000
+ras,host=ubuntu cache_l2_errors=0i,upi_errors=0i 1598867393000000000
+```
diff --git a/plugins/inputs/ras/ras.go b/plugins/inputs/ras/ras.go
new file mode 100644
index 000000000..036402eb8
--- /dev/null
+++ b/plugins/inputs/ras/ras.go
@@ -0,0 +1,335 @@
+// +build !windows
+
+package ras
+
+import (
+	"database/sql"
+	"strconv"
+	"strings"
+	"time"
+
+	_ "github.com/mattn/go-sqlite3" // registers the "sqlite3" driver used by connectToDB
+
+	"github.com/influxdata/telegraf"
+	"github.com/influxdata/telegraf/plugins/inputs"
+)
+
+// Ras counts Machine Check Errors read from the RASDaemon sqlite3 database.
+type Ras struct {
+	// DbPath is the path to the RASDaemon sqlite3 database file.
+	DbPath string
+	// latestTimestamp is the newest record timestamp processed so far; Gather
+	// only queries records with a later timestamp.
+	latestTimestamp time.Time
+	// cpuSocketCounters holds the per-socket counters, keyed by socket id.
+	cpuSocketCounters map[int]metricCounters
+	// serverCounters holds the counters that are reported without a socket tag.
+	serverCounters metricCounters
+}
+
+// machineCheckError is one row selected from the mce_record table.
+type machineCheckError struct {
+	Id           int
+	Timestamp    string
+	SocketId     int
+	ErrorMsg     string
+	MciStatusMsg string
+}
+
+// metricCounters maps a metric field name to its running count.
+type metricCounters map[string]int64
+
+const (
+	mceQuery = `
+		SELECT
+			id, timestamp, error_msg, mcistatus_msg, socketid
+		FROM mce_record
+		WHERE timestamp > ?
+		`
+	defaultDbPath          = "/var/lib/rasdaemon/ras-mc_event.db"
+	dateLayout             = "2006-01-02 15:04:05 -0700"
+	memoryReadCorrected    = "memory_read_corrected_errors"
+	memoryReadUncorrected  = "memory_read_uncorrectable_errors"
+	memoryWriteCorrected   = "memory_write_corrected_errors"
+	memoryWriteUncorrected = "memory_write_uncorrectable_errors"
+	instructionCache       = "cache_l0_l1_errors"
+	instructionTLB         = "tlb_instruction_errors"
+	levelTwoCache          = "cache_l2_errors"
+	upi                    = "upi_errors"
+	processorBase          = "processor_base_errors"
+	processorBus           = "processor_bus_errors"
+	internalTimer          = "internal_timer_errors"
+	smmHandlerCode         = "smm_handler_code_access_violation_errors"
+	internalParity         = "internal_parity_errors"
+	frc                    = "frc_errors"
+	externalMCEBase        = "external_mce_errors"
+	microcodeROMParity     = "microcode_rom_parity_errors"
+	unclassifiedMCEBase    = "unclassified_mce_errors"
+)
+
+// SampleConfig returns the sample configuration for the plugin.
+func (r *Ras) SampleConfig() string {
+	return `
+  ## Optional path to RASDaemon sqlite3 database.
+  ## Default: /var/lib/rasdaemon/ras-mc_event.db
+  # db_path = ""
+`
+}
+
+// Description returns a one-sentence description of the plugin.
+func (r *Ras) Description() string {
+	return "RAS plugin exposes counter metrics for Machine Check Errors provided by RASDaemon (sqlite3 output is required)."
+}
+
+// Gather reads the mce_record rows newer than the latest timestamp seen so
+// far, folds them into the running counters and reports the per-socket and
+// server counters to acc.
+func (r *Ras) Gather(acc telegraf.Accumulator) error {
+	db, err := connectToDB(r.DbPath)
+	if err != nil {
+		return err
+	}
+	defer db.Close()
+
+	rows, err := db.Query(mceQuery, r.latestTimestamp)
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		mcError, err := fetchMachineCheckError(rows)
+		if err != nil {
+			return err
+		}
+		// A timestamp that cannot be parsed aborts this gather; report that
+		// error to the caller instead of returning nil.
+		if err := r.updateLatestTimestamp(mcError.Timestamp); err != nil {
+			return err
+		}
+		r.updateCounters(mcError)
+	}
+	// rows.Next also returns false on an iteration error, so check for one.
+	if err := rows.Err(); err != nil {
+		return err
+	}
+
+	addCpuSocketMetrics(acc, r.cpuSocketCounters)
+	addServerMetrics(acc, r.serverCounters)
+
+	return nil
+}
+
+// updateLatestTimestamp parses timestamp and stores it when it is later than
+// the latest timestamp recorded so far.
+func (r *Ras) updateLatestTimestamp(timestamp string) error {
+	ts, err := parseDate(timestamp)
+	if err != nil {
+		return err
+	}
+	if ts.After(r.latestTimestamp) {
+		r.latestTimestamp = ts
+	}
+
+	return nil
+}
+
+// updateCounters folds mcError into the socket and server counters. Records
+// whose message contains "No Error" are ignored.
+func (r *Ras) updateCounters(mcError *machineCheckError) {
+	if strings.Contains(mcError.ErrorMsg, "No Error") {
+		return
+	}
+
+	r.initializeCpuMetricDataIfRequired(mcError.SocketId)
+	r.updateSocketCounters(mcError)
+	r.updateServerCounters(mcError)
+}
+
+// newMetricCounters returns a zeroed set of per-socket counters.
+func newMetricCounters() *metricCounters {
+	return &metricCounters{
+		memoryReadCorrected:    0,
+		memoryReadUncorrected:  0,
+		memoryWriteCorrected:   0,
+		memoryWriteUncorrected: 0,
+		instructionCache:       0,
+		instructionTLB:         0,
+		processorBase:          0,
+		processorBus:           0,
+		internalTimer:          0,
+		smmHandlerCode:         0,
+		internalParity:         0,
+		frc:                    0,
+		externalMCEBase:        0,
+		microcodeROMParity:     0,
+		unclassifiedMCEBase:    0,
+	}
+}
+
+// updateServerCounters counts level-2 cache and UPI errors, which are kept in
+// the server counters rather than in the per-socket counters.
+func (r *Ras) updateServerCounters(mcError *machineCheckError) {
+	if strings.Contains(mcError.ErrorMsg, "CACHE Level-2") && strings.Contains(mcError.ErrorMsg, "Error") {
+		r.serverCounters[levelTwoCache]++
+	}
+
+	if strings.Contains(mcError.ErrorMsg, "UPI:") {
+		r.serverCounters[upi]++
+	}
+}
+
+// connectToDB opens the sqlite3 database at the given path.
+func connectToDB(server string) (*sql.DB, error) {
+	return sql.Open("sqlite3", server)
+}
+
+// initializeCpuMetricDataIfRequired creates zeroed counters for socketId the
+// first time that socket is seen.
+func (r *Ras) initializeCpuMetricDataIfRequired(socketId int) {
+	if _, ok := r.cpuSocketCounters[socketId]; !ok {
+		r.cpuSocketCounters[socketId] = *newMetricCounters()
+	}
+}
+
+// updateSocketCounters updates the counters of the socket mcError belongs to.
+func (r *Ras) updateSocketCounters(mcError *machineCheckError) {
+	r.updateMemoryCounters(mcError)
+	r.updateProcessorBaseCounters(mcError)
+
+	if strings.Contains(mcError.ErrorMsg, "Instruction TLB") && strings.Contains(mcError.ErrorMsg, "Error") {
+		r.cpuSocketCounters[mcError.SocketId][instructionTLB]++
+	}
+
+	if strings.Contains(mcError.ErrorMsg, "BUS") && strings.Contains(mcError.ErrorMsg, "Error") {
+		r.cpuSocketCounters[mcError.SocketId][processorBus]++
+	}
+
+	if (strings.Contains(mcError.ErrorMsg, "CACHE Level-0") ||
+		strings.Contains(mcError.ErrorMsg, "CACHE Level-1")) &&
+		strings.Contains(mcError.ErrorMsg, "Error") {
+		r.cpuSocketCounters[mcError.SocketId][instructionCache]++
+	}
+}
+
+// updateProcessorBaseCounters counts the individual processor base errors;
+// each of them also increments the aggregate processor_base_errors counter.
+func (r *Ras) updateProcessorBaseCounters(mcError *machineCheckError) {
+	if strings.Contains(mcError.ErrorMsg, "Internal Timer error") {
+		r.cpuSocketCounters[mcError.SocketId][internalTimer]++
+		r.cpuSocketCounters[mcError.SocketId][processorBase]++
+	}
+
+	if strings.Contains(mcError.ErrorMsg, "SMM Handler Code Access Violation") {
+		r.cpuSocketCounters[mcError.SocketId][smmHandlerCode]++
+		r.cpuSocketCounters[mcError.SocketId][processorBase]++
+	}
+
+	if strings.Contains(mcError.ErrorMsg, "Internal parity error") {
+		r.cpuSocketCounters[mcError.SocketId][internalParity]++
+		r.cpuSocketCounters[mcError.SocketId][processorBase]++
+	}
+
+	if strings.Contains(mcError.ErrorMsg, "FRC error") {
+		r.cpuSocketCounters[mcError.SocketId][frc]++
+		r.cpuSocketCounters[mcError.SocketId][processorBase]++
+	}
+
+	if strings.Contains(mcError.ErrorMsg, "External error") {
+		r.cpuSocketCounters[mcError.SocketId][externalMCEBase]++
+		r.cpuSocketCounters[mcError.SocketId][processorBase]++
+	}
+
+	if strings.Contains(mcError.ErrorMsg, "Microcode ROM parity error") {
+		r.cpuSocketCounters[mcError.SocketId][microcodeROMParity]++
+		r.cpuSocketCounters[mcError.SocketId][processorBase]++
+	}
+
+	if strings.Contains(mcError.ErrorMsg, "Unclassified") || strings.Contains(mcError.ErrorMsg, "Internal unclassified") {
+		r.cpuSocketCounters[mcError.SocketId][unclassifiedMCEBase]++
+		r.cpuSocketCounters[mcError.SocketId][processorBase]++
+	}
+}
+
+// updateMemoryCounters counts memory read/write errors, split into corrected
+// and uncorrectable depending on the MCi status message.
+func (r *Ras) updateMemoryCounters(mcError *machineCheckError) {
+	if strings.Contains(mcError.ErrorMsg, "Memory read error") {
+		if strings.Contains(mcError.MciStatusMsg, "Corrected_error") {
+			r.cpuSocketCounters[mcError.SocketId][memoryReadCorrected]++
+		} else {
+			r.cpuSocketCounters[mcError.SocketId][memoryReadUncorrected]++
+		}
+	}
+	if strings.Contains(mcError.ErrorMsg, "Memory write error") {
+		if strings.Contains(mcError.MciStatusMsg, "Corrected_error") {
+			r.cpuSocketCounters[mcError.SocketId][memoryWriteCorrected]++
+		} else {
+			r.cpuSocketCounters[mcError.SocketId][memoryWriteUncorrected]++
+		}
+	}
+}
+
+// addCpuSocketMetrics reports one "ras" counter metric per socket, tagged
+// with its socket_id.
+func addCpuSocketMetrics(acc telegraf.Accumulator, cpuSocketCounters map[int]metricCounters) {
+	for socketId, data := range cpuSocketCounters {
+		tags := map[string]string{
+			"socket_id": strconv.Itoa(socketId),
+		}
+		fields := make(map[string]interface{})
+
+		for errorName, count := range data {
+			fields[errorName] = count
+		}
+
+		acc.AddCounter("ras", fields, tags)
+	}
+}
+
+// addServerMetrics reports the server counters as a "ras" counter metric
+// with an empty tag set.
+func addServerMetrics(acc telegraf.Accumulator, counters map[string]int64) {
+	fields := make(map[string]interface{})
+	for errorName, count := range counters {
+		fields[errorName] = count
+	}
+
+	acc.AddCounter("ras", fields, map[string]string{})
+}
+
+// fetchMachineCheckError scans the current row of rows into a
+// machineCheckError.
+func fetchMachineCheckError(rows *sql.Rows) (*machineCheckError, error) {
+	mcError := &machineCheckError{}
+	err := rows.Scan(&mcError.Id, &mcError.Timestamp, &mcError.ErrorMsg, &mcError.MciStatusMsg, &mcError.SocketId)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return mcError, nil
+}
+
+// parseDate parses a timestamp in the dateLayout format.
+func parseDate(date string) (time.Time, error) {
+	return time.Parse(dateLayout, date)
+}
+
+func init() {
+	inputs.Add("ras", func() telegraf.Input {
+		// The literal matches dateLayout, so the parse error can be ignored.
+		defaultTimestamp, _ := parseDate("1970-01-01 00:00:01 -0700")
+		return &Ras{
+			DbPath:          defaultDbPath,
+			latestTimestamp: defaultTimestamp,
+			cpuSocketCounters: map[int]metricCounters{
+				0: *newMetricCounters(),
+			},
+			serverCounters: map[string]int64{
+				levelTwoCache: 0,
+				upi:           0,
+			},
+		}
+	})
+}
diff --git a/plugins/inputs/ras/ras_test.go b/plugins/inputs/ras/ras_test.go
new file mode 100644
index 000000000..7b3407421
--- /dev/null
+++ b/plugins/inputs/ras/ras_test.go
@@ -0,0 +1,254 @@
+// +build !windows
+
+package ras
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/influxdata/telegraf/testutil"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestUpdateCounters(t *testing.T) {
+	ras := newRas()
+	for _, mce := range testData {
+		ras.updateCounters(&mce)
+	}
+
+	assert.Equal(t, 1, len(ras.cpuSocketCounters), "Should contain counters only for single socket")
+
+	for metric, value := range ras.cpuSocketCounters[0] {
+		if metric == processorBase {
+			// processor_base_errors is sum of other seven errors: internal_timer_errors, smm_handler_code_access_violation_errors,
+			// internal_parity_errors, frc_errors, external_mce_errors, microcode_rom_parity_errors and unclassified_mce_errors
+			assert.Equal(t, int64(7), value, fmt.Sprintf("%s should have value of 7", processorBase))
+		} else {
+			assert.Equal(t, int64(1), value, fmt.Sprintf("%s should have value of 1", metric))
+		}
+	}
+
+	for metric, value := range ras.serverCounters {
+		assert.Equal(t, int64(1), value, fmt.Sprintf("%s should have value of 1", metric))
+	}
+}
+
+func TestUpdateLatestTimestamp(t *testing.T) {
+	ras := newRas()
+	ts := "2020-08-01 15:13:27 +0200"
+	mces := append(testData[:len(testData):len(testData)], []machineCheckError{
+		{
+			Timestamp:    "2019-05-20 08:25:55 +0200",
+			SocketId:     0,
+			ErrorMsg:     "",
+			MciStatusMsg: "",
+		},
+		{
+			Timestamp:    "2018-02-21 12:27:22 +0200",
+			SocketId:     0,
+			ErrorMsg:     "",
+			MciStatusMsg: "",
+		},
+		{
+			Timestamp:    ts,
+			SocketId:     0,
+			ErrorMsg:     "",
+			MciStatusMsg: "",
+		},
+	}...)
+	for _, mce := range mces {
+		err := ras.updateLatestTimestamp(mce.Timestamp)
+		assert.NoError(t, err)
+	}
+	assert.Equal(t, ts, ras.latestTimestamp.Format(dateLayout))
+}
+
+func TestMultipleSockets(t *testing.T) {
+	ras := newRas()
+	cacheL2 := "Instruction CACHE Level-2 Generic Error"
+	overflow := "Error_overflow Corrected_error"
+	mces := []machineCheckError{
+		{
+			Timestamp:    "2019-05-20 08:25:55 +0200",
+			SocketId:     0,
+			ErrorMsg:     cacheL2,
+			MciStatusMsg: overflow,
+		},
+		{
+			Timestamp:    "2018-02-21 12:27:22 +0200",
+			SocketId:     1,
+			ErrorMsg:     cacheL2,
+			MciStatusMsg: overflow,
+		},
+		{
+			Timestamp:    "2020-03-21 14:17:28 +0200",
+			SocketId:     2,
+			ErrorMsg:     cacheL2,
+			MciStatusMsg: overflow,
+		},
+		{
+			Timestamp:    "2020-03-21 17:24:18 +0200",
+			SocketId:     3,
+			ErrorMsg:     cacheL2,
+			MciStatusMsg: overflow,
+		},
+	}
+	for _, mce := range mces {
+		ras.updateCounters(&mce)
+	}
+	assert.Equal(t, 4, len(ras.cpuSocketCounters), "Should contain counters for four sockets")
+
+	for _, metricData := range ras.cpuSocketCounters {
+		for metric, value := range metricData {
+			if metric == levelTwoCache {
+				assert.Equal(t, int64(1), value, fmt.Sprintf("%s should have value of 1", levelTwoCache))
+			} else {
+				assert.Equal(t, int64(0), value, fmt.Sprintf("%s should have value of 0", metric))
+			}
+		}
+	}
+}
+
+func TestMissingDatabase(t *testing.T) {
+	var acc testutil.Accumulator
+	ras := newRas()
+	ras.DbPath = "/tmp/test.db"
+	err := ras.Gather(&acc)
+	assert.Error(t, err)
+}
+
+func TestEmptyDatabase(t *testing.T) {
+	ras := newRas()
+
+	assert.Equal(t, 1, len(ras.cpuSocketCounters), "Should contain default counters for one socket")
+	assert.Equal(t, 2, len(ras.serverCounters), "Should contain default counters for server")
+
+	for metric, value := range ras.cpuSocketCounters[0] {
+		assert.Equal(t, int64(0), value, fmt.Sprintf("%s should have value of 0", metric))
+	}
+
+	for metric, value := range ras.serverCounters {
+		assert.Equal(t, int64(0), value, fmt.Sprintf("%s should have value of 0", metric))
+	}
+}
+
+func newRas() *Ras {
+	defaultTimestamp, _ := parseDate("1970-01-01 00:00:01 -0700")
+	return &Ras{
+		DbPath:          defaultDbPath,
+		latestTimestamp: defaultTimestamp,
+		cpuSocketCounters: map[int]metricCounters{
+			0: *newMetricCounters(),
+		},
+		serverCounters: map[string]int64{
+			levelTwoCache: 0,
+			upi:           0,
+		},
+	}
+}
+
+var testData = []machineCheckError{
+	{
+		Timestamp:    "2020-05-20 07:34:53 +0200",
+		SocketId:     0,
+		ErrorMsg:     "MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 07:35:11 +0200",
+		SocketId:     0,
+		ErrorMsg:     "MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error",
+		MciStatusMsg: "Uncorrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 07:37:50 +0200",
+		SocketId:     0,
+		ErrorMsg:     "MEMORY CONTROLLER RD_CHANNEL2_ERR Transaction: Memory write error",
+		MciStatusMsg: "Uncorrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:14:51 +0200",
+		SocketId:     0,
+		ErrorMsg:     "MEMORY CONTROLLER WR_CHANNEL2_ERR Transaction: Memory write error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:15:31 +0200",
+		SocketId:     0,
+		ErrorMsg:     "corrected filtering (some unreported errors in same region) Instruction CACHE Level-0 Read Error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:16:32 +0200",
+		SocketId:     0,
+		ErrorMsg:     "Instruction TLB Level-0 Error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:16:56 +0200",
+		SocketId:     0,
+		ErrorMsg:     "No Error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:17:24 +0200",
+		SocketId:     0,
+		ErrorMsg:     "Unclassified",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:17:41 +0200",
+		SocketId:     0,
+		ErrorMsg:     "Microcode ROM parity error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:17:48 +0200",
+		SocketId:     0,
+		ErrorMsg:     "FRC error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:18:18 +0200",
+		SocketId:     0,
+		ErrorMsg:     "Internal parity error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:18:34 +0200",
+		SocketId:     0,
+		ErrorMsg:     "SMM Handler Code Access Violation",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:18:54 +0200",
+		SocketId:     0,
+		ErrorMsg:     "Internal Timer error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:21:23 +0200",
+		SocketId:     0,
+		ErrorMsg:     "BUS Level-3 Generic Generic IO Request-did-not-timeout Error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:23:23 +0200",
+		SocketId:     0,
+		ErrorMsg:     "External error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:25:31 +0200",
+		SocketId:     0,
+		ErrorMsg:     "UPI: COR LL Rx detected CRC error - successful LLR without Phy Reinit",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+	{
+		Timestamp:    "2020-05-20 08:25:55 +0200",
+		SocketId:     0,
+		ErrorMsg:     "Instruction CACHE Level-2 Generic Error",
+		MciStatusMsg: "Error_overflow Corrected_error",
+	},
+}
diff --git a/plugins/inputs/ras/ras_windows.go b/plugins/inputs/ras/ras_windows.go
new file mode 100644
index 000000000..ac7dadd56
--- /dev/null
+++ b/plugins/inputs/ras/ras_windows.go
@@ -0,0 +1,3 @@
+// +build windows
+
+package ras