New input plugin for RAS with fixed GLIBC issue (#8293)

This commit is contained in:
Paweł Żak 2020-10-20 23:59:05 +02:00 committed by GitHub
parent 01230889b4
commit 14a73055f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 665 additions and 0 deletions

View File

@ -73,6 +73,7 @@
- [nsd](/plugins/inputs/nsd/README.md) add nsd input plugin - Contributed by @gearnode
- [opcua](/plugins/inputs/opcua/README.md) Add OPC UA input plugin - Contributed by InfluxData
- [proxmox](/plugins/inputs/proxmox/README.md) Proxmox plugin - Contributed by @effitient
- [ras](/plugins/inputs/ras/README.md) New input plugin for RAS (Reliability, Availability and Serviceability) - Contributed by @p-zak
- [win_eventlog](/plugins/inputs/win_eventlog/README.md) Windows eventlog input plugin - Contributed by @simnv
#### New Output Plugins

View File

@ -94,6 +94,7 @@ following works:
- github.com/kubernetes/apimachinery [Apache License 2.0](https://github.com/kubernetes/apimachinery/blob/master/LICENSE)
- github.com/leodido/ragel-machinery [MIT License](https://github.com/leodido/ragel-machinery/blob/develop/LICENSE)
- github.com/mailru/easyjson [MIT License](https://github.com/mailru/easyjson/blob/master/LICENSE)
- github.com/mattn/go-isatty [MIT License](https://github.com/mattn/go-isatty/blob/master/LICENSE)
- github.com/matttproud/golang_protobuf_extensions [Apache License 2.0](https://github.com/matttproud/golang_protobuf_extensions/blob/master/LICENSE)
- github.com/mdlayher/apcupsd [MIT License](https://github.com/mdlayher/apcupsd/blob/master/LICENSE.md)
- github.com/mdlayher/genetlink [MIT License](https://github.com/mdlayher/genetlink/blob/master/LICENSE.md)
@ -171,6 +172,9 @@ following works:
- gopkg.in/tomb.v1 [BSD 3-Clause Clear License](https://github.com/go-tomb/tomb/blob/v1/LICENSE)
- gopkg.in/yaml.v2 [Apache License 2.0](https://github.com/go-yaml/yaml/blob/v2.2.2/LICENSE)
- gopkg.in/yaml.v3 [Apache License 2.0](https://github.com/go-yaml/yaml/blob/v3/LICENSE)
- modernc.org/libc [BSD 3-Clause "New" or "Revised" License](https://gitlab.com/cznic/libc/-/blob/master/LICENSE)
- modernc.org/memory [BSD 3-Clause "New" or "Revised" License](https://gitlab.com/cznic/memory/-/blob/master/LICENSE)
- modernc.org/sqlite [BSD 3-Clause "New" or "Revised" License](https://gitlab.com/cznic/sqlite/-/blob/master/LICENSE)
## telegraf used and modified code from these projects
- github.com/DataDog/datadog-agent [Apache License 2.0](https://github.com/DataDog/datadog-agent/LICENSE)

1
go.mod
View File

@ -151,6 +151,7 @@ require (
gotest.tools v2.2.0+incompatible // indirect
honnef.co/go/tools v0.0.1-2020.1.3 // indirect
k8s.io/apimachinery v0.17.1 // indirect
modernc.org/sqlite v1.7.4
)
// replaced due to https://github.com/satori/go.uuid/issues/73

18
go.sum
View File

@ -412,6 +412,8 @@ github.com/lib/pq v1.3.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20180717111219-efc7eb8984d6 h1:8/+Y8SKf0xCZ8cCTfnrMdY7HNzlEjPAt3bPjalNb6CA=
github.com/mailru/easyjson v0.0.0-20180717111219-efc7eb8984d6/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY=
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/mdlayher/apcupsd v0.0.0-20190314144147-eb3dd99a75fe h1:yMrL+YorbzaBpj/h3BbLMP+qeslPZYMbzcpHFBNy1Yk=
@ -532,6 +534,8 @@ github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a h1:9ZKAASQSHhD
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 h1:MkV+77GLUNo5oJ0jf870itWm3D0Sjh7+Za9gazKc5LQ=
github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 h1:OdAsTTz6OkFY5QxjkYwrChwuRruF69c169dPK26NUlk=
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/safchain/ethtool v0.0.0-20200218184317-f459e2d13664 h1:gvolwzuDhul9qK6/oHqxCHD5TEYfsWNBGidOeG6kvpk=
github.com/safchain/ethtool v0.0.0-20200218184317-f459e2d13664/go.mod h1:Z0q5wiBQGYcxhMZ6gUqHn6pYNLypFAvaL3UvgZLR0U4=
@ -730,6 +734,7 @@ golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4 h1:sfkvUWPNGwSV+8/fNqctR5lS2AqCSqYwXdrjCxp/dXo=
@ -909,6 +914,7 @@ gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776 h1:tQIYjPdBoyREyB9XMu+nnTclp
gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo=
gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw=
honnef.co/go/netdb v0.0.0-20150201073656-a416d700ae39/go.mod h1:rbNo0ST5hSazCG4rGfpHrwnwvzP1QX62WbhzD+ghGzs=
honnef.co/go/tools v0.0.0-20180728063816-88497007e858/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
@ -924,6 +930,18 @@ k8s.io/klog v0.0.0-20181102134211-b9b56d5dfc92/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUc
k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
k8s.io/kube-openapi v0.0.0-20191107075043-30be4d16710a/go.mod h1:1TqjTSzOxsLGIKfj0lK8EeCP7K1iUG65v09OM0/WG5E=
modernc.org/httpfs v1.0.0 h1:LtuKNg6JMiaBKVQHKd6Phhvk+2GFp+pUcmDQgRjrds0=
modernc.org/httpfs v1.0.0/go.mod h1:BSkfoMUcahSijQD5J/Vu4UMOxzmEf5SNRwyXC4PJBEw=
modernc.org/libc v1.3.1 h1:ZAAaxQZtb94hXvlPMEQybXBLLxEtJlQtVfvLkKOPZ5w=
modernc.org/libc v1.3.1/go.mod h1:f8sp9GAfEyGYh3lsRIKtBh/XwACdFvGznxm6GJmQvXk=
modernc.org/mathutil v1.1.1 h1:FeylZSVX8S+58VsyJlkEj2bcpdytmp9MmDKZkKx8OIE=
modernc.org/mathutil v1.1.1/go.mod h1:mZW8CKdRPY1v87qxC/wUdX5O1qDzXMP5TH3wjfpga6E=
modernc.org/memory v1.0.1 h1:bhVo78NAdgvRD4N+b2hGnAwL5RP2+QyiEJDsX3jpeDA=
modernc.org/memory v1.0.1/go.mod h1:NSjvC08+g3MLOpcAxQbdctcThAEX4YlJ20WWHYEhvRg=
modernc.org/sqlite v1.7.4 h1:pJVbc3NLKENbO1PJ3/uH+kDeuJiTShqc8eZarwANJgU=
modernc.org/sqlite v1.7.4/go.mod h1:xse4RHCm8Fzw0COf5SJqAyiDrVeDwAQthAS1V/woNIA=
modernc.org/tcl v1.4.1 h1:8ERwg+o+EFtrXmXDOVuGGmo+EkEh8Bkokb/ybI3kXPQ=
modernc.org/tcl v1.4.1/go.mod h1:8YCvzidU9SIwkz7RZwlCWK61mhV8X9UwfkRDRp7y5e0=
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=

View File

@ -141,6 +141,7 @@ import (
_ "github.com/influxdata/telegraf/plugins/inputs/puppetagent"
_ "github.com/influxdata/telegraf/plugins/inputs/rabbitmq"
_ "github.com/influxdata/telegraf/plugins/inputs/raindrops"
_ "github.com/influxdata/telegraf/plugins/inputs/ras"
_ "github.com/influxdata/telegraf/plugins/inputs/redfish"
_ "github.com/influxdata/telegraf/plugins/inputs/redis"
_ "github.com/influxdata/telegraf/plugins/inputs/rethinkdb"

View File

@ -0,0 +1,60 @@
# RAS Daemon Input Plugin
This plugin is only available on Linux.
The `RAS` plugin gathers and counts errors provided by [RASDaemon](https://github.com/mchehab/rasdaemon).
### Configuration
```toml
[[inputs.ras]]
## Optional path to RASDaemon sqlite3 database.
## Default: /var/lib/rasdaemon/ras-mc_event.db
# db_path = ""
```
In addition, `RASDaemon` runs with the `--enable-sqlite3` flag by default. If you encounter problems with the SQLite3 database, please verify that this is still the default option.
### Metrics
- ras
- tags:
- socket_id
- fields:
- memory_read_corrected_errors
- memory_read_uncorrectable_errors
- memory_write_corrected_errors
- memory_write_uncorrectable_errors
- cache_l0_l1_errors
- tlb_instruction_errors
- cache_l2_errors
- upi_errors
- processor_base_errors
- processor_bus_errors
- internal_timer_errors
- smm_handler_code_access_violation_errors
- internal_parity_errors
- frc_errors
- external_mce_errors
- microcode_rom_parity_errors
- unclassified_mce_errors
Please note that `processor_base_errors` is an aggregate counter measuring the following MCE events:
- internal_timer_errors
- smm_handler_code_access_violation_errors
- internal_parity_errors
- frc_errors
- external_mce_errors
- microcode_rom_parity_errors
- unclassified_mce_errors
### Permissions
This plugin requires access to the SQLite3 database from `RASDaemon`. Please make sure that the user has the required permissions to this database.
### Example Output
```
ras,host=ubuntu,socket_id=0 external_mce_errors=1i,frc_errors=1i,tlb_instruction_errors=5i,internal_parity_errors=1i,internal_timer_errors=1i,cache_l0_l1_errors=7i,memory_read_corrected_errors=25i,memory_read_uncorrectable_errors=0i,memory_write_corrected_errors=5i,memory_write_uncorrectable_errors=0i,microcode_rom_parity_errors=1i,processor_base_errors=7i,processor_bus_errors=1i,smm_handler_code_access_violation_errors=1i,unclassified_mce_errors=1i 1598867393000000000
ras,host=ubuntu cache_l2_errors=0i,upi_errors=0i 1598867393000000000
```

323
plugins/inputs/ras/ras.go Normal file
View File

@ -0,0 +1,323 @@
// +build linux,!mips,!mipsle,!s390x
package ras
import (
"database/sql"
"fmt"
"os"
"strconv"
"strings"
"time"
_ "modernc.org/sqlite" //to register SQLite driver
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs"
)
// Ras is the input plugin that gathers and counts Machine Check Errors
// recorded by RASDaemon in its SQLite database.
type Ras struct {
// DBPath is the path to the RASDaemon SQLite database (config key db_path).
DBPath string `toml:"db_path"`
// latestTimestamp is the newest record timestamp seen so far; it is passed
// to the query so that only later records are read on the next Gather.
latestTimestamp time.Time `toml:"-"`
// cpuSocketCounters holds one set of counters per CPU socket ID.
cpuSocketCounters map[int]metricCounters `toml:"-"`
// serverCounters holds the counters that are reported without a socket tag.
serverCounters metricCounters `toml:"-"`
}
// machineCheckError is a single row read from the mce_record table.
type machineCheckError struct {
// ID is the value of the id column.
ID int
// Timestamp is the raw timestamp column; it is parsed with dateLayout.
Timestamp string
// SocketID is the value of the socketid column.
SocketID int
// ErrorMsg is the error_msg column; counters are selected by matching
// substrings of this text.
ErrorMsg string
// MciStatusMsg is the mcistatus_msg column; it is checked for
// "Corrected_error" when classifying memory errors.
MciStatusMsg string
}
// metricCounters maps a metric field name to its accumulated error count.
type metricCounters map[string]int64
const (
// mceQuery selects the machine check records newer than the timestamp
// bound to its single placeholder.
mceQuery = `
SELECT
id, timestamp, error_msg, mcistatus_msg, socketid
FROM mce_record
WHERE timestamp > ?
`
// defaultDbPath is the database path used when db_path is not configured.
defaultDbPath = "/var/lib/rasdaemon/ras-mc_event.db"
// dateLayout is the time layout used to parse record timestamps.
dateLayout = "2006-01-02 15:04:05 -0700"
// Field names of the counters kept per CPU socket or per server.
memoryReadCorrected = "memory_read_corrected_errors"
memoryReadUncorrected = "memory_read_uncorrectable_errors"
memoryWriteCorrected = "memory_write_corrected_errors"
memoryWriteUncorrected = "memory_write_uncorrectable_errors"
instructionCache = "cache_l0_l1_errors"
instructionTLB = "tlb_instruction_errors"
levelTwoCache = "cache_l2_errors"
upi = "upi_errors"
// processorBase is incremented together with each of the seven counters
// listed after processorBus below.
processorBase = "processor_base_errors"
processorBus = "processor_bus_errors"
internalTimer = "internal_timer_errors"
smmHandlerCode = "smm_handler_code_access_violation_errors"
internalParity = "internal_parity_errors"
frc = "frc_errors"
externalMCEBase = "external_mce_errors"
microcodeROMParity = "microcode_rom_parity_errors"
unclassifiedMCEBase = "unclassified_mce_errors"
)
// SampleConfig returns the example TOML configuration snippet for the plugin.
func (r *Ras) SampleConfig() string {
	const sampleConfig = `
## Optional path to RASDaemon sqlite3 database.
## Default: /var/lib/rasdaemon/ras-mc_event.db
# db_path = ""
`
	return sampleConfig
}
// Description returns a one-line summary of what the plugin does.
func (r *Ras) Description() string {
	const description = "RAS plugin exposes counter metrics for Machine Check Errors provided by RASDaemon (sqlite3 output is required)."
	return description
}
// Gather reads the machine check records that RASDaemon stored since the
// previous call, updates the in-memory counters and writes them to the
// Accumulator.
//
// It returns an error when the database path is invalid, the database cannot
// be opened or queried, a row cannot be scanned, a record timestamp cannot be
// parsed, or row iteration fails. No metrics are emitted in those cases.
func (r *Ras) Gather(acc telegraf.Accumulator) error {
	err := validateDbPath(r.DBPath)
	if err != nil {
		return err
	}

	db, err := connectToDB(r.DBPath)
	if err != nil {
		return err
	}
	defer db.Close()

	// Only records newer than the latest timestamp already processed are read.
	rows, err := db.Query(mceQuery, r.latestTimestamp)
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		mcError, err := fetchMachineCheckError(rows)
		if err != nil {
			return err
		}
		// Return the parse error itself; the scan error in scope here is
		// always nil at this point.
		if err := r.updateLatestTimestamp(mcError.Timestamp); err != nil {
			return err
		}
		r.updateCounters(mcError)
	}
	// rows.Next returns false both when the rows are exhausted and when the
	// iteration failed, so the two cases have to be told apart explicitly.
	if err := rows.Err(); err != nil {
		return err
	}

	addCPUSocketMetrics(acc, r.cpuSocketCounters)
	addServerMetrics(acc, r.serverCounters)
	return nil
}
// updateLatestTimestamp parses timestamp and stores it as the latest seen
// timestamp when it is later than the current one. A parse failure is
// returned and leaves the stored timestamp untouched.
func (r *Ras) updateLatestTimestamp(timestamp string) error {
	parsed, err := parseDate(timestamp)
	if err == nil && parsed.After(r.latestTimestamp) {
		r.latestTimestamp = parsed
	}
	return err
}
// updateCounters applies one machine check record to the socket and server
// counters. Records whose message contains "No Error" are ignored.
func (r *Ras) updateCounters(mcError *machineCheckError) {
	if isRealError := !strings.Contains(mcError.ErrorMsg, "No Error"); isRealError {
		r.initializeCPUMetricDataIfRequired(mcError.SocketID)
		r.updateSocketCounters(mcError)
		r.updateServerCounters(mcError)
	}
}
// newMetricCounters returns a fresh set of per-socket counters with every
// per-socket metric present and set to zero.
func newMetricCounters() *metricCounters {
	socketMetrics := []string{
		memoryReadCorrected,
		memoryReadUncorrected,
		memoryWriteCorrected,
		memoryWriteUncorrected,
		instructionCache,
		instructionTLB,
		processorBase,
		processorBus,
		internalTimer,
		smmHandlerCode,
		internalParity,
		frc,
		externalMCEBase,
		microcodeROMParity,
		unclassifiedMCEBase,
	}

	counters := make(metricCounters, len(socketMetrics))
	for _, name := range socketMetrics {
		counters[name] = 0
	}
	return &counters
}
// updateServerCounters increments the server-wide counters (level-2 cache and
// UPI) that match the record's error message.
func (r *Ras) updateServerCounters(mcError *machineCheckError) {
	msg := mcError.ErrorMsg

	isL2CacheError := strings.Contains(msg, "CACHE Level-2") && strings.Contains(msg, "Error")
	if isL2CacheError {
		r.serverCounters[levelTwoCache]++
	}

	isUPIError := strings.Contains(msg, "UPI:")
	if isUPIError {
		r.serverCounters[upi]++
	}
}
// validateDbPath checks that dbPath exists and points to a regular file.
//
// It returns a descriptive error when the path does not exist, when its file
// information cannot be obtained, or when it is not a regular file. When the
// file information cannot be obtained the underlying error is wrapped, so
// callers can inspect it with errors.Is / errors.As.
func validateDbPath(dbPath string) error {
	pathInfo, err := os.Stat(dbPath)
	if os.IsNotExist(err) {
		return fmt.Errorf("provided db_path does not exist: [%s]", dbPath)
	}
	if err != nil {
		// %w renders exactly like %v but keeps the error chain intact.
		return fmt.Errorf("cannot get system information for db_path file: [%s] - %w", dbPath, err)
	}
	if !pathInfo.Mode().IsRegular() {
		return fmt.Errorf("provided db_path does not point to a regular file: [%s]", dbPath)
	}
	return nil
}
// connectToDB opens a database handle for the file at dbPath using the
// driver registered under the name "sqlite".
func connectToDB(dbPath string) (*sql.DB, error) {
	const driverName = "sqlite"
	handle, err := sql.Open(driverName, dbPath)
	return handle, err
}
// initializeCPUMetricDataIfRequired creates a zeroed counter set for socketID
// unless one already exists.
func (r *Ras) initializeCPUMetricDataIfRequired(socketID int) {
	_, known := r.cpuSocketCounters[socketID]
	if known {
		return
	}
	r.cpuSocketCounters[socketID] = *newMetricCounters()
}
// updateSocketCounters updates every per-socket counter that the record
// matches: memory and processor-base counters first, then the instruction
// TLB, processor bus and level-0/level-1 cache counters.
func (r *Ras) updateSocketCounters(mcError *machineCheckError) {
	r.updateMemoryCounters(mcError)
	r.updateProcessorBaseCounters(mcError)

	msg := mcError.ErrorMsg
	// All three remaining categories require the word "Error" in the message.
	if !strings.Contains(msg, "Error") {
		return
	}

	counters := r.cpuSocketCounters[mcError.SocketID]
	if strings.Contains(msg, "Instruction TLB") {
		counters[instructionTLB]++
	}
	if strings.Contains(msg, "BUS") {
		counters[processorBus]++
	}
	if strings.Contains(msg, "CACHE Level-0") || strings.Contains(msg, "CACHE Level-1") {
		counters[instructionCache]++
	}
}
// updateProcessorBaseCounters increments each processor-base sub-counter whose
// pattern occurs in the error message, and increments the aggregate
// processor_base_errors counter once per matching sub-counter.
func (r *Ras) updateProcessorBaseCounters(mcError *machineCheckError) {
	msg := mcError.ErrorMsg

	checks := []struct {
		counter string
		matched bool
	}{
		{internalTimer, strings.Contains(msg, "Internal Timer error")},
		{smmHandlerCode, strings.Contains(msg, "SMM Handler Code Access Violation")},
		{internalParity, strings.Contains(msg, "Internal parity error")},
		{frc, strings.Contains(msg, "FRC error")},
		{externalMCEBase, strings.Contains(msg, "External error")},
		{microcodeROMParity, strings.Contains(msg, "Microcode ROM parity error")},
		{unclassifiedMCEBase, strings.Contains(msg, "Unclassified") || strings.Contains(msg, "Internal unclassified")},
	}

	for _, check := range checks {
		if !check.matched {
			continue
		}
		r.cpuSocketCounters[mcError.SocketID][check.counter]++
		r.cpuSocketCounters[mcError.SocketID][processorBase]++
	}
}
// updateMemoryCounters counts memory read and write errors for the record's
// socket. An error counts as corrected when the MCI status message contains
// "Corrected_error", and as uncorrectable otherwise.
func (r *Ras) updateMemoryCounters(mcError *machineCheckError) {
	corrected := strings.Contains(mcError.MciStatusMsg, "Corrected_error")

	if strings.Contains(mcError.ErrorMsg, "Memory read error") {
		counter := memoryReadUncorrected
		if corrected {
			counter = memoryReadCorrected
		}
		r.cpuSocketCounters[mcError.SocketID][counter]++
	}

	if strings.Contains(mcError.ErrorMsg, "Memory write error") {
		counter := memoryWriteUncorrected
		if corrected {
			counter = memoryWriteCorrected
		}
		r.cpuSocketCounters[mcError.SocketID][counter]++
	}
}
// addCPUSocketMetrics emits one "ras" counter metric per CPU socket, tagged
// with socket_id and carrying all counters of that socket as fields.
func addCPUSocketMetrics(acc telegraf.Accumulator, cpuSocketCounters map[int]metricCounters) {
	for id, counters := range cpuSocketCounters {
		fields := make(map[string]interface{}, len(counters))
		for name, value := range counters {
			fields[name] = value
		}
		socketTags := map[string]string{"socket_id": strconv.Itoa(id)}
		acc.AddCounter("ras", fields, socketTags)
	}
}
// addServerMetrics emits a single untagged "ras" counter metric carrying the
// given counters as fields.
func addServerMetrics(acc telegraf.Accumulator, counters map[string]int64) {
	noTags := map[string]string{}
	fields := make(map[string]interface{}, len(counters))
	for name, value := range counters {
		fields[name] = value
	}
	acc.AddCounter("ras", fields, noTags)
}
// fetchMachineCheckError scans the current row into a machineCheckError. The
// destination order matches the column order of mceQuery: id, timestamp,
// error_msg, mcistatus_msg, socketid.
func fetchMachineCheckError(rows *sql.Rows) (*machineCheckError, error) {
	var record machineCheckError
	if err := rows.Scan(
		&record.ID,
		&record.Timestamp,
		&record.ErrorMsg,
		&record.MciStatusMsg,
		&record.SocketID,
	); err != nil {
		return nil, err
	}
	return &record, nil
}
// parseDate parses date using dateLayout.
func parseDate(date string) (time.Time, error) {
	parsed, err := time.Parse(dateLayout, date)
	return parsed, err
}
func init() {
inputs.Add("ras", func() telegraf.Input {
defaultTimestamp, _ := parseDate("1970-01-01 00:00:01 -0700")
return &Ras{
DBPath: defaultDbPath,
latestTimestamp: defaultTimestamp,
cpuSocketCounters: map[int]metricCounters{
0: *newMetricCounters(),
},
serverCounters: map[string]int64{
levelTwoCache: 0,
upi: 0,
},
}
})
}

View File

@ -0,0 +1,3 @@
// +build !linux mips mipsle s390x
package ras

View File

@ -0,0 +1,254 @@
// +build linux,!mips,!mipsle,!s390x
package ras
import (
"fmt"
"testing"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/assert"
)
// TestUpdateCounters feeds every fixture record through updateCounters and
// checks that each counter was hit exactly once, except the aggregate
// processor_base_errors counter.
func TestUpdateCounters(t *testing.T) {
	ras := newRas()
	for i := range testData {
		ras.updateCounters(&testData[i])
	}

	assert.Equal(t, 1, len(ras.cpuSocketCounters), "Should contain counters only for single socket")

	for metric, value := range ras.cpuSocketCounters[0] {
		expected := int64(1)
		message := fmt.Sprintf("%s should have value of 1", metric)
		if metric == processorBase {
			// processor_base_errors is sum of other seven errors: internal_timer_errors, smm_handler_code_access_violation_errors,
			// internal_parity_errors, frc_errors, external_mce_errors, microcode_rom_parity_errors and unclassified_mce_errors
			expected = int64(7)
			message = fmt.Sprintf("%s should have value of 7", processorBase)
		}
		assert.Equal(t, expected, value, message)
	}

	for metric, value := range ras.serverCounters {
		assert.Equal(t, int64(1), value, fmt.Sprintf("%s should have value of 1", metric))
	}
}
// TestUpdateLatestTimestamp checks that the latest timestamp ends up being the
// newest one among all processed records, regardless of processing order.
func TestUpdateLatestTimestamp(t *testing.T) {
	ras := newRas()
	ts := "2020-08-01 15:13:27 +0200"

	// Work on a private copy so the shared fixture is not modified and other
	// tests (or repeated runs) keep seeing the original records.
	records := make([]machineCheckError, 0, len(testData)+3)
	records = append(records, testData...)
	records = append(records,
		machineCheckError{
			Timestamp:    "2019-05-20 08:25:55 +0200",
			SocketID:     0,
			ErrorMsg:     "",
			MciStatusMsg: "",
		},
		machineCheckError{
			Timestamp:    "2018-02-21 12:27:22 +0200",
			SocketID:     0,
			ErrorMsg:     "",
			MciStatusMsg: "",
		},
		machineCheckError{
			Timestamp:    ts,
			SocketID:     0,
			ErrorMsg:     "",
			MciStatusMsg: "",
		},
	)

	for _, mce := range records {
		err := ras.updateLatestTimestamp(mce.Timestamp)
		assert.NoError(t, err)
	}
	assert.Equal(t, ts, ras.latestTimestamp.Format(dateLayout))
}
// TestMultipleSockets checks that records from four different sockets create
// four per-socket counter sets, and that level-2 cache errors are counted in
// the server counters rather than in any per-socket counter.
func TestMultipleSockets(t *testing.T) {
	ras := newRas()
	cacheL2 := "Instruction CACHE Level-2 Generic Error"
	overflow := "Error_overflow Corrected_error"

	// Use a local fixture instead of replacing the shared one, so that tests
	// relying on the shared records are unaffected by execution order.
	records := []machineCheckError{
		{
			Timestamp:    "2019-05-20 08:25:55 +0200",
			SocketID:     0,
			ErrorMsg:     cacheL2,
			MciStatusMsg: overflow,
		},
		{
			Timestamp:    "2018-02-21 12:27:22 +0200",
			SocketID:     1,
			ErrorMsg:     cacheL2,
			MciStatusMsg: overflow,
		},
		{
			Timestamp:    "2020-03-21 14:17:28 +0200",
			SocketID:     2,
			ErrorMsg:     cacheL2,
			MciStatusMsg: overflow,
		},
		{
			Timestamp:    "2020-03-21 17:24:18 +0200",
			SocketID:     3,
			ErrorMsg:     cacheL2,
			MciStatusMsg: overflow,
		},
	}
	for i := range records {
		ras.updateCounters(&records[i])
	}

	assert.Equal(t, 4, len(ras.cpuSocketCounters), "Should contain counters for four sockets")

	// A level-2 cache error matches none of the per-socket categories.
	for _, metricData := range ras.cpuSocketCounters {
		for metric, value := range metricData {
			assert.Equal(t, int64(0), value, fmt.Sprintf("%s should have value of 0", metric))
		}
	}

	// The level-2 cache counter is kept server-wide: one hit per record.
	assert.Equal(t, int64(len(records)), ras.serverCounters[levelTwoCache],
		fmt.Sprintf("%s should have value of %d", levelTwoCache, len(records)))
	assert.Equal(t, int64(0), ras.serverCounters[upi], fmt.Sprintf("%s should have value of 0", upi))
}
// TestMissingDatabase checks that Gather fails when db_path cannot be used.
// NOTE(review): this relies on /tmp/test.db not existing on the test host.
func TestMissingDatabase(t *testing.T) {
	plugin := newRas()
	plugin.DBPath = "/tmp/test.db"

	var acc testutil.Accumulator
	gatherErr := plugin.Gather(&acc)
	assert.Error(t, gatherErr)
}
// TestEmptyDatabase checks the initial state of a new plugin instance: one
// socket and two server counters, all of them zero.
func TestEmptyDatabase(t *testing.T) {
	plugin := newRas()

	socketCount := len(plugin.cpuSocketCounters)
	serverCount := len(plugin.serverCounters)
	assert.Equal(t, 1, socketCount, "Should contain default counters for one socket")
	assert.Equal(t, 2, serverCount, "Should contain default counters for server")

	for name, count := range plugin.cpuSocketCounters[0] {
		assert.Equal(t, int64(0), count, fmt.Sprintf("%s should have value of 0", name))
	}
	for name, count := range plugin.serverCounters {
		assert.Equal(t, int64(0), count, fmt.Sprintf("%s should have value of 0", name))
	}
}
// newRas builds a plugin instance for tests with the default database path,
// the initial timestamp, zeroed counters for socket 0 and zeroed server
// counters.
func newRas() *Ras {
	// The literal matches dateLayout, so the parse error is ignored.
	initialTimestamp, _ := parseDate("1970-01-01 00:00:01 -0700")

	plugin := Ras{
		DBPath:          defaultDbPath,
		latestTimestamp: initialTimestamp,
	}
	plugin.cpuSocketCounters = map[int]metricCounters{
		0: *newMetricCounters(),
	}
	plugin.serverCounters = map[string]int64{
		levelTwoCache: 0,
		upi:           0,
	}
	return &plugin
}
// testData is the shared fixture of machine check records, all on socket 0.
// It holds one record for each counter category (corrected and uncorrected
// memory reads and writes, level-0 cache, instruction TLB, the seven
// processor-base errors, processor bus, UPI and level-2 cache) plus one
// "No Error" record that must not be counted.
var testData = []machineCheckError{
{
Timestamp: "2020-05-20 07:34:53 +0200",
SocketID: 0,
ErrorMsg: "MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 07:35:11 +0200",
SocketID: 0,
ErrorMsg: "MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error",
MciStatusMsg: "Uncorrected_error",
},
{
Timestamp: "2020-05-20 07:37:50 +0200",
SocketID: 0,
ErrorMsg: "MEMORY CONTROLLER RD_CHANNEL2_ERR Transaction: Memory write error",
MciStatusMsg: "Uncorrected_error",
},
{
Timestamp: "2020-05-20 08:14:51 +0200",
SocketID: 0,
ErrorMsg: "MEMORY CONTROLLER WR_CHANNEL2_ERR Transaction: Memory write error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:15:31 +0200",
SocketID: 0,
ErrorMsg: "corrected filtering (some unreported errors in same region) Instruction CACHE Level-0 Read Error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:16:32 +0200",
SocketID: 0,
ErrorMsg: "Instruction TLB Level-0 Error",
MciStatusMsg: "Error_overflow Corrected_error",
},
// Must be skipped by updateCounters.
{
Timestamp: "2020-05-20 08:16:56 +0200",
SocketID: 0,
ErrorMsg: "No Error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:17:24 +0200",
SocketID: 0,
ErrorMsg: "Unclassified",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:17:41 +0200",
SocketID: 0,
ErrorMsg: "Microcode ROM parity error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:17:48 +0200",
SocketID: 0,
ErrorMsg: "FRC error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:18:18 +0200",
SocketID: 0,
ErrorMsg: "Internal parity error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:18:34 +0200",
SocketID: 0,
ErrorMsg: "SMM Handler Code Access Violation",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:18:54 +0200",
SocketID: 0,
ErrorMsg: "Internal Timer error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:21:23 +0200",
SocketID: 0,
ErrorMsg: "BUS Level-3 Generic Generic IO Request-did-not-timeout Error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:23:23 +0200",
SocketID: 0,
ErrorMsg: "External error",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:25:31 +0200",
SocketID: 0,
ErrorMsg: "UPI: COR LL Rx detected CRC error - successful LLR without Phy Reinit",
MciStatusMsg: "Error_overflow Corrected_error",
},
{
Timestamp: "2020-05-20 08:25:55 +0200",
SocketID: 0,
ErrorMsg: "Instruction CACHE Level-2 Generic Error",
MciStatusMsg: "Error_overflow Corrected_error",
},
}