Add an optional health metric for the sqlserver input plugin (#8544)

This commit is contained in:
Connor Quagliana 2021-03-11 15:07:38 -06:00 committed by GitHub
parent 35b75e959c
commit 30e189df16
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 372 additions and 4 deletions

View File

@ -5859,6 +5859,11 @@
# ## If you are using AzureDB, setting this to true will gather resource utilization metrics
# # azuredb = false
# ## Toggling this to true will emit an additional metric called "sqlserver_telegraf_health".
# ## This metric tracks the count of attempted queries and successful queries for each SQL instance specified in "servers".
# ## The purpose of this metric is to assist with identifying and diagnosing any connectivity or query issues.
# ## This setting/metric is optional and is disabled by default.
# # health_metric = false
# # Gather timeseries from Google Cloud Platform v3 monitoring API
# [[inputs.stackdriver]]

View File

@ -101,6 +101,12 @@ GO
## If you are using AzureDB, setting this to true will gather resource utilization metrics
# azuredb = false
## Toggling this to true will emit an additional metric called "sqlserver_telegraf_health".
## This metric tracks the count of attempted queries and successful queries for each SQL instance specified in "servers".
## The purpose of this metric is to assist with identifying and diagnosing any connectivity or query issues.
## This setting/metric is optional and is disabled by default.
# health_metric = false
## Possible queries across different versions of the collectors
## Queries enabled by default for specific Database Type
@ -323,4 +329,20 @@ Version 2 queries have the following tags:
- `sql_instance`: Physical host and instance name (hostname:instance)
- `database_name`: For Azure SQLDB, database_name denotes the name of the Azure SQL Database as server name is a logical construct.
#### Health Metric
All collection versions (version 1, version 2, and database_type) support an optional plugin health metric called `sqlserver_telegraf_health`. This metric tracks if connections to SQL Server are succeeding or failing. Users can leverage this metric to detect if their SQL Server monitoring is not working as intended.
In the configuration file, toggling `health_metric` to `true` will enable collection of this metric. By default, this value is set to `false` and the metric is not collected. The health metric emits one record for each connection specified by `servers` in the configuration file.
The health metric emits the following tags:
- `sql_instance` - Name of the server specified in the connection string. This value is emitted as-is in the connection string. If the server could not be parsed from the connection string, a constant placeholder value is emitted
- `database_name` - Name of the database (or initial catalog) specified in the connection string. This value is emitted as-is in the connection string. If the database could not be parsed from the connection string, a constant placeholder value is emitted.
The health metric emits the following fields:
- `attempted_queries` - Number of queries that were attempted for this connection
- `successful_queries` - Number of queries that completed successfully for this connection
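- `database_type` - Type of database as specified by `database_type`. If `database_type` is empty (or is not one of the recognized values), a string built from the `QueryVersion` and `AzureDB` settings is emitted instead: `QueryVersion-<version>`, with `-AzureDB` appended when `azuredb` is enabled (for example `QueryVersion-2-AzureDB`).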
If `attempted_queries` and `successful_queries` are not equal for a given connection, some metrics were not successfully gathered for that connection. If `successful_queries` is 0, no metrics were successfully gathered.
[cardinality]: /docs/FAQ.md#user-content-q-how-can-i-manage-series-cardinality

View File

@ -0,0 +1,100 @@
package sqlserver
import (
"net/url"
"strings"
)
// Placeholder values returned by the connection string parsers in this file
// when an identifier cannot be determined from the connection string.
const (
// emptySqlInstance is returned in place of a missing SQL instance (server) name.
emptySqlInstance = "<empty-sql-instance>"
// emptyDatabaseName is returned in place of a missing database name.
emptyDatabaseName = "<empty-database-name>"
)
// getConnectionIdentifiers extracts the SQL instance and database name from a
// connection string.
//
// An empty connection string yields the placeholder values directly. Otherwise
// surrounding whitespace is trimmed and the string is dispatched by prefix:
//   - "odbc:"        -> the prefix is removed and the rest is parsed as "key=value;" pairs
//   - "sqlserver://" -> parsed as a URL
//   - anything else  -> parsed as "key=value;" pairs
//
// The prefix checks are case-sensitive. Identifiers are returned exactly as
// they appear in the connection string; the individual parsers decide when a
// placeholder is substituted.
func getConnectionIdentifiers(connectionString string) (sqlInstance string, databaseName string) {
	if connectionString == "" {
		return emptySqlInstance, emptyDatabaseName
	}

	dsn := strings.TrimSpace(connectionString)
	switch {
	case strings.HasPrefix(dsn, "odbc:"):
		return parseConnectionStringKeyValue(dsn[len("odbc:"):])
	case strings.HasPrefix(dsn, "sqlserver://"):
		return parseConnectionStringURL(dsn)
	default:
		return parseConnectionStringKeyValue(dsn)
	}
}
// parseConnectionStringKeyValue extracts the SQL instance and database name
// from a semicolon-separated "key=value;" connection string.
//
// Keys are lower-cased and trimmed before being compared; only "server" and
// "database" are consulted. Values are trimmed of surrounding whitespace and
// otherwise returned as-is. When a key occurs more than once, the last
// occurrence wins. An identifier that is absent, or whose final value is
// empty, is reported as emptySqlInstance or emptyDatabaseName respectively.
func parseConnectionStringKeyValue(connectionString string) (sqlInstance string, databaseName string) {
	for _, pair := range strings.Split(connectionString, ";") {
		if pair == "" {
			continue
		}

		// Split on the first "=" only so that values may themselves contain "=".
		parts := strings.SplitN(pair, "=", 2)
		name := strings.TrimSpace(strings.ToLower(parts[0]))
		if name == "" {
			continue
		}

		// A pair without "=" is treated as a key with an empty value.
		var val string
		if len(parts) > 1 {
			val = strings.TrimSpace(parts[1])
		}

		switch {
		case strings.EqualFold("server", name):
			sqlInstance = val
		case strings.EqualFold("database", name):
			databaseName = val
		}
	}

	if sqlInstance == "" {
		sqlInstance = emptySqlInstance
	}
	if databaseName == "" {
		databaseName = emptyDatabaseName
	}
	return sqlInstance, databaseName
}
// parseConnectionStringURL extracts the SQL instance and database name from a
// URL-formatted connection string such as
// "sqlserver://user:pass@host:1433/InstanceName?database=dbname".
//
// The SQL instance is the URL host name (without port). If the URL carries a
// path, the path is appended as the instance name, separated by a backslash
// (e.g. "the.host.com\InstanceName"). The database name is taken from the
// "database" query parameter, whose key is matched case-insensitively.
//
// emptySqlInstance is returned when the URL cannot be parsed or has no host,
// and emptyDatabaseName is returned when the URL cannot be parsed or the
// database parameter is missing or empty. The function therefore never
// returns an empty string, in line with parseConnectionStringKeyValue.
func parseConnectionStringURL(connectionString string) (sqlInstance string, databaseName string) {
	sqlInstance = emptySqlInstance
	databaseName = emptyDatabaseName

	u, err := url.Parse(connectionString)
	if err != nil {
		return sqlInstance, databaseName
	}

	// Only replace the placeholder when a host is actually present; an
	// instance path without a host does not identify a server.
	if host := u.Hostname(); host != "" {
		sqlInstance = host
		if len(u.Path) > 1 {
			// There was a SQL instance name specified in addition to the host
			// E.g. "the.host.com:1234/InstanceName" or "the.host.com/InstanceName"
			sqlInstance += "\\" + u.Path[1:]
		}
	}

	for key, values := range u.Query() {
		if !strings.EqualFold("database", key) {
			continue
		}
		// Keep the placeholder for "database=" with no value.
		if len(values) > 0 && values[0] != "" {
			databaseName = values[0]
		}
		break
	}

	return sqlInstance, databaseName
}

View File

@ -21,6 +21,7 @@ type SQLServer struct {
DatabaseType string `toml:"database_type"`
IncludeQuery []string `toml:"include_query"`
ExcludeQuery []string `toml:"exclude_query"`
HealthMetric bool `toml:"health_metric"`
queries MapQuery
isInitialized bool
}
@ -36,8 +37,29 @@ type Query struct {
// MapQuery type
type MapQuery map[string]Query
// HealthMetric holds the query counters collected for a single connection
// string during one Gather call: AttemptedQueries is the number of queries
// that were run, SuccessfulQueries the number that completed without error.
type HealthMetric struct {
	AttemptedQueries, SuccessfulQueries int
}
// defaultServer is a built-in connection string.
// NOTE(review): presumably the fallback used when no servers are configured;
// its use is not visible in this hunk, so confirm.
const defaultServer = "Server=.;app name=telegraf;log=1;"
// Values of SQLServer.DatabaseType (the "database_type" setting) that
// initQueries and getDatabaseTypeToLog compare against.
const (
typeAzureSQLDB = "AzureSQLDB"
typeAzureSQLManagedInstance = "AzureSQLManagedInstance"
typeSQLServer = "SQLServer"
)
// Measurement, tag and field names of the optional health metric emitted by accHealth.
const (
// healthMetricName is the measurement name.
healthMetricName = "sqlserver_telegraf_health"
// Tag keys.
healthMetricInstanceTag = "sql_instance"
healthMetricDatabaseTag = "database_name"
// Field keys.
healthMetricAttemptedQueries = "attempted_queries"
healthMetricSuccessfulQueries = "successful_queries"
healthMetricDatabaseType = "database_type"
)
const sampleConfig = `
## Specify instances to monitor with a list of connection strings.
## All connection parameters are optional.
@ -124,7 +146,7 @@ func initQueries(s *SQLServer) error {
// Constant defintiions for type "AzureSQLDB" start with sqlAzureDB
// Constant defintiions for type "AzureSQLManagedInstance" start with sqlAzureMI
// Constant defintiions for type "SQLServer" start with sqlServer
if s.DatabaseType == "AzureSQLDB" {
if s.DatabaseType == typeAzureSQLDB {
queries["AzureSQLDBResourceStats"] = Query{ScriptName: "AzureSQLDBResourceStats", Script: sqlAzureDBResourceStats, ResultByRow: false}
queries["AzureSQLDBResourceGovernance"] = Query{ScriptName: "AzureSQLDBResourceGovernance", Script: sqlAzureDBResourceGovernance, ResultByRow: false}
queries["AzureSQLDBWaitStats"] = Query{ScriptName: "AzureSQLDBWaitStats", Script: sqlAzureDBWaitStats, ResultByRow: false}
@ -135,7 +157,7 @@ func initQueries(s *SQLServer) error {
queries["AzureSQLDBPerformanceCounters"] = Query{ScriptName: "AzureSQLDBPerformanceCounters", Script: sqlAzureDBPerformanceCounters, ResultByRow: false}
queries["AzureSQLDBRequests"] = Query{ScriptName: "AzureSQLDBRequests", Script: sqlAzureDBRequests, ResultByRow: false}
queries["AzureSQLDBSchedulers"] = Query{ScriptName: "AzureSQLDBSchedulers", Script: sqlAzureDBSchedulers, ResultByRow: false}
} else if s.DatabaseType == "AzureSQLManagedInstance" {
} else if s.DatabaseType == typeAzureSQLManagedInstance {
queries["AzureSQLMIResourceStats"] = Query{ScriptName: "AzureSQLMIResourceStats", Script: sqlAzureMIResourceStats, ResultByRow: false}
queries["AzureSQLMIResourceGovernance"] = Query{ScriptName: "AzureSQLMIResourceGovernance", Script: sqlAzureMIResourceGovernance, ResultByRow: false}
queries["AzureSQLMIDatabaseIO"] = Query{ScriptName: "AzureSQLMIDatabaseIO", Script: sqlAzureMIDatabaseIO, ResultByRow: false}
@ -145,7 +167,7 @@ func initQueries(s *SQLServer) error {
queries["AzureSQLMIPerformanceCounters"] = Query{ScriptName: "AzureSQLMIPerformanceCounters", Script: sqlAzureMIPerformanceCounters, ResultByRow: false}
queries["AzureSQLMIRequests"] = Query{ScriptName: "AzureSQLMIRequests", Script: sqlAzureMIRequests, ResultByRow: false}
queries["AzureSQLMISchedulers"] = Query{ScriptName: "AzureSQLMISchedulers", Script: sqlAzureMISchedulers, ResultByRow: false}
} else if s.DatabaseType == "SQLServer" { //These are still V2 queries and have not been refactored yet.
} else if s.DatabaseType == typeSQLServer { //These are still V2 queries and have not been refactored yet.
queries["SQLServerPerformanceCounters"] = Query{ScriptName: "SQLServerPerformanceCounters", Script: sqlServerPerformanceCounters, ResultByRow: false}
queries["SQLServerWaitStatsCategorized"] = Query{ScriptName: "SQLServerWaitStatsCategorized", Script: sqlServerWaitStatsCategorized, ResultByRow: false}
queries["SQLServerDatabaseIO"] = Query{ScriptName: "SQLServerDatabaseIO", Script: sqlServerDatabaseIO, ResultByRow: false}
@ -222,18 +244,33 @@ func (s *SQLServer) Gather(acc telegraf.Accumulator) error {
}
var wg sync.WaitGroup
var mutex sync.Mutex
var healthMetrics = make(map[string]*HealthMetric)
for _, serv := range s.Servers {
for _, query := range s.queries {
wg.Add(1)
go func(serv string, query Query) {
defer wg.Done()
acc.AddError(s.gatherServer(serv, query, acc))
queryError := s.gatherServer(serv, query, acc)
if s.HealthMetric {
mutex.Lock()
s.gatherHealth(healthMetrics, serv, queryError)
mutex.Unlock()
}
acc.AddError(queryError)
}(serv, query)
}
}
wg.Wait()
if s.HealthMetric {
s.accHealth(healthMetrics, acc)
}
return nil
}
@ -323,6 +360,46 @@ func (s *SQLServer) accRow(query Query, acc telegraf.Accumulator, row scanner) e
return nil
}
// gatherHealth records the outcome of one query in healthMetrics.
//
// The entry for serv is created on first use. Every call counts as one
// attempted query; the call counts as successful only when queryError is nil.
// The map is modified without any locking here, so the caller must serialize
// concurrent calls.
func (s *SQLServer) gatherHealth(healthMetrics map[string]*HealthMetric, serv string, queryError error) {
	stats := healthMetrics[serv]
	if stats == nil {
		stats = &HealthMetric{}
		healthMetrics[serv] = stats
	}

	stats.AttemptedQueries++
	if queryError != nil {
		return
	}
	stats.SuccessfulQueries++
}
// accHealth emits one health metric point per entry of healthMetrics.
//
// Each point is tagged with the SQL instance and database name parsed from the
// connection string (the map key) and carries the attempted/successful query
// counters plus the database type reported by getDatabaseTypeToLog. Points are
// stamped with the time at which they are added.
func (s *SQLServer) accHealth(healthMetrics map[string]*HealthMetric, acc telegraf.Accumulator) {
	for dsn, stats := range healthMetrics {
		instance, database := getConnectionIdentifiers(dsn)

		tags := map[string]string{
			healthMetricInstanceTag: instance,
			healthMetricDatabaseTag: database,
		}

		fields := make(map[string]interface{}, 3)
		fields[healthMetricAttemptedQueries] = stats.AttemptedQueries
		fields[healthMetricSuccessfulQueries] = stats.SuccessfulQueries
		fields[healthMetricDatabaseType] = s.getDatabaseTypeToLog()

		acc.AddFields(healthMetricName, fields, tags, time.Now())
	}
}
// getDatabaseTypeToLog returns the label that describes which kind of database
// this plugin instance monitors.
//
// If DatabaseType is one of the recognized types it is returned unchanged.
// Otherwise the label is "QueryVersion-<QueryVersion>", with "-AzureDB"
// appended when AzureDB is set.
func (s *SQLServer) getDatabaseTypeToLog() string {
	switch s.DatabaseType {
	case typeAzureSQLDB, typeAzureSQLManagedInstance, typeSQLServer:
		return s.DatabaseType
	}

	label := fmt.Sprintf("QueryVersion-%d", s.QueryVersion)
	if !s.AzureDB {
		return label
	}
	return label + "-AzureDB"
}
func (s *SQLServer) Init() error {
if len(s.Servers) == 0 {
log.Println("W! Warning: Server list is empty.")

View File

@ -138,6 +138,7 @@ func TestSqlServer_MultipleInstanceIntegration(t *testing.T) {
require.NoError(t, err)
assert.Equal(t, s.isInitialized, true)
assert.Equal(t, s2.isInitialized, true)
// acc includes size metrics, and excludes memory metrics
assert.False(t, acc.HasMeasurement("Memory breakdown (%)"))
assert.True(t, acc.HasMeasurement("Log size (bytes)"))
@ -147,6 +148,89 @@ func TestSqlServer_MultipleInstanceIntegration(t *testing.T) {
assert.False(t, acc2.HasMeasurement("Log size (bytes)"))
}
// TestSqlServer_MultipleInstanceWithHealthMetricIntegration invokes Gather()
// from two separate configurations, only one of which enables the health
// metric, and confirms they don't interfere with each other.
//
// This test is intentionally similar to TestSqlServer_MultipleInstanceIntegration.
// It is separated to ensure that the health metric code does not affect other metrics.
func TestSqlServer_MultipleInstanceWithHealthMetricIntegration(t *testing.T) {
	// The test needs a SQL Server listening on 127.0.0.1:1433; everything
	// below the Skip call is not executed until the skip is removed.
	t.Skip("Skipping as unable to open tcp connection with host '127.0.0.1:1433'")

	testServer := "Server=127.0.0.1;Port=1433;User Id=SA;Password=ABCabc01;app name=telegraf;log=1"
	s := &SQLServer{
		Servers:      []string{testServer},
		ExcludeQuery: []string{"MemoryClerk"},
	}
	s2 := &SQLServer{
		Servers:      []string{testServer},
		ExcludeQuery: []string{"DatabaseSize"},
		HealthMetric: true,
	}

	var acc, acc2 testutil.Accumulator

	// Gathering with the first instance must not initialize the second one.
	err := s.Gather(&acc)
	require.NoError(t, err)
	assert.True(t, s.isInitialized)
	assert.False(t, s2.isInitialized)

	err = s2.Gather(&acc2)
	require.NoError(t, err)
	assert.True(t, s.isInitialized)
	assert.True(t, s2.isInitialized)

	// acc includes size metrics, and excludes memory metrics and the health metric
	assert.False(t, acc.HasMeasurement(healthMetricName))
	assert.False(t, acc.HasMeasurement("Memory breakdown (%)"))
	assert.True(t, acc.HasMeasurement("Log size (bytes)"))

	// acc2 includes memory metrics and the health metric, and excludes size metrics
	assert.True(t, acc2.HasMeasurement(healthMetricName))
	assert.True(t, acc2.HasMeasurement("Memory breakdown (%)"))
	assert.False(t, acc2.HasMeasurement("Log size (bytes)"))

	// NOTE(review): 9 is presumably the number of queries enabled for the s2
	// configuration; confirm against initQueries when re-enabling this test.
	sqlInstance, database := getConnectionIdentifiers(testServer)
	tags := map[string]string{healthMetricInstanceTag: sqlInstance, healthMetricDatabaseTag: database}
	assert.True(t, acc2.HasPoint(healthMetricName, tags, healthMetricAttemptedQueries, 9))
	assert.True(t, acc2.HasPoint(healthMetricName, tags, healthMetricSuccessfulQueries, 9))
}
// TestSqlServer_HealthMetric verifies that the health metric is emitted only
// when HealthMetric is enabled, and that it reports one point per configured
// connection string with the expected attempted/successful query counts.
func TestSqlServer_HealthMetric(t *testing.T) {
	// Neither string has a "Server=" key, so getConnectionIdentifiers reports
	// the placeholder instance for both; the two points are told apart by
	// their database_name tag.
	fakeServer1 := "localhost\\fakeinstance1;Database=fakedb1"
	fakeServer2 := "localhost\\fakeinstance2;Database=fakedb2"

	s1 := &SQLServer{
		Servers:      []string{fakeServer1, fakeServer2},
		IncludeQuery: []string{"DatabaseSize", "MemoryClerk"},
		HealthMetric: true,
	}
	s2 := &SQLServer{
		Servers:      []string{fakeServer1},
		IncludeQuery: []string{"DatabaseSize"},
	}

	// acc1 should have the health metric because it is specified in the config.
	// Query failures are reported through the accumulator, so Gather itself is
	// expected to succeed; checking it keeps a setup failure from being
	// mistaken for a health metric result.
	var acc1 testutil.Accumulator
	require.NoError(t, s1.Gather(&acc1))
	assert.True(t, acc1.HasMeasurement(healthMetricName))

	// There will be 2 attempted queries (because we specified 2 queries in IncludeQuery)
	// Both queries should fail because the specified SQL instances do not exist
	sqlInstance1, database1 := getConnectionIdentifiers(fakeServer1)
	tags1 := map[string]string{healthMetricInstanceTag: sqlInstance1, healthMetricDatabaseTag: database1}
	assert.True(t, acc1.HasPoint(healthMetricName, tags1, healthMetricAttemptedQueries, 2))
	assert.True(t, acc1.HasPoint(healthMetricName, tags1, healthMetricSuccessfulQueries, 0))

	sqlInstance2, database2 := getConnectionIdentifiers(fakeServer2)
	tags2 := map[string]string{healthMetricInstanceTag: sqlInstance2, healthMetricDatabaseTag: database2}
	assert.True(t, acc1.HasPoint(healthMetricName, tags2, healthMetricAttemptedQueries, 2))
	assert.True(t, acc1.HasPoint(healthMetricName, tags2, healthMetricSuccessfulQueries, 0))

	// acc2 should not have the health metric because it is not specified in the config.
	// Without the error check this assertion would also pass if Gather bailed out early.
	var acc2 testutil.Accumulator
	require.NoError(t, s2.Gather(&acc2))
	assert.False(t, acc2.HasMeasurement(healthMetricName))
}
func TestSqlServer_MultipleInit(t *testing.T) {
s := &SQLServer{}
@ -169,6 +253,86 @@ func TestSqlServer_MultipleInit(t *testing.T) {
assert.Equal(t, s2.isInitialized, true)
}
func TestSqlServer_ConnectionString(t *testing.T) {
// URL format
connectionString := "sqlserver://username:password@hostname.database.windows.net?database=databasename&connection+timeout=30"
sqlInstance, database := getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname.database.windows.net", sqlInstance)
assert.Equal(t, "databasename", database)
connectionString = " sqlserver://hostname2.somethingelse.net:1433?database=databasename2"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname2.somethingelse.net", sqlInstance)
assert.Equal(t, "databasename2", database)
connectionString = "sqlserver://hostname3:1433/SqlInstanceName3?database=databasename3"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname3\\SqlInstanceName3", sqlInstance)
assert.Equal(t, "databasename3", database)
connectionString = " sqlserver://hostname4/SqlInstanceName4?database=databasename4&connection%20timeout=30"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname4\\SqlInstanceName4", sqlInstance)
assert.Equal(t, "databasename4", database)
connectionString = " sqlserver://username:password@hostname5?connection%20timeout=30"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname5", sqlInstance)
assert.Equal(t, emptyDatabaseName, database)
// odbc format
connectionString = "odbc:server=hostname.database.windows.net;user id=sa;database=master;Trusted_Connection=Yes;Integrated Security=true;"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname.database.windows.net", sqlInstance)
assert.Equal(t, "master", database)
connectionString = " odbc:server=192.168.0.1;user id=somethingelse;Integrated Security=true;Database=mydb "
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "192.168.0.1", sqlInstance)
assert.Equal(t, "mydb", database)
connectionString = " odbc:Server=servername\\instancename;Database=dbname;"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "servername\\instancename", sqlInstance)
assert.Equal(t, "dbname", database)
connectionString = "server=hostname2.database.windows.net;user id=sa;Trusted_Connection=Yes;Integrated Security=true;"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname2.database.windows.net", sqlInstance)
assert.Equal(t, emptyDatabaseName, database)
connectionString = "invalid connection string"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, emptySqlInstance, sqlInstance)
assert.Equal(t, emptyDatabaseName, database)
// Key/value format
connectionString = " server=hostname.database.windows.net;user id=sa;database=master;Trusted_Connection=Yes;Integrated Security=true"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname.database.windows.net", sqlInstance)
assert.Equal(t, "master", database)
connectionString = " server=192.168.0.1;user id=somethingelse;Integrated Security=true;Database=mydb;"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "192.168.0.1", sqlInstance)
assert.Equal(t, "mydb", database)
connectionString = "Server=servername\\instancename;Database=dbname; "
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "servername\\instancename", sqlInstance)
assert.Equal(t, "dbname", database)
connectionString = "server=hostname2.database.windows.net;user id=sa;Trusted_Connection=Yes;Integrated Security=true "
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, "hostname2.database.windows.net", sqlInstance)
assert.Equal(t, emptyDatabaseName, database)
connectionString = "invalid connection string"
sqlInstance, database = getConnectionIdentifiers(connectionString)
assert.Equal(t, emptySqlInstance, sqlInstance)
assert.Equal(t, emptyDatabaseName, database)
}
func TestSqlServer_AGQueriesApplicableForDatabaseTypeSQLServer(t *testing.T) {
// This test case checks where Availability Group (AG / HADR) queries return an output when included for processing for DatabaseType = SQLServer
// And they should not be processed when DatabaseType = AzureSQLDB