feat(inputs.ceph): Use perf schema to determine metric type (#15233)

2024-05-02 16:24:08 -04:00 · 2024-05-02 16:24:08 -04:00 · 62509daa6c
parent 7ed0364b34
commit 62509daa6c
2 changed files with 208 additions and 4 deletions
--- a/plugins/inputs/ceph/ceph.go
+++ b/plugins/inputs/ceph/ceph.go
@ -44,7 +44,8 @@ type Ceph struct {
 	GatherAdminSocketStats bool   `toml:"gather_admin_socket_stats"`
 	GatherClusterStats     bool   `toml:"gather_cluster_stats"`
-	Log telegraf.Logger `toml:"-"`
+	Log        telegraf.Logger `toml:"-"`
 	schemaMaps map[socket]perfSchemaMap
 }
 func (*Ceph) SampleConfig() string {
@ -73,7 +74,21 @@ func (c *Ceph) gatherAdminSocketStats(acc telegraf.Accumulator) error {
 		return fmt.Errorf("failed to find sockets at path %q: %w", c.SocketDir, err)
 	}
 	if c.schemaMaps == nil {
 		c.schemaMaps = make(map[socket]perfSchemaMap)
 	}
 	for _, s := range sockets {
 		if _, ok := c.schemaMaps[*s]; !ok {
 			rawSchema, err := perfSchema(c.CephBinary, s)
 			if err != nil {
 				c.Log.Warnf("failed to dump perf schema from socket %q: %v", s.socket, err)
 			} else if schema, err := parseSchema(rawSchema); err != nil {
 				c.Log.Warnf("failed to parse perf schema from socket %q: %v", s.socket, err)
 			} else {
 				c.schemaMaps[*s] = schema
 			}
 		}
 		dump, err := perfDump(c.CephBinary, s)
 		if err != nil {
 			acc.AddError(fmt.Errorf("error reading from socket %q: %w", s.socket, err))
@ -85,9 +100,25 @@ func (c *Ceph) gatherAdminSocketStats(acc telegraf.Accumulator) error {
 			continue
 		}
 		for tag, metrics := range data {
-			acc.AddFields(measurement,
+			if schema, ok := c.schemaMaps[*s]; ok {
-				metrics,
+				for name, metric := range metrics {
-				map[string]string{"type": s.sockType, "id": s.sockID, "collection": tag})
+					valueType := schema[tag][name]
 					switch valueType {
 					case telegraf.Counter:
 						acc.AddCounter(measurement,
 							map[string]interface{}{name: metric},
 							map[string]string{"type": s.sockType, "id": s.sockID, "collection": tag})
 					default:
 						acc.AddGauge(measurement,
 							map[string]interface{}{name: metric},
 							map[string]string{"type": s.sockType, "id": s.sockID, "collection": tag})
 					}
 				}
 			} else {
 				acc.AddFields(measurement,
 					metrics,
 					map[string]string{"type": s.sockType, "id": s.sockID, "collection": tag})
 			}
 		}
 	}
 	return nil
@ -136,6 +167,37 @@ func init() {
 	})
 }
 // Run ceph perf schema on the passed socket.  The output is a JSON string
 // mapping collection names to a map of counter names to information.
 //
 // The counter information includes the type of the counter, which determines
 // the names of the final series produced.  For example, a real-integer pair
 // valued metric produces three series: sum, avgcount and avgtime; which hold
 // the sum of all values, the count of all values and the division of these
 // values.
 func perfSchema(binary string, socket *socket) (string, error) {
 	cmdArgs := []string{"--admin-daemon", socket.socket}
 	switch socket.sockType {
 	case typeOsd, typeMds, typeRgw:
 		cmdArgs = append(cmdArgs, "perf", "schema")
 	case typeMon:
 		cmdArgs = append(cmdArgs, "perfcounters_schema")
 	default:
 		return "", fmt.Errorf("ignoring unknown socket type: %s", socket.sockType)
 	}
 	cmd := exec.Command(binary, cmdArgs...)
 	var out bytes.Buffer
 	cmd.Stdout = &out
 	err := cmd.Run()
 	if err != nil {
 		return "", fmt.Errorf("error running ceph schema: %w", err)
 	}
 	return out.String(), nil
 }
 var perfDump = func(binary string, socket *socket) (string, error) {
 	cmdArgs := []string{"--admin-daemon", socket.socket}
@ -233,6 +295,60 @@ type metricMap map[string]interface{}
 type taggedMetricMap map[string]metricMap
 // Mask bits for perf counters
 const (
 	perfCounterNone       = 0
 	perfCounterTime       = 0x1
 	perfCounterU64        = 0x2
 	perfCounterLongRunAvg = 0x4
 	perfCounterCounter    = 0x8
 	perfCounterHistogram  = 0x10
 )
 type rawPerfCounter struct {
 	TypeMask    int    `json:"type"`
 	MetricType  string `json:"metric_type"`
 	ValueType   string `json:"value_type"`
 	Description string `json:"description"`
 	Nick        string `json:"nick"`
 	Priority    int    `json:"priority"`
 	Units       string `json:"units"`
 }
 type rawCollection map[string]rawPerfCounter
 type perfSchemaMap map[string]map[string]telegraf.ValueType
 // Parses the output of ceph perf schema into a useful format, mapping metrics
 // in collections to their Telegraf metric type.  This is made a little more
 // complicated by the need to expand averages into their component metrics.
 func parseSchema(rawSchema string) (perfSchemaMap, error) {
 	rawMap := make(map[string]rawCollection)
 	err := json.Unmarshal([]byte(rawSchema), &rawMap)
 	if err != nil {
 		return nil, fmt.Errorf("failed to parse json: %q: %w", rawSchema, err)
 	}
 	schemaMap := make(perfSchemaMap)
 	for collection, counters := range rawMap {
 		schemaMap[collection] = make(map[string]telegraf.ValueType)
 		for counter, schema := range counters {
 			if schema.TypeMask&perfCounterLongRunAvg != 0 {
 				schemaMap[collection][counter+".sum"] = telegraf.Counter
 				schemaMap[collection][counter+".avgcount"] = telegraf.Counter
 				if schema.TypeMask&perfCounterTime != 0 {
 					schemaMap[collection][counter+".avgtime"] = telegraf.Gauge
 				}
 			} else if schema.TypeMask&perfCounterCounter != 0 {
 				schemaMap[collection][counter] = telegraf.Counter
 			} else {
 				schemaMap[collection][counter] = telegraf.Gauge
 			}
 		}
 	}
 	return schemaMap, nil
 }
 // Parses a raw JSON string into a taggedMetricMap
 // Delegates the actual parsing to newTaggedMetricMap(..)
 func (c *Ceph) parseDump(dump string) (taggedMetricMap, error) {
--- a/plugins/inputs/ceph/ceph_test.go
+++ b/plugins/inputs/ceph/ceph_test.go
@ -10,6 +10,7 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/influxdata/telegraf"
 	"github.com/influxdata/telegraf/testutil"
 )
@ -120,6 +121,25 @@ func TestGather(t *testing.T) {
 	require.NoError(t, c.Gather(acc))
 }
 func TestParseSchema(t *testing.T) {
 	schemaMap, err := parseSchema(osdRawSchema)
 	require.NoError(t, err)
 	// Test Gauge
 	require.Equal(t, telegraf.Counter, schemaMap["osd"]["op"],
 		"op should be a Counter")
 	// Test Counter
 	require.Equal(t, telegraf.Gauge, schemaMap["osd"]["op_wip"],
 		"op_wip should be a Gauge")
 	// Test LongRunAvg
 	require.Equal(t, telegraf.Counter, schemaMap["osd"]["op_latency.avgcount"],
 		"op_latency.avgcount should be a Counter")
 	require.Equal(t, telegraf.Counter, schemaMap["osd"]["op_latency.sum"],
 		"op_latency.sum should be a Counter")
 	require.Equal(t, telegraf.Gauge, schemaMap["osd"]["op_latency.avgtime"],
 		"op_latency.avgtime should be a Gauge")
 }
 func TestFindSockets(t *testing.T) {
 	tmpdir := t.TempDir()
 	c := &Ceph{
@ -1767,6 +1787,74 @@ var rgwPerfDump = `
    }
 }
 `
 var osdRawSchema = `
 {    "osd": {
        "op_wip": {
            "type": 2,
            "metric_type": "gauge",
            "value_type": "integer",
            "description": "Replication operations currently being processed (primary)",
            "nick": "",
            "priority": 5,
            "units": "none"
        },
        "op": {
            "type": 10,
            "metric_type": "counter",
            "value_type": "integer",
            "description": "Client operations",
            "nick": "ops",
            "priority": 10,
            "units": "none"
        },
        "op_in_bytes": {
            "type": 10,
            "metric_type": "counter",
            "value_type": "integer",
            "description": "Client operations total write size",
            "nick": "wr",
            "priority": 8,
            "units": "bytes"
        },
        "op_out_bytes": {
            "type": 10,
            "metric_type": "counter",
            "value_type": "integer",
            "description": "Client operations total read size",
            "nick": "rd",
            "priority": 8,
            "units": "bytes"
        },
        "op_latency": {
 	    "type": 5,
            "metric_type": "gauge",
            "value_type": "real-integer-pair",
            "description": "Latency of client operations (including queue time)",
            "nick": "l",
            "priority": 9,
            "units": "none"
        },
        "op_process_latency": {
            "type": 5,
            "metric_type": "gauge",
            "value_type": "real-integer-pair",
            "description": "Latency of client operations (excluding queue time)",
            "nick": "",
            "priority": 5,
            "units": "none"
        },
        "op_prepare_latency": {
            "type": 5,
            "metric_type": "gauge",
            "value_type": "real-integer-pair",
            "description": "Latency of client operations (excluding queue time and wait for finished)",
            "nick": "",
            "priority": 5,
            "units": "none"
        }
 }
 }
 `
 var clusterStatusDump = `
 {