feat(inputs.vsphere): Adding vSAN extension to vSphere plugin (#11955)

parent e211bd3f28
commit b323d1ce3c

@@ -8,6 +8,7 @@ vCenter servers.

* Resource Pools
* VMs
* Datastores
* vSAN

## Supported versions of vSphere

@@ -177,6 +178,12 @@ to use them.

datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default

## VSAN
# vsan_metric_include = [] ## if omitted or empty, all metrics are collected
# vsan_metric_exclude = [ "*" ] ## vSAN metrics are not collected by default.
## Whether to skip verifying vSAN metrics against the ones from the GetSupportedEntityTypes API.
# vsan_metric_skip_verify = false ## false by default.

## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"

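As a minimal illustration (a sketch, not part of the shipped sample configuration), vSAN collection can be switched on by clearing the exclude list while leaving the include list empty:

```toml
## collect all supported vSAN metrics (vSAN is excluded by default)
vsan_metric_include = []
vsan_metric_exclude = []
```
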
@@ -243,7 +250,7 @@ to use them.

```

NOTE: To disable collection of a specific resource type, simply exclude all
metrics using the XX_metric_exclude. For example, to disable collection of VMs,
add this:

```toml

@@ -251,32 +258,34 @@ vm_metric_exclude = [ "*" ]

```

NOTE: To disable collection of a specific resource type, simply exclude all
metrics using the XX_metric_exclude.
For example, to disable collection of VMs, add this:

### Objects and Metrics per Query

By default, in the vCenter configuration a limit is set to the number of
entities that are included in a performance chart query. Default settings for
vCenter 6.5 and later is 256. Earlier versions of vCenter have this set to 64.
A vCenter administrator can change this setting.
See this [VMware KB article](https://kb.vmware.com/s/article/2107096) for more
information.

Any modification should be reflected in this plugin by modifying the parameter
`max_query_objects`:

```toml
## number of objects to retrieve per query for realtime resources (VMs and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
```

### Collection and Discovery Concurrency

In large vCenter setups it may be prudent to have multiple concurrent go
routines collect performance metrics in order to avoid potential errors for
time elapsed during a collection cycle. This should never be greater than 8,
though the default of 1 (no concurrency) should be sufficient for most
configurations.

For setting up concurrency, modify the `collect_concurrency` and
`discover_concurrency` parameters.

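A minimal sketch of what that looks like (the vCenter address and credentials are placeholders, not values from this commit):

```toml
[[inputs.vsphere]]
vcenters = [ "https://vcenter.example.com/sdk" ]
username = "user@vsphere.local"
password = "secret"

## one concurrent goroutine per ~1500 monitored VMs is a reasonable starting point
collect_concurrency = 2
discover_concurrency = 2
```
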
@@ -289,8 +298,8 @@ For setting up concurrency, modify `collect_concurrency` and

### Inventory Paths

Resources to be monitored can be selected using Inventory Paths. This treats
the vSphere inventory as a tree structure similar to a file system. A vSphere
inventory has a structure similar to this:

```bash

@@ -330,15 +339,15 @@ Often, we want to select a group of resources, such as all the VMs in a
folder. We could use the path `/DC0/vm/Folder1/*` for that.

Another possibility is to select objects using a partial name, such as
`/DC0/vm/Folder1/hadoop*` yielding all VMs in Folder1 with a name starting
with "hadoop".

Finally, due to the arbitrary nesting of the folder structure, we need a
"recursive wildcard" for traversing multiple folders. We use the "**" symbol
for that. If we want to look for a VM with a name starting with "hadoop" in
any folder, we could use the following path: `/DC0/vm/**/hadoop*`

#### Multiple Paths to VMs

As we can see from the example tree above, VMs appear both in their own folder
under the datacenter, as well as under the hosts. This is useful when you'd like

@@ -368,7 +377,7 @@ be traversed.

## Performance Considerations

### Realtime vs. Historical Metrics

vCenter keeps two different kinds of metrics, known as realtime and historical
metrics.

@@ -377,15 +386,15 @@ metrics.

* Historical metrics: Available at a (default) 5 minute, 30 minutes, 2 hours and 24 hours rollup levels. The vSphere Telegraf plugin only uses the most granular rollup which defaults to 5 minutes but can be changed in vCenter to other interval durations. These metrics are stored in the vCenter database and can be expensive and slow to query. Historical metrics are the only type of metrics available for **clusters**, **datastores**, **resource pools** and **datacenters**.

This distinction has an impact on how Telegraf collects metrics. A single
instance of an input plugin can have one and only one collection interval,
which means that you typically set the collection interval based on the most
frequently collected metric. Let's assume you set the collection interval to 1
minute. All realtime metrics will be collected every minute. Since the
historical metrics are only available on a 5 minute interval, the vSphere
Telegraf plugin automatically skips four out of five collection cycles for
these metrics. This works fine in many cases. Problems arise when the
collection of historical metrics takes longer than the collection interval.
This will cause error messages similar to this to appear in the Telegraf logs:

```text
2019-01-16T13:41:10Z W! [agent] input "inputs.vsphere" did not complete within its interval

@@ -394,8 +403,8 @@ error messages similar to this to appear in the Telegraf logs:

This will disrupt the metric collection and can result in missed samples. The
best practice workaround is to specify two instances of the vSphere plugin, one
for the realtime metrics with a short collection interval and one for the
historical metrics with a longer interval. You can use the `*_metric_exclude`
to turn off the resources you don't want to collect metrics for in each
instance. For example:

```toml

@@ -414,6 +423,7 @@ instance. For example:

cluster_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
resourcepool_metric_exclude = ["*"]
vsan_metric_exclude = ["*"]

collect_concurrency = 5
discover_concurrency = 5

@@ -436,14 +446,14 @@ instance. For example:

collect_concurrency = 3
```

### Configuring max_query_metrics Setting

The `max_query_metrics` determines the maximum number of metrics to attempt to
retrieve in one call to vCenter. Generally speaking, a higher number means
faster and more efficient queries. However, the number of allowed metrics in a
query is typically limited in vCenter by the `config.vpxd.stats.maxQueryMetrics`
setting in vCenter. The value defaults to 64 on vSphere 5.5 and earlier and to
256 on more recent versions. The vSphere plugin always checks this setting and
will automatically reduce the number if the limit configured in vCenter is lower
than max_query_metrics in the plugin. This will result in a log message similar
to this:

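For reference, a sketch of the corresponding plugin-side setting; the value mirrors the vCenter default discussed above:

```toml
## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_metrics = 256
```
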
@@ -455,15 +465,15 @@ to this:

You may ask a vCenter administrator to increase this limit to help boost
performance.

### Cluster Metrics and the max_query_metrics Setting

Cluster metrics are handled a bit differently by vCenter. They are aggregated
from ESXi and virtual machine metrics and may not be available when you query
their most recent values. When this happens, vCenter will attempt to perform
that aggregation on the fly. Unfortunately, all the subqueries needed
internally in vCenter to perform this aggregation will count towards
`config.vpxd.stats.maxQueryMetrics`. This means that even a very small query
may result in an error message similar to this:

```text
2018-11-02T13:37:11Z E! Error in plugin [inputs.vsphere]: ServerFaultCode: This operation is restricted by the administrator - 'vpxd.stats.maxQueryMetrics'. Contact your system administrator

@@ -474,22 +484,22 @@ There are two ways of addressing this:

* Ask your vCenter administrator to set `config.vpxd.stats.maxQueryMetrics` to a number that's higher than the total number of virtual machines managed by a vCenter instance.
* Exclude the cluster metrics and use either the basicstats aggregator to calculate sums and averages per cluster or use queries in the visualization tool to obtain the same result.

### Concurrency Settings

The vSphere plugin allows you to specify two concurrency settings:

* `collect_concurrency`: The maximum number of simultaneous queries for performance metrics allowed per resource type.
* `discover_concurrency`: The maximum number of simultaneous queries for resource discovery allowed.

While a higher level of concurrency typically has a positive impact on
performance, increasing these numbers too much can cause performance issues at
the vCenter server. A rule of thumb is to set these parameters to the number of
virtual machines divided by 1500, rounded up to the nearest integer. For
example, monitoring 4000 virtual machines gives ceil(4000 / 1500) = 3.

### Configuring historical_interval Setting

When the vSphere plugin queries vCenter for historical statistics it queries for
statistics that exist at a specific interval. The default historical interval
duration is 5 minutes but if this interval has been changed then you must
override the default query interval in the vSphere plugin, as sketched below.

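A sketch of that override (the 30-minute value is illustrative and must match the most granular rollup interval actually configured in vCenter):

```toml
## the plugin queries the most granular historical rollup;
## override if vCenter was changed from the 5-minute default
# historical_interval = "30m"
```
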
@@ -569,6 +579,230 @@ For a detailed list of commonly available metrics, please refer to

* virtualDisk stats for VM
  * disk (name of virtual disk)

## Add a vSAN extension

A vSAN resource is a special type of resource that can be collected by the
plugin. The configuration of a vSAN resource differs slightly from the
configuration of hosts, VMs, and other resources.

### Prerequisites for vSAN

* vSphere 6.5 and later
* Clusters with vSAN enabled
* [Turn on the Virtual SAN performance service](https://docs.vmware.com/en/VMware-vSphere/6.5/com.vmware.vsphere.virtualsan.doc/GUID-02F67DC3-3D5A-48A4-A445-D2BD6AF2862C.html): When you create a vSAN cluster, the performance service is disabled. To monitor the performance metrics, you must turn on the vSAN performance service.

### vSAN Configuration

```toml
[[inputs.vsphere]]
interval = "300s"
vcenters = ["https://<vcenter-ip>/sdk", "https://<vcenter2-ip>/sdk"]
username = "<user>"
password = "<pwd>"

# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]

# By default all supported entities will be included
vsan_metric_include = [
  "summary.disk-usage",
  "summary.health",
  "summary.resync",
  "performance.cluster-domclient",
  "performance.cluster-domcompmgr",
  "performance.host-domclient",
  "performance.host-domcompmgr",
  "performance.cache-disk",
  "performance.disk-group",
  "performance.capacity-disk",
  "performance.virtual-machine",
  "performance.vscsi",
  "performance.virtual-disk",
  "performance.vsan-host-net",
  "performance.vsan-vnic-net",
  "performance.vsan-pnic-net",
  "performance.vsan-iscsi-host",
  "performance.vsan-iscsi-target",
  "performance.vsan-iscsi-lun",
  "performance.lsom-world-cpu",
  "performance.nic-world-cpu",
  "performance.dom-world-cpu",
  "performance.cmmds-world-cpu",
  "performance.host-cpu",
  "performance.host-domowner",
  "performance.host-memory-slab",
  "performance.host-memory-heap",
  "performance.system-mem",
]
# by default vsan_metric_skip_verify = false
vsan_metric_skip_verify = true
vsan_metric_exclude = [ ]
# vsan_cluster_include = [ "/*/host/**" ] # Inventory path to clusters to collect (by default all are collected)

collect_concurrency = 5
discover_concurrency = 5

## Optional SSL Config
# ssl_ca = "/path/to/cafile"
# ssl_cert = "/path/to/certfile"
# ssl_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false
```

* Use `vsan_metric_include = [...]` to define the vSAN metrics that you want to collect.
  For example, `vsan_metric_include = ["summary.*", "performance.host-domclient", "performance.cache-disk", "performance.disk-group", "performance.capacity-disk"]`.
  To include all supported vSAN metrics, use `vsan_metric_include = [ "*" ]`.
  To disable all vSAN metrics, use `vsan_metric_exclude = [ "*" ]`.

* `vsan_metric_skip_verify` defines whether to skip verifying vSAN metrics against the ones from the [GetSupportedEntityTypes API](https://code.vmware.com/apis/48/vsan#/doc/vim.cluster.VsanPerformanceManager.html#getSupportedEntityTypes).
  This option exists because some performance entities are not returned by the API, but we want to offer the flexibility in case you really need those stats.
  When set to false, anything not in the supported entity list is filtered out.
  When set to true, the queried metrics are exactly those in `vsan_metric_include`, and the exclude array is ignored. By default the value is false.

* `vsan_cluster_include` defines a list of inventory paths that select which vSAN clusters to collect. vSAN metrics are collected only at the cluster level, so these paths work the same way as the inventory paths for [vSphere clusters](README.md#inventory-paths); see the example after this list.

* Many vCenter environments use self-signed certificates. Update the bottom portion of the above configuration and provide proper values for all applicable SSL Config settings that apply in your vSphere environment. In some environments, setting `insecure_skip_verify = true` will be necessary when the SSL certificates are not available.

* To ensure consistent collection in larger vSphere environments, you must increase concurrency for the plugin. Use the `collect_concurrency` setting to control concurrency. Set `collect_concurrency` to the number of virtual machines divided by 1500, rounded up to the nearest integer. For example, for 1200 VMs use 1, and for 2300 VMs use 2.

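For illustration, a sketch of selecting a single cluster by inventory path (the datacenter and cluster names are placeholders):

```toml
## collect vSAN metrics only from Cluster1 in datacenter DC0
vsan_cluster_include = [ "/DC0/host/Cluster1" ]
```
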
### Measurements & Fields

**NOTE**: Depending on the vSAN version, the vSAN performance measurements
and fields may vary.

* vSAN Summary
  * overall_health
  * total_capacity_bytes, free_capacity_bytes
  * total_bytes_to_sync, total_objects_to_sync, total_recovery_eta

* vSAN Performance
  * cluster-domclient
    * iops_read, throughput_read, latency_avg_read, iops_write, throughput_write, latency_avg_write, congestion, oio
  * cluster-domcompmgr
    * iops_read, throughput_read, latency_avg_read, iops_write, throughput_write, latency_avg_write, iops_rec_write, throughput_rec_write, latency_avg_rec_write, congestion, oio, iops_resync_read, tput_resync_read, lat_avg_resync_read
  * host-domclient
    * iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, congestion, oio, client_cache_hits, client_cache_hit_rate
  * host-domcompmgr
    * iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, iops_rec_write, throughput_rec_write, latency_avg_rec_write, rec_write_count, congestion, oio, iops_resync_read, tput_resync_read, lat_avg_resync_read
  * cache-disk
    * iops_dev_read, throughput_dev_read, latency_dev_read, io_count_dev_read, iops_dev_write, throughput_dev_write, latency_dev_write, io_count_dev_write, latency_dev_d_avg, latency_dev_g_avg
  * capacity-disk
    * iops_dev_read, throughput_dev_read, latency_dev_read, io_count_dev_read, iops_dev_write, throughput_dev_write, latency_dev_write, io_count_dev_write, latency_dev_d_avg, latency_dev_g_avg, iops_read, latency_read, io_count_read, iops_write, latency_write, io_count_write
  * disk-group
    * iops_sched, latency_sched, outstanding_bytes_sched, iops_sched_queue_rec, throughput_sched_queue_rec, latency_sched_queue_rec, iops_sched_queue_vm, throughput_sched_queue_vm, latency_sched_queue_vm, iops_sched_queue_meta, throughput_sched_queue_meta, latency_sched_queue_meta, iops_delay_pct_sched, latency_delay_sched, rc_hit_rate, wb_free_pct, war_evictions, quota_evictions, iops_rc_read, latency_rc_read, io_count_rc_read, iops_wb_read, latency_wb_read, io_count_wb_read, iops_rc_write, latency_rc_write, io_count_rc_write, iops_wb_write, latency_wb_write, io_count_wb_write, ssd_bytes_drained, zero_bytes_drained, mem_congestion, slab_congestion, ssd_congestion, iops_congestion, log_congestion, comp_congestion, iops_direct_sched, iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, oio_write, oio_rec_write, oio_write_size, oio_rec_write_size, rc_size, wb_size, capacity, capacity_used, capacity_reserved, throughput_sched, iops_resync_read_policy, iops_resync_read_decom, iops_resync_read_rebalance, iops_resync_read_fix_comp, iops_resync_write_policy, iops_resync_write_decom, iops_resync_write_rebalance, iops_resync_write_fix_comp, tput_resync_read_policy, tput_resync_read_decom, tput_resync_read_rebalance, tput_resync_read_fix_comp, tput_resync_write_policy, tput_resync_write_decom, tput_resync_write_rebalance, tput_resync_write_fix_comp, lat_resync_read_policy, lat_resync_read_decom, lat_resync_read_rebalance, lat_resync_read_fix_comp, lat_resync_write_policy, lat_resync_write_decom, lat_resync_write_rebalance, lat_resync_write_fix_comp
  * virtual-machine
    * iops_read, throughput_read, latency_read_avg, latency_read_stddev, read_count, iops_write, throughput_write, latency_write_avg, latency_write_stddev, write_count
  * vscsi
    * iops_read, throughput_read, latency_read, read_count, iops_write, throughput_write, latency_write, write_count
  * virtual-disk
    * iops_limit, niops, niops_delayed
  * vsan-host-net
    * rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
  * vsan-vnic-net
    * rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
  * vsan-pnic-net
    * rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
  * vsan-iscsi-host
    * iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth
  * vsan-iscsi-target
    * iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth
  * vsan-iscsi-lun
    * iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth

### vSAN Tags

* all vSAN metrics
  * vcenter
  * dcname
  * clustername
  * moid (the cluster's managed object id)
* host-domclient, host-domcompmgr
  * hostname
* disk-group, cache-disk, capacity-disk
  * hostname
  * devicename
  * ssduuid (if SSD)
* vsan-host-net
  * hostname
* vsan-pnic-net
  * pnic
* vsan-vnic-net
  * vnic
  * stackName

### Realtime vs. Historical Metrics in vSAN

vSAN also keeps two different kinds of metrics: realtime and historical
metrics.

* Realtime metrics are metrics with the prefix 'summary'. These metrics are available in realtime.
* Historical metrics are metrics with the prefix 'performance'. These are metrics queried from the vSAN performance API, which is available at a 5-minute rollup level.

For performance reasons, it is better to specify two instances of the plugin:
one for the realtime metrics with a short collection interval, and a second
one for the historical metrics with a longer interval. For example:

```toml
## Realtime instance
[[inputs.vsphere]]
interval = "30s"
vcenters = [ "https://someaddress/sdk" ]
username = "someuser@vsphere.local"
password = "secret"

insecure_skip_verify = true
force_discover_on_init = true

# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]

vsan_metric_include = [ "summary.*" ]
vsan_metric_exclude = [ ]
vsan_metric_skip_verify = false

collect_concurrency = 5
discover_concurrency = 5

## Historical instance
[[inputs.vsphere]]
interval = "300s"
vcenters = [ "https://someaddress/sdk" ]
username = "someuser@vsphere.local"
password = "secret"

insecure_skip_verify = true
force_discover_on_init = true

# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]

vsan_metric_include = [ "performance.*" ]
vsan_metric_exclude = [ ]
vsan_metric_skip_verify = false

collect_concurrency = 5
discover_concurrency = 5
```

## Example Output

```text

@@ -677,3 +911,14 @@ vsphere_host_net,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,
vsphere_host_mem,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,moid=host-30,os=Mac,source=DC0_C0_H0,vcenter=localhost:8989 usage_average=116.21 1535660339000000000
vsphere_host_net,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,moid=host-30,os=Mac,source=DC0_C0_H0,vcenter=localhost:8989 bytesRx_average=726i,bytesTx_average=643i,usage_average=1504i 1535660339000000000
```

## vSAN Sample Output

```text
vsphere_vsan_performance_hostdomclient,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,hostname=DC0_C0_H0,moid=domain-c8,source=Example-VSAN,vcenter=localhost:8898 iops_read=7,write_congestion=0,unmap_congestion=0,read_count=2199,iops=8,latency_max_write=8964,latency_avg_unmap=0,latency_avg_write=1883,write_count=364,num_oio=12623,throughput=564127,client_cache_hits=0,latency_max_read=17821,latency_max_unmap=0,read_congestion=0,latency_avg=1154,congestion=0,throughput_read=554721,latency_avg_read=1033,throughput_write=9406,client_cache_hit_rate=0,iops_unmap=0,throughput_unmap=0,latency_stddev=1315,io_count=2563,oio=4,iops_write=1,unmap_count=0 1578955200000000000
vsphere_vsan_performance_clusterdomcompmgr,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,uuid=XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX,vcenter=localhost:8898 latency_avg_rec_write=0,latency_avg_write=9886,congestion=0,iops_resync_read=0,lat_avg_resync_read=0,iops_read=289,latency_avg_read=1184,throughput_write=50137368,iops_rec_write=0,throughput_rec_write=0,tput_resync_read=0,throughput_read=9043654,iops_write=1272,oio=97 1578954900000000000
vsphere_vsan_performance_clusterdomclient,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,uuid=XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX,vcenter=localhost:8898 latency_avg_write=1011,congestion=0,oio=26,iops_read=6,throughput_read=489093,latency_avg_read=1085,iops_write=43,throughput_write=435142 1578955200000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 total_bytes_to_sync=0i,total_objects_to_sync=0i,total_recovery_eta=0i 1578955489000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 overall_health=1i 1578955489000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 free_capacity_byte=11022535578757i,total_capacity_byte=14102625779712i 1578955488000000000
```

@@ -60,6 +60,7 @@ type Endpoint struct {
    metricNameLookup map[int32]string
    metricNameMux    sync.RWMutex
    log              telegraf.Logger
    apiVersion       string
}

type resourceKind struct {

@@ -237,6 +238,23 @@ func NewEndpoint(ctx context.Context, parent *VSphere, address *url.URL, log tel
            getObjects: getDatastores,
            parent:     "",
        },
        "vsan": {
            name:             "vsan",
            vcName:           "ClusterComputeResource",
            pKey:             "clustername",
            parentTag:        "dcname",
            enabled:          anythingEnabled(parent.VSANMetricExclude),
            realTime:         false,
            sampling:         300,
            objects:          make(objectMap),
            filters:          newFilterOrPanic(parent.VSANMetricInclude, parent.VSANMetricExclude),
            paths:            parent.VSANClusterInclude,
            simple:           parent.VSANMetricSkipVerify,
            include:          parent.VSANMetricInclude,
            collectInstances: false,
            getObjects:       getClusters,
            parent:           "datacenter",
        },
    }

    // Start discover and other goodness

@@ -445,7 +463,10 @@ func (e *Endpoint) discover(ctx context.Context) error {
        return err
    }

    // get the vSphere API version
    e.apiVersion = client.Client.ServiceContent.About.ApiVersion

    e.Parent.Log.Debugf("Discover new objects for %s", e.URL.Host)
    dcNameCache := make(map[string]string)

    numRes := int64(0)

@@ -455,7 +476,7 @@ func (e *Endpoint) discover(ctx context.Context) error {
    for k, res := range e.resourceKinds {
        e.log.Debugf("Discovering resources for %s", res.name)
        // Need to do this for all resource types even if they are not enabled
        if res.enabled || (k != "vm" && k != "vsan") {
            rf := ResourceFilter{
                finder:  &Finder{client},
                resType: res.vcName,

@@ -480,7 +501,8 @@ func (e *Endpoint) discover(ctx context.Context) error {
        }

        // No need to collect metric metadata if resource type is not enabled.
        // vSAN is also skipped since vSAN metadata follows its own format.
        if res.enabled && k != "vsan" {
            if res.simple {
                e.simpleMetadataSelect(ctx, client, res)
            } else {

@@ -935,7 +957,12 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error
        wg.Add(1)
        go func(k string) {
            defer wg.Done()
            var err error
            if k == "vsan" {
                err = e.collectVsan(ctx, acc)
            } else {
                err = e.collectResource(ctx, k, acc)
            }
            if err != nil {
                acc.AddError(err)
            }

@@ -136,6 +136,12 @@
datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default

## VSAN
# vsan_metric_include = [] ## if omitted or empty, all metrics are collected
# vsan_metric_exclude = [ "*" ] ## vSAN metrics are not collected by default.
## Whether to skip verifying vSAN metrics against the ones from the GetSupportedEntityTypes API.
# vsan_metric_skip_verify = false ## false by default.

## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"

@@ -0,0 +1,520 @@
package vsphere

import (
    "context"
    "encoding/json"
    "fmt"
    "strconv"
    "strings"
    "time"

    "github.com/coreos/go-semver/semver"
    "github.com/vmware/govmomi/object"
    "github.com/vmware/govmomi/vim25"
    "github.com/vmware/govmomi/vim25/methods"
    "github.com/vmware/govmomi/vim25/soap"
    "github.com/vmware/govmomi/vim25/types"
    vsanmethods "github.com/vmware/govmomi/vsan/methods"
    vsantypes "github.com/vmware/govmomi/vsan/types"

    "github.com/influxdata/telegraf"
    "github.com/influxdata/telegraf/internal"
)

const (
    vsanNamespace    = "vsan"
    vsanPath         = "/vsanHealth"
    hwMarksKeyPrefix = "vsan-perf-"
    perfPrefix       = "performance."
)

var (
    vsanPerfMetricsName    string
    vsanSummaryMetricsName string
    perfManagerRef         = types.ManagedObjectReference{
        Type:  "VsanPerformanceManager",
        Value: "vsan-performance-manager",
    }
    hyphenReplacer = strings.NewReplacer("-", "")
)

// collectVsan is the entry point for vsan metrics collection
func (e *Endpoint) collectVsan(ctx context.Context, acc telegraf.Accumulator) error {
    if versionLowerThan(e.apiVersion, "5.5") {
        return fmt.Errorf("a minimum API version of 5.5 is required for vSAN. Found: %s. Skipping vCenter: %s", e.apiVersion, e.URL.Host)
    }
    vsanPerfMetricsName = strings.Join([]string{"vsphere", "vsan", "performance"}, e.Parent.Separator)
    vsanSummaryMetricsName = strings.Join([]string{"vsphere", "vsan", "summary"}, e.Parent.Separator)
    res := e.resourceKinds["vsan"]
    client, err := e.clientFactory.GetClient(ctx)
    if err != nil {
        return fmt.Errorf("fail to get client when collecting vsan: %w", err)
    }
    // Create the vSAN client, which talks to the dedicated /vsanHealth endpoint
    vimClient := client.Client.Client
    vsanClient := vimClient.NewServiceClient(vsanPath, vsanNamespace)
    // vSAN metrics to collect
    metrics := e.getVsanMetadata(ctx, vsanClient, res)
    // Iterate over all clusters, run a goroutine for each cluster
    te := NewThrottledExecutor(e.Parent.CollectConcurrency)
    for _, obj := range res.objects {
        obj := obj // capture the loop variable for the goroutine below
        te.Run(ctx, func() {
            e.collectVsanPerCluster(ctx, obj, vimClient, vsanClient, metrics, acc)
        })
    }
    te.Wait()
    return nil
}

// collectVsanPerCluster is called by goroutines in the collectVsan function.
func (e *Endpoint) collectVsanPerCluster(ctx context.Context, clusterRef *objectRef, vimClient *vim25.Client, vsanClient *soap.Client,
    metrics map[string]string, acc telegraf.Accumulator) {
    cluster := object.NewClusterComputeResource(vimClient, clusterRef.ref)
    if !e.vsanEnabled(ctx, cluster) {
        acc.AddError(fmt.Errorf("[vSAN] Fail to identify vSAN for cluster %s. Skipping", clusterRef.name))
        return
    }
    // Do collection
    if _, ok := metrics["summary.disk-usage"]; ok {
        if err := e.queryDiskUsage(ctx, vsanClient, clusterRef, acc); err != nil {
            acc.AddError(fmt.Errorf("error querying disk usage for cluster %s: %w", clusterRef.name, err))
        }
    }
    if _, ok := metrics["summary.health"]; ok {
        if err := e.queryHealthSummary(ctx, vsanClient, clusterRef, acc); err != nil {
            acc.AddError(fmt.Errorf("error querying vsan health summary for cluster %s: %w", clusterRef.name, err))
        }
    }
    if _, ok := metrics["summary.resync"]; ok {
        if err := e.queryResyncSummary(ctx, vsanClient, cluster, clusterRef, acc); err != nil {
            acc.AddError(fmt.Errorf("error querying vsan resync summary for cluster %s: %w", clusterRef.name, err))
        }
    }
    // Construct a map for cmmds; performance entities are tagged with data from it
    cmmds, err := getCmmdsMap(ctx, vimClient, cluster)
    if err != nil {
        e.Parent.Log.Errorf("[vSAN] Error while querying cmmds data. Error: %s. Skipping", err)
        cmmds = make(map[string]CmmdsEntity)
    }
    if err := e.queryPerformance(ctx, vsanClient, clusterRef, metrics, cmmds, acc); err != nil {
        acc.AddError(fmt.Errorf("error querying performance metrics for cluster %s: %w", clusterRef.name, err))
    }
}

// vsanEnabled returns true if vSAN is enabled on the cluster, otherwise false
func (e *Endpoint) vsanEnabled(ctx context.Context, clusterObj *object.ClusterComputeResource) bool {
    config, err := clusterObj.Configuration(ctx)
    if err != nil {
        return false
    }
    enabled := config.VsanConfigInfo.Enabled
    return enabled != nil && *enabled
}

// getVsanMetadata returns a string list of the entity types that will be queried,
// e.g. ["summary.health", "summary.disk-usage", "summary.resync", "performance.cluster-domclient", "performance.host-domclient"]
func (e *Endpoint) getVsanMetadata(ctx context.Context, vsanClient *soap.Client, res *resourceKind) map[string]string {
    metrics := make(map[string]string)
    if res.simple { // Skip getting supported entity types from vCenter. Use the user-defined metrics without verifying.
        for _, entity := range res.include {
            if strings.Contains(entity, "*") {
                e.Parent.Log.Infof("[vSAN] Won't use wildcard match \"*\" when vsan_metric_skip_verify = true. Skipping")
                continue
            }
            metrics[entity] = ""
        }
        return metrics
    }
    // Use the include & exclude configuration to filter all summary metrics
    for _, entity := range []string{"summary.health", "summary.disk-usage", "summary.resync"} {
        if res.filters.Match(entity) {
            metrics[entity] = ""
        }
    }
    resp, err := vsanmethods.VsanPerfGetSupportedEntityTypes(ctx, vsanClient,
        &vsantypes.VsanPerfGetSupportedEntityTypes{
            This: perfManagerRef,
        })
    if err != nil {
        e.Parent.Log.Errorf("[vSAN] Fail to get supported entities: %v. Skipping vsan performance data.", err)
        return metrics
    }
    // Use the include & exclude configuration to filter all supported performance metrics
    for _, entity := range resp.Returnval {
        if res.filters.Match(perfPrefix + entity.Name) {
            metrics[perfPrefix+entity.Name] = ""
        }
    }
    return metrics
}

// getCmmdsMap returns a map which maps a uuid to a CmmdsEntity
func getCmmdsMap(ctx context.Context, client *vim25.Client, clusterObj *object.ClusterComputeResource) (map[string]CmmdsEntity, error) {
    hosts, err := clusterObj.Hosts(ctx)
    if err != nil {
        return nil, fmt.Errorf("fail to get host: %w", err)
    }

    if len(hosts) == 0 {
        return make(map[string]CmmdsEntity), nil
    }

    queries := []types.HostVsanInternalSystemCmmdsQuery{
        {Type: "HOSTNAME"},
        {Type: "DISK"},
    }

    // Some ESXi hosts can be down or in maintenance mode, and the cmmds query might fail on
    // such hosts, so we iterate until we get a proper API response.
    var resp *types.QueryCmmdsResponse
    for _, host := range hosts {
        vis, err := host.ConfigManager().VsanInternalSystem(ctx)
        if err != nil {
            continue
        }
        request := types.QueryCmmds{
            This:    vis.Reference(),
            Queries: queries,
        }
        resp, err = methods.QueryCmmds(ctx, client.RoundTripper, &request)
        if err == nil {
            break
        }
    }
    if resp == nil {
        return nil, fmt.Errorf("all hosts fail to query cmmds")
    }
    var clusterCmmds Cmmds
    if err := json.Unmarshal([]byte(resp.Returnval), &clusterCmmds); err != nil {
        return nil, fmt.Errorf("fail to convert cmmds to json: %w", err)
    }

    cmmdsMap := make(map[string]CmmdsEntity)
    for _, entity := range clusterCmmds.Res {
        cmmdsMap[entity.UUID] = entity
    }
    return cmmdsMap, nil
}

// queryPerformance adds performance metrics to the telegraf accumulator
func (e *Endpoint) queryPerformance(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, metrics map[string]string,
    cmmds map[string]CmmdsEntity, acc telegraf.Accumulator) error {
    end := time.Now().UTC()

    // We're using a fake metric key, since we only store one highwater mark per resource
    start, ok := e.hwMarks.Get(hwMarksKeyPrefix+clusterRef.ref.Value, "generic")
    if !ok {
        // Look back 3 sampling periods by default
        start = end.Add(time.Duration(e.Parent.MetricLookback) * time.Duration(-e.resourceKinds["vsan"].sampling) * time.Second)
    }
    e.Parent.Log.Debugf("[vSAN] Query vsan performance for time interval: %s ~ %s", start, end)
    latest := start

    var commonError error
    for entityRefID := range metrics {
        if !strings.HasPrefix(entityRefID, perfPrefix) {
            continue
        }
        entityRefID = strings.TrimPrefix(entityRefID, perfPrefix)
        var perfSpecs []vsantypes.VsanPerfQuerySpec

        perfSpec := vsantypes.VsanPerfQuerySpec{
            EntityRefId: entityRefID + ":*",
            StartTime:   &start,
            EndTime:     &end,
        }
        perfSpecs = append(perfSpecs, perfSpec)

        perfRequest := vsantypes.VsanPerfQueryPerf{
            This:       perfManagerRef,
            QuerySpecs: perfSpecs,
            Cluster:    &clusterRef.ref,
        }
        resp, err := vsanmethods.VsanPerfQueryPerf(ctx, vsanClient, &perfRequest)
        if err != nil {
            if err.Error() == "ServerFaultCode: NotFound" {
                e.Parent.Log.Errorf("[vSAN] Is vSAN performance service enabled for %s? Skipping ...", clusterRef.name)
                commonError = err
                break
            }
            e.Parent.Log.Errorf("[vSAN] Error querying performance data for %s: %s: %s.", clusterRef.name, entityRefID, err)
            continue
        }
        tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)

        count := 0
        for _, em := range resp.Returnval {
            vals := strings.Split(em.EntityRefId, ":")
            var entityName, uuid string
            if len(vals) == 1 {
                entityName, uuid = vals[0], ""
            } else {
                entityName, uuid = vals[0], vals[1]
            }

            buckets := make(map[string]metricEntry)
            tags := populateCMMDSTags(tags, entityName, uuid, cmmds)
            var timeStamps []time.Time
            // 1. Construct a timestamp list from sample info
            formattedEntityName := hyphenReplacer.Replace(entityName)
            for _, t := range strings.Split(em.SampleInfo, ",") {
                // Parse the input string to a time.Time object
                utcTimeStamp, err := time.Parse("2006-01-02 15:04:05", t)
                if err != nil {
                    e.Parent.Log.Errorf("[vSAN] Failed to parse a timestamp: %s. Skipping", t)
                    timeStamps = append(timeStamps, time.Time{})
                    continue
                }
                timeStamps = append(timeStamps, utcTimeStamp)
            }
            // 2. Iterate on each measurement
            for _, counter := range em.Value {
                metricLabel := internal.SnakeCase(counter.MetricId.Label)
                // 3. Iterate on each data point.
                for i, values := range strings.Split(counter.Values, ",") {
                    ts := timeStamps[i]
                    if ts.IsZero() {
                        continue
                    }
                    // Organize the metrics into a bucket per measurement.
                    bKey := em.EntityRefId + " " + strconv.FormatInt(ts.UnixNano(), 10)
                    bucket, found := buckets[bKey]
                    if !found {
                        mn := vsanPerfMetricsName + e.Parent.Separator + formattedEntityName
                        bucket = metricEntry{name: mn, ts: ts, fields: make(map[string]interface{}), tags: tags}
                        buckets[bKey] = bucket
                    }
                    if v, err := strconv.ParseFloat(values, 32); err == nil {
                        bucket.fields[metricLabel] = v
                    }
                }
            }
            if len(timeStamps) > 0 {
                lastSample := timeStamps[len(timeStamps)-1]
                if lastSample != (time.Time{}) && lastSample.After(latest) {
                    latest = lastSample
                }
            }
            // We've iterated through all the metrics and collected buckets for each measurement name. Now emit them!
            for _, bucket := range buckets {
                acc.AddFields(bucket.name, bucket.fields, bucket.tags, bucket.ts)
            }
            count += len(buckets)
        }
    }
    e.hwMarks.Put(hwMarksKeyPrefix+clusterRef.ref.Value, "generic", latest)
    return commonError
}

// queryDiskUsage adds 'FreeCapacityB' and 'TotalCapacityB' metrics to the telegraf accumulator
func (e *Endpoint) queryDiskUsage(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, acc telegraf.Accumulator) error {
    spaceManagerRef := types.ManagedObjectReference{
        Type:  "VsanSpaceReportSystem",
        Value: "vsan-cluster-space-report-system",
    }
    resp, err := vsanmethods.VsanQuerySpaceUsage(ctx, vsanClient,
        &vsantypes.VsanQuerySpaceUsage{
            This:    spaceManagerRef,
            Cluster: clusterRef.ref,
        })
    if err != nil {
        return err
    }
    fields := map[string]interface{}{
        "free_capacity_byte":  resp.Returnval.FreeCapacityB,
        "total_capacity_byte": resp.Returnval.TotalCapacityB,
    }
    tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
    acc.AddFields(vsanSummaryMetricsName, fields, tags)
    return nil
}

// queryHealthSummary adds the 'OverallHealth' metric to the telegraf accumulator
func (e *Endpoint) queryHealthSummary(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, acc telegraf.Accumulator) error {
    healthSystemRef := types.ManagedObjectReference{
        Type:  "VsanVcClusterHealthSystem",
        Value: "vsan-cluster-health-system",
    }
    fetchFromCache := true
    resp, err := vsanmethods.VsanQueryVcClusterHealthSummary(ctx, vsanClient,
        &vsantypes.VsanQueryVcClusterHealthSummary{
            This:           healthSystemRef,
            Cluster:        &clusterRef.ref,
            Fields:         []string{"overallHealth", "overallHealthDescription"},
            FetchFromCache: &fetchFromCache,
        })
    if err != nil {
        return err
    }
    healthStr := resp.Returnval.OverallHealth
    healthMap := map[string]int{"red": 2, "yellow": 1, "green": 0}
    fields := make(map[string]interface{})
    if val, ok := healthMap[healthStr]; ok {
        fields["overall_health"] = val
    }
    tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
    acc.AddFields(vsanSummaryMetricsName, fields, tags)
    return nil
}

// queryResyncSummary adds resync information to accumulator
func (e *Endpoint) queryResyncSummary(ctx context.Context, vsanClient *soap.Client, clusterObj *object.ClusterComputeResource,
	clusterRef *objectRef, acc telegraf.Accumulator) error {
	if lower := versionLowerThan(e.apiVersion, "6.7"); lower {
		e.Parent.Log.Infof("[vSAN] Minimum API version 6.7 required for resync summary. Found: %s. Skipping vCenter: %s",
			e.apiVersion, e.URL.Host)
		return nil
	}
	hosts, err := clusterObj.Hosts(ctx)
	if err != nil {
		return err
	}
	if len(hosts) == 0 {
		return nil
	}
	hostRefValue := hosts[0].Reference().Value
	hostRefValueParts := strings.Split(hostRefValue, "-")
	if len(hostRefValueParts) != 2 {
		e.Parent.Log.Errorf("[vSAN] Host reference value does not match expected pattern: host-<num>. Actual value: %s", hostRefValue)
		return fmt.Errorf("unexpected host reference value %q", hostRefValue)
	}
	vsanSystemEx := types.ManagedObjectReference{
		Type:  "VsanSystemEx",
		Value: fmt.Sprintf("vsanSystemEx-%s", hostRefValueParts[1]),
	}

	includeSummary := true
	request := vsantypes.VsanQuerySyncingVsanObjects{
		This:           vsanSystemEx,
		Uuids:          []string{}, // We only need summary information.
		Start:          0,
		IncludeSummary: &includeSummary,
	}

	resp, err := vsanmethods.VsanQuerySyncingVsanObjects(ctx, vsanClient, &request)
	if err != nil {
		return err
	}
	fields := make(map[string]interface{})
	fields["total_bytes_to_sync"] = resp.Returnval.TotalBytesToSync
	fields["total_objects_to_sync"] = resp.Returnval.TotalObjectsToSync
	fields["total_recovery_eta"] = resp.Returnval.TotalRecoveryETA
	tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
	acc.AddFields(vsanSummaryMetricsName, fields, tags)
	return nil
}

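The `VsanSystemEx` managed object is addressed per host by splicing the numeric suffix of the host's moref into a `vsanSystemEx-<num>` value. A standalone sketch of that derivation (the sample moref is made up):

```go
package main

import (
	"fmt"
	"strings"
)

// vsanSystemExRef derives the per-host VsanSystemEx moref value from a host
// moref such as "host-4021" -> "vsanSystemEx-4021".
func vsanSystemExRef(hostRefValue string) (string, error) {
	parts := strings.Split(hostRefValue, "-")
	if len(parts) != 2 {
		return "", fmt.Errorf("unexpected host reference value %q", hostRefValue)
	}
	return "vsanSystemEx-" + parts[1], nil
}

func main() {
	ref, err := vsanSystemExRef("host-4021") // hypothetical moref
	fmt.Println(ref, err)                    // vsanSystemEx-4021 <nil>
}
```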
// populateClusterTags takes in a tag map, makes a copy, populates cluster related tags and returns the copy.
func populateClusterTags(tags map[string]string, clusterRef *objectRef, vcenter string) map[string]string {
	newTags := make(map[string]string)
	// deep copy
	for k, v := range tags {
		newTags[k] = v
	}
	newTags["vcenter"] = vcenter
	newTags["dcname"] = clusterRef.dcname
	newTags["clustername"] = clusterRef.name
	newTags["moid"] = clusterRef.ref.Value
	newTags["source"] = clusterRef.name
	return newTags
}

// populateCMMDSTags takes in a tag map, makes a copy, adds more tags using a cmmds map and returns the copy.
func populateCMMDSTags(tags map[string]string, entityName string, uuid string, cmmds map[string]CmmdsEntity) map[string]string {
	newTags := make(map[string]string)
	// deep copy
	for k, v := range tags {
		newTags[k] = v
	}
	// There are cases when the uuid is missing (usually when the performance service has just been enabled or disabled).
	// This check avoids an index-out-of-range error below.
	if uuid == "*" || uuid == "" {
		return newTags
	}
	// Add additional tags based on CMMDS data
	switch {
	case strings.Contains(entityName, "-disk") || strings.Contains(entityName, "disk-"):
		if e, ok := cmmds[uuid]; ok {
			if host, ok := cmmds[e.Owner]; ok {
				newTags["hostname"] = host.Content.Hostname
			}
			newTags["devicename"] = e.Content.DevName
			if int(e.Content.IsSsd) == 0 {
				newTags["ssduuid"] = e.Content.SsdUUID
			}
		}
	case strings.Contains(entityName, "host-memory-"):
		memInfo := strings.Split(uuid, "|")
		if strings.Contains(entityName, "-slab") && len(memInfo) > 1 {
			newTags["slabname"] = memInfo[1]
		}
		if strings.Contains(entityName, "-heap") && len(memInfo) > 1 {
			newTags["heapname"] = memInfo[1]
		}
		if e, ok := cmmds[memInfo[0]]; ok {
			newTags["hostname"] = e.Content.Hostname
		}
	case strings.Contains(entityName, "host-") || strings.Contains(entityName, "system-mem"):
		if e, ok := cmmds[uuid]; ok {
			newTags["hostname"] = e.Content.Hostname
		}
	case strings.Contains(entityName, "vnic-net"):
		nicInfo := strings.Split(uuid, "|")
		if len(nicInfo) > 2 {
			newTags["stackname"] = nicInfo[1]
			newTags["vnic"] = nicInfo[2]
		}
		if e, ok := cmmds[nicInfo[0]]; ok {
			newTags["hostname"] = e.Content.Hostname
		}
	case strings.Contains(entityName, "pnic-net"):
		nicInfo := strings.Split(uuid, "|")
		if len(nicInfo) > 1 {
			newTags["pnic"] = nicInfo[1]
		}
		if e, ok := cmmds[nicInfo[0]]; ok {
			newTags["hostname"] = e.Content.Hostname
		}
	case strings.Contains(entityName, "world-cpu"):
		cpuInfo := strings.Split(uuid, "|")
		if len(cpuInfo) > 1 {
			newTags["worldname"] = cpuInfo[1]
		}
		if e, ok := cmmds[cpuInfo[0]]; ok {
			newTags["hostname"] = e.Content.Hostname
		}
	default:
		// If no tags were added above, fall back to tagging with the raw uuid.
		if len(newTags) == len(tags) {
			newTags["uuid"] = uuid
		}
	}
	return newTags
}

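For the network and CPU entity types, the `uuid` argument is actually a compound key with `|`-separated segments (host uuid first, then stack/nic/world names). A usage sketch, in the same package as `populateCMMDSTags` and with invented uuids and hostnames, showing how a `vnic-net` entity picks up `stackname` and `vnic` tags:

```go
// In-package sketch (assumes "fmt" is imported); all identifiers below are made up.
func ExamplePopulateCMMDSTagsVnic() {
	host := "11111111-2222-3333-4444-555555555555"
	cmmds := map[string]CmmdsEntity{
		host: {UUID: host, Type: "HOSTNAME", Content: CmmdsContent{Hostname: "esx-01.example.org"}},
	}
	// Compound uuid layout for vnic-net entities: "<host uuid>|<tcpip stack>|<vmkernel nic>"
	tags := populateCMMDSTags(make(map[string]string), "vnic-net", host+"|defaultTcpipStack|vmk0", cmmds)
	fmt.Println(tags["hostname"], tags["stackname"], tags["vnic"])
	// Output: esx-01.example.org defaultTcpipStack vmk0
}
```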
// versionLowerThan returns true if the current version < the base version
func versionLowerThan(current string, base string) bool {
	v1 := semver.New(current)
	v2 := semver.New(base)
	return v1.LessThan(*v2)
}

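A quick illustration of the comparison, assuming the coreos/go-semver package; note that semver parsers generally expect full major.minor.patch strings, so two-segment API versions such as "6.7" may need normalization before being passed in:

```go
package main

import (
	"fmt"

	"github.com/coreos/go-semver/semver"
)

func versionLowerThan(current, base string) bool {
	v1 := semver.New(current)
	v2 := semver.New(base)
	return v1.LessThan(*v2)
}

func main() {
	// Three-segment strings parse cleanly; "6.7" alone may not, depending on
	// the semver library, so pad API versions before comparing.
	fmt.Println(versionLowerThan("6.5.0", "6.7.0")) // true  -> skip resync summary
	fmt.Println(versionLowerThan("7.0.3", "6.7.0")) // false -> collect it
}
```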
type CmmdsEntity struct {
	UUID    string       `json:"uuid"`
	Owner   string       `json:"owner"` // ESXi UUID
	Type    string       `json:"type"`
	Content CmmdsContent `json:"content"`
}

type Cmmds struct {
	Res []CmmdsEntity `json:"result"`
}

type CmmdsContent struct {
	Hostname string  `json:"hostname"`
	IsSsd    float64 `json:"isSsd"`
	SsdUUID  string  `json:"ssdUuid"`
	DevName  string  `json:"devName"`
}

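These structs mirror the JSON shape of the CMMDS query result. A standalone sketch decoding a fabricated payload (the uuids and device name are invented):

```go
package main

import (
	"encoding/json"
	"fmt"
)

type CmmdsContent struct {
	Hostname string  `json:"hostname"`
	IsSsd    float64 `json:"isSsd"`
	SsdUUID  string  `json:"ssdUuid"`
	DevName  string  `json:"devName"`
}

type CmmdsEntity struct {
	UUID    string       `json:"uuid"`
	Owner   string       `json:"owner"` // ESXi UUID
	Type    string       `json:"type"`
	Content CmmdsContent `json:"content"`
}

type Cmmds struct {
	Res []CmmdsEntity `json:"result"`
}

func main() {
	// Fabricated CMMDS response for illustration only.
	payload := `{"result": [
		{"uuid": "52ee3be1-0000-0000-0000-000000000001",
		 "owner": "5b860329-0000-0000-0000-000000000002",
		 "type": "DISK",
		 "content": {"isSsd": 1, "devName": "naa.5000000000000000:2"}}
	]}`

	var c Cmmds
	if err := json.Unmarshal([]byte(payload), &c); err != nil {
		panic(err)
	}
	fmt.Printf("%s owned by %s (%s)\n", c.Res[0].UUID, c.Res[0].Owner, c.Res[0].Content.DevName)
}
```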
@ -55,6 +55,10 @@ type VSphere struct {
	DatastoreMetricExclude []string
	DatastoreInclude       []string
	DatastoreExclude       []string
	VSANMetricInclude      []string `toml:"vsan_metric_include"`
	VSANMetricExclude      []string `toml:"vsan_metric_exclude"`
	VSANMetricSkipVerify   bool     `toml:"vsan_metric_skip_verify"`
	VSANClusterInclude     []string `toml:"vsan_cluster_include"`
	Separator              string
	CustomAttributeInclude []string
	CustomAttributeExclude []string

@ -62,15 +66,14 @@ type VSphere struct {
	IPAddresses                 []string
	MetricLookback              int
	DisconnectedServersBehavior string
	MaxQueryObjects             int
	MaxQueryMetrics             int
	CollectConcurrency          int
	DiscoverConcurrency         int
	ForceDiscoverOnInit         bool `toml:"force_discover_on_init" deprecated:"1.14.0;option is ignored"`
	ObjectDiscoveryInterval     config.Duration
	Timeout                     config.Duration
	HistoricalInterval          config.Duration

	endpoints []*Endpoint
	cancel    context.CancelFunc

@ -155,38 +158,40 @@ func (v *VSphere) Gather(acc telegraf.Accumulator) error {
func init() {
	inputs.Add("vsphere", func() telegraf.Input {
		return &VSphere{
			Vcenters:                  []string{},
			DatacenterInstances:       false,
			DatacenterMetricInclude:   nil,
			DatacenterMetricExclude:   nil,
			DatacenterInclude:         []string{"/*"},
			ClusterInstances:          false,
			ClusterMetricInclude:      nil,
			ClusterMetricExclude:      nil,
			ClusterInclude:            []string{"/*/host/**"},
			HostInstances:             true,
			HostMetricInclude:         nil,
			HostMetricExclude:         nil,
			HostInclude:               []string{"/*/host/**"},
			ResourcePoolInstances:     false,
			ResourcePoolMetricInclude: nil,
			ResourcePoolMetricExclude: nil,
			ResourcePoolInclude:       []string{"/*/host/**"},
			VMInstances:               true,
			VMMetricInclude:           nil,
			VMMetricExclude:           nil,
			VMInclude:                 []string{"/*/vm/**"},
			DatastoreInstances:        false,
			DatastoreMetricInclude:    nil,
			DatastoreMetricExclude:    nil,
			DatastoreInclude:          []string{"/*/datastore/**"},
			VSANMetricInclude:         nil,
			VSANMetricExclude:         []string{"*"},
			VSANMetricSkipVerify:      false,
			VSANClusterInclude:        []string{"/*/host/**"},
			Separator:                 "_",
			CustomAttributeInclude:    []string{},
			CustomAttributeExclude:    []string{"*"},
			UseIntSamples:             true,
			IPAddresses:               []string{},
			MaxQueryObjects:           256,
			MaxQueryMetrics:           256,
			CollectConcurrency:        1,

@ -132,16 +132,15 @@ func defaultVSphere() *VSphere {
		DatacenterInclude: []string{"/**"},
		ClientConfig:      itls.ClientConfig{InsecureSkipVerify: true},

		MaxQueryObjects:         256,
		MaxQueryMetrics:         256,
		ObjectDiscoveryInterval: config.Duration(time.Second * 300),
		Timeout:                 config.Duration(time.Second * 20),
		ForceDiscoverOnInit:     true,
		DiscoverConcurrency:     1,
		CollectConcurrency:      1,
		Separator:               ".",
		HistoricalInterval:      config.Duration(time.Second * 300),
	}
}

@ -414,12 +413,46 @@ func TestFolders(t *testing.T) {
	testLookupVM(ctx, t, &f, "/F0/DC1/vm/**/F*/**", 4, "")
}

func TestVsanCmmds(t *testing.T) {
	m, s, err := createSim(0)
	require.NoError(t, err)
	defer m.Remove()
	defer s.Close()

	v := defaultVSphere()
	ctx := context.Background()

	c, err := NewClient(ctx, s.URL, v)
	require.NoError(t, err)

	f := Finder{c}
	var clusters []mo.ClusterComputeResource
	err = f.FindAll(ctx, "ClusterComputeResource", []string{"/**"}, []string{}, &clusters)
	require.NoError(t, err)

	clusterObj := object.NewClusterComputeResource(c.Client.Client, clusters[0].Reference())
	_, err = getCmmdsMap(ctx, c.Client.Client, clusterObj)
	require.Error(t, err)
}

func TestVsanTags(t *testing.T) {
	host := "5b860329-3bc4-a76c-48b6-246e963cfcc0"
	disk := "52ee3be1-47cc-b50d-ecab-01af0f706381"
	ssdDisk := "52f26fc8-0b9b-56d8-3a32-a9c3bfbc6148"
	ssd := "52173131-3384-bb63-4ef8-c00b0ce7e3e7"
	hostname := "sc2-hs1-b2801.eng.vmware.com"
	devName := "naa.55cd2e414d82c815:2"
	var cmmds = map[string]CmmdsEntity{
		disk:    {UUID: disk, Type: "DISK", Owner: host, Content: CmmdsContent{DevName: devName, IsSsd: 1.}},
		ssdDisk: {UUID: ssdDisk, Type: "DISK", Owner: host, Content: CmmdsContent{DevName: devName, IsSsd: 0., SsdUUID: ssd}},
		host:    {UUID: host, Type: "HOSTNAME", Owner: host, Content: CmmdsContent{Hostname: hostname}},
	}
	tags := populateCMMDSTags(make(map[string]string), "capacity-disk", disk, cmmds)
	require.Equal(t, 2, len(tags))
	tags = populateCMMDSTags(make(map[string]string), "cache-disk", ssdDisk, cmmds)
	require.Equal(t, 3, len(tags))
	tags = populateCMMDSTags(make(map[string]string), "host-domclient", host, cmmds)
	require.Equal(t, 1, len(tags))
}

func TestCollectionNoClusterMetrics(t *testing.T) {