feat(inputs.vsphere): Adding vSAN extension to vSphere plugin (#11955)

Gangadharaswamy HU 2023-04-18 22:33:28 +05:30 committed by GitHub
parent e211bd3f28
commit b323d1ce3c
6 changed files with 939 additions and 103 deletions

View File

@ -8,6 +8,7 @@ vCenter servers.
* Resource Pools
* VMs
* Datastores
* vSAN
## Supported versions of vSphere
@ -177,6 +178,12 @@ to use them.
datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
## vSAN
# vsan_metric_include = [] ## if omitted or empty, all metrics are collected
# vsan_metric_exclude = [ "*" ] ## vSAN is not collected by default.
## Whether to skip verifying vSAN metrics against the ones from the GetSupportedEntityTypes API.
# vsan_metric_skip_verify = false ## false by default.
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"
@ -243,7 +250,7 @@ to use them.
```
NOTE: To disable collection of a specific resource type, simply exclude all
metrics using the XX_metric_exclude. For example, to disable collection of VMs,
add this:
```toml
@ -251,32 +258,34 @@ vm_metric_exclude = [ "*" ]
```
### Objects and Metrics per Query
By default, in the vCenter configuration a limit is set to the number of
entities that are included in a performance chart query. The default setting
for vCenter 6.5 and later is 256; earlier versions of vCenter have this set to
64.
A vCenter administrator can change this setting.
See this [VMware KB article](https://kb.vmware.com/s/article/2107096) for more
information.
Any modification should be reflected in this plugin by modifying the
`max_query_objects` parameter.
```toml
## number of objects to retrieve per query for realtime resources (VMs and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
```
### Collection and Discovery Concurrency
In large vCenter setups it may be prudent to have multiple concurrent
goroutines collect performance metrics, in order to avoid errors when the time
elapsed during a collection cycle exceeds the interval. This should never be
greater than 8, though the default of 1 (no concurrency) should be sufficient
for most configurations.
For setting up concurrency, modify `collect_concurrency` and
`discover_concurrency` parameters.
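A minimal sketch of where these settings live (the values shown are
illustrative, not recommendations):

```toml
[[inputs.vsphere]]
# ... vcenters, credentials, and other settings ...
collect_concurrency = 3
discover_concurrency = 2
```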
@ -289,8 +298,8 @@ For setting up concurrency, modify `collect_concurrency` and
### Inventory Paths
Resources to be monitored can be selected using Inventory Paths. This treats
the vSphere inventory as a tree structure similar to a file system. A vSphere
inventory has a structure similar to this:
```bash
@ -330,15 +339,15 @@ Often, we want to select a group of resources, such as all the VMs in a
folder. We could use the path `/DC0/vm/Folder1/*` for that.
Another possibility is to select objects using a partial name, such as
`/DC0/vm/Folder1/hadoop*` yielding all VMs in Folder1 with a name starting
with "hadoop".
Finally, due to the arbitrary nesting of the folder structure, we need a
"recursive wildcard" for traversing multiple folders. We use the "**" symbol for
that. If we want to look for a VM with a name starting with "hadoop" in any
folder, we could use the following path: `/DC0/vm/**/hadoop*`
"recursive wildcard" for traversing multiple folders. We use the "**" symbol
for that. If we want to look for a VM with a name starting with "hadoop" in
any folder, we could use the following path: `/DC0/vm/**/hadoop*`
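For illustration, a hedged sketch of how such paths might be plugged into the
plugin's include settings (the paths refer to the example inventory above):

```toml
[[inputs.vsphere]]
# ... other settings ...
## Only VMs whose names start with "hadoop", in any folder below DC0
vm_include = [ "/DC0/vm/**/hadoop*" ]
## Only hosts below DC0
host_include = [ "/DC0/host/**" ]
```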
#### Multiple Paths to VMs
As we can see from the example tree above, VMs appear both in their own folder
under the datacenter and under the hosts. This is useful when you like
@ -368,7 +377,7 @@ be traversed.
## Performance Considerations
### Realtime vs. Historical Metrics
vCenter keeps two different kinds of metrics, known as realtime and historical
metrics.
@ -377,15 +386,15 @@ metrics.
* Historical metrics: Available at (by default) 5-minute, 30-minute, 2-hour, and 24-hour rollup levels. The vSphere Telegraf plugin only uses the most granular rollup, which defaults to 5 minutes but can be changed in vCenter to other interval durations. These metrics are stored in the vCenter database and can be expensive and slow to query. Historical metrics are the only type of metrics available for **clusters**, **datastores**, **resource pools** and **datacenters**.
This distinction has an impact on how Telegraf collects metrics. A single
instance of an input plugin can have one and only one collection interval,
which means that you typically set the collection interval based on the most
frequently collected metric. Let's assume you set the collection interval to 1
minute. All realtime metrics will be collected every minute. Since the
historical metrics are only available on a 5 minute interval, the vSphere
Telegraf plugin automatically skips four out of five collection cycles for
these metrics. This works fine in many cases. Problems arise when the
collection of historical metrics takes longer than the collection interval.
This will cause error messages similar to this to appear in the Telegraf logs:
```text
2019-01-16T13:41:10Z W! [agent] input "inputs.vsphere" did not complete within its interval
@ -394,8 +403,8 @@ error messages similar to this to appear in the Telegraf logs:
This will disrupt the metric collection and can result in missed samples. The
best practice workaround is to specify two instances of the vSphere plugin, one
for the realtime metrics with a short collection interval and one for the
historical metrics with a longer interval. You can use the `*_metric_exclude`
to turn off the resources you don't want to collect metrics for in each
instance. For example:
```toml
@ -414,6 +423,7 @@ instance. For example:
cluster_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
resourcepool_metric_exclude = ["*"]
vsan_metric_exclude = ["*"]
collect_concurrency = 5
discover_concurrency = 5
@ -436,14 +446,14 @@ instance. For example:
collect_concurrency = 3
```
### Configuring max_query_metrics Setting
The `max_query_metrics` setting determines the maximum number of metrics to
attempt to retrieve in one call to vCenter. Generally speaking, a higher number
means faster and more efficient queries. However, the number of allowed metrics
in a query is typically limited by the `config.vpxd.stats.maxQueryMetrics`
setting in vCenter. The value defaults to 64 on vSphere 5.5 and earlier and to
256 on more recent versions. The vSphere plugin always checks this setting and
will automatically reduce the number if the limit configured in vCenter is lower
than `max_query_metrics` in the plugin. This will result in a log message similar
to this:
@ -455,15 +465,15 @@ to this:
You may ask a vCenter administrator to increase this limit to help boost
performance.
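A short sketch of the corresponding plugin setting (the commented value is the
default):

```toml
[[inputs.vsphere]]
# ... other settings ...
## Max metrics per query; automatically reduced if vCenter's
## config.vpxd.stats.maxQueryMetrics limit is lower
# max_query_metrics = 256
```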
### Cluster Metrics and the max_query_metrics Setting
Cluster metrics are handled a bit differently by vCenter. They are aggregated
from ESXi and virtual machine metrics and may not be available when you query
their most recent values. When this happens, vCenter will attempt to perform
that aggregation on the fly. Unfortunately, all the subqueries needed
internally in vCenter to perform this aggregation will count towards
`config.vpxd.stats.maxQueryMetrics`. This means that even a very small query
may result in an error message similar to this:
```text
2018-11-02T13:37:11Z E! Error in plugin [inputs.vsphere]: ServerFaultCode: This operation is restricted by the administrator - 'vpxd.stats.maxQueryMetrics'. Contact your system administrator
@ -474,22 +484,22 @@ There are two ways of addressing this:
* Ask your vCenter administrator to set `config.vpxd.stats.maxQueryMetrics` to a number that's higher than the total number of virtual machines managed by a vCenter instance.
* Exclude the cluster metrics and use either the basicstats aggregator to calculate sums and averages per cluster, or use queries in the visualization tool to obtain the same result (a sketch of the aggregator approach follows below).
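As a hedged sketch of the second approach, cluster metrics are excluded in the
plugin and a basicstats aggregator rolls up per-host values instead (the
aggregator settings and the measurement name are illustrative):

```toml
[[inputs.vsphere]]
# ... other settings ...
## Skip the expensive cluster-level queries entirely
cluster_metric_exclude = ["*"]

## Sum and average per-host values; aggregation groups on the metrics' tags
## (e.g. clustername), so each cluster yields its own aggregated series
[[aggregators.basicstats]]
period = "300s"
drop_original = false
stats = ["sum", "mean"]
namepass = ["vsphere_host_cpu"]
```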
### Concurrency Settings
The vSphere plugin allows you to specify two concurrency settings:
* `collect_concurrency`: The maximum number of simultaneous queries for performance metrics allowed per resource type.
* `discover_concurrency`: The maximum number of simultaneous queries for resource discovery allowed.
While a higher level of concurrency typically has a positive impact on
performance, increasing these numbers too much can cause performance issues at
the vCenter server. A rule of thumb is to set these parameters to the number of
virtual machines divided by 1500 and rounded up to the nearest integer.
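For example, under that rule of thumb an environment with roughly 4000 virtual
machines would use ceil(4000 / 1500) = 3:

```toml
[[inputs.vsphere]]
# ... other settings ...
collect_concurrency = 3
discover_concurrency = 3
```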
### Configuring historical_interval Setting
When the vSphere plugin queries vCenter for historical statistics, it queries
for statistics that exist at a specific interval. The default historical
interval duration is 5 minutes, but if this interval has been changed, you must
override the default query interval in the vSphere plugin.
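For example, if an administrator has changed the most granular rollup level to
one minute, a sketch of the matching override (the plugin's own interval should
be adjusted to match):

```toml
[[inputs.vsphere]]
# ... other settings ...
interval = "60s"
## Must match the shortest rollup interval configured in vCenter
historical_interval = "60s"
```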
@ -569,6 +579,230 @@ For a detailed list of commonly available metrics, please refer to
* virtualDisk stats for VM
* disk (name of virtual disk)
## Add a vSAN Extension
A vSAN resource is a special type of resource that can be collected by the
plugin. The configuration of a vSAN resource differs slightly from the
configuration of hosts, VMs, and other resources.
### Prerequisites for vSAN
* vSphere 6.5 and later
* Clusters with vSAN enabled
* [Turn on Virtual SAN performance service](https://docs.vmware.com/en/VMware-vSphere/6.5/com.vmware.vsphere.virtualsan.doc/GUID-02F67DC3-3D5A-48A4-A445-D2BD6AF2862C.html): When you create a vSAN cluster,
the performance service is disabled. To monitor the performance metrics,
you must turn on the vSAN performance service.
### vSAN Configuration
```toml
[[inputs.vsphere]]
interval = "300s"
vcenters = ["https://<vcenter-ip>/sdk", "https://<vcenter2-ip>/sdk"]
username = "<user>"
password = "<pwd>"
# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]
# By default all supported entities will be included
vsan_metric_include = [
"summary.disk-usage",
"summary.health",
"summary.resync",
"performance.cluster-domclient",
"performance.cluster-domcompmgr",
"performance.host-domclient",
"performance.host-domcompmgr",
"performance.cache-disk",
"performance.disk-group",
"performance.capacity-disk",
"performance.disk-group",
"performance.virtual-machine",
"performance.vscsi",
"performance.virtual-disk",
"performance.vsan-host-net",
"performance.vsan-vnic-net",
"performance.vsan-pnic-net",
"performance.vsan-iscsi-host",
"performance.vsan-iscsi-target",
"performance.vsan-iscsi-lun",
"performance.lsom-world-cpu",
"performance.nic-world-cpu",
"performance.dom-world-cpu",
"performance.cmmds-world-cpu",
"performance.host-cpu",
"performance.host-domowner",
"performance.host-memory-slab",
"performance.host-memory-heap",
"performance.system-mem",
]
# by default vsan_metric_skip_verify = false
vsan_metric_skip_verify = true
vsan_metric_exclude = [ ]
# vsan_cluster_include = [ "/*/host/**" ] # Inventory path to clusters to collect (by default all are collected)
collect_concurrency = 5
discover_concurrency = 5
## Optional SSL Config
# ssl_ca = "/path/to/cafile"
# ssl_cert = "/path/to/certfile"
# ssl_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false
```
* Use `vsan_metric_include = [...]` to define the vSAN metrics that you want to collect.
For example, `vsan_metric_include = ["summary.*", "performance.host-domclient", "performance.cache-disk", "performance.disk-group", "performance.capacity-disk"]`.
To include all supported vSAN metrics, use `vsan_metric_include = [ "*" ]`.
To disable all the vSAN metrics, use `vsan_metric_exclude = [ "*" ]`.
* `vsan_metric_skip_verify` defines whether to skip verifying vSAN metrics against the ones from the [GetSupportedEntityTypes API](https://code.vmware.com/apis/48/vsan#/doc/vim.cluster.VsanPerformanceManager.html#getSupportedEntityTypes).
This option exists because some performance entities are not returned by the API, yet the plugin offers the flexibility to collect them if you really need the stats.
When set to false, anything not in the supported entity list is filtered out.
When set to true, the queried metrics are exactly those listed in `vsan_metric_include`, and the exclude array is ignored. By default the value is false.
* `vsan_cluster_include` defines a list of inventory paths that will be used to select a portion of vSAN clusters.
vSAN metrics are only collected at the cluster level, so specify these paths the same way as inventory paths for [vSphere clusters](README.md#inventory-paths).
* Many vCenter environments use self-signed certificates. Update the bottom portion of the above configuration and provide proper values for all applicable SSL config settings that apply in your vSphere environment. In some environments, setting `insecure_skip_verify = true` will be necessary when the SSL certificates are not available.
* To ensure consistent collection in larger vSphere environments, you must increase concurrency for the plugin. Use the `collect_concurrency` setting to control concurrency: set it to the number of virtual machines divided by 1500, rounded up to the nearest integer. For example, for 1200 VMs use 1, and for 2300 VMs use 2.
### Measurements & Fields
**NOTE**: Depending on the vSAN version, the vSAN performance measurements
and fields may vary.
* vSAN Summary
* overall_health
* total_capacity_bytes, free_capacity_bytes
* total_bytes_to_sync, total_objects_to_sync, total_recovery_eta
* vSAN Performance
* cluster-domclient
* iops_read, throughput_read, latency_avg_read, iops_write, throughput_write, latency_avg_write, congestion, oio
* cluster-domcompmgr
* iops_read, throughput_read, latency_avg_read, iops_write, throughput_write, latency_avg_write, iops_rec_write, throughput_rec_write, latency_avg_rec_write, congestion, oio, iops_resync_read, tput_resync_read, lat_avg_resyncread
* host-domclient
* iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, congestion, oio, client_cache_hits, client_cache_hit_rate
* host-domcompmgr
* iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, iops_rec_write, throughput_rec_write, latency_avg_rec_write, rec_write_count, congestion, oio, iops_resync_read, tput_resync_read, lat_avg_resync_read
* cache-disk
* iops_dev_read, throughput_dev_read, latency_dev_read, io_count_dev_read, iops_dev_write, throughput_dev_write, latency_dev_write, io_count_dev_write, latency_dev_d_avg, latency_dev_g_avg
* capacity-disk
* iops_dev_read, throughput_dev_read, latency_dev_read, io_count_dev_read, iops_dev_write, throughput_dev_write, latency_dev_write, io_count_dev_write, latency_dev_d_avg, latency_dev_g_avg, iops_read, latency_read, io_count_read, iops_write, latency_write, io_count_write
* disk-group
* iops_sched, latency_sched, outstanding_bytes_sched, iops_sched_queue_rec, throughput_sched_queue_rec, latency_sched_queue_rec, iops_sched_queue_vm, throughput_sched_queue_vm, latency_sched_queue_vm, iops_sched_queue_meta, throughput_sched_queue_meta, latency_sched_queue_meta, iops_delay_pct_sched, latency_delay_sched, rc_hit_rate, wb_free_pct, war_evictions, quota_evictions, iops_rc_read, latency_rc_read, io_count_rc_read, iops_wb_read, latency_wb_read, io_count_wb_read, iops_rc_write, latency_rc_write, io_count_rc_write, iops_wb_write, latency_wb_write, io_count_wb_write, ssd_bytes_drained, zero_bytes_drained, mem_congestion, slab_congestion, ssd_congestion, iops_congestion, log_congestion, comp_congestion, iops_direct_sched, iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, oio_write, oio_rec_write, oio_write_size, oio_rec_write_size, rc_size, wb_size, capacity, capacity_used, capacity_reserved, throughput_sched, iops_resync_read_policy, iops_resync_read_decom, iops_resync_read_rebalance, iops_resync_read_fix_comp, iops_resync_write_policy, iops_resync_write_decom, iops_resync_write_rebalance, iops_resync_write_fix_comp, tput_resync_read_policy, tput_resync_read_decom, tput_resync_read_rebalance, tput_resync_read_fix_comp, tput_resync_write_policy, tput_resync_write_decom, tput_resync_write_rebalance, tput_resync_write_fix_comp, lat_resync_read_policy, lat_resync_read_decom, lat_resync_read_rebalance, lat_resync_read_fix_comp, lat_resync_write_policy, lat_resync_write_decom, lat_resync_write_rebalance, lat_resync_write_fix_comp
* virtual-machine
* iops_read, throughput_read, latency_read_avg, latency_read_stddev, read_count, iops_write, throughput_write, latency_write_avg, latency_write_stddev, write_count
* vscsi
* iops_read, throughput_read, latency_read, read_count, iops_write, throughput_write, latency_write, write_count
* virtual-disk
* iops_limit, niops, niops_delayed
* vsan-host-net
* rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
* vsan-vnic-net
* rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
* vsan-pnic-net
* rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
* vsan-iscsi-host
* iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth
* vsan-iscsi-target
* iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth
* vsan-iscsi-lun
* iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth
### vSAN Tags
* all vSAN metrics
* vcenter
* dcname
* clustername
* moid (the cluster's managed object id)
* host-domclient, host-domcompmgr
* hostname
* disk-group, cache-disk, capacity-disk
* hostname
* devicename
* ssduuid (if SSD)
* vsan-host-net
* hostname
* vsan-pnic-net
* pnic
* vsan-vnic-net
* vnic
* stackname
### Realtime vs. Historical Metrics in vSAN
vSAN also keeps two different kinds of metrics: realtime and historical.
* Realtime metrics are metrics with the prefix 'summary'. These metrics are available in realtime.
* Historical metrics are metrics with the prefix 'performance'. They are queried from the vSAN performance API, which is available at a 5-minute rollup level.
For performance reasons, it is better to specify two instances of the plugin:
one for the realtime metrics with a short collection interval, and one for the
historical metrics with a longer interval.
For example:
```toml
## Realtime instance
[[inputs.vsphere]]
interval = "30s"
vcenters = [ "https://someaddress/sdk" ]
username = "someuser@vsphere.local"
password = "secret"
insecure_skip_verify = true
force_discover_on_init = true
# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]
vsan_metric_include = [ "summary.*" ]
vsan_metric_exclude = [ ]
vsan_metric_skip_verify = false
collect_concurrency = 5
discover_concurrency = 5
## Historical instance
[[inputs.vsphere]]
interval = "300s"
vcenters = [ "https://someaddress/sdk" ]
username = "someuser@vsphere.local"
password = "secret"
insecure_skip_verify = true
force_discover_on_init = true
# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]
vsan_metric_include = [ "performance.*" ]
vsan_metric_exclude = [ ]
vsan_metric_skip_verify = false
collect_concurrency = 5
discover_concurrency = 5
```
## Example Output
```text
@ -677,3 +911,14 @@ vsphere_host_net,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,
vsphere_host_mem,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,moid=host-30,os=Mac,source=DC0_C0_H0,vcenter=localhost:8989 usage_average=116.21 1535660339000000000
vsphere_host_net,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,moid=host-30,os=Mac,source=DC0_C0_H0,vcenter=localhost:8989 bytesRx_average=726i,bytesTx_average=643i,usage_average=1504i 1535660339000000000
```
## vSAN Sample Output
```text
vsphere_vsan_performance_hostdomclient,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,hostname=DC0_C0_H0,moid=domain-c8,source=Example-VSAN,vcenter=localhost:8898 iops_read=7,write_congestion=0,unmap_congestion=0,read_count=2199,iops=8,latency_max_write=8964,latency_avg_unmap=0,latency_avg_write=1883,write_count=364,num_oio=12623,throughput=564127,client_cache_hits=0,latency_max_read=17821,latency_max_unmap=0,read_congestion=0,latency_avg=1154,congestion=0,throughput_read=554721,latency_avg_read=1033,throughput_write=9406,client_cache_hit_rate=0,iops_unmap=0,throughput_unmap=0,latency_stddev=1315,io_count=2563,oio=4,iops_write=1,unmap_count=0 1578955200000000000
vsphere_vsan_performance_clusterdomcompmgr,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,uuid=XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX,vcenter=localhost:8898 latency_avg_rec_write=0,latency_avg_write=9886,congestion=0,iops_resync_read=0,lat_avg_resync_read=0,iops_read=289,latency_avg_read=1184,throughput_write=50137368,iops_rec_write=0,throughput_rec_write=0,tput_resync_read=0,throughput_read=9043654,iops_write=1272,oio=97 1578954900000000000
vsphere_vsan_performance_clusterdomclient,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,uuid=XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX,vcenter=localhost:8898 latency_avg_write=1011,congestion=0,oio=26,iops_read=6,throughput_read=489093,latency_avg_read=1085,iops_write=43,throughput_write=435142 1578955200000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 total_bytes_to_sync=0i,total_objects_to_sync=0i,total_recovery_eta=0i 1578955489000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 overall_health=1i 1578955489000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 free_capacity_byte=11022535578757i,total_capacity_byte=14102625779712i 1578955488000000000
```

View File

@ -60,6 +60,7 @@ type Endpoint struct {
metricNameLookup map[int32]string
metricNameMux sync.RWMutex
log telegraf.Logger
apiVersion string
}
type resourceKind struct {
@ -237,6 +238,23 @@ func NewEndpoint(ctx context.Context, parent *VSphere, address *url.URL, log tel
getObjects: getDatastores,
parent: "",
},
"vsan": {
name: "vsan",
vcName: "ClusterComputeResource",
pKey: "clustername",
parentTag: "dcname",
enabled: anythingEnabled(parent.VSANMetricExclude),
realTime: false,
sampling: 300,
objects: make(objectMap),
filters: newFilterOrPanic(parent.VSANMetricInclude, parent.VSANMetricExclude),
paths: parent.VSANClusterInclude,
simple: parent.VSANMetricSkipVerify,
include: parent.VSANMetricInclude,
collectInstances: false,
getObjects: getClusters,
parent: "datacenter",
},
}
// Start discover and other goodness
@ -445,7 +463,10 @@ func (e *Endpoint) discover(ctx context.Context) error {
return err
}
// get the vSphere API version
e.apiVersion = client.Client.ServiceContent.About.ApiVersion
e.Parent.Log.Debugf("Discover new objects for %s", e.URL.Host)
dcNameCache := make(map[string]string)
numRes := int64(0)
@ -455,7 +476,7 @@ func (e *Endpoint) discover(ctx context.Context) error {
for k, res := range e.resourceKinds {
e.log.Debugf("Discovering resources for %s", res.name)
// Need to do this for all resource types even if they are not enabled
if res.enabled || k != "vm" {
if res.enabled || (k != "vm" && k != "vsan") {
rf := ResourceFilter{
finder: &Finder{client},
resType: res.vcName,
@ -480,7 +501,8 @@ func (e *Endpoint) discover(ctx context.Context) error {
}
// No need to collect metric metadata if resource type is not enabled
// vSAN is also skipped since vSAN metadata follows its own format
if res.enabled && k != "vsan" {
if res.simple {
e.simpleMetadataSelect(ctx, client, res)
} else {
@ -935,7 +957,12 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error
wg.Add(1)
go func(k string) {
defer wg.Done()
var err error
if k == "vsan" {
err = e.collectVsan(ctx, acc)
} else {
err = e.collectResource(ctx, k, acc)
}
if err != nil {
acc.AddError(err)
}

View File

@ -136,6 +136,12 @@
datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
## vSAN
# vsan_metric_include = [] ## if omitted or empty, all metrics are collected
# vsan_metric_exclude = [ "*" ] ## vSAN is not collected by default.
## Whether to skip verifying vSAN metrics against the ones from the GetSupportedEntityTypes API.
# vsan_metric_skip_verify = false ## false by default.
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"

View File

@ -0,0 +1,520 @@
package vsphere
import (
"context"
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
"github.com/coreos/go-semver/semver"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/vim25"
"github.com/vmware/govmomi/vim25/methods"
"github.com/vmware/govmomi/vim25/soap"
"github.com/vmware/govmomi/vim25/types"
vsanmethods "github.com/vmware/govmomi/vsan/methods"
vsantypes "github.com/vmware/govmomi/vsan/types"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
)
const (
vsanNamespace = "vsan"
vsanPath = "/vsanHealth"
hwMarksKeyPrefix = "vsan-perf-"
perfPrefix = "performance."
)
var (
vsanPerfMetricsName string
vsanSummaryMetricsName string
perfManagerRef = types.ManagedObjectReference{
Type: "VsanPerformanceManager",
Value: "vsan-performance-manager",
}
hyphenReplacer = strings.NewReplacer("-", "")
)
// collectVsan is the entry point for vsan metrics collection
func (e *Endpoint) collectVsan(ctx context.Context, acc telegraf.Accumulator) error {
if versionLowerThan(e.apiVersion, "5.5") {
return fmt.Errorf("a minimum API version of 5.5 is required for vSAN. Found: %s. Skipping vCenter: %s", e.apiVersion, e.URL.Host)
}
vsanPerfMetricsName = strings.Join([]string{"vsphere", "vsan", "performance"}, e.Parent.Separator)
vsanSummaryMetricsName = strings.Join([]string{"vsphere", "vsan", "summary"}, e.Parent.Separator)
res := e.resourceKinds["vsan"]
client, err := e.clientFactory.GetClient(ctx)
if err != nil {
return fmt.Errorf("fail to get client when collect vsan: %w", err)
}
// Create vSAN client
vimClient := client.Client.Client
vsanClient := vimClient.NewServiceClient(vsanPath, vsanNamespace)
// vSAN Metrics to collect
metrics := e.getVsanMetadata(ctx, vsanClient, res)
// Iterate over all clusters, run a goroutine for each cluster
te := NewThrottledExecutor(e.Parent.CollectConcurrency)
for _, obj := range res.objects {
te.Run(ctx, func() {
e.collectVsanPerCluster(ctx, obj, vimClient, vsanClient, metrics, acc)
})
}
te.Wait()
return nil
}
// collectVsanPerCluster is called by goroutines in collectVsan function.
func (e *Endpoint) collectVsanPerCluster(ctx context.Context, clusterRef *objectRef, vimClient *vim25.Client, vsanClient *soap.Client,
metrics map[string]string, acc telegraf.Accumulator) {
// Construct a map for cmmds
cluster := object.NewClusterComputeResource(vimClient, clusterRef.ref)
if !e.vsanEnabled(ctx, cluster) {
acc.AddError(fmt.Errorf("[vSAN] Fail to identify vSAN for cluster %s. Skipping", clusterRef.name))
return
}
// Do collection
if _, ok := metrics["summary.disk-usage"]; ok {
if err := e.queryDiskUsage(ctx, vsanClient, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying disk usage for cluster %s: %w", clusterRef.name, err))
}
}
if _, ok := metrics["summary.health"]; ok {
if err := e.queryHealthSummary(ctx, vsanClient, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying vsan health summary for cluster %s: %w", clusterRef.name, err))
}
}
if _, ok := metrics["summary.resync"]; ok {
if err := e.queryResyncSummary(ctx, vsanClient, cluster, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying vsan resync summary for cluster %s: %w", clusterRef.name, err))
}
}
cmmds, err := getCmmdsMap(ctx, vimClient, cluster)
if err != nil {
e.Parent.Log.Errorf("[vSAN] Error while query cmmds data. Error: %s. Skipping", err)
cmmds = make(map[string]CmmdsEntity)
}
if err := e.queryPerformance(ctx, vsanClient, clusterRef, metrics, cmmds, acc); err != nil {
acc.AddError(fmt.Errorf("error querying performance metrics for cluster %s: %w", clusterRef.name, err))
}
}
// vsanEnabled returns true if vSAN is enabled on the cluster, otherwise false
func (e *Endpoint) vsanEnabled(ctx context.Context, clusterObj *object.ClusterComputeResource) bool {
config, err := clusterObj.Configuration(ctx)
if err != nil {
return false
}
enabled := config.VsanConfigInfo.Enabled
return enabled != nil && *enabled
}
// getVsanMetadata returns a map whose keys are the entity types to be queried,
// e.g. ["summary.health", "summary.disk-usage", "summary.resync", "performance.cluster-domclient", "performance.host-domclient"]
func (e *Endpoint) getVsanMetadata(ctx context.Context, vsanClient *soap.Client, res *resourceKind) map[string]string {
metrics := make(map[string]string)
if res.simple { // Skip getting supported Entity types from vCenter. Using user defined metrics without verifying.
for _, entity := range res.include {
if strings.Contains(entity, "*") {
e.Parent.Log.Infof("[vSAN] Won't use wildcard match \"*\" when vsan_metric_skip_verify = true. Skipping")
continue
}
metrics[entity] = ""
}
return metrics
}
// Use the include & exclude configuration to filter all summary metrics
for _, entity := range []string{"summary.health", "summary.disk-usage", "summary.resync"} {
if res.filters.Match(entity) {
metrics[entity] = ""
}
}
resp, err := vsanmethods.VsanPerfGetSupportedEntityTypes(ctx, vsanClient,
&vsantypes.VsanPerfGetSupportedEntityTypes{
This: perfManagerRef,
})
if err != nil {
e.Parent.Log.Errorf("[vSAN] Fail to get supported entities: %v. Skipping vsan performance data.", err)
return metrics
}
// Use the include & exclude configuration to filter all supported performance metrics
for _, entity := range resp.Returnval {
if res.filters.Match(perfPrefix + entity.Name) {
metrics[perfPrefix+entity.Name] = ""
}
}
return metrics
}
// getCmmdsMap returns a map which maps a uuid to a CmmdsEntity
func getCmmdsMap(ctx context.Context, client *vim25.Client, clusterObj *object.ClusterComputeResource) (map[string]CmmdsEntity, error) {
hosts, err := clusterObj.Hosts(ctx)
if err != nil {
return nil, fmt.Errorf("fail to get host: %w", err)
}
if len(hosts) == 0 {
return make(map[string]CmmdsEntity), nil
}
queries := []types.HostVsanInternalSystemCmmdsQuery{
{Type: "HOSTNAME"},
{Type: "DISK"},
}
// Some ESXi hosts may be down or in maintenance mode, so the cmmds query can fail on them.
// We iterate over the hosts until we get a proper API response.
var resp *types.QueryCmmdsResponse
for _, host := range hosts {
vis, err := host.ConfigManager().VsanInternalSystem(ctx)
if err != nil {
continue
}
request := types.QueryCmmds{
This: vis.Reference(),
Queries: queries,
}
resp, err = methods.QueryCmmds(ctx, client.RoundTripper, &request)
if err == nil {
break
}
}
if resp == nil {
return nil, fmt.Errorf("all hosts fail to query cmmds")
}
var clusterCmmds Cmmds
if err := json.Unmarshal([]byte(resp.Returnval), &clusterCmmds); err != nil {
return nil, fmt.Errorf("fail to convert cmmds to json: %w", err)
}
cmmdsMap := make(map[string]CmmdsEntity)
for _, entity := range clusterCmmds.Res {
cmmdsMap[entity.UUID] = entity
}
return cmmdsMap, nil
}
// queryPerformance adds performance metrics to telegraf accumulator
func (e *Endpoint) queryPerformance(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, metrics map[string]string,
cmmds map[string]CmmdsEntity, acc telegraf.Accumulator) error {
end := time.Now().UTC()
// We're using a fake metric key, since we only store one highwater mark per resource
start, ok := e.hwMarks.Get(hwMarksKeyPrefix+clusterRef.ref.Value, "generic")
if !ok {
// Look back 3 sampling periods by default
start = end.Add(time.Duration(e.Parent.MetricLookback) * time.Duration(-e.resourceKinds["vsan"].sampling) * time.Second)
}
e.Parent.Log.Debugf("[vSAN] Query vsan performance for time interval: %s ~ %s", start, end)
latest := start
var commonError error
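// Query each selected performance entity type (e.g. cluster-domclient) with its own query spec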
for entityRefID := range metrics {
if !strings.HasPrefix(entityRefID, perfPrefix) {
continue
}
entityRefID = strings.TrimPrefix(entityRefID, perfPrefix)
var perfSpecs []vsantypes.VsanPerfQuerySpec
perfSpec := vsantypes.VsanPerfQuerySpec{
EntityRefId: entityRefID + ":*",
StartTime: &start,
EndTime: &end,
}
perfSpecs = append(perfSpecs, perfSpec)
perfRequest := vsantypes.VsanPerfQueryPerf{
This: perfManagerRef,
QuerySpecs: perfSpecs,
Cluster: &clusterRef.ref,
}
resp, err := vsanmethods.VsanPerfQueryPerf(ctx, vsanClient, &perfRequest)
if err != nil {
if err.Error() == "ServerFaultCode: NotFound" {
e.Parent.Log.Errorf("[vSAN] Is vSAN performance service enabled for %s? Skipping ...", clusterRef.name)
commonError = err
break
}
e.Parent.Log.Errorf("[vSAN] Error querying performance data for %s: %s: %s.", clusterRef.name, entityRefID, err)
continue
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
count := 0
for _, em := range resp.Returnval {
vals := strings.Split(em.EntityRefId, ":")
var entityName, uuid string
if len(vals) == 1 {
entityName, uuid = vals[0], ""
} else {
entityName, uuid = vals[0], vals[1]
}
buckets := make(map[string]metricEntry)
tags := populateCMMDSTags(tags, entityName, uuid, cmmds)
var timeStamps []time.Time
// 1. Construct a timestamp list from sample info
formattedEntityName := hyphenReplacer.Replace(entityName)
for _, t := range strings.Split(em.SampleInfo, ",") {
// Parse the input string to a time.Time object
utcTimeStamp, err := time.Parse("2006-01-02 15:04:05", t)
if err != nil {
e.Parent.Log.Errorf("[vSAN] Failed to parse a timestamp: %s. Skipping", utcTimeStamp)
timeStamps = append(timeStamps, time.Time{})
continue
}
timeStamps = append(timeStamps, utcTimeStamp)
}
// 2. Iterate on each measurement
for _, counter := range em.Value {
metricLabel := internal.SnakeCase(counter.MetricId.Label)
// 3. Iterate on each data point.
for i, values := range strings.Split(counter.Values, ",") {
ts := timeStamps[i]
if ts.IsZero() {
continue
}
// Organize the metrics into a bucket per measurement.
bKey := em.EntityRefId + " " + strconv.FormatInt(ts.UnixNano(), 10)
bucket, found := buckets[bKey]
if !found {
mn := vsanPerfMetricsName + e.Parent.Separator + formattedEntityName
bucket = metricEntry{name: mn, ts: ts, fields: make(map[string]interface{}), tags: tags}
buckets[bKey] = bucket
}
if v, err := strconv.ParseFloat(values, 32); err == nil {
bucket.fields[metricLabel] = v
}
}
}
if len(timeStamps) > 0 {
lastSample := timeStamps[len(timeStamps)-1]
if lastSample != (time.Time{}) && lastSample.After(latest) {
latest = lastSample
}
}
// We've iterated through all the metrics and collected buckets for each measurement name. Now emit them!
for _, bucket := range buckets {
acc.AddFields(bucket.name, bucket.fields, bucket.tags, bucket.ts)
}
count += len(buckets)
}
}
e.hwMarks.Put(hwMarksKeyPrefix+clusterRef.ref.Value, "generic", latest)
return commonError
}
// queryDiskUsage adds 'FreeCapacityB' and 'TotalCapacityB' metrics to telegraf accumulator
func (e *Endpoint) queryDiskUsage(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, acc telegraf.Accumulator) error {
spaceManagerRef := types.ManagedObjectReference{
Type: "VsanSpaceReportSystem",
Value: "vsan-cluster-space-report-system",
}
resp, err := vsanmethods.VsanQuerySpaceUsage(ctx, vsanClient,
&vsantypes.VsanQuerySpaceUsage{
This: spaceManagerRef,
Cluster: clusterRef.ref,
})
if err != nil {
return err
}
fields := map[string]interface{}{
"free_capacity_byte": resp.Returnval.FreeCapacityB,
"total_capacity_byte": resp.Returnval.TotalCapacityB,
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// queryHealthSummary adds 'OverallHealth' metric to telegraf accumulator
func (e *Endpoint) queryHealthSummary(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, acc telegraf.Accumulator) error {
healthSystemRef := types.ManagedObjectReference{
Type: "VsanVcClusterHealthSystem",
Value: "vsan-cluster-health-system",
}
fetchFromCache := true
resp, err := vsanmethods.VsanQueryVcClusterHealthSummary(ctx, vsanClient,
&vsantypes.VsanQueryVcClusterHealthSummary{
This: healthSystemRef,
Cluster: &clusterRef.ref,
Fields: []string{"overallHealth", "overallHealthDescription"},
FetchFromCache: &fetchFromCache,
})
if err != nil {
return err
}
healthStr := resp.Returnval.OverallHealth
healthMap := map[string]int{"red": 2, "yellow": 1, "green": 0}
fields := make(map[string]interface{})
if val, ok := healthMap[healthStr]; ok {
fields["overall_health"] = val
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// queryResyncSummary adds resync information to accumulator
func (e *Endpoint) queryResyncSummary(ctx context.Context, vsanClient *soap.Client, clusterObj *object.ClusterComputeResource,
clusterRef *objectRef, acc telegraf.Accumulator) error {
if lower := versionLowerThan(e.apiVersion, "6.7"); lower {
e.Parent.Log.Infof("I! [inputs.vsphere][vSAN] Minimum API Version 6.7 required for resync summary. Found: %s. Skipping VCenter: %s",
e.apiVersion, e.URL.Host)
return nil
}
hosts, err := clusterObj.Hosts(ctx)
if err != nil {
return err
}
if len(hosts) == 0 {
return nil
}
hostRefValue := hosts[0].Reference().Value
hostRefValueParts := strings.Split(hostRefValue, "-")
if len(hostRefValueParts) != 2 {
e.Parent.Log.Errorf("[vSAN] Host reference value does not match expected pattern: host-<num>. Actual Value %s", hostRefValue)
return nil
}
vsanSystemEx := types.ManagedObjectReference{
Type: "VsanSystemEx",
Value: fmt.Sprintf("vsanSystemEx-%s", strings.Split(hostRefValue, "-")[1]),
}
includeSummary := true
request := vsantypes.VsanQuerySyncingVsanObjects{
This: vsanSystemEx,
Uuids: []string{}, // We only need summary information.
Start: 0,
IncludeSummary: &includeSummary,
}
resp, err := vsanmethods.VsanQuerySyncingVsanObjects(ctx, vsanClient, &request)
if err != nil {
return err
}
fields := make(map[string]interface{})
fields["total_bytes_to_sync"] = resp.Returnval.TotalBytesToSync
fields["total_objects_to_sync"] = resp.Returnval.TotalObjectsToSync
fields["total_recovery_eta"] = resp.Returnval.TotalRecoveryETA
tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// populateClusterTags takes in a tag map, makes a copy, populates cluster related tags and returns the copy.
func populateClusterTags(tags map[string]string, clusterRef *objectRef, vcenter string) map[string]string {
newTags := make(map[string]string)
// deep copy
for k, v := range tags {
newTags[k] = v
}
newTags["vcenter"] = vcenter
newTags["dcname"] = clusterRef.dcname
newTags["clustername"] = clusterRef.name
newTags["moid"] = clusterRef.ref.Value
newTags["source"] = clusterRef.name
return newTags
}
// populateCMMDSTags takes in a tag map, makes a copy, adds more tags using a cmmds map and returns the copy.
func populateCMMDSTags(tags map[string]string, entityName string, uuid string, cmmds map[string]CmmdsEntity) map[string]string {
newTags := make(map[string]string)
// deep copy
for k, v := range tags {
newTags[k] = v
}
// There are cases when the uuid is missing (usually when the performance service has just been enabled or disabled).
// We need this check to avoid index-out-of-range error
if uuid == "*" || uuid == "" {
return newTags
}
// Add additional tags based on CMMDS data
switch {
case strings.Contains(entityName, "-disk") || strings.Contains(entityName, "disk-"):
if e, ok := cmmds[uuid]; ok {
if host, ok := cmmds[e.Owner]; ok {
newTags["hostname"] = host.Content.Hostname
}
newTags["devicename"] = e.Content.DevName
if int(e.Content.IsSsd) == 0 {
newTags["ssduuid"] = e.Content.SsdUUID
}
}
case strings.Contains(entityName, "host-memory-"):
memInfo := strings.Split(uuid, "|")
if strings.Contains(entityName, "-slab") && len(memInfo) > 1 {
newTags["slabname"] = memInfo[1]
}
if strings.Contains(entityName, "-heap") && len(memInfo) > 1 {
newTags["heapname"] = memInfo[1]
}
if e, ok := cmmds[memInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "host-") || strings.Contains(entityName, "system-mem"):
if e, ok := cmmds[uuid]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "vnic-net"):
nicInfo := strings.Split(uuid, "|")
if len(nicInfo) > 2 {
newTags["stackname"] = nicInfo[1]
newTags["vnic"] = nicInfo[2]
}
if e, ok := cmmds[nicInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "pnic-net"):
nicInfo := strings.Split(uuid, "|")
if len(nicInfo) > 1 {
newTags["pnic"] = nicInfo[1]
}
if e, ok := cmmds[nicInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "world-cpu"):
cpuInfo := strings.Split(uuid, "|")
if len(cpuInfo) > 1 {
newTags["worldname"] = cpuInfo[1]
}
if e, ok := cmmds[cpuInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
default:
// If no additional tags were added above, fall back to tagging with the raw uuid
if len(newTags) == len(tags) {
newTags["uuid"] = uuid
}
}
return newTags
}
// versionLowerThan returns true if the current version is lower than the base version
func versionLowerThan(current string, base string) bool {
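// Note: semver.New panics on a malformed version string; vSphere API versions passed here are assumed to parse as valid semver.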
v1 := semver.New(current)
v2 := semver.New(base)
return v1.LessThan(*v2)
}
type CmmdsEntity struct {
UUID string `json:"uuid"`
Owner string `json:"owner"` // ESXi UUID
Type string `json:"type"`
Content CmmdsContent `json:"content"`
}
type Cmmds struct {
Res []CmmdsEntity `json:"result"`
}
type CmmdsContent struct {
Hostname string `json:"hostname"`
IsSsd float64 `json:"isSsd"`
SsdUUID string `json:"ssdUuid"`
DevName string `json:"devName"`
}

View File

@ -55,6 +55,10 @@ type VSphere struct {
DatastoreMetricExclude []string
DatastoreInclude []string
DatastoreExclude []string
VSANMetricInclude []string `toml:"vsan_metric_include"`
VSANMetricExclude []string `toml:"vsan_metric_exclude"`
VSANMetricSkipVerify bool `toml:"vsan_metric_skip_verify"`
VSANClusterInclude []string `toml:"vsan_cluster_include"`
Separator string
CustomAttributeInclude []string
CustomAttributeExclude []string
@ -62,15 +66,14 @@ type VSphere struct {
IPAddresses []string
MetricLookback int
DisconnectedServersBehavior string
MaxQueryObjects int
MaxQueryMetrics int
CollectConcurrency int
DiscoverConcurrency int
ForceDiscoverOnInit bool `toml:"force_discover_on_init" deprecated:"1.14.0;option is ignored"`
ObjectDiscoveryInterval config.Duration
Timeout config.Duration
HistoricalInterval config.Duration
endpoints []*Endpoint
cancel context.CancelFunc
@ -155,38 +158,40 @@ func (v *VSphere) Gather(acc telegraf.Accumulator) error {
func init() {
inputs.Add("vsphere", func() telegraf.Input {
return &VSphere{
Vcenters: []string{},
DatacenterInstances: false,
DatacenterMetricInclude: nil,
DatacenterMetricExclude: nil,
DatacenterInclude: []string{"/*"},
ClusterInstances: false,
ClusterMetricInclude: nil,
ClusterMetricExclude: nil,
ClusterInclude: []string{"/*/host/**"},
HostInstances: true,
HostMetricInclude: nil,
HostMetricExclude: nil,
HostInclude: []string{"/*/host/**"},
ResourcePoolInstances: false,
ResourcePoolMetricInclude: nil,
ResourcePoolMetricExclude: nil,
ResourcePoolInclude: []string{"/*/host/**"},
VMInstances: true,
VMMetricInclude: nil,
VMMetricExclude: nil,
VMInclude: []string{"/*/vm/**"},
DatastoreInstances: false,
DatastoreMetricInclude: nil,
DatastoreMetricExclude: nil,
DatastoreInclude: []string{"/*/datastore/**"},
VSANMetricInclude: nil,
VSANMetricExclude: []string{"*"},
VSANMetricSkipVerify: false,
VSANClusterInclude: []string{"/*/host/**"},
Separator: "_",
CustomAttributeInclude: []string{},
CustomAttributeExclude: []string{"*"},
UseIntSamples: true,
IPAddresses: []string{},
MaxQueryObjects: 256,
MaxQueryMetrics: 256,
CollectConcurrency: 1,

View File

@ -132,16 +132,15 @@ func defaultVSphere() *VSphere {
DatacenterInclude: []string{"/**"},
ClientConfig: itls.ClientConfig{InsecureSkipVerify: true},
MaxQueryObjects: 256,
MaxQueryMetrics: 256,
ObjectDiscoveryInterval: config.Duration(time.Second * 300),
Timeout: config.Duration(time.Second * 20),
ForceDiscoverOnInit: true,
DiscoverConcurrency: 1,
CollectConcurrency: 1,
Separator: ".",
HistoricalInterval: config.Duration(time.Second * 300),
}
}
@ -414,12 +413,46 @@ func TestFolders(t *testing.T) {
testLookupVM(ctx, t, &f, "/F0/DC1/vm/**/F*/**", 4, "")
}
func TestVsanCmmds(t *testing.T) {
m, s, err := createSim(0)
require.NoError(t, err)
defer m.Remove()
defer s.Close()
v := defaultVSphere()
ctx := context.Background()
c, err := NewClient(ctx, s.URL, v)
require.NoError(t, err)
f := Finder{c}
var clusters []mo.ClusterComputeResource
err = f.FindAll(ctx, "ClusterComputeResource", []string{"/**"}, []string{}, &clusters)
require.NoError(t, err)
clusterObj := object.NewClusterComputeResource(c.Client.Client, clusters[0].Reference())
_, err = getCmmdsMap(ctx, c.Client.Client, clusterObj)
require.Error(t, err)
}
func TestVsanTags(t *testing.T) {
host := "5b860329-3bc4-a76c-48b6-246e963cfcc0"
disk := "52ee3be1-47cc-b50d-ecab-01af0f706381"
ssdDisk := "52f26fc8-0b9b-56d8-3a32-a9c3bfbc6148"
ssd := "52173131-3384-bb63-4ef8-c00b0ce7e3e7"
hostname := "sc2-hs1-b2801.eng.vmware.com"
devName := "naa.55cd2e414d82c815:2"
var cmmds = map[string]CmmdsEntity{
disk: {UUID: disk, Type: "DISK", Owner: host, Content: CmmdsContent{DevName: devName, IsSsd: 1.}},
ssdDisk: {UUID: ssdDisk, Type: "DISK", Owner: host, Content: CmmdsContent{DevName: devName, IsSsd: 0., SsdUUID: ssd}},
host: {UUID: host, Type: "HOSTNAME", Owner: host, Content: CmmdsContent{Hostname: hostname}},
}
tags := populateCMMDSTags(make(map[string]string), "capacity-disk", disk, cmmds)
require.Equal(t, 2, len(tags))
tags = populateCMMDSTags(make(map[string]string), "cache-disk", ssdDisk, cmmds)
require.Equal(t, 3, len(tags))
tags = populateCMMDSTags(make(map[string]string), "host-domclient", host, cmmds)
require.Equal(t, 1, len(tags))
}
func TestCollectionNoClusterMetrics(t *testing.T) {