feat(inputs.vsphere): Adding vSAN extension to vSphere plugin (#11955)

Gangadharaswamy HU 2023-04-18 22:33:28 +05:30 committed by GitHub
parent e211bd3f28
commit b323d1ce3c
6 changed files with 939 additions and 103 deletions

View File

@ -8,6 +8,7 @@ vCenter servers.
* Resource Pools
* VMs
* Datastores
* vSAN
## Supported versions of vSphere
@ -177,6 +178,12 @@ to use them.
datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
## vSAN
# vsan_metric_include = [] ## if omitted or empty, all metrics are collected
# vsan_metric_exclude = [ "*" ] ## vSAN is not collected by default.
## Whether to skip verifying vSAN metrics against the ones from the GetSupportedEntityTypes API.
# vsan_metric_skip_verify = false ## false by default.
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"
@ -243,7 +250,7 @@ to use them.
```
NOTE: To disable collection of a specific resource type, simply exclude all
metrics using the XX_metric_exclude. For example, to disable collection of VMs,
add this:
```toml
@ -251,32 +258,34 @@ vm_metric_exclude = [ "*" ]
```
### Objects and Metrics per Query
By default, in the vCenter configuration a limit is set to the number of
entities that are included in a performance chart query. The default setting
for vCenter 6.5 and later is 256; earlier versions of vCenter have this set to
64.
A vCenter administrator can change this setting.
See this [VMware KB article](https://kb.vmware.com/s/article/2107096) for more
information.
Any modification should be reflected in this plugin by modifying the
`max_query_objects` parameter.
```toml
## number of objects to retrieve per query for realtime resources (VMs and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
```
### Collection and Discovery Concurrency
In large vCenter setups it may be prudent to have multiple concurrent
goroutines collect performance metrics, in order to avoid errors when the time
elapsed during a collection cycle exceeds the interval. This should never be
greater than 8, though the default of 1 (no concurrency) should be sufficient
for most configurations.
For setting up concurrency, modify `collect_concurrency` and
`discover_concurrency` parameters.
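A minimal sketch of where these settings live (the values shown are
illustrative, not recommendations):

```toml
[[inputs.vsphere]]
# ... vcenters, credentials, and other settings ...
collect_concurrency = 3
discover_concurrency = 2
```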
@ -289,8 +298,8 @@ For setting up concurrency, modify `collect_concurrency` and
### Inventory Paths
Resources to be monitored can be selected using Inventory Paths. This treats
the vSphere inventory as a tree structure similar to a file system. A vSphere
inventory has a structure similar to this:
```bash
@ -330,15 +339,15 @@ Often, we want to select a group of resources, such as all the VMs in a
folder. We could use the path `/DC0/vm/Folder1/*` for that.
Another possibility is to select objects using a partial name, such as
`/DC0/vm/Folder1/hadoop*` yielding all VMs in Folder1 with a name starting
with "hadoop".
Finally, due to the arbitrary nesting of the folder structure, we need a
"recursive wildcard" for traversing multiple folders. We use the "**" symbol for
that. If we want to look for a VM with a name starting with "hadoop" in any
folder, we could use the following path: `/DC0/vm/**/hadoop*`
"recursive wildcard" for traversing multiple folders. We use the "**" symbol
for that. If we want to look for a VM with a name starting with "hadoop" in
any folder, we could use the following path: `/DC0/vm/**/hadoop*`
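For illustration, a hedged sketch of how such paths might be plugged into the
plugin's include settings (the paths refer to the example inventory above):

```toml
[[inputs.vsphere]]
# ... other settings ...
## Only VMs whose names start with "hadoop", in any folder below DC0
vm_include = [ "/DC0/vm/**/hadoop*" ]
## Only hosts below DC0
host_include = [ "/DC0/host/**" ]
```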
#### Multiple Paths to VMs
As we can see from the example tree above, VMs appear both in their own folder
under the datacenter and under the hosts. This is useful when you like
@ -368,7 +377,7 @@ be traversed.
## Performance Considerations
### Realtime vs. Historical Metrics
vCenter keeps two different kinds of metrics, known as realtime and historical
metrics.
@ -377,15 +386,15 @@ metrics.
* Historical metrics: Available at (by default) 5-minute, 30-minute, 2-hour, and 24-hour rollup levels. The vSphere Telegraf plugin only uses the most granular rollup, which defaults to 5 minutes but can be changed in vCenter to other interval durations. These metrics are stored in the vCenter database and can be expensive and slow to query. Historical metrics are the only type of metrics available for **clusters**, **datastores**, **resource pools** and **datacenters**.
This distinction has an impact on how Telegraf collects metrics. A single
instance of an input plugin can have one and only one collection interval,
which means that you typically set the collection interval based on the most
frequently collected metric. Let's assume you set the collection interval to 1
minute. All realtime metrics will be collected every minute. Since the
historical metrics are only available on a 5 minute interval, the vSphere
Telegraf plugin automatically skips four out of five collection cycles for
these metrics. This works fine in many cases. Problems arise when the
collection of historical metrics takes longer than the collection interval.
This will cause error messages similar to this to appear in the Telegraf logs:
```text
2019-01-16T13:41:10Z W! [agent] input "inputs.vsphere" did not complete within its interval
@ -394,8 +403,8 @@ error messages similar to this to appear in the Telegraf logs:
This will disrupt the metric collection and can result in missed samples. The
best practice workaround is to specify two instances of the vSphere plugin, one
for the realtime metrics with a short collection interval and one for the
historical metrics with a longer interval. You can use the `*_metric_exclude`
to turn off the resources you don't want to collect metrics for in each
instance. For example:
```toml
@ -414,6 +423,7 @@ instance. For example:
cluster_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
resourcepool_metric_exclude = ["*"]
vsan_metric_exclude = ["*"]
collect_concurrency = 5
discover_concurrency = 5
@ -436,14 +446,14 @@ instance. For example:
collect_concurrency = 3
```
### Configuring max_query_metrics Setting
The `max_query_metrics` setting determines the maximum number of metrics to
attempt to retrieve in one call to vCenter. Generally speaking, a higher number
means faster and more efficient queries. However, the number of allowed metrics
in a query is typically limited by the `config.vpxd.stats.maxQueryMetrics`
setting in vCenter. The value defaults to 64 on vSphere 5.5 and earlier and to
256 on more recent versions. The vSphere plugin always checks this setting and
will automatically reduce the number if the limit configured in vCenter is lower
than `max_query_metrics` in the plugin. This will result in a log message similar
to this:
@ -455,15 +465,15 @@ to this:
You may ask a vCenter administrator to increase this limit to help boost
performance.
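A short sketch of the corresponding plugin setting (the commented value is the
default):

```toml
[[inputs.vsphere]]
# ... other settings ...
## Max metrics per query; automatically reduced if vCenter's
## config.vpxd.stats.maxQueryMetrics limit is lower
# max_query_metrics = 256
```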
### Cluster Metrics and the max_query_metrics Setting
Cluster metrics are handled a bit differently by vCenter. They are aggregated
from ESXi and virtual machine metrics and may not be available when you query
their most recent values. When this happens, vCenter will attempt to perform
that aggregation on the fly. Unfortunately, all the subqueries needed
internally in vCenter to perform this aggregation will count towards
`config.vpxd.stats.maxQueryMetrics`. This means that even a very small query
may result in an error message similar to this:
```text
2018-11-02T13:37:11Z E! Error in plugin [inputs.vsphere]: ServerFaultCode: This operation is restricted by the administrator - 'vpxd.stats.maxQueryMetrics'. Contact your system administrator
@ -474,22 +484,22 @@ There are two ways of addressing this:
* Ask your vCenter administrator to set `config.vpxd.stats.maxQueryMetrics` to a number that's higher than the total number of virtual machines managed by a vCenter instance.
* Exclude the cluster metrics and use either the basicstats aggregator to calculate sums and averages per cluster, or use queries in the visualization tool to obtain the same result (a sketch of the aggregator approach follows below).
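As a hedged sketch of the second approach, cluster metrics are excluded in the
plugin and a basicstats aggregator rolls up per-host values instead (the
aggregator settings and the measurement name are illustrative):

```toml
[[inputs.vsphere]]
# ... other settings ...
## Skip the expensive cluster-level queries entirely
cluster_metric_exclude = ["*"]

## Sum and average per-host values; aggregation groups on the metrics' tags
## (e.g. clustername), so each cluster yields its own aggregated series
[[aggregators.basicstats]]
period = "300s"
drop_original = false
stats = ["sum", "mean"]
namepass = ["vsphere_host_cpu"]
```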
### Concurrency Settings
The vSphere plugin allows you to specify two concurrency settings:
* `collect_concurrency`: The maximum number of simultaneous queries for performance metrics allowed per resource type.
* `discover_concurrency`: The maximum number of simultaneous queries for resource discovery allowed.
While a higher level of concurrency typically has a positive impact on
performance, increasing these numbers too much can cause performance issues at
the vCenter server. A rule of thumb is to set these parameters to the number of
virtual machines divided by 1500 and rounded up to the nearest integer.
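For example, under that rule of thumb an environment with roughly 4000 virtual
machines would use ceil(4000 / 1500) = 3:

```toml
[[inputs.vsphere]]
# ... other settings ...
collect_concurrency = 3
discover_concurrency = 3
```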
### Configuring historical_interval Setting
When the vSphere plugin queries vCenter for historical statistics, it queries
for statistics that exist at a specific interval. The default historical
interval duration is 5 minutes, but if this interval has been changed, you must
override the default query interval in the vSphere plugin.
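For example, if an administrator has changed the most granular rollup level to
one minute, a sketch of the matching override (the plugin's own interval should
be adjusted to match):

```toml
[[inputs.vsphere]]
# ... other settings ...
interval = "60s"
## Must match the shortest rollup interval configured in vCenter
historical_interval = "60s"
```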
@ -569,6 +579,230 @@ For a detailed list of commonly available metrics, please refer to
* virtualDisk stats for VM
* disk (name of virtual disk)
## Add a vSAN Extension
A vSAN resource is a special type of resource that can be collected by the
plugin. The configuration of a vSAN resource differs slightly from the
configuration of hosts, VMs, and other resources.
### Prerequisites for vSAN
* vSphere 6.5 and later
* Clusters with vSAN enabled
* [Turn on Virtual SAN performance service](https://docs.vmware.com/en/VMware-vSphere/6.5/com.vmware.vsphere.virtualsan.doc/GUID-02F67DC3-3D5A-48A4-A445-D2BD6AF2862C.html): When you create a vSAN cluster,
the performance service is disabled. To monitor the performance metrics,
you must turn on the vSAN performance service.
### vSAN Configuration
```toml
[[inputs.vsphere]]
interval = "300s"
vcenters = ["https://<vcenter-ip>/sdk", "https://<vcenter2-ip>/sdk"]
username = "<user>"
password = "<pwd>"
# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]
# By default all supported entities will be included
vsan_metric_include = [
"summary.disk-usage",
"summary.health",
"summary.resync",
"performance.cluster-domclient",
"performance.cluster-domcompmgr",
"performance.host-domclient",
"performance.host-domcompmgr",
"performance.cache-disk",
"performance.disk-group",
"performance.capacity-disk",
"performance.disk-group",
"performance.virtual-machine",
"performance.vscsi",
"performance.virtual-disk",
"performance.vsan-host-net",
"performance.vsan-vnic-net",
"performance.vsan-pnic-net",
"performance.vsan-iscsi-host",
"performance.vsan-iscsi-target",
"performance.vsan-iscsi-lun",
"performance.lsom-world-cpu",
"performance.nic-world-cpu",
"performance.dom-world-cpu",
"performance.cmmds-world-cpu",
"performance.host-cpu",
"performance.host-domowner",
"performance.host-memory-slab",
"performance.host-memory-heap",
"performance.system-mem",
]
# by default vsan_metric_skip_verify = false
vsan_metric_skip_verify = true
vsan_metric_exclude = [ ]
# vsan_cluster_include = [ "/*/host/**" ] # Inventory path to clusters to collect (by default all are collected)
collect_concurrency = 5
discover_concurrency = 5
## Optional SSL Config
# ssl_ca = "/path/to/cafile"
# ssl_cert = "/path/to/certfile"
# ssl_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false
```
* Use `vsan_metric_include = [...]` to define the vSAN metrics that you want to collect.
For example, `vsan_metric_include = ["summary.*", "performance.host-domclient", "performance.cache-disk", "performance.disk-group", "performance.capacity-disk"]`.
To include all supported vSAN metrics, use `vsan_metric_include = [ "*" ]`.
To disable all the vSAN metrics, use `vsan_metric_exclude = [ "*" ]`.
* `vsan_metric_skip_verify` defines whether to skip verifying vSAN metrics against the ones from the [GetSupportedEntityTypes API](https://code.vmware.com/apis/48/vsan#/doc/vim.cluster.VsanPerformanceManager.html#getSupportedEntityTypes).
This option exists because some performance entities are not returned by the API, yet the plugin offers the flexibility to collect them if you really need the stats.
When set to false, anything not in the supported entity list is filtered out.
When set to true, the queried metrics are exactly those listed in `vsan_metric_include`, and the exclude array is ignored. By default the value is false.
* `vsan_cluster_include` defines a list of inventory paths that will be used to select a portion of vSAN clusters.
vSAN metrics are only collected at the cluster level, so specify these paths the same way as inventory paths for [vSphere clusters](README.md#inventory-paths).
* Many vCenter environments use self-signed certificates. Update the bottom portion of the above configuration and provide proper values for all applicable SSL config settings that apply in your vSphere environment. In some environments, setting `insecure_skip_verify = true` will be necessary when the SSL certificates are not available.
* To ensure consistent collection in larger vSphere environments, you must increase concurrency for the plugin. Use the `collect_concurrency` setting to control concurrency: set it to the number of virtual machines divided by 1500, rounded up to the nearest integer. For example, for 1200 VMs use 1, and for 2300 VMs use 2.
### Measurements & Fields
**NOTE**: Depending on the vSAN version, the vSAN performance measurements
and fields may vary.
* vSAN Summary
* overall_health
* total_capacity_bytes, free_capacity_bytes
* total_bytes_to_sync, total_objects_to_sync, total_recovery_eta
* vSAN Performance
* cluster-domclient
* iops_read, throughput_read, latency_avg_read, iops_write, throughput_write, latency_avg_write, congestion, oio
* cluster-domcompmgr
* iops_read, throughput_read, latency_avg_read, iops_write, throughput_write, latency_avg_write, iops_rec_write, throughput_rec_write, latency_avg_rec_write, congestion, oio, iops_resync_read, tput_resync_read, lat_avg_resyncread
* host-domclient
* iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, congestion, oio, client_cache_hits, client_cache_hit_rate
* host-domcompmgr
* iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, iops_rec_write, throughput_rec_write, latency_avg_rec_write, rec_write_count, congestion, oio, iops_resync_read, tput_resync_read, lat_avg_resync_read
* cache-disk
* iops_dev_read, throughput_dev_read, latency_dev_read, io_count_dev_read, iops_dev_write, throughput_dev_write, latency_dev_write, io_count_dev_write, latency_dev_d_avg, latency_dev_g_avg
* capacity-disk
* iops_dev_read, throughput_dev_read, latency_dev_read, io_count_dev_read, iops_dev_write, throughput_dev_write, latency_dev_write, io_count_dev_write, latency_dev_d_avg, latency_dev_g_avg, iops_read, latency_read, io_count_read, iops_write, latency_write, io_count_write
* disk-group
* iops_sched, latency_sched, outstanding_bytes_sched, iops_sched_queue_rec, throughput_sched_queue_rec, latency_sched_queue_rec, iops_sched_queue_vm, throughput_sched_queue_vm, latency_sched_queue_vm, iops_sched_queue_meta, throughput_sched_queue_meta, latency_sched_queue_meta, iops_delay_pct_sched, latency_delay_sched, rc_hit_rate, wb_free_pct, war_evictions, quota_evictions, iops_rc_read, latency_rc_read, io_count_rc_read, iops_wb_read, latency_wb_read, io_count_wb_read, iops_rc_write, latency_rc_write, io_count_rc_write, iops_wb_write, latency_wb_write, io_count_wb_write, ssd_bytes_drained, zero_bytes_drained, mem_congestion, slab_congestion, ssd_congestion, iops_congestion, log_congestion, comp_congestion, iops_direct_sched, iops_read, throughput_read, latency_avg_read, read_count, iops_write, throughput_write, latency_avg_write, write_count, oio_write, oio_rec_write, oio_write_size, oio_rec_write_size, rc_size, wb_size, capacity, capacity_used, capacity_reserved, throughput_sched, iops_resync_read_policy, iops_resync_read_decom, iops_resync_read_rebalance, iops_resync_read_fix_comp, iops_resync_write_policy, iops_resync_write_decom, iops_resync_write_rebalance, iops_resync_write_fix_comp, tput_resync_read_policy, tput_resync_read_decom, tput_resync_read_rebalance, tput_resync_read_fix_comp, tput_resync_write_policy, tput_resync_write_decom, tput_resync_write_rebalance, tput_resync_write_fix_comp, lat_resync_read_policy, lat_resync_read_decom, lat_resync_read_rebalance, lat_resync_read_fix_comp, lat_resync_write_policy, lat_resync_write_decom, lat_resync_write_rebalance, lat_resync_write_fix_comp
* virtual-machine
* iops_read, throughput_read, latency_read_avg, latency_read_stddev, read_count, iops_write, throughput_write, latency_write_avg, latency_write_stddev, write_count
* vscsi
* iops_read, throughput_read, latency_read, read_count, iops_write, throughput_write, latency_write, write_count
* virtual-disk
* iops_limit, niops, niops_delayed
* vsan-host-net
* rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
* vsan-vnic-net
* rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
* vsan-pnic-net
* rx_throughput, rx_packets, rx_packets_loss_rate, tx_throughput, tx_packets, tx_packets_loss_rate
* vsan-iscsi-host
* iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth
* vsan-iscsi-target
* iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth
* vsan-iscsi-lun
* iops_read, iops_write, iops_total, bandwidth_read, bandwidth_write, bandwidth_total, latency_read, latency_write, latency_total, queue_depth
### vSAN Tags
* all vSAN metrics
* vcenter
* dcname
* clustername
* moid (the cluster's managed object id)
* host-domclient, host-domcompmgr
* hostname
* disk-group, cache-disk, capacity-disk
* hostname
* devicename
* ssduuid (if SSD)
* vsan-host-net
* hostname
* vsan-pnic-net
* pnic
* vsan-vnic-net
* vnic
* stackname
### Realtime vs. Historical Metrics in vSAN
vSAN also keeps two different kinds of metrics: realtime and historical.
* Realtime metrics are metrics with the prefix 'summary'. These metrics are available in realtime.
* Historical metrics are metrics with the prefix 'performance'. They are queried from the vSAN performance API, which is available at a 5-minute rollup level.
For performance reasons, it is better to specify two instances of the plugin:
one for the realtime metrics with a short collection interval, and one for the
historical metrics with a longer interval.
For example:
```toml
## Realtime instance
[[inputs.vsphere]]
interval = "30s"
vcenters = [ "https://someaddress/sdk" ]
username = "someuser@vsphere.local"
password = "secret"
insecure_skip_verify = true
force_discover_on_init = true
# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]
vsan_metric_include = [ "summary.*" ]
vsan_metric_exclude = [ ]
vsan_metric_skip_verify = false
collect_concurrency = 5
discover_concurrency = 5
## Historical instance
[[inputs.vsphere]]
interval = "300s"
vcenters = [ "https://someaddress/sdk" ]
username = "someuser@vsphere.local"
password = "secret"
insecure_skip_verify = true
force_discover_on_init = true
# Exclude all other metrics
vm_metric_exclude = ["*"]
datastore_metric_exclude = ["*"]
datacenter_metric_exclude = ["*"]
host_metric_exclude = ["*"]
cluster_metric_exclude = ["*"]
vsan_metric_include = [ "performance.*" ]
vsan_metric_exclude = [ ]
vsan_metric_skip_verify = false
collect_concurrency = 5
discover_concurrency = 5
```
## Example Output
```text
@ -677,3 +911,14 @@ vsphere_host_net,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,
vsphere_host_mem,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,moid=host-30,os=Mac,source=DC0_C0_H0,vcenter=localhost:8989 usage_average=116.21 1535660339000000000
vsphere_host_net,clustername=DC0_C0,esxhostname=DC0_C0_H0,host=host.example.com,moid=host-30,os=Mac,source=DC0_C0_H0,vcenter=localhost:8989 bytesRx_average=726i,bytesTx_average=643i,usage_average=1504i 1535660339000000000
```
## vSAN Sample Output
```text
vsphere_vsan_performance_hostdomclient,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,hostname=DC0_C0_H0,moid=domain-c8,source=Example-VSAN,vcenter=localhost:8898 iops_read=7,write_congestion=0,unmap_congestion=0,read_count=2199,iops=8,latency_max_write=8964,latency_avg_unmap=0,latency_avg_write=1883,write_count=364,num_oio=12623,throughput=564127,client_cache_hits=0,latency_max_read=17821,latency_max_unmap=0,read_congestion=0,latency_avg=1154,congestion=0,throughput_read=554721,latency_avg_read=1033,throughput_write=9406,client_cache_hit_rate=0,iops_unmap=0,throughput_unmap=0,latency_stddev=1315,io_count=2563,oio=4,iops_write=1,unmap_count=0 1578955200000000000
vsphere_vsan_performance_clusterdomcompmgr,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,uuid=XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX,vcenter=localhost:8898 latency_avg_rec_write=0,latency_avg_write=9886,congestion=0,iops_resync_read=0,lat_avg_resync_read=0,iops_read=289,latency_avg_read=1184,throughput_write=50137368,iops_rec_write=0,throughput_rec_write=0,tput_resync_read=0,throughput_read=9043654,iops_write=1272,oio=97 1578954900000000000
vsphere_vsan_performance_clusterdomclient,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,uuid=XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXX,vcenter=localhost:8898 latency_avg_write=1011,congestion=0,oio=26,iops_read=6,throughput_read=489093,latency_avg_read=1085,iops_write=43,throughput_write=435142 1578955200000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 total_bytes_to_sync=0i,total_objects_to_sync=0i,total_recovery_eta=0i 1578955489000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 overall_health=1i 1578955489000000000
vsphere_vsan_summary,clustername=Example-VSAN,dcname=Example-DC,host=host.example.com,moid=domain-c7,source=Example-VSAN,vcenter=localhost:8898 free_capacity_byte=11022535578757i,total_capacity_byte=14102625779712i 1578955488000000000
```

View File

@ -60,6 +60,7 @@ type Endpoint struct {
metricNameLookup map[int32]string
metricNameMux sync.RWMutex
log telegraf.Logger
apiVersion string
}
type resourceKind struct {
@ -237,6 +238,23 @@ func NewEndpoint(ctx context.Context, parent *VSphere, address *url.URL, log tel
getObjects: getDatastores,
parent: "",
},
"vsan": {
name: "vsan",
vcName: "ClusterComputeResource",
pKey: "clustername",
parentTag: "dcname",
enabled: anythingEnabled(parent.VSANMetricExclude),
realTime: false,
sampling: 300,
objects: make(objectMap),
filters: newFilterOrPanic(parent.VSANMetricInclude, parent.VSANMetricExclude),
paths: parent.VSANClusterInclude,
simple: parent.VSANMetricSkipVerify,
include: parent.VSANMetricInclude,
collectInstances: false,
getObjects: getClusters,
parent: "datacenter",
},
}
// Start discover and other goodness
@ -445,7 +463,10 @@ func (e *Endpoint) discover(ctx context.Context) error {
return err
}
// get the vSphere API version
e.apiVersion = client.Client.ServiceContent.About.ApiVersion
e.Parent.Log.Debugf("Discover new objects for %s", e.URL.Host)
dcNameCache := make(map[string]string)
numRes := int64(0)
@ -455,7 +476,7 @@ func (e *Endpoint) discover(ctx context.Context) error {
for k, res := range e.resourceKinds {
e.log.Debugf("Discovering resources for %s", res.name)
// Need to do this for all resource types even if they are not enabled
if res.enabled || k != "vm" {
if res.enabled || (k != "vm" && k != "vsan") {
rf := ResourceFilter{
finder: &Finder{client},
resType: res.vcName,
@ -480,7 +501,8 @@ func (e *Endpoint) discover(ctx context.Context) error {
}
// No need to collect metric metadata if resource type is not enabled
// vSAN is also skipped since vSAN metadata follows its own format
if res.enabled && k != "vsan" {
if res.simple {
e.simpleMetadataSelect(ctx, client, res)
} else {
@ -935,7 +957,12 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error
wg.Add(1)
go func(k string) {
defer wg.Done()
var err error
if k == "vsan" {
err = e.collectVsan(ctx, acc)
} else {
err = e.collectResource(ctx, k, acc)
}
if err != nil {
acc.AddError(err)
}

View File

@ -136,6 +136,12 @@
datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
## vSAN
# vsan_metric_include = [] ## if omitted or empty, all metrics are collected
# vsan_metric_exclude = [ "*" ] ## vSAN is not collected by default.
## Whether to skip verifying vSAN metrics against the ones from the GetSupportedEntityTypes API.
# vsan_metric_skip_verify = false ## false by default.
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"

View File

@ -0,0 +1,520 @@
package vsphere
import (
"context"
"encoding/json"
"fmt"
"strconv"
"strings"
"time"
"github.com/coreos/go-semver/semver"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/vim25"
"github.com/vmware/govmomi/vim25/methods"
"github.com/vmware/govmomi/vim25/soap"
"github.com/vmware/govmomi/vim25/types"
vsanmethods "github.com/vmware/govmomi/vsan/methods"
vsantypes "github.com/vmware/govmomi/vsan/types"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
)
const (
vsanNamespace = "vsan"
vsanPath = "/vsanHealth"
hwMarksKeyPrefix = "vsan-perf-"
perfPrefix = "performance."
)
var (
vsanPerfMetricsName string
vsanSummaryMetricsName string
perfManagerRef = types.ManagedObjectReference{
Type: "VsanPerformanceManager",
Value: "vsan-performance-manager",
}
hyphenReplacer = strings.NewReplacer("-", "")
)
// collectVsan is the entry point for vsan metrics collection
func (e *Endpoint) collectVsan(ctx context.Context, acc telegraf.Accumulator) error {
if versionLowerThan(e.apiVersion, "5.5") {
return fmt.Errorf("a minimum API version of 5.5 is required for vSAN. Found: %s. Skipping vCenter: %s", e.apiVersion, e.URL.Host)
}
vsanPerfMetricsName = strings.Join([]string{"vsphere", "vsan", "performance"}, e.Parent.Separator)
vsanSummaryMetricsName = strings.Join([]string{"vsphere", "vsan", "summary"}, e.Parent.Separator)
res := e.resourceKinds["vsan"]
client, err := e.clientFactory.GetClient(ctx)
if err != nil {
return fmt.Errorf("fail to get client when collect vsan: %w", err)
}
// Create vSAN client
vimClient := client.Client.Client
vsanClient := vimClient.NewServiceClient(vsanPath, vsanNamespace)
// vSAN Metrics to collect
metrics := e.getVsanMetadata(ctx, vsanClient, res)
// Iterate over all clusters, run a goroutine for each cluster
te := NewThrottledExecutor(e.Parent.CollectConcurrency)
for _, obj := range res.objects {
te.Run(ctx, func() {
e.collectVsanPerCluster(ctx, obj, vimClient, vsanClient, metrics, acc)
})
}
te.Wait()
return nil
}
// collectVsanPerCluster is called by goroutines in collectVsan function.
func (e *Endpoint) collectVsanPerCluster(ctx context.Context, clusterRef *objectRef, vimClient *vim25.Client, vsanClient *soap.Client,
metrics map[string]string, acc telegraf.Accumulator) {
// Construct a map for cmmds
cluster := object.NewClusterComputeResource(vimClient, clusterRef.ref)
if !e.vsanEnabled(ctx, cluster) {
acc.AddError(fmt.Errorf("[vSAN] Fail to identify vSAN for cluster %s. Skipping", clusterRef.name))
return
}
// Do collection
if _, ok := metrics["summary.disk-usage"]; ok {
if err := e.queryDiskUsage(ctx, vsanClient, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying disk usage for cluster %s: %w", clusterRef.name, err))
}
}
if _, ok := metrics["summary.health"]; ok {
if err := e.queryHealthSummary(ctx, vsanClient, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying vsan health summary for cluster %s: %w", clusterRef.name, err))
}
}
if _, ok := metrics["summary.resync"]; ok {
if err := e.queryResyncSummary(ctx, vsanClient, cluster, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying vsan resync summary for cluster %s: %w", clusterRef.name, err))
}
}
cmmds, err := getCmmdsMap(ctx, vimClient, cluster)
if err != nil {
e.Parent.Log.Errorf("[vSAN] Error while query cmmds data. Error: %s. Skipping", err)
cmmds = make(map[string]CmmdsEntity)
}
if err := e.queryPerformance(ctx, vsanClient, clusterRef, metrics, cmmds, acc); err != nil {
acc.AddError(fmt.Errorf("error querying performance metrics for cluster %s: %w", clusterRef.name, err))
}
}
// vsanEnabled returns true if vSAN is enabled on the cluster, otherwise false
func (e *Endpoint) vsanEnabled(ctx context.Context, clusterObj *object.ClusterComputeResource) bool {
config, err := clusterObj.Configuration(ctx)
if err != nil {
return false
}
enabled := config.VsanConfigInfo.Enabled
return enabled != nil && *enabled
}
// getVsanMetadata returns a map whose keys are the entity types to be queried,
// e.g. ["summary.health", "summary.disk-usage", "summary.resync", "performance.cluster-domclient", "performance.host-domclient"]
func (e *Endpoint) getVsanMetadata(ctx context.Context, vsanClient *soap.Client, res *resourceKind) map[string]string {
metrics := make(map[string]string)
if res.simple { // Skip getting supported Entity types from vCenter. Using user defined metrics without verifying.
for _, entity := range res.include {
if strings.Contains(entity, "*") {
e.Parent.Log.Infof("[vSAN] Won't use wildcard match \"*\" when vsan_metric_skip_verify = true. Skipping")
continue
}
metrics[entity] = ""
}
return metrics
}
// Use the include & exclude configuration to filter all summary metrics
for _, entity := range []string{"summary.health", "summary.disk-usage", "summary.resync"} {
if res.filters.Match(entity) {
metrics[entity] = ""
}
}
resp, err := vsanmethods.VsanPerfGetSupportedEntityTypes(ctx, vsanClient,
&vsantypes.VsanPerfGetSupportedEntityTypes{
This: perfManagerRef,
})
if err != nil {
e.Parent.Log.Errorf("[vSAN] Fail to get supported entities: %v. Skipping vsan performance data.", err)
return metrics
}
// Use the include & exclude configuration to filter all supported performance metrics
for _, entity := range resp.Returnval {
if res.filters.Match(perfPrefix + entity.Name) {
metrics[perfPrefix+entity.Name] = ""
}
}
return metrics
}
// getCmmdsMap returns a map which maps a uuid to a CmmdsEntity
func getCmmdsMap(ctx context.Context, client *vim25.Client, clusterObj *object.ClusterComputeResource) (map[string]CmmdsEntity, error) {
hosts, err := clusterObj.Hosts(ctx)
if err != nil {
return nil, fmt.Errorf("fail to get host: %w", err)
}
if len(hosts) == 0 {
return make(map[string]CmmdsEntity), nil
}
queries := []types.HostVsanInternalSystemCmmdsQuery{
{Type: "HOSTNAME"},
{Type: "DISK"},
}
// Some ESXi hosts may be down or in maintenance mode, so the cmmds query can fail on them.
// We iterate over the hosts until we get a proper API response.
var resp *types.QueryCmmdsResponse
for _, host := range hosts {
vis, err := host.ConfigManager().VsanInternalSystem(ctx)
if err != nil {
continue
}
request := types.QueryCmmds{
This: vis.Reference(),
Queries: queries,
}
resp, err = methods.QueryCmmds(ctx, client.RoundTripper, &request)
if err == nil {
break
}
}
if resp == nil {
return nil, fmt.Errorf("all hosts fail to query cmmds")
}
var clusterCmmds Cmmds
if err := json.Unmarshal([]byte(resp.Returnval), &clusterCmmds); err != nil {
return nil, fmt.Errorf("fail to convert cmmds to json: %w", err)
}
cmmdsMap := make(map[string]CmmdsEntity)
for _, entity := range clusterCmmds.Res {
cmmdsMap[entity.UUID] = entity
}
return cmmdsMap, nil
}
// queryPerformance adds performance metrics to telegraf accumulator
func (e *Endpoint) queryPerformance(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, metrics map[string]string,
cmmds map[string]CmmdsEntity, acc telegraf.Accumulator) error {
end := time.Now().UTC()
// We're using a fake metric key, since we only store one highwater mark per resource
start, ok := e.hwMarks.Get(hwMarksKeyPrefix+clusterRef.ref.Value, "generic")
if !ok {
// Look back 3 sampling periods by default
start = end.Add(time.Duration(e.Parent.MetricLookback) * time.Duration(-e.resourceKinds["vsan"].sampling) * time.Second)
}
e.Parent.Log.Debugf("[vSAN] Query vsan performance for time interval: %s ~ %s", start, end)
latest := start
var commonError error
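// Query each selected performance entity type (e.g. cluster-domclient) with its own query spec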
for entityRefID := range metrics {
if !strings.HasPrefix(entityRefID, perfPrefix) {
continue
}
entityRefID = strings.TrimPrefix(entityRefID, perfPrefix)
var perfSpecs []vsantypes.VsanPerfQuerySpec
perfSpec := vsantypes.VsanPerfQuerySpec{
EntityRefId: entityRefID + ":*",
StartTime: &start,
EndTime: &end,
}
perfSpecs = append(perfSpecs, perfSpec)
perfRequest := vsantypes.VsanPerfQueryPerf{
This: perfManagerRef,
QuerySpecs: perfSpecs,
Cluster: &clusterRef.ref,
}
resp, err := vsanmethods.VsanPerfQueryPerf(ctx, vsanClient, &perfRequest)
if err != nil {
if err.Error() == "ServerFaultCode: NotFound" {
e.Parent.Log.Errorf("[vSAN] Is vSAN performance service enabled for %s? Skipping ...", clusterRef.name)
commonError = err
break
}
e.Parent.Log.Errorf("[vSAN] Error querying performance data for %s: %s: %s.", clusterRef.name, entityRefID, err)
continue
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
count := 0
for _, em := range resp.Returnval {
vals := strings.Split(em.EntityRefId, ":")
var entityName, uuid string
if len(vals) == 1 {
entityName, uuid = vals[0], ""
} else {
entityName, uuid = vals[0], vals[1]
}
buckets := make(map[string]metricEntry)
tags := populateCMMDSTags(tags, entityName, uuid, cmmds)
var timeStamps []time.Time
// 1. Construct a timestamp list from sample info
formattedEntityName := hyphenReplacer.Replace(entityName)
for _, t := range strings.Split(em.SampleInfo, ",") {
// Parse the input string to a time.Time object
utcTimeStamp, err := time.Parse("2006-01-02 15:04:05", t)
if err != nil {
e.Parent.Log.Errorf("[vSAN] Failed to parse a timestamp: %s. Skipping", utcTimeStamp)
timeStamps = append(timeStamps, time.Time{})
continue
}
timeStamps = append(timeStamps, utcTimeStamp)
}
// 2. Iterate on each measurement
for _, counter := range em.Value {
metricLabel := internal.SnakeCase(counter.MetricId.Label)
// 3. Iterate on each data point.
for i, values := range strings.Split(counter.Values, ",") {
ts := timeStamps[i]
if ts.IsZero() {
continue
}
// Organize the metrics into a bucket per measurement.
bKey := em.EntityRefId + " " + strconv.FormatInt(ts.UnixNano(), 10)
bucket, found := buckets[bKey]
if !found {
mn := vsanPerfMetricsName + e.Parent.Separator + formattedEntityName
bucket = metricEntry{name: mn, ts: ts, fields: make(map[string]interface{}), tags: tags}
buckets[bKey] = bucket
}
if v, err := strconv.ParseFloat(values, 32); err == nil {
bucket.fields[metricLabel] = v
}
}
}
if len(timeStamps) > 0 {
lastSample := timeStamps[len(timeStamps)-1]
if lastSample != (time.Time{}) && lastSample.After(latest) {
latest = lastSample
}
}
// We've iterated through all the metrics and collected buckets for each measurement name. Now emit them!
for _, bucket := range buckets {
acc.AddFields(bucket.name, bucket.fields, bucket.tags, bucket.ts)
}
count += len(buckets)
}
}
e.hwMarks.Put(hwMarksKeyPrefix+clusterRef.ref.Value, "generic", latest)
return commonError
}
// queryDiskUsage adds 'FreeCapacityB' and 'TotalCapacityB' metrics to telegraf accumulator
func (e *Endpoint) queryDiskUsage(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, acc telegraf.Accumulator) error {
spaceManagerRef := types.ManagedObjectReference{
Type: "VsanSpaceReportSystem",
Value: "vsan-cluster-space-report-system",
}
resp, err := vsanmethods.VsanQuerySpaceUsage(ctx, vsanClient,
&vsantypes.VsanQuerySpaceUsage{
This: spaceManagerRef,
Cluster: clusterRef.ref,
})
if err != nil {
return err
}
fields := map[string]interface{}{
"free_capacity_byte": resp.Returnval.FreeCapacityB,
"total_capacity_byte": resp.Returnval.TotalCapacityB,
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// queryHealthSummary adds 'OverallHealth' metric to telegraf accumulator
func (e *Endpoint) queryHealthSummary(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, acc telegraf.Accumulator) error {
healthSystemRef := types.ManagedObjectReference{
Type: "VsanVcClusterHealthSystem",
Value: "vsan-cluster-health-system",
}
fetchFromCache := true
resp, err := vsanmethods.VsanQueryVcClusterHealthSummary(ctx, vsanClient,
&vsantypes.VsanQueryVcClusterHealthSummary{
This: healthSystemRef,
Cluster: &clusterRef.ref,
Fields: []string{"overallHealth", "overallHealthDescription"},
FetchFromCache: &fetchFromCache,
})
if err != nil {
return err
}
healthStr := resp.Returnval.OverallHealth
healthMap := map[string]int{"red": 2, "yellow": 1, "green": 0}
fields := make(map[string]interface{})
if val, ok := healthMap[healthStr]; ok {
fields["overall_health"] = val
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// queryResyncSummary adds resync information to accumulator
func (e *Endpoint) queryResyncSummary(ctx context.Context, vsanClient *soap.Client, clusterObj *object.ClusterComputeResource,
clusterRef *objectRef, acc telegraf.Accumulator) error {
if lower := versionLowerThan(e.apiVersion, "6.7"); lower {
e.Parent.Log.Infof("I! [inputs.vsphere][vSAN] Minimum API Version 6.7 required for resync summary. Found: %s. Skipping VCenter: %s",
e.apiVersion, e.URL.Host)
return nil
}
hosts, err := clusterObj.Hosts(ctx)
if err != nil {
return err
}
if len(hosts) == 0 {
return nil
}
hostRefValue := hosts[0].Reference().Value
hostRefValueParts := strings.Split(hostRefValue, "-")
if len(hostRefValueParts) != 2 {
e.Parent.Log.Errorf("[vSAN] Host reference value does not match expected pattern: host-<num>. Actual Value %s", hostRefValue)
return nil
}
vsanSystemEx := types.ManagedObjectReference{
Type: "VsanSystemEx",
Value: fmt.Sprintf("vsanSystemEx-%s", strings.Split(hostRefValue, "-")[1]),
}
includeSummary := true
request := vsantypes.VsanQuerySyncingVsanObjects{
This: vsanSystemEx,
Uuids: []string{}, // We only need summary information.
Start: 0,
IncludeSummary: &includeSummary,
}
resp, err := vsanmethods.VsanQuerySyncingVsanObjects(ctx, vsanClient, &request)
if err != nil {
return err
}
fields := make(map[string]interface{})
fields["total_bytes_to_sync"] = resp.Returnval.TotalBytesToSync
fields["total_objects_to_sync"] = resp.Returnval.TotalObjectsToSync
fields["total_recovery_eta"] = resp.Returnval.TotalRecoveryETA
tags := populateClusterTags(make(map[string]string), clusterRef, e.URL.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// populateClusterTags takes in a tag map, makes a copy, populates cluster related tags and returns the copy.
func populateClusterTags(tags map[string]string, clusterRef *objectRef, vcenter string) map[string]string {
newTags := make(map[string]string)
// deep copy
for k, v := range tags {
newTags[k] = v
}
newTags["vcenter"] = vcenter
newTags["dcname"] = clusterRef.dcname
newTags["clustername"] = clusterRef.name
newTags["moid"] = clusterRef.ref.Value
newTags["source"] = clusterRef.name
return newTags
}
// populateCMMDSTags takes in a tag map, makes a copy, adds more tags using a cmmds map and returns the copy.
func populateCMMDSTags(tags map[string]string, entityName string, uuid string, cmmds map[string]CmmdsEntity) map[string]string {
newTags := make(map[string]string)
// deep copy
for k, v := range tags {
newTags[k] = v
}
// There are cases when the uuid is missing (usually when the performance service has just been enabled or disabled).
// We need this check to avoid index-out-of-range error
if uuid == "*" || uuid == "" {
return newTags
}
// Add additional tags based on CMMDS data
switch {
case strings.Contains(entityName, "-disk") || strings.Contains(entityName, "disk-"):
if e, ok := cmmds[uuid]; ok {
if host, ok := cmmds[e.Owner]; ok {
newTags["hostname"] = host.Content.Hostname
}
newTags["devicename"] = e.Content.DevName
if int(e.Content.IsSsd) == 0 {
newTags["ssduuid"] = e.Content.SsdUUID
}
}
case strings.Contains(entityName, "host-memory-"):
memInfo := strings.Split(uuid, "|")
if strings.Contains(entityName, "-slab") && len(memInfo) > 1 {
newTags["slabname"] = memInfo[1]
}
if strings.Contains(entityName, "-heap") && len(memInfo) > 1 {
newTags["heapname"] = memInfo[1]
}
if e, ok := cmmds[memInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "host-") || strings.Contains(entityName, "system-mem"):
if e, ok := cmmds[uuid]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "vnic-net"):
nicInfo := strings.Split(uuid, "|")
if len(nicInfo) > 2 {
newTags["stackname"] = nicInfo[1]
newTags["vnic"] = nicInfo[2]
}
if e, ok := cmmds[nicInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "pnic-net"):
nicInfo := strings.Split(uuid, "|")
if len(nicInfo) > 1 {
newTags["pnic"] = nicInfo[1]
}
if e, ok := cmmds[nicInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "world-cpu"):
cpuInfo := strings.Split(uuid, "|")
if len(cpuInfo) > 1 {
newTags["worldname"] = cpuInfo[1]
}
if e, ok := cmmds[cpuInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
default:
// If no additional tags were added above, fall back to tagging with the raw uuid
if len(newTags) == len(tags) {
newTags["uuid"] = uuid
}
}
return newTags
}
// versionLowerThan returns true if the current version is lower than the base version
func versionLowerThan(current string, base string) bool {
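// Note: semver.New panics on a malformed version string; vSphere API versions passed here are assumed to parse as valid semver.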
v1 := semver.New(current)
v2 := semver.New(base)
return v1.LessThan(*v2)
}
type CmmdsEntity struct {
UUID string `json:"uuid"`
Owner string `json:"owner"` // ESXi UUID
Type string `json:"type"`
Content CmmdsContent `json:"content"`
}
type Cmmds struct {
Res []CmmdsEntity `json:"result"`
}
type CmmdsContent struct {
Hostname string `json:"hostname"`
IsSsd float64 `json:"isSsd"`
SsdUUID string `json:"ssdUuid"`
DevName string `json:"devName"`
}

View File

@ -55,6 +55,10 @@ type VSphere struct {
DatastoreMetricExclude []string
DatastoreInclude []string
DatastoreExclude []string
VSANMetricInclude []string `toml:"vsan_metric_include"`
VSANMetricExclude []string `toml:"vsan_metric_exclude"`
VSANMetricSkipVerify bool `toml:"vsan_metric_skip_verify"`
VSANClusterInclude []string `toml:"vsan_cluster_include"`
Separator string
CustomAttributeInclude []string
CustomAttributeExclude []string
@ -62,15 +66,14 @@ type VSphere struct {
IPAddresses []string
MetricLookback int
DisconnectedServersBehavior string
MaxQueryObjects int
MaxQueryMetrics int
CollectConcurrency int
DiscoverConcurrency int
ForceDiscoverOnInit bool `toml:"force_discover_on_init" deprecated:"1.14.0;option is ignored"`
ObjectDiscoveryInterval config.Duration
Timeout config.Duration
HistoricalInterval config.Duration
endpoints []*Endpoint
cancel context.CancelFunc
@ -155,38 +158,40 @@ func (v *VSphere) Gather(acc telegraf.Accumulator) error {
func init() {
inputs.Add("vsphere", func() telegraf.Input {
return &VSphere{
Vcenters: []string{},
DatacenterInstances: false,
DatacenterMetricInclude: nil,
DatacenterMetricExclude: nil,
DatacenterInclude: []string{"/*"},
ClusterInstances: false,
ClusterMetricInclude: nil,
ClusterMetricExclude: nil,
ClusterInclude: []string{"/*/host/**"},
HostInstances: true,
HostMetricInclude: nil,
HostMetricExclude: nil,
HostInclude: []string{"/*/host/**"},
ResourcePoolInstances: false,
ResourcePoolMetricInclude: nil,
ResourcePoolMetricExclude: nil,
ResourcePoolInclude: []string{"/*/host/**"},
VMInstances: true,
VMMetricInclude: nil,
VMMetricExclude: nil,
VMInclude: []string{"/*/vm/**"},
DatastoreInstances: false,
DatastoreMetricInclude: nil,
DatastoreMetricExclude: nil,
DatastoreInclude: []string{"/*/datastore/**"},
VSANMetricInclude: nil,
VSANMetricExclude: []string{"*"},
VSANMetricSkipVerify: false,
VSANClusterInclude: []string{"/*/host/**"},
Separator: "_",
CustomAttributeInclude: []string{},
CustomAttributeExclude: []string{"*"},
UseIntSamples: true,
IPAddresses: []string{},
MaxQueryObjects: 256,
MaxQueryMetrics: 256,
CollectConcurrency: 1,

View File

@ -132,16 +132,15 @@ func defaultVSphere() *VSphere {
DatacenterInclude: []string{"/**"},
ClientConfig: itls.ClientConfig{InsecureSkipVerify: true},
MaxQueryObjects: 256,
MaxQueryMetrics: 256,
ObjectDiscoveryInterval: config.Duration(time.Second * 300),
Timeout: config.Duration(time.Second * 20),
ForceDiscoverOnInit: true,
DiscoverConcurrency: 1,
CollectConcurrency: 1,
Separator: ".",
HistoricalInterval: config.Duration(time.Second * 300),
}
}
@ -414,12 +413,46 @@ func TestFolders(t *testing.T) {
testLookupVM(ctx, t, &f, "/F0/DC1/vm/**/F*/**", 4, "")
}
func TestVsanCmmds(t *testing.T) {
m, s, err := createSim(0)
require.NoError(t, err)
defer m.Remove()
defer s.Close()
v := defaultVSphere()
ctx := context.Background()
c, err := NewClient(ctx, s.URL, v)
require.NoError(t, err)
f := Finder{c}
var clusters []mo.ClusterComputeResource
err = f.FindAll(ctx, "ClusterComputeResource", []string{"/**"}, []string{}, &clusters)
require.NoError(t, err)
clusterObj := object.NewClusterComputeResource(c.Client.Client, clusters[0].Reference())
_, err = getCmmdsMap(ctx, c.Client.Client, clusterObj)
require.Error(t, err)
}
func TestVsanTags(t *testing.T) {
host := "5b860329-3bc4-a76c-48b6-246e963cfcc0"
disk := "52ee3be1-47cc-b50d-ecab-01af0f706381"
ssdDisk := "52f26fc8-0b9b-56d8-3a32-a9c3bfbc6148"
ssd := "52173131-3384-bb63-4ef8-c00b0ce7e3e7"
hostname := "sc2-hs1-b2801.eng.vmware.com"
devName := "naa.55cd2e414d82c815:2"
var cmmds = map[string]CmmdsEntity{
disk: {UUID: disk, Type: "DISK", Owner: host, Content: CmmdsContent{DevName: devName, IsSsd: 1.}},
ssdDisk: {UUID: ssdDisk, Type: "DISK", Owner: host, Content: CmmdsContent{DevName: devName, IsSsd: 0., SsdUUID: ssd}},
host: {UUID: host, Type: "HOSTNAME", Owner: host, Content: CmmdsContent{Hostname: hostname}},
}
tags := populateCMMDSTags(make(map[string]string), "capacity-disk", disk, cmmds)
require.Equal(t, 2, len(tags))
tags = populateCMMDSTags(make(map[string]string), "cache-disk", ssdDisk, cmmds)
require.Equal(t, 3, len(tags))
tags = populateCMMDSTags(make(map[string]string), "host-domclient", host, cmmds)
require.Equal(t, 1, len(tags))
}
func TestCollectionNoClusterMetrics(t *testing.T) {