From f13afea7d9ffa752e3406b7ad8e0d944a8f4a16c Mon Sep 17 00:00:00 2001 From: Richard Elling Date: Wed, 3 Jan 2018 17:45:48 -0800 Subject: [PATCH] Add support for additional metrics on Linux in zfs input (#3565) --- plugins/inputs/zfs/README.md | 77 +++++++--- plugins/inputs/zfs/zfs.go | 4 +- plugins/inputs/zfs/zfs_linux.go | 11 +- plugins/inputs/zfs/zfs_linux_test.go | 214 ++++++++++++++++++++++++--- 4 files changed, 261 insertions(+), 45 deletions(-) diff --git a/plugins/inputs/zfs/README.md b/plugins/inputs/zfs/README.md index 3215aaf3f..b60711e30 100644 --- a/plugins/inputs/zfs/README.md +++ b/plugins/inputs/zfs/README.md @@ -13,8 +13,12 @@ from `sysctl` and `zpool` on FreeBSD. # kstatPath = "/proc/spl/kstat/zfs" ## By default, telegraf gather all zfs stats - ## If not specified, then default is: + ## Override the stats list using the kstatMetrics array: + ## For FreeBSD, the default is: # kstatMetrics = ["arcstats", "zfetchstats", "vdev_cache_stats"] + ## For Linux, the default is: + # kstatMetrics = ["abdstats", "arcstats", "dnodestats", "dbufcachestats", + # "dmu_tx", "fm", "vdev_mirror_stats", "zfetchstats", "zil"] ## By default, don't gather zpool stats # poolMetrics = false @@ -22,8 +26,8 @@ from `sysctl` and `zpool` on FreeBSD. ### Measurements & Fields: -By default this plugin collects metrics about **Arc**, **Zfetch**, and -**Vdev cache**. All these metrics are either counters or measure sizes +By default this plugin collects metrics about ZFS internals and pool. +These metrics are either counters or measure sizes in bytes. These metrics will be in the `zfs` measurement with the field names listed bellow. @@ -33,7 +37,7 @@ each pool. - zfs With fields listed bellow. -#### Arc Stats +#### ARC Stats (FreeBSD and Linux) - arcstats_allocated (FreeBSD only) - arcstats_anon_evict_data (Linux only) @@ -153,7 +157,7 @@ each pool. 
- arcstats_size - arcstats_sync_wait_for_async (FreeBSD only) -#### Zfetch Stats +#### Zfetch Stats (FreeBSD and Linux) - zfetchstats_bogus_streams (Linux only) - zfetchstats_colinear_hits (Linux only) @@ -168,7 +172,7 @@ each pool. - zfetchstats_stride_hits (Linux only) - zfetchstats_stride_misses (Linux only) -#### Vdev Cache Stats +#### Vdev Cache Stats (FreeBSD) - vdev_cache_stats_delegations - vdev_cache_stats_hits @@ -176,21 +180,21 @@ each pool. #### Pool Metrics (optional) -On Linux: +On Linux (reference: kstat accumulated time and queue length statistics): - zfs_pool - - nread (integer, ) - - nwritten (integer, ) - - reads (integer, ) - - writes (integer, ) - - wtime (integer, ) - - wlentime (integer, ) - - wupdate (integer, ) - - rtime (integer, ) - - rlentime (integer, ) - - rupdate (integer, ) - - wcnt (integer, ) - - rcnt (integer, ) + - nread (integer, bytes) + - nwritten (integer, bytes) + - reads (integer, count) + - writes (integer, count) + - wtime (integer, nanoseconds) + - wlentime (integer, queuelength * nanoseconds) + - wupdate (integer, timestamp) + - rtime (integer, nanoseconds) + - rlentime (integer, queuelength * nanoseconds) + - rupdate (integer, timestamp) + - wcnt (integer, count) + - rcnt (integer, count) On FreeBSD: @@ -224,7 +228,7 @@ $ ./telegraf --config telegraf.conf --input-filter zfs --test A short description for some of the metrics. -#### Arc Stats +#### ARC Stats `arcstats_hits` Total amount of cache hits in the arc. @@ -283,12 +287,43 @@ A short description for some of the metrics. `zfetchstats_hits` Counts the number of cache hits, to items which are in the cache because of the prefetcher. +`zfetchstats_misses` Counts the number of prefetch cache misses. 
+ `zfetchstats_colinear_hits` Counts the number of cache hits, to items which are in the cache because of the prefetcher (prefetched linear reads) `zfetchstats_stride_hits` Counts the number of cache hits, to items which are in the cache because of the prefetcher (prefetched stride reads) -#### Vdev Cache Stats +#### Vdev Cache Stats (FreeBSD only) +note: the vdev cache is deprecated in some ZFS implementations `vdev_cache_stats_hits` Hits to the vdev (device level) cache. `vdev_cache_stats_misses` Misses to the vdev (device level) cache. + +#### ABD Stats (Linux Only) +ABD is a linear/scatter dual typed buffer for ARC + +`abdstats_linear_cnt` number of linear ABDs which are currently allocated + +`abdstats_linear_data_size` amount of data stored in all linear ABDs + +`abdstats_scatter_cnt` number of scatter ABDs which are currently allocated + +`abdstats_scatter_data_size` amount of data stored in all scatter ABDs + +#### DMU Stats (Linux Only) + +`dmu_tx_dirty_throttle` counts when writes are throttled due to the amount of dirty data growing too large + +`dmu_tx_memory_reclaim` counts when memory is low and activity is throttled + +`dmu_tx_memory_reserve` counts when memory footprint of the txg exceeds the ARC size + +#### Fault Management Ereport errors (Linux Only) + +`fm_erpt-dropped` counts when an error report cannot be created (e.g., available memory is too low) + +#### ZIL (Linux Only) +note: ZIL measurements are system-wide, neither per-pool nor per-dataset + +`zil_commit_count` counts when ZFS transactions are committed to a ZIL diff --git a/plugins/inputs/zfs/zfs.go b/plugins/inputs/zfs/zfs.go index 05ca346b0..8e6bec464 100644 --- a/plugins/inputs/zfs/zfs.go +++ b/plugins/inputs/zfs/zfs.go @@ -19,7 +19,9 @@ var sampleConfig = ` ## By default, telegraf gather all zfs stats ## If not specified, then default is: # kstatMetrics = ["arcstats", "zfetchstats", "vdev_cache_stats"] - + ## For Linux, the default is: + # kstatMetrics = ["abdstats", "arcstats", 
"dnodestats", "dbufcachestats", + # "dmu_tx", "fm", "vdev_mirror_stats", "zfetchstats", "zil"] ## By default, don't gather zpool stats # poolMetrics = false ` diff --git a/plugins/inputs/zfs/zfs_linux.go b/plugins/inputs/zfs/zfs_linux.go index 71ec7e5dc..276880d7d 100644 --- a/plugins/inputs/zfs/zfs_linux.go +++ b/plugins/inputs/zfs/zfs_linux.go @@ -80,7 +80,11 @@ func gatherPoolStats(pool poolInfo, acc telegraf.Accumulator) error { func (z *Zfs) Gather(acc telegraf.Accumulator) error { kstatMetrics := z.KstatMetrics if len(kstatMetrics) == 0 { - kstatMetrics = []string{"arcstats", "zfetchstats", "vdev_cache_stats"} + // vdev_cache_stats is deprecated + // xuio_stats are ignored because as of Sep-2016, no known + // consumers of xuio exist on Linux + kstatMetrics = []string{"abdstats", "arcstats", "dnodestats", "dbufcachestats", + "dmu_tx", "fm", "vdev_mirror_stats", "zfetchstats", "zil"} } kstatPath := z.KstatPath @@ -104,7 +108,7 @@ func (z *Zfs) Gather(acc telegraf.Accumulator) error { for _, metric := range kstatMetrics { lines, err := internal.ReadLines(kstatPath + "/" + metric) if err != nil { - return err + continue } for i, line := range lines { if i == 0 || i == 1 { @@ -115,6 +119,9 @@ func (z *Zfs) Gather(acc telegraf.Accumulator) error { } rawData := strings.Split(line, " ") key := metric + "_" + rawData[0] + if metric == "zil" || metric == "dmu_tx" || metric == "dnodestats" { + key = rawData[0] + } rawValue := rawData[len(rawData)-1] value, _ := strconv.ParseInt(rawValue, 10, 64) fields[key] = value diff --git a/plugins/inputs/zfs/zfs_linux_test.go b/plugins/inputs/zfs/zfs_linux_test.go index c4db75ff5..133d1cafa 100644 --- a/plugins/inputs/zfs/zfs_linux_test.go +++ b/plugins/inputs/zfs/zfs_linux_test.go @@ -115,16 +115,133 @@ streams_resets 4 20989756 streams_noresets 4 503182328 bogus_streams 4 0 ` -const vdev_cache_statsContents = `7 1 0x01 3 144 23617323692 12081684236238879 -name type data -delegations 4 0 -hits 4 0 -misses 4 0 -` const 
pool_ioContents = `11 3 0x00 1 80 2225326830828 32953476980628 nread nwritten reads writes wtime wlentime wupdate rtime rlentime rupdate wcnt rcnt 1884160 6450688 22 978 272187126 2850519036 2263669418655 424226814 2850519036 2263669871823 0 0 ` +const zilContents = `7 1 0x01 14 672 34118481334 437444452158445 +name type data +zil_commit_count 4 77 +zil_commit_writer_count 4 77 +zil_itx_count 4 1 +zil_itx_indirect_count 4 2 +zil_itx_indirect_bytes 4 3 +zil_itx_copied_count 4 4 +zil_itx_copied_bytes 4 5 +zil_itx_needcopy_count 4 6 +zil_itx_needcopy_bytes 4 7 +zil_itx_metaslab_normal_count 4 8 +zil_itx_metaslab_normal_bytes 4 9 +zil_itx_metaslab_slog_count 4 10 +zil_itx_metaslab_slog_bytes 4 11 +` +const fmContents = `0 1 0x01 4 192 34087340971 437562103532892 +name type data +erpt-dropped 4 101 +erpt-set-failed 4 202 +fmri-set-failed 4 303 +payload-set-failed 4 404 +` +const dmu_txContents = `5 1 0x01 11 528 34103260832 437683925071438 +name type data +dmu_tx_assigned 4 39321636 +dmu_tx_delay 4 111 +dmu_tx_error 4 222 +dmu_tx_suspended 4 333 +dmu_tx_group 4 444 +dmu_tx_memory_reserve 4 555 +dmu_tx_memory_reclaim 4 666 +dmu_tx_dirty_throttle 4 777 +dmu_tx_dirty_delay 4 888 +dmu_tx_dirty_over_max 4 999 +dmu_tx_quota 4 101010 +` + +const abdstatsContents = `7 1 0x01 21 1008 25476602923533 29223577332204 +name type data +struct_size 4 33840 +linear_cnt 4 834 +linear_data_size 4 989696 +scatter_cnt 4 12 +scatter_data_size 4 187904 +scatter_chunk_waste 4 4608 +scatter_order_0 4 1 +scatter_order_1 4 21 +scatter_order_2 4 11 +scatter_order_3 4 33 +scatter_order_4 4 44 +scatter_order_5 4 76 +scatter_order_6 4 489 +scatter_order_7 4 237483 +scatter_order_8 4 233 +scatter_order_9 4 4411 +scatter_order_10 4 1023 +scatter_page_multi_chunk 4 32122 +scatter_page_multi_zone 4 9930 +scatter_page_alloc_retry 4 99311 +scatter_sg_table_retry 4 99221 +` + +const dbufcachestatsContents = ` +15 1 0x01 11 2992 6257505590736 8516276189184 +name type data +size 4 242688 +size_max 4 338944 
+max_bytes 4 62834368 +lowater_bytes 4 56550932 +hiwater_bytes 4 69117804 +total_evicts 4 0 +hash_collisions 4 0 +hash_elements 4 31 +hash_elements_max 4 32 +hash_chains 4 0 +hash_chain_max 4 0 +` + +const dnodestatsContents = ` +10 1 0x01 28 7616 6257498525011 8671911551753 +name type data +dnode_hold_dbuf_hold 4 0 +dnode_hold_dbuf_read 4 0 +dnode_hold_alloc_hits 4 1460 +dnode_hold_alloc_misses 4 0 +dnode_hold_alloc_interior 4 0 +dnode_hold_alloc_lock_retry 4 0 +dnode_hold_alloc_lock_misses 4 0 +dnode_hold_alloc_type_none 4 0 +dnode_hold_free_hits 4 2 +dnode_hold_free_misses 4 0 +dnode_hold_free_lock_misses 4 0 +dnode_hold_free_lock_retry 4 0 +dnode_hold_free_overflow 4 0 +dnode_hold_free_refcount 4 0 +dnode_hold_free_txg 4 0 +dnode_allocate 4 2 +dnode_reallocate 4 0 +dnode_buf_evict 4 6 +dnode_alloc_next_chunk 4 1 +dnode_alloc_race 4 0 +dnode_alloc_next_block 4 0 +dnode_move_invalid 4 0 +dnode_move_recheck1 4 0 +dnode_move_recheck2 4 0 +dnode_move_special 4 0 +dnode_move_handle 4 0 +dnode_move_rwlock 4 0 +dnode_move_active 4 0 +` + +const vdevmirrorcachestatsContents = ` +18 1 0x01 7 1904 6257505684227 9638257816287 +name type data +rotating_linear 4 0 +rotating_offset 4 0 +rotating_seek 4 0 +non_rotating_linear 4 0 +non_rotating_seek 4 0 +preferred_found 4 0 +preferred_not_found 4 43 +` var testKstatPath = os.TempDir() + "/telegraf/proc/spl/kstat/zfs" @@ -183,7 +300,16 @@ func TestZfsGeneratesMetrics(t *testing.T) { err = ioutil.WriteFile(testKstatPath+"/zfetchstats", []byte(zfetchstatsContents), 0644) require.NoError(t, err) - err = ioutil.WriteFile(testKstatPath+"/vdev_cache_stats", []byte(vdev_cache_statsContents), 0644) + err = ioutil.WriteFile(testKstatPath+"/zil", []byte(zilContents), 0644) + require.NoError(t, err) + + err = ioutil.WriteFile(testKstatPath+"/fm", []byte(fmContents), 0644) + require.NoError(t, err) + + err = ioutil.WriteFile(testKstatPath+"/dmu_tx", []byte(dmu_txContents), 0644) + require.NoError(t, err) + + err = 
ioutil.WriteFile(testKstatPath+"/abdstats", []byte(abdstatsContents), 0644) require.NoError(t, err) intMetrics := getKstatMetricsAll() @@ -328,20 +454,66 @@ func getKstatMetricsArcOnly() map[string]interface{} { func getKstatMetricsAll() map[string]interface{} { otherMetrics := map[string]interface{}{ - "zfetchstats_hits": int64(7812959060), - "zfetchstats_misses": int64(4154484207), - "zfetchstats_colinear_hits": int64(1366368), - "zfetchstats_colinear_misses": int64(4153117839), - "zfetchstats_stride_hits": int64(7309776732), - "zfetchstats_stride_misses": int64(222766182), - "zfetchstats_reclaim_successes": int64(107788388), - "zfetchstats_reclaim_failures": int64(4045329451), - "zfetchstats_streams_resets": int64(20989756), - "zfetchstats_streams_noresets": int64(503182328), - "zfetchstats_bogus_streams": int64(0), - "vdev_cache_stats_delegations": int64(0), - "vdev_cache_stats_hits": int64(0), - "vdev_cache_stats_misses": int64(0), + "zfetchstats_hits": int64(7812959060), + "zfetchstats_misses": int64(4154484207), + "zfetchstats_colinear_hits": int64(1366368), + "zfetchstats_colinear_misses": int64(4153117839), + "zfetchstats_stride_hits": int64(7309776732), + "zfetchstats_stride_misses": int64(222766182), + "zfetchstats_reclaim_successes": int64(107788388), + "zfetchstats_reclaim_failures": int64(4045329451), + "zfetchstats_streams_resets": int64(20989756), + "zfetchstats_streams_noresets": int64(503182328), + "zfetchstats_bogus_streams": int64(0), + "zil_commit_count": int64(77), + "zil_commit_writer_count": int64(77), + "zil_itx_count": int64(1), + "zil_itx_indirect_count": int64(2), + "zil_itx_indirect_bytes": int64(3), + "zil_itx_copied_count": int64(4), + "zil_itx_copied_bytes": int64(5), + "zil_itx_needcopy_count": int64(6), + "zil_itx_needcopy_bytes": int64(7), + "zil_itx_metaslab_normal_count": int64(8), + "zil_itx_metaslab_normal_bytes": int64(9), + "zil_itx_metaslab_slog_count": int64(10), + "zil_itx_metaslab_slog_bytes": int64(11), + 
"fm_erpt-dropped": int64(101), + "fm_erpt-set-failed": int64(202), + "fm_fmri-set-failed": int64(303), + "fm_payload-set-failed": int64(404), + "dmu_tx_assigned": int64(39321636), + "dmu_tx_delay": int64(111), + "dmu_tx_error": int64(222), + "dmu_tx_suspended": int64(333), + "dmu_tx_group": int64(444), + "dmu_tx_memory_reserve": int64(555), + "dmu_tx_memory_reclaim": int64(666), + "dmu_tx_dirty_throttle": int64(777), + "dmu_tx_dirty_delay": int64(888), + "dmu_tx_dirty_over_max": int64(999), + "dmu_tx_quota": int64(101010), + "abdstats_struct_size": int64(33840), + "abdstats_linear_cnt": int64(834), + "abdstats_linear_data_size": int64(989696), + "abdstats_scatter_cnt": int64(12), + "abdstats_scatter_data_size": int64(187904), + "abdstats_scatter_chunk_waste": int64(4608), + "abdstats_scatter_order_0": int64(1), + "abdstats_scatter_order_1": int64(21), + "abdstats_scatter_order_2": int64(11), + "abdstats_scatter_order_3": int64(33), + "abdstats_scatter_order_4": int64(44), + "abdstats_scatter_order_5": int64(76), + "abdstats_scatter_order_6": int64(489), + "abdstats_scatter_order_7": int64(237483), + "abdstats_scatter_order_8": int64(233), + "abdstats_scatter_order_9": int64(4411), + "abdstats_scatter_order_10": int64(1023), + "abdstats_scatter_page_multi_chunk": int64(32122), + "abdstats_scatter_page_multi_zone": int64(9930), + "abdstats_scatter_page_alloc_retry": int64(99311), + "abdstats_scatter_sg_table_retry": int64(99221), } arcMetrics := getKstatMetricsArcOnly() for k, v := range otherMetrics {