Add support for additional metrics on Linux in zfs input (#3565)
This commit is contained in:
parent
5397c02570
commit
92acef1664
|
@ -13,8 +13,12 @@ from `sysctl` and `zpool` on FreeBSD.
|
|||
# kstatPath = "/proc/spl/kstat/zfs"
|
||||
|
||||
## By default, telegraf gather all zfs stats
|
||||
## If not specified, then default is:
|
||||
## Override the stats list using the kstatMetrics array:
|
||||
## For FreeBSD, the default is:
|
||||
# kstatMetrics = ["arcstats", "zfetchstats", "vdev_cache_stats"]
|
||||
## For Linux, the default is:
|
||||
# kstatMetrics = ["abdstats", "arcstats", "dnodestats", "dbufcachestats",
|
||||
# "dmu_tx", "fm", "vdev_mirror_stats", "zfetchstats", "zil"]
|
||||
|
||||
## By default, don't gather zpool stats
|
||||
# poolMetrics = false
|
||||
|
@ -22,8 +26,8 @@ from `sysctl` and `zpool` on FreeBSD.
|
|||
|
||||
### Measurements & Fields:
|
||||
|
||||
By default this plugin collects metrics about **Arc**, **Zfetch**, and
|
||||
**Vdev cache**. All these metrics are either counters or measure sizes
|
||||
By default this plugin collects metrics about ZFS internals and pool.
|
||||
These metrics are either counters or measure sizes
|
||||
in bytes. These metrics will be in the `zfs` measurement with the field
|
||||
names listed below.
|
||||
|
||||
|
@ -33,7 +37,7 @@ each pool.
|
|||
- zfs
|
||||
With fields listed below.
|
||||
|
||||
#### Arc Stats
|
||||
#### ARC Stats (FreeBSD and Linux)
|
||||
|
||||
- arcstats_allocated (FreeBSD only)
|
||||
- arcstats_anon_evict_data (Linux only)
|
||||
|
@ -153,7 +157,7 @@ each pool.
|
|||
- arcstats_size
|
||||
- arcstats_sync_wait_for_async (FreeBSD only)
|
||||
|
||||
#### Zfetch Stats
|
||||
#### Zfetch Stats (FreeBSD and Linux)
|
||||
|
||||
- zfetchstats_bogus_streams (Linux only)
|
||||
- zfetchstats_colinear_hits (Linux only)
|
||||
|
@ -168,7 +172,7 @@ each pool.
|
|||
- zfetchstats_stride_hits (Linux only)
|
||||
- zfetchstats_stride_misses (Linux only)
|
||||
|
||||
#### Vdev Cache Stats
|
||||
#### Vdev Cache Stats (FreeBSD)
|
||||
|
||||
- vdev_cache_stats_delegations
|
||||
- vdev_cache_stats_hits
|
||||
|
@ -176,21 +180,21 @@ each pool.
|
|||
|
||||
#### Pool Metrics (optional)
|
||||
|
||||
On Linux:
|
||||
On Linux (reference: kstat accumulated time and queue length statistics):
|
||||
|
||||
- zfs_pool
|
||||
- nread (integer, )
|
||||
- nwritten (integer, )
|
||||
- reads (integer, )
|
||||
- writes (integer, )
|
||||
- wtime (integer, )
|
||||
- wlentime (integer, )
|
||||
- wupdate (integer, )
|
||||
- rtime (integer, )
|
||||
- rlentime (integer, )
|
||||
- rupdate (integer, )
|
||||
- wcnt (integer, )
|
||||
- rcnt (integer, )
|
||||
- nread (integer, bytes)
|
||||
- nwritten (integer, bytes)
|
||||
- reads (integer, count)
|
||||
- writes (integer, count)
|
||||
- wtime (integer, nanoseconds)
|
||||
- wlentime (integer, queuelength * nanoseconds)
|
||||
- wupdate (integer, timestamp)
|
||||
- rtime (integer, nanoseconds)
|
||||
- rlentime (integer, queuelength * nanoseconds)
|
||||
- rupdate (integer, timestamp)
|
||||
- wcnt (integer, count)
|
||||
- rcnt (integer, count)
|
||||
|
||||
On FreeBSD:
|
||||
|
||||
|
@ -224,7 +228,7 @@ $ ./telegraf --config telegraf.conf --input-filter zfs --test
|
|||
|
||||
A short description for some of the metrics.
|
||||
|
||||
#### Arc Stats
|
||||
#### ARC Stats
|
||||
|
||||
`arcstats_hits` Total amount of cache hits in the arc.
|
||||
|
||||
|
@ -283,12 +287,43 @@ A short description for some of the metrics.
|
|||
|
||||
`zfetchstats_hits` Counts the number of cache hits, to items which are in the cache because of the prefetcher.
|
||||
|
||||
`zfetchstats_misses` Counts the number of prefetch cache misses.
|
||||
|
||||
`zfetchstats_colinear_hits` Counts the number of cache hits, to items which are in the cache because of the prefetcher (prefetched linear reads)
|
||||
|
||||
`zfetchstats_stride_hits` Counts the number of cache hits, to items which are in the cache because of the prefetcher (prefetched stride reads)
|
||||
|
||||
#### Vdev Cache Stats
|
||||
#### Vdev Cache Stats (FreeBSD only)
|
||||
note: the vdev cache is deprecated in some ZFS implementations
|
||||
|
||||
`vdev_cache_stats_hits` Hits to the vdev (device level) cache.
|
||||
|
||||
`vdev_cache_stats_misses` Misses to the vdev (device level) cache.
|
||||
|
||||
#### ABD Stats (Linux Only)
|
||||
ABD is a linear/scatter dual typed buffer for ARC
|
||||
|
||||
`abdstats_linear_cnt` number of linear ABDs which are currently allocated
|
||||
|
||||
`abdstats_linear_data_size` amount of data stored in all linear ABDs
|
||||
|
||||
`abdstats_scatter_cnt` number of scatter ABDs which are currently allocated
|
||||
|
||||
`abdstats_scatter_data_size` amount of data stored in all scatter ABDs
|
||||
|
||||
#### DMU Stats (Linux Only)
|
||||
|
||||
`dmu_tx_dirty_throttle` counts when writes are throttled due to the amount of dirty data growing too large
|
||||
|
||||
`dmu_tx_memory_reclaim` counts when memory is low and activity is being throttled
|
||||
|
||||
`dmu_tx_memory_reserve` counts when memory footprint of the txg exceeds the ARC size
|
||||
|
||||
#### Fault Management Ereport errors (Linux Only)
|
||||
|
||||
`fm_erpt-dropped` counts when an error report cannot be created (e.g., available memory is too low)
|
||||
|
||||
#### ZIL (Linux Only)
|
||||
note: ZIL measurements are system-wide, neither per-pool nor per-dataset
|
||||
|
||||
`zil_commit_count` counts when ZFS transactions are committed to a ZIL
|
||||
|
|
|
@ -19,7 +19,9 @@ var sampleConfig = `
|
|||
## By default, telegraf gather all zfs stats
|
||||
## If not specified, then default is:
|
||||
# kstatMetrics = ["arcstats", "zfetchstats", "vdev_cache_stats"]
|
||||
|
||||
## For Linux, the default is:
|
||||
# kstatMetrics = ["abdstats", "arcstats", "dnodestats", "dbufcachestats",
|
||||
# "dmu_tx", "fm", "vdev_mirror_stats", "zfetchstats", "zil"]
|
||||
## By default, don't gather zpool stats
|
||||
# poolMetrics = false
|
||||
`
|
||||
|
|
|
@ -80,7 +80,11 @@ func gatherPoolStats(pool poolInfo, acc telegraf.Accumulator) error {
|
|||
func (z *Zfs) Gather(acc telegraf.Accumulator) error {
|
||||
kstatMetrics := z.KstatMetrics
|
||||
if len(kstatMetrics) == 0 {
|
||||
kstatMetrics = []string{"arcstats", "zfetchstats", "vdev_cache_stats"}
|
||||
// vdev_cache_stats is deprecated
|
||||
// xuio_stats are ignored because as of Sep-2016, no known
|
||||
// consumers of xuio exist on Linux
|
||||
kstatMetrics = []string{"abdstats", "arcstats", "dnodestats", "dbufcachestats",
|
||||
"dmu_tx", "fm", "vdev_mirror_stats", "zfetchstats", "zil"}
|
||||
}
|
||||
|
||||
kstatPath := z.KstatPath
|
||||
|
@ -104,7 +108,7 @@ func (z *Zfs) Gather(acc telegraf.Accumulator) error {
|
|||
for _, metric := range kstatMetrics {
|
||||
lines, err := internal.ReadLines(kstatPath + "/" + metric)
|
||||
if err != nil {
|
||||
return err
|
||||
continue
|
||||
}
|
||||
for i, line := range lines {
|
||||
if i == 0 || i == 1 {
|
||||
|
@ -115,6 +119,9 @@ func (z *Zfs) Gather(acc telegraf.Accumulator) error {
|
|||
}
|
||||
rawData := strings.Split(line, " ")
|
||||
key := metric + "_" + rawData[0]
|
||||
if metric == "zil" || metric == "dmu_tx" || metric == "dnodestats" {
|
||||
key = rawData[0]
|
||||
}
|
||||
rawValue := rawData[len(rawData)-1]
|
||||
value, _ := strconv.ParseInt(rawValue, 10, 64)
|
||||
fields[key] = value
|
||||
|
|
|
@ -115,16 +115,133 @@ streams_resets 4 20989756
|
|||
streams_noresets 4 503182328
|
||||
bogus_streams 4 0
|
||||
`
|
||||
const vdev_cache_statsContents = `7 1 0x01 3 144 23617323692 12081684236238879
|
||||
name type data
|
||||
delegations 4 0
|
||||
hits 4 0
|
||||
misses 4 0
|
||||
`
|
||||
const pool_ioContents = `11 3 0x00 1 80 2225326830828 32953476980628
|
||||
nread nwritten reads writes wtime wlentime wupdate rtime rlentime rupdate wcnt rcnt
|
||||
1884160 6450688 22 978 272187126 2850519036 2263669418655 424226814 2850519036 2263669871823 0 0
|
||||
`
|
||||
const zilContents = `7 1 0x01 14 672 34118481334 437444452158445
|
||||
name type data
|
||||
zil_commit_count 4 77
|
||||
zil_commit_writer_count 4 77
|
||||
zil_itx_count 4 1
|
||||
zil_itx_indirect_count 4 2
|
||||
zil_itx_indirect_bytes 4 3
|
||||
zil_itx_copied_count 4 4
|
||||
zil_itx_copied_bytes 4 5
|
||||
zil_itx_needcopy_count 4 6
|
||||
zil_itx_needcopy_bytes 4 7
|
||||
zil_itx_metaslab_normal_count 4 8
|
||||
zil_itx_metaslab_normal_bytes 4 9
|
||||
zil_itx_metaslab_slog_count 4 10
|
||||
zil_itx_metaslab_slog_bytes 4 11
|
||||
`
|
||||
const fmContents = `0 1 0x01 4 192 34087340971 437562103532892
|
||||
name type data
|
||||
erpt-dropped 4 101
|
||||
erpt-set-failed 4 202
|
||||
fmri-set-failed 4 303
|
||||
payload-set-failed 4 404
|
||||
`
|
||||
const dmu_txContents = `5 1 0x01 11 528 34103260832 437683925071438
|
||||
name type data
|
||||
dmu_tx_assigned 4 39321636
|
||||
dmu_tx_delay 4 111
|
||||
dmu_tx_error 4 222
|
||||
dmu_tx_suspended 4 333
|
||||
dmu_tx_group 4 444
|
||||
dmu_tx_memory_reserve 4 555
|
||||
dmu_tx_memory_reclaim 4 666
|
||||
dmu_tx_dirty_throttle 4 777
|
||||
dmu_tx_dirty_delay 4 888
|
||||
dmu_tx_dirty_over_max 4 999
|
||||
dmu_tx_quota 4 101010
|
||||
`
|
||||
|
||||
const abdstatsContents = `7 1 0x01 21 1008 25476602923533 29223577332204
|
||||
name type data
|
||||
struct_size 4 33840
|
||||
linear_cnt 4 834
|
||||
linear_data_size 4 989696
|
||||
scatter_cnt 4 12
|
||||
scatter_data_size 4 187904
|
||||
scatter_chunk_waste 4 4608
|
||||
scatter_order_0 4 1
|
||||
scatter_order_1 4 21
|
||||
scatter_order_2 4 11
|
||||
scatter_order_3 4 33
|
||||
scatter_order_4 4 44
|
||||
scatter_order_5 4 76
|
||||
scatter_order_6 4 489
|
||||
scatter_order_7 4 237483
|
||||
scatter_order_8 4 233
|
||||
scatter_order_9 4 4411
|
||||
scatter_order_10 4 1023
|
||||
scatter_page_multi_chunk 4 32122
|
||||
scatter_page_multi_zone 4 9930
|
||||
scatter_page_alloc_retry 4 99311
|
||||
scatter_sg_table_retry 4 99221
|
||||
`
|
||||
|
||||
const dbufcachestatsContents = `
|
||||
15 1 0x01 11 2992 6257505590736 8516276189184
|
||||
name type data
|
||||
size 4 242688
|
||||
size_max 4 338944
|
||||
max_bytes 4 62834368
|
||||
lowater_bytes 4 56550932
|
||||
hiwater_bytes 4 69117804
|
||||
total_evicts 4 0
|
||||
hash_collisions 4 0
|
||||
hash_elements 4 31
|
||||
hash_elements_max 4 32
|
||||
hash_chains 4 0
|
||||
hash_chain_max 4 0
|
||||
`
|
||||
|
||||
const dnodestatsContents = `
|
||||
10 1 0x01 28 7616 6257498525011 8671911551753
|
||||
name type data
|
||||
dnode_hold_dbuf_hold 4 0
|
||||
dnode_hold_dbuf_read 4 0
|
||||
dnode_hold_alloc_hits 4 1460
|
||||
dnode_hold_alloc_misses 4 0
|
||||
dnode_hold_alloc_interior 4 0
|
||||
dnode_hold_alloc_lock_retry 4 0
|
||||
dnode_hold_alloc_lock_misses 4 0
|
||||
dnode_hold_alloc_type_none 4 0
|
||||
dnode_hold_free_hits 4 2
|
||||
dnode_hold_free_misses 4 0
|
||||
dnode_hold_free_lock_misses 4 0
|
||||
dnode_hold_free_lock_retry 4 0
|
||||
dnode_hold_free_overflow 4 0
|
||||
dnode_hold_free_refcount 4 0
|
||||
dnode_hold_free_txg 4 0
|
||||
dnode_allocate 4 2
|
||||
dnode_reallocate 4 0
|
||||
dnode_buf_evict 4 6
|
||||
dnode_alloc_next_chunk 4 1
|
||||
dnode_alloc_race 4 0
|
||||
dnode_alloc_next_block 4 0
|
||||
dnode_move_invalid 4 0
|
||||
dnode_move_recheck1 4 0
|
||||
dnode_move_recheck2 4 0
|
||||
dnode_move_special 4 0
|
||||
dnode_move_handle 4 0
|
||||
dnode_move_rwlock 4 0
|
||||
dnode_move_active 4 0
|
||||
`
|
||||
|
||||
const vdevmirrorcachestatsContents = `
|
||||
18 1 0x01 7 1904 6257505684227 9638257816287
|
||||
name type data
|
||||
rotating_linear 4 0
|
||||
rotating_offset 4 0
|
||||
rotating_seek 4 0
|
||||
non_rotating_linear 4 0
|
||||
non_rotating_seek 4 0
|
||||
preferred_found 4 0
|
||||
preferred_not_found 4 43
|
||||
`
|
||||
|
||||
var testKstatPath = os.TempDir() + "/telegraf/proc/spl/kstat/zfs"
|
||||
|
||||
|
@ -183,7 +300,16 @@ func TestZfsGeneratesMetrics(t *testing.T) {
|
|||
err = ioutil.WriteFile(testKstatPath+"/zfetchstats", []byte(zfetchstatsContents), 0644)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = ioutil.WriteFile(testKstatPath+"/vdev_cache_stats", []byte(vdev_cache_statsContents), 0644)
|
||||
err = ioutil.WriteFile(testKstatPath+"/zil", []byte(zilContents), 0644)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = ioutil.WriteFile(testKstatPath+"/fm", []byte(fmContents), 0644)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = ioutil.WriteFile(testKstatPath+"/dmu_tx", []byte(dmu_txContents), 0644)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = ioutil.WriteFile(testKstatPath+"/abdstats", []byte(abdstatsContents), 0644)
|
||||
require.NoError(t, err)
|
||||
|
||||
intMetrics := getKstatMetricsAll()
|
||||
|
@ -339,9 +465,55 @@ func getKstatMetricsAll() map[string]interface{} {
|
|||
"zfetchstats_streams_resets": int64(20989756),
|
||||
"zfetchstats_streams_noresets": int64(503182328),
|
||||
"zfetchstats_bogus_streams": int64(0),
|
||||
"vdev_cache_stats_delegations": int64(0),
|
||||
"vdev_cache_stats_hits": int64(0),
|
||||
"vdev_cache_stats_misses": int64(0),
|
||||
"zil_commit_count": int64(77),
|
||||
"zil_commit_writer_count": int64(77),
|
||||
"zil_itx_count": int64(1),
|
||||
"zil_itx_indirect_count": int64(2),
|
||||
"zil_itx_indirect_bytes": int64(3),
|
||||
"zil_itx_copied_count": int64(4),
|
||||
"zil_itx_copied_bytes": int64(5),
|
||||
"zil_itx_needcopy_count": int64(6),
|
||||
"zil_itx_needcopy_bytes": int64(7),
|
||||
"zil_itx_metaslab_normal_count": int64(8),
|
||||
"zil_itx_metaslab_normal_bytes": int64(9),
|
||||
"zil_itx_metaslab_slog_count": int64(10),
|
||||
"zil_itx_metaslab_slog_bytes": int64(11),
|
||||
"fm_erpt-dropped": int64(101),
|
||||
"fm_erpt-set-failed": int64(202),
|
||||
"fm_fmri-set-failed": int64(303),
|
||||
"fm_payload-set-failed": int64(404),
|
||||
"dmu_tx_assigned": int64(39321636),
|
||||
"dmu_tx_delay": int64(111),
|
||||
"dmu_tx_error": int64(222),
|
||||
"dmu_tx_suspended": int64(333),
|
||||
"dmu_tx_group": int64(444),
|
||||
"dmu_tx_memory_reserve": int64(555),
|
||||
"dmu_tx_memory_reclaim": int64(666),
|
||||
"dmu_tx_dirty_throttle": int64(777),
|
||||
"dmu_tx_dirty_delay": int64(888),
|
||||
"dmu_tx_dirty_over_max": int64(999),
|
||||
"dmu_tx_quota": int64(101010),
|
||||
"abdstats_struct_size": int64(33840),
|
||||
"abdstats_linear_cnt": int64(834),
|
||||
"abdstats_linear_data_size": int64(989696),
|
||||
"abdstats_scatter_cnt": int64(12),
|
||||
"abdstats_scatter_data_size": int64(187904),
|
||||
"abdstats_scatter_chunk_waste": int64(4608),
|
||||
"abdstats_scatter_order_0": int64(1),
|
||||
"abdstats_scatter_order_1": int64(21),
|
||||
"abdstats_scatter_order_2": int64(11),
|
||||
"abdstats_scatter_order_3": int64(33),
|
||||
"abdstats_scatter_order_4": int64(44),
|
||||
"abdstats_scatter_order_5": int64(76),
|
||||
"abdstats_scatter_order_6": int64(489),
|
||||
"abdstats_scatter_order_7": int64(237483),
|
||||
"abdstats_scatter_order_8": int64(233),
|
||||
"abdstats_scatter_order_9": int64(4411),
|
||||
"abdstats_scatter_order_10": int64(1023),
|
||||
"abdstats_scatter_page_multi_chunk": int64(32122),
|
||||
"abdstats_scatter_page_multi_zone": int64(9930),
|
||||
"abdstats_scatter_page_alloc_retry": int64(99311),
|
||||
"abdstats_scatter_sg_table_retry": int64(99221),
|
||||
}
|
||||
arcMetrics := getKstatMetricsArcOnly()
|
||||
for k, v := range otherMetrics {
|
||||
|
|
Loading…
Reference in New Issue