diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..80edb3f0c --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +CHANGELOG.md merge=union + diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d07fb9f9..f05121d06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,98 @@ -## v0.12.1 [unreleased] +## v0.13 [unreleased] + +### Release Notes + +- **Breaking change** in jolokia plugin. See +https://github.com/influxdata/telegraf/blob/master/plugins/inputs/jolokia/README.md +for updated configuration. The plugin will now support proxy mode and will make +POST requests. + +- New [agent] configuration option: `metric_batch_size`. This option tells +telegraf the maximum batch size to allow to accumulate before sending a flush +to the configured outputs. `metric_buffer_limit` now refers to the absolute +maximum number of metrics that will accumulate before metrics are dropped. + +- There is no longer an option to +`flush_buffer_when_full`, this is now the default and only behavior of telegraf. + +- **Breaking Change**: docker plugin tags. The cont_id tag no longer exists, it +will now be a field, and be called container_id. Additionally, cont_image and +cont_name are being renamed to container_image and container_name. + +- **Breaking Change**: docker plugin measurements. The `docker_cpu`, `docker_mem`, +`docker_blkio` and `docker_net` measurements are being renamed to +`docker_container_cpu`, `docker_container_mem`, `docker_container_blkio` and +`docker_container_net`. Why? Because these metrics are +specifically tracking per-container stats. The problem with per-container stats, +in some use-cases, is that if containers are short-lived AND names are not +kept consistent, then the series cardinality will balloon very quickly. +So adding "container" to each metric will: +(1) make it more clear that these metrics are per-container, and +(2) allow users to easily drop per-container metrics if cardinality is an +issue (`namedrop = ["docker_container_*"]`) + +- `tagexclude` and `taginclude` are now available, which can be used to remove +tags from measurements on inputs and outputs. See +[the configuration doc](https://github.com/influxdata/telegraf/blob/master/docs/CONFIGURATION.md) +for more details. + +- **Measurement filtering:** All measurement filters now match based on glob +only. Previously there was an undocumented behavior where filters would match +based on _prefix_ in addition to globs. This means that a filter like +`fielddrop = ["time_"]` will need to be changed to `fielddrop = ["time_*"]` + +- **datadog**: measurement and field names will no longer have `_` replaced by `.` + +- The following plugins have changed their tags to _not_ overwrite the host tag: + - cassandra: `host -> cassandra_host` + - disque: `host -> disque_host` + - rethinkdb: `host -> rethinkdb_host` + +- **Breaking Change**: The `win_perf_counters` input has been changed to sanitize field names, replacing `/Sec` and `/sec` with `_persec`, as well as spaces with underscores. This is needed because Graphite doesn't like slashes and spaces, and was failing to accept metrics that had them. The `/[sS]ec` -> `_persec` is just to make things clearer and uniform. + +### Features + +- [#1031](https://github.com/influxdata/telegraf/pull/1031): Jolokia plugin proxy mode. Thanks @saiello! +- [#1017](https://github.com/influxdata/telegraf/pull/1017): taginclude and tagexclude arguments. +- [#1015](https://github.com/influxdata/telegraf/pull/1015): Docker plugin schema refactor. 
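The glob-only filter matching described in the release notes above is the most common upgrade surprise. As a minimal illustration (not part of this patch), the snippet below uses the github.com/gobwas/glob library that this change vendors in Godeps to show why a bare prefix such as `time_` no longer matches and now needs a trailing `*`:

```go
package main

import (
	"fmt"

	"github.com/gobwas/glob"
)

func main() {
	// Old behaviour: "time_" also matched as a prefix. With glob-only
	// matching, a bare pattern only matches the literal string.
	prefixStyle := glob.MustCompile("time_")
	globStyle := glob.MustCompile("time_*")

	fmt.Println(prefixStyle.Match("time_user")) // false: not a glob match
	fmt.Println(globStyle.Match("time_user"))   // true: wildcard required
}
```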
+- [#889](https://github.com/influxdata/telegraf/pull/889): Improved MySQL plugin. Thanks @maksadbek! +- [#1060](https://github.com/influxdata/telegraf/pull/1060): TTL metrics added to MongoDB input plugin +- [#1056](https://github.com/influxdata/telegraf/pull/1056): Don't allow inputs to overwrite host tags. +- [#1035](https://github.com/influxdata/telegraf/issues/1035): Add `user`, `exe`, `pidfile` tags to procstat plugin. +- [#1041](https://github.com/influxdata/telegraf/issues/1041): Add `n_cpus` field to the system plugin. +- [#1072](https://github.com/influxdata/telegraf/pull/1072): New Input Plugin: filestat. +- [#1066](https://github.com/influxdata/telegraf/pull/1066): Replication lag metrics for MongoDB input plugin +- [#1086](https://github.com/influxdata/telegraf/pull/1086): Ability to specify AWS keys in config file. Thanks @johnrengleman! +- [#1096](https://github.com/influxdata/telegraf/pull/1096): Performance refactor of running output buffers. +- [#967](https://github.com/influxdata/telegraf/issues/967): Buffer logging improvements. +- [#1107](https://github.com/influxdata/telegraf/issues/1107): Support lustre2 job stats. Thanks @hanleyja! +- [#1122](https://github.com/influxdata/telegraf/pull/1122): Support setting config path through env variable and default paths. +- [#1128](https://github.com/influxdata/telegraf/pull/1128): MongoDB jumbo chunks metric for MongoDB input plugin + +### Bugfixes + +- [#1050](https://github.com/influxdata/telegraf/issues/1050): jolokia plugin - do not overwrite host tag. Thanks @saiello! +- [#921](https://github.com/influxdata/telegraf/pull/921): mqtt_consumer stops gathering metrics. Thanks @chaton78! +- [#1013](https://github.com/influxdata/telegraf/pull/1013): Close dead riemann output connections. Thanks @echupriyanov! +- [#1012](https://github.com/influxdata/telegraf/pull/1012): Set default tags in test accumulator. +- [#1024](https://github.com/influxdata/telegraf/issues/1024): Don't replace `.` with `_` in datadog output. +- [#1058](https://github.com/influxdata/telegraf/issues/1058): Fix possible leaky TCP connections in influxdb output. +- [#1044](https://github.com/influxdata/telegraf/pull/1044): Fix SNMP OID possible collisions. Thanks @relip +- [#1022](https://github.com/influxdata/telegraf/issues/1022): Dont error deb/rpm install on systemd errors. +- [#1078](https://github.com/influxdata/telegraf/issues/1078): Use default AWS credential chain. +- [#1070](https://github.com/influxdata/telegraf/issues/1070): SQL Server input. Fix datatype conversion. +- [#1089](https://github.com/influxdata/telegraf/issues/1089): Fix leaky TCP connections in phpfpm plugin. +- [#914](https://github.com/influxdata/telegraf/issues/914): Telegraf can drop metrics on full buffers. +- [#1098](https://github.com/influxdata/telegraf/issues/1098): Sanitize invalid OpenTSDB characters. +- [#1110](https://github.com/influxdata/telegraf/pull/1110): Sanitize * to - in graphite serializer. Thanks @goodeggs! +- [#1118](https://github.com/influxdata/telegraf/pull/1118): Sanitize Counter names for `win_perf_counters` input. +- [#1125](https://github.com/influxdata/telegraf/pull/1125): Wrap all exec command runners with a timeout, so hung os processes don't halt Telegraf. +- [#1113](https://github.com/influxdata/telegraf/pull/1113): Set MaxRetry and RequiredAcks defaults in Kafka output. +- [#1090](https://github.com/influxdata/telegraf/issues/1090): [agent] and [global_tags] config sometimes not getting applied. 
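To make the new buffering model concrete, here is a small illustrative test (not part of the patch) written against the internal/buffer package added later in this diff. It shows the two knobs from the release notes: `metric_buffer_limit` caps the per-output cache and drops the oldest metrics when full, while `metric_batch_size` bounds how many metrics a single flush hands to an output. The metric name and sizes are made up.

```go
package buffer

import (
	"testing"

	"github.com/influxdata/telegraf/testutil"
)

// Illustrative only: exercises the drop-oldest and batching behaviour that
// backs the new metric_buffer_limit and metric_batch_size settings.
func TestBufferDropOldestAndBatchExample(t *testing.T) {
	b := NewBuffer(3) // think: metric_buffer_limit = 3
	for i := 0; i < 5; i++ {
		b.Add(testutil.TestMetric(float64(i), "cpu"))
	}
	// Only the newest 3 metrics are kept; the 2 oldest were dropped.
	if b.Len() != 3 || b.Drops() != 2 || b.Total() != 5 {
		t.Fatalf("unexpected state: len=%d drops=%d total=%d",
			b.Len(), b.Drops(), b.Total())
	}
	// A flush asks for at most metric_batch_size metrics at a time.
	if batch := b.Batch(2); len(batch) != 2 {
		t.Fatalf("expected a batch of 2, got %d", len(batch))
	}
}
```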
+- [#1133](https://github.com/influxdata/telegraf/issues/1133): Use a timeout for docker list & stat cmds. +- [#1052](https://github.com/influxdata/telegraf/issues/1052): Docker panic fix when decode fails. + +## v0.12.1 [2016-04-14] ### Release Notes - Breaking change in the dovecot input plugin. See Features section below. diff --git a/Godeps b/Godeps index 14430ea5d..9dd6a83ed 100644 --- a/Godeps +++ b/Godeps @@ -16,14 +16,16 @@ github.com/eapache/go-resiliency b86b1ec0dd4209a588dc1285cdd471e73525c0b3 github.com/eapache/queue ded5959c0d4e360646dc9e9908cff48666781367 github.com/eclipse/paho.mqtt.golang 0f7a459f04f13a41b7ed752d47944528d4bf9a86 github.com/go-sql-driver/mysql 1fca743146605a172a266e1654e01e5cd5669bee +github.com/gobwas/glob d877f6352135181470c40c73ebb81aefa22115fa github.com/golang/protobuf 552c7b9542c194800fd493123b3798ef0a832032 github.com/golang/snappy 427fb6fc07997f43afa32f35e850833760e489a7 github.com/gonuts/go-shellquote e842a11b24c6abfb3dd27af69a17f482e4b483c2 github.com/gorilla/context 1ea25387ff6f684839d82767c1733ff4d4d15d0a github.com/gorilla/mux c9e326e2bdec29039a3761c07bece13133863e1e github.com/hailocab/go-hostpool e80d13ce29ede4452c43dea11e79b9bc8a15b478 +github.com/hpcloud/tail b2940955ab8b26e19d43a43c4da0475dd81bdb56 github.com/influxdata/config b79f6829346b8d6e78ba73544b1e1038f1f1c9da -github.com/influxdata/influxdb e3fef5593c21644f2b43af55d6e17e70910b0e48 +github.com/influxdata/influxdb 21db76b3374c733f37ed16ad93f3484020034351 github.com/influxdata/toml af4df43894b16e3fd2b788d01bd27ad0776ef2d0 github.com/klauspost/crc32 19b0b332c9e4516a6370a0456e6182c3b5036720 github.com/lib/pq e182dc4027e2ded4b19396d638610f2653295f36 diff --git a/README.md b/README.md index 57cff90e7..8084ba790 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,12 @@ new plugins. ### Linux deb and rpm Packages: Latest: -* http://get.influxdb.org/telegraf/telegraf_0.12.0-1_amd64.deb -* http://get.influxdb.org/telegraf/telegraf-0.12.0-1.x86_64.rpm +* http://get.influxdb.org/telegraf/telegraf_0.12.1-1_amd64.deb +* http://get.influxdb.org/telegraf/telegraf-0.12.1-1.x86_64.rpm Latest (arm): -* http://get.influxdb.org/telegraf/telegraf_0.12.0-1_armhf.deb -* http://get.influxdb.org/telegraf/telegraf-0.12.0-1.armhf.rpm +* http://get.influxdb.org/telegraf/telegraf_0.12.1-1_armhf.deb +* http://get.influxdb.org/telegraf/telegraf-0.12.1-1.armhf.rpm ##### Package Instructions: @@ -46,28 +46,28 @@ to use this repo to install & update telegraf. 
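The github.com/gobwas/glob dependency added to Godeps above backs the new internal/globpath package introduced later in this diff, which the filestat and tail inputs use to resolve "super asterisk" file patterns. Below is a rough sketch of resolving such a pattern, written as an in-package test because internal packages cannot be imported from outside the telegraf tree; the path is only an example.

```go
package globpath

import (
	"fmt"
	"testing"
)

// Illustrative only: resolves a "super asterisk" pattern the way the new
// filestat and tail inputs do. The path below is just an example.
func TestSuperAsteriskExample(t *testing.T) {
	g, err := Compile("/var/log/**.log")
	if err != nil {
		t.Fatal(err)
	}
	// Match walks from the pattern's root directory and returns every
	// file whose full path satisfies the glob.
	for path := range g.Match() {
		fmt.Println(path)
	}
}
```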
### Linux tarballs: Latest: -* http://get.influxdb.org/telegraf/telegraf-0.12.0-1_linux_amd64.tar.gz -* http://get.influxdb.org/telegraf/telegraf-0.12.0-1_linux_i386.tar.gz -* http://get.influxdb.org/telegraf/telegraf-0.12.0-1_linux_armhf.tar.gz +* http://get.influxdb.org/telegraf/telegraf-0.12.1-1_linux_amd64.tar.gz +* http://get.influxdb.org/telegraf/telegraf-0.12.1-1_linux_i386.tar.gz +* http://get.influxdb.org/telegraf/telegraf-0.12.1-1_linux_armhf.tar.gz ##### tarball Instructions: To install the full directory structure with config file, run: ``` -sudo tar -C / -zxvf ./telegraf-0.12.0-1_linux_amd64.tar.gz +sudo tar -C / -zxvf ./telegraf-0.12.1-1_linux_amd64.tar.gz ``` To extract only the binary, run: ``` -tar -zxvf telegraf-0.12.0-1_linux_amd64.tar.gz --strip-components=3 ./usr/bin/telegraf +tar -zxvf telegraf-0.12.1-1_linux_amd64.tar.gz --strip-components=3 ./usr/bin/telegraf ``` ### FreeBSD tarball: Latest: -* http://get.influxdb.org/telegraf/telegraf-0.12.0-1_freebsd_amd64.tar.gz +* http://get.influxdb.org/telegraf/telegraf-0.12.1-1_freebsd_amd64.tar.gz ##### tarball Instructions: @@ -87,8 +87,8 @@ brew install telegraf ### Windows Binaries (EXPERIMENTAL) Latest: -* http://get.influxdb.org/telegraf/telegraf-0.12.0-1_windows_amd64.zip -* http://get.influxdb.org/telegraf/telegraf-0.12.0-1_windows_i386.zip +* http://get.influxdb.org/telegraf/telegraf-0.12.1-1_windows_amd64.zip +* http://get.influxdb.org/telegraf/telegraf-0.12.1-1_windows_i386.zip ### From Source: @@ -168,7 +168,8 @@ Currently implemented sources: * [docker](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/docker) * [dovecot](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/dovecot) * [elasticsearch](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/elasticsearch) -* [exec](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/exec ) (generic executable plugin, support JSON, influx, graphite and nagios) +* [exec](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/exec) (generic executable plugin, support JSON, influx, graphite and nagios) +* [filestat](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/filestat) * [haproxy](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/haproxy) * [http_response](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/http_response) * [httpjson](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/httpjson) (generic JSON-emitting http service plugin) diff --git a/agent/accumulator.go b/agent/accumulator.go index 7ec22cd7f..70744359f 100644 --- a/agent/accumulator.go +++ b/agent/accumulator.go @@ -84,18 +84,15 @@ func (ac *accumulator) AddFields( if tags == nil { tags = make(map[string]string) } - // Apply plugin-wide tags if set - for k, v := range ac.inputConfig.Tags { - if _, ok := tags[k]; !ok { - tags[k] = v - } - } // Apply daemon-wide tags if set for k, v := range ac.defaultTags { - if _, ok := tags[k]; !ok { - tags[k] = v - } + tags[k] = v } + // Apply plugin-wide tags if set + for k, v := range ac.inputConfig.Tags { + tags[k] = v + } + ac.inputConfig.Filter.FilterTags(tags) result := make(map[string]interface{}) for k, v := range fields { diff --git a/agent/accumulator_test.go b/agent/accumulator_test.go index 05f9b02aa..ee8f65e48 100644 --- a/agent/accumulator_test.go +++ b/agent/accumulator_test.go @@ -300,3 +300,35 @@ func TestAddBools(t *testing.T) { fmt.Sprintf("acctest,acc=test,default=tag value=false %d", now.UnixNano()), actual) } + +// 
Test that tag filters get applied to metrics. +func TestAccFilterTags(t *testing.T) { + a := accumulator{} + now := time.Now() + a.metrics = make(chan telegraf.Metric, 10) + defer close(a.metrics) + filter := internal_models.Filter{ + TagExclude: []string{"acc"}, + } + assert.NoError(t, filter.CompileFilter()) + a.inputConfig = &internal_models.InputConfig{} + a.inputConfig.Filter = filter + + a.Add("acctest", float64(101), map[string]string{}) + a.Add("acctest", float64(101), map[string]string{"acc": "test"}) + a.Add("acctest", float64(101), map[string]string{"acc": "test"}, now) + + testm := <-a.metrics + actual := testm.String() + assert.Contains(t, actual, "acctest value=101") + + testm = <-a.metrics + actual = testm.String() + assert.Contains(t, actual, "acctest value=101") + + testm = <-a.metrics + actual = testm.String() + assert.Equal(t, + fmt.Sprintf("acctest value=101 %d", now.UnixNano()), + actual) +} diff --git a/agent/agent.go b/agent/agent.go index fdd17a267..60f2d63c6 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -221,6 +221,7 @@ func (a *Agent) Test() error { for _, input := range a.Config.Inputs { acc := NewAccumulator(input.Config, metricC) acc.SetDebug(true) + acc.setDefaultTags(a.Config.Tags) fmt.Printf("* Plugin: %s, Collection 1\n", input.Name) if input.Config.Interval != 0 { diff --git a/circle.yml b/circle.yml index e7b711f9d..7a269f29f 100644 --- a/circle.yml +++ b/circle.yml @@ -4,9 +4,9 @@ machine: post: - sudo service zookeeper stop - go version - - go version | grep 1.6 || sudo rm -rf /usr/local/go - - wget https://storage.googleapis.com/golang/go1.6.linux-amd64.tar.gz - - sudo tar -C /usr/local -xzf go1.6.linux-amd64.tar.gz + - go version | grep 1.6.2 || sudo rm -rf /usr/local/go + - wget https://storage.googleapis.com/golang/go1.6.2.linux-amd64.tar.gz + - sudo tar -C /usr/local -xzf go1.6.2.linux-amd64.tar.gz - go version dependencies: diff --git a/cmd/telegraf/telegraf.go b/cmd/telegraf/telegraf.go index be591829b..ad0174788 100644 --- a/cmd/telegraf/telegraf.go +++ b/cmd/telegraf/telegraf.go @@ -71,6 +71,13 @@ The flags are: -quiet run in quiet mode -version print the version to stdout +In addition to the -config flag, telegraf will also load the config file from +an environment variable or default location. Precedence is: + 1. -config flag + 2. $TELEGRAF_CONFIG_PATH environment variable + 3. $HOME/.telegraf/telegraf.conf + 4. /etc/telegraf/telegraf.conf + Examples: # generate a telegraf config file: @@ -98,12 +105,10 @@ func main() { flag.Parse() args := flag.Args() - if flag.NFlag() == 0 && len(args) == 0 { - usageExit(0) - } - var inputFilters []string if *fInputFiltersLegacy != "" { + fmt.Printf("WARNING '--filter' flag is deprecated, please use" + + " '--input-filter'") inputFilter := strings.TrimSpace(*fInputFiltersLegacy) inputFilters = strings.Split(":"+inputFilter+":", ":") } @@ -114,6 +119,8 @@ func main() { var outputFilters []string if *fOutputFiltersLegacy != "" { + fmt.Printf("WARNING '--outputfilter' flag is deprecated, please use" + + " '--output-filter'") outputFilter := strings.TrimSpace(*fOutputFiltersLegacy) outputFilters = strings.Split(":"+outputFilter+":", ":") } @@ -170,25 +177,19 @@ func main() { return } - var ( - c *config.Config - err error - ) - - if *fConfig != "" { - c = config.NewConfig() - c.OutputFilters = outputFilters - c.InputFilters = inputFilters - err = c.LoadConfig(*fConfig) - if err != nil { - log.Fatal(err) - } - } else { - fmt.Println("You must specify a config file. 
See telegraf --help") + // If no other options are specified, load the config file and run. + c := config.NewConfig() + c.OutputFilters = outputFilters + c.InputFilters = inputFilters + err := c.LoadConfig(*fConfig) + if err != nil { + fmt.Println(err) os.Exit(1) } if *fConfigDirectoryLegacy != "" { + fmt.Printf("WARNING '--configdirectory' flag is deprecated, please use" + + " '--config-directory'") err = c.LoadDirectory(*fConfigDirectoryLegacy) if err != nil { log.Fatal(err) diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 0afaa120f..a01178919 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -3,11 +3,20 @@ ## Generating a Configuration File A default Telegraf config file can be generated using the -sample-config flag: -`telegraf -sample-config > telegraf.conf` + +``` +telegraf -sample-config > telegraf.conf +``` To generate a file with specific inputs and outputs, you can use the -input-filter and -output-filter flags: -`telegraf -sample-config -input-filter cpu:mem:net:swap -output-filter influxdb:kafka` + +``` +telegraf -sample-config -input-filter cpu:mem:net:swap -output-filter influxdb:kafka +``` + +You can see the latest config file with all available plugins here: +[telegraf.conf](https://github.com/influxdata/telegraf/blob/master/etc/telegraf.conf) ## Environment Variables @@ -17,8 +26,8 @@ for numbers and booleans they should be plain (ie, $INT_VAR, $BOOL_VAR) ## `[global_tags]` Configuration -Global tags can be specific in the `[global_tags]` section of the config file in -key="value" format. All metrics being gathered on this host will be tagged +Global tags can be specified in the `[global_tags]` section of the config file +in key="value" format. All metrics being gathered on this host will be tagged with the tags specified here. ## `[agent]` Configuration @@ -29,8 +38,12 @@ config. * **interval**: Default data collection interval for all inputs * **round_interval**: Rounds collection interval to 'interval' ie, if interval="10s" then always collect on :00, :10, :20, etc. +* **metric_batch_size**: Telegraf will send metrics to output in batch of at +most metric_batch_size metrics. * **metric_buffer_limit**: Telegraf will cache metric_buffer_limit metrics for each output, and will flush this buffer on a successful write. +This should be a multiple of metric_batch_size and could not be less +than 2 times metric_batch_size. * **collection_jitter**: Collection jitter is used to jitter the collection by a random amount. Each plugin will sleep for a random time within jitter before collecting. @@ -47,9 +60,35 @@ ie, a jitter of 5s and flush_interval 10s means flushes will happen every 10-15s * **quiet**: Run telegraf in quiet mode. * **hostname**: Override default hostname, if empty use os.Hostname(). -## `[inputs.xxx]` Configuration +#### Measurement Filtering -There are some configuration options that are configurable per input: +Filters can be configured per input or output, see below for examples. + +* **namepass**: An array of strings that is used to filter metrics generated by the +current input. Each string in the array is tested as a glob match against +measurement names and if it matches, the field is emitted. +* **namedrop**: The inverse of pass, if a measurement name matches, it is not emitted. +* **fieldpass**: An array of strings that is used to filter metrics generated by the +current input. Each string in the array is tested as a glob match against field names +and if it matches, the field is emitted. 
fieldpass is not available for outputs. +* **fielddrop**: The inverse of pass, if a field name matches, it is not emitted. +fielddrop is not available for outputs. +* **tagpass**: tag names and arrays of strings that are used to filter +measurements by the current input. Each string in the array is tested as a glob +match against the tag name, and if it matches the measurement is emitted. +* **tagdrop**: The inverse of tagpass. If a tag matches, the measurement is not +emitted. This is tested on measurements that have passed the tagpass test. +* **tagexclude**: tagexclude can be used to exclude a tag from measurement(s). +As opposed to tagdrop, which will drop an entire measurement based on it's +tags, tagexclude simply strips the given tag keys from the measurement. This +can be used on inputs & outputs, but it is _recommended_ to be used on inputs, +as it is more efficient to filter out tags at the ingestion point. +* **taginclude**: taginclude is the inverse of tagexclude. It will only include +the tag keys in the final measurement. + +## Input Configuration + +Some configuration options are configurable per input: * **name_override**: Override the base name of the measurement. (Default is the name of the input). @@ -60,24 +99,6 @@ There are some configuration options that are configurable per input: global interval, but if one particular input should be run less or more often, you can configure that here. -#### Input Filters - -There are also filters that can be configured per input: - -* **namepass**: An array of strings that is used to filter metrics generated by the -current input. Each string in the array is tested as a glob match against -measurement names and if it matches, the field is emitted. -* **namedrop**: The inverse of pass, if a measurement name matches, it is not emitted. -* **fieldpass**: An array of strings that is used to filter metrics generated by the -current input. Each string in the array is tested as a glob match against field names -and if it matches, the field is emitted. -* **fielddrop**: The inverse of pass, if a field name matches, it is not emitted. -* **tagpass**: tag names and arrays of strings that are used to filter -measurements by the current input. Each string in the array is tested as a glob -match against the tag name, and if it matches the measurement is emitted. -* **tagdrop**: The inverse of tagpass. If a tag matches, the measurement is not -emitted. This is tested on measurements that have passed the tagpass test. - #### Input Configuration Examples This is a full working config that will output CPU data to an InfluxDB instance @@ -155,6 +176,20 @@ fields which begin with `time_`. namepass = ["rest_client_*"] ``` +#### Input Config: taginclude and tagexclude + +```toml +# Only include the "cpu" tag in the measurements for the cpu plugin. +[[inputs.cpu]] + percpu = true + totalcpu = true + taginclude = ["cpu"] + +# Exclude the "fstype" tag from the measurements for the disk plugin. +[[inputs.disk]] + tagexclude = ["fstype"] +``` + #### Input config: prefix, suffix, and override This plugin will emit measurements with the name `cpu_total` @@ -180,6 +215,9 @@ This will emit measurements with the name `foobar` This plugin will emit measurements with two additional tags: `tag1=foo` and `tag2=bar` +NOTE: Order matters, the `[inputs.cpu.tags]` table must be at the _end_ of the +plugin definition. 
+ ```toml [[inputs.cpu]] percpu = false @@ -208,15 +246,12 @@ to avoid measurement collisions: fielddrop = ["cpu_time*"] ``` -## `[outputs.xxx]` Configuration +## Output Configuration Telegraf also supports specifying multiple output sinks to send data to, configuring each output sink is different, but examples can be found by running `telegraf -sample-config`. -Outputs also support the same configurable options as inputs -(namepass, namedrop, tagpass, tagdrop) - ```toml [[outputs.influxdb]] urls = [ "http://localhost:8086" ] diff --git a/docs/DATA_FORMATS_INPUT.md b/docs/DATA_FORMATS_INPUT.md index 6a916711b..07134e979 100644 --- a/docs/DATA_FORMATS_INPUT.md +++ b/docs/DATA_FORMATS_INPUT.md @@ -75,14 +75,19 @@ metrics are parsed directly into Telegraf metrics. # JSON: -The JSON data format flattens JSON into metric _fields_. For example, this JSON: +The JSON data format flattens JSON into metric _fields_. +NOTE: Only numerical values are converted to fields, and they are converted +into a float. strings are ignored unless specified as a tag_key (see below). + +So for example, this JSON: ```json { "a": 5, "b": { "c": 6 - } + }, + "ignored": "I'm a string" } ``` diff --git a/docs/LICENSE_OF_DEPENDENCIES.md b/docs/LICENSE_OF_DEPENDENCIES.md index c8f3b0926..d448872f6 100644 --- a/docs/LICENSE_OF_DEPENDENCIES.md +++ b/docs/LICENSE_OF_DEPENDENCIES.md @@ -28,6 +28,5 @@ - github.com/wvanbergen/kazoo-go [MIT LICENSE](https://github.com/wvanbergen/kazoo-go/blob/master/MIT-LICENSE) - gopkg.in/dancannon/gorethink.v1 [APACHE LICENSE](https://github.com/dancannon/gorethink/blob/v1.1.2/LICENSE) - gopkg.in/mgo.v2 [BSD LICENSE](https://github.com/go-mgo/mgo/blob/v2/LICENSE) -- golang.org/x/crypto/* [BSD LICENSE](https://github.com/golang/crypto/blob/master/LICENSE) -- internal Glob function [MIT LICENSE](https://github.com/ryanuber/go-glob/blob/master/LICENSE) +- golang.org/x/crypto/ [BSD LICENSE](https://github.com/golang/crypto/blob/master/LICENSE) diff --git a/etc/telegraf.conf b/etc/telegraf.conf index f2bdace78..f57bd1410 100644 --- a/etc/telegraf.conf +++ b/etc/telegraf.conf @@ -30,11 +30,13 @@ ## ie, if interval="10s" then always collect on :00, :10, :20, etc. round_interval = true - ## Telegraf will cache metric_buffer_limit metrics for each output, and will - ## flush this buffer on a successful write. - metric_buffer_limit = 1000 - ## Flush the buffer whenever full, regardless of flush_interval. - flush_buffer_when_full = true + ## Telegraf will send metrics to outputs in batches of at + ## most metric_batch_size metrics. + metric_batch_size = 1000 + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + metric_buffer_limit = 10000 ## Collection jitter is used to jitter the collection by a random amount. ## Each plugin will sleep for a random time within jitter before collecting. @@ -147,6 +149,15 @@ # ## Amazon REGION # region = 'us-east-1' # +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) explicit credentials from 'access_key' and 'secret_key' +# ## 2) environment variables +# ## 3) shared credentials file +# ## 4) EC2 Instance Profile +# #access_key = "" +# #secret_key = "" +# # ## Namespace for the CloudWatch MetricDatums # namespace = 'InfluxData/Telegraf' @@ -239,6 +250,16 @@ # [[outputs.kinesis]] # ## Amazon REGION of kinesis endpoint. 
# region = "ap-southeast-2" +# +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) explicit credentials from 'access_key' and 'secret_key' +# ## 2) environment variables +# ## 3) shared credentials file +# ## 4) EC2 Instance Profile +# #access_key = "" +# #secret_key = "" +# # ## Kinesis StreamName must exist prior to starting telegraf. # streamname = "StreamName" # ## PartitionKey as used for sharding data. @@ -438,7 +459,7 @@ # servers = ["myuser:mypassword@10.10.10.1:8778","10.10.10.2:8778",":8778"] # ## List of metrics collected on above servers # ## Each metric consists of a jmx path. -# ## This will collect all heap memory usage metrics from the jvm and +# ## This will collect all heap memory usage metrics from the jvm and # ## ReadLatency metrics for all keyspaces and tables. # ## "type=Table" in the query works with Cassandra3.0. Older versions might # ## need to use "type=ColumnFamily" @@ -453,13 +474,22 @@ # ## Amazon Region # region = 'us-east-1' # +# ## Amazon Credentials +# ## Credentials are loaded in the following order +# ## 1) explicit credentials from 'access_key' and 'secret_key' +# ## 2) environment variables +# ## 3) shared credentials file +# ## 4) EC2 Instance Profile +# #access_key = "" +# #secret_key = "" +# # ## Requested CloudWatch aggregation Period (required - must be a multiple of 60s) # period = '1m' # # ## Collection Delay (required - must account for metrics availability via CloudWatch API) # delay = '1m' # -# ## Recomended: use metric 'interval' that is a multiple of 'period' to avoid +# ## Recomended: use metric 'interval' that is a multiple of 'period' to avoid # ## gaps or overlap in pulled data # interval = '1m' # @@ -471,7 +501,7 @@ # ## Refreshes Namespace available metrics every 1h # #[[inputs.cloudwatch.metrics]] # # names = ['Latency', 'RequestCount'] -# # +# # # # ## Dimension filters for Metric (optional) # # [[inputs.cloudwatch.metrics.dimensions]] # # name = 'LoadBalancerName' @@ -535,6 +565,8 @@ # endpoint = "unix:///var/run/docker.sock" # ## Only collect metrics for these containers, collect all if empty # container_names = [] +# ## Timeout for docker list, info, and stats commands +# timeout = "5s" # # Read statistics from one or many dovecot servers @@ -570,6 +602,9 @@ # ## Commands array # commands = ["/tmp/test.sh", "/usr/bin/mycollector --foo=bar"] # +# ## Timeout for each command to complete. +# timeout = "5s" +# # ## measurement name suffix (for separating different commands) # name_suffix = "_mycollector" # @@ -580,6 +615,22 @@ # data_format = "influx" +# # Read stats about given file(s) +# [[inputs.filestat]] +# ## Files to gather stats about. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/log/**.log"] +# ## If true, read the entire file and calculate an md5 checksum. +# md5 = false + + # # Read metrics of haproxy, via socket or csv stats page # [[inputs.haproxy]] # ## An array of address to gather stats about. 
Specify an ip on hostname @@ -676,13 +727,24 @@ # # Read JMX metrics through Jolokia # [[inputs.jolokia]] # ## This is the context root used to compose the jolokia url -# context = "/jolokia/read" +# context = "/jolokia" +# +# ## This specifies the mode used +# # mode = "proxy" +# # +# ## When in proxy mode this section is used to specify further +# ## proxy address configurations. +# ## Remember to change host address to fit your environment. +# # [inputs.jolokia.proxy] +# # host = "127.0.0.1" +# # port = "8080" +# # # ## List of servers exposing jolokia read service # [[inputs.jolokia.servers]] -# name = "stable" -# host = "192.168.103.2" -# port = "8180" +# name = "as-server-01" +# host = "127.0.0.1" +# port = "8080" # # username = "myuser" # # password = "mypassword" # @@ -692,17 +754,20 @@ # ## This collect all heap memory usage metrics. # [[inputs.jolokia.metrics]] # name = "heap_memory_usage" -# jmx = "/java.lang:type=Memory/HeapMemoryUsage" -# +# mbean = "java.lang:type=Memory" +# attribute = "HeapMemoryUsage" +# # ## This collect thread counts metrics. # [[inputs.jolokia.metrics]] # name = "thread_count" -# jmx = "/java.lang:type=Threading/TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" -# +# mbean = "java.lang:type=Threading" +# attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" +# # ## This collect number of class loaded/unloaded counts metrics. # [[inputs.jolokia.metrics]] # name = "class_count" -# jmx = "/java.lang:type=ClassLoading/LoadedClassCount,UnloadedClassCount,TotalLoadedClassCount" +# mbean = "java.lang:type=ClassLoading" +# attribute = "LoadedClassCount,UnloadedClassCount,TotalLoadedClassCount" # # Read metrics from a LeoFS Server via SNMP @@ -719,9 +784,13 @@ # ## # # ost_procfiles = [ # # "/proc/fs/lustre/obdfilter/*/stats", -# # "/proc/fs/lustre/osd-ldiskfs/*/stats" +# # "/proc/fs/lustre/osd-ldiskfs/*/stats", +# # "/proc/fs/lustre/obdfilter/*/job_stats", +# # ] +# # mds_procfiles = [ +# # "/proc/fs/lustre/mdt/*/md_stats", +# # "/proc/fs/lustre/mdt/*/job_stats", # # ] -# # mds_procfiles = ["/proc/fs/lustre/mdt/*/md_stats"] # # Gathers metrics from the /3.0/reports MailChimp API @@ -781,9 +850,46 @@ # ## e.g. # ## root:passwd@tcp(127.0.0.1:3306)/?tls=false # ## root@tcp(127.0.0.1:3306)/?tls=false -# ## +# # # ## If no servers are specified, then localhost is used as the host. 
# servers = ["tcp(127.0.0.1:3306)/"] +# ## the limits for metrics form perf_events_statements +# perf_events_statements_digest_text_limit = 120 +# perf_events_statements_limit = 250 +# perf_events_statements_time_limit = 86400 +# # +# ## if the list is empty, then metrics are gathered from all databasee tables +# table_schema_databases = [] +# # +# ## gather metrics from INFORMATION_SCHEMA.TABLES for databases provided above list +# gather_table_schema = false +# # +# ## gather thread state counts from INFORMATION_SCHEMA.PROCESSLIST +# gather_process_list = true +# # +# ## gather auto_increment columns and max values from information schema +# gather_info_schema_auto_inc = true +# # +# ## gather metrics from SHOW SLAVE STATUS command output +# gather_slave_status = true +# # +# ## gather metrics from SHOW BINARY LOGS command output +# gather_binary_logs = false +# # +# ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMART_BY_TABLE +# gather_table_io_waits = false +# # +# ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMART_BY_INDEX_USAGE +# gather_index_io_waits = false +# # +# ## gather metrics from PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_EVENT_NAME +# gather_file_events_stats = false +# # +# ## gather metrics from PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_DIGEST +# gather_perf_events_statements = false +# # +# ## Some queries we may want to run less often (such as SHOW GLOBAL VARIABLES) +# interval_slow = "30m" # # Read metrics about network interface usage @@ -875,15 +981,15 @@ # [[inputs.ping]] # ## NOTE: this plugin forks the ping command. You may need to set capabilities # ## via setcap cap_net_raw+p /bin/ping -# +# # # ## urls to ping # urls = ["www.google.com"] # required -# ## number of pings to send (ping -c ) +# ## number of pings to send per collection (ping -c ) # count = 1 # required # ## interval, in s, at which to ping. 0 == default (ping -i ) # ping_interval = 0.0 -# ## ping timeout, in s. 0 == no timeout (ping -t ) -# timeout = 0.0 +# ## ping timeout, in s. 0 == no timeout (ping -W ) +# timeout = 1.0 # ## interface to send ping from (ping -I ) # interface = "" @@ -929,6 +1035,11 @@ # ## databases are gathered. # ## databases = ["app_production", "testing"] # # +# # outputaddress = "db01" +# ## A custom name for the database that will be used as the "server" tag in the +# ## measurement output. If not specified, a default one generated from +# ## the connection address is used. +# # # ## Define the toml config where the sql queries are stored # ## New queries can be added, if the withdbname is set to true and there is no # ## databases defined in the 'databases field', the sql query is ended by a @@ -939,24 +1050,28 @@ # ## because the databases variable was set to ['postgres', 'pgbench' ] and the # ## withdbname was true. Be careful that if the withdbname is set to false you # ## don't have to define the where clause (aka with the dbname) the tagvalue -# ## field is used to define custom tags (separated by comas) +# ## field is used to define custom tags (separated by commas) +# ## The optional "measurement" value can be used to override the default +# ## output measurement name ("postgresql"). 
# # # ## Structure : # ## [[inputs.postgresql_extensible.query]] # ## sqlquery string # ## version string # ## withdbname boolean -# ## tagvalue string (coma separated) +# ## tagvalue string (comma separated) +# ## measurement string # [[inputs.postgresql_extensible.query]] # sqlquery="SELECT * FROM pg_stat_database" # version=901 # withdbname=false # tagvalue="" +# measurement="" # [[inputs.postgresql_extensible.query]] # sqlquery="SELECT * FROM pg_stat_bgwriter" # version=901 # withdbname=false -# tagvalue="" +# tagvalue="postgresql.stats" # # Read metrics from one or many PowerDNS servers @@ -1328,6 +1443,28 @@ # percentile_limit = 1000 +# # Stream a log file, like the tail -f command +# [[inputs.tail]] +# ## files to tail. +# ## These accept standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## "/var/log/**.log" -> recursively find all .log files in /var/log +# ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log +# ## "/var/log/apache.log" -> just tail the apache log file +# ## +# ## See https://github.com/gobwas/glob for more examples +# ## +# files = ["/var/mymetrics.out"] +# ## Read file from beginning. +# from_beginning = false +# +# ## Data format to consume. +# ## Each data format has it's own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md +# data_format = "influx" + + # # Generic TCP listener # [[inputs.tcp_listener]] # ## Address and port to host TCP listener on diff --git a/internal/buffer/buffer.go b/internal/buffer/buffer.go new file mode 100644 index 000000000..b7a05bf03 --- /dev/null +++ b/internal/buffer/buffer.go @@ -0,0 +1,77 @@ +package buffer + +import ( + "github.com/influxdata/telegraf" +) + +// Buffer is an object for storing metrics in a circular buffer. +type Buffer struct { + buf chan telegraf.Metric + // total dropped metrics + drops int + // total metrics added + total int +} + +// NewBuffer returns a Buffer +// size is the maximum number of metrics that Buffer will cache. If Add is +// called when the buffer is full, then the oldest metric(s) will be dropped. +func NewBuffer(size int) *Buffer { + return &Buffer{ + buf: make(chan telegraf.Metric, size), + } +} + +// IsEmpty returns true if Buffer is empty. +func (b *Buffer) IsEmpty() bool { + return len(b.buf) == 0 +} + +// Len returns the current length of the buffer. +func (b *Buffer) Len() int { + return len(b.buf) +} + +// Drops returns the total number of dropped metrics that have occured in this +// buffer since instantiation. +func (b *Buffer) Drops() int { + return b.drops +} + +// Total returns the total number of metrics that have been added to this buffer. +func (b *Buffer) Total() int { + return b.total +} + +// Add adds metrics to the buffer. +func (b *Buffer) Add(metrics ...telegraf.Metric) { + for i, _ := range metrics { + b.total++ + select { + case b.buf <- metrics[i]: + default: + b.drops++ + <-b.buf + b.buf <- metrics[i] + } + } +} + +// Batch returns a batch of metrics of size batchSize. +// the batch will be of maximum length batchSize. It can be less than batchSize, +// if the length of Buffer is less than batchSize. 
+func (b *Buffer) Batch(batchSize int) []telegraf.Metric { + n := min(len(b.buf), batchSize) + out := make([]telegraf.Metric, n) + for i := 0; i < n; i++ { + out[i] = <-b.buf + } + return out +} + +func min(a, b int) int { + if b < a { + return b + } + return a +} diff --git a/internal/buffer/buffer_test.go b/internal/buffer/buffer_test.go new file mode 100644 index 000000000..9a36f4d84 --- /dev/null +++ b/internal/buffer/buffer_test.go @@ -0,0 +1,94 @@ +package buffer + +import ( + "testing" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/testutil" + + "github.com/stretchr/testify/assert" +) + +var metricList = []telegraf.Metric{ + testutil.TestMetric(2, "mymetric1"), + testutil.TestMetric(1, "mymetric2"), + testutil.TestMetric(11, "mymetric3"), + testutil.TestMetric(15, "mymetric4"), + testutil.TestMetric(8, "mymetric5"), +} + +func BenchmarkAddMetrics(b *testing.B) { + buf := NewBuffer(10000) + m := testutil.TestMetric(1, "mymetric") + for n := 0; n < b.N; n++ { + buf.Add(m) + } +} + +func TestNewBufferBasicFuncs(t *testing.T) { + b := NewBuffer(10) + + assert.True(t, b.IsEmpty()) + assert.Zero(t, b.Len()) + assert.Zero(t, b.Drops()) + assert.Zero(t, b.Total()) + + m := testutil.TestMetric(1, "mymetric") + b.Add(m) + assert.False(t, b.IsEmpty()) + assert.Equal(t, b.Len(), 1) + assert.Equal(t, b.Drops(), 0) + assert.Equal(t, b.Total(), 1) + + b.Add(metricList...) + assert.False(t, b.IsEmpty()) + assert.Equal(t, b.Len(), 6) + assert.Equal(t, b.Drops(), 0) + assert.Equal(t, b.Total(), 6) +} + +func TestDroppingMetrics(t *testing.T) { + b := NewBuffer(10) + + // Add up to the size of the buffer + b.Add(metricList...) + b.Add(metricList...) + assert.False(t, b.IsEmpty()) + assert.Equal(t, b.Len(), 10) + assert.Equal(t, b.Drops(), 0) + assert.Equal(t, b.Total(), 10) + + // Add 5 more and verify they were dropped + b.Add(metricList...) + assert.False(t, b.IsEmpty()) + assert.Equal(t, b.Len(), 10) + assert.Equal(t, b.Drops(), 5) + assert.Equal(t, b.Total(), 15) +} + +func TestGettingBatches(t *testing.T) { + b := NewBuffer(20) + + // Verify that the buffer returned is smaller than requested when there are + // not as many items as requested. + b.Add(metricList...) + batch := b.Batch(10) + assert.Len(t, batch, 5) + + // Verify that the buffer is now empty + assert.True(t, b.IsEmpty()) + assert.Zero(t, b.Len()) + assert.Zero(t, b.Drops()) + assert.Equal(t, b.Total(), 5) + + // Verify that the buffer returned is not more than the size requested + b.Add(metricList...) + batch = b.Batch(3) + assert.Len(t, batch, 3) + + // Verify that buffer is not empty + assert.False(t, b.IsEmpty()) + assert.Equal(t, b.Len(), 2) + assert.Equal(t, b.Drops(), 0) + assert.Equal(t, b.Total(), 10) +} diff --git a/internal/config/config.go b/internal/config/config.go index cfd6c9593..daaaa10fc 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -93,9 +93,15 @@ type AgentConfig struct { // ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s FlushJitter internal.Duration + // MetricBatchSize is the maximum number of metrics that is wrote to an + // output plugin in one call. + MetricBatchSize int + // MetricBufferLimit is the max number of metrics that each output plugin // will cache. The buffer is cleared when a successful write occurs. When - // full, the oldest metrics will be overwritten. + // full, the oldest metrics will be overwritten. This number should be a + // multiple of MetricBatchSize. 
Due to current implementation, this could + // not be less than 2 times MetricBatchSize. MetricBufferLimit int // FlushBufferWhenFull tells Telegraf to flush the metric buffer whenever @@ -182,11 +188,13 @@ var header = `# Telegraf Configuration ## ie, if interval="10s" then always collect on :00, :10, :20, etc. round_interval = true - ## Telegraf will cache metric_buffer_limit metrics for each output, and will - ## flush this buffer on a successful write. - metric_buffer_limit = 1000 - ## Flush the buffer whenever full, regardless of flush_interval. - flush_buffer_when_full = true + ## Telegraf will send metrics to outputs in batches of at + ## most metric_batch_size metrics. + metric_batch_size = 1000 + ## For failed writes, telegraf will cache metric_buffer_limit metrics for each + ## output, and will flush this buffer on a successful write. Oldest metrics + ## are dropped first when this buffer fills. + metric_buffer_limit = 10000 ## Collection jitter is used to jitter the collection by a random amount. ## Each plugin will sleep for a random time within jitter before collecting. @@ -404,13 +412,67 @@ func (c *Config) LoadDirectory(path string) error { return nil } +// Try to find a default config file at these locations (in order): +// 1. $TELEGRAF_CONFIG_PATH +// 2. $HOME/.telegraf/telegraf.conf +// 3. /etc/telegraf/telegraf.conf +// +func getDefaultConfigPath() (string, error) { + envfile := os.Getenv("TELEGRAF_CONFIG_PATH") + homefile := os.ExpandEnv("${HOME}/.telegraf/telegraf.conf") + etcfile := "/etc/telegraf/telegraf.conf" + for _, path := range []string{envfile, homefile, etcfile} { + if _, err := os.Stat(path); err == nil { + log.Printf("Using config file: %s", path) + return path, nil + } + } + + // if we got here, we didn't find a file in a default location + return "", fmt.Errorf("No config file specified, and could not find one"+ + " in $TELEGRAF_CONFIG_PATH, %s, or %s", homefile, etcfile) +} + // LoadConfig loads the given config file and applies it to c func (c *Config) LoadConfig(path string) error { + var err error + if path == "" { + if path, err = getDefaultConfigPath(); err != nil { + return err + } + } tbl, err := parseFile(path) if err != nil { return fmt.Errorf("Error parsing %s, %s", path, err) } + // Parse tags tables first: + for _, tableName := range []string{"tags", "global_tags"} { + if val, ok := tbl.Fields[tableName]; ok { + subTable, ok := val.(*ast.Table) + if !ok { + return fmt.Errorf("%s: invalid configuration", path) + } + if err = config.UnmarshalTable(subTable, c.Tags); err != nil { + log.Printf("Could not parse [global_tags] config\n") + return fmt.Errorf("Error parsing %s, %s", path, err) + } + } + } + + // Parse agent table: + if val, ok := tbl.Fields["agent"]; ok { + subTable, ok := val.(*ast.Table) + if !ok { + return fmt.Errorf("%s: invalid configuration", path) + } + if err = config.UnmarshalTable(subTable, c.Agent); err != nil { + log.Printf("Could not parse [agent] config\n") + return fmt.Errorf("Error parsing %s, %s", path, err) + } + } + + // Parse all the rest of the plugins: for name, val := range tbl.Fields { subTable, ok := val.(*ast.Table) if !ok { @@ -418,16 +480,7 @@ func (c *Config) LoadConfig(path string) error { } switch name { - case "agent": - if err = config.UnmarshalTable(subTable, c.Agent); err != nil { - log.Printf("Could not parse [agent] config\n") - return fmt.Errorf("Error parsing %s, %s", path, err) - } - case "global_tags", "tags": - if err = config.UnmarshalTable(subTable, c.Tags); err != nil { - log.Printf("Could 
not parse [global_tags] config\n") - return fmt.Errorf("Error parsing %s, %s", path, err) - } + case "agent", "global_tags", "tags": case "outputs": for pluginName, pluginVal := range subTable.Fields { switch pluginSubTable := pluginVal.(type) { @@ -525,11 +578,8 @@ func (c *Config) addOutput(name string, table *ast.Table) error { return err } - ro := internal_models.NewRunningOutput(name, output, outputConfig) - if c.Agent.MetricBufferLimit > 0 { - ro.MetricBufferLimit = c.Agent.MetricBufferLimit - } - ro.FlushBufferWhenFull = c.Agent.FlushBufferWhenFull + ro := internal_models.NewRunningOutput(name, output, outputConfig, + c.Agent.MetricBatchSize, c.Agent.MetricBufferLimit) c.Outputs = append(c.Outputs, ro) return nil } @@ -580,9 +630,9 @@ func (c *Config) addInput(name string, table *ast.Table) error { // buildFilter builds a Filter // (tagpass/tagdrop/namepass/namedrop/fieldpass/fielddrop) to -// be inserted into the internal_models.OutputConfig/internal_models.InputConfig to be used for prefix -// filtering on tags and measurements -func buildFilter(tbl *ast.Table) internal_models.Filter { +// be inserted into the internal_models.OutputConfig/internal_models.InputConfig +// to be used for glob filtering on tags and measurements +func buildFilter(tbl *ast.Table) (internal_models.Filter, error) { f := internal_models.Filter{} if node, ok := tbl.Fields["namepass"]; ok { @@ -681,6 +731,33 @@ func buildFilter(tbl *ast.Table) internal_models.Filter { } } + if node, ok := tbl.Fields["tagexclude"]; ok { + if kv, ok := node.(*ast.KeyValue); ok { + if ary, ok := kv.Value.(*ast.Array); ok { + for _, elem := range ary.Value { + if str, ok := elem.(*ast.String); ok { + f.TagExclude = append(f.TagExclude, str.Value) + } + } + } + } + } + + if node, ok := tbl.Fields["taginclude"]; ok { + if kv, ok := node.(*ast.KeyValue); ok { + if ary, ok := kv.Value.(*ast.Array); ok { + for _, elem := range ary.Value { + if str, ok := elem.(*ast.String); ok { + f.TagInclude = append(f.TagInclude, str.Value) + } + } + } + } + } + if err := f.CompileFilter(); err != nil { + return f, err + } + delete(tbl.Fields, "namedrop") delete(tbl.Fields, "namepass") delete(tbl.Fields, "fielddrop") @@ -689,7 +766,9 @@ func buildFilter(tbl *ast.Table) internal_models.Filter { delete(tbl.Fields, "pass") delete(tbl.Fields, "tagdrop") delete(tbl.Fields, "tagpass") - return f + delete(tbl.Fields, "tagexclude") + delete(tbl.Fields, "taginclude") + return f, nil } // buildInput parses input specific items from the ast.Table, @@ -748,7 +827,11 @@ func buildInput(name string, tbl *ast.Table) (*internal_models.InputConfig, erro delete(tbl.Fields, "name_override") delete(tbl.Fields, "interval") delete(tbl.Fields, "tags") - cp.Filter = buildFilter(tbl) + var err error + cp.Filter, err = buildFilter(tbl) + if err != nil { + return cp, err + } return cp, nil } @@ -864,13 +947,18 @@ func buildSerializer(name string, tbl *ast.Table) (serializers.Serializer, error return serializers.NewSerializer(c) } -// buildOutput parses output specific items from the ast.Table, builds the filter and returns an +// buildOutput parses output specific items from the ast.Table, +// builds the filter and returns an // internal_models.OutputConfig to be inserted into internal_models.RunningInput // Note: error exists in the return for future calls that might require error func buildOutput(name string, tbl *ast.Table) (*internal_models.OutputConfig, error) { + filter, err := buildFilter(tbl) + if err != nil { + return nil, err + } oc := 
&internal_models.OutputConfig{ Name: name, - Filter: buildFilter(tbl), + Filter: filter, } // Outputs don't support FieldDrop/FieldPass, so set to NameDrop/NamePass if len(oc.Filter.FieldDrop) > 0 { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index d78a8d6b8..1659cd6ec 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -26,27 +26,29 @@ func TestConfig_LoadSingleInputWithEnvVars(t *testing.T) { memcached := inputs.Inputs["memcached"]().(*memcached.Memcached) memcached.Servers = []string{"192.168.1.1"} - mConfig := &internal_models.InputConfig{ - Name: "memcached", - Filter: internal_models.Filter{ - NameDrop: []string{"metricname2"}, - NamePass: []string{"metricname1"}, - FieldDrop: []string{"other", "stuff"}, - FieldPass: []string{"some", "strings"}, - TagDrop: []internal_models.TagFilter{ - internal_models.TagFilter{ - Name: "badtag", - Filter: []string{"othertag"}, - }, + filter := internal_models.Filter{ + NameDrop: []string{"metricname2"}, + NamePass: []string{"metricname1"}, + FieldDrop: []string{"other", "stuff"}, + FieldPass: []string{"some", "strings"}, + TagDrop: []internal_models.TagFilter{ + internal_models.TagFilter{ + Name: "badtag", + Filter: []string{"othertag"}, }, - TagPass: []internal_models.TagFilter{ - internal_models.TagFilter{ - Name: "goodtag", - Filter: []string{"mytag"}, - }, - }, - IsActive: true, }, + TagPass: []internal_models.TagFilter{ + internal_models.TagFilter{ + Name: "goodtag", + Filter: []string{"mytag"}, + }, + }, + IsActive: true, + } + assert.NoError(t, filter.CompileFilter()) + mConfig := &internal_models.InputConfig{ + Name: "memcached", + Filter: filter, Interval: 10 * time.Second, } mConfig.Tags = make(map[string]string) @@ -64,27 +66,29 @@ func TestConfig_LoadSingleInput(t *testing.T) { memcached := inputs.Inputs["memcached"]().(*memcached.Memcached) memcached.Servers = []string{"localhost"} - mConfig := &internal_models.InputConfig{ - Name: "memcached", - Filter: internal_models.Filter{ - NameDrop: []string{"metricname2"}, - NamePass: []string{"metricname1"}, - FieldDrop: []string{"other", "stuff"}, - FieldPass: []string{"some", "strings"}, - TagDrop: []internal_models.TagFilter{ - internal_models.TagFilter{ - Name: "badtag", - Filter: []string{"othertag"}, - }, + filter := internal_models.Filter{ + NameDrop: []string{"metricname2"}, + NamePass: []string{"metricname1"}, + FieldDrop: []string{"other", "stuff"}, + FieldPass: []string{"some", "strings"}, + TagDrop: []internal_models.TagFilter{ + internal_models.TagFilter{ + Name: "badtag", + Filter: []string{"othertag"}, }, - TagPass: []internal_models.TagFilter{ - internal_models.TagFilter{ - Name: "goodtag", - Filter: []string{"mytag"}, - }, - }, - IsActive: true, }, + TagPass: []internal_models.TagFilter{ + internal_models.TagFilter{ + Name: "goodtag", + Filter: []string{"mytag"}, + }, + }, + IsActive: true, + } + assert.NoError(t, filter.CompileFilter()) + mConfig := &internal_models.InputConfig{ + Name: "memcached", + Filter: filter, Interval: 5 * time.Second, } mConfig.Tags = make(map[string]string) @@ -109,27 +113,29 @@ func TestConfig_LoadDirectory(t *testing.T) { memcached := inputs.Inputs["memcached"]().(*memcached.Memcached) memcached.Servers = []string{"localhost"} - mConfig := &internal_models.InputConfig{ - Name: "memcached", - Filter: internal_models.Filter{ - NameDrop: []string{"metricname2"}, - NamePass: []string{"metricname1"}, - FieldDrop: []string{"other", "stuff"}, - FieldPass: []string{"some", "strings"}, - 
TagDrop: []internal_models.TagFilter{ - internal_models.TagFilter{ - Name: "badtag", - Filter: []string{"othertag"}, - }, + filter := internal_models.Filter{ + NameDrop: []string{"metricname2"}, + NamePass: []string{"metricname1"}, + FieldDrop: []string{"other", "stuff"}, + FieldPass: []string{"some", "strings"}, + TagDrop: []internal_models.TagFilter{ + internal_models.TagFilter{ + Name: "badtag", + Filter: []string{"othertag"}, }, - TagPass: []internal_models.TagFilter{ - internal_models.TagFilter{ - Name: "goodtag", - Filter: []string{"mytag"}, - }, - }, - IsActive: true, }, + TagPass: []internal_models.TagFilter{ + internal_models.TagFilter{ + Name: "goodtag", + Filter: []string{"mytag"}, + }, + }, + IsActive: true, + } + assert.NoError(t, filter.CompileFilter()) + mConfig := &internal_models.InputConfig{ + Name: "memcached", + Filter: filter, Interval: 5 * time.Second, } mConfig.Tags = make(map[string]string) diff --git a/internal/globpath/globpath.go b/internal/globpath/globpath.go new file mode 100644 index 000000000..6755e69b2 --- /dev/null +++ b/internal/globpath/globpath.go @@ -0,0 +1,98 @@ +package globpath + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/gobwas/glob" +) + +var sepStr = fmt.Sprintf("%v", string(os.PathSeparator)) + +type GlobPath struct { + path string + hasMeta bool + g glob.Glob + root string +} + +func Compile(path string) (*GlobPath, error) { + out := GlobPath{ + hasMeta: hasMeta(path), + path: path, + } + + // if there are no glob meta characters in the path, don't bother compiling + // a glob object or finding the root directory. (see short-circuit in Match) + if !out.hasMeta { + return &out, nil + } + + var err error + if out.g, err = glob.Compile(path, os.PathSeparator); err != nil { + return nil, err + } + // Get the root directory for this filepath + out.root = findRootDir(path) + return &out, nil +} + +func (g *GlobPath) Match() map[string]os.FileInfo { + if !g.hasMeta { + out := make(map[string]os.FileInfo) + info, err := os.Stat(g.path) + if !os.IsNotExist(err) { + out[g.path] = info + } + return out + } + return walkFilePath(g.root, g.g) +} + +// walk the filepath from the given root and return a list of files that match +// the given glob. +func walkFilePath(root string, g glob.Glob) map[string]os.FileInfo { + matchedFiles := make(map[string]os.FileInfo) + walkfn := func(path string, info os.FileInfo, _ error) error { + if g.Match(path) { + matchedFiles[path] = info + } + return nil + } + filepath.Walk(root, walkfn) + return matchedFiles +} + +// find the root dir of the given path (could include globs). +// ie: +// /var/log/telegraf.conf -> /var/log +// /home/** -> /home +// /home/*/** -> /home +// /lib/share/*/*/**.txt -> /lib/share +func findRootDir(path string) string { + pathItems := strings.Split(path, sepStr) + out := sepStr + for i, item := range pathItems { + if i == len(pathItems)-1 { + break + } + if item == "" { + continue + } + if hasMeta(item) { + break + } + out += item + sepStr + } + if out != "/" { + out = strings.TrimSuffix(out, "/") + } + return out +} + +// hasMeta reports whether path contains any magic glob characters. 
+func hasMeta(path string) bool { + return strings.IndexAny(path, "*?[") >= 0 +} diff --git a/internal/globpath/globpath_test.go b/internal/globpath/globpath_test.go new file mode 100644 index 000000000..db72c94f4 --- /dev/null +++ b/internal/globpath/globpath_test.go @@ -0,0 +1,62 @@ +package globpath + +import ( + "runtime" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCompileAndMatch(t *testing.T) { + dir := getTestdataDir() + // test super asterisk + g1, err := Compile(dir + "/**") + require.NoError(t, err) + // test single asterisk + g2, err := Compile(dir + "/*.log") + require.NoError(t, err) + // test no meta characters (file exists) + g3, err := Compile(dir + "/log1.log") + require.NoError(t, err) + // test file that doesn't exist + g4, err := Compile(dir + "/i_dont_exist.log") + require.NoError(t, err) + // test super asterisk that doesn't exist + g5, err := Compile(dir + "/dir_doesnt_exist/**") + require.NoError(t, err) + + matches := g1.Match() + assert.Len(t, matches, 3) + matches = g2.Match() + assert.Len(t, matches, 2) + matches = g3.Match() + assert.Len(t, matches, 1) + matches = g4.Match() + assert.Len(t, matches, 0) + matches = g5.Match() + assert.Len(t, matches, 0) +} + +func TestFindRootDir(t *testing.T) { + tests := []struct { + input string + output string + }{ + {"/var/log/telegraf.conf", "/var/log"}, + {"/home/**", "/home"}, + {"/home/*/**", "/home"}, + {"/lib/share/*/*/**.txt", "/lib/share"}, + } + + for _, test := range tests { + actual := findRootDir(test.input) + assert.Equal(t, test.output, actual) + } +} + +func getTestdataDir() string { + _, filename, _, _ := runtime.Caller(1) + return strings.Replace(filename, "globpath_test.go", "testdata", 1) +} diff --git a/internal/globpath/testdata/log1.log b/internal/globpath/testdata/log1.log new file mode 100644 index 000000000..e69de29bb diff --git a/internal/globpath/testdata/log2.log b/internal/globpath/testdata/log2.log new file mode 100644 index 000000000..e69de29bb diff --git a/internal/globpath/testdata/test.conf b/internal/globpath/testdata/test.conf new file mode 100644 index 000000000..a06111991 --- /dev/null +++ b/internal/globpath/testdata/test.conf @@ -0,0 +1,5 @@ +# this is a fake testing config file +# for testing the filestat plugin + +option1 = "foo" +option2 = "bar" diff --git a/internal/internal.go b/internal/internal.go index ff73aae84..ae1464925 100644 --- a/internal/internal.go +++ b/internal/internal.go @@ -2,13 +2,16 @@ package internal import ( "bufio" + "bytes" "crypto/rand" "crypto/tls" "crypto/x509" "errors" "fmt" "io/ioutil" + "log" "os" + "os/exec" "strings" "time" "unicode" @@ -16,6 +19,12 @@ import ( const alphanum string = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +var ( + TimeoutErr = errors.New("Command timed out.") + + NotImplementedError = errors.New("not implemented yet") +) + // Duration just wraps time.Duration type Duration struct { Duration time.Duration @@ -33,8 +42,6 @@ func (d *Duration) UnmarshalTOML(b []byte) error { return nil } -var NotImplementedError = errors.New("not implemented yet") - // ReadLines reads contents from a file and splits them by new lines. // A convenience wrapper to ReadLinesOffsetN(filename, 0, -1). func ReadLines(filename string) ([]string, error) { @@ -140,58 +147,47 @@ func SnakeCase(in string) string { return string(out) } -// Glob will test a string pattern, potentially containing globs, against a -// subject string. 
The result is a simple true/false, determining whether or -// not the glob pattern matched the subject text. -// -// Adapted from https://github.com/ryanuber/go-glob/blob/master/glob.go -// thanks Ryan Uber! -func Glob(pattern, measurement string) bool { - // Empty pattern can only match empty subject - if pattern == "" { - return measurement == pattern +// CombinedOutputTimeout runs the given command with the given timeout and +// returns the combined output of stdout and stderr. +// If the command times out, it attempts to kill the process. +func CombinedOutputTimeout(c *exec.Cmd, timeout time.Duration) ([]byte, error) { + var b bytes.Buffer + c.Stdout = &b + c.Stderr = &b + if err := c.Start(); err != nil { + return nil, err + } + err := WaitTimeout(c, timeout) + return b.Bytes(), err +} + +// RunTimeout runs the given command with the given timeout. +// If the command times out, it attempts to kill the process. +func RunTimeout(c *exec.Cmd, timeout time.Duration) error { + if err := c.Start(); err != nil { + return err + } + return WaitTimeout(c, timeout) +} + +// WaitTimeout waits for the given command to finish with a timeout. +// It assumes the command has already been started. +// If the command times out, it attempts to kill the process. +func WaitTimeout(c *exec.Cmd, timeout time.Duration) error { + timer := time.NewTimer(timeout) + done := make(chan error) + go func() { done <- c.Wait() }() + select { + case err := <-done: + timer.Stop() + return err + case <-timer.C: + if err := c.Process.Kill(); err != nil { + log.Printf("FATAL error killing process: %s", err) + return err + } + // wait for the command to return after killing it + <-done + return TimeoutErr } - - // If the pattern _is_ a glob, it matches everything - if pattern == "*" { - return true - } - - parts := strings.Split(pattern, "*") - - if len(parts) == 1 { - // No globs in pattern, so test for match - return pattern == measurement - } - - leadingGlob := strings.HasPrefix(pattern, "*") - trailingGlob := strings.HasSuffix(pattern, "*") - end := len(parts) - 1 - - for i, part := range parts { - switch i { - case 0: - if leadingGlob { - continue - } - if !strings.HasPrefix(measurement, part) { - return false - } - case end: - if len(measurement) > 0 { - return trailingGlob || strings.HasSuffix(measurement, part) - } - default: - if !strings.Contains(measurement, part) { - return false - } - } - - // Trim evaluated text from measurement as we loop over the pattern. 
- idx := strings.Index(measurement, part) + len(part) - measurement = measurement[idx:] - } - - // All parts of the pattern matched - return true } diff --git a/internal/internal_test.go b/internal/internal_test.go index e4a5eed14..90e1badc1 100644 --- a/internal/internal_test.go +++ b/internal/internal_test.go @@ -1,47 +1,12 @@ package internal -import "testing" +import ( + "os/exec" + "testing" + "time" -func testGlobMatch(t *testing.T, pattern, subj string) { - if !Glob(pattern, subj) { - t.Errorf("%s should match %s", pattern, subj) - } -} - -func testGlobNoMatch(t *testing.T, pattern, subj string) { - if Glob(pattern, subj) { - t.Errorf("%s should not match %s", pattern, subj) - } -} - -func TestEmptyPattern(t *testing.T) { - testGlobMatch(t, "", "") - testGlobNoMatch(t, "", "test") -} - -func TestPatternWithoutGlobs(t *testing.T) { - testGlobMatch(t, "test", "test") -} - -func TestGlob(t *testing.T) { - for _, pattern := range []string{ - "*test", // Leading glob - "this*", // Trailing glob - "*is*a*", // Lots of globs - "**test**", // Double glob characters - "**is**a***test*", // Varying number of globs - } { - testGlobMatch(t, pattern, "this_is_a_test") - } - - for _, pattern := range []string{ - "test*", // Implicit substring match should fail - "*is", // Partial match should fail - "*no*", // Globs without a match between them should fail - } { - testGlobNoMatch(t, pattern, "this_is_a_test") - } -} + "github.com/stretchr/testify/assert" +) type SnakeTest struct { input string @@ -71,3 +36,73 @@ func TestSnakeCase(t *testing.T) { } } } + +var ( + sleepbin, _ = exec.LookPath("sleep") + echobin, _ = exec.LookPath("echo") +) + +func TestRunTimeout(t *testing.T) { + if sleepbin == "" { + t.Skip("'sleep' binary not available on OS, skipping.") + } + cmd := exec.Command(sleepbin, "10") + start := time.Now() + err := RunTimeout(cmd, time.Millisecond*20) + elapsed := time.Since(start) + + assert.Equal(t, TimeoutErr, err) + // Verify that command gets killed in 20ms, with some breathing room + assert.True(t, elapsed < time.Millisecond*75) +} + +func TestCombinedOutputTimeout(t *testing.T) { + if sleepbin == "" { + t.Skip("'sleep' binary not available on OS, skipping.") + } + cmd := exec.Command(sleepbin, "10") + start := time.Now() + _, err := CombinedOutputTimeout(cmd, time.Millisecond*20) + elapsed := time.Since(start) + + assert.Equal(t, TimeoutErr, err) + // Verify that command gets killed in 20ms, with some breathing room + assert.True(t, elapsed < time.Millisecond*75) +} + +func TestCombinedOutput(t *testing.T) { + if echobin == "" { + t.Skip("'echo' binary not available on OS, skipping.") + } + cmd := exec.Command(echobin, "foo") + out, err := CombinedOutputTimeout(cmd, time.Second) + + assert.NoError(t, err) + assert.Equal(t, "foo\n", string(out)) +} + +// test that CombinedOutputTimeout and exec.Cmd.CombinedOutput return +// the same output from a failed command. 
+func TestCombinedOutputError(t *testing.T) { + if sleepbin == "" { + t.Skip("'sleep' binary not available on OS, skipping.") + } + cmd := exec.Command(sleepbin, "foo") + expected, err := cmd.CombinedOutput() + + cmd2 := exec.Command(sleepbin, "foo") + actual, err := CombinedOutputTimeout(cmd2, time.Second) + + assert.Error(t, err) + assert.Equal(t, expected, actual) +} + +func TestRunError(t *testing.T) { + if sleepbin == "" { + t.Skip("'sleep' binary not available on OS, skipping.") + } + cmd := exec.Command(sleepbin, "foo") + err := RunTimeout(cmd, time.Second) + + assert.Error(t, err) +} diff --git a/internal/models/filter.go b/internal/models/filter.go index e2b1377f4..d78492a5d 100644 --- a/internal/models/filter.go +++ b/internal/models/filter.go @@ -1,33 +1,104 @@ package internal_models import ( + "fmt" "strings" + "github.com/gobwas/glob" + "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/internal" ) // TagFilter is the name of a tag, and the values on which to filter type TagFilter struct { Name string Filter []string + filter glob.Glob } // Filter containing drop/pass and tagdrop/tagpass rules type Filter struct { NameDrop []string + nameDrop glob.Glob NamePass []string + namePass glob.Glob FieldDrop []string + fieldDrop glob.Glob FieldPass []string + fieldPass glob.Glob TagDrop []TagFilter TagPass []TagFilter + TagExclude []string + tagExclude glob.Glob + TagInclude []string + tagInclude glob.Glob + IsActive bool } -func (f Filter) ShouldMetricPass(metric telegraf.Metric) bool { +// Compile all Filter lists into glob.Glob objects. +func (f *Filter) CompileFilter() error { + var err error + f.nameDrop, err = compileFilter(f.NameDrop) + if err != nil { + return fmt.Errorf("Error compiling 'namedrop', %s", err) + } + f.namePass, err = compileFilter(f.NamePass) + if err != nil { + return fmt.Errorf("Error compiling 'namepass', %s", err) + } + + f.fieldDrop, err = compileFilter(f.FieldDrop) + if err != nil { + return fmt.Errorf("Error compiling 'fielddrop', %s", err) + } + f.fieldPass, err = compileFilter(f.FieldPass) + if err != nil { + return fmt.Errorf("Error compiling 'fieldpass', %s", err) + } + + f.tagExclude, err = compileFilter(f.TagExclude) + if err != nil { + return fmt.Errorf("Error compiling 'tagexclude', %s", err) + } + f.tagInclude, err = compileFilter(f.TagInclude) + if err != nil { + return fmt.Errorf("Error compiling 'taginclude', %s", err) + } + + for i, _ := range f.TagDrop { + f.TagDrop[i].filter, err = compileFilter(f.TagDrop[i].Filter) + if err != nil { + return fmt.Errorf("Error compiling 'tagdrop', %s", err) + } + } + for i, _ := range f.TagPass { + f.TagPass[i].filter, err = compileFilter(f.TagPass[i].Filter) + if err != nil { + return fmt.Errorf("Error compiling 'tagpass', %s", err) + } + } + return nil +} + +func compileFilter(filter []string) (glob.Glob, error) { + if len(filter) == 0 { + return nil, nil + } + var g glob.Glob + var err error + if len(filter) == 1 { + g, err = glob.Compile(filter[0]) + } else { + g, err = glob.Compile("{" + strings.Join(filter, ",") + "}") + } + return g, err +} + +func (f *Filter) ShouldMetricPass(metric telegraf.Metric) bool { if f.ShouldNamePass(metric.Name()) && f.ShouldTagsPass(metric.Tags()) { return true } @@ -36,70 +107,51 @@ func (f Filter) ShouldMetricPass(metric telegraf.Metric) bool { // ShouldFieldsPass returns true if the metric should pass, false if should drop // based on the drop/pass filter parameters -func (f Filter) ShouldNamePass(key string) bool { - if f.NamePass != nil { - for _, 
pat := range f.NamePass { - // TODO remove HasPrefix check, leaving it for now for legacy support. - // Cam, 2015-12-07 - if strings.HasPrefix(key, pat) || internal.Glob(pat, key) { - return true - } +func (f *Filter) ShouldNamePass(key string) bool { + if f.namePass != nil { + if f.namePass.Match(key) { + return true } return false } - if f.NameDrop != nil { - for _, pat := range f.NameDrop { - // TODO remove HasPrefix check, leaving it for now for legacy support. - // Cam, 2015-12-07 - if strings.HasPrefix(key, pat) || internal.Glob(pat, key) { - return false - } + if f.nameDrop != nil { + if f.nameDrop.Match(key) { + return false } - - return true } return true } // ShouldFieldsPass returns true if the metric should pass, false if should drop // based on the drop/pass filter parameters -func (f Filter) ShouldFieldsPass(key string) bool { - if f.FieldPass != nil { - for _, pat := range f.FieldPass { - // TODO remove HasPrefix check, leaving it for now for legacy support. - // Cam, 2015-12-07 - if strings.HasPrefix(key, pat) || internal.Glob(pat, key) { - return true - } +func (f *Filter) ShouldFieldsPass(key string) bool { + if f.fieldPass != nil { + if f.fieldPass.Match(key) { + return true } return false } - if f.FieldDrop != nil { - for _, pat := range f.FieldDrop { - // TODO remove HasPrefix check, leaving it for now for legacy support. - // Cam, 2015-12-07 - if strings.HasPrefix(key, pat) || internal.Glob(pat, key) { - return false - } + if f.fieldDrop != nil { + if f.fieldDrop.Match(key) { + return false } - - return true } return true } // ShouldTagsPass returns true if the metric should pass, false if should drop // based on the tagdrop/tagpass filter parameters -func (f Filter) ShouldTagsPass(tags map[string]string) bool { +func (f *Filter) ShouldTagsPass(tags map[string]string) bool { if f.TagPass != nil { for _, pat := range f.TagPass { + if pat.filter == nil { + continue + } if tagval, ok := tags[pat.Name]; ok { - for _, filter := range pat.Filter { - if internal.Glob(filter, tagval) { - return true - } + if pat.filter.Match(tagval) { + return true } } } @@ -108,11 +160,12 @@ func (f Filter) ShouldTagsPass(tags map[string]string) bool { if f.TagDrop != nil { for _, pat := range f.TagDrop { + if pat.filter == nil { + continue + } if tagval, ok := tags[pat.Name]; ok { - for _, filter := range pat.Filter { - if internal.Glob(filter, tagval) { - return false - } + if pat.filter.Match(tagval) { + return false } } } @@ -121,3 +174,23 @@ func (f Filter) ShouldTagsPass(tags map[string]string) bool { return true } + +// Apply TagInclude and TagExclude filters. +// modifies the tags map in-place. 
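// For example (mirroring the tests below): with TagExclude = ["ho*"] and
// tags = {"host": "localhost", "mytag": "foobar"}, FilterTags deletes "host",
// leaving {"mytag": "foobar"}; TagInclude = ["my*"] keeps only "mytag" as well.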
+func (f *Filter) FilterTags(tags map[string]string) { + if f.tagInclude != nil { + for k, _ := range tags { + if !f.tagInclude.Match(k) { + delete(tags, k) + } + } + } + + if f.tagExclude != nil { + for k, _ := range tags { + if f.tagExclude.Match(k) { + delete(tags, k) + } + } + } +} diff --git a/internal/models/filter_test.go b/internal/models/filter_test.go index c69398494..a37416095 100644 --- a/internal/models/filter_test.go +++ b/internal/models/filter_test.go @@ -2,6 +2,11 @@ package internal_models import ( "testing" + + "github.com/influxdata/telegraf/testutil" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestFilter_Empty(t *testing.T) { @@ -28,6 +33,7 @@ func TestFilter_NamePass(t *testing.T) { f := Filter{ NamePass: []string{"foo*", "cpu_usage_idle"}, } + require.NoError(t, f.CompileFilter()) passes := []string{ "foo", @@ -61,6 +67,7 @@ func TestFilter_NameDrop(t *testing.T) { f := Filter{ NameDrop: []string{"foo*", "cpu_usage_idle"}, } + require.NoError(t, f.CompileFilter()) drops := []string{ "foo", @@ -94,6 +101,7 @@ func TestFilter_FieldPass(t *testing.T) { f := Filter{ FieldPass: []string{"foo*", "cpu_usage_idle"}, } + require.NoError(t, f.CompileFilter()) passes := []string{ "foo", @@ -127,6 +135,7 @@ func TestFilter_FieldDrop(t *testing.T) { f := Filter{ FieldDrop: []string{"foo*", "cpu_usage_idle"}, } + require.NoError(t, f.CompileFilter()) drops := []string{ "foo", @@ -169,6 +178,7 @@ func TestFilter_TagPass(t *testing.T) { f := Filter{ TagPass: filters, } + require.NoError(t, f.CompileFilter()) passes := []map[string]string{ {"cpu": "cpu-total"}, @@ -212,6 +222,7 @@ func TestFilter_TagDrop(t *testing.T) { f := Filter{ TagDrop: filters, } + require.NoError(t, f.CompileFilter()) drops := []map[string]string{ {"cpu": "cpu-total"}, @@ -241,3 +252,115 @@ func TestFilter_TagDrop(t *testing.T) { } } } + +func TestFilter_CompileFilterError(t *testing.T) { + f := Filter{ + NameDrop: []string{"", ""}, + } + assert.Error(t, f.CompileFilter()) + f = Filter{ + NamePass: []string{"", ""}, + } + assert.Error(t, f.CompileFilter()) + f = Filter{ + FieldDrop: []string{"", ""}, + } + assert.Error(t, f.CompileFilter()) + f = Filter{ + FieldPass: []string{"", ""}, + } + assert.Error(t, f.CompileFilter()) + f = Filter{ + TagExclude: []string{"", ""}, + } + assert.Error(t, f.CompileFilter()) + f = Filter{ + TagInclude: []string{"", ""}, + } + assert.Error(t, f.CompileFilter()) + filters := []TagFilter{ + TagFilter{ + Name: "cpu", + Filter: []string{"{foobar}"}, + }} + f = Filter{ + TagDrop: filters, + } + require.Error(t, f.CompileFilter()) + filters = []TagFilter{ + TagFilter{ + Name: "cpu", + Filter: []string{"{foobar}"}, + }} + f = Filter{ + TagPass: filters, + } + require.Error(t, f.CompileFilter()) +} + +func TestFilter_ShouldMetricsPass(t *testing.T) { + m := testutil.TestMetric(1, "testmetric") + f := Filter{ + NameDrop: []string{"foobar"}, + } + require.NoError(t, f.CompileFilter()) + require.True(t, f.ShouldMetricPass(m)) + + m = testutil.TestMetric(1, "foobar") + require.False(t, f.ShouldMetricPass(m)) +} + +func TestFilter_FilterTagsNoMatches(t *testing.T) { + pretags := map[string]string{ + "host": "localhost", + "mytag": "foobar", + } + f := Filter{ + TagExclude: []string{"nomatch"}, + } + require.NoError(t, f.CompileFilter()) + + f.FilterTags(pretags) + assert.Equal(t, map[string]string{ + "host": "localhost", + "mytag": "foobar", + }, pretags) + + f = Filter{ + TagInclude: []string{"nomatch"}, + } + require.NoError(t, 
f.CompileFilter()) + + f.FilterTags(pretags) + assert.Equal(t, map[string]string{}, pretags) +} + +func TestFilter_FilterTagsMatches(t *testing.T) { + pretags := map[string]string{ + "host": "localhost", + "mytag": "foobar", + } + f := Filter{ + TagExclude: []string{"ho*"}, + } + require.NoError(t, f.CompileFilter()) + + f.FilterTags(pretags) + assert.Equal(t, map[string]string{ + "mytag": "foobar", + }, pretags) + + pretags = map[string]string{ + "host": "localhost", + "mytag": "foobar", + } + f = Filter{ + TagInclude: []string{"my*"}, + } + require.NoError(t, f.CompileFilter()) + + f.FilterTags(pretags) + assert.Equal(t, map[string]string{ + "mytag": "foobar", + }, pretags) +} diff --git a/internal/models/running_output.go b/internal/models/running_output.go index 1e3d44a61..d0d2abbc1 100644 --- a/internal/models/running_output.go +++ b/internal/models/running_output.go @@ -2,48 +2,54 @@ package internal_models import ( "log" - "sync" "time" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal/buffer" ) const ( - // Default number of metrics kept between flushes. - DEFAULT_METRIC_BUFFER_LIMIT = 1000 + // Default size of metrics batch size. + DEFAULT_METRIC_BATCH_SIZE = 1000 - // Limit how many full metric buffers are kept due to failed writes. - FULL_METRIC_BUFFERS_LIMIT = 100 + // Default number of metrics kept. It should be a multiple of batch size. + DEFAULT_METRIC_BUFFER_LIMIT = 10000 ) +// RunningOutput contains the output configuration type RunningOutput struct { - Name string - Output telegraf.Output - Config *OutputConfig - Quiet bool - MetricBufferLimit int - FlushBufferWhenFull bool + Name string + Output telegraf.Output + Config *OutputConfig + Quiet bool + MetricBufferLimit int + MetricBatchSize int - metrics []telegraf.Metric - tmpmetrics map[int][]telegraf.Metric - overwriteI int - mapI int - - sync.Mutex + metrics *buffer.Buffer + failMetrics *buffer.Buffer } func NewRunningOutput( name string, output telegraf.Output, conf *OutputConfig, + batchSize int, + bufferLimit int, ) *RunningOutput { + if bufferLimit == 0 { + bufferLimit = DEFAULT_METRIC_BUFFER_LIMIT + } + if batchSize == 0 { + batchSize = DEFAULT_METRIC_BATCH_SIZE + } ro := &RunningOutput{ Name: name, - metrics: make([]telegraf.Metric, 0), - tmpmetrics: make(map[int][]telegraf.Metric), + metrics: buffer.NewBuffer(batchSize), + failMetrics: buffer.NewBuffer(bufferLimit), Output: output, Config: conf, - MetricBufferLimit: DEFAULT_METRIC_BUFFER_LIMIT, + MetricBufferLimit: bufferLimit, + MetricBatchSize: batchSize, } return ro } @@ -56,67 +62,78 @@ func (ro *RunningOutput) AddMetric(metric telegraf.Metric) { return } } - ro.Lock() - defer ro.Unlock() - if len(ro.metrics) < ro.MetricBufferLimit { - ro.metrics = append(ro.metrics, metric) - } else { - if ro.FlushBufferWhenFull { - ro.metrics = append(ro.metrics, metric) - tmpmetrics := make([]telegraf.Metric, len(ro.metrics)) - copy(tmpmetrics, ro.metrics) - ro.metrics = make([]telegraf.Metric, 0) - err := ro.write(tmpmetrics) - if err != nil { - log.Printf("ERROR writing full metric buffer to output %s, %s", - ro.Name, err) - if len(ro.tmpmetrics) == FULL_METRIC_BUFFERS_LIMIT { - ro.mapI = 0 - // overwrite one - ro.tmpmetrics[ro.mapI] = tmpmetrics - ro.mapI++ - } else { - ro.tmpmetrics[ro.mapI] = tmpmetrics - ro.mapI++ - } - } - } else { - if ro.overwriteI == 0 { - log.Printf("WARNING: overwriting cached metrics, you may want to " + - "increase the metric_buffer_limit setting in your [agent] " + - "config if you do not wish to overwrite 
metrics.\n") - } - if ro.overwriteI == len(ro.metrics) { - ro.overwriteI = 0 - } - ro.metrics[ro.overwriteI] = metric - ro.overwriteI++ + // Filter any tagexclude/taginclude parameters before adding metric + if len(ro.Config.Filter.TagExclude) != 0 || len(ro.Config.Filter.TagInclude) != 0 { + // In order to filter out tags, we need to create a new metric, since + // metrics are immutable once created. + tags := metric.Tags() + fields := metric.Fields() + t := metric.Time() + name := metric.Name() + ro.Config.Filter.FilterTags(tags) + // error is not possible if creating from another metric, so ignore. + metric, _ = telegraf.NewMetric(name, tags, fields, t) + } + + ro.metrics.Add(metric) + if ro.metrics.Len() == ro.MetricBatchSize { + batch := ro.metrics.Batch(ro.MetricBatchSize) + err := ro.write(batch) + if err != nil { + ro.failMetrics.Add(batch...) } } } // Write writes all cached points to this output. func (ro *RunningOutput) Write() error { - ro.Lock() - defer ro.Unlock() - err := ro.write(ro.metrics) - if err != nil { - return err - } else { - ro.metrics = make([]telegraf.Metric, 0) - ro.overwriteI = 0 + if !ro.Quiet { + log.Printf("Output [%s] buffer fullness: %d / %d metrics. "+ + "Total gathered metrics: %d. Total dropped metrics: %d.", + ro.Name, + ro.failMetrics.Len()+ro.metrics.Len(), + ro.MetricBufferLimit, + ro.metrics.Total(), + ro.metrics.Drops()+ro.failMetrics.Drops()) } - // Write any cached metric buffers that failed previously - for i, tmpmetrics := range ro.tmpmetrics { - if err := ro.write(tmpmetrics); err != nil { - return err - } else { - delete(ro.tmpmetrics, i) + var err error + if !ro.failMetrics.IsEmpty() { + bufLen := ro.failMetrics.Len() + // how many batches of failed writes we need to write. + nBatches := bufLen/ro.MetricBatchSize + 1 + batchSize := ro.MetricBatchSize + + for i := 0; i < nBatches; i++ { + // If it's the last batch, only grab the metrics that have not had + // a write attempt already (this is primarily to preserve order). + if i == nBatches-1 { + batchSize = bufLen % ro.MetricBatchSize + } + batch := ro.failMetrics.Batch(batchSize) + // If we've already failed previous writes, don't bother trying to + // write to this output again. We are not exiting the loop just so + // that we can rotate the metrics to preserve order. + if err == nil { + err = ro.write(batch) + } + if err != nil { + ro.failMetrics.Add(batch...) + } } } + batch := ro.metrics.Batch(ro.MetricBatchSize) + // see comment above about not trying to write to an already failed output. + // if ro.failMetrics is empty then err will always be nil at this point. + if err == nil { + err = ro.write(batch) + } + if err != nil { + ro.failMetrics.Add(batch...) + return err + } return nil } @@ -129,8 +146,8 @@ func (ro *RunningOutput) write(metrics []telegraf.Metric) error { elapsed := time.Since(start) if err == nil { if !ro.Quiet { - log.Printf("Wrote %d metrics to output %s in %s\n", - len(metrics), ro.Name, elapsed) + log.Printf("Output [%s] wrote batch of %d metrics in %s\n", + ro.Name, len(metrics), elapsed) } } return err diff --git a/internal/models/running_output_test.go b/internal/models/running_output_test.go index 6eee3bd11..d9238c5a4 100644 --- a/internal/models/running_output_test.go +++ b/internal/models/running_output_test.go @@ -2,7 +2,6 @@ package internal_models import ( "fmt" - "sort" "sync" "testing" @@ -29,16 +28,100 @@ var next5 = []telegraf.Metric{ testutil.TestMetric(101, "metric10"), } -// Test that we can write metrics with simple default setup. 
-func TestRunningOutputDefault(t *testing.T) { +// Benchmark adding metrics. +func BenchmarkRunningOutputAddWrite(b *testing.B) { conf := &OutputConfig{ Filter: Filter{ IsActive: false, }, } + m := &perfOutput{} + ro := NewRunningOutput("test", m, conf, 1000, 10000) + ro.Quiet = true + + for n := 0; n < b.N; n++ { + ro.AddMetric(first5[0]) + ro.Write() + } +} + +// Benchmark adding metrics. +func BenchmarkRunningOutputAddWriteEvery100(b *testing.B) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: false, + }, + } + + m := &perfOutput{} + ro := NewRunningOutput("test", m, conf, 1000, 10000) + ro.Quiet = true + + for n := 0; n < b.N; n++ { + ro.AddMetric(first5[0]) + if n%100 == 0 { + ro.Write() + } + } +} + +// Benchmark adding metrics. +func BenchmarkRunningOutputAddFailWrites(b *testing.B) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: false, + }, + } + + m := &perfOutput{} + m.failWrite = true + ro := NewRunningOutput("test", m, conf, 1000, 10000) + ro.Quiet = true + + for n := 0; n < b.N; n++ { + ro.AddMetric(first5[0]) + } +} + +// Test that NameDrop filters ger properly applied. +func TestRunningOutput_DropFilter(t *testing.T) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: true, + NameDrop: []string{"metric1", "metric2"}, + }, + } + assert.NoError(t, conf.Filter.CompileFilter()) + m := &mockOutput{} - ro := NewRunningOutput("test", m, conf) + ro := NewRunningOutput("test", m, conf, 1000, 10000) + + for _, metric := range first5 { + ro.AddMetric(metric) + } + for _, metric := range next5 { + ro.AddMetric(metric) + } + assert.Len(t, m.Metrics(), 0) + + err := ro.Write() + assert.NoError(t, err) + assert.Len(t, m.Metrics(), 8) +} + +// Test that NameDrop filters without a match do nothing. +func TestRunningOutput_PassFilter(t *testing.T) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: true, + NameDrop: []string{"metric1000", "foo*"}, + }, + } + assert.NoError(t, conf.Filter.CompileFilter()) + + m := &mockOutput{} + ro := NewRunningOutput("test", m, conf, 1000, 10000) for _, metric := range first5 { ro.AddMetric(metric) @@ -53,41 +136,96 @@ func TestRunningOutputDefault(t *testing.T) { assert.Len(t, m.Metrics(), 10) } -// Test that the first metric gets overwritten if there is a buffer overflow. -func TestRunningOutputOverwrite(t *testing.T) { +// Test that tags are properly included +func TestRunningOutput_TagIncludeNoMatch(t *testing.T) { conf := &OutputConfig{ Filter: Filter{ - IsActive: false, + IsActive: true, + TagInclude: []string{"nothing*"}, }, } + assert.NoError(t, conf.Filter.CompileFilter()) m := &mockOutput{} - ro := NewRunningOutput("test", m, conf) - ro.MetricBufferLimit = 4 + ro := NewRunningOutput("test", m, conf, 1000, 10000) - for _, metric := range first5 { - ro.AddMetric(metric) - } - require.Len(t, m.Metrics(), 0) + ro.AddMetric(first5[0]) + assert.Len(t, m.Metrics(), 0) err := ro.Write() - require.NoError(t, err) - require.Len(t, m.Metrics(), 4) - - var expected, actual []string - for i, exp := range first5[1:] { - expected = append(expected, exp.String()) - actual = append(actual, m.Metrics()[i].String()) - } - - sort.Strings(expected) - sort.Strings(actual) - - assert.Equal(t, expected, actual) + assert.NoError(t, err) + assert.Len(t, m.Metrics(), 1) + assert.Empty(t, m.Metrics()[0].Tags()) } -// Test that multiple buffer overflows are handled properly. 
-func TestRunningOutputMultiOverwrite(t *testing.T) { +// Test that tags are properly excluded +func TestRunningOutput_TagExcludeMatch(t *testing.T) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: true, + TagExclude: []string{"tag*"}, + }, + } + assert.NoError(t, conf.Filter.CompileFilter()) + + m := &mockOutput{} + ro := NewRunningOutput("test", m, conf, 1000, 10000) + + ro.AddMetric(first5[0]) + assert.Len(t, m.Metrics(), 0) + + err := ro.Write() + assert.NoError(t, err) + assert.Len(t, m.Metrics(), 1) + assert.Len(t, m.Metrics()[0].Tags(), 0) +} + +// Test that tags are properly Excluded +func TestRunningOutput_TagExcludeNoMatch(t *testing.T) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: true, + TagExclude: []string{"nothing*"}, + }, + } + assert.NoError(t, conf.Filter.CompileFilter()) + + m := &mockOutput{} + ro := NewRunningOutput("test", m, conf, 1000, 10000) + + ro.AddMetric(first5[0]) + assert.Len(t, m.Metrics(), 0) + + err := ro.Write() + assert.NoError(t, err) + assert.Len(t, m.Metrics(), 1) + assert.Len(t, m.Metrics()[0].Tags(), 1) +} + +// Test that tags are properly included +func TestRunningOutput_TagIncludeMatch(t *testing.T) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: true, + TagInclude: []string{"tag*"}, + }, + } + assert.NoError(t, conf.Filter.CompileFilter()) + + m := &mockOutput{} + ro := NewRunningOutput("test", m, conf, 1000, 10000) + + ro.AddMetric(first5[0]) + assert.Len(t, m.Metrics(), 0) + + err := ro.Write() + assert.NoError(t, err) + assert.Len(t, m.Metrics(), 1) + assert.Len(t, m.Metrics()[0].Tags(), 1) +} + +// Test that we can write metrics with simple default setup. +func TestRunningOutputDefault(t *testing.T) { conf := &OutputConfig{ Filter: Filter{ IsActive: false, @@ -95,8 +233,7 @@ func TestRunningOutputMultiOverwrite(t *testing.T) { } m := &mockOutput{} - ro := NewRunningOutput("test", m, conf) - ro.MetricBufferLimit = 3 + ro := NewRunningOutput("test", m, conf, 1000, 10000) for _, metric := range first5 { ro.AddMetric(metric) @@ -104,22 +241,11 @@ func TestRunningOutputMultiOverwrite(t *testing.T) { for _, metric := range next5 { ro.AddMetric(metric) } - require.Len(t, m.Metrics(), 0) + assert.Len(t, m.Metrics(), 0) err := ro.Write() - require.NoError(t, err) - require.Len(t, m.Metrics(), 3) - - var expected, actual []string - for i, exp := range next5[2:] { - expected = append(expected, exp.String()) - actual = append(actual, m.Metrics()[i].String()) - } - - sort.Strings(expected) - sort.Strings(actual) - - assert.Equal(t, expected, actual) + assert.NoError(t, err) + assert.Len(t, m.Metrics(), 10) } // Test that running output doesn't flush until it's full when @@ -132,11 +258,9 @@ func TestRunningOutputFlushWhenFull(t *testing.T) { } m := &mockOutput{} - ro := NewRunningOutput("test", m, conf) - ro.FlushBufferWhenFull = true - ro.MetricBufferLimit = 5 + ro := NewRunningOutput("test", m, conf, 6, 10) - // Fill buffer to limit + // Fill buffer to 1 under limit for _, metric := range first5 { ro.AddMetric(metric) } @@ -165,9 +289,7 @@ func TestRunningOutputMultiFlushWhenFull(t *testing.T) { } m := &mockOutput{} - ro := NewRunningOutput("test", m, conf) - ro.FlushBufferWhenFull = true - ro.MetricBufferLimit = 4 + ro := NewRunningOutput("test", m, conf, 4, 12) // Fill buffer past limit twive for _, metric := range first5 { @@ -177,7 +299,7 @@ func TestRunningOutputMultiFlushWhenFull(t *testing.T) { ro.AddMetric(metric) } // flushed twice - assert.Len(t, m.Metrics(), 10) + assert.Len(t, m.Metrics(), 8) } func 
TestRunningOutputWriteFail(t *testing.T) { @@ -189,11 +311,9 @@ func TestRunningOutputWriteFail(t *testing.T) { m := &mockOutput{} m.failWrite = true - ro := NewRunningOutput("test", m, conf) - ro.FlushBufferWhenFull = true - ro.MetricBufferLimit = 4 + ro := NewRunningOutput("test", m, conf, 4, 12) - // Fill buffer past limit twice + // Fill buffer to limit twice for _, metric := range first5 { ro.AddMetric(metric) } @@ -216,6 +336,161 @@ func TestRunningOutputWriteFail(t *testing.T) { assert.Len(t, m.Metrics(), 10) } +// Verify that the order of points is preserved during a write failure. +func TestRunningOutputWriteFailOrder(t *testing.T) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: false, + }, + } + + m := &mockOutput{} + m.failWrite = true + ro := NewRunningOutput("test", m, conf, 100, 1000) + + // add 5 metrics + for _, metric := range first5 { + ro.AddMetric(metric) + } + // no successful flush yet + assert.Len(t, m.Metrics(), 0) + + // Write fails + err := ro.Write() + require.Error(t, err) + // no successful flush yet + assert.Len(t, m.Metrics(), 0) + + m.failWrite = false + // add 5 more metrics + for _, metric := range next5 { + ro.AddMetric(metric) + } + err = ro.Write() + require.NoError(t, err) + + // Verify that 10 metrics were written + assert.Len(t, m.Metrics(), 10) + // Verify that they are in order + expected := append(first5, next5...) + assert.Equal(t, expected, m.Metrics()) +} + +// Verify that the order of points is preserved during many write failures. +func TestRunningOutputWriteFailOrder2(t *testing.T) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: false, + }, + } + + m := &mockOutput{} + m.failWrite = true + ro := NewRunningOutput("test", m, conf, 5, 100) + + // add 5 metrics + for _, metric := range first5 { + ro.AddMetric(metric) + } + // Write fails + err := ro.Write() + require.Error(t, err) + // no successful flush yet + assert.Len(t, m.Metrics(), 0) + + // add 5 metrics + for _, metric := range next5 { + ro.AddMetric(metric) + } + // Write fails + err = ro.Write() + require.Error(t, err) + // no successful flush yet + assert.Len(t, m.Metrics(), 0) + + // add 5 metrics + for _, metric := range first5 { + ro.AddMetric(metric) + } + // Write fails + err = ro.Write() + require.Error(t, err) + // no successful flush yet + assert.Len(t, m.Metrics(), 0) + + // add 5 metrics + for _, metric := range next5 { + ro.AddMetric(metric) + } + // Write fails + err = ro.Write() + require.Error(t, err) + // no successful flush yet + assert.Len(t, m.Metrics(), 0) + + m.failWrite = false + err = ro.Write() + require.NoError(t, err) + + // Verify that 10 metrics were written + assert.Len(t, m.Metrics(), 20) + // Verify that they are in order + expected := append(first5, next5...) + expected = append(expected, first5...) + expected = append(expected, next5...) + assert.Equal(t, expected, m.Metrics()) +} + +// Verify that the order of points is preserved when there is a remainder +// of points for the batch. 
+// +// ie, with a batch size of 5: +// +// 1 2 3 4 5 6 <-- order, failed points +// 6 1 2 3 4 5 <-- order, after 1st write failure (1 2 3 4 5 was batch) +// 1 2 3 4 5 6 <-- order, after 2nd write failure, (6 was batch) +// +func TestRunningOutputWriteFailOrder3(t *testing.T) { + conf := &OutputConfig{ + Filter: Filter{ + IsActive: false, + }, + } + + m := &mockOutput{} + m.failWrite = true + ro := NewRunningOutput("test", m, conf, 5, 1000) + + // add 5 metrics + for _, metric := range first5 { + ro.AddMetric(metric) + } + // no successful flush yet + assert.Len(t, m.Metrics(), 0) + + // Write fails + err := ro.Write() + require.Error(t, err) + // no successful flush yet + assert.Len(t, m.Metrics(), 0) + + // add and attempt to write a single metric: + ro.AddMetric(next5[0]) + err = ro.Write() + require.Error(t, err) + + // unset fail and write metrics + m.failWrite = false + err = ro.Write() + require.NoError(t, err) + + // Verify that 6 metrics were written + assert.Len(t, m.Metrics(), 6) + // Verify that they are in order + expected := append(first5, next5[0]) + assert.Equal(t, expected, m.Metrics()) +} + type mockOutput struct { sync.Mutex @@ -263,3 +538,31 @@ func (m *mockOutput) Metrics() []telegraf.Metric { defer m.Unlock() return m.metrics } + +type perfOutput struct { + // if true, mock a write failure + failWrite bool +} + +func (m *perfOutput) Connect() error { + return nil +} + +func (m *perfOutput) Close() error { + return nil +} + +func (m *perfOutput) Description() string { + return "" +} + +func (m *perfOutput) SampleConfig() string { + return "" +} + +func (m *perfOutput) Write(metrics []telegraf.Metric) error { + if m.failWrite { + return fmt.Errorf("Failed Write!") + } + return nil +} diff --git a/plugins/inputs/all/all.go b/plugins/inputs/all/all.go index 3f56ee541..93ea3e779 100644 --- a/plugins/inputs/all/all.go +++ b/plugins/inputs/all/all.go @@ -14,6 +14,7 @@ import ( _ "github.com/influxdata/telegraf/plugins/inputs/dovecot" _ "github.com/influxdata/telegraf/plugins/inputs/elasticsearch" _ "github.com/influxdata/telegraf/plugins/inputs/exec" + _ "github.com/influxdata/telegraf/plugins/inputs/filestat" _ "github.com/influxdata/telegraf/plugins/inputs/github_webhooks" _ "github.com/influxdata/telegraf/plugins/inputs/haproxy" _ "github.com/influxdata/telegraf/plugins/inputs/http_response" @@ -55,6 +56,7 @@ import ( _ "github.com/influxdata/telegraf/plugins/inputs/statsd" _ "github.com/influxdata/telegraf/plugins/inputs/sysstat" _ "github.com/influxdata/telegraf/plugins/inputs/system" + _ "github.com/influxdata/telegraf/plugins/inputs/tail" _ "github.com/influxdata/telegraf/plugins/inputs/tcp_listener" _ "github.com/influxdata/telegraf/plugins/inputs/trig" _ "github.com/influxdata/telegraf/plugins/inputs/twemproxy" diff --git a/plugins/inputs/cassandra/cassandra.go b/plugins/inputs/cassandra/cassandra.go index b7525de1a..351232aca 100644 --- a/plugins/inputs/cassandra/cassandra.go +++ b/plugins/inputs/cassandra/cassandra.go @@ -7,19 +7,12 @@ import ( "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/plugins/inputs" "io/ioutil" + "log" "net/http" "net/url" - //"reflect" "strings" ) -/*type Server struct { - Host string - Username string - Password string - Port string -}*/ - type JolokiaClient interface { MakeRequest(req *http.Request) (*http.Response, error) } @@ -55,12 +48,6 @@ type jmxMetric interface { addTagsFields(out map[string]interface{}) } -func addServerTags(host string, tags map[string]string) { - if host != "" && host != "localhost" && host != 
"127.0.0.1" { - tags["host"] = host - } -} - func newJavaMetric(host string, metric string, acc telegraf.Accumulator) *javaMetric { return &javaMetric{host: host, metric: metric, acc: acc} @@ -120,7 +107,7 @@ func (j javaMetric) addTagsFields(out map[string]interface{}) { tokens := parseJmxMetricRequest(mbean) addTokensToTags(tokens, tags) - addServerTags(j.host, tags) + tags["cassandra_host"] = j.host if _, ok := tags["mname"]; !ok { //Queries for a single value will not return a "name" tag in the response. @@ -148,7 +135,7 @@ func addCassandraMetric(mbean string, c cassandraMetric, fields := make(map[string]interface{}) tokens := parseJmxMetricRequest(mbean) addTokensToTags(tokens, tags) - addServerTags(c.host, tags) + tags["cassandra_host"] = c.host addValuesAsFields(values, fields, tags["mname"]) c.acc.AddFields(tokens["class"]+tokens["type"], fields, tags) @@ -192,7 +179,7 @@ func (j *Cassandra) SampleConfig() string { servers = ["myuser:mypassword@10.10.10.1:8778","10.10.10.2:8778",":8778"] ## List of metrics collected on above servers ## Each metric consists of a jmx path. - ## This will collect all heap memory usage metrics from the jvm and + ## This will collect all heap memory usage metrics from the jvm and ## ReadLatency metrics for all keyspaces and tables. ## "type=Table" in the query works with Cassandra3.0. Older versions might ## need to use "type=ColumnFamily" @@ -277,15 +264,19 @@ func (c *Cassandra) Gather(acc telegraf.Accumulator) error { for _, server := range servers { for _, metric := range metrics { - var m jmxMetric - serverTokens := parseServerTokens(server) + var m jmxMetric if strings.HasPrefix(metric, "/java.lang:") { m = newJavaMetric(serverTokens["host"], metric, acc) } else if strings.HasPrefix(metric, "/org.apache.cassandra.metrics:") { m = newCassandraMetric(serverTokens["host"], metric, acc) + } else { + // unsupported metric type + log.Printf("Unsupported Cassandra metric [%s], skipping", + metric) + continue } // Prepare URL diff --git a/plugins/inputs/cassandra/cassandra_test.go b/plugins/inputs/cassandra/cassandra_test.go index 184fa3bbb..aa39017fe 100644 --- a/plugins/inputs/cassandra/cassandra_test.go +++ b/plugins/inputs/cassandra/cassandra_test.go @@ -58,7 +58,7 @@ const validCassandraNestedMultiValueJSON = ` "status": 200, "timestamp": 1458089184, "value": { - "org.apache.cassandra.metrics:keyspace=test_keyspace1,name=ReadLatency,scope=test_table1,type=Table": + "org.apache.cassandra.metrics:keyspace=test_keyspace1,name=ReadLatency,scope=test_table1,type=Table": { "999thPercentile": 1.0, "Count": 100, "DurationUnit": "microseconds", @@ -66,7 +66,7 @@ const validCassandraNestedMultiValueJSON = ` "RateUnit": "events/second", "StdDev": null }, - "org.apache.cassandra.metrics:keyspace=test_keyspace2,name=ReadLatency,scope=test_table2,type=Table": + "org.apache.cassandra.metrics:keyspace=test_keyspace2,name=ReadLatency,scope=test_table2,type=Table": { "999thPercentile": 2.0, "Count": 200, "DurationUnit": "microseconds", @@ -163,13 +163,13 @@ func TestHttpJsonJavaMultiValue(t *testing.T) { "HeapMemoryUsage_used": 203288528.0, } tags1 := map[string]string{ - "host": "10.10.10.10", - "mname": "HeapMemoryUsage", + "cassandra_host": "10.10.10.10", + "mname": "HeapMemoryUsage", } tags2 := map[string]string{ - "host": "10.10.10.11", - "mname": "HeapMemoryUsage", + "cassandra_host": "10.10.10.11", + "mname": "HeapMemoryUsage", } acc.AssertContainsTaggedFields(t, "javaMemory", fields, tags1) acc.AssertContainsTaggedFields(t, "javaMemory", fields, tags2) @@ 
-190,8 +190,8 @@ func TestHttpJsonJavaMultiType(t *testing.T) { } tags := map[string]string{ - "host": "10.10.10.10", - "mname": "ConcurrentMarkSweep", + "cassandra_host": "10.10.10.10", + "mname": "ConcurrentMarkSweep", } acc.AssertContainsTaggedFields(t, "javaGarbageCollector", fields, tags) } @@ -231,10 +231,10 @@ func TestHttpJsonCassandraMultiValue(t *testing.T) { } tags := map[string]string{ - "host": "10.10.10.10", - "mname": "ReadLatency", - "keyspace": "test_keyspace1", - "scope": "test_table", + "cassandra_host": "10.10.10.10", + "mname": "ReadLatency", + "keyspace": "test_keyspace1", + "scope": "test_table", } acc.AssertContainsTaggedFields(t, "cassandraTable", fields, tags) } @@ -268,17 +268,17 @@ func TestHttpJsonCassandraNestedMultiValue(t *testing.T) { } tags1 := map[string]string{ - "host": "10.10.10.10", - "mname": "ReadLatency", - "keyspace": "test_keyspace1", - "scope": "test_table1", + "cassandra_host": "10.10.10.10", + "mname": "ReadLatency", + "keyspace": "test_keyspace1", + "scope": "test_table1", } tags2 := map[string]string{ - "host": "10.10.10.10", - "mname": "ReadLatency", - "keyspace": "test_keyspace2", - "scope": "test_table2", + "cassandra_host": "10.10.10.10", + "mname": "ReadLatency", + "keyspace": "test_keyspace2", + "scope": "test_table2", } acc.AssertContainsTaggedFields(t, "cassandraTable", fields1, tags1) diff --git a/plugins/inputs/cloudwatch/cloudwatch.go b/plugins/inputs/cloudwatch/cloudwatch.go index e3fa74bad..8edf2f895 100644 --- a/plugins/inputs/cloudwatch/cloudwatch.go +++ b/plugins/inputs/cloudwatch/cloudwatch.go @@ -7,8 +7,6 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/credentials" - "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds" - "github.com/aws/aws-sdk-go/aws/ec2metadata" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/cloudwatch" @@ -21,6 +19,8 @@ import ( type ( CloudWatch struct { Region string `toml:"region"` + AccessKey string `toml:"access_key"` + SecretKey string `toml:"secret_key"` Period internal.Duration `toml:"period"` Delay internal.Duration `toml:"delay"` Namespace string `toml:"namespace"` @@ -56,13 +56,22 @@ func (c *CloudWatch) SampleConfig() string { ## Amazon Region region = 'us-east-1' + ## Amazon Credentials + ## Credentials are loaded in the following order + ## 1) explicit credentials from 'access_key' and 'secret_key' + ## 2) environment variables + ## 3) shared credentials file + ## 4) EC2 Instance Profile + #access_key = "" + #secret_key = "" + ## Requested CloudWatch aggregation Period (required - must be a multiple of 60s) period = '1m' ## Collection Delay (required - must account for metrics availability via CloudWatch API) delay = '1m' - ## Recomended: use metric 'interval' that is a multiple of 'period' to avoid + ## Recomended: use metric 'interval' that is a multiple of 'period' to avoid ## gaps or overlap in pulled data interval = '1m' @@ -74,7 +83,7 @@ func (c *CloudWatch) SampleConfig() string { ## Refreshes Namespace available metrics every 1h #[[inputs.cloudwatch.metrics]] # names = ['Latency', 'RequestCount'] - # + # # ## Dimension filters for Metric (optional) # [[inputs.cloudwatch.metrics.dimensions]] # name = 'LoadBalancerName' @@ -154,12 +163,9 @@ func init() { func (c *CloudWatch) initializeCloudWatch() error { config := &aws.Config{ Region: aws.String(c.Region), - Credentials: credentials.NewChainCredentials( - []credentials.Provider{ - &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(session.New())}, - 
&credentials.EnvProvider{}, - &credentials.SharedCredentialsProvider{}, - }), + } + if c.AccessKey != "" || c.SecretKey != "" { + config.Credentials = credentials.NewStaticCredentials(c.AccessKey, c.SecretKey, "") } c.client = cloudwatch.New(session.New(config)) diff --git a/plugins/inputs/disque/disque.go b/plugins/inputs/disque/disque.go index d726590b4..0e4baf9cb 100644 --- a/plugins/inputs/disque/disque.go +++ b/plugins/inputs/disque/disque.go @@ -162,7 +162,7 @@ func (g *Disque) gatherServer(addr *url.URL, acc telegraf.Accumulator) error { var read int fields := make(map[string]interface{}) - tags := map[string]string{"host": addr.String()} + tags := map[string]string{"disque_host": addr.String()} for read < sz { line, err := r.ReadString('\n') if err != nil { diff --git a/plugins/inputs/dns_query/dns_query_test.go b/plugins/inputs/dns_query/dns_query_test.go index d7d267a59..aeeb7656f 100644 --- a/plugins/inputs/dns_query/dns_query_test.go +++ b/plugins/inputs/dns_query/dns_query_test.go @@ -15,6 +15,9 @@ var servers = []string{"8.8.8.8"} var domains = []string{"google.com"} func TestGathering(t *testing.T) { + if testing.Short() { + t.Skip("Skipping network-dependent test in short mode.") + } var dnsConfig = DnsQuery{ Servers: servers, Domains: domains, @@ -31,6 +34,9 @@ func TestGathering(t *testing.T) { } func TestGatheringMxRecord(t *testing.T) { + if testing.Short() { + t.Skip("Skipping network-dependent test in short mode.") + } var dnsConfig = DnsQuery{ Servers: servers, Domains: domains, @@ -48,6 +54,9 @@ func TestGatheringMxRecord(t *testing.T) { } func TestGatheringRootDomain(t *testing.T) { + if testing.Short() { + t.Skip("Skipping network-dependent test in short mode.") + } var dnsConfig = DnsQuery{ Servers: servers, Domains: []string{"."}, @@ -72,6 +81,9 @@ func TestGatheringRootDomain(t *testing.T) { } func TestMetricContainsServerAndDomainAndRecordTypeTags(t *testing.T) { + if testing.Short() { + t.Skip("Skipping network-dependent test in short mode.") + } var dnsConfig = DnsQuery{ Servers: servers, Domains: domains, @@ -95,6 +107,9 @@ func TestMetricContainsServerAndDomainAndRecordTypeTags(t *testing.T) { } func TestGatheringTimeout(t *testing.T) { + if testing.Short() { + t.Skip("Skipping network-dependent test in short mode.") + } var dnsConfig = DnsQuery{ Servers: servers, Domains: domains, diff --git a/plugins/inputs/docker/README.md b/plugins/inputs/docker/README.md index c22e6af8e..e59b6f513 100644 --- a/plugins/inputs/docker/README.md +++ b/plugins/inputs/docker/README.md @@ -29,10 +29,10 @@ for the stat structure can be found Every effort was made to preserve the names based on the JSON response from the docker API. -Note that the docker_cpu metric may appear multiple times per collection, based -on the availability of per-cpu stats on your system. +Note that the docker_container_cpu metric may appear multiple times per collection, +based on the availability of per-cpu stats on your system. -- docker_mem +- docker_container_mem - total_pgmafault - cache - mapped_file @@ -66,7 +66,8 @@ on the availability of per-cpu stats on your system. - usage - failcnt - limit -- docker_cpu + - container_id +- docker_container_cpu - throttling_periods - throttling_throttled_periods - throttling_throttled_time @@ -75,7 +76,8 @@ on the availability of per-cpu stats on your system. 
- usage_system - usage_total - usage_percent -- docker_net + - container_id +- docker_container_net - rx_dropped - rx_bytes - rx_errors @@ -84,7 +86,8 @@ on the availability of per-cpu stats on your system. - rx_packets - tx_errors - tx_bytes -- docker_blkio + - container_id +- docker_container_blkio - io_service_bytes_recursive_async - io_service_bytes_recursive_read - io_service_bytes_recursive_sync @@ -125,20 +128,20 @@ on the availability of per-cpu stats on your system. - docker_metadata - unit=bytes -- docker_cpu specific: - - cont_id (container ID) - - cont_image (container image) - - cont_name (container name) +- docker_container_mem specific: + - container_image + - container_name +- docker_container_cpu specific: + - container_image + - container_name - cpu -- docker_net specific: - - cont_id (container ID) - - cont_image (container image) - - cont_name (container name) +- docker_container_net specific: + - container_image + - container_name - network -- docker_blkio specific: - - cont_id (container ID) - - cont_image (container image) - - cont_name (container name) +- docker_container_blkio specific: + - container_image + - container_name - device ### Example Output: @@ -156,8 +159,8 @@ on the availability of per-cpu stats on your system. > docker,unit=bytes pool_blocksize=65540i 1456926671065383978 > docker_data,unit=bytes available=24340000000i,total=107400000000i,used=14820000000i 1456926671065383978 > docker_metadata,unit=bytes available=2126999999i,total=2146999999i,used=20420000i 145692667106538 -> docker_mem,cont_id=5705ba8ed8fb47527410653d60a8bb2f3af5e62372297c419022a3cc6d45d848,\ -cont_image=spotify/kafka,cont_name=kafka \ +> docker_container_mem, +container_image=spotify/kafka,container_name=kafka \ active_anon=52568064i,active_file=6926336i,cache=12038144i,fail_count=0i,\ hierarchical_memory_limit=9223372036854771712i,inactive_anon=52707328i,\ inactive_file=5111808i,limit=1044578304i,mapped_file=10301440i,\ @@ -168,21 +171,21 @@ total_inactive_file=5111808i,total_mapped_file=10301440i,total_pgfault=63762i,\ total_pgmafault=0i,total_pgpgin=73355i,total_pgpgout=45736i,\ total_rss=105275392i,total_rss_huge=4194304i,total_unevictable=0i,\ total_writeback=0i,unevictable=0i,usage=117440512i,writeback=0i 1453409536840126713 -> docker_cpu,cont_id=5705ba8ed8fb47527410653d60a8bb2f3af5e62372297c419022a3cc6d45d848,\ -cont_image=spotify/kafka,cont_name=kafka,cpu=cpu-total \ +> docker_container_cpu, +container_image=spotify/kafka,container_name=kafka,cpu=cpu-total \ throttling_periods=0i,throttling_throttled_periods=0i,\ throttling_throttled_time=0i,usage_in_kernelmode=440000000i,\ usage_in_usermode=2290000000i,usage_system=84795360000000i,\ usage_total=6628208865i 1453409536840126713 -> docker_cpu,cont_id=5705ba8ed8fb47527410653d60a8bb2f3af5e62372297c419022a3cc6d45d848,\ -cont_image=spotify/kafka,cont_name=kafka,cpu=cpu0 \ +> docker_container_cpu, +container_image=spotify/kafka,container_name=kafka,cpu=cpu0 \ usage_total=6628208865i 1453409536840126713 -> docker_net,cont_id=5705ba8ed8fb47527410653d60a8bb2f3af5e62372297c419022a3cc6d45d848,\ -cont_image=spotify/kafka,cont_name=kafka,network=eth0 \ +> docker_container_net,\ +container_image=spotify/kafka,container_name=kafka,network=eth0 \ rx_bytes=7468i,rx_dropped=0i,rx_errors=0i,rx_packets=94i,tx_bytes=946i,\ tx_dropped=0i,tx_errors=0i,tx_packets=13i 1453409536840126713 -> docker_blkio,cont_id=5705ba8ed8fb47527410653d60a8bb2f3af5e62372297c419022a3cc6d45d848,\ -cont_image=spotify/kafka,cont_name=kafka,device=8:0 \ +> 
docker_container_blkio, +container_image=spotify/kafka,container_name=kafka,device=8:0 \ io_service_bytes_recursive_async=80216064i,io_service_bytes_recursive_read=79925248i,\ io_service_bytes_recursive_sync=77824i,io_service_bytes_recursive_total=80293888i,\ io_service_bytes_recursive_write=368640i,io_serviced_recursive_async=6562i,\ diff --git a/plugins/inputs/docker/docker.go b/plugins/inputs/docker/docker.go index 094bad8ca..8a680a8e8 100644 --- a/plugins/inputs/docker/docker.go +++ b/plugins/inputs/docker/docker.go @@ -16,6 +16,7 @@ import ( "github.com/docker/engine-api/client" "github.com/docker/engine-api/types" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/plugins/inputs" ) @@ -23,6 +24,7 @@ import ( type Docker struct { Endpoint string ContainerNames []string + Timeout internal.Duration client DockerClient } @@ -54,6 +56,8 @@ var sampleConfig = ` endpoint = "unix:///var/run/docker.sock" ## Only collect metrics for these containers, collect all if empty container_names = [] + ## Timeout for docker list, info, and stats commands + timeout = "5s" ` // Description returns input description @@ -97,7 +101,9 @@ func (d *Docker) Gather(acc telegraf.Accumulator) error { // List containers opts := types.ContainerListOptions{} - containers, err := d.client.ContainerList(context.Background(), opts) + ctx, cancel := context.WithTimeout(context.Background(), d.Timeout.Duration) + defer cancel() + containers, err := d.client.ContainerList(ctx, opts) if err != nil { return err } @@ -106,12 +112,12 @@ func (d *Docker) Gather(acc telegraf.Accumulator) error { var wg sync.WaitGroup wg.Add(len(containers)) for _, container := range containers { - go func(c types.Container) { defer wg.Done() err := d.gatherContainer(c, acc) if err != nil { - fmt.Println(err.Error()) + log.Printf("Error gathering container %s stats: %s\n", + c.Names, err.Error()) } }(container) } @@ -126,7 +132,9 @@ func (d *Docker) gatherInfo(acc telegraf.Accumulator) error { metadataFields := make(map[string]interface{}) now := time.Now() // Get info from docker daemon - info, err := d.client.Info(context.Background()) + ctx, cancel := context.WithTimeout(context.Background(), d.Timeout.Duration) + defer cancel() + info, err := d.client.Info(ctx) if err != nil { return err } @@ -200,9 +208,8 @@ func (d *Docker) gatherContainer( } tags := map[string]string{ - "cont_id": container.ID, - "cont_name": cname, - "cont_image": container.Image, + "container_name": cname, + "container_image": container.Image, } if len(d.ContainerNames) > 0 { if !sliceContains(cname, d.ContainerNames) { @@ -210,22 +217,27 @@ func (d *Docker) gatherContainer( } } - r, err := d.client.ContainerStats(context.Background(), container.ID, false) + ctx, cancel := context.WithTimeout(context.Background(), d.Timeout.Duration) + defer cancel() + r, err := d.client.ContainerStats(ctx, container.ID, false) if err != nil { log.Printf("Error getting docker stats: %s\n", err.Error()) } defer r.Close() dec := json.NewDecoder(r) if err = dec.Decode(&v); err != nil { - log.Printf("Error decoding: %s\n", err.Error()) + if err == io.EOF { + return nil + } + return fmt.Errorf("Error decoding: %s", err.Error()) } // Add labels to tags - for k, v := range container.Labels { - tags[k] = v + for k, label := range container.Labels { + tags[k] = label } - gatherContainerStats(v, acc, tags) + gatherContainerStats(v, acc, tags, container.ID) return nil } @@ -234,6 +246,7 @@ func gatherContainerStats( stat *types.StatsJSON, 
acc telegraf.Accumulator, tags map[string]string, + id string, ) { now := stat.Read @@ -272,8 +285,9 @@ func gatherContainerStats( "inactive_file": stat.MemoryStats.Stats["inactive_file"], "total_pgpgin": stat.MemoryStats.Stats["total_pgpgin"], "usage_percent": calculateMemPercent(stat), + "container_id": id, } - acc.AddFields("docker_mem", memfields, tags, now) + acc.AddFields("docker_container_mem", memfields, tags, now) cpufields := map[string]interface{}{ "usage_total": stat.CPUStats.CPUUsage.TotalUsage, @@ -284,32 +298,34 @@ func gatherContainerStats( "throttling_throttled_periods": stat.CPUStats.ThrottlingData.ThrottledPeriods, "throttling_throttled_time": stat.CPUStats.ThrottlingData.ThrottledTime, "usage_percent": calculateCPUPercent(stat), + "container_id": id, } cputags := copyTags(tags) cputags["cpu"] = "cpu-total" - acc.AddFields("docker_cpu", cpufields, cputags, now) + acc.AddFields("docker_container_cpu", cpufields, cputags, now) for i, percpu := range stat.CPUStats.CPUUsage.PercpuUsage { percputags := copyTags(tags) percputags["cpu"] = fmt.Sprintf("cpu%d", i) - acc.AddFields("docker_cpu", map[string]interface{}{"usage_total": percpu}, percputags, now) + acc.AddFields("docker_container_cpu", map[string]interface{}{"usage_total": percpu}, percputags, now) } for network, netstats := range stat.Networks { netfields := map[string]interface{}{ - "rx_dropped": netstats.RxDropped, - "rx_bytes": netstats.RxBytes, - "rx_errors": netstats.RxErrors, - "tx_packets": netstats.TxPackets, - "tx_dropped": netstats.TxDropped, - "rx_packets": netstats.RxPackets, - "tx_errors": netstats.TxErrors, - "tx_bytes": netstats.TxBytes, + "rx_dropped": netstats.RxDropped, + "rx_bytes": netstats.RxBytes, + "rx_errors": netstats.RxErrors, + "tx_packets": netstats.TxPackets, + "tx_dropped": netstats.TxDropped, + "rx_packets": netstats.RxPackets, + "tx_errors": netstats.TxErrors, + "tx_bytes": netstats.TxBytes, + "container_id": id, } // Create a new network tag dictionary for the "network" tag nettags := copyTags(tags) nettags["network"] = network - acc.AddFields("docker_net", netfields, nettags, now) + acc.AddFields("docker_container_net", netfields, nettags, now) } gatherBlockIOMetrics(stat, acc, tags, now) @@ -404,7 +420,7 @@ func gatherBlockIOMetrics( for device, fields := range deviceStatMap { iotags := copyTags(tags) iotags["device"] = device - acc.AddFields("docker_blkio", fields, iotags, now) + acc.AddFields("docker_container_blkio", fields, iotags, now) } } diff --git a/plugins/inputs/docker/docker_test.go b/plugins/inputs/docker/docker_test.go index c9fe6cea1..4ac05f93b 100644 --- a/plugins/inputs/docker/docker_test.go +++ b/plugins/inputs/docker/docker_test.go @@ -21,26 +21,26 @@ func TestDockerGatherContainerStats(t *testing.T) { stats := testStats() tags := map[string]string{ - "cont_id": "foobarbaz", - "cont_name": "redis", - "cont_image": "redis/image", + "container_name": "redis", + "container_image": "redis/image", } - gatherContainerStats(stats, &acc, tags) + gatherContainerStats(stats, &acc, tags, "123456789") - // test docker_net measurement + // test docker_container_net measurement netfields := map[string]interface{}{ - "rx_dropped": uint64(1), - "rx_bytes": uint64(2), - "rx_errors": uint64(3), - "tx_packets": uint64(4), - "tx_dropped": uint64(1), - "rx_packets": uint64(2), - "tx_errors": uint64(3), - "tx_bytes": uint64(4), + "rx_dropped": uint64(1), + "rx_bytes": uint64(2), + "rx_errors": uint64(3), + "tx_packets": uint64(4), + "tx_dropped": uint64(1), + "rx_packets": uint64(2), + 
"tx_errors": uint64(3), + "tx_bytes": uint64(4), + "container_id": "123456789", } nettags := copyTags(tags) nettags["network"] = "eth0" - acc.AssertContainsTaggedFields(t, "docker_net", netfields, nettags) + acc.AssertContainsTaggedFields(t, "docker_container_net", netfields, nettags) // test docker_blkio measurement blkiotags := copyTags(tags) @@ -49,9 +49,9 @@ func TestDockerGatherContainerStats(t *testing.T) { "io_service_bytes_recursive_read": uint64(100), "io_serviced_recursive_write": uint64(101), } - acc.AssertContainsTaggedFields(t, "docker_blkio", blkiofields, blkiotags) + acc.AssertContainsTaggedFields(t, "docker_container_blkio", blkiofields, blkiotags) - // test docker_mem measurement + // test docker_container_mem measurement memfields := map[string]interface{}{ "max_usage": uint64(1001), "usage": uint64(1111), @@ -87,11 +87,12 @@ func TestDockerGatherContainerStats(t *testing.T) { "inactive_file": uint64(3), "total_pgpgin": uint64(4), "usage_percent": float64(55.55), + "container_id": "123456789", } - acc.AssertContainsTaggedFields(t, "docker_mem", memfields, tags) + acc.AssertContainsTaggedFields(t, "docker_container_mem", memfields, tags) - // test docker_cpu measurement + // test docker_container_cpu measurement cputags := copyTags(tags) cputags["cpu"] = "cpu-total" cpufields := map[string]interface{}{ @@ -103,20 +104,21 @@ func TestDockerGatherContainerStats(t *testing.T) { "throttling_throttled_periods": uint64(0), "throttling_throttled_time": uint64(0), "usage_percent": float64(400.0), + "container_id": "123456789", } - acc.AssertContainsTaggedFields(t, "docker_cpu", cpufields, cputags) + acc.AssertContainsTaggedFields(t, "docker_container_cpu", cpufields, cputags) cputags["cpu"] = "cpu0" cpu0fields := map[string]interface{}{ "usage_total": uint64(1), } - acc.AssertContainsTaggedFields(t, "docker_cpu", cpu0fields, cputags) + acc.AssertContainsTaggedFields(t, "docker_container_cpu", cpu0fields, cputags) cputags["cpu"] = "cpu1" cpu1fields := map[string]interface{}{ "usage_total": uint64(1002), } - acc.AssertContainsTaggedFields(t, "docker_cpu", cpu1fields, cputags) + acc.AssertContainsTaggedFields(t, "docker_container_cpu", cpu1fields, cputags) } func testStats() *types.StatsJSON { @@ -367,19 +369,18 @@ func TestDockerGatherInfo(t *testing.T) { }, ) acc.AssertContainsTaggedFields(t, - "docker_cpu", + "docker_container_cpu", map[string]interface{}{ "usage_total": uint64(1231652), }, map[string]string{ - "cont_id": "b7dfbb9478a6ae55e237d4d74f8bbb753f0817192b5081334dc78476296e2173", - "cont_name": "etcd2", - "cont_image": "quay.io/coreos/etcd:v2.2.2", - "cpu": "cpu3", + "container_name": "etcd2", + "container_image": "quay.io/coreos/etcd:v2.2.2", + "cpu": "cpu3", }, ) acc.AssertContainsTaggedFields(t, - "docker_mem", + "docker_container_mem", map[string]interface{}{ "total_pgpgout": uint64(0), "usage_percent": float64(0), @@ -415,11 +416,11 @@ func TestDockerGatherInfo(t *testing.T) { "pgfault": uint64(0), "usage": uint64(0), "limit": uint64(18935443456), + "container_id": "b7dfbb9478a6ae55e237d4d74f8bbb753f0817192b5081334dc78476296e2173", }, map[string]string{ - "cont_id": "b7dfbb9478a6ae55e237d4d74f8bbb753f0817192b5081334dc78476296e2173", - "cont_name": "etcd2", - "cont_image": "quay.io/coreos/etcd:v2.2.2", + "container_name": "etcd2", + "container_image": "quay.io/coreos/etcd:v2.2.2", }, ) diff --git a/plugins/inputs/exec/exec.go b/plugins/inputs/exec/exec.go index d2e09ccd0..c1b2092e8 100644 --- a/plugins/inputs/exec/exec.go +++ b/plugins/inputs/exec/exec.go @@ -6,10 
+6,12 @@ import ( "os/exec" "sync" "syscall" + "time" "github.com/gonuts/go-shellquote" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/plugins/inputs" "github.com/influxdata/telegraf/plugins/parsers" "github.com/influxdata/telegraf/plugins/parsers/nagios" @@ -19,6 +21,9 @@ const sampleConfig = ` ## Commands array commands = ["/tmp/test.sh", "/usr/bin/mycollector --foo=bar"] + ## Timeout for each command to complete. + timeout = "5s" + ## measurement name suffix (for separating different commands) name_suffix = "_mycollector" @@ -32,6 +37,7 @@ const sampleConfig = ` type Exec struct { Commands []string Command string + Timeout internal.Duration parser parsers.Parser @@ -43,7 +49,8 @@ type Exec struct { func NewExec() *Exec { return &Exec{ - runner: CommandRunner{}, + runner: CommandRunner{}, + Timeout: internal.Duration{Duration: time.Second * 5}, } } @@ -73,7 +80,11 @@ func AddNagiosState(exitCode error, acc telegraf.Accumulator) error { return nil } -func (c CommandRunner) Run(e *Exec, command string, acc telegraf.Accumulator) ([]byte, error) { +func (c CommandRunner) Run( + e *Exec, + command string, + acc telegraf.Accumulator, +) ([]byte, error) { split_cmd, err := shellquote.Split(command) if err != nil || len(split_cmd) == 0 { return nil, fmt.Errorf("exec: unable to parse command, %s", err) @@ -84,7 +95,7 @@ func (c CommandRunner) Run(e *Exec, command string, acc telegraf.Accumulator) ([ var out bytes.Buffer cmd.Stdout = &out - if err := cmd.Run(); err != nil { + if err := internal.RunTimeout(cmd, e.Timeout.Duration); err != nil { switch e.parser.(type) { case *nagios.NagiosParser: AddNagiosState(err, acc) diff --git a/plugins/inputs/filestat/README.md b/plugins/inputs/filestat/README.md new file mode 100644 index 000000000..bfa51011c --- /dev/null +++ b/plugins/inputs/filestat/README.md @@ -0,0 +1,37 @@ +# filestat Input Plugin + +The filestat plugin gathers metrics about file existence, size, and other stats. + +### Configuration: + +```toml +# Read stats about given file(s) +[[inputs.filestat]] + ## Files to gather stats about. + ## These accept standard unix glob matching rules, but with the addition of + ## ** as a "super asterisk". See https://github.com/gobwas/glob. + files = ["/etc/telegraf/telegraf.conf", "/var/log/**.log"] + ## If true, read the entire file and calculate an md5 checksum. + md5 = false +``` + +### Measurements & Fields: + +- filestat + - exists (int, 0 | 1) + - size_bytes (int, bytes) + - md5_sum (optional, string) + +### Tags: + +- All measurements have the following tags: + - file (the path to the file, as specified in the config) + +### Example Output: + +``` +$ telegraf -config /etc/telegraf/telegraf.conf -input-filter filestat -test +* Plugin: filestat, Collection 1 +> filestat,file=/tmp/foo/bar,host=tyrion exists=0i 1461203374493128216 +> filestat,file=/Users/sparrc/ws/telegraf.conf,host=tyrion exists=1i,size=47894i 1461203374493199335 +``` diff --git a/plugins/inputs/filestat/filestat.go b/plugins/inputs/filestat/filestat.go new file mode 100644 index 000000000..938c12e34 --- /dev/null +++ b/plugins/inputs/filestat/filestat.go @@ -0,0 +1,125 @@ +package filestat + +import ( + "crypto/md5" + "fmt" + "io" + "os" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal/globpath" + "github.com/influxdata/telegraf/plugins/inputs" +) + +const sampleConfig = ` + ## Files to gather stats about.
+ ## These accept standard unix glob matching rules, but with the addition of + ## ** as a "super asterisk". ie: + ## "/var/log/**.log" -> recursively find all .log files in /var/log + ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log + ## "/var/log/apache.log" -> just tail the apache log file + ## + ## See https://github.com/gobwas/glob for more examples + ## + files = ["/var/log/**.log"] + ## If true, read the entire file and calculate an md5 checksum. + md5 = false +` + +type FileStat struct { + Md5 bool + Files []string + + // maps full file paths to globmatch obj + globs map[string]*globpath.GlobPath +} + +func NewFileStat() *FileStat { + return &FileStat{ + globs: make(map[string]*globpath.GlobPath), + } +} + +func (_ *FileStat) Description() string { + return "Read stats about given file(s)" +} + +func (_ *FileStat) SampleConfig() string { return sampleConfig } + +func (f *FileStat) Gather(acc telegraf.Accumulator) error { + var errS string + var err error + + for _, filepath := range f.Files { + // Get the compiled glob object for this filepath + g, ok := f.globs[filepath] + if !ok { + if g, err = globpath.Compile(filepath); err != nil { + errS += err.Error() + " " + continue + } + f.globs[filepath] = g + } + + files := g.Match() + if len(files) == 0 { + acc.AddFields("filestat", + map[string]interface{}{ + "exists": int64(0), + }, + map[string]string{ + "file": filepath, + }) + continue + } + + for fileName, fileInfo := range files { + tags := map[string]string{ + "file": fileName, + } + fields := map[string]interface{}{ + "exists": int64(1), + "size_bytes": fileInfo.Size(), + } + + if f.Md5 { + md5, err := getMd5(fileName) + if err != nil { + errS += err.Error() + " " + } else { + fields["md5_sum"] = md5 + } + } + + acc.AddFields("filestat", fields, tags) + } + } + + if errS != "" { + return fmt.Errorf(errS) + } + return nil +} + +// Read given file and calculate an md5 hash. 
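+// The file contents are streamed through the hasher via io.Copy, so large files are not loaded entirely into memory.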
+func getMd5(file string) (string, error) { + of, err := os.Open(file) + if err != nil { + return "", err + } + defer of.Close() + + hash := md5.New() + _, err = io.Copy(hash, of) + if err != nil { + // fatal error + return "", err + } + return fmt.Sprintf("%x", hash.Sum(nil)), nil +} + +func init() { + inputs.Add("filestat", func() telegraf.Input { + return NewFileStat() + }) +} diff --git a/plugins/inputs/filestat/filestat_test.go b/plugins/inputs/filestat/filestat_test.go new file mode 100644 index 000000000..a404869d9 --- /dev/null +++ b/plugins/inputs/filestat/filestat_test.go @@ -0,0 +1,180 @@ +package filestat + +import ( + "runtime" + "strings" + "testing" + + "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/assert" +) + +func TestGatherNoMd5(t *testing.T) { + dir := getTestdataDir() + fs := NewFileStat() + fs.Files = []string{ + dir + "log1.log", + dir + "log2.log", + "/non/existant/file", + } + + acc := testutil.Accumulator{} + fs.Gather(&acc) + + tags1 := map[string]string{ + "file": dir + "log1.log", + } + fields1 := map[string]interface{}{ + "size_bytes": int64(0), + "exists": int64(1), + } + acc.AssertContainsTaggedFields(t, "filestat", fields1, tags1) + + tags2 := map[string]string{ + "file": dir + "log2.log", + } + fields2 := map[string]interface{}{ + "size_bytes": int64(0), + "exists": int64(1), + } + acc.AssertContainsTaggedFields(t, "filestat", fields2, tags2) + + tags3 := map[string]string{ + "file": "/non/existant/file", + } + fields3 := map[string]interface{}{ + "exists": int64(0), + } + acc.AssertContainsTaggedFields(t, "filestat", fields3, tags3) +} + +func TestGatherExplicitFiles(t *testing.T) { + dir := getTestdataDir() + fs := NewFileStat() + fs.Md5 = true + fs.Files = []string{ + dir + "log1.log", + dir + "log2.log", + "/non/existant/file", + } + + acc := testutil.Accumulator{} + fs.Gather(&acc) + + tags1 := map[string]string{ + "file": dir + "log1.log", + } + fields1 := map[string]interface{}{ + "size_bytes": int64(0), + "exists": int64(1), + "md5_sum": "d41d8cd98f00b204e9800998ecf8427e", + } + acc.AssertContainsTaggedFields(t, "filestat", fields1, tags1) + + tags2 := map[string]string{ + "file": dir + "log2.log", + } + fields2 := map[string]interface{}{ + "size_bytes": int64(0), + "exists": int64(1), + "md5_sum": "d41d8cd98f00b204e9800998ecf8427e", + } + acc.AssertContainsTaggedFields(t, "filestat", fields2, tags2) + + tags3 := map[string]string{ + "file": "/non/existant/file", + } + fields3 := map[string]interface{}{ + "exists": int64(0), + } + acc.AssertContainsTaggedFields(t, "filestat", fields3, tags3) +} + +func TestGatherGlob(t *testing.T) { + dir := getTestdataDir() + fs := NewFileStat() + fs.Md5 = true + fs.Files = []string{ + dir + "*.log", + } + + acc := testutil.Accumulator{} + fs.Gather(&acc) + + tags1 := map[string]string{ + "file": dir + "log1.log", + } + fields1 := map[string]interface{}{ + "size_bytes": int64(0), + "exists": int64(1), + "md5_sum": "d41d8cd98f00b204e9800998ecf8427e", + } + acc.AssertContainsTaggedFields(t, "filestat", fields1, tags1) + + tags2 := map[string]string{ + "file": dir + "log2.log", + } + fields2 := map[string]interface{}{ + "size_bytes": int64(0), + "exists": int64(1), + "md5_sum": "d41d8cd98f00b204e9800998ecf8427e", + } + acc.AssertContainsTaggedFields(t, "filestat", fields2, tags2) +} + +func TestGatherSuperAsterisk(t *testing.T) { + dir := getTestdataDir() + fs := NewFileStat() + fs.Md5 = true + fs.Files = []string{ + dir + "**", + } + + acc := testutil.Accumulator{} + fs.Gather(&acc) + + 
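+	// The "**" super-asterisk glob should match the two empty log files as well as test.conf under testdata/.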
tags1 := map[string]string{ + "file": dir + "log1.log", + } + fields1 := map[string]interface{}{ + "size_bytes": int64(0), + "exists": int64(1), + "md5_sum": "d41d8cd98f00b204e9800998ecf8427e", + } + acc.AssertContainsTaggedFields(t, "filestat", fields1, tags1) + + tags2 := map[string]string{ + "file": dir + "log2.log", + } + fields2 := map[string]interface{}{ + "size_bytes": int64(0), + "exists": int64(1), + "md5_sum": "d41d8cd98f00b204e9800998ecf8427e", + } + acc.AssertContainsTaggedFields(t, "filestat", fields2, tags2) + + tags3 := map[string]string{ + "file": dir + "test.conf", + } + fields3 := map[string]interface{}{ + "size_bytes": int64(104), + "exists": int64(1), + "md5_sum": "5a7e9b77fa25e7bb411dbd17cf403c1f", + } + acc.AssertContainsTaggedFields(t, "filestat", fields3, tags3) +} + +func TestGetMd5(t *testing.T) { + dir := getTestdataDir() + md5, err := getMd5(dir + "test.conf") + assert.NoError(t, err) + assert.Equal(t, "5a7e9b77fa25e7bb411dbd17cf403c1f", md5) + + md5, err = getMd5("/tmp/foo/bar/fooooo") + assert.Error(t, err) +} + +func getTestdataDir() string { + _, filename, _, _ := runtime.Caller(1) + return strings.Replace(filename, "filestat_test.go", "testdata/", 1) +} diff --git a/plugins/inputs/filestat/testdata/log1.log b/plugins/inputs/filestat/testdata/log1.log new file mode 100644 index 000000000..e69de29bb diff --git a/plugins/inputs/filestat/testdata/log2.log b/plugins/inputs/filestat/testdata/log2.log new file mode 100644 index 000000000..e69de29bb diff --git a/plugins/inputs/filestat/testdata/test.conf b/plugins/inputs/filestat/testdata/test.conf new file mode 100644 index 000000000..a06111991 --- /dev/null +++ b/plugins/inputs/filestat/testdata/test.conf @@ -0,0 +1,5 @@ +# this is a fake testing config file +# for testing the filestat plugin + +option1 = "foo" +option2 = "bar" diff --git a/plugins/inputs/ipmi_sensor/command.go b/plugins/inputs/ipmi_sensor/command.go index 353c27d36..76374c494 100644 --- a/plugins/inputs/ipmi_sensor/command.go +++ b/plugins/inputs/ipmi_sensor/command.go @@ -1,10 +1,12 @@ package ipmi_sensor import ( - "bytes" "fmt" "os/exec" "strings" + "time" + + "github.com/influxdata/telegraf/internal" ) type CommandRunner struct{} @@ -18,21 +20,16 @@ func (t CommandRunner) cmd(conn *Connection, args ...string) *exec.Cmd { } return exec.Command(path, opts...) - } func (t CommandRunner) Run(conn *Connection, args ...string) (string, error) { cmd := t.cmd(conn, args...) - var stdout bytes.Buffer - var stderr bytes.Buffer - cmd.Stdout = &stdout - cmd.Stderr = &stderr - err := cmd.Run() + output, err := internal.CombinedOutputTimeout(cmd, time.Second*5) if err != nil { return "", fmt.Errorf("run %s %s: %s (%s)", - cmd.Path, strings.Join(cmd.Args, " "), stderr.String(), err) + cmd.Path, strings.Join(cmd.Args, " "), string(output), err) } - return stdout.String(), err + return string(output), err } diff --git a/plugins/inputs/jolokia/README.md b/plugins/inputs/jolokia/README.md index 5c7db6230..596dbed5f 100644 --- a/plugins/inputs/jolokia/README.md +++ b/plugins/inputs/jolokia/README.md @@ -3,15 +3,27 @@ #### Configuration ```toml +# Read JMX metrics through Jolokia [[inputs.jolokia]] ## This is the context root used to compose the jolokia url - context = "/jolokia/read" + context = "/jolokia" + + ## This specifies the mode used + # mode = "proxy" + # + ## When in proxy mode this section is used to specify further + ## proxy address configurations. + ## Remember to change host address to fit your environment. 
+ # [inputs.jolokia.proxy] + # host = "127.0.0.1" + # port = "8080" + ## List of servers exposing jolokia read service [[inputs.jolokia.servers]] - name = "stable" - host = "192.168.103.2" - port = "8180" + name = "as-server-01" + host = "127.0.0.1" + port = "8080" # username = "myuser" # password = "mypassword" @@ -21,25 +33,29 @@ ## This collect all heap memory usage metrics. [[inputs.jolokia.metrics]] name = "heap_memory_usage" - jmx = "/java.lang:type=Memory/HeapMemoryUsage" - + mbean = "java.lang:type=Memory" + attribute = "HeapMemoryUsage" + ## This collect thread counts metrics. [[inputs.jolokia.metrics]] name = "thread_count" - jmx = "/java.lang:type=Threading/TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" - + mbean = "java.lang:type=Threading" + attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" + ## This collect number of class loaded/unloaded counts metrics. [[inputs.jolokia.metrics]] name = "class_count" - jmx = "/java.lang:type=ClassLoading/LoadedClassCount,UnloadedClassCount,TotalLoadedClassCount" + mbean = "java.lang:type=ClassLoading" + attribute = "LoadedClassCount,UnloadedClassCount,TotalLoadedClassCount" ``` #### Description -The Jolokia plugin collects JVM metrics exposed as MBean's attributes through jolokia REST endpoint. All metrics -are collected for each server configured. +The Jolokia plugin collects JVM metrics exposed as MBean's attributes through +jolokia REST endpoint. All metrics are collected for each server configured. See: https://jolokia.org/ # Measurements: -Jolokia plugin produces one measure for each metric configured, adding Server's `name`, `host` and `port` as tags. +Jolokia plugin produces one measure for each metric configured, +adding Server's `jolokia_name`, `jolokia_host` and `jolokia_port` as tags. diff --git a/plugins/inputs/jolokia/jolokia.go b/plugins/inputs/jolokia/jolokia.go index 15a01d5de..244338559 100644 --- a/plugins/inputs/jolokia/jolokia.go +++ b/plugins/inputs/jolokia/jolokia.go @@ -1,6 +1,7 @@ package jolokia import ( + "bytes" "encoding/json" "errors" "fmt" @@ -22,8 +23,10 @@ type Server struct { } type Metric struct { - Name string - Jmx string + Name string + Mbean string + Attribute string + Path string } type JolokiaClient interface { @@ -41,20 +44,32 @@ func (c JolokiaClientImpl) MakeRequest(req *http.Request) (*http.Response, error type Jolokia struct { jClient JolokiaClient Context string + Mode string Servers []Server Metrics []Metric + Proxy Server } -func (j *Jolokia) SampleConfig() string { - return ` +const sampleConfig = ` ## This is the context root used to compose the jolokia url - context = "/jolokia/read" + context = "/jolokia" + + ## This specifies the mode used + # mode = "proxy" + # + ## When in proxy mode this section is used to specify further + ## proxy address configurations. + ## Remember to change host address to fit your environment. + # [inputs.jolokia.proxy] + # host = "127.0.0.1" + # port = "8080" + ## List of servers exposing jolokia read service [[inputs.jolokia.servers]] - name = "stable" - host = "192.168.103.2" - port = "8180" + name = "as-server-01" + host = "127.0.0.1" + port = "8080" # username = "myuser" # password = "mypassword" @@ -64,30 +79,31 @@ func (j *Jolokia) SampleConfig() string { ## This collect all heap memory usage metrics. 
[[inputs.jolokia.metrics]] name = "heap_memory_usage" - jmx = "/java.lang:type=Memory/HeapMemoryUsage" - + mbean = "java.lang:type=Memory" + attribute = "HeapMemoryUsage" + ## This collect thread counts metrics. [[inputs.jolokia.metrics]] name = "thread_count" - jmx = "/java.lang:type=Threading/TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" - + mbean = "java.lang:type=Threading" + attribute = "TotalStartedThreadCount,ThreadCount,DaemonThreadCount,PeakThreadCount" + ## This collect number of class loaded/unloaded counts metrics. [[inputs.jolokia.metrics]] name = "class_count" - jmx = "/java.lang:type=ClassLoading/LoadedClassCount,UnloadedClassCount,TotalLoadedClassCount" + mbean = "java.lang:type=ClassLoading" + attribute = "LoadedClassCount,UnloadedClassCount,TotalLoadedClassCount" ` + +func (j *Jolokia) SampleConfig() string { + return sampleConfig } func (j *Jolokia) Description() string { return "Read JMX metrics through Jolokia" } -func (j *Jolokia) getAttr(requestUrl *url.URL) (map[string]interface{}, error) { - // Create + send request - req, err := http.NewRequest("GET", requestUrl.String(), nil) - if err != nil { - return nil, err - } +func (j *Jolokia) doRequest(req *http.Request) (map[string]interface{}, error) { resp, err := j.jClient.MakeRequest(req) if err != nil { @@ -98,7 +114,7 @@ func (j *Jolokia) getAttr(requestUrl *url.URL) (map[string]interface{}, error) { // Process response if resp.StatusCode != http.StatusOK { err = fmt.Errorf("Response from url \"%s\" has status code %d (%s), expected %d (%s)", - requestUrl, + req.RequestURI, resp.StatusCode, http.StatusText(resp.StatusCode), http.StatusOK, @@ -118,51 +134,133 @@ func (j *Jolokia) getAttr(requestUrl *url.URL) (map[string]interface{}, error) { return nil, errors.New("Error decoding JSON response") } + if status, ok := jsonOut["status"]; ok { + if status != float64(200) { + return nil, fmt.Errorf("Not expected status value in response body: %3.f", + status) + } + } else { + return nil, fmt.Errorf("Missing status in response body") + } + return jsonOut, nil } +func (j *Jolokia) prepareRequest(server Server, metric Metric) (*http.Request, error) { + var jolokiaUrl *url.URL + context := j.Context // Usually "/jolokia" + + // Create bodyContent + bodyContent := map[string]interface{}{ + "type": "read", + "mbean": metric.Mbean, + } + + if metric.Attribute != "" { + bodyContent["attribute"] = metric.Attribute + if metric.Path != "" { + bodyContent["path"] = metric.Path + } + } + + // Add target, only in proxy mode + if j.Mode == "proxy" { + serviceUrl := fmt.Sprintf("service:jmx:rmi:///jndi/rmi://%s:%s/jmxrmi", + server.Host, server.Port) + + target := map[string]string{ + "url": serviceUrl, + } + + if server.Username != "" { + target["user"] = server.Username + } + + if server.Password != "" { + target["password"] = server.Password + } + + bodyContent["target"] = target + + proxy := j.Proxy + + // Prepare ProxyURL + proxyUrl, err := url.Parse("http://" + proxy.Host + ":" + proxy.Port + context) + if err != nil { + return nil, err + } + if proxy.Username != "" || proxy.Password != "" { + proxyUrl.User = url.UserPassword(proxy.Username, proxy.Password) + } + + jolokiaUrl = proxyUrl + + } else { + serverUrl, err := url.Parse("http://" + server.Host + ":" + server.Port + context) + if err != nil { + return nil, err + } + if server.Username != "" || server.Password != "" { + serverUrl.User = url.UserPassword(server.Username, server.Password) + } + + jolokiaUrl = serverUrl + } + + requestBody, err := 
json.Marshal(bodyContent) + + req, err := http.NewRequest("POST", jolokiaUrl.String(), bytes.NewBuffer(requestBody)) + + if err != nil { + return nil, err + } + + req.Header.Add("Content-type", "application/json") + + return req, nil +} + func (j *Jolokia) Gather(acc telegraf.Accumulator) error { - context := j.Context //"/jolokia/read" servers := j.Servers metrics := j.Metrics tags := make(map[string]string) for _, server := range servers { - tags["server"] = server.Name - tags["port"] = server.Port - tags["host"] = server.Host + tags["jolokia_name"] = server.Name + tags["jolokia_port"] = server.Port + tags["jolokia_host"] = server.Host fields := make(map[string]interface{}) + for _, metric := range metrics { - measurement := metric.Name - jmxPath := metric.Jmx - // Prepare URL - requestUrl, err := url.Parse("http://" + server.Host + ":" + - server.Port + context + jmxPath) + req, err := j.prepareRequest(server, metric) if err != nil { return err } - if server.Username != "" || server.Password != "" { - requestUrl.User = url.UserPassword(server.Username, server.Password) - } - out, _ := j.getAttr(requestUrl) + out, err := j.doRequest(req) - if values, ok := out["value"]; ok { - switch t := values.(type) { - case map[string]interface{}: - for k, v := range t { - fields[measurement+"_"+k] = v - } - case interface{}: - fields[measurement] = t - } + if err != nil { + fmt.Printf("Error handling response: %s\n", err) } else { - fmt.Printf("Missing key 'value' in '%s' output response\n", - requestUrl.String()) + + if values, ok := out["value"]; ok { + switch t := values.(type) { + case map[string]interface{}: + for k, v := range t { + fields[measurement+"_"+k] = v + } + case interface{}: + fields[measurement] = t + } + } else { + fmt.Printf("Missing key 'value' in output response\n") + } + } } + acc.AddFields("jolokia", fields, tags) } diff --git a/plugins/inputs/jolokia/jolokia_test.go b/plugins/inputs/jolokia/jolokia_test.go index 961ba7055..13724b937 100644 --- a/plugins/inputs/jolokia/jolokia_test.go +++ b/plugins/inputs/jolokia/jolokia_test.go @@ -47,8 +47,10 @@ const invalidJSON = "I don't think this is JSON" const empty = "" var Servers = []Server{Server{Name: "as1", Host: "127.0.0.1", Port: "8080"}} -var HeapMetric = Metric{Name: "heap_memory_usage", Jmx: "/java.lang:type=Memory/HeapMemoryUsage"} -var UsedHeapMetric = Metric{Name: "heap_memory_usage", Jmx: "/java.lang:type=Memory/HeapMemoryUsage"} +var HeapMetric = Metric{Name: "heap_memory_usage", + Mbean: "java.lang:type=Memory", Attribute: "HeapMemoryUsage"} +var UsedHeapMetric = Metric{Name: "heap_memory_usage", + Mbean: "java.lang:type=Memory", Attribute: "HeapMemoryUsage"} type jolokiaClientStub struct { responseBody string @@ -94,9 +96,9 @@ func TestHttpJsonMultiValue(t *testing.T) { "heap_memory_usage_used": 203288528.0, } tags := map[string]string{ - "host": "127.0.0.1", - "port": "8080", - "server": "as1", + "jolokia_host": "127.0.0.1", + "jolokia_port": "8080", + "jolokia_name": "as1", } acc.AssertContainsTaggedFields(t, "jolokia", fields, tags) } @@ -114,3 +116,17 @@ func TestHttpJsonOn404(t *testing.T) { assert.Nil(t, err) assert.Equal(t, 0, len(acc.Metrics)) } + +// Test that the proper values are ignored or collected +func TestHttpInvalidJson(t *testing.T) { + + jolokia := genJolokiaClientStub(invalidJSON, 200, Servers, + []Metric{UsedHeapMetric}) + + var acc testutil.Accumulator + acc.SetDebug(true) + err := jolokia.Gather(&acc) + + assert.Nil(t, err) + assert.Equal(t, 0, len(acc.Metrics)) +} diff --git 
a/plugins/inputs/leofs/leofs.go b/plugins/inputs/leofs/leofs.go index f4910ad0c..06c71e932 100644 --- a/plugins/inputs/leofs/leofs.go +++ b/plugins/inputs/leofs/leofs.go @@ -3,13 +3,16 @@ package leofs import ( "bufio" "fmt" - "github.com/influxdata/telegraf" - "github.com/influxdata/telegraf/plugins/inputs" "net/url" "os/exec" "strconv" "strings" "sync" + "time" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" + "github.com/influxdata/telegraf/plugins/inputs" ) const oid = ".1.3.6.1.4.1.35450" @@ -175,14 +178,18 @@ func (l *LeoFS) Gather(acc telegraf.Accumulator) error { return outerr } -func (l *LeoFS) gatherServer(endpoint string, serverType ServerType, acc telegraf.Accumulator) error { +func (l *LeoFS) gatherServer( + endpoint string, + serverType ServerType, + acc telegraf.Accumulator, +) error { cmd := exec.Command("snmpwalk", "-v2c", "-cpublic", endpoint, oid) stdout, err := cmd.StdoutPipe() if err != nil { return err } cmd.Start() - defer cmd.Wait() + defer internal.WaitTimeout(cmd, time.Second*5) scanner := bufio.NewScanner(stdout) if !scanner.Scan() { return fmt.Errorf("Unable to retrieve the node name") diff --git a/plugins/inputs/lustre2/lustre2.go b/plugins/inputs/lustre2/lustre2.go index 6ac41d391..8ef9223b5 100644 --- a/plugins/inputs/lustre2/lustre2.go +++ b/plugins/inputs/lustre2/lustre2.go @@ -34,9 +34,13 @@ var sampleConfig = ` ## # ost_procfiles = [ # "/proc/fs/lustre/obdfilter/*/stats", - # "/proc/fs/lustre/osd-ldiskfs/*/stats" + # "/proc/fs/lustre/osd-ldiskfs/*/stats", + # "/proc/fs/lustre/obdfilter/*/job_stats", + # ] + # mds_procfiles = [ + # "/proc/fs/lustre/mdt/*/md_stats", + # "/proc/fs/lustre/mdt/*/job_stats", # ] - # mds_procfiles = ["/proc/fs/lustre/mdt/*/md_stats"] ` /* The wanted fields would be a []string if not for the @@ -82,6 +86,139 @@ var wanted_ost_fields = []*mapping{ }, } +var wanted_ost_jobstats_fields = []*mapping{ + { // The read line has several fields, so we need to differentiate what they are + inProc: "read", + field: 3, + reportAs: "jobstats_read_calls", + }, + { + inProc: "read", + field: 7, + reportAs: "jobstats_read_min_size", + }, + { + inProc: "read", + field: 9, + reportAs: "jobstats_read_max_size", + }, + { + inProc: "read", + field: 11, + reportAs: "jobstats_read_bytes", + }, + { // Different inProc for newer versions + inProc: "read_bytes", + field: 3, + reportAs: "jobstats_read_calls", + }, + { + inProc: "read_bytes", + field: 7, + reportAs: "jobstats_read_min_size", + }, + { + inProc: "read_bytes", + field: 9, + reportAs: "jobstats_read_max_size", + }, + { + inProc: "read_bytes", + field: 11, + reportAs: "jobstats_read_bytes", + }, + { // We need to do the same for the write fields + inProc: "write", + field: 3, + reportAs: "jobstats_write_calls", + }, + { + inProc: "write", + field: 7, + reportAs: "jobstats_write_min_size", + }, + { + inProc: "write", + field: 9, + reportAs: "jobstats_write_max_size", + }, + { + inProc: "write", + field: 11, + reportAs: "jobstats_write_bytes", + }, + { // Different inProc for newer versions + inProc: "write_bytes", + field: 3, + reportAs: "jobstats_write_calls", + }, + { + inProc: "write_bytes", + field: 7, + reportAs: "jobstats_write_min_size", + }, + { + inProc: "write_bytes", + field: 9, + reportAs: "jobstats_write_max_size", + }, + { + inProc: "write_bytes", + field: 11, + reportAs: "jobstats_write_bytes", + }, + { + inProc: "getattr", + field: 3, + reportAs: "jobstats_ost_getattr", + }, + { + inProc: "setattr", + field: 3, + reportAs: "jobstats_ost_setattr", + 
}, + { + inProc: "punch", + field: 3, + reportAs: "jobstats_punch", + }, + { + inProc: "sync", + field: 3, + reportAs: "jobstats_ost_sync", + }, + { + inProc: "destroy", + field: 3, + reportAs: "jobstats_destroy", + }, + { + inProc: "create", + field: 3, + reportAs: "jobstats_create", + }, + { + inProc: "statfs", + field: 3, + reportAs: "jobstats_ost_statfs", + }, + { + inProc: "get_info", + field: 3, + reportAs: "jobstats_get_info", + }, + { + inProc: "set_info", + field: 3, + reportAs: "jobstats_set_info", + }, + { + inProc: "quotactl", + field: 3, + reportAs: "jobstats_quotactl", + }, +} + var wanted_mds_fields = []*mapping{ { inProc: "open", @@ -133,6 +270,89 @@ var wanted_mds_fields = []*mapping{ }, } +var wanted_mdt_jobstats_fields = []*mapping{ + { + inProc: "open", + field: 3, + reportAs: "jobstats_open", + }, + { + inProc: "close", + field: 3, + reportAs: "jobstats_close", + }, + { + inProc: "mknod", + field: 3, + reportAs: "jobstats_mknod", + }, + { + inProc: "link", + field: 3, + reportAs: "jobstats_link", + }, + { + inProc: "unlink", + field: 3, + reportAs: "jobstats_unlink", + }, + { + inProc: "mkdir", + field: 3, + reportAs: "jobstats_mkdir", + }, + { + inProc: "rmdir", + field: 3, + reportAs: "jobstats_rmdir", + }, + { + inProc: "rename", + field: 3, + reportAs: "jobstats_rename", + }, + { + inProc: "getattr", + field: 3, + reportAs: "jobstats_getattr", + }, + { + inProc: "setattr", + field: 3, + reportAs: "jobstats_setattr", + }, + { + inProc: "getxattr", + field: 3, + reportAs: "jobstats_getxattr", + }, + { + inProc: "setxattr", + field: 3, + reportAs: "jobstats_setxattr", + }, + { + inProc: "statfs", + field: 3, + reportAs: "jobstats_statfs", + }, + { + inProc: "sync", + field: 3, + reportAs: "jobstats_sync", + }, + { + inProc: "samedir_rename", + field: 3, + reportAs: "jobstats_samedir_rename", + }, + { + inProc: "crossdir_rename", + field: 3, + reportAs: "jobstats_crossdir_rename", + }, +} + func (l *Lustre2) GetLustreProcStats(fileglob string, wanted_fields []*mapping, acc telegraf.Accumulator) error { files, err := filepath.Glob(fileglob) if err != nil { @@ -143,7 +363,7 @@ func (l *Lustre2) GetLustreProcStats(fileglob string, wanted_fields []*mapping, /* Turn /proc/fs/lustre/obdfilter//stats and similar * into just the object store target name * Assumpion: the target name is always second to last, - * which is true in Lustre 2.1->2.5 + * which is true in Lustre 2.1->2.8 */ path := strings.Split(file, "/") name := path[len(path)-2] @@ -161,16 +381,21 @@ func (l *Lustre2) GetLustreProcStats(fileglob string, wanted_fields []*mapping, for _, line := range lines { parts := strings.Fields(line) + if strings.HasPrefix(line, "- job_id:") { + // Set the job_id explicitly if present + fields["jobid"] = parts[2] + } + for _, wanted := range wanted_fields { var data uint64 - if parts[0] == wanted.inProc { + if strings.TrimSuffix(parts[0], ":") == wanted.inProc { wanted_field := wanted.field // if not set, assume field[1]. 
Shouldn't be field[0], as // that's a string if wanted_field == 0 { wanted_field = 1 } - data, err = strconv.ParseUint((parts[wanted_field]), 10, 64) + data, err = strconv.ParseUint(strings.TrimSuffix((parts[wanted_field]), ","), 10, 64) if err != nil { return err } @@ -213,6 +438,12 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error { if err != nil { return err } + // per job statistics are in obdfilter//job_stats + err = l.GetLustreProcStats("/proc/fs/lustre/obdfilter/*/job_stats", + wanted_ost_jobstats_fields, acc) + if err != nil { + return err + } } if len(l.Mds_procfiles) == 0 { @@ -222,16 +453,31 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error { if err != nil { return err } + + // Metadata target job stats + err = l.GetLustreProcStats("/proc/fs/lustre/mdt/*/job_stats", + wanted_mdt_jobstats_fields, acc) + if err != nil { + return err + } } for _, procfile := range l.Ost_procfiles { - err := l.GetLustreProcStats(procfile, wanted_ost_fields, acc) + ost_fields := wanted_ost_fields + if strings.HasSuffix(procfile, "job_stats") { + ost_fields = wanted_ost_jobstats_fields + } + err := l.GetLustreProcStats(procfile, ost_fields, acc) if err != nil { return err } } for _, procfile := range l.Mds_procfiles { - err := l.GetLustreProcStats(procfile, wanted_mds_fields, acc) + mdt_fields := wanted_mds_fields + if strings.HasSuffix(procfile, "job_stats") { + mdt_fields = wanted_mdt_jobstats_fields + } + err := l.GetLustreProcStats(procfile, mdt_fields, acc) if err != nil { return err } @@ -241,6 +487,12 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error { tags := map[string]string{ "name": name, } + if _, ok := fields["jobid"]; ok { + if jobid, ok := fields["jobid"].(string); ok { + tags["jobid"] = jobid + } + delete(fields, "jobid") + } acc.AddFields("lustre2", fields, tags) } diff --git a/plugins/inputs/lustre2/lustre2_test.go b/plugins/inputs/lustre2/lustre2_test.go index 9e560df2c..5cc9c0e43 100644 --- a/plugins/inputs/lustre2/lustre2_test.go +++ b/plugins/inputs/lustre2/lustre2_test.go @@ -38,6 +38,23 @@ cache_hit 7393729777 samples [pages] 1 1 7393729777 cache_miss 11653333250 samples [pages] 1 1 11653333250 ` +const obdfilterJobStatsContents = `job_stats: +- job_id: testjob1 + snapshot_time: 1461772761 + read_bytes: { samples: 1, unit: bytes, min: 4096, max: 4096, sum: 4096 } + write_bytes: { samples: 25, unit: bytes, min: 1048576, max: 1048576, sum: 26214400 } + getattr: { samples: 0, unit: reqs } + setattr: { samples: 0, unit: reqs } + punch: { samples: 1, unit: reqs } + sync: { samples: 0, unit: reqs } + destroy: { samples: 0, unit: reqs } + create: { samples: 0, unit: reqs } + statfs: { samples: 0, unit: reqs } + get_info: { samples: 0, unit: reqs } + set_info: { samples: 0, unit: reqs } + quotactl: { samples: 0, unit: reqs } +` + const mdtProcContents = `snapshot_time 1438693238.20113 secs.usecs open 1024577037 samples [reqs] close 873243496 samples [reqs] @@ -57,6 +74,27 @@ samedir_rename 259625 samples [reqs] crossdir_rename 369571 samples [reqs] ` +const mdtJobStatsContents = `job_stats: +- job_id: testjob1 + snapshot_time: 1461772761 + open: { samples: 5, unit: reqs } + close: { samples: 4, unit: reqs } + mknod: { samples: 6, unit: reqs } + link: { samples: 8, unit: reqs } + unlink: { samples: 90, unit: reqs } + mkdir: { samples: 521, unit: reqs } + rmdir: { samples: 520, unit: reqs } + rename: { samples: 9, unit: reqs } + getattr: { samples: 11, unit: reqs } + setattr: { samples: 1, unit: reqs } + getxattr: { samples: 3, unit: reqs } + setxattr: { 
samples: 4, unit: reqs } + statfs: { samples: 1205, unit: reqs } + sync: { samples: 2, unit: reqs } + samedir_rename: { samples: 705, unit: reqs } + crossdir_rename: { samples: 200, unit: reqs } +` + func TestLustre2GeneratesMetrics(t *testing.T) { tempdir := os.TempDir() + "/telegraf/proc/fs/lustre/" @@ -83,6 +121,7 @@ func TestLustre2GeneratesMetrics(t *testing.T) { err = ioutil.WriteFile(obddir+"/"+ost_name+"/stats", []byte(obdfilterProcContents), 0644) require.NoError(t, err) + // Begin by testing standard Lustre stats m := &Lustre2{ Ost_procfiles: []string{obddir + "/*/stats", osddir + "/*/stats"}, Mds_procfiles: []string{mdtdir + "/*/md_stats"}, @@ -128,3 +167,82 @@ func TestLustre2GeneratesMetrics(t *testing.T) { err = os.RemoveAll(os.TempDir() + "/telegraf") require.NoError(t, err) } + +func TestLustre2GeneratesJobstatsMetrics(t *testing.T) { + + tempdir := os.TempDir() + "/telegraf/proc/fs/lustre/" + ost_name := "OST0001" + job_name := "testjob1" + + mdtdir := tempdir + "/mdt/" + err := os.MkdirAll(mdtdir+"/"+ost_name, 0755) + require.NoError(t, err) + + obddir := tempdir + "/obdfilter/" + err = os.MkdirAll(obddir+"/"+ost_name, 0755) + require.NoError(t, err) + + err = ioutil.WriteFile(mdtdir+"/"+ost_name+"/job_stats", []byte(mdtJobStatsContents), 0644) + require.NoError(t, err) + + err = ioutil.WriteFile(obddir+"/"+ost_name+"/job_stats", []byte(obdfilterJobStatsContents), 0644) + require.NoError(t, err) + + // Test Lustre Jobstats + m := &Lustre2{ + Ost_procfiles: []string{obddir + "/*/job_stats"}, + Mds_procfiles: []string{mdtdir + "/*/job_stats"}, + } + + var acc testutil.Accumulator + + err = m.Gather(&acc) + require.NoError(t, err) + + tags := map[string]string{ + "name": ost_name, + "jobid": job_name, + } + + fields := map[string]interface{}{ + "jobstats_read_calls": uint64(1), + "jobstats_read_min_size": uint64(4096), + "jobstats_read_max_size": uint64(4096), + "jobstats_read_bytes": uint64(4096), + "jobstats_write_calls": uint64(25), + "jobstats_write_min_size": uint64(1048576), + "jobstats_write_max_size": uint64(1048576), + "jobstats_write_bytes": uint64(26214400), + "jobstats_ost_getattr": uint64(0), + "jobstats_ost_setattr": uint64(0), + "jobstats_punch": uint64(1), + "jobstats_ost_sync": uint64(0), + "jobstats_destroy": uint64(0), + "jobstats_create": uint64(0), + "jobstats_ost_statfs": uint64(0), + "jobstats_get_info": uint64(0), + "jobstats_set_info": uint64(0), + "jobstats_quotactl": uint64(0), + "jobstats_open": uint64(5), + "jobstats_close": uint64(4), + "jobstats_mknod": uint64(6), + "jobstats_link": uint64(8), + "jobstats_unlink": uint64(90), + "jobstats_mkdir": uint64(521), + "jobstats_rmdir": uint64(520), + "jobstats_rename": uint64(9), + "jobstats_getattr": uint64(11), + "jobstats_setattr": uint64(1), + "jobstats_getxattr": uint64(3), + "jobstats_setxattr": uint64(4), + "jobstats_statfs": uint64(1205), + "jobstats_sync": uint64(2), + "jobstats_samedir_rename": uint64(705), + "jobstats_crossdir_rename": uint64(200), + } + + acc.AssertContainsTaggedFields(t, "lustre2", fields, tags) + + err = os.RemoveAll(os.TempDir() + "/telegraf") + require.NoError(t, err) +} diff --git a/plugins/inputs/mongodb/README.md b/plugins/inputs/mongodb/README.md new file mode 100644 index 000000000..66ff2668e --- /dev/null +++ b/plugins/inputs/mongodb/README.md @@ -0,0 +1,54 @@ +# Telegraf plugin: MongoDB + +#### Configuration + +```toml +[[inputs.mongodb]] + ## An array of URI to gather stats about. Specify an ip or hostname + ## with optional port add password. 
ie, + ## mongodb://user:auth_key@10.10.3.30:27017, + ## mongodb://10.10.3.33:18832, + ## 10.0.0.1:10000, etc. + servers = ["127.0.0.1:27017"] +``` + +For authenticated MongoDB instances, use a MongoDB connection URI + +```toml +[[inputs.mongodb]] + servers = ["mongodb://username:password@10.XX.XX.XX:27101/mydatabase?authSource=admin"] +``` +This connection URI may be different based on your environment and MongoDB +setup. If the user doesn't have the required privilege to execute the serverStatus +command, then you will get this error from telegraf + +``` +Error in input [mongodb]: not authorized on admin to execute command { serverStatus: 1, recordStats: 0 } +``` + +#### Description + +The telegraf plugin collects MongoDB stats exposed by serverStatus and a few more, +and creates a single measurement containing values, e.g. + * active_reads + * active_writes + * commands_per_sec + * deletes_per_sec + * flushes_per_sec + * getmores_per_sec + * inserts_per_sec + * net_in_bytes + * net_out_bytes + * open_connections + * percent_cache_dirty + * percent_cache_used + * queries_per_sec + * queued_reads + * queued_writes + * resident_megabytes + * updates_per_sec + * vsize_megabytes + * ttl_deletes_per_sec + * ttl_passes_per_sec + * repl_lag + * jumbo_chunks (only if mongos or mongo config) diff --git a/plugins/inputs/mongodb/mongodb_data.go b/plugins/inputs/mongodb/mongodb_data.go index 1a951806d..7a52d650a 100644 --- a/plugins/inputs/mongodb/mongodb_data.go +++ b/plugins/inputs/mongodb/mongodb_data.go @@ -26,22 +26,24 @@ func NewMongodbData(statLine *StatLine, tags map[string]string) *MongodbData { } var DefaultStats = map[string]string{ - "inserts_per_sec": "Insert", - "queries_per_sec": "Query", - "updates_per_sec": "Update", - "deletes_per_sec": "Delete", - "getmores_per_sec": "GetMore", - "commands_per_sec": "Command", - "flushes_per_sec": "Flushes", - "vsize_megabytes": "Virtual", - "resident_megabytes": "Resident", - "queued_reads": "QueuedReaders", - "queued_writes": "QueuedWriters", - "active_reads": "ActiveReaders", - "active_writes": "ActiveWriters", - "net_in_bytes": "NetIn", - "net_out_bytes": "NetOut", - "open_connections": "NumConnections", + "inserts_per_sec": "Insert", + "queries_per_sec": "Query", + "updates_per_sec": "Update", + "deletes_per_sec": "Delete", + "getmores_per_sec": "GetMore", + "commands_per_sec": "Command", + "flushes_per_sec": "Flushes", + "vsize_megabytes": "Virtual", + "resident_megabytes": "Resident", + "queued_reads": "QueuedReaders", + "queued_writes": "QueuedWriters", + "active_reads": "ActiveReaders", + "active_writes": "ActiveWriters", + "net_in_bytes": "NetIn", + "net_out_bytes": "NetOut", + "open_connections": "NumConnections", + "ttl_deletes_per_sec": "DeletedDocuments", + "ttl_passes_per_sec": "Passes", } var DefaultReplStats = map[string]string{ @@ -52,6 +54,11 @@ var DefaultReplStats = map[string]string{ "repl_getmores_per_sec": "GetMoreR", "repl_commands_per_sec": "CommandR", "member_status": "NodeType", + "repl_lag": "ReplLag", +} + +var DefaultClusterStats = map[string]string{ + "jumbo_chunks": "JumboChunksCount", } var MmapStats = map[string]string{ @@ -71,6 +78,7 @@ func (d *MongodbData) AddDefaultStats() { if d.StatLine.NodeType != "" { d.addStat(statLine, DefaultReplStats) } + d.addStat(statLine, DefaultClusterStats) if d.StatLine.StorageEngine == "mmapv1" { d.addStat(statLine, MmapStats) } else if d.StatLine.StorageEngine == "wiredTiger" { diff --git a/plugins/inputs/mongodb/mongodb_data_test.go b/plugins/inputs/mongodb/mongodb_data_test.go
index 3166ab018..a08549cfd 100644 --- a/plugins/inputs/mongodb/mongodb_data_test.go +++ b/plugins/inputs/mongodb/mongodb_data_test.go @@ -13,24 +13,26 @@ var tags = make(map[string]string) func TestAddNonReplStats(t *testing.T) { d := NewMongodbData( &StatLine{ - StorageEngine: "", - Time: time.Now(), - Insert: 0, - Query: 0, - Update: 0, - Delete: 0, - GetMore: 0, - Command: 0, - Flushes: 0, - Virtual: 0, - Resident: 0, - QueuedReaders: 0, - QueuedWriters: 0, - ActiveReaders: 0, - ActiveWriters: 0, - NetIn: 0, - NetOut: 0, - NumConnections: 0, + StorageEngine: "", + Time: time.Now(), + Insert: 0, + Query: 0, + Update: 0, + Delete: 0, + GetMore: 0, + Command: 0, + Flushes: 0, + Virtual: 0, + Resident: 0, + QueuedReaders: 0, + QueuedWriters: 0, + ActiveReaders: 0, + ActiveWriters: 0, + NetIn: 0, + NetOut: 0, + NumConnections: 0, + Passes: 0, + DeletedDocuments: 0, }, tags, ) @@ -125,9 +127,13 @@ func TestStateTag(t *testing.T) { "repl_inserts_per_sec": int64(0), "repl_queries_per_sec": int64(0), "repl_updates_per_sec": int64(0), + "repl_lag": int64(0), "resident_megabytes": int64(0), "updates_per_sec": int64(0), "vsize_megabytes": int64(0), + "ttl_deletes_per_sec": int64(0), + "ttl_passes_per_sec": int64(0), + "jumbo_chunks": int64(0), } acc.AssertContainsTaggedFields(t, "mongodb", fields, stateTags) } diff --git a/plugins/inputs/mongodb/mongodb_server.go b/plugins/inputs/mongodb/mongodb_server.go index 26aac2b63..173391e2f 100644 --- a/plugins/inputs/mongodb/mongodb_server.go +++ b/plugins/inputs/mongodb/mongodb_server.go @@ -1,6 +1,7 @@ package mongodb import ( + "log" "net/url" "time" @@ -12,7 +13,7 @@ import ( type Server struct { Url *url.URL Session *mgo.Session - lastResult *ServerStatus + lastResult *MongoStatus } func (s *Server) getDefaultTags() map[string]string { @@ -24,11 +25,29 @@ func (s *Server) getDefaultTags() map[string]string { func (s *Server) gatherData(acc telegraf.Accumulator) error { s.Session.SetMode(mgo.Eventual, true) s.Session.SetSocketTimeout(0) - result := &ServerStatus{} - err := s.Session.DB("admin").Run(bson.D{{"serverStatus", 1}, {"recordStats", 0}}, result) + result_server := &ServerStatus{} + err := s.Session.DB("admin").Run(bson.D{{"serverStatus", 1}, {"recordStats", 0}}, result_server) if err != nil { return err } + result_repl := &ReplSetStatus{} + err = s.Session.DB("admin").Run(bson.D{{"replSetGetStatus", 1}}, result_repl) + if err != nil { + log.Println("Not gathering replica set status, member not in replica set") + } + + jumbo_chunks, _ := s.Session.DB("config").C("chunks").Find(bson.M{"jumbo": true}).Count() + + result_cluster := &ClusterStatus{ + JumboChunksCount: int64(jumbo_chunks), + } + + result := &MongoStatus{ + ServerStatus: result_server, + ReplSetStatus: result_repl, + ClusterStatus: result_cluster, + } + defer func() { s.lastResult = result }() diff --git a/plugins/inputs/mongodb/mongostat.go b/plugins/inputs/mongodb/mongostat.go index e12d797d0..d564931d1 100644 --- a/plugins/inputs/mongodb/mongostat.go +++ b/plugins/inputs/mongodb/mongostat.go @@ -11,6 +11,8 @@ import ( "sort" "strings" "time" + + "gopkg.in/mgo.v2/bson" ) const ( @@ -28,8 +30,14 @@ const ( WTOnly // only active if node has wiredtiger-specific fields ) +type MongoStatus struct { + SampleTime time.Time + ServerStatus *ServerStatus + ReplSetStatus *ReplSetStatus + ClusterStatus *ClusterStatus +} + type ServerStatus struct { - SampleTime time.Time `bson:""` Host string `bson:"host"` Version string `bson:"version"` Process string `bson:"process"` @@ -54,6 +62,25 @@ type 
ServerStatus struct { ShardCursorType map[string]interface{} `bson:"shardCursorType"` StorageEngine map[string]string `bson:"storageEngine"` WiredTiger *WiredTiger `bson:"wiredTiger"` + Metrics *MetricsStats `bson:"metrics"` +} + +// ClusterStatus stores information related to the whole cluster +type ClusterStatus struct { + JumboChunksCount int64 +} + +// ReplSetStatus stores information from replSetGetStatus +type ReplSetStatus struct { + Members []ReplSetMember `bson:"members"` + MyState int64 `bson:"myState"` +} + +// ReplSetMember stores information related to a replica set member +type ReplSetMember struct { + Name string `bson:"name"` + State int64 `bson:"state"` + Optime *bson.MongoTimestamp `bson:"optime"` } // WiredTiger stores information related to the WiredTiger storage engine. @@ -194,6 +221,17 @@ type OpcountStats struct { Command int64 `bson:"command"` } +// MetricsStats stores information related to metrics +type MetricsStats struct { + TTL *TTLStats `bson:"ttl"` +} + +// TTLStats stores information related to documents with a ttl index. +type TTLStats struct { + DeletedDocuments int64 `bson:"deletedDocuments"` + Passes int64 `bson:"passes"` +} + // ReadWriteLockTimes stores time spent holding read/write locks. type ReadWriteLockTimes struct { Read int64 `bson:"R"` @@ -332,6 +370,9 @@ type StatLine struct { // Opcounter fields Insert, Query, Update, Delete, GetMore, Command int64 + // TTL fields + Passes, DeletedDocuments int64 + // Collection locks (3.0 mmap only) CollectionLocks *CollectionLockStatus @@ -341,6 +382,7 @@ type StatLine struct { // Replicated Opcounter fields InsertR, QueryR, UpdateR, DeleteR, GetMoreR, CommandR int64 + ReplLag int64 Flushes int64 Mapped, Virtual, Resident, NonMapped int64 Faults int64 @@ -351,6 +393,9 @@ type StatLine struct { NumConnections int64 ReplSetName string NodeType string + + // Cluster fields + JumboChunksCount int64 } func parseLocks(stat ServerStatus) map[string]LockUsage { @@ -395,8 +440,11 @@ func diff(newVal, oldVal, sampleTime int64) int64 { return d / sampleTime } -// NewStatLine constructs a StatLine object from two ServerStatus objects. -func NewStatLine(oldStat, newStat ServerStatus, key string, all bool, sampleSecs int64) *StatLine { +// NewStatLine constructs a StatLine object from two MongoStatus objects. 
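+// Counter-type values in the result are reported as per-second rates, computed by diffing the two samples over sampleSecs.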
+func NewStatLine(oldMongo, newMongo MongoStatus, key string, all bool, sampleSecs int64) *StatLine { + oldStat := *oldMongo.ServerStatus + newStat := *newMongo.ServerStatus + returnVal := &StatLine{ Key: key, Host: newStat.Host, @@ -423,6 +471,11 @@ func NewStatLine(oldStat, newStat ServerStatus, key string, all bool, sampleSecs returnVal.Command = diff(newStat.Opcounters.Command, oldStat.Opcounters.Command, sampleSecs) } + if newStat.Metrics.TTL != nil && oldStat.Metrics.TTL != nil { + returnVal.Passes = diff(newStat.Metrics.TTL.Passes, oldStat.Metrics.TTL.Passes, sampleSecs) + returnVal.DeletedDocuments = diff(newStat.Metrics.TTL.DeletedDocuments, oldStat.Metrics.TTL.DeletedDocuments, sampleSecs) + } + if newStat.OpcountersRepl != nil && oldStat.OpcountersRepl != nil { returnVal.InsertR = diff(newStat.OpcountersRepl.Insert, oldStat.OpcountersRepl.Insert, sampleSecs) returnVal.QueryR = diff(newStat.OpcountersRepl.Query, oldStat.OpcountersRepl.Query, sampleSecs) @@ -442,7 +495,7 @@ func NewStatLine(oldStat, newStat ServerStatus, key string, all bool, sampleSecs returnVal.Flushes = newStat.BackgroundFlushing.Flushes - oldStat.BackgroundFlushing.Flushes } - returnVal.Time = newStat.SampleTime + returnVal.Time = newMongo.SampleTime returnVal.IsMongos = (newStat.ShardCursorType != nil || strings.HasPrefix(newStat.Process, MongosProcess)) @@ -587,5 +640,42 @@ func NewStatLine(oldStat, newStat ServerStatus, key string, all bool, sampleSecs returnVal.NumConnections = newStat.Connections.Current } + newReplStat := *newMongo.ReplSetStatus + + if newReplStat.Members != nil { + myName := newStat.Repl.Me + // Find the master and myself + master := ReplSetMember{} + me := ReplSetMember{} + for _, member := range newReplStat.Members { + if member.Name == myName { + if member.State == 1 { + // I'm the master + returnVal.ReplLag = 0 + break + } else { + // I'm secondary + me = member + } + } else if member.State == 1 { + // Master found + master = member + } + } + + if me.Optime != nil && master.Optime != nil && me.State == 2 { + // MongoTimestamp type is int64 where the first 32bits are the unix timestamp + lag := int64(*master.Optime>>32 - *me.Optime>>32) + if lag < 0 { + returnVal.ReplLag = 0 + } else { + returnVal.ReplLag = lag + } + } + } + + newClusterStat := *newMongo.ClusterStatus + returnVal.JumboChunksCount = newClusterStat.JumboChunksCount + return returnVal } diff --git a/plugins/inputs/mqtt_consumer/mqtt_consumer.go b/plugins/inputs/mqtt_consumer/mqtt_consumer.go index c64d2139b..beebe00ce 100644 --- a/plugins/inputs/mqtt_consumer/mqtt_consumer.go +++ b/plugins/inputs/mqtt_consumer/mqtt_consumer.go @@ -3,6 +3,7 @@ package mqtt_consumer import ( "fmt" "log" + "strings" "sync" "time" @@ -46,6 +47,8 @@ type MQTTConsumer struct { // keep the accumulator internally: acc telegraf.Accumulator + + started bool } var sampleConfig = ` @@ -100,6 +103,7 @@ func (m *MQTTConsumer) SetParser(parser parsers.Parser) { func (m *MQTTConsumer) Start(acc telegraf.Accumulator) error { m.Lock() defer m.Unlock() + m.started = false if m.PersistentSession && m.ClientID == "" { return fmt.Errorf("ERROR MQTT Consumer: When using persistent_session" + @@ -124,20 +128,32 @@ func (m *MQTTConsumer) Start(acc telegraf.Accumulator) error { m.in = make(chan mqtt.Message, 1000) m.done = make(chan struct{}) - topics := make(map[string]byte) - for _, topic := range m.Topics { - topics[topic] = byte(m.QoS) - } - subscribeToken := m.client.SubscribeMultiple(topics, m.recvMessage) - subscribeToken.Wait() - if 
subscribeToken.Error() != nil { - return subscribeToken.Error() - } - go m.receiver() return nil } +func (m *MQTTConsumer) onConnect(c mqtt.Client) { + log.Printf("MQTT Client Connected") + if !m.PersistentSession || !m.started { + topics := make(map[string]byte) + for _, topic := range m.Topics { + topics[topic] = byte(m.QoS) + } + subscribeToken := c.SubscribeMultiple(topics, m.recvMessage) + subscribeToken.Wait() + if subscribeToken.Error() != nil { + log.Printf("MQTT SUBSCRIBE ERROR\ntopics: %s\nerror: %s", + strings.Join(m.Topics[:], ","), subscribeToken.Error()) + } + m.started = true + } + return +} + +func (m *MQTTConsumer) onConnectionLost(c mqtt.Client, err error) { + log.Printf("MQTT Connection lost\nerror: %s\nMQTT Client will try to reconnect", err.Error()) + return +} // receiver() reads all incoming messages from the consumer, and parses them into // influxdb metric points. @@ -172,6 +188,7 @@ func (m *MQTTConsumer) Stop() { defer m.Unlock() close(m.done) m.client.Disconnect(200) + m.started = false } func (m *MQTTConsumer) Gather(acc telegraf.Accumulator) error { @@ -219,6 +236,8 @@ func (m *MQTTConsumer) createOpts() (*mqtt.ClientOptions, error) { opts.SetAutoReconnect(true) opts.SetKeepAlive(time.Second * 60) opts.SetCleanSession(!m.PersistentSession) + opts.SetOnConnectHandler(m.onConnect) + opts.SetConnectionLostHandler(m.onConnectionLost) return opts, nil } diff --git a/plugins/inputs/mysql/README.md b/plugins/inputs/mysql/README.md new file mode 100644 index 000000000..b4af1de80 --- /dev/null +++ b/plugins/inputs/mysql/README.md @@ -0,0 +1,190 @@ +# MySQL Input plugin + +This plugin gathers the statistic data from MySQL server + +* Global statuses +* Global variables +* Slave statuses +* Binlog size +* Process list +* Info schema auto increment columns +* Table I/O waits +* Index I/O waits +* Perf Schema table lock waits +* Perf Schema event waits +* Perf Schema events statements +* File events statistics +* Table schema statistics + +## Configuration + +``` +# Read metrics from one or many mysql servers +[[inputs.mysql]] + ## specify servers via a url matching: + ## [username[:password]@][protocol[(address)]]/[?tls=[true|false|skip-verify]] + ## see https://github.com/go-sql-driver/mysql#dsn-data-source-name + ## e.g. + ## root:passwd@tcp(127.0.0.1:3306)/?tls=false + ## root@tcp(127.0.0.1:3306)/?tls=false + # + ## If no servers are specified, then localhost is used as the host. 
+ servers = ["tcp(127.0.0.1:3306)/"] + ## the limits for metrics form perf_events_statements + perf_events_statements_digest_text_limit = 120 + perf_events_statements_limit = 250 + perf_events_statements_time_limit = 86400 + # + ## if the list is empty, then metrics are gathered from all database tables + table_schema_databases = [] + # + ## gather metrics from INFORMATION_SCHEMA.TABLES for databases provided above list + gather_table_schema = false + # + ## gather thread state counts from INFORMATION_SCHEMA.PROCESSLIST + gather_process_list = true + # + ## gather auto_increment columns and max values from information schema + gather_info_schema_auto_inc = true + # + ## gather metrics from SHOW SLAVE STATUS command output + gather_slave_status = true + # + ## gather metrics from SHOW BINARY LOGS command output + gather_binary_logs = false + # + ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMART_BY_TABLE + gather_table_io_waits = false + # + ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMART_BY_INDEX_USAGE + gather_index_io_waits = false + # + ## gather metrics from PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_EVENT_NAME + gather_file_events_stats = false + # + ## gather metrics from PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_DIGEST + gather_perf_events_statements = false + # + ## Some queries we may want to run less often (such as SHOW GLOBAL VARIABLES) + interval_slow = "30m" +``` + +## Measurements & Fields +* Global statuses - all numeric and boolean values of `SHOW GLOBAL STATUSES` +* Global variables - all numeric and boolean values of `SHOW GLOBAL VARIABLES` +* Slave status - metrics from `SHOW SLAVE STATUS` the metrics are gathered when +the single-source replication is on. If the multi-source replication is set, +then everything works differently, this metric does not work with multi-source +replication. + * slave_[column name]() +* Binary logs - all metrics including size and count of all binary files. +Requires to be turned on in configuration. + * binary_size_bytes(int, number) + * binary_files_count(int, number) +* Process list - connection metrics from processlist for each user. It has the following tags + * connections(int, number) +* Perf Table IO waits - total count and time of I/O waits event for each table +and process. It has following fields: + * table_io_waits_total_fetch(float, number) + * table_io_waits_total_insert(float, number) + * table_io_waits_total_update(float, number) + * table_io_waits_total_delete(float, number) + * table_io_waits_seconds_total_fetch(float, milliseconds) + * table_io_waits_seconds_total_insert(float, milliseconds) + * table_io_waits_seconds_total_update(float, milliseconds) + * table_io_waits_seconds_total_delete(float, milliseconds) +* Perf index IO waits - total count and time of I/O waits event for each index +and process. It has following fields: + * index_io_waits_total_fetch(float, number) + * index_io_waits_seconds_total_fetch(float, milliseconds) + * index_io_waits_total_insert(float, number) + * index_io_waits_total_update(float, number) + * index_io_waits_total_delete(float, number) + * index_io_waits_seconds_total_insert(float, milliseconds) + * index_io_waits_seconds_total_update(float, milliseconds) + * index_io_waits_seconds_total_delete(float, milliseconds) +* Info schema autoincrement statuses - autoincrement fields and max values +for them. 
It has following fields: + * auto_increment_column(int, number) + * auto_increment_column_max(int, number) +* Perf table lock waits - gathers total number and time for SQL and external +lock waits events for each table and operation. It has following fields. +The unit of fields varies by the tags. + * read_normal(float, number/milliseconds) + * read_with_shared_locks(float, number/milliseconds) + * read_high_priority(float, number/milliseconds) + * read_no_insert(float, number/milliseconds) + * write_normal(float, number/milliseconds) + * write_allow_write(float, number/milliseconds) + * write_concurrent_insert(float, number/milliseconds) + * write_delayed(float, number/milliseconds) + * write_low_priority(float, number/milliseconds) + * read(float, number/milliseconds) + * write(float, number/milliseconds) +* Perf events waits - gathers total time and number of event waits + * events_waits_total(float, number) + * events_waits_seconds_total(float, milliseconds) +* Perf file events statuses - gathers file events statuses + * file_events_total(float,number) + * file_events_seconds_total(float, milliseconds) + * file_events_bytes_total(float, bytes) +* Perf file events statements - gathers attributes of each event + * events_statements_total(float, number) + * events_statements_seconds_total(float, millieconds) + * events_statements_errors_total(float, number) + * events_statements_warnings_total(float, number) + * events_statements_rows_affected_total(float, number) + * events_statements_rows_sent_total(float, number) + * events_statements_rows_examined_total(float, number) + * events_statements_tmp_tables_total(float, number) + * events_statements_tmp_disk_tables_total(float, number) + * events_statements_sort_merge_passes_totales(float, number) + * events_statements_sort_rows_total(float, number) + * events_statements_no_index_used_total(float, number) +* Table schema - gathers statistics of each schema. 
diff --git a/plugins/inputs/mysql/mysql.go b/plugins/inputs/mysql/mysql.go
index 474067716..a7254e250 100644
--- a/plugins/inputs/mysql/mysql.go
+++ b/plugins/inputs/mysql/mysql.go
@@ -1,7 +1,9 @@
 package mysql
 
 import (
+	"bytes"
 	"database/sql"
+	"fmt"
 	"net/url"
 	"strconv"
 	"strings"
@@ -13,7 +15,21 @@ import (
 )
 
 type Mysql struct {
-	Servers []string
+	Servers                             []string `toml:"servers"`
+	PerfEventsStatementsDigestTextLimit int64    `toml:"perf_events_statements_digest_text_limit"`
+	PerfEventsStatementsLimit           int64    `toml:"perf_events_statements_limit"`
+	PerfEventsStatementsTimeLimit       int64    `toml:"perf_events_statements_time_limit"`
+	TableSchemaDatabases                []string `toml:"table_schema_databases"`
+	GatherProcessList                   bool     `toml:"gather_process_list"`
+	GatherInfoSchemaAutoInc             bool     `toml:"gather_info_schema_auto_inc"`
+	GatherSlaveStatus                   bool     `toml:"gather_slave_status"`
+	GatherBinaryLogs                    bool     `toml:"gather_binary_logs"`
+	GatherTableIOWaits                  bool     `toml:"gather_table_io_waits"`
+	GatherIndexIOWaits                  bool     `toml:"gather_index_io_waits"`
+	GatherTableSchema                   bool     `toml:"gather_table_schema"`
+	GatherFileEventsStats               bool     `toml:"gather_file_events_stats"`
+	GatherPerfEventsStatements          bool     `toml:"gather_perf_events_statements"`
+	IntervalSlow                        string   `toml:"interval_slow"`
 }
 
 var sampleConfig = `
@@ -23,9 +39,46 @@ var sampleConfig = `
   ## e.g.
   ##  root:passwd@tcp(127.0.0.1:3306)/?tls=false
   ##  root@tcp(127.0.0.1:3306)/?tls=false
-  ##
+  #
   ## If no servers are specified, then localhost is used as the host.
servers = ["tcp(127.0.0.1:3306)/"] + ## the limits for metrics form perf_events_statements + perf_events_statements_digest_text_limit = 120 + perf_events_statements_limit = 250 + perf_events_statements_time_limit = 86400 + # + ## if the list is empty, then metrics are gathered from all databasee tables + table_schema_databases = [] + # + ## gather metrics from INFORMATION_SCHEMA.TABLES for databases provided above list + gather_table_schema = false + # + ## gather thread state counts from INFORMATION_SCHEMA.PROCESSLIST + gather_process_list = true + # + ## gather auto_increment columns and max values from information schema + gather_info_schema_auto_inc = true + # + ## gather metrics from SHOW SLAVE STATUS command output + gather_slave_status = true + # + ## gather metrics from SHOW BINARY LOGS command output + gather_binary_logs = false + # + ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMART_BY_TABLE + gather_table_io_waits = false + # + ## gather metrics from PERFORMANCE_SCHEMA.TABLE_IO_WAITS_SUMMART_BY_INDEX_USAGE + gather_index_io_waits = false + # + ## gather metrics from PERFORMANCE_SCHEMA.FILE_SUMMARY_BY_EVENT_NAME + gather_file_events_stats = false + # + ## gather metrics from PERFORMANCE_SCHEMA.EVENTS_STATEMENTS_SUMMARY_BY_DIGEST + gather_perf_events_statements = false + # + ## Some queries we may want to run less often (such as SHOW GLOBAL VARIABLES) + interval_slow = "30m" ` var defaultTimeout = time.Second * time.Duration(5) @@ -38,7 +91,22 @@ func (m *Mysql) Description() string { return "Read metrics from one or many mysql servers" } -var localhost = "" +var ( + localhost = "" + lastT time.Time + initDone = false + scanIntervalSlow uint32 +) + +func (m *Mysql) InitMysql() { + if len(m.IntervalSlow) > 0 { + interval, err := time.ParseDuration(m.IntervalSlow) + if err == nil && interval.Seconds() >= 1.0 { + scanIntervalSlow = uint32(interval.Seconds()) + } + } + initDone = true +} func (m *Mysql) Gather(acc telegraf.Accumulator) error { if len(m.Servers) == 0 { @@ -48,6 +116,12 @@ func (m *Mysql) Gather(acc telegraf.Accumulator) error { return nil } + // Initialise additional query intervals + if !initDone { + m.InitMysql() + } + + // Loop through each server and collect metrics for _, serv := range m.Servers { err := m.gatherServer(serv, acc) if err != nil { @@ -116,104 +190,194 @@ var mappings = []*mapping{ onServer: "Threads_", inExport: "threads_", }, + { + onServer: "Access_", + inExport: "access_", + }, + { + onServer: "Aria__", + inExport: "aria_", + }, + { + onServer: "Binlog__", + inExport: "binlog_", + }, + { + onServer: "Busy_", + inExport: "busy_", + }, + { + onServer: "Connection_", + inExport: "connection_", + }, + { + onServer: "Delayed_", + inExport: "delayed_", + }, + { + onServer: "Empty_", + inExport: "empty_", + }, + { + onServer: "Executed_", + inExport: "executed_", + }, + { + onServer: "Executed_", + inExport: "executed_", + }, + { + onServer: "Feature_", + inExport: "feature_", + }, + { + onServer: "Flush_", + inExport: "flush_", + }, + { + onServer: "Last_", + inExport: "last_", + }, + { + onServer: "Master_", + inExport: "master_", + }, + { + onServer: "Max_", + inExport: "max_", + }, + { + onServer: "Memory_", + inExport: "memory_", + }, + { + onServer: "Not_", + inExport: "not_", + }, + { + onServer: "Performance_", + inExport: "performance_", + }, + { + onServer: "Prepared_", + inExport: "prepared_", + }, + { + onServer: "Rows_", + inExport: "rows_", + }, + { + onServer: "Rpl_", + inExport: "rpl_", + }, + { + onServer: "Select_", + 
inExport: "select_", + }, + { + onServer: "Slave_", + inExport: "slave_", + }, + { + onServer: "Slow_", + inExport: "slow_", + }, + { + onServer: "Sort_", + inExport: "sort_", + }, + { + onServer: "Subquery_", + inExport: "subquery_", + }, + { + onServer: "Tc_", + inExport: "tc_", + }, + { + onServer: "Threadpool_", + inExport: "threadpool_", + }, } -func (m *Mysql) gatherServer(serv string, acc telegraf.Accumulator) error { - // If user forgot the '/', add it - if strings.HasSuffix(serv, ")") { - serv = serv + "/" - } else if serv == "localhost" { - serv = "" +var ( + // status counter + generalThreadStates = map[string]uint32{ + "after create": uint32(0), + "altering table": uint32(0), + "analyzing": uint32(0), + "checking permissions": uint32(0), + "checking table": uint32(0), + "cleaning up": uint32(0), + "closing tables": uint32(0), + "converting heap to myisam": uint32(0), + "copying to tmp table": uint32(0), + "creating sort index": uint32(0), + "creating table": uint32(0), + "creating tmp table": uint32(0), + "deleting": uint32(0), + "executing": uint32(0), + "execution of init_command": uint32(0), + "end": uint32(0), + "freeing items": uint32(0), + "flushing tables": uint32(0), + "fulltext initialization": uint32(0), + "idle": uint32(0), + "init": uint32(0), + "killed": uint32(0), + "waiting for lock": uint32(0), + "logging slow query": uint32(0), + "login": uint32(0), + "manage keys": uint32(0), + "opening tables": uint32(0), + "optimizing": uint32(0), + "preparing": uint32(0), + "reading from net": uint32(0), + "removing duplicates": uint32(0), + "removing tmp table": uint32(0), + "reopen tables": uint32(0), + "repair by sorting": uint32(0), + "repair done": uint32(0), + "repair with keycache": uint32(0), + "replication master": uint32(0), + "rolling back": uint32(0), + "searching rows for update": uint32(0), + "sending data": uint32(0), + "sorting for group": uint32(0), + "sorting for order": uint32(0), + "sorting index": uint32(0), + "sorting result": uint32(0), + "statistics": uint32(0), + "updating": uint32(0), + "waiting for tables": uint32(0), + "waiting for table flush": uint32(0), + "waiting on cond": uint32(0), + "writing to net": uint32(0), + "other": uint32(0), } - - serv, err := dsnAddTimeout(serv) - if err != nil { - return err + // plaintext statuses + stateStatusMappings = map[string]string{ + "user sleep": "idle", + "creating index": "altering table", + "committing alter table to storage engine": "altering table", + "discard or import tablespace": "altering table", + "rename": "altering table", + "setup": "altering table", + "renaming result table": "altering table", + "preparing for alter table": "altering table", + "copying to group table": "copying to tmp table", + "copy to tmp table": "copying to tmp table", + "query end": "end", + "update": "updating", + "updating main table": "updating", + "updating reference tables": "updating", + "system lock": "waiting for lock", + "user lock": "waiting for lock", + "table lock": "waiting for lock", + "deleting from main table": "deleting", + "deleting from reference tables": "deleting", } - db, err := sql.Open("mysql", serv) - if err != nil { - return err - } - - defer db.Close() - - rows, err := db.Query(`SHOW /*!50002 GLOBAL */ STATUS`) - if err != nil { - return err - } - - var servtag string - servtag, err = parseDSN(serv) - if err != nil { - servtag = "localhost" - } - tags := map[string]string{"server": servtag} - fields := make(map[string]interface{}) - for rows.Next() { - var name string - var val interface{} - 
- err = rows.Scan(&name, &val) - if err != nil { - return err - } - - var found bool - - for _, mapped := range mappings { - if strings.HasPrefix(name, mapped.onServer) { - i, _ := strconv.Atoi(string(val.([]byte))) - fields[mapped.inExport+name[len(mapped.onServer):]] = i - found = true - } - } - - if found { - continue - } - - switch name { - case "Queries": - i, err := strconv.ParseInt(string(val.([]byte)), 10, 64) - if err != nil { - return err - } - - fields["queries"] = i - case "Slow_queries": - i, err := strconv.ParseInt(string(val.([]byte)), 10, 64) - if err != nil { - return err - } - - fields["slow_queries"] = i - } - } - acc.AddFields("mysql", fields, tags) - - conn_rows, err := db.Query("SELECT user, sum(1) FROM INFORMATION_SCHEMA.PROCESSLIST GROUP BY user") - - for conn_rows.Next() { - var user string - var connections int64 - - err = conn_rows.Scan(&user, &connections) - if err != nil { - return err - } - - tags := map[string]string{"server": servtag, "user": user} - fields := make(map[string]interface{}) - - if err != nil { - return err - } - fields["connections"] = connections - acc.AddFields("mysql_users", fields, tags) - } - - return nil -} +) func dsnAddTimeout(dsn string) (string, error) { @@ -236,6 +400,1160 @@ func dsnAddTimeout(dsn string) (string, error) { return u.String(), nil } +// Math constants +const ( + picoSeconds = 1e12 +) + +// metric queries +const ( + globalStatusQuery = `SHOW GLOBAL STATUS` + globalVariablesQuery = `SHOW GLOBAL VARIABLES` + slaveStatusQuery = `SHOW SLAVE STATUS` + binaryLogsQuery = `SHOW BINARY LOGS` + infoSchemaProcessListQuery = ` + SELECT COALESCE(command,''),COALESCE(state,''),count(*) + FROM information_schema.processlist + WHERE ID != connection_id() + GROUP BY command,state + ORDER BY null` + infoSchemaAutoIncQuery = ` + SELECT table_schema, table_name, column_name, auto_increment, + CAST(pow(2, case data_type + when 'tinyint' then 7 + when 'smallint' then 15 + when 'mediumint' then 23 + when 'int' then 31 + when 'bigint' then 63 + end+(column_type like '% unsigned'))-1 as decimal(19)) as max_int + FROM information_schema.tables t + JOIN information_schema.columns c USING (table_schema,table_name) + WHERE c.extra = 'auto_increment' AND t.auto_increment IS NOT NULL + ` + perfTableIOWaitsQuery = ` + SELECT OBJECT_SCHEMA, OBJECT_NAME, COUNT_FETCH, COUNT_INSERT, COUNT_UPDATE, COUNT_DELETE, + SUM_TIMER_FETCH, SUM_TIMER_INSERT, SUM_TIMER_UPDATE, SUM_TIMER_DELETE + FROM performance_schema.table_io_waits_summary_by_table + WHERE OBJECT_SCHEMA NOT IN ('mysql', 'performance_schema') + ` + perfIndexIOWaitsQuery = ` + SELECT OBJECT_SCHEMA, OBJECT_NAME, ifnull(INDEX_NAME, 'NONE') as INDEX_NAME, + COUNT_FETCH, COUNT_INSERT, COUNT_UPDATE, COUNT_DELETE, + SUM_TIMER_FETCH, SUM_TIMER_INSERT, SUM_TIMER_UPDATE, SUM_TIMER_DELETE + FROM performance_schema.table_io_waits_summary_by_index_usage + WHERE OBJECT_SCHEMA NOT IN ('mysql', 'performance_schema') + ` + perfTableLockWaitsQuery = ` + SELECT + OBJECT_SCHEMA, + OBJECT_NAME, + COUNT_READ_NORMAL, + COUNT_READ_WITH_SHARED_LOCKS, + COUNT_READ_HIGH_PRIORITY, + COUNT_READ_NO_INSERT, + COUNT_READ_EXTERNAL, + COUNT_WRITE_ALLOW_WRITE, + COUNT_WRITE_CONCURRENT_INSERT, + COUNT_WRITE_DELAYED, + COUNT_WRITE_LOW_PRIORITY, + COUNT_WRITE_NORMAL, + COUNT_WRITE_EXTERNAL, + SUM_TIMER_READ_NORMAL, + SUM_TIMER_READ_WITH_SHARED_LOCKS, + SUM_TIMER_READ_HIGH_PRIORITY, + SUM_TIMER_READ_NO_INSERT, + SUM_TIMER_READ_EXTERNAL, + SUM_TIMER_WRITE_ALLOW_WRITE, + SUM_TIMER_WRITE_CONCURRENT_INSERT, + SUM_TIMER_WRITE_DELAYED, + 
SUM_TIMER_WRITE_LOW_PRIORITY, + SUM_TIMER_WRITE_NORMAL, + SUM_TIMER_WRITE_EXTERNAL + FROM performance_schema.table_lock_waits_summary_by_table + WHERE OBJECT_SCHEMA NOT IN ('mysql', 'performance_schema', 'information_schema') + ` + perfEventsStatementsQuery = ` + SELECT + ifnull(SCHEMA_NAME, 'NONE') as SCHEMA_NAME, + DIGEST, + LEFT(DIGEST_TEXT, %d) as DIGEST_TEXT, + COUNT_STAR, + SUM_TIMER_WAIT, + SUM_ERRORS, + SUM_WARNINGS, + SUM_ROWS_AFFECTED, + SUM_ROWS_SENT, + SUM_ROWS_EXAMINED, + SUM_CREATED_TMP_DISK_TABLES, + SUM_CREATED_TMP_TABLES, + SUM_SORT_MERGE_PASSES, + SUM_SORT_ROWS, + SUM_NO_INDEX_USED + FROM performance_schema.events_statements_summary_by_digest + WHERE SCHEMA_NAME NOT IN ('mysql', 'performance_schema', 'information_schema') + AND last_seen > DATE_SUB(NOW(), INTERVAL %d SECOND) + ORDER BY SUM_TIMER_WAIT DESC + LIMIT %d + ` + perfEventWaitsQuery = ` + SELECT EVENT_NAME, COUNT_STAR, SUM_TIMER_WAIT + FROM performance_schema.events_waits_summary_global_by_event_name + ` + perfFileEventsQuery = ` + SELECT + EVENT_NAME, + COUNT_READ, SUM_TIMER_READ, SUM_NUMBER_OF_BYTES_READ, + COUNT_WRITE, SUM_TIMER_WRITE, SUM_NUMBER_OF_BYTES_WRITE, + COUNT_MISC, SUM_TIMER_MISC + FROM performance_schema.file_summary_by_event_name + ` + tableSchemaQuery = ` + SELECT + TABLE_SCHEMA, + TABLE_NAME, + TABLE_TYPE, + ifnull(ENGINE, 'NONE') as ENGINE, + ifnull(VERSION, '0') as VERSION, + ifnull(ROW_FORMAT, 'NONE') as ROW_FORMAT, + ifnull(TABLE_ROWS, '0') as TABLE_ROWS, + ifnull(DATA_LENGTH, '0') as DATA_LENGTH, + ifnull(INDEX_LENGTH, '0') as INDEX_LENGTH, + ifnull(DATA_FREE, '0') as DATA_FREE, + ifnull(CREATE_OPTIONS, 'NONE') as CREATE_OPTIONS + FROM information_schema.tables + WHERE TABLE_SCHEMA = '%s' + ` + dbListQuery = ` + SELECT + SCHEMA_NAME + FROM information_schema.schemata + WHERE SCHEMA_NAME NOT IN ('mysql', 'performance_schema', 'information_schema') + ` + perfSchemaTablesQuery = ` + SELECT + table_name + FROM information_schema.tables + WHERE table_schema = 'performance_schema' AND table_name = ? 
+ + ` +) + +func (m *Mysql) gatherServer(serv string, acc telegraf.Accumulator) error { + serv, err := dsnAddTimeout(serv) + if err != nil { + return err + } + + db, err := sql.Open("mysql", serv) + if err != nil { + return err + } + + defer db.Close() + + err = m.gatherGlobalStatuses(db, serv, acc) + if err != nil { + return err + } + + // Global Variables may be gathered less often + if len(m.IntervalSlow) > 0 { + if uint32(time.Since(lastT).Seconds()) > scanIntervalSlow { + err = m.gatherGlobalVariables(db, serv, acc) + if err != nil { + return err + } + lastT = time.Now() + } else { + err = m.gatherGlobalVariables(db, serv, acc) + if err != nil { + return err + } + } + } + + if m.GatherBinaryLogs { + err = m.gatherBinaryLogs(db, serv, acc) + if err != nil { + return err + } + } + + if m.GatherProcessList { + err = m.GatherProcessListStatuses(db, serv, acc) + if err != nil { + return err + } + } + + if m.GatherSlaveStatus { + err = m.gatherSlaveStatuses(db, serv, acc) + if err != nil { + return err + } + } + + if m.GatherInfoSchemaAutoInc { + err = m.gatherInfoSchemaAutoIncStatuses(db, serv, acc) + if err != nil { + return err + } + } + + if m.GatherTableIOWaits { + err = m.gatherPerfTableIOWaits(db, serv, acc) + if err != nil { + return err + } + } + + if m.GatherIndexIOWaits { + err = m.gatherPerfIndexIOWaits(db, serv, acc) + if err != nil { + return err + } + } + + err = m.gatherPerfTableLockWaits(db, serv, acc) + if err != nil { + return err + } + + err = m.gatherPerfEventWaits(db, serv, acc) + if err != nil { + return err + } + + if m.GatherFileEventsStats { + err = m.gatherPerfFileEventsStatuses(db, serv, acc) + if err != nil { + return err + } + } + + if m.GatherPerfEventsStatements { + err = m.gatherPerfEventsStatements(db, serv, acc) + if err != nil { + return err + } + } + + if m.GatherTableSchema { + err = m.gatherTableSchema(db, serv, acc) + if err != nil { + return err + } + } + return nil +} + +// gatherGlobalVariables can be used to fetch all global variables from +// MySQL environment. +func (m *Mysql) gatherGlobalVariables(db *sql.DB, serv string, acc telegraf.Accumulator) error { + // run query + rows, err := db.Query(globalVariablesQuery) + if err != nil { + return err + } + defer rows.Close() + + var key string + var val sql.RawBytes + + // parse DSN and save server tag + servtag, err := parseDSN(serv) + if err != nil { + servtag = "localhost" + } + tags := map[string]string{"server": servtag} + fields := make(map[string]interface{}) + for rows.Next() { + if err := rows.Scan(&key, &val); err != nil { + return err + } + key = strings.ToLower(key) + // parse value, if it is numeric then save, otherwise ignore + if floatVal, ok := parseValue(val); ok { + fields[key] = floatVal + } + // Send 20 fields at a time + if len(fields) >= 20 { + acc.AddFields("mysql_variables", fields, tags) + fields = make(map[string]interface{}) + } + } + // Send any remaining fields + if len(fields) > 0 { + acc.AddFields("mysql_variables", fields, tags) + } + return nil +} + +// gatherSlaveStatuses can be used to get replication analytics +// When the server is slave, then it returns only one row. +// If the multi-source replication is set, then everything works differently +// This code does not work with multi-source replication. 
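+// Each column of the returned row is emitted as a field named
+// "slave_<column name>" when its value can be parsed as a number or boolean.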
+func (m *Mysql) gatherSlaveStatuses(db *sql.DB, serv string, acc telegraf.Accumulator) error { + // run query + rows, err := db.Query(slaveStatusQuery) + if err != nil { + return err + } + defer rows.Close() + + servtag, err := parseDSN(serv) + if err != nil { + servtag = "localhost" + } + + tags := map[string]string{"server": servtag} + fields := make(map[string]interface{}) + + // to save the column names as a field key + // scanning keys and values separately + if rows.Next() { + // get columns names, and create an array with its length + cols, err := rows.Columns() + if err != nil { + return err + } + vals := make([]interface{}, len(cols)) + // fill the array with sql.Rawbytes + for i := range vals { + vals[i] = &sql.RawBytes{} + } + if err = rows.Scan(vals...); err != nil { + return err + } + // range over columns, and try to parse values + for i, col := range cols { + // skip unparsable values + if value, ok := parseValue(*vals[i].(*sql.RawBytes)); ok { + fields["slave_"+col] = value + } + } + acc.AddFields("mysql", fields, tags) + } + + return nil +} + +// gatherBinaryLogs can be used to collect size and count of all binary files +// binlogs metric requires the MySQL server to turn it on in configuration +func (m *Mysql) gatherBinaryLogs(db *sql.DB, serv string, acc telegraf.Accumulator) error { + // run query + rows, err := db.Query(binaryLogsQuery) + if err != nil { + return err + } + defer rows.Close() + + // parse DSN and save host as a tag + var servtag string + servtag, err = parseDSN(serv) + if err != nil { + servtag = "localhost" + } + tags := map[string]string{"server": servtag} + var ( + size uint64 = 0 + count uint64 = 0 + fileSize uint64 + fileName string + ) + + // iterate over rows and count the size and count of files + for rows.Next() { + if err := rows.Scan(&fileName, &fileSize); err != nil { + return err + } + size += fileSize + count++ + } + fields := map[string]interface{}{ + "binary_size_bytes": size, + "binary_files_count": count, + } + acc.AddFields("mysql", fields, tags) + return nil +} + +// gatherGlobalStatuses can be used to get MySQL status metrics +// the mappings of actual names and names of each status to be exported +// to output is provided on mappings variable +func (m *Mysql) gatherGlobalStatuses(db *sql.DB, serv string, acc telegraf.Accumulator) error { + // If user forgot the '/', add it + if strings.HasSuffix(serv, ")") { + serv = serv + "/" + } else if serv == "localhost" { + serv = "" + } + + // run query + rows, err := db.Query(globalStatusQuery) + if err != nil { + return err + } + + // parse the DSN and save host name as a tag + var servtag string + servtag, err = parseDSN(serv) + if err != nil { + servtag = "localhost" + } + tags := map[string]string{"server": servtag} + fields := make(map[string]interface{}) + for rows.Next() { + var name string + var val interface{} + + err = rows.Scan(&name, &val) + if err != nil { + return err + } + + var found bool + + // iterate over mappings and gather metrics that is provided on mapping + for _, mapped := range mappings { + if strings.HasPrefix(name, mapped.onServer) { + // convert numeric values to integer + i, _ := strconv.Atoi(string(val.([]byte))) + fields[mapped.inExport+name[len(mapped.onServer):]] = i + found = true + } + } + // Send 20 fields at a time + if len(fields) >= 20 { + acc.AddFields("mysql", fields, tags) + fields = make(map[string]interface{}) + } + + if found { + continue + } + + // search for specific values + switch name { + case "Queries": + i, err := 
strconv.ParseInt(string(val.([]byte)), 10, 64) + if err != nil { + return err + } + + fields["queries"] = i + case "Slow_queries": + i, err := strconv.ParseInt(string(val.([]byte)), 10, 64) + if err != nil { + return err + } + + fields["slow_queries"] = i + case "Connections": + i, err := strconv.ParseInt(string(val.([]byte)), 10, 64) + if err != nil { + return err + } + fields["connections"] = i + case "Syncs": + i, err := strconv.ParseInt(string(val.([]byte)), 10, 64) + if err != nil { + return err + } + fields["syncs"] = i + } + } + // Send any remaining fields + if len(fields) > 0 { + acc.AddFields("mysql", fields, tags) + } + // gather connection metrics from processlist for each user + if m.GatherProcessList { + conn_rows, err := db.Query("SELECT user, sum(1) FROM INFORMATION_SCHEMA.PROCESSLIST GROUP BY user") + + for conn_rows.Next() { + var user string + var connections int64 + + err = conn_rows.Scan(&user, &connections) + if err != nil { + return err + } + + tags := map[string]string{"server": servtag, "user": user} + fields := make(map[string]interface{}) + + if err != nil { + return err + } + fields["connections"] = connections + acc.AddFields("mysql_users", fields, tags) + } + } + + return nil +} + +// GatherProcessList can be used to collect metrics on each running command +// and its state with its running count +func (m *Mysql) GatherProcessListStatuses(db *sql.DB, serv string, acc telegraf.Accumulator) error { + // run query + rows, err := db.Query(infoSchemaProcessListQuery) + if err != nil { + return err + } + defer rows.Close() + var ( + command string + state string + count uint32 + ) + + var servtag string + fields := make(map[string]interface{}) + servtag, err = parseDSN(serv) + if err != nil { + servtag = "localhost" + } + + // mapping of state with its counts + stateCounts := make(map[string]uint32, len(generalThreadStates)) + // set map with keys and default values + for k, v := range generalThreadStates { + stateCounts[k] = v + } + + for rows.Next() { + err = rows.Scan(&command, &state, &count) + if err != nil { + return err + } + // each state has its mapping + foundState := findThreadState(command, state) + // count each state + stateCounts[foundState] += count + } + + tags := map[string]string{"server": servtag} + for s, c := range stateCounts { + fields[newNamespace("threads", s)] = c + } + acc.AddFields("mysql_info_schema", fields, tags) + return nil +} + +// gatherPerfTableIOWaits can be used to get total count and time +// of I/O wait event for each table and process +func (m *Mysql) gatherPerfTableIOWaits(db *sql.DB, serv string, acc telegraf.Accumulator) error { + rows, err := db.Query(perfTableIOWaitsQuery) + if err != nil { + return err + } + + defer rows.Close() + var ( + objSchema, objName, servtag string + countFetch, countInsert, countUpdate, countDelete float64 + timeFetch, timeInsert, timeUpdate, timeDelete float64 + ) + + servtag, err = parseDSN(serv) + if err != nil { + servtag = "localhost" + } + + for rows.Next() { + err = rows.Scan(&objSchema, &objName, + &countFetch, &countInsert, &countUpdate, &countDelete, + &timeFetch, &timeInsert, &timeUpdate, &timeDelete, + ) + + if err != nil { + return err + } + + tags := map[string]string{ + "server": servtag, + "schema": objSchema, + "name": objName, + } + + fields := map[string]interface{}{ + "table_io_waits_total_fetch": countFetch, + "table_io_waits_total_insert": countInsert, + "table_io_waits_total_update": countUpdate, + "table_io_waits_total_delete": countDelete, + 
"table_io_waits_seconds_total_fetch": timeFetch / picoSeconds, + "table_io_waits_seconds_total_insert": timeInsert / picoSeconds, + "table_io_waits_seconds_total_update": timeUpdate / picoSeconds, + "table_io_waits_seconds_total_delete": timeDelete / picoSeconds, + } + + acc.AddFields("mysql_perf_schema", fields, tags) + } + return nil +} + +// gatherPerfIndexIOWaits can be used to get total count and time +// of I/O wait event for each index and process +func (m *Mysql) gatherPerfIndexIOWaits(db *sql.DB, serv string, acc telegraf.Accumulator) error { + rows, err := db.Query(perfIndexIOWaitsQuery) + if err != nil { + return err + } + defer rows.Close() + + var ( + objSchema, objName, indexName, servtag string + countFetch, countInsert, countUpdate, countDelete float64 + timeFetch, timeInsert, timeUpdate, timeDelete float64 + ) + + servtag, err = parseDSN(serv) + if err != nil { + servtag = "localhost" + } + + for rows.Next() { + err = rows.Scan(&objSchema, &objName, &indexName, + &countFetch, &countInsert, &countUpdate, &countDelete, + &timeFetch, &timeInsert, &timeUpdate, &timeDelete, + ) + + if err != nil { + return err + } + + tags := map[string]string{ + "server": servtag, + "schema": objSchema, + "name": objName, + "index": indexName, + } + fields := map[string]interface{}{ + "index_io_waits_total_fetch": countFetch, + "index_io_waits_seconds_total_fetch": timeFetch / picoSeconds, + } + + // update write columns only when index is NONE + if indexName == "NONE" { + fields["index_io_waits_total_insert"] = countInsert + fields["index_io_waits_total_update"] = countUpdate + fields["index_io_waits_total_delete"] = countDelete + + fields["index_io_waits_seconds_total_insert"] = timeInsert / picoSeconds + fields["index_io_waits_seconds_total_update"] = timeUpdate / picoSeconds + fields["index_io_waits_seconds_total_delete"] = timeDelete / picoSeconds + } + + acc.AddFields("mysql_perf_schema", fields, tags) + } + return nil +} + +// gatherInfoSchemaAutoIncStatuses can be used to get auto incremented values of the column +func (m *Mysql) gatherInfoSchemaAutoIncStatuses(db *sql.DB, serv string, acc telegraf.Accumulator) error { + rows, err := db.Query(infoSchemaAutoIncQuery) + if err != nil { + return err + } + defer rows.Close() + + var ( + schema, table, column string + incValue, maxInt uint64 + ) + + servtag, err := parseDSN(serv) + if err != nil { + servtag = "localhost" + } + + for rows.Next() { + if err := rows.Scan(&schema, &table, &column, &incValue, &maxInt); err != nil { + return err + } + tags := map[string]string{ + "server": servtag, + "schema": schema, + "table": table, + "column": column, + } + fields := make(map[string]interface{}) + fields["auto_increment_column"] = incValue + fields["auto_increment_column_max"] = maxInt + + acc.AddFields("mysql_info_schema", fields, tags) + } + return nil +} + +// gatherPerfTableLockWaits can be used to get +// the total number and time for SQL and external lock wait events +// for each table and operation +// requires the MySQL server to be enabled to save this metric +func (m *Mysql) gatherPerfTableLockWaits(db *sql.DB, serv string, acc telegraf.Accumulator) error { + // check if table exists, + // if performance_schema is not enabled, tables do not exist + // then there is no need to scan them + var tableName string + err := db.QueryRow(perfSchemaTablesQuery, "table_lock_waits_summary_by_table").Scan(&tableName) + switch { + case err == sql.ErrNoRows: + return nil + case err != nil: + return err + } + + rows, err := 
db.Query(perfTableLockWaitsQuery) + if err != nil { + return err + } + defer rows.Close() + + servtag, err := parseDSN(serv) + if err != nil { + servtag = "localhost" + } + + var ( + objectSchema string + objectName string + countReadNormal float64 + countReadWithSharedLocks float64 + countReadHighPriority float64 + countReadNoInsert float64 + countReadExternal float64 + countWriteAllowWrite float64 + countWriteConcurrentInsert float64 + countWriteDelayed float64 + countWriteLowPriority float64 + countWriteNormal float64 + countWriteExternal float64 + timeReadNormal float64 + timeReadWithSharedLocks float64 + timeReadHighPriority float64 + timeReadNoInsert float64 + timeReadExternal float64 + timeWriteAllowWrite float64 + timeWriteConcurrentInsert float64 + timeWriteDelayed float64 + timeWriteLowPriority float64 + timeWriteNormal float64 + timeWriteExternal float64 + ) + + for rows.Next() { + err = rows.Scan( + &objectSchema, + &objectName, + &countReadNormal, + &countReadWithSharedLocks, + &countReadHighPriority, + &countReadNoInsert, + &countReadExternal, + &countWriteAllowWrite, + &countWriteConcurrentInsert, + &countWriteDelayed, + &countWriteLowPriority, + &countWriteNormal, + &countWriteExternal, + &timeReadNormal, + &timeReadWithSharedLocks, + &timeReadHighPriority, + &timeReadNoInsert, + &timeReadExternal, + &timeWriteAllowWrite, + &timeWriteConcurrentInsert, + &timeWriteDelayed, + &timeWriteLowPriority, + &timeWriteNormal, + &timeWriteExternal, + ) + + if err != nil { + return err + } + tags := map[string]string{ + "server": servtag, + "schema": objectSchema, + "table": objectName, + } + + sqlLWTags := copyTags(tags) + sqlLWTags["perf_query"] = "sql_lock_waits_total" + sqlLWFields := map[string]interface{}{ + "read_normal": countReadNormal, + "read_with_shared_locks": countReadWithSharedLocks, + "read_high_priority": countReadHighPriority, + "read_no_insert": countReadNoInsert, + "write_normal": countWriteNormal, + "write_allow_write": countWriteAllowWrite, + "write_concurrent_insert": countWriteConcurrentInsert, + "write_delayed": countWriteDelayed, + "write_low_priority": countWriteLowPriority, + } + acc.AddFields("mysql_perf_schema", sqlLWFields, sqlLWTags) + + externalLWTags := copyTags(tags) + externalLWTags["perf_query"] = "external_lock_waits_total" + externalLWFields := map[string]interface{}{ + "read": countReadExternal, + "write": countWriteExternal, + } + acc.AddFields("mysql_perf_schema", externalLWFields, externalLWTags) + + sqlLWSecTotalTags := copyTags(tags) + sqlLWSecTotalTags["perf_query"] = "sql_lock_waits_seconds_total" + sqlLWSecTotalFields := map[string]interface{}{ + "read_normal": timeReadNormal / picoSeconds, + "read_with_shared_locks": timeReadWithSharedLocks / picoSeconds, + "read_high_priority": timeReadHighPriority / picoSeconds, + "read_no_insert": timeReadNoInsert / picoSeconds, + "write_normal": timeWriteNormal / picoSeconds, + "write_allow_write": timeWriteAllowWrite / picoSeconds, + "write_concurrent_insert": timeWriteConcurrentInsert / picoSeconds, + "write_delayed": timeWriteDelayed / picoSeconds, + "write_low_priority": timeWriteLowPriority / picoSeconds, + } + acc.AddFields("mysql_perf_schema", sqlLWSecTotalFields, sqlLWSecTotalTags) + + externalLWSecTotalTags := copyTags(tags) + externalLWSecTotalTags["perf_query"] = "external_lock_waits_seconds_total" + externalLWSecTotalFields := map[string]interface{}{ + "read": timeReadExternal / picoSeconds, + "write": timeWriteExternal / picoSeconds, + } + acc.AddFields("mysql_perf_schema", 
externalLWSecTotalFields, externalLWSecTotalTags) + } + return nil +} + +// gatherPerfEventWaits can be used to get total time and number of event waits +func (m *Mysql) gatherPerfEventWaits(db *sql.DB, serv string, acc telegraf.Accumulator) error { + rows, err := db.Query(perfEventWaitsQuery) + if err != nil { + return err + } + defer rows.Close() + + var ( + event string + starCount, timeWait float64 + ) + + servtag, err := parseDSN(serv) + if err != nil { + servtag = "localhost" + } + tags := map[string]string{ + "server": servtag, + } + for rows.Next() { + if err := rows.Scan(&event, &starCount, &timeWait); err != nil { + return err + } + tags["event_name"] = event + fields := map[string]interface{}{ + "events_waits_total": starCount, + "events_waits_seconds_total": timeWait / picoSeconds, + } + + acc.AddFields("mysql_perf_schema", fields, tags) + } + return nil +} + +// gatherPerfFileEvents can be used to get stats on file events +func (m *Mysql) gatherPerfFileEventsStatuses(db *sql.DB, serv string, acc telegraf.Accumulator) error { + rows, err := db.Query(perfFileEventsQuery) + if err != nil { + return err + } + + defer rows.Close() + + var ( + eventName string + countRead, countWrite, countMisc float64 + sumTimerRead, sumTimerWrite, sumTimerMisc float64 + sumNumBytesRead, sumNumBytesWrite float64 + ) + + servtag, err := parseDSN(serv) + if err != nil { + servtag = "localhost" + } + tags := map[string]string{ + "server": servtag, + } + for rows.Next() { + err = rows.Scan( + &eventName, + &countRead, &sumTimerRead, &sumNumBytesRead, + &countWrite, &sumTimerWrite, &sumNumBytesWrite, + &countMisc, &sumTimerMisc, + ) + if err != nil { + return err + } + + tags["event_name"] = eventName + fields := make(map[string]interface{}) + + miscTags := copyTags(tags) + miscTags["mode"] = "misc" + fields["file_events_total"] = countWrite + fields["file_events_seconds_total"] = sumTimerMisc / picoSeconds + acc.AddFields("mysql_perf_schema", fields, miscTags) + + readTags := copyTags(tags) + readTags["mode"] = "read" + fields["file_events_total"] = countRead + fields["file_events_seconds_total"] = sumTimerRead / picoSeconds + fields["file_events_bytes_totals"] = sumNumBytesRead + acc.AddFields("mysql_perf_schema", fields, readTags) + + writeTags := copyTags(tags) + writeTags["mode"] = "write" + fields["file_events_total"] = countWrite + fields["file_events_seconds_total"] = sumTimerWrite / picoSeconds + fields["file_events_bytes_totals"] = sumNumBytesWrite + acc.AddFields("mysql_perf_schema", fields, writeTags) + + } + return nil +} + +// gatherPerfEventsStatements can be used to get attributes of each event +func (m *Mysql) gatherPerfEventsStatements(db *sql.DB, serv string, acc telegraf.Accumulator) error { + query := fmt.Sprintf( + perfEventsStatementsQuery, + m.PerfEventsStatementsDigestTextLimit, + m.PerfEventsStatementsTimeLimit, + m.PerfEventsStatementsLimit, + ) + + rows, err := db.Query(query) + if err != nil { + return err + } + + defer rows.Close() + + var ( + schemaName, digest, digest_text string + count, queryTime, errors, warnings float64 + rowsAffected, rowsSent, rowsExamined float64 + tmpTables, tmpDiskTables float64 + sortMergePasses, sortRows float64 + noIndexUsed float64 + ) + + servtag, err := parseDSN(serv) + if err != nil { + servtag = "localhost" + } + tags := map[string]string{ + "server": servtag, + } + + for rows.Next() { + err = rows.Scan( + &schemaName, &digest, &digest_text, + &count, &queryTime, &errors, &warnings, + &rowsAffected, &rowsSent, &rowsExamined, + &tmpTables, 
&tmpDiskTables, + &sortMergePasses, &sortRows, + ) + + if err != nil { + return err + } + tags["schema"] = schemaName + tags["digest"] = digest + tags["digest_text"] = digest_text + + fields := map[string]interface{}{ + "events_statements_total": count, + "events_statements_seconds_total": queryTime / picoSeconds, + "events_statements_errors_total": errors, + "events_statements_warnings_total": warnings, + "events_statements_rows_affected_total": rowsAffected, + "events_statements_rows_sent_total": rowsSent, + "events_statements_rows_examined_total": rowsExamined, + "events_statements_tmp_tables_total": tmpTables, + "events_statements_tmp_disk_tables_total": tmpDiskTables, + "events_statements_sort_merge_passes_total": sortMergePasses, + "events_statements_sort_rows_total": sortRows, + "events_statements_no_index_used_total": noIndexUsed, + } + + acc.AddFields("mysql_perf_schema", fields, tags) + } + return nil +} + +// gatherTableSchema can be used to gather stats on each schema +func (m *Mysql) gatherTableSchema(db *sql.DB, serv string, acc telegraf.Accumulator) error { + var ( + dbList []string + servtag string + ) + servtag, err := parseDSN(serv) + if err != nil { + servtag = "localhost" + } + + // if the list of databases if empty, then get all databases + if len(m.TableSchemaDatabases) == 0 { + rows, err := db.Query(dbListQuery) + if err != nil { + return err + } + defer rows.Close() + + var database string + for rows.Next() { + err = rows.Scan(&database) + if err != nil { + return err + } + + dbList = append(dbList, database) + } + } else { + dbList = m.TableSchemaDatabases + } + + for _, database := range dbList { + rows, err := db.Query(fmt.Sprintf(tableSchemaQuery, database)) + if err != nil { + return err + } + defer rows.Close() + var ( + tableSchema string + tableName string + tableType string + engine string + version float64 + rowFormat string + tableRows float64 + dataLength float64 + indexLength float64 + dataFree float64 + createOptions string + ) + for rows.Next() { + err = rows.Scan( + &tableSchema, + &tableName, + &tableType, + &engine, + &version, + &rowFormat, + &tableRows, + &dataLength, + &indexLength, + &dataFree, + &createOptions, + ) + if err != nil { + return err + } + tags := map[string]string{"server": servtag} + tags["schema"] = tableSchema + tags["table"] = tableName + + acc.Add(newNamespace("info_schema", "table_rows"), tableRows, tags) + + dlTags := copyTags(tags) + dlTags["component"] = "data_length" + acc.Add(newNamespace("info_schema", "table_size", "data_length"), dataLength, dlTags) + + ilTags := copyTags(tags) + ilTags["component"] = "index_length" + acc.Add(newNamespace("info_schema", "table_size", "index_length"), indexLength, ilTags) + + dfTags := copyTags(tags) + dfTags["component"] = "data_free" + acc.Add(newNamespace("info_schema", "table_size", "data_free"), dataFree, dfTags) + + versionTags := copyTags(tags) + versionTags["type"] = tableType + versionTags["engine"] = engine + versionTags["row_format"] = rowFormat + versionTags["create_options"] = createOptions + + acc.Add(newNamespace("info_schema", "table_version"), version, versionTags) + } + } + return nil +} + +// parseValue can be used to convert values such as "ON","OFF","Yes","No" to 0,1 +func parseValue(value sql.RawBytes) (float64, bool) { + if bytes.Compare(value, []byte("Yes")) == 0 || bytes.Compare(value, []byte("ON")) == 0 { + return 1, true + } + + if bytes.Compare(value, []byte("No")) == 0 || bytes.Compare(value, []byte("OFF")) == 0 { + return 0, false + } + n, err := 
strconv.ParseFloat(string(value), 64) + return n, err == nil +} + +// findThreadState can be used to find thread state by command and plain state +func findThreadState(rawCommand, rawState string) string { + var ( + // replace '_' symbol with space + command = strings.Replace(strings.ToLower(rawCommand), "_", " ", -1) + state = strings.Replace(strings.ToLower(rawState), "_", " ", -1) + ) + // if the state is already valid, then return it + if _, ok := generalThreadStates[state]; ok { + return state + } + + // if state is plain, return the mapping + if mappedState, ok := stateStatusMappings[state]; ok { + return mappedState + } + // if the state is any lock, return the special state + if strings.Contains(state, "waiting for") && strings.Contains(state, "lock") { + return "waiting for lock" + } + + if command == "sleep" && state == "" { + return "idle" + } + + if command == "query" { + return "executing" + } + + if command == "binlog dump" { + return "replication master" + } + // if no mappings found and state is invalid, then return "other" state + return "other" +} + +// newNamespace can be used to make a namespace +func newNamespace(words ...string) string { + return strings.Replace(strings.Join(words, "_"), " ", "_", -1) +} + +func copyTags(in map[string]string) map[string]string { + out := make(map[string]string) + for k, v := range in { + out[k] = v + } + return out +} + func init() { inputs.Add("mysql", func() telegraf.Input { return &Mysql{} diff --git a/plugins/inputs/mysql/mysql_test.go b/plugins/inputs/mysql/mysql_test.go index 9e4073432..989c21722 100644 --- a/plugins/inputs/mysql/mysql_test.go +++ b/plugins/inputs/mysql/mysql_test.go @@ -1,6 +1,7 @@ package mysql import ( + "database/sql" "fmt" "testing" @@ -115,3 +116,47 @@ func TestMysqlDNSAddTimeout(t *testing.T) { } } } + +func TestParseValue(t *testing.T) { + testCases := []struct { + rawByte sql.RawBytes + value float64 + boolValue bool + }{ + {sql.RawBytes("Yes"), 1, true}, + {sql.RawBytes("No"), 0, false}, + {sql.RawBytes("ON"), 1, true}, + {sql.RawBytes("OFF"), 0, false}, + {sql.RawBytes("ABC"), 0, false}, + } + for _, cases := range testCases { + if value, ok := parseValue(cases.rawByte); value != cases.value && ok != cases.boolValue { + t.Errorf("want %d with %t, got %d with %t", int(cases.value), cases.boolValue, int(value), ok) + } + } +} + +func TestNewNamespace(t *testing.T) { + testCases := []struct { + words []string + namespace string + }{ + { + []string{"thread", "info_scheme", "query update"}, + "thread_info_scheme_query_update", + }, + { + []string{"thread", "info_scheme", "query_update"}, + "thread_info_scheme_query_update", + }, + { + []string{"thread", "info", "scheme", "query", "update"}, + "thread_info_scheme_query_update", + }, + } + for _, cases := range testCases { + if got := newNamespace(cases.words...); got != cases.namespace { + t.Errorf("want %s, got %s", cases.namespace, got) + } + } +} diff --git a/plugins/inputs/phpfpm/phpfpm.go b/plugins/inputs/phpfpm/phpfpm.go index 169fe2194..3b23ef92c 100644 --- a/plugins/inputs/phpfpm/phpfpm.go +++ b/plugins/inputs/phpfpm/phpfpm.go @@ -184,6 +184,7 @@ func (g *phpfpm) gatherHttp(addr string, acc telegraf.Accumulator) error { return fmt.Errorf("Unable to connect to phpfpm status page '%s': %v", addr, err) } + defer res.Body.Close() if res.StatusCode != 200 { return fmt.Errorf("Unable to get valid stat result from '%s': %v", diff --git a/plugins/inputs/ping/ping.go b/plugins/inputs/ping/ping.go index 1798a5eb7..dfe67dc3f 100644 --- 
a/plugins/inputs/ping/ping.go +++ b/plugins/inputs/ping/ping.go @@ -9,15 +9,17 @@ import ( "strconv" "strings" "sync" + "time" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/plugins/inputs" ) // HostPinger is a function that runs the "ping" function using a list of // passed arguments. This can be easily switched with a mocked ping function // for unit test purposes (see ping_test.go) -type HostPinger func(args ...string) (string, error) +type HostPinger func(timeout float64, args ...string) (string, error) type Ping struct { // Interval at which to ping (ping -i ) @@ -43,18 +45,18 @@ func (_ *Ping) Description() string { return "Ping given url(s) and return statistics" } -var sampleConfig = ` +const sampleConfig = ` ## NOTE: this plugin forks the ping command. You may need to set capabilities ## via setcap cap_net_raw+p /bin/ping - + # ## urls to ping urls = ["www.google.com"] # required - ## number of pings to send (ping -c ) + ## number of pings to send per collection (ping -c ) count = 1 # required ## interval, in s, at which to ping. 0 == default (ping -i ) ping_interval = 0.0 - ## ping timeout, in s. 0 == no timeout (ping -t ) - timeout = 0.0 + ## ping timeout, in s. 0 == no timeout (ping -W ) + timeout = 1.0 ## interface to send ping from (ping -I ) interface = "" ` @@ -71,16 +73,16 @@ func (p *Ping) Gather(acc telegraf.Accumulator) error { // Spin off a go routine for each url to ping for _, url := range p.Urls { wg.Add(1) - go func(url string, acc telegraf.Accumulator) { + go func(u string) { defer wg.Done() - args := p.args(url) - out, err := p.pingHost(args...) + args := p.args(u) + out, err := p.pingHost(p.Timeout, args...) if err != nil { // Combine go err + stderr output errorChannel <- errors.New( strings.TrimSpace(out) + ", " + err.Error()) } - tags := map[string]string{"url": url} + tags := map[string]string{"url": u} trans, rec, avg, err := processPingOutput(out) if err != nil { // fatal error @@ -98,7 +100,7 @@ func (p *Ping) Gather(acc telegraf.Accumulator) error { fields["average_response_ms"] = avg } acc.AddFields("ping", fields, tags) - }(url, acc) + }(url) } wg.Wait() @@ -116,13 +118,14 @@ func (p *Ping) Gather(acc telegraf.Accumulator) error { return errors.New(strings.Join(errorStrings, "\n")) } -func hostPinger(args ...string) (string, error) { +func hostPinger(timeout float64, args ...string) (string, error) { bin, err := exec.LookPath("ping") if err != nil { return "", err } c := exec.Command(bin, args...) - out, err := c.CombinedOutput() + out, err := internal.CombinedOutputTimeout(c, + time.Second*time.Duration(timeout+1)) return string(out), err } diff --git a/plugins/inputs/ping/ping_test.go b/plugins/inputs/ping/ping_test.go index cd61a4fb2..25ecdf2fa 100644 --- a/plugins/inputs/ping/ping_test.go +++ b/plugins/inputs/ping/ping_test.go @@ -124,7 +124,7 @@ func TestArgs(t *testing.T) { "Expected: %s Actual: %s", expected, actual) } -func mockHostPinger(args ...string) (string, error) { +func mockHostPinger(timeout float64, args ...string) (string, error) { return linuxPingOutput, nil } @@ -161,7 +161,7 @@ PING www.google.com (216.58.218.164) 56(84) bytes of data. 
rtt min/avg/max/mdev = 35.225/44.033/51.806/5.325 ms ` -func mockLossyHostPinger(args ...string) (string, error) { +func mockLossyHostPinger(timeout float64, args ...string) (string, error) { return lossyPingOutput, nil } @@ -192,7 +192,7 @@ Request timeout for icmp_seq 0 2 packets transmitted, 0 packets received, 100.0% packet loss ` -func mockErrorHostPinger(args ...string) (string, error) { +func mockErrorHostPinger(timeout float64, args ...string) (string, error) { return errorPingOutput, errors.New("No packets received") } @@ -215,7 +215,7 @@ func TestBadPingGather(t *testing.T) { acc.AssertContainsTaggedFields(t, "ping", fields, tags) } -func mockFatalHostPinger(args ...string) (string, error) { +func mockFatalHostPinger(timeout float64, args ...string) (string, error) { return fatalPingOutput, errors.New("So very bad") } diff --git a/plugins/inputs/postgresql_extensible/postgresql_extensible.go b/plugins/inputs/postgresql_extensible/postgresql_extensible.go index 4ebf752ff..75bc6b936 100644 --- a/plugins/inputs/postgresql_extensible/postgresql_extensible.go +++ b/plugins/inputs/postgresql_extensible/postgresql_extensible.go @@ -4,6 +4,7 @@ import ( "bytes" "database/sql" "fmt" + "log" "regexp" "strings" @@ -15,24 +16,27 @@ import ( type Postgresql struct { Address string + Outputaddress string Databases []string OrderedColumns []string AllColumns []string AdditionalTags []string sanitizedAddress string Query []struct { - Sqlquery string - Version int - Withdbname bool - Tagvalue string + Sqlquery string + Version int + Withdbname bool + Tagvalue string + Measurement string } } type query []struct { - Sqlquery string - Version int - Withdbname bool - Tagvalue string + Sqlquery string + Version int + Withdbname bool + Tagvalue string + Measurement string } var ignoredColumns = map[string]bool{"datid": true, "datname": true, "stats_reset": true} @@ -55,6 +59,11 @@ var sampleConfig = ` ## databases are gathered. ## databases = ["app_production", "testing"] # + # outputaddress = "db01" + ## A custom name for the database that will be used as the "server" tag in the + ## measurement output. If not specified, a default one generated from + ## the connection address is used. + # ## Define the toml config where the sql queries are stored ## New queries can be added, if the withdbname is set to true and there is no ## databases defined in the 'databases field', the sql query is ended by a @@ -65,24 +74,28 @@ var sampleConfig = ` ## because the databases variable was set to ['postgres', 'pgbench' ] and the ## withdbname was true. Be careful that if the withdbname is set to false you ## don't have to define the where clause (aka with the dbname) the tagvalue - ## field is used to define custom tags (separated by comas) + ## field is used to define custom tags (separated by commas) + ## The optional "measurement" value can be used to override the default + ## output measurement name ("postgresql"). 
# ## Structure : ## [[inputs.postgresql_extensible.query]] ## sqlquery string ## version string ## withdbname boolean - ## tagvalue string (coma separated) + ## tagvalue string (comma separated) + ## measurement string [[inputs.postgresql_extensible.query]] sqlquery="SELECT * FROM pg_stat_database" version=901 withdbname=false tagvalue="" + measurement="" [[inputs.postgresql_extensible.query]] sqlquery="SELECT * FROM pg_stat_bgwriter" version=901 withdbname=false - tagvalue="" + tagvalue="postgresql.stats" ` func (p *Postgresql) SampleConfig() string { @@ -106,6 +119,7 @@ func (p *Postgresql) Gather(acc telegraf.Accumulator) error { var db_version int var query string var tag_value string + var meas_name string if p.Address == "" || p.Address == "localhost" { p.Address = localhost @@ -131,6 +145,11 @@ func (p *Postgresql) Gather(acc telegraf.Accumulator) error { for i := range p.Query { sql_query = p.Query[i].Sqlquery tag_value = p.Query[i].Tagvalue + if p.Query[i].Measurement != "" { + meas_name = p.Query[i].Measurement + } else { + meas_name = "postgresql" + } if p.Query[i].Withdbname { if len(p.Databases) != 0 { @@ -170,7 +189,7 @@ func (p *Postgresql) Gather(acc telegraf.Accumulator) error { } for rows.Next() { - err = p.accRow(rows, acc) + err = p.accRow(meas_name, rows, acc) if err != nil { return err } @@ -184,9 +203,12 @@ type scanner interface { Scan(dest ...interface{}) error } -var passwordKVMatcher, _ = regexp.Compile("password=\\S+ ?") +var KVMatcher, _ = regexp.Compile("(password|sslcert|sslkey|sslmode|sslrootcert)=\\S+ ?") func (p *Postgresql) SanitizedAddress() (_ string, err error) { + if p.Outputaddress != "" { + return p.Outputaddress, nil + } var canonicalizedAddress string if strings.HasPrefix(p.Address, "postgres://") || strings.HasPrefix(p.Address, "postgresql://") { canonicalizedAddress, err = pq.ParseURL(p.Address) @@ -196,12 +218,12 @@ func (p *Postgresql) SanitizedAddress() (_ string, err error) { } else { canonicalizedAddress = p.Address } - p.sanitizedAddress = passwordKVMatcher.ReplaceAllString(canonicalizedAddress, "") + p.sanitizedAddress = KVMatcher.ReplaceAllString(canonicalizedAddress, "") return p.sanitizedAddress, err } -func (p *Postgresql) accRow(row scanner, acc telegraf.Accumulator) error { +func (p *Postgresql) accRow(meas_name string, row scanner, acc telegraf.Accumulator) error { var columnVars []interface{} var dbname bytes.Buffer @@ -247,9 +269,11 @@ func (p *Postgresql) accRow(row scanner, acc telegraf.Accumulator) error { var isATag int fields := make(map[string]interface{}) for col, val := range columnMap { + if acc.Debug() { + log.Printf("postgresql_extensible: column: %s = %T: %s\n", col, *val, *val) + } _, ignore := ignoredColumns[col] - //if !ignore && *val != "" { - if !ignore { + if !ignore && *val != nil { isATag = 0 for tag := range p.AdditionalTags { if col == p.AdditionalTags[tag] { @@ -267,7 +291,7 @@ func (p *Postgresql) accRow(row scanner, acc telegraf.Accumulator) error { } } } - acc.AddFields("postgresql", fields, tags) + acc.AddFields(meas_name, fields, tags) return nil } diff --git a/plugins/inputs/procstat/procstat.go b/plugins/inputs/procstat/procstat.go index a0e63fd6f..2a55f8618 100644 --- a/plugins/inputs/procstat/procstat.go +++ b/plugins/inputs/procstat/procstat.go @@ -21,12 +21,16 @@ type Procstat struct { Prefix string User string + // pidmap maps a pid to a process object, so we don't recreate every gather pidmap map[int32]*process.Process + // tagmap maps a pid to a map of tags for that pid + tagmap 
map[int32]map[string]string } func NewProcstat() *Procstat { return &Procstat{ pidmap: make(map[int32]*process.Process), + tagmap: make(map[int32]map[string]string), } } @@ -61,8 +65,8 @@ func (p *Procstat) Gather(acc telegraf.Accumulator) error { log.Printf("Error: procstat getting process, exe: [%s] pidfile: [%s] pattern: [%s] user: [%s] %s", p.Exe, p.PidFile, p.Pattern, p.User, err.Error()) } else { - for _, proc := range p.pidmap { - p := NewSpecProcessor(p.Prefix, acc, proc) + for pid, proc := range p.pidmap { + p := NewSpecProcessor(p.Prefix, acc, proc, p.tagmap[pid]) p.pushMetrics() } } @@ -103,45 +107,50 @@ func (p *Procstat) getAllPids() ([]int32, error) { var err error if p.PidFile != "" { - pids, err = pidsFromFile(p.PidFile) + pids, err = p.pidsFromFile() } else if p.Exe != "" { - pids, err = pidsFromExe(p.Exe) + pids, err = p.pidsFromExe() } else if p.Pattern != "" { - pids, err = pidsFromPattern(p.Pattern) + pids, err = p.pidsFromPattern() } else if p.User != "" { - pids, err = pidsFromUser(p.User) + pids, err = p.pidsFromUser() } else { - err = fmt.Errorf("Either exe, pid_file or pattern has to be specified") + err = fmt.Errorf("Either exe, pid_file, user, or pattern has to be specified") } return pids, err } -func pidsFromFile(file string) ([]int32, error) { +func (p *Procstat) pidsFromFile() ([]int32, error) { var out []int32 var outerr error - pidString, err := ioutil.ReadFile(file) + pidString, err := ioutil.ReadFile(p.PidFile) if err != nil { - outerr = fmt.Errorf("Failed to read pidfile '%s'. Error: '%s'", file, err) + outerr = fmt.Errorf("Failed to read pidfile '%s'. Error: '%s'", + p.PidFile, err) } else { pid, err := strconv.Atoi(strings.TrimSpace(string(pidString))) if err != nil { outerr = err } else { out = append(out, int32(pid)) + p.tagmap[int32(pid)] = map[string]string{ + "pidfile": p.PidFile, + "pid": strings.TrimSpace(string(pidString)), + } } } return out, outerr } -func pidsFromExe(exe string) ([]int32, error) { +func (p *Procstat) pidsFromExe() ([]int32, error) { var out []int32 var outerr error bin, err := exec.LookPath("pgrep") if err != nil { return out, fmt.Errorf("Couldn't find pgrep binary: %s", err) } - pgrep, err := exec.Command(bin, exe).Output() + pgrep, err := exec.Command(bin, p.Exe).Output() if err != nil { return out, fmt.Errorf("Failed to execute %s. Error: '%s'", bin, err) } else { @@ -150,6 +159,10 @@ func pidsFromExe(exe string) ([]int32, error) { ipid, err := strconv.Atoi(pid) if err == nil { out = append(out, int32(ipid)) + p.tagmap[int32(ipid)] = map[string]string{ + "exe": p.Exe, + "pid": pid, + } } else { outerr = err } @@ -158,14 +171,14 @@ func pidsFromExe(exe string) ([]int32, error) { return out, outerr } -func pidsFromPattern(pattern string) ([]int32, error) { +func (p *Procstat) pidsFromPattern() ([]int32, error) { var out []int32 var outerr error bin, err := exec.LookPath("pgrep") if err != nil { return out, fmt.Errorf("Couldn't find pgrep binary: %s", err) } - pgrep, err := exec.Command(bin, "-f", pattern).Output() + pgrep, err := exec.Command(bin, "-f", p.Pattern).Output() if err != nil { return out, fmt.Errorf("Failed to execute %s. 
Error: '%s'", bin, err) } else { @@ -174,6 +187,10 @@ func pidsFromPattern(pattern string) ([]int32, error) { ipid, err := strconv.Atoi(pid) if err == nil { out = append(out, int32(ipid)) + p.tagmap[int32(ipid)] = map[string]string{ + "pattern": p.Pattern, + "pid": pid, + } } else { outerr = err } @@ -182,14 +199,14 @@ func pidsFromPattern(pattern string) ([]int32, error) { return out, outerr } -func pidsFromUser(user string) ([]int32, error) { +func (p *Procstat) pidsFromUser() ([]int32, error) { var out []int32 var outerr error bin, err := exec.LookPath("pgrep") if err != nil { return out, fmt.Errorf("Couldn't find pgrep binary: %s", err) } - pgrep, err := exec.Command(bin, "-u", user).Output() + pgrep, err := exec.Command(bin, "-u", p.User).Output() if err != nil { return out, fmt.Errorf("Failed to execute %s. Error: '%s'", bin, err) } else { @@ -198,6 +215,10 @@ func pidsFromUser(user string) ([]int32, error) { ipid, err := strconv.Atoi(pid) if err == nil { out = append(out, int32(ipid)) + p.tagmap[int32(ipid)] = map[string]string{ + "user": p.User, + "pid": pid, + } } else { outerr = err } diff --git a/plugins/inputs/procstat/procstat_test.go b/plugins/inputs/procstat/procstat_test.go index bf5790f67..ccc72bdbb 100644 --- a/plugins/inputs/procstat/procstat_test.go +++ b/plugins/inputs/procstat/procstat_test.go @@ -25,6 +25,7 @@ func TestGather(t *testing.T) { PidFile: file.Name(), Prefix: "foo", pidmap: make(map[int32]*process.Process), + tagmap: make(map[int32]map[string]string), } p.Gather(&acc) assert.True(t, acc.HasFloatField("procstat", "foo_cpu_time_user")) diff --git a/plugins/inputs/procstat/spec_processor.go b/plugins/inputs/procstat/spec_processor.go index bb248f003..90503f930 100644 --- a/plugins/inputs/procstat/spec_processor.go +++ b/plugins/inputs/procstat/spec_processor.go @@ -1,7 +1,6 @@ package procstat import ( - "fmt" "time" "github.com/shirou/gopsutil/process" @@ -17,28 +16,12 @@ type SpecProcessor struct { proc *process.Process } -func (p *SpecProcessor) add(metric string, value interface{}) { - var mname string - if p.Prefix == "" { - mname = metric - } else { - mname = p.Prefix + "_" + metric - } - p.fields[mname] = value -} - -func (p *SpecProcessor) flush() { - p.acc.AddFields("procstat", p.fields, p.tags) - p.fields = make(map[string]interface{}) -} - func NewSpecProcessor( prefix string, acc telegraf.Accumulator, p *process.Process, + tags map[string]string, ) *SpecProcessor { - tags := make(map[string]string) - tags["pid"] = fmt.Sprintf("%v", p.Pid) if name, err := p.Name(); err == nil { tags["process_name"] = name } @@ -52,90 +35,62 @@ func NewSpecProcessor( } func (p *SpecProcessor) pushMetrics() { - p.pushNThreadsStats() - p.pushFDStats() - p.pushCtxStats() - p.pushIOStats() - p.pushCPUStats() - p.pushMemoryStats() - p.flush() -} + var prefix string + if p.Prefix != "" { + prefix = p.Prefix + "_" + } + fields := map[string]interface{}{} -func (p *SpecProcessor) pushNThreadsStats() error { numThreads, err := p.proc.NumThreads() - if err != nil { - return fmt.Errorf("NumThreads error: %s\n", err) + if err == nil { + fields[prefix+"num_threads"] = numThreads } - p.add("num_threads", numThreads) - return nil -} -func (p *SpecProcessor) pushFDStats() error { fds, err := p.proc.NumFDs() - if err != nil { - return fmt.Errorf("NumFD error: %s\n", err) + if err == nil { + fields[prefix+"num_fds"] = fds } - p.add("num_fds", fds) - return nil -} -func (p *SpecProcessor) pushCtxStats() error { ctx, err := p.proc.NumCtxSwitches() - if err != nil { - return 
fmt.Errorf("ContextSwitch error: %s\n", err) + if err == nil { + fields[prefix+"voluntary_context_switches"] = ctx.Voluntary + fields[prefix+"involuntary_context_switches"] = ctx.Involuntary } - p.add("voluntary_context_switches", ctx.Voluntary) - p.add("involuntary_context_switches", ctx.Involuntary) - return nil -} -func (p *SpecProcessor) pushIOStats() error { io, err := p.proc.IOCounters() - if err != nil { - return fmt.Errorf("IOCounters error: %s\n", err) + if err == nil { + fields[prefix+"read_count"] = io.ReadCount + fields[prefix+"write_count"] = io.WriteCount + fields[prefix+"read_bytes"] = io.ReadBytes + fields[prefix+"write_bytes"] = io.WriteCount } - p.add("read_count", io.ReadCount) - p.add("write_count", io.WriteCount) - p.add("read_bytes", io.ReadBytes) - p.add("write_bytes", io.WriteCount) - return nil -} -func (p *SpecProcessor) pushCPUStats() error { cpu_time, err := p.proc.CPUTimes() - if err != nil { - return err + if err == nil { + fields[prefix+"cpu_time_user"] = cpu_time.User + fields[prefix+"cpu_time_system"] = cpu_time.System + fields[prefix+"cpu_time_idle"] = cpu_time.Idle + fields[prefix+"cpu_time_nice"] = cpu_time.Nice + fields[prefix+"cpu_time_iowait"] = cpu_time.Iowait + fields[prefix+"cpu_time_irq"] = cpu_time.Irq + fields[prefix+"cpu_time_soft_irq"] = cpu_time.Softirq + fields[prefix+"cpu_time_steal"] = cpu_time.Steal + fields[prefix+"cpu_time_stolen"] = cpu_time.Stolen + fields[prefix+"cpu_time_guest"] = cpu_time.Guest + fields[prefix+"cpu_time_guest_nice"] = cpu_time.GuestNice } - p.add("cpu_time_user", cpu_time.User) - p.add("cpu_time_system", cpu_time.System) - p.add("cpu_time_idle", cpu_time.Idle) - p.add("cpu_time_nice", cpu_time.Nice) - p.add("cpu_time_iowait", cpu_time.Iowait) - p.add("cpu_time_irq", cpu_time.Irq) - p.add("cpu_time_soft_irq", cpu_time.Softirq) - p.add("cpu_time_steal", cpu_time.Steal) - p.add("cpu_time_stolen", cpu_time.Stolen) - p.add("cpu_time_guest", cpu_time.Guest) - p.add("cpu_time_guest_nice", cpu_time.GuestNice) cpu_perc, err := p.proc.CPUPercent(time.Duration(0)) - if err != nil { - return err - } else if cpu_perc == 0 { - return nil + if err == nil && cpu_perc != 0 { + fields[prefix+"cpu_usage"] = cpu_perc } - p.add("cpu_usage", cpu_perc) - return nil -} - -func (p *SpecProcessor) pushMemoryStats() error { mem, err := p.proc.MemoryInfo() - if err != nil { - return err + if err == nil { + fields[prefix+"memory_rss"] = mem.RSS + fields[prefix+"memory_vms"] = mem.VMS + fields[prefix+"memory_swap"] = mem.Swap } - p.add("memory_rss", mem.RSS) - p.add("memory_vms", mem.VMS) - p.add("memory_swap", mem.Swap) - return nil + + p.acc.AddFields("procstat", fields, p.tags) } diff --git a/plugins/inputs/prometheus/README.md b/plugins/inputs/prometheus/README.md index c5c952515..3aa8c8afd 100644 --- a/plugins/inputs/prometheus/README.md +++ b/plugins/inputs/prometheus/README.md @@ -22,7 +22,7 @@ to filter and some tags # An array of urls to scrape metrics from. 
urls = ["http://my-kube-apiserver:8080/metrics"] # Get only metrics with "apiserver_" string is in metric name - namepass = ["apiserver_"] + namepass = ["apiserver_*"] # Add a metric name prefix name_prefix = "k8s_" # Add tags to be able to make beautiful dashboards diff --git a/plugins/inputs/rethinkdb/rethinkdb_server.go b/plugins/inputs/rethinkdb/rethinkdb_server.go index 98e2a35f0..f172717d1 100644 --- a/plugins/inputs/rethinkdb/rethinkdb_server.go +++ b/plugins/inputs/rethinkdb/rethinkdb_server.go @@ -97,8 +97,8 @@ func (s *Server) getServerStatus() error { func (s *Server) getDefaultTags() map[string]string { tags := make(map[string]string) - tags["host"] = s.Url.Host - tags["hostname"] = s.serverStatus.Network.Hostname + tags["rethinkdb_host"] = s.Url.Host + tags["rethinkdb_hostname"] = s.serverStatus.Network.Hostname return tags } diff --git a/plugins/inputs/rethinkdb/rethinkdb_server_test.go b/plugins/inputs/rethinkdb/rethinkdb_server_test.go index c4b644222..82ff29280 100644 --- a/plugins/inputs/rethinkdb/rethinkdb_server_test.go +++ b/plugins/inputs/rethinkdb/rethinkdb_server_test.go @@ -20,8 +20,8 @@ func TestGetDefaultTags(t *testing.T) { in string out string }{ - {"host", server.Url.Host}, - {"hostname", server.serverStatus.Network.Hostname}, + {"rethinkdb_host", server.Url.Host}, + {"rethinkdb_hostname", server.serverStatus.Network.Hostname}, } defaultTags := server.getDefaultTags() for _, tt := range tagTests { diff --git a/plugins/inputs/snmp/snmp.go b/plugins/inputs/snmp/snmp.go index 4c2de93c9..8ccfe100b 100644 --- a/plugins/inputs/snmp/snmp.go +++ b/plugins/inputs/snmp/snmp.go @@ -398,15 +398,16 @@ func (s *Snmp) Gather(acc telegraf.Accumulator) error { // only if len(s.OidInstanceMapping) == 0 if len(OidInstanceMapping) >= 0 { if err := host.SNMPMap(acc, s.nameToOid, s.subTableMap); err != nil { - return err + log.Printf("SNMP Mapping error for host '%s': %s", host.Address, err) + continue } } // Launch Get requests if err := host.SNMPGet(acc, s.initNode); err != nil { - return err + log.Printf("SNMP Error for host '%s': %s", host.Address, err) } if err := host.SNMPBulk(acc, s.initNode); err != nil { - return err + log.Printf("SNMP Error for host '%s': %s", host.Address, err) } } return nil @@ -732,7 +733,11 @@ func (h *Host) HandleResponse(oids map[string]Data, result *gosnmp.SnmpPacket, a break nextresult } } - if strings.HasPrefix(variable.Name, oid_key) { + // If variable.Name is the same as oid_key + // OR + // the result is SNMP table which "." comes right after oid_key. 
+ // ex: oid_key: .1.3.6.1.2.1.2.2.1.16, variable.Name: .1.3.6.1.2.1.2.2.1.16.1 + if variable.Name == oid_key || strings.HasPrefix(variable.Name, oid_key+".") { switch variable.Type { // handle Metrics case gosnmp.Boolean, gosnmp.Integer, gosnmp.Counter32, gosnmp.Gauge32, diff --git a/plugins/inputs/snmp/snmp_test.go b/plugins/inputs/snmp/snmp_test.go index 22414fb79..5822926dd 100644 --- a/plugins/inputs/snmp/snmp_test.go +++ b/plugins/inputs/snmp/snmp_test.go @@ -5,7 +5,7 @@ import ( "github.com/influxdata/telegraf/testutil" - // "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -45,7 +45,8 @@ func TestSNMPErrorGet2(t *testing.T) { var acc testutil.Accumulator err := s.Gather(&acc) - require.Error(t, err) + require.NoError(t, err) + assert.Equal(t, 0, len(acc.Metrics)) } func TestSNMPErrorBulk(t *testing.T) { @@ -65,7 +66,8 @@ func TestSNMPErrorBulk(t *testing.T) { var acc testutil.Accumulator err := s.Gather(&acc) - require.Error(t, err) + require.NoError(t, err) + assert.Equal(t, 0, len(acc.Metrics)) } func TestSNMPGet1(t *testing.T) { diff --git a/plugins/inputs/sqlserver/sqlserver.go b/plugins/inputs/sqlserver/sqlserver.go index 58d61705f..f91e66c24 100644 --- a/plugins/inputs/sqlserver/sqlserver.go +++ b/plugins/inputs/sqlserver/sqlserver.go @@ -1052,7 +1052,7 @@ SELECT When 1073874176 Then IsNull(Cast(cc.cntr_value - pc.cntr_value as Money) / NullIf(cbc.cntr_value - pbc.cntr_value, 0), 0) -- Avg When 272696320 Then IsNull(Cast(cc.cntr_value - pc.cntr_value as Money) / NullIf(cbc.cntr_value - pbc.cntr_value, 0), 0) -- Avg/sec When 1073939712 Then cc.cntr_value - pc.cntr_value -- Base - Else cc.cntr_value End as int) + Else cc.cntr_value End as bigint) --, currentvalue= CAST(cc.cntr_value as bigint) FROM #CCounters cc INNER JOIN #PCounters pc On cc.object_name = pc.object_name diff --git a/plugins/inputs/statsd/README.md b/plugins/inputs/statsd/README.md index 8722ce1e9..ba0c8e746 100644 --- a/plugins/inputs/statsd/README.md +++ b/plugins/inputs/statsd/README.md @@ -18,10 +18,10 @@ ## Percentiles to calculate for timing & histogram stats percentiles = [90] - ## convert measurement names, "." to "_" and "-" to "__" - convert_names = true + ## separator to use between elements of a statsd metric + metric_separator = "_" - ## Parses tags in DataDog's dogstatsd format + ## Parses tags in the datadog statsd format ## http://docs.datadoghq.com/guides/dogstatsd/ parse_data_dog_tags = false @@ -39,10 +39,6 @@ ## calculation of percentiles. Raising this limit increases the accuracy ## of percentiles but also increases the memory usage and cpu time. percentile_limit = 1000 - - ## UDP packet size for the server to listen for. This will depend on the size - ## of the packets that the client is sending, which is usually 1500 bytes. - udp_packet_size = 1500 ``` ### Description diff --git a/plugins/inputs/sysstat/sysstat.go b/plugins/inputs/sysstat/sysstat.go index c8c17ac45..c55516716 100644 --- a/plugins/inputs/sysstat/sysstat.go +++ b/plugins/inputs/sysstat/sysstat.go @@ -17,6 +17,7 @@ import ( "time" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/plugins/inputs" ) @@ -98,31 +99,34 @@ var sampleConfig = ` # group = true # # - ## Options for the sadf command. The values on the left represent the sadf options and - ## the values on the right their description (wich are used for grouping and prefixing metrics). + ## Options for the sadf command. 
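Back in the snmp.go hunk above, the bare strings.HasPrefix check is replaced by an exact match or a prefix match that requires a "." immediately after the configured OID, so a column such as .1.3.6.1.2.1.2.2.1.1 no longer also captures results from .1.3.6.1.2.1.2.2.1.16. A small self-contained illustration of the difference, using the OIDs from the patch comment:

```go
package main

import (
	"fmt"
	"strings"
)

// matchesOID reports whether name belongs to the subtree rooted at oidKey,
// using the boundary-aware check introduced in snmp.go.
func matchesOID(name, oidKey string) bool {
	return name == oidKey || strings.HasPrefix(name, oidKey+".")
}

func main() {
	oidKey := ".1.3.6.1.2.1.2.2.1.1"
	names := []string{
		".1.3.6.1.2.1.2.2.1.1.1",  // genuine child of oidKey
		".1.3.6.1.2.1.2.2.1.16.1", // child of a different column (...1.16)
	}
	for _, n := range names {
		fmt.Printf("naive prefix: %-5v boundary-aware: %-5v (%s)\n",
			strings.HasPrefix(n, oidKey), matchesOID(n, oidKey), n)
	}
}
```

The naive check reports true for both names, which is the collision the fix removes; the boundary-aware check only accepts the first.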
The values on the left represent the sadf + ## options and the values on the right their description (wich are used for + ## grouping and prefixing metrics). ## - ## Run 'sar -h' or 'man sar' to find out the supported options for your sysstat version. + ## Run 'sar -h' or 'man sar' to find out the supported options for your + ## sysstat version. [inputs.sysstat.options] - -C = "cpu" - -B = "paging" - -b = "io" - -d = "disk" # requires DISK activity - "-n ALL" = "network" - "-P ALL" = "per_cpu" - -q = "queue" - -R = "mem" - -r = "mem_util" - -S = "swap_util" - -u = "cpu_util" - -v = "inode" - -W = "swap" - -w = "task" - # -H = "hugepages" # only available for newer linux distributions - # "-I ALL" = "interrupts" # requires INT activity + -C = "cpu" + -B = "paging" + -b = "io" + -d = "disk" # requires DISK activity + "-n ALL" = "network" + "-P ALL" = "per_cpu" + -q = "queue" + -R = "mem" + -r = "mem_util" + -S = "swap_util" + -u = "cpu_util" + -v = "inode" + -W = "swap" + -w = "task" + # -H = "hugepages" # only available for newer linux distributions + # "-I ALL" = "interrupts" # requires INT activity # # - ## Device tags can be used to add additional tags for devices. For example the configuration below - ## adds a tag vg with value rootvg for all metrics with sda devices. + ## Device tags can be used to add additional tags for devices. + ## For example the configuration below adds a tag vg with value rootvg for + ## all metrics with sda devices. # [[inputs.sysstat.device_tags.sda]] # vg = "rootvg" ` @@ -174,24 +178,28 @@ func (s *Sysstat) Gather(acc telegraf.Accumulator) error { return errors.New(strings.Join(errorStrings, "\n")) } -// collect collects sysstat data with the collector utility sadc. It runs the following command: +// collect collects sysstat data with the collector utility sadc. +// It runs the following command: // Sadc -S -S ... 2 tmpFile -// The above command collects system metrics during and saves it in binary form to tmpFile. +// The above command collects system metrics during and +// saves it in binary form to tmpFile. func (s *Sysstat) collect() error { options := []string{} for _, act := range s.Activities { options = append(options, "-S", act) } s.tmpFile = path.Join("/tmp", fmt.Sprintf("sysstat-%d", time.Now().Unix())) - collectInterval := s.interval - parseInterval // collectInterval has to be smaller than the telegraf data collection interval + // collectInterval has to be smaller than the telegraf data collection interval + collectInterval := s.interval - parseInterval - if collectInterval < 0 { // If true, interval is not defined yet and Gather is run for the first time. + // If true, interval is not defined yet and Gather is run for the first time. + if collectInterval < 0 { collectInterval = 1 // In that case we only collect for 1 second. } options = append(options, strconv.Itoa(collectInterval), "2", s.tmpFile) cmd := execCommand(s.Sadc, options...) 
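The next hunk swaps cmd.CombinedOutput() for internal.CombinedOutputTimeout(cmd, time.Second*5), part of the wider change that wraps every exec-based runner with a timeout so a hung sadc/sadf cannot stall the agent. The sketch below is not the actual internal package implementation, just a minimal standard-library version of the same idea; the sleep command and the 2-second timeout are placeholder values.

```go
package main

import (
	"bytes"
	"fmt"
	"os/exec"
	"time"
)

// combinedOutputTimeout runs cmd and returns its combined output, killing the
// process if it has not finished within the given timeout.
func combinedOutputTimeout(cmd *exec.Cmd, timeout time.Duration) ([]byte, error) {
	var buf bytes.Buffer
	cmd.Stdout = &buf
	cmd.Stderr = &buf
	if err := cmd.Start(); err != nil {
		return nil, err
	}

	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	select {
	case err := <-done:
		return buf.Bytes(), err
	case <-time.After(timeout):
		cmd.Process.Kill() // best effort; the pending Wait() observes the kill
		return buf.Bytes(), fmt.Errorf("command %q timed out after %s", cmd.Path, timeout)
	}
}

func main() {
	out, err := combinedOutputTimeout(exec.Command("sleep", "10"), 2*time.Second)
	fmt.Printf("out=%q err=%v\n", out, err)
}
```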
- out, err := cmd.CombinedOutput() + out, err := internal.CombinedOutputTimeout(cmd, time.Second*5) if err != nil { return fmt.Errorf("failed to run command %s: %s", strings.Join(cmd.Args, " "), string(out)) } @@ -279,8 +287,9 @@ func (s *Sysstat) parse(acc telegraf.Accumulator, option string, ts time.Time) e acc.AddFields(measurement, v.fields, v.tags, ts) } } - if err := cmd.Wait(); err != nil { - return fmt.Errorf("command %s failed with %s", strings.Join(cmd.Args, " "), err) + if err := internal.WaitTimeout(cmd, time.Second*5); err != nil { + return fmt.Errorf("command %s failed with %s", + strings.Join(cmd.Args, " "), err) } return nil } diff --git a/plugins/inputs/system/system.go b/plugins/inputs/system/system.go index 42b0310a4..55e606225 100644 --- a/plugins/inputs/system/system.go +++ b/plugins/inputs/system/system.go @@ -4,6 +4,7 @@ import ( "bufio" "bytes" "fmt" + "runtime" "github.com/shirou/gopsutil/host" "github.com/shirou/gopsutil/load" @@ -43,6 +44,7 @@ func (_ *SystemStats) Gather(acc telegraf.Accumulator) error { "uptime": hostinfo.Uptime, "n_users": len(users), "uptime_format": format_uptime(hostinfo.Uptime), + "n_cpus": runtime.NumCPU(), } acc.AddFields("system", fields, nil) diff --git a/plugins/inputs/tail/README.md b/plugins/inputs/tail/README.md new file mode 100644 index 000000000..9ae120e91 --- /dev/null +++ b/plugins/inputs/tail/README.md @@ -0,0 +1,46 @@ +# tail Input Plugin + +The tail plugin "tails" a logfile and parses each log message. + +By default, the tail plugin acts like the following unix tail command: + +``` +tail -F --lines=0 myfile.log +``` + +- `-F` means that it will follow the _name_ of the given file, so +that it will be compatible with log-rotated files, and that it will retry on +inaccessible files. +- `--lines=0` means that it will start at the end of the file (unless +the `from_beginning` option is set). + +see http://man7.org/linux/man-pages/man1/tail.1.html for more details. + +The plugin expects messages in one of the +[Telegraf Input Data Formats](https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md). + +### Configuration: + +```toml +# Stream a log file, like the tail -f command +[[inputs.tail]] + ## files to tail. + ## These accept standard unix glob matching rules, but with the addition of + ## ** as a "super asterisk". ie: + ## "/var/log/**.log" -> recursively find all .log files in /var/log + ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log + ## "/var/log/apache.log" -> just tail the apache log file + ## + ## See https://github.com/gobwas/glob for more examples + ## + files = ["/var/mymetrics.out"] + ## Read file from beginning. + from_beginning = false + + ## Data format to consume. 
+ ## Each data format has it's own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" +``` + diff --git a/plugins/inputs/tail/tail.go b/plugins/inputs/tail/tail.go new file mode 100644 index 000000000..7cfca81e2 --- /dev/null +++ b/plugins/inputs/tail/tail.go @@ -0,0 +1,156 @@ +package tail + +import ( + "fmt" + "log" + "sync" + + "github.com/hpcloud/tail" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal/globpath" + "github.com/influxdata/telegraf/plugins/inputs" + "github.com/influxdata/telegraf/plugins/parsers" +) + +type Tail struct { + Files []string + FromBeginning bool + + tailers []*tail.Tail + parser parsers.Parser + wg sync.WaitGroup + acc telegraf.Accumulator + + sync.Mutex +} + +func NewTail() *Tail { + return &Tail{ + FromBeginning: false, + } +} + +const sampleConfig = ` + ## files to tail. + ## These accept standard unix glob matching rules, but with the addition of + ## ** as a "super asterisk". ie: + ## "/var/log/**.log" -> recursively find all .log files in /var/log + ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log + ## "/var/log/apache.log" -> just tail the apache log file + ## + ## See https://github.com/gobwas/glob for more examples + ## + files = ["/var/mymetrics.out"] + ## Read file from beginning. + from_beginning = false + + ## Data format to consume. + ## Each data format has it's own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" +` + +func (t *Tail) SampleConfig() string { + return sampleConfig +} + +func (t *Tail) Description() string { + return "Stream a log file, like the tail -f command" +} + +func (t *Tail) Gather(acc telegraf.Accumulator) error { + return nil +} + +func (t *Tail) Start(acc telegraf.Accumulator) error { + t.Lock() + defer t.Unlock() + + t.acc = acc + + var seek tail.SeekInfo + if !t.FromBeginning { + seek.Whence = 2 + seek.Offset = 0 + } + + var errS string + // Create a "tailer" for each file + for _, filepath := range t.Files { + g, err := globpath.Compile(filepath) + if err != nil { + log.Printf("ERROR Glob %s failed to compile, %s", filepath, err) + } + for file, _ := range g.Match() { + tailer, err := tail.TailFile(file, + tail.Config{ + ReOpen: true, + Follow: true, + Location: &seek, + }) + if err != nil { + errS += err.Error() + " " + continue + } + // create a goroutine for each "tailer" + go t.receiver(tailer) + t.tailers = append(t.tailers, tailer) + } + } + + if errS != "" { + return fmt.Errorf(errS) + } + return nil +} + +// this is launched as a goroutine to continuously watch a tailed logfile +// for changes, parse any incoming msgs, and add to the accumulator. 
+func (t *Tail) receiver(tailer *tail.Tail) { + t.wg.Add(1) + defer t.wg.Done() + + var m telegraf.Metric + var err error + var line *tail.Line + for line = range tailer.Lines { + if line.Err != nil { + log.Printf("ERROR tailing file %s, Error: %s\n", + tailer.Filename, err) + continue + } + m, err = t.parser.ParseLine(line.Text) + if err == nil { + t.acc.AddFields(m.Name(), m.Fields(), m.Tags(), m.Time()) + } else { + log.Printf("Malformed log line in %s: [%s], Error: %s\n", + tailer.Filename, line.Text, err) + } + } +} + +func (t *Tail) Stop() { + t.Lock() + defer t.Unlock() + + for _, t := range t.tailers { + err := t.Stop() + if err != nil { + log.Printf("ERROR stopping tail on file %s\n", t.Filename) + } + t.Cleanup() + } + t.wg.Wait() +} + +func (t *Tail) SetParser(parser parsers.Parser) { + t.parser = parser +} + +func init() { + inputs.Add("tail", func() telegraf.Input { + return NewTail() + }) +} diff --git a/plugins/inputs/tail/tail_test.go b/plugins/inputs/tail/tail_test.go new file mode 100644 index 000000000..5d7c04a88 --- /dev/null +++ b/plugins/inputs/tail/tail_test.go @@ -0,0 +1,101 @@ +package tail + +import ( + "io/ioutil" + "os" + "testing" + "time" + + "github.com/influxdata/telegraf/plugins/parsers" + "github.com/influxdata/telegraf/testutil" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestTailFromBeginning(t *testing.T) { + tmpfile, err := ioutil.TempFile("", "") + require.NoError(t, err) + defer os.Remove(tmpfile.Name()) + + tt := NewTail() + tt.FromBeginning = true + tt.Files = []string{tmpfile.Name()} + p, _ := parsers.NewInfluxParser() + tt.SetParser(p) + defer tt.Stop() + defer tmpfile.Close() + + acc := testutil.Accumulator{} + require.NoError(t, tt.Start(&acc)) + + _, err = tmpfile.WriteString("cpu,mytag=foo usage_idle=100\n") + require.NoError(t, err) + require.NoError(t, tt.Gather(&acc)) + time.Sleep(time.Millisecond * 50) + + acc.AssertContainsTaggedFields(t, "cpu", + map[string]interface{}{ + "usage_idle": float64(100), + }, + map[string]string{ + "mytag": "foo", + }) +} + +func TestTailFromEnd(t *testing.T) { + tmpfile, err := ioutil.TempFile("", "") + require.NoError(t, err) + defer os.Remove(tmpfile.Name()) + _, err = tmpfile.WriteString("cpu,mytag=foo usage_idle=100\n") + require.NoError(t, err) + + tt := NewTail() + tt.Files = []string{tmpfile.Name()} + p, _ := parsers.NewInfluxParser() + tt.SetParser(p) + defer tt.Stop() + defer tmpfile.Close() + + acc := testutil.Accumulator{} + require.NoError(t, tt.Start(&acc)) + time.Sleep(time.Millisecond * 100) + + _, err = tmpfile.WriteString("cpu,othertag=foo usage_idle=100\n") + require.NoError(t, err) + require.NoError(t, tt.Gather(&acc)) + time.Sleep(time.Millisecond * 50) + + acc.AssertContainsTaggedFields(t, "cpu", + map[string]interface{}{ + "usage_idle": float64(100), + }, + map[string]string{ + "othertag": "foo", + }) + assert.Len(t, acc.Metrics, 1) +} + +func TestTailBadLine(t *testing.T) { + tmpfile, err := ioutil.TempFile("", "") + require.NoError(t, err) + defer os.Remove(tmpfile.Name()) + + tt := NewTail() + tt.FromBeginning = true + tt.Files = []string{tmpfile.Name()} + p, _ := parsers.NewInfluxParser() + tt.SetParser(p) + defer tt.Stop() + defer tmpfile.Close() + + acc := testutil.Accumulator{} + require.NoError(t, tt.Start(&acc)) + + _, err = tmpfile.WriteString("cpu mytag= foo usage_idle= 100\n") + require.NoError(t, err) + require.NoError(t, tt.Gather(&acc)) + time.Sleep(time.Millisecond * 50) + + assert.Len(t, acc.Metrics, 0) +} diff --git 
a/plugins/inputs/tcp_listener/README.md b/plugins/inputs/tcp_listener/README.md index d2dfeb575..0066ea801 100644 --- a/plugins/inputs/tcp_listener/README.md +++ b/plugins/inputs/tcp_listener/README.md @@ -22,8 +22,7 @@ This is a sample configuration for the plugin. ## Maximum number of concurrent TCP connections to allow max_tcp_connections = 250 - ## Data format to consume. - + ## Data format to consume. ## Each data format has it's own unique set of configuration options, read ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md diff --git a/plugins/inputs/tcp_listener/tcp_listener.go b/plugins/inputs/tcp_listener/tcp_listener.go index b7f5ef9ed..a420ed759 100644 --- a/plugins/inputs/tcp_listener/tcp_listener.go +++ b/plugins/inputs/tcp_listener/tcp_listener.go @@ -150,8 +150,7 @@ func (t *TcpListener) tcpListen() error { if err != nil { return err } - - log.Printf("Received TCP Connection from %s", conn.RemoteAddr()) + // log.Printf("Received TCP Connection from %s", conn.RemoteAddr()) select { case <-t.accept: @@ -187,7 +186,7 @@ func (t *TcpListener) handler(conn *net.TCPConn, id string) { defer func() { t.wg.Done() conn.Close() - log.Printf("Closed TCP Connection from %s", conn.RemoteAddr()) + // log.Printf("Closed TCP Connection from %s", conn.RemoteAddr()) // Add one connection potential back to channel when this one closes t.accept <- true t.forget(id) @@ -222,7 +221,10 @@ func (t *TcpListener) handler(conn *net.TCPConn, id string) { // tcpParser parses the incoming tcp byte packets func (t *TcpListener) tcpParser() error { defer t.wg.Done() + var packet []byte + var metrics []telegraf.Metric + var err error for { select { case <-t.done: @@ -231,7 +233,7 @@ func (t *TcpListener) tcpParser() error { if len(packet) == 0 { continue } - metrics, err := t.parser.Parse(packet) + metrics, err = t.parser.Parse(packet) if err == nil { t.storeMetrics(metrics) } else { diff --git a/plugins/inputs/udp_listener/README.md b/plugins/inputs/udp_listener/README.md index 1dd03a2a7..ee675f535 100644 --- a/plugins/inputs/udp_listener/README.md +++ b/plugins/inputs/udp_listener/README.md @@ -18,13 +18,7 @@ This is a sample configuration for the plugin. ## UDP listener will start dropping packets. allowed_pending_messages = 10000 - ## UDP packet size for the server to listen for. This will depend - ## on the size of the packets that the client is sending, which is - ## usually 1500 bytes. - udp_packet_size = 1500 - - ## Data format to consume. - + ## Data format to consume. 
## Each data format has it's own unique set of configuration options, read ## more about them here: ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md diff --git a/plugins/inputs/udp_listener/udp_listener.go b/plugins/inputs/udp_listener/udp_listener.go index 39249de37..8e2637ce7 100644 --- a/plugins/inputs/udp_listener/udp_listener.go +++ b/plugins/inputs/udp_listener/udp_listener.go @@ -135,12 +135,14 @@ func (u *UdpListener) udpParser() error { defer u.wg.Done() var packet []byte + var metrics []telegraf.Metric + var err error for { select { case <-u.done: return nil case packet = <-u.in: - metrics, err := u.parser.Parse(packet) + metrics, err = u.parser.Parse(packet) if err == nil { u.storeMetrics(metrics) } else { diff --git a/plugins/inputs/win_perf_counters/README.md b/plugins/inputs/win_perf_counters/README.md index 7a2b87a61..967714b48 100644 --- a/plugins/inputs/win_perf_counters/README.md +++ b/plugins/inputs/win_perf_counters/README.md @@ -156,6 +156,15 @@ if any of the combinations of ObjectName/Instances/Counters are invalid. Instances = ["------"] # Use 6 x - to remove the Instance bit from the query. Measurement = "win_mem" #IncludeTotal=false #Set to true to include _Total instance when querying for all (*). + + [[inputs.win_perf_counters.object]] + # more counters for the Network Interface Object can be found at + # https://msdn.microsoft.com/en-us/library/ms803962.aspx + ObjectName = "Network Interface" + Counters = ["Bytes Received/sec","Bytes Sent/sec","Packets Received/sec","Packets Sent/sec"] + Instances = ["*"] # Use 6 x - to remove the Instance bit from the query. + Measurement = "win_net" + #IncludeTotal=false #Set to true to include _Total instance when querying for all (*). ``` ### Active Directory Domain Controller diff --git a/plugins/inputs/win_perf_counters/win_perf_counters.go b/plugins/inputs/win_perf_counters/win_perf_counters.go index 8279f1c7a..4684289ee 100644 --- a/plugins/inputs/win_perf_counters/win_perf_counters.go +++ b/plugins/inputs/win_perf_counters/win_perf_counters.go @@ -107,6 +107,8 @@ type item struct { counterHandle win.PDH_HCOUNTER } +var sanitizedChars = strings.NewReplacer("/sec", "_persec", "/Sec", "_persec", " ", "_") + func (m *Win_PerfCounters) AddItem(metrics *itemList, query string, objectName string, counter string, instance string, measurement string, include_total bool) { @@ -297,7 +299,7 @@ func (m *Win_PerfCounters) Gather(acc telegraf.Accumulator) error { tags["instance"] = s } tags["objectname"] = metric.objectName - fields[string(metric.counter)] = float32(c.FmtValue.DoubleValue) + fields[sanitizedChars.Replace(string(metric.counter))] = float32(c.FmtValue.DoubleValue) var measurement string if metric.measurement == "" { diff --git a/plugins/outputs/cloudwatch/cloudwatch.go b/plugins/outputs/cloudwatch/cloudwatch.go index 42d98b5be..e4bfa0666 100644 --- a/plugins/outputs/cloudwatch/cloudwatch.go +++ b/plugins/outputs/cloudwatch/cloudwatch.go @@ -9,8 +9,6 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/credentials" - "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds" - "github.com/aws/aws-sdk-go/aws/ec2metadata" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/cloudwatch" @@ -19,8 +17,10 @@ import ( ) type CloudWatch struct { - Region string // AWS Region - Namespace string // CloudWatch Metrics Namespace + Region string `toml:"region"` // AWS Region + AccessKey string `toml:"access_key"` // Explicit AWS Access Key ID + SecretKey 
string `toml:"secret_key"` // Explicit AWS Secret Access Key + Namespace string `toml:"namespace"` // CloudWatch Metrics Namespace svc *cloudwatch.CloudWatch } @@ -28,6 +28,15 @@ var sampleConfig = ` ## Amazon REGION region = 'us-east-1' + ## Amazon Credentials + ## Credentials are loaded in the following order + ## 1) explicit credentials from 'access_key' and 'secret_key' + ## 2) environment variables + ## 3) shared credentials file + ## 4) EC2 Instance Profile + #access_key = "" + #secret_key = "" + ## Namespace for the CloudWatch MetricDatums namespace = 'InfluxData/Telegraf' ` @@ -43,12 +52,9 @@ func (c *CloudWatch) Description() string { func (c *CloudWatch) Connect() error { Config := &aws.Config{ Region: aws.String(c.Region), - Credentials: credentials.NewChainCredentials( - []credentials.Provider{ - &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(session.New())}, - &credentials.EnvProvider{}, - &credentials.SharedCredentialsProvider{}, - }), + } + if c.AccessKey != "" || c.SecretKey != "" { + Config.Credentials = credentials.NewStaticCredentials(c.AccessKey, c.SecretKey, "") } svc := cloudwatch.New(session.New(Config)) diff --git a/plugins/outputs/datadog/datadog.go b/plugins/outputs/datadog/datadog.go index 56fdc38e4..088568718 100644 --- a/plugins/outputs/datadog/datadog.go +++ b/plugins/outputs/datadog/datadog.go @@ -8,7 +8,6 @@ import ( "net/http" "net/url" "sort" - "strings" "github.com/influxdata/telegraf" "github.com/influxdata/telegraf/internal" @@ -71,21 +70,22 @@ func (d *Datadog) Write(metrics []telegraf.Metric) error { metricCounter := 0 for _, m := range metrics { - mname := strings.Replace(m.Name(), "_", ".", -1) if dogMs, err := buildMetrics(m); err == nil { for fieldName, dogM := range dogMs { // name of the datadog measurement var dname string if fieldName == "value" { // adding .value seems redundant here - dname = mname + dname = m.Name() } else { - dname = mname + "." + strings.Replace(fieldName, "_", ".", -1) + dname = m.Name() + "." 
+ fieldName } + var host string + host, _ = m.Tags()["host"] metric := &Metric{ Metric: dname, Tags: buildTags(m.Tags()), - Host: m.Tags()["host"], + Host: host, } metric.Points[0] = dogM tempSeries = append(tempSeries, metric) diff --git a/plugins/outputs/influxdb/influxdb.go b/plugins/outputs/influxdb/influxdb.go index 626635a3b..891c752bd 100644 --- a/plugins/outputs/influxdb/influxdb.go +++ b/plugins/outputs/influxdb/influxdb.go @@ -125,13 +125,9 @@ func (i *InfluxDB) Connect() error { return err } - // Create Database if it doesn't exist - _, e := c.Query(client.Query{ - Command: fmt.Sprintf("CREATE DATABASE IF NOT EXISTS \"%s\"", i.Database), - }) - - if e != nil { - log.Println("Database creation failed: " + e.Error()) + err = createDatabase(c, i.Database) + if err != nil { + log.Println("Database creation failed: " + err.Error()) continue } @@ -144,8 +140,24 @@ func (i *InfluxDB) Connect() error { return nil } +func createDatabase(c client.Client, database string) error { + // Create Database if it doesn't exist + _, err := c.Query(client.Query{ + Command: fmt.Sprintf("CREATE DATABASE IF NOT EXISTS \"%s\"", database), + }) + return err +} + func (i *InfluxDB) Close() error { - // InfluxDB client does not provide a Close() function + var errS string + for j, _ := range i.conns { + if err := i.conns[j].Close(); err != nil { + errS += err.Error() + } + } + if errS != "" { + return fmt.Errorf("output influxdb close failed: %s", errS) + } return nil } @@ -185,18 +197,21 @@ func (i *InfluxDB) Write(metrics []telegraf.Metric) error { p := rand.Perm(len(i.conns)) for _, n := range p { if e := i.conns[n].Write(bp); e != nil { - log.Println("ERROR: " + e.Error()) + // Log write failure + log.Printf("ERROR: %s", e) + // If the database was not found, try to recreate it + if strings.Contains(e.Error(), "database not found") { + if errc := createDatabase(i.conns[n], i.Database); errc != nil { + log.Printf("ERROR: Database %s not found and failed to recreate\n", + i.Database) + } + } } else { err = nil break } } - // If all of the writes failed, create a new connection array so that - // i.Connect() will be called on the next gather. - if err != nil { - i.conns = make([]client.Client, 0) - } return err } diff --git a/plugins/outputs/kafka/kafka.go b/plugins/outputs/kafka/kafka.go index 1fafa1353..1b350ac6c 100644 --- a/plugins/outputs/kafka/kafka.go +++ b/plugins/outputs/kafka/kafka.go @@ -181,6 +181,9 @@ func (k *Kafka) Write(metrics []telegraf.Metric) error { func init() { outputs.Add("kafka", func() telegraf.Output { - return &Kafka{} + return &Kafka{ + MaxRetry: 3, + RequiredAcks: -1, + } }) } diff --git a/plugins/outputs/kinesis/kinesis.go b/plugins/outputs/kinesis/kinesis.go index 01906a7f5..fabec2402 100644 --- a/plugins/outputs/kinesis/kinesis.go +++ b/plugins/outputs/kinesis/kinesis.go @@ -9,8 +9,6 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/credentials" - "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds" - "github.com/aws/aws-sdk-go/aws/ec2metadata" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/kinesis" @@ -20,6 +18,8 @@ import ( type KinesisOutput struct { Region string `toml:"region"` + AccessKey string `toml:"access_key"` + SecretKey string `toml:"secret_key"` StreamName string `toml:"streamname"` PartitionKey string `toml:"partitionkey"` Format string `toml:"format"` @@ -30,6 +30,16 @@ type KinesisOutput struct { var sampleConfig = ` ## Amazon REGION of kinesis endpoint. 
region = "ap-southeast-2" + + ## Amazon Credentials + ## Credentials are loaded in the following order + ## 1) explicit credentials from 'access_key' and 'secret_key' + ## 2) environment variables + ## 3) shared credentials file + ## 4) EC2 Instance Profile + #access_key = "" + #secret_key = "" + ## Kinesis StreamName must exist prior to starting telegraf. streamname = "StreamName" ## PartitionKey as used for sharding data. @@ -67,12 +77,9 @@ func (k *KinesisOutput) Connect() error { } Config := &aws.Config{ Region: aws.String(k.Region), - Credentials: credentials.NewChainCredentials( - []credentials.Provider{ - &ec2rolecreds.EC2RoleProvider{Client: ec2metadata.New(session.New())}, - &credentials.EnvProvider{}, - &credentials.SharedCredentialsProvider{}, - }), + } + if k.AccessKey != "" || k.SecretKey != "" { + Config.Credentials = credentials.NewStaticCredentials(k.AccessKey, k.SecretKey, "") } svc := kinesis.New(session.New(Config)) diff --git a/plugins/outputs/opentsdb/opentsdb.go b/plugins/outputs/opentsdb/opentsdb.go index 83a3429e3..5e94ca340 100644 --- a/plugins/outputs/opentsdb/opentsdb.go +++ b/plugins/outputs/opentsdb/opentsdb.go @@ -21,6 +21,9 @@ type OpenTSDB struct { Debug bool } +var sanitizedChars = strings.NewReplacer("@", "-", "*", "-", " ", "_", + `%`, "-", "#", "-", "$", "-") + var sampleConfig = ` ## prefix for metrics keys prefix = "my.specific.prefix." @@ -94,8 +97,8 @@ func buildTags(mTags map[string]string) []string { tags := make([]string, len(mTags)) index := 0 for k, v := range mTags { - tags[index] = fmt.Sprintf("%s=%s", k, v) - index += 1 + tags[index] = sanitizedChars.Replace(fmt.Sprintf("%s=%s", k, v)) + index++ } sort.Strings(tags) return tags @@ -105,7 +108,8 @@ func buildMetrics(m telegraf.Metric, now time.Time, prefix string) []*MetricLine ret := []*MetricLine{} for fieldName, value := range m.Fields() { metric := &MetricLine{ - Metric: fmt.Sprintf("%s%s_%s", prefix, m.Name(), fieldName), + Metric: sanitizedChars.Replace(fmt.Sprintf("%s%s_%s", + prefix, m.Name(), fieldName)), Timestamp: now.Unix(), } diff --git a/plugins/outputs/opentsdb/opentsdb_test.go b/plugins/outputs/opentsdb/opentsdb_test.go index 900c9f123..30323725b 100644 --- a/plugins/outputs/opentsdb/opentsdb_test.go +++ b/plugins/outputs/opentsdb/opentsdb_test.go @@ -25,6 +25,10 @@ func TestBuildTagsTelnet(t *testing.T) { map[string]string{"one": "two", "aaa": "bbb"}, []string{"aaa=bbb", "one=two"}, }, + { + map[string]string{"Sp%ci@l Chars": "g$t repl#ced"}, + []string{"Sp-ci-l_Chars=g-t_repl-ced"}, + }, { map[string]string{}, []string{}, @@ -59,13 +63,19 @@ func TestWrite(t *testing.T) { // Verify postive and negative test cases of writing data metrics := testutil.MockMetrics() - metrics = append(metrics, testutil.TestMetric(float64(1.0), "justametric.float")) - metrics = append(metrics, testutil.TestMetric(int64(123456789), "justametric.int")) - metrics = append(metrics, testutil.TestMetric(uint64(123456789012345), "justametric.uint")) - metrics = append(metrics, testutil.TestMetric("Lorem Ipsum", "justametric.string")) - metrics = append(metrics, testutil.TestMetric(float64(42.0), "justametric.anotherfloat")) + metrics = append(metrics, testutil.TestMetric(float64(1.0), + "justametric.float")) + metrics = append(metrics, testutil.TestMetric(int64(123456789), + "justametric.int")) + metrics = append(metrics, testutil.TestMetric(uint64(123456789012345), + "justametric.uint")) + metrics = append(metrics, testutil.TestMetric("Lorem Ipsum", + "justametric.string")) + metrics = append(metrics, 
testutil.TestMetric(float64(42.0), + "justametric.anotherfloat")) + metrics = append(metrics, testutil.TestMetric(float64(42.0), + "metric w/ specialchars")) err = o.Write(metrics) require.NoError(t, err) - } diff --git a/plugins/outputs/riemann/riemann.go b/plugins/outputs/riemann/riemann.go index c805bbd00..bc49a7191 100644 --- a/plugins/outputs/riemann/riemann.go +++ b/plugins/outputs/riemann/riemann.go @@ -1,7 +1,6 @@ package riemann import ( - "errors" "fmt" "os" "sort" @@ -33,6 +32,7 @@ func (r *Riemann) Connect() error { c, err := raidman.Dial(r.Transport, r.URL) if err != nil { + r.client = nil return err } @@ -41,7 +41,11 @@ func (r *Riemann) Connect() error { } func (r *Riemann) Close() error { + if r.client == nil { + return nil + } r.client.Close() + r.client = nil return nil } @@ -58,6 +62,13 @@ func (r *Riemann) Write(metrics []telegraf.Metric) error { return nil } + if r.client == nil { + err := r.Connect() + if err != nil { + return fmt.Errorf("FAILED to (re)connect to Riemann. Error: %s\n", err) + } + } + var events []*raidman.Event for _, p := range metrics { evs := buildEvents(p, r.Separator) @@ -68,8 +79,9 @@ func (r *Riemann) Write(metrics []telegraf.Metric) error { var senderr = r.client.SendMulti(events) if senderr != nil { - return errors.New(fmt.Sprintf("FAILED to send riemann message: %s\n", - senderr)) + r.Close() // always retuns nil + return fmt.Errorf("FAILED to send riemann message (will try to reconnect). Error: %s\n", + senderr) } return nil diff --git a/plugins/serializers/graphite/graphite.go b/plugins/serializers/graphite/graphite.go index 6484d3fee..bf2e75579 100644 --- a/plugins/serializers/graphite/graphite.go +++ b/plugins/serializers/graphite/graphite.go @@ -17,7 +17,7 @@ type GraphiteSerializer struct { Template string } -var sanitizedChars = strings.NewReplacer("/", "-", "@", "-", " ", "_", "..", ".") +var sanitizedChars = strings.NewReplacer("/", "-", "@", "-", "*", "-", " ", "_", "..", ".") func (s *GraphiteSerializer) Serialize(metric telegraf.Metric) ([]string, error) { out := []string{} diff --git a/scripts/post-install.sh b/scripts/post-install.sh index 53d745ca9..fb0b441e8 100644 --- a/scripts/post-install.sh +++ b/scripts/post-install.sh @@ -12,7 +12,7 @@ function install_init { function install_systemd { cp -f $SCRIPT_DIR/telegraf.service /lib/systemd/system/telegraf.service - systemctl enable telegraf + systemctl enable telegraf || true systemctl daemon-reload || true } @@ -26,7 +26,7 @@ function install_chkconfig { id telegraf &>/dev/null if [[ $? -ne 0 ]]; then - useradd --system -U -M telegraf -s /bin/false -d /etc/telegraf + useradd -r -K USERGROUPS_ENAB=yes -M telegraf -s /bin/false -d /etc/telegraf fi test -d $LOG_DIR || mkdir -p $LOG_DIR @@ -53,29 +53,29 @@ if [[ -f /etc/redhat-release ]]; then # RHEL-variant logic which systemctl &>/dev/null if [[ $? -eq 0 ]]; then - install_systemd + install_systemd else - # Assuming sysv - install_init - install_chkconfig + # Assuming sysv + install_init + install_chkconfig fi elif [[ -f /etc/debian_version ]]; then # Debian/Ubuntu logic which systemctl &>/dev/null if [[ $? -eq 0 ]]; then - install_systemd - systemctl restart telegraf + install_systemd + systemctl restart telegraf || echo "WARNING: systemd not running." 
else - # Assuming sysv - install_init - install_update_rcd - invoke-rc.d telegraf restart + # Assuming sysv + install_init + install_update_rcd + invoke-rc.d telegraf restart fi elif [[ -f /etc/os-release ]]; then source /etc/os-release if [[ $ID = "amzn" ]]; then - # Amazon Linux logic - install_init - install_chkconfig + # Amazon Linux logic + install_init + install_chkconfig fi fi
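Stepping back to the serializer and output changes earlier in this patch: opentsdb, graphite, and win_perf_counters each gain a package-level strings.NewReplacer that is built once and reused for every metric, instead of chaining several strings.Replace calls per point. The snippet below reproduces the OpenTSDB replacement set from the patch and the tag used in its new test case, purely as an illustration of the sanitization.

```go
package main

import (
	"fmt"
	"strings"
)

// Replacement set added for the OpenTSDB output: characters the output cannot
// accept are mapped to safe equivalents.
var sanitizedChars = strings.NewReplacer(
	"@", "-", "*", "-", " ", "_", "%", "-", "#", "-", "$", "-",
)

func main() {
	tag := fmt.Sprintf("%s=%s", "Sp%ci@l Chars", "g$t repl#ced")
	fmt.Println(sanitizedChars.Replace(tag)) // Sp-ci-l_Chars=g-t_repl-ced
}
```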