From 52d5b19219f2441364457abf59241c064221ca5a Mon Sep 17 00:00:00 2001 From: Rene Zbinden Date: Tue, 24 May 2016 10:55:25 +0200 Subject: [PATCH] add chrony support (#1238) * add chrony support * remove path definition * add changelog --- CHANGELOG.md | 1 + README.md | 1 + plugins/inputs/all/all.go | 1 + plugins/inputs/chrony/README.md | 91 +++++++++++++++++ plugins/inputs/chrony/chrony.go | 118 +++++++++++++++++++++++ plugins/inputs/chrony/chrony_notlinux.go | 3 + plugins/inputs/chrony/chrony_test.go | 95 ++++++++++++++++++ 7 files changed, 310 insertions(+) create mode 100644 plugins/inputs/chrony/README.md create mode 100644 plugins/inputs/chrony/chrony.go create mode 100644 plugins/inputs/chrony/chrony_notlinux.go create mode 100644 plugins/inputs/chrony/chrony_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 667679a5a..762309ae5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ to "stdout". - [#1139](https://github.com/influxdata/telegraf/pull/1139): instrumental output plugin. Thanks @jasonroelofs! - [#1172](https://github.com/influxdata/telegraf/pull/1172): Ceph storage stats. Thanks @robinpercy! - [#1233](https://github.com/influxdata/telegraf/pull/1233): Updated golint gopsutil dependency. +- [#1238](https://github.com/influxdata/telegraf/pull/1238): chrony input plugin. Thanks @zbindenren! - [#479](https://github.com/influxdata/telegraf/issues/479): per-plugin execution time added to debug output. 
### Bugfixes diff --git a/README.md b/README.md index 3b969639e..c01fa0c6a 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,7 @@ Currently implemented sources: * [bcache](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/bcache) * [cassandra](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/cassandra) * [ceph](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/ceph) +* [chrony](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/chrony) * [couchbase](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/couchbase) * [couchdb](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/couchdb) * [disque](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/disque) diff --git a/plugins/inputs/all/all.go b/plugins/inputs/all/all.go index df739a6b3..36d0724a6 100644 --- a/plugins/inputs/all/all.go +++ b/plugins/inputs/all/all.go @@ -6,6 +6,7 @@ import ( _ "github.com/influxdata/telegraf/plugins/inputs/bcache" _ "github.com/influxdata/telegraf/plugins/inputs/cassandra" _ "github.com/influxdata/telegraf/plugins/inputs/ceph" + _ "github.com/influxdata/telegraf/plugins/inputs/chrony" _ "github.com/influxdata/telegraf/plugins/inputs/cloudwatch" _ "github.com/influxdata/telegraf/plugins/inputs/couchbase" _ "github.com/influxdata/telegraf/plugins/inputs/couchdb" diff --git a/plugins/inputs/chrony/README.md b/plugins/inputs/chrony/README.md new file mode 100644 index 000000000..e12506ecb --- /dev/null +++ b/plugins/inputs/chrony/README.md @@ -0,0 +1,91 @@ +# chrony Input Plugin + +Get standard chrony metrics, requires chronyc executable. + +Below is the documentation of the various headers returned by `chronyc tracking`. + +- Reference ID - This is the refid and name (or IP address) if available, of the +server to which the computer is currently synchronised. 
If this is 127.127.1.1 +it means the computer is not synchronised to any external source and that you +have the ‘local’ mode operating (via the local command in chronyc (see section local), +or the local directive in the ‘/etc/chrony.conf’ file (see section local)). +- Stratum - The stratum indicates how many hops away from a computer with an attached +reference clock we are. Such a computer is a stratum-1 computer, so the computer in the +example is two hops away (i.e. a.b.c is a stratum-2 and is synchronised from a stratum-1). +- Ref time - This is the time (UTC) at which the last measurement from the reference +source was processed. +- System time - In normal operation, chronyd never steps the system clock, because any +jump in the timescale can have adverse consequences for certain application programs. +Instead, any error in the system clock is corrected by slightly speeding up or slowing +down the system clock until the error has been removed, and then returning to the system +clock’s normal speed. A consequence of this is that there will be a period when the +system clock (as read by other programs using the gettimeofday() system call, or by the +date command in the shell) will be different from chronyd's estimate of the current true +time (which it reports to NTP clients when it is operating in server mode). The value +reported on this line is the difference due to this effect. +- Last offset - This is the estimated local offset on the last clock update. +- RMS offset - This is a long-term average of the offset value. +- Frequency - The ‘frequency’ is the rate by which the system’s clock would be +wrong if chronyd was not correcting it. It is expressed in ppm (parts per million). +For example, a value of 1ppm would mean that when the system’s clock thinks it has +advanced 1 second, it has actually advanced by 1.000001 seconds relative to true time. +- Residual freq - This shows the ‘residual frequency’ for the currently selected +reference source. 
This reflects any difference between what the measurements from the +reference source indicate the frequency should be and the frequency currently being used. +The reason this is not always zero is that a smoothing procedure is applied to the +frequency. Each time a measurement from the reference source is obtained and a new +residual frequency computed, the estimated accuracy of this residual is compared with the +estimated accuracy (see ‘skew’ next) of the existing frequency value. A weighted average +is computed for the new frequency, with weights depending on these accuracies. If the +measurements from the reference source follow a consistent trend, the residual will be +driven to zero over time. +- Skew - This is the estimated error bound on the frequency. +- Root delay - This is the total of the network path delays to the stratum-1 computer +from which the computer is ultimately synchronised. In certain extreme situations, this +value can be negative. (This can arise in a symmetric peer arrangement where the computers’ +frequencies are not tracking each other and the network delay is very short relative to the +turn-around time at each computer.) +- Root dispersion - This is the total dispersion accumulated through all the computers +back to the stratum-1 computer from which the computer is ultimately synchronised. +Dispersion is due to system clock resolution, statistical measurement variations etc. +- Leap status - This is the leap status, which can be Normal, Insert second, +Delete second or Not synchronised. + +### Configuration: + +```toml +# Get standard chrony metrics, requires chronyc executable.
+[[inputs.chrony]] + # no configuration +``` + +### Measurements & Fields: + +- chrony + - last_offset (float, seconds) + - rms_offset (float, seconds) + - frequency (float, ppm) + - residual_freq (float, ppm) + - skew (float, ppm) + - root_delay (float, seconds) + - root_dispersion (float, seconds) + - update_interval (float, seconds) + +### Tags: + +- All measurements have the following tags: + - reference_id + - stratum + - leap_status + +### Example Output: + +``` +$ telegraf -config telegraf.conf -input-filter chrony -test +* Plugin: chrony, Collection 1 +> chrony,leap_status=normal,reference_id=192.168.1.1,stratum=3 frequency=-35.657,last_offset=-0.000013616,residual_freq=-0,rms_offset=0.000027073,root_delay=0.000644,root_dispersion=0.003444,skew=0.001,update_interval=1031.2 1463750789687639161 +``` + + + + diff --git a/plugins/inputs/chrony/chrony.go b/plugins/inputs/chrony/chrony.go new file mode 100644 index 000000000..b4d874e60 --- /dev/null +++ b/plugins/inputs/chrony/chrony.go @@ -0,0 +1,118 @@ +// +build linux + +package chrony + +import ( + "errors" + "fmt" + "os/exec" + "strconv" + "strings" + "time" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" + "github.com/influxdata/telegraf/plugins/inputs" +) + +var ( + execCommand = exec.Command // execCommand is used to mock commands in tests. +) + +type Chrony struct { + path string +} + +func (*Chrony) Description() string { + return "Get standard chrony metrics, requires chronyc executable." 
+} + +func (*Chrony) SampleConfig() string { + return "" +} + +func (c *Chrony) Gather(acc telegraf.Accumulator) error { + if len(c.path) == 0 { + return errors.New("chronyc not found: verify that chrony is installed and that chronyc is in your PATH") + } + cmd := execCommand(c.path, "tracking") + out, err := internal.CombinedOutputTimeout(cmd, time.Second*5) + if err != nil { + return fmt.Errorf("failed to run command %s: %s - %s", strings.Join(cmd.Args, " "), err, string(out)) + } + fields, tags, err := processChronycOutput(string(out)) + if err != nil { + return err + } + acc.AddFields("chrony", fields, tags) + return nil +} + +// processChronycOutput takes in a string output from the chronyc command, like: +// +// Reference ID : 192.168.1.22 (ntp.example.com) +// Stratum : 3 +// Ref time (UTC) : Thu May 12 14:27:07 2016 +// System time : 0.000020390 seconds fast of NTP time +// Last offset : +0.000012651 seconds +// RMS offset : 0.000025577 seconds +// Frequency : 16.001 ppm slow +// Residual freq : -0.000 ppm +// Skew : 0.006 ppm +// Root delay : 0.001655 seconds +// Root dispersion : 0.003307 seconds +// Update interval : 507.2 seconds +// Leap status : Normal +// +// The value on the left side of the colon is used as field name, if the first field on +// the right side is a float. If it cannot be parsed as float, it is a tag name; Stratum is always a tag. +// +// Lines whose name contains "time" (Ref time, System time) are ignored and all names are converted to snake case.
+// +// It returns (fields, tags, error) +func processChronycOutput(out string) (map[string]interface{}, map[string]string, error) { + tags := map[string]string{} + fields := map[string]interface{}{} + lines := strings.Split(strings.TrimSpace(out), "\n") + for _, line := range lines { + stats := strings.Split(line, ":") + if len(stats) < 2 { + return nil, nil, fmt.Errorf("unexpected output from chronyc, expected ':' in %s", out) + } + name := strings.ToLower(strings.Replace(strings.TrimSpace(stats[0]), " ", "_", -1)) + // ignore every line whose name contains "time" (Ref time and System time) + if strings.Contains(name, "time") { + continue + } + valueFields := strings.Fields(stats[1]) + if len(valueFields) == 0 { + return nil, nil, fmt.Errorf("unexpected output from chronyc: %s", out) + } + if strings.Contains(strings.ToLower(name), "stratum") { + tags["stratum"] = valueFields[0] + continue + } + value, err := strconv.ParseFloat(valueFields[0], 64) + if err != nil { + tags[name] = strings.ToLower(valueFields[0]) + continue + } + if strings.Contains(stats[1], "slow") { + value = -value + } + fields[name] = value + } + + return fields, tags, nil +} + +func init() { + c := Chrony{} + path, _ := exec.LookPath("chronyc") + if len(path) > 0 { + c.path = path + } + inputs.Add("chrony", func() telegraf.Input { + return &c + }) +} diff --git a/plugins/inputs/chrony/chrony_notlinux.go b/plugins/inputs/chrony/chrony_notlinux.go new file mode 100644 index 000000000..5a29cc858 --- /dev/null +++ b/plugins/inputs/chrony/chrony_notlinux.go @@ -0,0 +1,3 @@ +// +build !linux + +package chrony diff --git a/plugins/inputs/chrony/chrony_test.go b/plugins/inputs/chrony/chrony_test.go new file mode 100644 index 000000000..0e7d8a1a8 --- /dev/null +++ b/plugins/inputs/chrony/chrony_test.go @@ -0,0 +1,95 @@ +// +build linux + +package chrony + +import ( + "fmt" + "os" + "os/exec" + "testing" + + "github.com/influxdata/telegraf/testutil" +) + +func TestGather(t *testing.T) { + c := Chrony{ + path: "chronyc", + } + // overwriting exec commands with mock
commands + execCommand = fakeExecCommand + defer func() { execCommand = exec.Command }() + var acc testutil.Accumulator + + err := c.Gather(&acc) + if err != nil { + t.Fatal(err) + } + + tags := map[string]string{ + "reference_id": "192.168.1.22", + "leap_status": "normal", + "stratum": "3", + } + fields := map[string]interface{}{ + "last_offset": 0.000012651, + "rms_offset": 0.000025577, + "frequency": -16.001, + "residual_freq": 0.0, + "skew": 0.006, + "root_delay": 0.001655, + "root_dispersion": 0.003307, + "update_interval": 507.2, + } + + acc.AssertContainsTaggedFields(t, "chrony", fields, tags) +} + +// fakeExecCommand is a helper function that mocks +// the exec.Command call (and calls the test binary) +func fakeExecCommand(command string, args ...string) *exec.Cmd { + cs := []string{"-test.run=TestHelperProcess", "--", command} + cs = append(cs, args...) + cmd := exec.Command(os.Args[0], cs...) + cmd.Env = []string{"GO_WANT_HELPER_PROCESS=1"} + return cmd +} + +// TestHelperProcess isn't a real test. It's used to mock exec.Command. +// For example, if you run: +// GO_WANT_HELPER_PROCESS=1 go test -test.run=TestHelperProcess -- chronyc tracking +// it prints the mockData below.
+func TestHelperProcess(t *testing.T) { + if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { + return + } + + mockData := `Reference ID : 192.168.1.22 (ntp.example.com) +Stratum : 3 +Ref time (UTC) : Thu May 12 14:27:07 2016 +System time : 0.000020390 seconds fast of NTP time +Last offset : +0.000012651 seconds +RMS offset : 0.000025577 seconds +Frequency : 16.001 ppm slow +Residual freq : -0.000 ppm +Skew : 0.006 ppm +Root delay : 0.001655 seconds +Root dispersion : 0.003307 seconds +Update interval : 507.2 seconds +Leap status : Normal +` + + args := os.Args + + // The preceding arguments belong to the test binary invocation and look like: + // /tmp/go-build970079519/…/_test/integration.test -test.run=TestHelperProcess -- + cmd, args := args[3], args[4:] + + if cmd == "chronyc" && args[0] == "tracking" { + fmt.Fprint(os.Stdout, mockData) + } else { + fmt.Fprint(os.Stdout, "command not found") + os.Exit(1) + + } + os.Exit(0) +}