Add new "systemd_units" input plugin (#4532)

This commit is contained in:
Benjamin Schweizer 2019-12-03 01:05:50 +01:00 committed by Daniel Nelson
parent fd2e9889ac
commit 6839e5573c
6 changed files with 466 additions and 0 deletions

View File

@ -16,6 +16,7 @@
- [snmp_trap](/plugins/inputs/snmp_trap/README.md) - Contributed by @influxdata
- [suricata](/plugins/inputs/suricata/README.md) - Contributed by @satta
- [synproxy](/plugins/inputs/synproxy/README.md) - Contributed by @rfrenayworldstream
- [systemd_units](/plugins/inputs/systemd_units/README.md) - Contributed by @benschweizer
#### New Processors

View File

@ -148,6 +148,7 @@ import (
_ "github.com/influxdata/telegraf/plugins/inputs/syslog"
_ "github.com/influxdata/telegraf/plugins/inputs/sysstat"
_ "github.com/influxdata/telegraf/plugins/inputs/system"
_ "github.com/influxdata/telegraf/plugins/inputs/systemd_units"
_ "github.com/influxdata/telegraf/plugins/inputs/tail"
_ "github.com/influxdata/telegraf/plugins/inputs/tcp_listener"
_ "github.com/influxdata/telegraf/plugins/inputs/teamspeak"

View File

@ -0,0 +1,140 @@
# Systemd Units Plugin
The systemd_units plugin gathers systemd unit status on Linux. It relies on
`systemctl list-units --all --type=service` to collect data on service status.
The results are tagged with the unit name and provide enumerated fields for
the load, active and sub states, indicating the unit health.
This plugin is related to the [win_services module](../win_services/), which
fulfills the same purpose on Windows.
In addition to services, this plugin can gather other unit types as well,
see `systemctl list-units --all --type help` for possible options.
### Configuration
```
[[inputs.systemd_units]]
## Set timeout for systemctl execution
# timeout = "1s"
#
## Filter for a specific unit type, default is "service", other possible
## values are "socket", "target", "device", "mount", "automount", "swap",
## "timer", "path", "slice" and "scope":
# unittype = "service"
```
### Metrics
- systemd_units:
- tags:
- name (string, unit name)
- load (string, load state)
- active (string, active state)
- sub (string, sub state)
- fields:
- load_code (int, see below)
- active_code (int, see below)
- sub_code (int, see below)
#### Load
enumeration of [unit_load_state_table](https://github.com/systemd/systemd/blob/c87700a1335f489be31cd3549927da68b5638819/src/basic/unit-def.c#L87)
| Value | Meaning | Description |
| ----- | ------- | ----------- |
| 0 | loaded | unit is ~ |
| 1 | stub | unit is ~ |
| 2 | not-found | unit is ~ |
| 3 | bad-setting | unit is ~ |
| 4 | error | unit is ~ |
| 5 | merged | unit is ~ |
| 6 | masked | unit is ~ |
#### Active
enumeration of [unit_active_state_table](https://github.com/systemd/systemd/blob/c87700a1335f489be31cd3549927da68b5638819/src/basic/unit-def.c#L99)
| Value | Meaning | Description |
| ----- | ------- | ----------- |
| 0 | active | unit is ~ |
| 1 | reloading | unit is ~ |
| 2 | inactive | unit is ~ |
| 3 | failed | unit is ~ |
| 4 | activating | unit is ~ |
| 5 | deactivating | unit is ~ |
#### Sub
enumeration of sub states, see various [unittype_state_tables](https://github.com/systemd/systemd/blob/c87700a1335f489be31cd3549927da68b5638819/src/basic/unit-def.c#L163);
duplicates were removed, tables are hex aligned to keep some space for future
values
| Value | Meaning | Description |
| ----- | ------- | ----------- |
| | | service_state_table start at 0x0000 |
| 0x0000 | running | unit is ~ |
| 0x0001 | dead | unit is ~ |
| 0x0002 | start-pre | unit is ~ |
| 0x0003 | start | unit is ~ |
| 0x0004 | exited | unit is ~ |
| 0x0005 | reload | unit is ~ |
| 0x0006 | stop | unit is ~ |
| 0x0007 | stop-watchdog | unit is ~ |
| 0x0008 | stop-sigterm | unit is ~ |
| 0x0009 | stop-sigkill | unit is ~ |
| 0x000a | stop-post | unit is ~ |
| 0x000b | final-sigterm | unit is ~ |
| 0x000c | failed | unit is ~ |
| 0x000d | auto-restart | unit is ~ |
| | | automount_state_table start at 0x0010 |
| 0x0010 | waiting | unit is ~ |
| | | device_state_table start at 0x0020 |
| 0x0020 | tentative | unit is ~ |
| 0x0021 | plugged | unit is ~ |
| | | mount_state_table start at 0x0030 |
| 0x0030 | mounting | unit is ~ |
| 0x0031 | mounting-done | unit is ~ |
| 0x0032 | mounted | unit is ~ |
| 0x0033 | remounting | unit is ~ |
| 0x0034 | unmounting | unit is ~ |
| 0x0035 | remounting-sigterm | unit is ~ |
| 0x0036 | remounting-sigkill | unit is ~ |
| 0x0037 | unmounting-sigterm | unit is ~ |
| 0x0038 | unmounting-sigkill | unit is ~ |
| | | path_state_table start at 0x0040 |
| | | scope_state_table start at 0x0050 |
| 0x0050 | abandoned | unit is ~ |
| | | slice_state_table start at 0x0060 |
| 0x0060 | active | unit is ~ |
| | | socket_state_table start at 0x0070 |
| 0x0070 | start-chown | unit is ~ |
| 0x0071 | start-post | unit is ~ |
| 0x0072 | listening | unit is ~ |
| 0x0073 | stop-pre | unit is ~ |
| 0x0074 | stop-pre-sigterm | unit is ~ |
| 0x0075 | stop-pre-sigkill | unit is ~ |
| 0x0076 | final-sigkill | unit is ~ |
| | | swap_state_table start at 0x0080 |
| 0x0080 | activating | unit is ~ |
| 0x0081 | activating-done | unit is ~ |
| 0x0082 | deactivating | unit is ~ |
| 0x0083 | deactivating-sigterm | unit is ~ |
| 0x0084 | deactivating-sigkill | unit is ~ |
| | | target_state_table start at 0x0090 |
| | | timer_state_table start at 0x00a0 |
| 0x00a0 | elapsed | unit is ~ |
| | | |
### Example Output
Linux Systemd Units:
```
$ telegraf --test --config /tmp/telegraf.conf
> systemd_units,host=host1.example.com,name=dbus.service,load=loaded,active=active,sub=running load_code=0i,active_code=0i,sub_code=0i 1533730725000000000
> systemd_units,host=host1.example.com,name=networking.service,load=loaded,active=failed,sub=failed load_code=0i,active_code=3i,sub_code=12i 1533730725000000000
> systemd_units,host=host1.example.com,name=ssh.service,load=loaded,active=active,sub=running load_code=0i,active_code=0i,sub_code=0i 1533730725000000000
...
```
### Possible Improvements
- add blacklist to filter names

View File

@ -0,0 +1,221 @@
package systemd_units
import (
"bufio"
"bytes"
"fmt"
"os/exec"
"strings"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs"
)
// SystemdUnits is a telegraf plugin to gather systemd unit status
type SystemdUnits struct {
	// Timeout is handed to the systemctl function on every Gather call.
	Timeout internal.Duration
	// UnitType is handed to the systemctl function on every Gather call; it
	// is read from the "unittype" configuration key.
	UnitType string `toml:"unittype"`
	// systemctl produces the unit listing that Gather parses. It is a field
	// rather than a direct call so that a replacement returning canned output
	// can be plugged in.
	systemctl systemctl
}
// systemctl is the signature of the function that produces the unit listing
// consumed by Gather. It receives the plugin's timeout and unit type and
// returns the raw listing, or an error.
type systemctl func(Timeout internal.Duration, UnitType string) (*bytes.Buffer, error)
// measurement is the metric name Gather passes to the accumulator for every
// unit it reports.
const measurement = "systemd_units"
// Below are mappings of systemd state tables as defined in
// https://github.com/systemd/systemd/blob/c87700a1335f489be31cd3549927da68b5638819/src/basic/unit-def.c
// Duplicate strings are removed from this list.

// load_map translates a unit's load state string into its numeric code. The
// code of a state is its position in the ordered list below.
var load_map = func() map[string]int {
	states := []string{
		"loaded",
		"stub",
		"not-found",
		"bad-setting",
		"error",
		"merged",
		"masked",
	}
	codes := make(map[string]int, len(states))
	for code, state := range states {
		codes[state] = code
	}
	return codes
}()
// active_map translates a unit's active state string into its numeric code.
// The code of a state is its position in the ordered list below.
var active_map = func() map[string]int {
	states := []string{
		"active",
		"reloading",
		"inactive",
		"failed",
		"activating",
		"deactivating",
	}
	codes := make(map[string]int, len(states))
	for code, state := range states {
		codes[state] = code
	}
	return codes
}()
// sub_map translates a unit's sub state string into its numeric code.
//
// Every per-unit-type state table is given its own hex-aligned offset; the
// code of a state is that offset plus the state's position within its table.
// A state string that already appears in an earlier table is not repeated,
// which is why some tables contribute few or no entries.
var sub_map = func() map[string]int {
	tables := []struct {
		offset int
		states []string
	}{
		// service_state_table
		{0x0000, []string{
			"running",
			"dead",
			"start-pre",
			"start",
			"exited",
			"reload",
			"stop",
			"stop-watchdog",
			"stop-sigterm",
			"stop-sigkill",
			"stop-post",
			"final-sigterm",
			"failed",
			"auto-restart",
		}},
		// automount_state_table
		{0x0010, []string{
			"waiting",
		}},
		// device_state_table
		{0x0020, []string{
			"tentative",
			"plugged",
		}},
		// mount_state_table
		{0x0030, []string{
			"mounting",
			"mounting-done",
			"mounted",
			"remounting",
			"unmounting",
			"remounting-sigterm",
			"remounting-sigkill",
			"unmounting-sigterm",
			"unmounting-sigkill",
		}},
		// path_state_table (no entries of its own)
		{0x0040, nil},
		// scope_state_table
		{0x0050, []string{
			"abandoned",
		}},
		// slice_state_table
		{0x0060, []string{
			"active",
		}},
		// socket_state_table
		{0x0070, []string{
			"start-chown",
			"start-post",
			"listening",
			"stop-pre",
			"stop-pre-sigterm",
			"stop-pre-sigkill",
			"final-sigkill",
		}},
		// swap_state_table
		{0x0080, []string{
			"activating",
			"activating-done",
			"deactivating",
			"deactivating-sigterm",
			"deactivating-sigkill",
		}},
		// target_state_table (no entries of its own)
		{0x0090, nil},
		// timer_state_table
		{0x00a0, []string{
			"elapsed",
		}},
	}

	codes := make(map[string]int)
	for _, table := range tables {
		for pos, state := range table.states {
			codes[state] = table.offset + pos
		}
	}
	return codes
}()
// Values given to every plugin instance created by the factory registered in
// init.
var (
	// defaultTimeout is the initial Timeout: one second.
	defaultTimeout = internal.Duration{Duration: time.Second}
	// defaultUnitType is the initial UnitType.
	defaultUnitType = "service"
)
// Description returns the plugin's one-line summary.
func (s *SystemdUnits) Description() string {
	const summary = "Gather systemd units state"
	return summary
}
// SampleConfig returns the commented sample configuration for the plugin,
// covering the "timeout" and "unittype" options.
//
// The list of unit types names "scope" without a trailing space so the value
// can be copied verbatim into the unittype option.
func (s *SystemdUnits) SampleConfig() string {
	return `
## Set timeout for systemctl execution
# timeout = "1s"
#
## Filter for a specific unit type, default is "service", other possible
## values are "socket", "target", "device", "mount", "automount", "swap",
## "timer", "path", "slice" and "scope":
# unittype = "service"
`
}
// Gather runs the plugin's systemctl function and turns every line of its
// output into one "systemd_units" metric.
//
// Each line is split on whitespace. The first four fields are taken as the
// unit name and its load, active and sub states; anything after them is
// ignored. The four values become tags verbatim, and the three states are
// additionally translated through load_map, active_map and sub_map into the
// integer fields load_code, active_code and sub_code.
//
// A line with fewer than four fields, or with a state missing from its map,
// is reported through acc.AddError and skipped; the remaining lines are still
// processed. Gather itself returns an error only when the systemctl function
// fails or when its output cannot be read to the end.
func (s *SystemdUnits) Gather(acc telegraf.Accumulator) error {
	out, err := s.systemctl(s.Timeout, s.UnitType)
	if err != nil {
		return err
	}

	scanner := bufio.NewScanner(out)
	for scanner.Scan() {
		line := scanner.Text()

		data := strings.Fields(line)
		if len(data) < 4 {
			acc.AddError(fmt.Errorf("Error parsing line (expected at least 4 fields): %s", line))
			continue
		}
		name, load, active, sub := data[0], data[1], data[2], data[3]

		loadCode, ok := load_map[load]
		if !ok {
			acc.AddError(fmt.Errorf("Error parsing field 'load', value not in map: %s", load))
			continue
		}
		activeCode, ok := active_map[active]
		if !ok {
			acc.AddError(fmt.Errorf("Error parsing field 'active', value not in map: %s", active))
			continue
		}
		subCode, ok := sub_map[sub]
		if !ok {
			acc.AddError(fmt.Errorf("Error parsing field 'sub', value not in map: %s", sub))
			continue
		}

		tags := map[string]string{
			"name":   name,
			"load":   load,
			"active": active,
			"sub":    sub,
		}
		fields := map[string]interface{}{
			"load_code":   loadCode,
			"active_code": activeCode,
			"sub_code":    subCode,
		}
		acc.AddFields(measurement, fields, tags)
	}

	// Scan returns false both at end of input and on a read failure (for
	// example a line exceeding the scanner's buffer); only Err tells the two
	// apart, so without this check a failure would silently drop units.
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("error reading systemctl output: %v", err)
	}
	return nil
}
// setSystemctl is the production systemctl implementation: it locates the
// systemctl binary on PATH and runs
//
//	systemctl list-units --all --plain --type=<UnitType> --no-legend
//
// bounded by Timeout, returning whatever the command wrote to stdout.
//
// --plain is passed so that systemctl omits the bullet marker it otherwise
// prints in front of some units; that marker is a separate whitespace-
// delimited token and would shift every column by one for the line parser.
//
// An error is returned when systemctl cannot be found or when the command
// fails or times out; in the latter case the (possibly partial) output buffer
// is still returned alongside the error.
func setSystemctl(Timeout internal.Duration, UnitType string) (*bytes.Buffer, error) {
	// is systemctl available ?
	systemctlPath, err := exec.LookPath("systemctl")
	if err != nil {
		return nil, err
	}

	// Build the argument list once so the error message below always shows
	// exactly what was executed.
	args := []string{"list-units", "--all", "--plain", fmt.Sprintf("--type=%s", UnitType), "--no-legend"}
	cmd := exec.Command(systemctlPath, args...)

	var out bytes.Buffer
	cmd.Stdout = &out
	err = internal.RunTimeout(cmd, Timeout.Duration)
	if err != nil {
		return &out, fmt.Errorf("error running systemctl %s: %s", strings.Join(args, " "), err)
	}
	return &out, nil
}
func init() {
inputs.Add("systemd_units", func() telegraf.Input {
return &SystemdUnits{
systemctl: setSystemctl,
Timeout: defaultTimeout,
UnitType: defaultUnitType,
}
})
}

View File

@ -0,0 +1,100 @@
package systemd_units
import (
"bytes"
"fmt"
"reflect"
"testing"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/testutil"
)
// TestSystemdUnits feeds single canned systemctl output lines through Gather
// via a stubbed systemctl function and checks the resulting metric or error.
//
// Cases that expect an error must produce no metric; cases that expect
// success must produce exactly one metric, so a Gather that silently emits
// nothing can no longer pass.
func TestSystemdUnits(t *testing.T) {
	tests := []struct {
		name   string
		line   string
		tags   map[string]string
		fields map[string]interface{}
		err    error
	}{
		{
			name: "example loaded active running",
			line: "example.service loaded active running example service description",
			tags: map[string]string{"name": "example.service", "load": "loaded", "active": "active", "sub": "running"},
			fields: map[string]interface{}{
				"load_code":   0,
				"active_code": 0,
				"sub_code":    0,
			},
		},
		{
			name: "example loaded active exited",
			line: "example.service loaded active exited example service description",
			tags: map[string]string{"name": "example.service", "load": "loaded", "active": "active", "sub": "exited"},
			fields: map[string]interface{}{
				"load_code":   0,
				"active_code": 0,
				"sub_code":    4,
			},
		},
		{
			name: "example loaded failed failed",
			line: "example.service loaded failed failed example service description",
			tags: map[string]string{"name": "example.service", "load": "loaded", "active": "failed", "sub": "failed"},
			fields: map[string]interface{}{
				"load_code":   0,
				"active_code": 3,
				"sub_code":    12,
			},
		},
		{
			name: "example not-found inactive dead",
			line: "example.service not-found inactive dead example service description",
			tags: map[string]string{"name": "example.service", "load": "not-found", "active": "inactive", "sub": "dead"},
			fields: map[string]interface{}{
				"load_code":   2,
				"active_code": 2,
				"sub_code":    1,
			},
		},
		{
			name: "example unknown unknown unknown",
			line: "example.service unknown unknown unknown example service description",
			err:  fmt.Errorf("Error parsing field 'load', value not in map: %s", "unknown"),
		},
		{
			name: "example too few fields",
			line: "example.service loaded fai",
			err:  fmt.Errorf("Error parsing line (expected at least 4 fields): %s", "example.service loaded fai"),
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			plugin := &SystemdUnits{
				systemctl: func(Timeout internal.Duration, UnitType string) (*bytes.Buffer, error) {
					return bytes.NewBufferString(tt.line), nil
				},
			}
			acc := new(testutil.Accumulator)
			err := acc.GatherError(plugin.Gather)
			if !reflect.DeepEqual(tt.err, err) {
				t.Errorf("%s: expected error '%#v' got '%#v'", tt.name, tt.err, err)
			}

			// A rejected line must not leak a metric.
			if tt.err != nil {
				if len(acc.Metrics) != 0 {
					t.Errorf("%s: expected no metrics, got %d", tt.name, len(acc.Metrics))
				}
				return
			}

			// One input line must yield exactly one metric; Fatalf keeps the
			// index below from panicking when nothing was emitted.
			if len(acc.Metrics) != 1 {
				t.Fatalf("%s: expected 1 metric, got %d", tt.name, len(acc.Metrics))
			}
			m := acc.Metrics[0]
			if !reflect.DeepEqual(m.Measurement, measurement) {
				t.Errorf("%s: expected measurement '%#v' got '%#v'\n", tt.name, measurement, m.Measurement)
			}
			if !reflect.DeepEqual(m.Tags, tt.tags) {
				t.Errorf("%s: expected tags\n%#v got\n%#v\n", tt.name, tt.tags, m.Tags)
			}
			if !reflect.DeepEqual(m.Fields, tt.fields) {
				t.Errorf("%s: expected fields\n%#v got\n%#v\n", tt.name, tt.fields, m.Fields)
			}
		})
	}
}

View File

@ -0,0 +1,3 @@
// +build !linux
package systemd_units