Fix only one job per storage target reported in lustre2 input (#5771)
This commit is contained in:
parent
1e1fa1a580
commit
8abf8c10a7
|
@ -9,23 +9,27 @@ for HPC environments. It stores statistics about its activity in
|
||||||
package lustre2
|
package lustre2
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"io/ioutil"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/influxdata/telegraf"
|
"github.com/influxdata/telegraf"
|
||||||
"github.com/influxdata/telegraf/internal"
|
|
||||||
"github.com/influxdata/telegraf/plugins/inputs"
|
"github.com/influxdata/telegraf/plugins/inputs"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type tags struct {
|
||||||
|
name, job string
|
||||||
|
}
|
||||||
|
|
||||||
// Lustre proc files can change between versions, so we want to future-proof
|
// Lustre proc files can change between versions, so we want to future-proof
|
||||||
// by letting people choose what to look at.
|
// by letting people choose what to look at.
|
||||||
type Lustre2 struct {
|
type Lustre2 struct {
|
||||||
Ost_procfiles []string
|
Ost_procfiles []string `toml:"ost_jobstat"`
|
||||||
Mds_procfiles []string
|
Mds_procfiles []string `toml:"mds_jobstat"`
|
||||||
|
|
||||||
// allFields maps an OST name to the metric fields associated with that OST
|
// allFields maps an OST name and job ID (see tags) to the metric fields associated with that pair
|
||||||
allFields map[string]map[string]interface{}
|
allFields map[tags]map[string]interface{}
|
||||||
}
|
}
|
||||||
|
|
||||||
var sampleConfig = `
|
var sampleConfig = `
|
||||||
|
@ -353,7 +357,7 @@ var wanted_mdt_jobstats_fields = []*mapping{
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *Lustre2) GetLustreProcStats(fileglob string, wanted_fields []*mapping, acc telegraf.Accumulator) error {
|
func (l *Lustre2) GetLustreProcStats(fileglob string, wantedFields []*mapping, acc telegraf.Accumulator) error {
|
||||||
files, err := filepath.Glob(fileglob)
|
files, err := filepath.Glob(fileglob)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -367,43 +371,56 @@ func (l *Lustre2) GetLustreProcStats(fileglob string, wanted_fields []*mapping,
|
||||||
*/
|
*/
|
||||||
path := strings.Split(file, "/")
|
path := strings.Split(file, "/")
|
||||||
name := path[len(path)-2]
|
name := path[len(path)-2]
|
||||||
var fields map[string]interface{}
|
|
||||||
fields, ok := l.allFields[name]
|
|
||||||
if !ok {
|
|
||||||
fields = make(map[string]interface{})
|
|
||||||
l.allFields[name] = fields
|
|
||||||
}
|
|
||||||
|
|
||||||
lines, err := internal.ReadLines(file)
|
//lines, err := internal.ReadLines(file)
|
||||||
|
wholeFile, err := ioutil.ReadFile(file)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
jobs := strings.Split(string(wholeFile), "-")
|
||||||
|
for _, job := range jobs {
|
||||||
|
lines := strings.Split(string(job), "\n")
|
||||||
|
jobid := ""
|
||||||
|
|
||||||
for _, line := range lines {
|
// figure out if the data should be tagged with job_id here
|
||||||
parts := strings.Fields(line)
|
parts := strings.Fields(lines[0])
|
||||||
if strings.HasPrefix(line, "- job_id:") {
|
if strings.TrimSuffix(parts[0], ":") == "job_id" {
|
||||||
// Set the job_id explicitly if present
|
jobid = parts[1]
|
||||||
fields["jobid"] = parts[2]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, wanted := range wanted_fields {
|
for _, line := range lines {
|
||||||
var data uint64
|
// skip any empty lines
|
||||||
if strings.TrimSuffix(parts[0], ":") == wanted.inProc {
|
if len(line) < 1 {
|
||||||
wanted_field := wanted.field
|
continue
|
||||||
// if not set, assume field[1]. Shouldn't be field[0], as
|
}
|
||||||
// that's a string
|
parts := strings.Fields(line)
|
||||||
if wanted_field == 0 {
|
|
||||||
wanted_field = 1
|
var fields map[string]interface{}
|
||||||
|
fields, ok := l.allFields[tags{name, jobid}]
|
||||||
|
if !ok {
|
||||||
|
fields = make(map[string]interface{})
|
||||||
|
l.allFields[tags{name, jobid}] = fields
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, wanted := range wantedFields {
|
||||||
|
var data uint64
|
||||||
|
if strings.TrimSuffix(parts[0], ":") == wanted.inProc {
|
||||||
|
wantedField := wanted.field
|
||||||
|
// if not set, assume field[1]. Shouldn't be field[0], as
|
||||||
|
// that's a string
|
||||||
|
if wantedField == 0 {
|
||||||
|
wantedField = 1
|
||||||
|
}
|
||||||
|
data, err = strconv.ParseUint(strings.TrimSuffix((parts[wantedField]), ","), 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
reportName := wanted.inProc
|
||||||
|
if wanted.reportAs != "" {
|
||||||
|
reportName = wanted.reportAs
|
||||||
|
}
|
||||||
|
fields[reportName] = data
|
||||||
}
|
}
|
||||||
data, err = strconv.ParseUint(strings.TrimSuffix((parts[wanted_field]), ","), 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
report_name := wanted.inProc
|
|
||||||
if wanted.reportAs != "" {
|
|
||||||
report_name = wanted.reportAs
|
|
||||||
}
|
|
||||||
fields[report_name] = data
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -423,7 +440,8 @@ func (l *Lustre2) Description() string {
|
||||||
|
|
||||||
// Gather reads stats from all lustre targets
|
// Gather reads stats from all lustre targets
|
||||||
func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
|
func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
|
||||||
l.allFields = make(map[string]map[string]interface{})
|
//l.allFields = make(map[string]map[string]interface{})
|
||||||
|
l.allFields = make(map[tags]map[string]interface{})
|
||||||
|
|
||||||
if len(l.Ost_procfiles) == 0 {
|
if len(l.Ost_procfiles) == 0 {
|
||||||
// read/write bytes are in obdfilter/<ost_name>/stats
|
// read/write bytes are in obdfilter/<ost_name>/stats
|
||||||
|
@ -483,15 +501,13 @@ func (l *Lustre2) Gather(acc telegraf.Accumulator) error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for name, fields := range l.allFields {
|
for tgs, fields := range l.allFields {
|
||||||
|
|
||||||
tags := map[string]string{
|
tags := map[string]string{
|
||||||
"name": name,
|
"name": tgs.name,
|
||||||
}
|
}
|
||||||
if _, ok := fields["jobid"]; ok {
|
if len(tgs.job) > 0 {
|
||||||
if jobid, ok := fields["jobid"].(string); ok {
|
tags["jobid"] = tgs.job
|
||||||
tags["jobid"] = jobid
|
|
||||||
}
|
|
||||||
delete(fields, "jobid")
|
|
||||||
}
|
}
|
||||||
acc.AddFields("lustre2", fields, tags)
|
acc.AddFields("lustre2", fields, tags)
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,6 +53,20 @@ const obdfilterJobStatsContents = `job_stats:
|
||||||
get_info: { samples: 0, unit: reqs }
|
get_info: { samples: 0, unit: reqs }
|
||||||
set_info: { samples: 0, unit: reqs }
|
set_info: { samples: 0, unit: reqs }
|
||||||
quotactl: { samples: 0, unit: reqs }
|
quotactl: { samples: 0, unit: reqs }
|
||||||
|
- job_id: testjob2
|
||||||
|
snapshot_time: 1461772761
|
||||||
|
read_bytes: { samples: 1, unit: bytes, min: 1024, max: 1024, sum: 1024 }
|
||||||
|
write_bytes: { samples: 25, unit: bytes, min: 2048, max: 2048, sum: 51200 }
|
||||||
|
getattr: { samples: 0, unit: reqs }
|
||||||
|
setattr: { samples: 0, unit: reqs }
|
||||||
|
punch: { samples: 1, unit: reqs }
|
||||||
|
sync: { samples: 0, unit: reqs }
|
||||||
|
destroy: { samples: 0, unit: reqs }
|
||||||
|
create: { samples: 0, unit: reqs }
|
||||||
|
statfs: { samples: 0, unit: reqs }
|
||||||
|
get_info: { samples: 0, unit: reqs }
|
||||||
|
set_info: { samples: 0, unit: reqs }
|
||||||
|
quotactl: { samples: 0, unit: reqs }
|
||||||
`
|
`
|
||||||
|
|
||||||
const mdtProcContents = `snapshot_time 1438693238.20113 secs.usecs
|
const mdtProcContents = `snapshot_time 1438693238.20113 secs.usecs
|
||||||
|
@ -93,6 +107,24 @@ const mdtJobStatsContents = `job_stats:
|
||||||
sync: { samples: 2, unit: reqs }
|
sync: { samples: 2, unit: reqs }
|
||||||
samedir_rename: { samples: 705, unit: reqs }
|
samedir_rename: { samples: 705, unit: reqs }
|
||||||
crossdir_rename: { samples: 200, unit: reqs }
|
crossdir_rename: { samples: 200, unit: reqs }
|
||||||
|
- job_id: testjob2
|
||||||
|
snapshot_time: 1461772761
|
||||||
|
open: { samples: 6, unit: reqs }
|
||||||
|
close: { samples: 7, unit: reqs }
|
||||||
|
mknod: { samples: 8, unit: reqs }
|
||||||
|
link: { samples: 9, unit: reqs }
|
||||||
|
unlink: { samples: 20, unit: reqs }
|
||||||
|
mkdir: { samples: 200, unit: reqs }
|
||||||
|
rmdir: { samples: 210, unit: reqs }
|
||||||
|
rename: { samples: 8, unit: reqs }
|
||||||
|
getattr: { samples: 10, unit: reqs }
|
||||||
|
setattr: { samples: 2, unit: reqs }
|
||||||
|
getxattr: { samples: 4, unit: reqs }
|
||||||
|
setxattr: { samples: 5, unit: reqs }
|
||||||
|
statfs: { samples: 1207, unit: reqs }
|
||||||
|
sync: { samples: 3, unit: reqs }
|
||||||
|
samedir_rename: { samples: 706, unit: reqs }
|
||||||
|
crossdir_rename: { samples: 201, unit: reqs }
|
||||||
`
|
`
|
||||||
|
|
||||||
func TestLustre2GeneratesMetrics(t *testing.T) {
|
func TestLustre2GeneratesMetrics(t *testing.T) {
|
||||||
|
@ -172,7 +204,7 @@ func TestLustre2GeneratesJobstatsMetrics(t *testing.T) {
|
||||||
|
|
||||||
tempdir := os.TempDir() + "/telegraf/proc/fs/lustre/"
|
tempdir := os.TempDir() + "/telegraf/proc/fs/lustre/"
|
||||||
ost_name := "OST0001"
|
ost_name := "OST0001"
|
||||||
job_name := "testjob1"
|
job_names := []string{"testjob1", "testjob2"}
|
||||||
|
|
||||||
mdtdir := tempdir + "/mdt/"
|
mdtdir := tempdir + "/mdt/"
|
||||||
err := os.MkdirAll(mdtdir+"/"+ost_name, 0755)
|
err := os.MkdirAll(mdtdir+"/"+ost_name, 0755)
|
||||||
|
@ -199,12 +231,23 @@ func TestLustre2GeneratesJobstatsMetrics(t *testing.T) {
|
||||||
err = m.Gather(&acc)
|
err = m.Gather(&acc)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
tags := map[string]string{
|
// make this two tags
|
||||||
"name": ost_name,
|
// and even further make this dependent on summing per OST
|
||||||
"jobid": job_name,
|
tags := []map[string]string{
|
||||||
|
{
|
||||||
|
"name": ost_name,
|
||||||
|
"jobid": job_names[0],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": ost_name,
|
||||||
|
"jobid": job_names[1],
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
fields := map[string]interface{}{
|
// make this for two tags
|
||||||
|
var fields []map[string]interface{}
|
||||||
|
|
||||||
|
fields = append(fields, map[string]interface{}{
|
||||||
"jobstats_read_calls": uint64(1),
|
"jobstats_read_calls": uint64(1),
|
||||||
"jobstats_read_min_size": uint64(4096),
|
"jobstats_read_min_size": uint64(4096),
|
||||||
"jobstats_read_max_size": uint64(4096),
|
"jobstats_read_max_size": uint64(4096),
|
||||||
|
@ -239,9 +282,50 @@ func TestLustre2GeneratesJobstatsMetrics(t *testing.T) {
|
||||||
"jobstats_sync": uint64(2),
|
"jobstats_sync": uint64(2),
|
||||||
"jobstats_samedir_rename": uint64(705),
|
"jobstats_samedir_rename": uint64(705),
|
||||||
"jobstats_crossdir_rename": uint64(200),
|
"jobstats_crossdir_rename": uint64(200),
|
||||||
|
})
|
||||||
|
|
||||||
|
fields = append(fields, map[string]interface{}{
|
||||||
|
"jobstats_read_calls": uint64(1),
|
||||||
|
"jobstats_read_min_size": uint64(1024),
|
||||||
|
"jobstats_read_max_size": uint64(1024),
|
||||||
|
"jobstats_read_bytes": uint64(1024),
|
||||||
|
"jobstats_write_calls": uint64(25),
|
||||||
|
"jobstats_write_min_size": uint64(2048),
|
||||||
|
"jobstats_write_max_size": uint64(2048),
|
||||||
|
"jobstats_write_bytes": uint64(51200),
|
||||||
|
"jobstats_ost_getattr": uint64(0),
|
||||||
|
"jobstats_ost_setattr": uint64(0),
|
||||||
|
"jobstats_punch": uint64(1),
|
||||||
|
"jobstats_ost_sync": uint64(0),
|
||||||
|
"jobstats_destroy": uint64(0),
|
||||||
|
"jobstats_create": uint64(0),
|
||||||
|
"jobstats_ost_statfs": uint64(0),
|
||||||
|
"jobstats_get_info": uint64(0),
|
||||||
|
"jobstats_set_info": uint64(0),
|
||||||
|
"jobstats_quotactl": uint64(0),
|
||||||
|
"jobstats_open": uint64(6),
|
||||||
|
"jobstats_close": uint64(7),
|
||||||
|
"jobstats_mknod": uint64(8),
|
||||||
|
"jobstats_link": uint64(9),
|
||||||
|
"jobstats_unlink": uint64(20),
|
||||||
|
"jobstats_mkdir": uint64(200),
|
||||||
|
"jobstats_rmdir": uint64(210),
|
||||||
|
"jobstats_rename": uint64(8),
|
||||||
|
"jobstats_getattr": uint64(10),
|
||||||
|
"jobstats_setattr": uint64(2),
|
||||||
|
"jobstats_getxattr": uint64(4),
|
||||||
|
"jobstats_setxattr": uint64(5),
|
||||||
|
"jobstats_statfs": uint64(1207),
|
||||||
|
"jobstats_sync": uint64(3),
|
||||||
|
"jobstats_samedir_rename": uint64(706),
|
||||||
|
"jobstats_crossdir_rename": uint64(201),
|
||||||
|
})
|
||||||
|
|
||||||
|
for index := 0; index < len(fields); index++ {
|
||||||
|
acc.AssertContainsTaggedFields(t, "lustre2", fields[index], tags[index])
|
||||||
}
|
}
|
||||||
|
|
||||||
acc.AssertContainsTaggedFields(t, "lustre2", fields, tags)
|
// run this over both tags
|
||||||
|
|
||||||
err = os.RemoveAll(os.TempDir() + "/telegraf")
|
err = os.RemoveAll(os.TempDir() + "/telegraf")
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
Loading…
Reference in New Issue