Fix potential missing datastore metrics in vSphere plugin (#4968)
committed by Daniel Nelson
parent 0e07bbb877
commit 2d782fbaac

Summary of the change: datastores are now always discovered, even when datastore metrics are disabled, so the LUN-to-datastore map (lun2ds) can be populated; non-realtime metrics are queried with a lookback of metricLookback (3) samples, with a TSCache of per-series high-water marks filtering out samples that were already reported; and collectResource now returns the accumulated error list (merr) instead of the last chunk error. A few noisy debug log lines are removed and others reworded.
@@ -24,6 +24,8 @@ import (
 
 var isolateLUN = regexp.MustCompile(".*/([^/]+)/?$")
 
+const metricLookback = 3
+
 // Endpoint is a high-level representation of a connected vCenter endpoint. It is backed by the lower
 // level Client type.
 type Endpoint struct {
@@ -32,6 +34,7 @@ type Endpoint struct {
     lastColls       map[string]time.Time
     instanceInfo    map[string]resourceInfo
     resourceKinds   map[string]resourceKind
+    hwMarks         *TSCache
     lun2ds          map[string]string
     discoveryTicker *time.Ticker
     collectMux      sync.RWMutex
@@ -96,6 +99,7 @@ func NewEndpoint(ctx context.Context, parent *VSphere, url *url.URL) (*Endpoint,
         URL:          url,
         Parent:       parent,
         lastColls:    make(map[string]time.Time),
+        hwMarks:      NewTSCache(1 * time.Hour),
         instanceInfo: make(map[string]resourceInfo),
         lun2ds:       make(map[string]string),
         initialized:  false,
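The TSCache backing hwMarks is defined elsewhere in the plugin and is not part of this diff; only NewTSCache, IsNew, Put, and Purge appear here. A minimal sketch of that surface, assuming a TTL-bounded map of per-key high-water timestamps (field names and internals are ours, not the plugin's):

```go
package vsphere

import (
    "sync"
    "time"
)

// TSCache tracks the latest timestamp seen per key, so already-reported
// samples can be skipped when queries look back in time.
type TSCache struct {
    ttl   time.Duration
    table map[string]time.Time
    mux   sync.RWMutex
}

func NewTSCache(ttl time.Duration) *TSCache {
    return &TSCache{ttl: ttl, table: make(map[string]time.Time)}
}

// IsNew reports whether ts is newer than the high-water mark for key.
func (t *TSCache) IsNew(key string, ts time.Time) bool {
    t.mux.RLock()
    defer t.mux.RUnlock()
    v, ok := t.table[key]
    return !ok || ts.After(v)
}

// Put records ts as the latest seen timestamp for key, keeping the maximum.
func (t *TSCache) Put(key string, ts time.Time) {
    t.mux.Lock()
    defer t.mux.Unlock()
    if v, ok := t.table[key]; !ok || ts.After(v) {
        t.table[key] = ts
    }
}

// Purge drops entries older than the TTL.
func (t *TSCache) Purge() {
    t.mux.Lock()
    defer t.mux.Unlock()
    cutoff := time.Now().Add(-t.ttl)
    for k, v := range t.table {
        if v.Before(cutoff) {
            delete(t.table, k)
        }
    }
}
```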
@@ -353,8 +357,8 @@ func (e *Endpoint) discover(ctx context.Context) error {
     // Populate resource objects, and endpoint instance info.
     for k, res := range e.resourceKinds {
         log.Printf("D! [input.vsphere] Discovering resources for %s", res.name)
-        // Need to do this for all resource types even if they are not enabled (but datastore)
-        if res.enabled || (k != "datastore" && k != "vm") {
+        // Need to do this for all resource types even if they are not enabled
+        if res.enabled || k != "vm" {
             objects, err := res.getObjects(ctx, e, client.Root)
             if err != nil {
                 return err
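The effect of the predicate change is easiest to see by evaluating both forms over the resource kinds. A small illustrative program (kind names match the plugin's keys; the enabled flags are made up):

```go
package main

import "fmt"

func main() {
    kinds := []struct {
        k       string
        enabled bool
    }{
        {"cluster", true},
        {"host", true},
        {"vm", false},
        {"datastore", false},
    }
    for _, r := range kinds {
        oldPred := r.enabled || (r.k != "datastore" && r.k != "vm")
        newPred := r.enabled || r.k != "vm"
        fmt.Printf("%-9s enabled=%-5v old=%-5v new=%v\n", r.k, r.enabled, oldPred, newPred)
    }
    // With datastore metrics disabled, the old predicate skipped datastore
    // discovery entirely, leaving the lun2ds map empty; the new one always
    // discovers datastores, and only VMs remain opt-in.
}
```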
@@ -416,7 +420,6 @@ func (e *Endpoint) discover(ctx context.Context) error {
             url := ds.altID
             m := isolateLUN.FindStringSubmatch(url)
             if m != nil {
-                log.Printf("D! [input.vsphere]: LUN: %s", m[1])
                 l2d[m[1]] = ds.name
             }
         }
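For context, isolateLUN captures the last path component of a datastore URL, which is how the plugin maps a LUN id to a datastore name. A self-contained illustration (the sample URL is invented; real values come from the vCenter inventory):

```go
package main

import (
    "fmt"
    "regexp"
)

var isolateLUN = regexp.MustCompile(".*/([^/]+)/?$")

func main() {
    // Hypothetical datastore URL with a trailing slash.
    url := "ds:///vmfs/volumes/5e2f8a0b-ab12cd34/"
    if m := isolateLUN.FindStringSubmatch(url); m != nil {
        fmt.Println(m[1]) // 5e2f8a0b-ab12cd34
    }
}
```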
@@ -539,7 +542,6 @@ func getDatastores(ctx context.Context, e *Endpoint, root *view.ContainerView) (
                 url = info.Url
             }
         }
-        log.Printf("D! [input.vsphere]: DS URL: %s %s", url, r.Name)
         m[r.ExtensibleManagedObject.Reference().Value] = objectRef{
             name: r.Name, ref: r.ExtensibleManagedObject.Reference(), parentRef: r.Parent, altID: url}
     }
@@ -584,10 +586,24 @@ func (e *Endpoint) Collect(ctx context.Context, acc telegraf.Accumulator) error
             }
         }
     }
+
+    // Purge old timestamps from the cache
+    e.hwMarks.Purge()
     return nil
 }
 
 func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, now time.Time, latest time.Time) {
+    maxMetrics := e.Parent.MaxQueryMetrics
+    if maxMetrics < 1 {
+        maxMetrics = 1
+    }
+
+    // Workaround for vCenter weirdness. Cluster metrics seem to count multiple times
+    // when checking query size, so keep it at a low value.
+    // Revisit this when we better understand the reason why vCenter counts it this way!
+    if res.name == "cluster" && maxMetrics > 10 {
+        maxMetrics = 10
+    }
     pqs := make([]types.PerfQuerySpec, 0, e.Parent.MaxQueryObjects)
     metrics := 0
     total := 0
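The clamp added at the top of chunker is a pure function of the configured limit and the resource kind; a sketch with a hypothetical helper name (the plugin inlines this logic):

```go
package main

import "fmt"

// effectiveMaxMetrics is a hypothetical extraction of the clamp in chunker.
func effectiveMaxMetrics(configured int, resName string) int {
    maxMetrics := configured
    if maxMetrics < 1 {
        maxMetrics = 1 // guard against zero or negative MaxQueryMetrics
    }
    // Cluster metrics appear to be counted multiple times against vCenter's
    // query-size limit, so clusters are capped at a conservative 10.
    if resName == "cluster" && maxMetrics > 10 {
        maxMetrics = 10
    }
    return maxMetrics
}

func main() {
    fmt.Println(effectiveMaxMetrics(0, "host"))      // 1
    fmt.Println(effectiveMaxMetrics(256, "host"))    // 256
    fmt.Println(effectiveMaxMetrics(256, "cluster")) // 10
}
```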
@@ -600,7 +616,7 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n
             mr := len(info.metrics)
             for mr > 0 {
                 mc := mr
-                headroom := e.Parent.MaxQueryMetrics - metrics
+                headroom := maxMetrics - metrics
                 if !res.realTime && mc > headroom { // Metric query limit only applies to non-realtime metrics
                     mc = headroom
                 }
@@ -610,10 +626,19 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n
                 MaxSample:  1,
                 MetricId:   info.metrics[fm : fm+mc],
                 IntervalId: res.sampling,
                 Format:     "normal",
             }
 
+            // For non-realtime metrics, we need to look back a few samples in case
+            // the vCenter is late reporting metrics.
             if !res.realTime {
-                pq.StartTime = &latest
+                pq.MaxSample = metricLookback
+            }
+
+            // Look back 3 sampling periods
+            start := latest.Add(time.Duration(-res.sampling) * time.Second * (metricLookback - 1))
+            if !res.realTime {
+                pq.StartTime = &start
                 pq.EndTime = &now
             }
             pqs = append(pqs, pq)
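With MaxSample raised to metricLookback and the start time pushed back metricLookback - 1 sampling periods, each non-realtime query can return up to three samples per series. A worked example of the window arithmetic (the 300 s figure is vSphere's usual shortest historical interval, assumed here):

```go
package main

import (
    "fmt"
    "time"
)

const metricLookback = 3

// lookbackStart mirrors the window computed in chunker: go back
// metricLookback-1 sampling periods from the latest collected sample.
func lookbackStart(latest time.Time, samplingSec int32) time.Time {
    return latest.Add(time.Duration(-samplingSec) * time.Second * (metricLookback - 1))
}

func main() {
    latest := time.Date(2018, 11, 1, 12, 0, 0, 0, time.UTC)
    // With a 300 s interval the query spans 600 s, i.e. up to 3 samples
    // (latest-600s, latest-300s, latest) when MaxSample = metricLookback.
    fmt.Println(lookbackStart(latest, 300)) // 2018-11-01 11:50:00 +0000 UTC
}
```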
@@ -623,8 +648,8 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n
             // We need to dump the current chunk of metrics for one of two reasons:
             // 1) We filled up the metric quota while processing the current resource
             // 2) We are at the last resource and have no more data to process.
-            if mr > 0 || (!res.realTime && metrics >= e.Parent.MaxQueryMetrics) || nRes >= e.Parent.MaxQueryObjects {
-                log.Printf("D! [input.vsphere]: Querying %d objects, %d metrics (%d remaining) of type %s for %s. Processed objects: %d. Total objects %d",
+            if mr > 0 || (!res.realTime && metrics >= maxMetrics) || nRes >= e.Parent.MaxQueryObjects {
+                log.Printf("D! [input.vsphere]: Queueing query: %d objects, %d metrics (%d remaining) of type %s for %s. Processed objects: %d. Total objects %d",
                     len(pqs), metrics, mr, res.name, e.URL.Host, total+1, len(res.objects))
 
                 // To prevent deadlocks, don't send work items if the context has been cancelled.
@@ -646,6 +671,8 @@ func (e *Endpoint) chunker(ctx context.Context, f PushFunc, res *resourceKind, n
     //
     if len(pqs) > 0 {
         // Call push function
+        log.Printf("D! [input.vsphere]: Queuing query: %d objects, %d metrics (0 remaining) of type %s for %s. Total objects %d (final chunk)",
+            len(pqs), metrics, res.name, e.URL.Host, len(res.objects))
         f(ctx, pqs)
     }
 }
@@ -668,7 +695,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc
     log.Printf("D! [input.vsphere]: Latest: %s, elapsed: %f, resource: %s", latest, elapsed, resourceType)
     if !res.realTime && elapsed < float64(res.sampling) {
         // No new data would be available. We're outta here!
-        log.Printf("D! [input.vsphere]: Sampling period for %s of %d has not elapsed for %s",
+        log.Printf("D! [input.vsphere]: Sampling period for %s of %d has not elapsed on %s",
             resourceType, res.sampling, e.URL.Host)
         return nil
     }
@@ -679,7 +706,6 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc
     internalTags := map[string]string{"resourcetype": resourceType}
     sw := NewStopwatchWithTags("gather_duration", e.URL.Host, internalTags)
 
-    log.Printf("D! [input.vsphere]: Start of sample period deemed to be %s", latest)
     log.Printf("D! [input.vsphere]: Collecting metrics for %d objects of type %s for %s",
         len(res.objects), resourceType, e.URL.Host)
 
@@ -690,7 +716,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc
     wp.Run(ctx, func(ctx context.Context, in interface{}) interface{} {
         chunk := in.([]types.PerfQuerySpec)
         n, err := e.collectChunk(ctx, chunk, resourceType, res, acc)
-        log.Printf("D! [input.vsphere]: Query returned %d metrics", n)
+        log.Printf("D! [input.vsphere] CollectChunk for %s returned %d metrics", resourceType, n)
         if err != nil {
             return err
         }
@@ -722,7 +748,7 @@ func (e *Endpoint) collectResource(ctx context.Context, resourceType string, acc
     sw.Stop()
     SendInternalCounterWithTags("gather_count", e.URL.Host, internalTags, count)
     if len(merr) > 0 {
-        return err
+        return merr
     }
     return nil
 }
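This one-line fix matters: merr accumulates errors from all worker chunks, while err at this point only holds the error from the last chunk and is often nil, so the old code could report success despite failures. A minimal reduction of the bug (the multiError name and shape are assumed for illustration, not taken from the plugin):

```go
package main

import (
    "errors"
    "fmt"
    "strings"
)

// multiError is a stand-in for an aggregated error type.
type multiError []error

func (m multiError) Error() string {
    parts := make([]string, len(m))
    for i, e := range m {
        parts[i] = e.Error()
    }
    return strings.Join(parts, "; ")
}

func main() {
    var merr multiError
    var err error // last error seen; stays nil when the final chunk succeeds

    merr = append(merr, errors.New("chunk 1 failed"))
    err = nil // chunk 2 succeeded

    if len(merr) > 0 {
        fmt.Println(err)  // old code: <nil> -- the failure is swallowed
        fmt.Println(merr) // new code: chunk 1 failed
    }
}
```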
@@ -757,6 +783,7 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec,
     if err != nil {
         return count, err
     }
+    log.Printf("D! [input.vsphere] Query for %s returned metrics for %d objects", resourceType, len(ems))
 
     // Iterate through results
     for _, em := range ems {
@@ -783,10 +810,18 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec,
                 }
             e.populateTags(&objectRef, resourceType, &res, t, &v)
 
-            // Now deal with the values
-            for idx, value := range v.Value {
+            // Now deal with the values. Iterate backwards so we start with the latest value
+            tsKey := moid + "|" + name + "|" + v.Instance
+            for idx := len(v.Value) - 1; idx >= 0; idx-- {
                 ts := em.SampleInfo[idx].Timestamp
 
+                // Since non-realtime metrics are queries with a lookback, we need to check the high-water mark
+                // to determine if this should be included. Only samples not seen before should be included.
+                if !(res.realTime || e.hwMarks.IsNew(tsKey, ts)) {
+                    continue
+                }
+                value := v.Value[idx]
+
                 // Organize the metrics into a bucket per measurement.
                 // Data SHOULD be presented to us with the same timestamp for all samples, but in case
                 // they don't we use the measurement name + timestamp as the key for the bucket.
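Reading the new loop: because lookback queries overlap between collection cycles, the same sample can be returned more than once; the per-series high-water mark guarantees each sample is emitted at most once. A self-contained demonstration of the pattern, using a plain map in place of the plugin's TSCache (key layout matches the tsKey in the diff; values are invented):

```go
package main

import (
    "fmt"
    "time"
)

func main() {
    // Stand-in for e.hwMarks: highest timestamp already reported per series.
    hwMarks := map[string]time.Time{}
    isNew := func(key string, ts time.Time) bool {
        v, ok := hwMarks[key]
        return !ok || ts.After(v)
    }

    tsKey := "vm-42|cpu.usage.average|0" // moid|metric name|instance
    t0 := time.Date(2018, 11, 1, 11, 50, 0, 0, time.UTC)
    t1 := t0.Add(5 * time.Minute)

    fmt.Println(isNew(tsKey, t0)) // true: first time this sample is seen
    hwMarks[tsKey] = t0

    // A later collection looks back several periods and receives t0 again;
    // the high-water mark filters it, so only the genuinely new t1 passes.
    fmt.Println(isNew(tsKey, t0)) // false
    fmt.Println(isNew(tsKey, t1)) // true
}
```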
@@ -813,6 +848,11 @@ func (e *Endpoint) collectChunk(ctx context.Context, pqs []types.PerfQuerySpec,
                     bucket.fields[fn] = value
                 }
                 count++
+
+                // Update highwater marks for non-realtime metrics.
+                if !res.realTime {
+                    e.hwMarks.Put(tsKey, ts)
+                }
             }
         }
         // We've iterated through all the metrics and collected buckets for each