Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TemplateInstance metrics update #19133

Merged
merged 2 commits into from
Mar 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 44 additions & 41 deletions pkg/template/controller/metrics.go
Original file line number Diff line number Diff line change
@@ -1,76 +1,79 @@
package controller

import (
"time"

templateapi "github.com/openshift/origin/pkg/template/apis/template"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/labels"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
kapi "k8s.io/kubernetes/pkg/apis/core"
)

var templateInstancesTotal = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "openshift_template_instance_total",
Help: "Counts TemplateInstance objects",
var templateInstanceCompleted = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "openshift_template_instance_completed_total",
Help: "Counts completed TemplateInstance objects by condition",
},
nil,
[]string{"condition"},
)

var templateInstanceStatusCondition = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "openshift_template_instance_status_condition_total",
Help: "Counts TemplateInstance objects by condition type and status",
},
[]string{"type", "status"},
)
func newTemplateInstanceActiveAge() prometheus.Histogram {
// We recreate a new Histogram object every time Collect is called. This is
// because we are recording a series of point-in-time observations about the
// population of "active" TemplateInstances. Were we to use a singleton
// Histogram, we would only be able to observe TemplateInstances as they
// completed, which would add latency in reporting very long-running
// TemplateInstances and completely prevent reporting of non-completing
// TemplateInstances.
//
// Effectively, the resulting series is to Histogram what Gauge is to
// Counter. In the resulting series, _count and _sum are not monotonically
// increasing (because TemplateInstances are no longer part of the
// population once they terminate or are deleted), therefore it is not valid
// to use counter functions such as rate() on this series.

var templateInstancesActiveStartTime = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "openshift_template_instance_active_start_time_seconds",
Help: "Show the start time in unix epoch form of active TemplateInstance objects by namespace and name",
},
[]string{"namespace", "name"},
)
return prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "openshift_template_instance_active_age_seconds",
Help: "Shows the instantaneous age distribution of active TemplateInstance objects",
Buckets: prometheus.LinearBuckets(600, 600, 7),
},
)
}

func (c *TemplateInstanceController) Describe(ch chan<- *prometheus.Desc) {
templateInstancesTotal.Describe(ch)
templateInstanceStatusCondition.Describe(ch)
templateInstancesActiveStartTime.Describe(ch)
templateInstanceActiveAge := newTemplateInstanceActiveAge()

templateInstanceCompleted.Describe(ch)
templateInstanceActiveAge.Describe(ch)
}

func (c *TemplateInstanceController) Collect(ch chan<- prometheus.Metric) {
templateInstanceCompleted.Collect(ch)

now := c.clock.Now()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I forget, was there a reason that unix epoch time was not used here?

Curious about the requirement here vs. what we had to do for the build's active metric.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case, c.clock will be an instance of RealClock which has as its Now implementation:

// Now returns the current time.
func (RealClock) Now() time.Time {
	return time.Now()
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(it was done this way to provide an abstraction for the clock implementation so tests could plug in their own clock impl)


templateInstances, err := c.lister.List(labels.Everything())
if err != nil {
utilruntime.HandleError(err)
return
}

templateInstancesTotal.Reset()
templateInstanceStatusCondition.Reset()
templateInstancesActiveStartTime.Reset()

templateInstancesTotal.WithLabelValues().Set(0)
templateInstanceActiveAge := newTemplateInstanceActiveAge()

nextTemplateInstance:
for _, templateInstance := range templateInstances {
waiting := true

templateInstancesTotal.WithLabelValues().Inc()

for _, cond := range templateInstance.Status.Conditions {
templateInstanceStatusCondition.WithLabelValues(string(cond.Type), string(cond.Status)).Inc()

if cond.Status == kapi.ConditionTrue &&
(cond.Type == templateapi.TemplateInstanceInstantiateFailure || cond.Type == templateapi.TemplateInstanceReady) {
waiting = false
(cond.Type == templateapi.TemplateInstanceInstantiateFailure ||
cond.Type == templateapi.TemplateInstanceReady) {
continue nextTemplateInstance
}
}

if waiting {
templateInstancesActiveStartTime.WithLabelValues(templateInstance.Namespace, templateInstance.Name).Set(float64(templateInstance.CreationTimestamp.Unix()))
}
templateInstanceActiveAge.Observe(float64(now.Sub(templateInstance.CreationTimestamp.Time) / time.Second))
}

templateInstancesTotal.Collect(ch)
templateInstanceStatusCondition.Collect(ch)
templateInstancesActiveStartTime.Collect(ch)
templateInstanceActiveAge.Collect(ch)
}
175 changes: 134 additions & 41 deletions pkg/template/controller/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,42 @@ import (
"time"

templateapi "github.com/openshift/origin/pkg/template/apis/template"
templateclient "github.com/openshift/origin/pkg/template/generated/internalclientset"
"github.com/openshift/origin/pkg/template/generated/internalclientset/fake"
"github.com/openshift/origin/pkg/template/generated/listers/template/internalversion"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/util/workqueue"
kapi "k8s.io/kubernetes/pkg/apis/core"
)

type fakeLister []*templateapi.TemplateInstance
type fakeLister struct {
templateClient templateclient.Interface
}

// List ignores the supplied selector and lists TemplateInstances through the
// wrapped client, using an empty namespace argument. It returns one pointer per
// entry of the client's Items slice, or the client's error unchanged.
func (f *fakeLister) List(labels.Selector) ([]*templateapi.TemplateInstance, error) {
	result, err := f.templateClient.Template().TemplateInstances("").List(metav1.ListOptions{})
	if err != nil {
		return nil, err
	}

	// Index into Items rather than ranging by value so that every pointer
	// refers to the slice element itself and not to a loop-local copy.
	pointers := make([]*templateapi.TemplateInstance, 0, len(result.Items))
	for idx := range result.Items {
		pointers = append(pointers, &result.Items[idx])
	}
	return pointers, nil
}

func (f fakeLister) List(labels.Selector) ([]*templateapi.TemplateInstance, error) {
return f, nil
// Get looks up the TemplateInstance called name through the wrapped client,
// using an empty namespace argument, and returns the client's result and
// error unchanged.
func (f *fakeLister) Get(name string) (*templateapi.TemplateInstance, error) {
	instances := f.templateClient.Template().TemplateInstances("")
	return instances.Get(name, metav1.GetOptions{})
}
func (fakeLister) TemplateInstances(string) internalversion.TemplateInstanceNamespaceLister {
return nil

// TemplateInstances disregards the namespace argument and returns the
// fakeLister itself as the TemplateInstanceNamespaceLister, so namespaced
// List/Get calls are served by the same methods as the top-level lister.
func (f *fakeLister) TemplateInstances(string) internalversion.TemplateInstanceNamespaceLister {
return f
}

type fakeResponseWriter struct {
Expand All @@ -41,58 +60,132 @@ func (f *fakeResponseWriter) WriteHeader(statusCode int) {
}

func TestMetrics(t *testing.T) {
expectedResponse := `# HELP openshift_template_instance_active_start_time_seconds Show the start time in unix epoch form of active TemplateInstance objects by namespace and name
# TYPE openshift_template_instance_active_start_time_seconds gauge
openshift_template_instance_active_start_time_seconds{name="testname",namespace="testnamespace"} 123
# HELP openshift_template_instance_status_condition_total Counts TemplateInstance objects by condition type and status
# TYPE openshift_template_instance_status_condition_total gauge
openshift_template_instance_status_condition_total{status="False",type="Ready"} 1
openshift_template_instance_status_condition_total{status="True",type="Ready"} 1
# HELP openshift_template_instance_total Counts TemplateInstance objects
# TYPE openshift_template_instance_total gauge
openshift_template_instance_total 2
expectedResponse := `# HELP openshift_template_instance_active_age_seconds Shows the instantaneous age distribution of active TemplateInstance objects
# TYPE openshift_template_instance_active_age_seconds histogram
openshift_template_instance_active_age_seconds_bucket{le="600"} 0
openshift_template_instance_active_age_seconds_bucket{le="1200"} 1
openshift_template_instance_active_age_seconds_bucket{le="1800"} 1
openshift_template_instance_active_age_seconds_bucket{le="2400"} 1
openshift_template_instance_active_age_seconds_bucket{le="3000"} 1
openshift_template_instance_active_age_seconds_bucket{le="3600"} 1
openshift_template_instance_active_age_seconds_bucket{le="4200"} 1
openshift_template_instance_active_age_seconds_bucket{le="+Inf"} 1
openshift_template_instance_active_age_seconds_sum 900
openshift_template_instance_active_age_seconds_count 1
# HELP openshift_template_instance_completed_total Counts completed TemplateInstance objects by condition
# TYPE openshift_template_instance_completed_total counter
openshift_template_instance_completed_total{condition="InstantiateFailure"} 2
openshift_template_instance_completed_total{condition="Ready"} 1
`

clock := &fakeClock{now: time.Unix(0, 0)}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah here we are using unix epoch ...


registry := prometheus.NewRegistry()

c := &TemplateInstanceController{
lister: &fakeLister{
{
Status: templateapi.TemplateInstanceStatus{
Conditions: []templateapi.TemplateInstanceCondition{
{
Type: templateapi.TemplateInstanceReady,
Status: kapi.ConditionTrue,
},
fakeTemplateClient := fake.NewSimpleClientset(
// when sync is called on this TemplateInstance it should fail and
// increment openshift_template_instance_completed_total
// {condition="InstantiateFailure"}
&templateapi.TemplateInstance{
ObjectMeta: metav1.ObjectMeta{
Name: "abouttofail",
},
Spec: templateapi.TemplateInstanceSpec{
Template: templateapi.Template{
Objects: []runtime.Object{
&kapi.ConfigMap{},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Namespace: "testnamespace",
Name: "testname",
CreationTimestamp: metav1.Time{
Time: time.Unix(123, 0),
},
// when sync is called on this TemplateInstance it should timeout and
// increment openshift_template_instance_completed_total
// {condition="InstantiateFailure"}
&templateapi.TemplateInstance{
ObjectMeta: metav1.ObjectMeta{
Name: "abouttotimeout",
},
Spec: templateapi.TemplateInstanceSpec{
Template: templateapi.Template{
Objects: []runtime.Object{
&kapi.ConfigMap{},
},
},
Status: templateapi.TemplateInstanceStatus{
Conditions: []templateapi.TemplateInstanceCondition{
{
Type: templateapi.TemplateInstanceReady,
Status: kapi.ConditionFalse,
},
Requester: &templateapi.TemplateInstanceRequester{},
},
Status: templateapi.TemplateInstanceStatus{
Objects: []templateapi.TemplateInstanceObject{
{},
},
},
},
// when sync is called on this TemplateInstance it should succeed and
// increment openshift_template_instance_completed_total
// {condition="Ready"}
&templateapi.TemplateInstance{
ObjectMeta: metav1.ObjectMeta{
Name: "abouttosucceed",
CreationTimestamp: metav1.Time{
Time: clock.now,
},
},
Spec: templateapi.TemplateInstanceSpec{
Template: templateapi.Template{
Objects: []runtime.Object{
&kapi.ConfigMap{},
},
},
Requester: &templateapi.TemplateInstanceRequester{},
},
Status: templateapi.TemplateInstanceStatus{
Objects: []templateapi.TemplateInstanceObject{
{},
},
},
},
// this TemplateInstance is in-flight, not timed out.
&templateapi.TemplateInstance{
ObjectMeta: metav1.ObjectMeta{
CreationTimestamp: metav1.Time{
Time: clock.now.Add(-900 * time.Second),
},
},
Status: templateapi.TemplateInstanceStatus{
Conditions: []templateapi.TemplateInstanceCondition{
{
Type: templateapi.TemplateInstanceReady,
Status: kapi.ConditionFalse,
},
},
},
},
)

c := &TemplateInstanceController{
lister: &fakeLister{fakeTemplateClient},
templateClient: fakeTemplateClient,
clock: clock,
readinessLimiter: &workqueue.BucketRateLimiter{},
}

registry.MustRegister(c)

h := promhttp.HandlerFor(registry, promhttp.HandlerOpts{ErrorHandling: promhttp.PanicOnError})
rw := &fakeResponseWriter{header: http.Header{}}
h.ServeHTTP(rw, &http.Request{})

if rw.String() != expectedResponse {
t.Error(rw.String())
// We loop twice: we expect the metrics response to match after the first
// set of sync calls, and not change after the second set.
for i := 0; i < 2; i++ {
for _, key := range []string{"/abouttofail", "/abouttotimeout", "/abouttosucceed"} {
err := c.sync(key)
if err != nil {
t.Fatal(err)
}
}

rw := &fakeResponseWriter{header: http.Header{}}
h.ServeHTTP(rw, &http.Request{})

if rw.String() != expectedResponse {
t.Errorf("run %d: %s\n", i, rw.String())
}
}
}
2 changes: 1 addition & 1 deletion pkg/template/controller/readiness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ func TestCheckReadiness(t *testing.T) {
groupKind: batch.Kind("Job"),
object: &batch.Job{
Status: batch.JobStatus{
CompletionTime: &metav1.Time{Time: time.Now()},
CompletionTime: &metav1.Time{Time: time.Unix(0, 0)},
},
},
expectedReady: true,
Expand Down
Loading