Skip to content

Commit

Permalink
sdn: make pod operation metrics more useful and collectable
Browse files Browse the repository at this point in the history
The pod operation error metrics were in the wrong place to capture the
overall pod setup/teardown operation. Move them so that they cover the
whole operation.

Next, the labels of the Latency metric (pod namespace, pod name, and
sandbox ID) meant that every observation was a unique metric, so no
statistics could be determined from them in aggregate.  Change that (and
pod errors) to follow the Kubelet dockershim
DockerOperations[Latency|Errors] metric pattern, with a label for the
operation type instead of the sandbox.
  • Loading branch information
dcbw committed Nov 28, 2017
1 parent fe4f498 commit d61ffa1
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 58 deletions.
50 changes: 16 additions & 34 deletions pkg/network/node/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@ const (
OVSFlowsKey = "ovs_flows"
ARPCacheAvailableEntriesKey = "arp_cache_entries"
PodIPsKey = "pod_ips"
PodSetupErrorsKey = "pod_setup_errors"
PodSetupLatencyKey = "pod_setup_latency"
PodTeardownErrorsKey = "pod_teardown_errors"
PodTeardownLatencyKey = "pod_teardown_latency"
PodOperationsErrorsKey = "pod_operations_errors"
PodOperationsLatencyKey = "pod_operations_latency"
VnidNotFoundErrorsKey = "vnid_not_found_errors"

// Pod Operation types
PodOperationSetup = "setup"
PodOperationTeardown = "teardown"
)

var (
Expand Down Expand Up @@ -58,42 +60,24 @@ var (
},
)

PodSetupErrors = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: SDNNamespace,
Subsystem: SDNSubsystem,
Name: PodSetupErrorsKey,
Help: "Number pod setup errors",
},
)

PodSetupLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: SDNNamespace,
Subsystem: SDNSubsystem,
Name: PodSetupLatencyKey,
Help: "Latency of pod network setup in microseconds",
},
[]string{"pod_namespace", "pod_name", "sandbox_id"},
)

PodTeardownErrors = prometheus.NewCounter(
PodOperationsErrors = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: SDNNamespace,
Subsystem: SDNSubsystem,
Name: PodTeardownErrorsKey,
Help: "Number pod teardown errors",
Name: PodOperationsErrorsKey,
Help: "Cumulative number of SDN operation errors by operation type",
},
[]string{"operation_type"},
)

PodTeardownLatency = prometheus.NewSummaryVec(
PodOperationsLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: SDNNamespace,
Subsystem: SDNSubsystem,
Name: PodTeardownLatencyKey,
Help: "Latency of pod network teardown in microseconds",
Name: PodOperationsLatencyKey,
Help: "Latency in microseconds of SDN operations by operation type",
},
[]string{"pod_namespace", "pod_name", "sandbox_id"},
[]string{"operation_type"},
)

VnidNotFoundErrors = prometheus.NewCounter(
Expand Down Expand Up @@ -121,10 +105,8 @@ func RegisterMetrics() {
prometheus.MustRegister(OVSFlows)
prometheus.MustRegister(ARPCacheAvailableEntries)
prometheus.MustRegister(PodIPs)
prometheus.MustRegister(PodSetupErrors)
prometheus.MustRegister(PodSetupLatency)
prometheus.MustRegister(PodTeardownErrors)
prometheus.MustRegister(PodTeardownLatency)
prometheus.MustRegister(PodOperationsErrors)
prometheus.MustRegister(PodOperationsLatency)
prometheus.MustRegister(VnidNotFoundErrors)
})
}
Expand Down
25 changes: 3 additions & 22 deletions pkg/network/node/ovscontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,22 +288,11 @@ func getPodNote(sandboxID string) (string, error) {
}

func (oc *ovsController) SetUpPod(hostVeth, podIP, podMAC, sandboxID string, vnid uint32) (int, error) {
var (
err error
note string
ofport int
)
defer func() {
if err != nil {
PodSetupErrors.Inc()
}
}()

note, err = getPodNote(sandboxID)
note, err := getPodNote(sandboxID)
if err != nil {
return -1, err
}
ofport, err = oc.ensureOvsPort(hostVeth)
ofport, err := oc.ensureOvsPort(hostVeth)
if err != nil {
return -1, err
}
Expand Down Expand Up @@ -422,15 +411,7 @@ func (oc *ovsController) TearDownPod(hostVeth, podIP, sandboxID string) error {
podIP = ip
}

var err error
defer func() {
if err != nil {
PodTeardownErrors.Inc()
}
}()

err = oc.cleanupPodFlows(podIP)
if err != nil {
if err := oc.cleanupPodFlows(podIP); err != nil {
return err
}
_ = oc.SetPodBandwidth(hostVeth, -1, -1)
Expand Down
8 changes: 6 additions & 2 deletions pkg/network/node/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ func (m *podManager) processRequest(request *cniserver.PodRequest) *cniserver.Po
}
}
if err != nil {
PodOperationsErrors.WithLabelValues(PodOperationSetup).Inc()
result.Err = err
}
case cniserver.CNI_UPDATE:
Expand All @@ -329,6 +330,9 @@ func (m *podManager) processRequest(request *cniserver.PodRequest) *cniserver.Po
}
}
result.Err = m.podHandler.teardown(request)
if result.Err != nil {
PodOperationsErrors.WithLabelValues(PodOperationTeardown).Inc()
}
default:
result.Err = fmt.Errorf("unhandled CNI request %v", request.Command)
}
Expand Down Expand Up @@ -543,7 +547,7 @@ func podIsExited(p *kcontainer.Pod) bool {

// Set up all networking (host/container veth, OVS flows, IPAM, loopback, etc)
func (m *podManager) setup(req *cniserver.PodRequest) (cnitypes.Result, *runningPod, error) {
defer PodSetupLatency.WithLabelValues(req.PodNamespace, req.PodName, req.SandboxID).Observe(sinceInMicroseconds(time.Now()))
defer PodOperationsLatency.WithLabelValues(PodOperationSetup).Observe(sinceInMicroseconds(time.Now()))

pod, err := m.kClient.Core().Pods(req.PodNamespace).Get(req.PodName, metav1.GetOptions{})
if err != nil {
Expand Down Expand Up @@ -672,7 +676,7 @@ func (m *podManager) update(req *cniserver.PodRequest) (uint32, error) {

// Clean up all pod networking (clear OVS flows, release IPAM lease, remove host/container veth)
func (m *podManager) teardown(req *cniserver.PodRequest) error {
defer PodTeardownLatency.WithLabelValues(req.PodNamespace, req.PodName, req.SandboxID).Observe(sinceInMicroseconds(time.Now()))
defer PodOperationsLatency.WithLabelValues(PodOperationTeardown).Observe(sinceInMicroseconds(time.Now()))

netnsValid := true
if err := ns.IsNSorErr(req.Netns); err != nil {
Expand Down

0 comments on commit d61ffa1

Please sign in to comment.