From ac0793cf07d34122fe653229bbc5b81b82bb203c Mon Sep 17 00:00:00 2001 From: Ravi Sankar Penta Date: Tue, 12 Dec 2017 20:32:57 -0800 Subject: [PATCH] Fix SDN exponential backoff timeouts - First iteration of wait.ExponentialBackoff() will not wait, so the times will be: 0, a, a * r, a * r^2, ... a * r^(n-2) [a: duration, r: factor, n: steps] Total = a * (r^(n-1) - 1)/(r-1) --- pkg/network/node/runtime.go | 2 +- pkg/network/node/sdn_controller.go | 7 ++++--- pkg/oc/admin/diagnostics/diagnostics/network/run_pod.go | 4 ++-- pkg/oc/admin/diagnostics/diagnostics/network/setup.go | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pkg/network/node/runtime.go b/pkg/network/node/runtime.go index 61c037d7254f..5f71d533a879 100644 --- a/pkg/network/node/runtime.go +++ b/pkg/network/node/runtime.go @@ -23,7 +23,7 @@ func (node *OsdnNode) getRuntimeService() (kubeletapi.RuntimeService, error) { kwait.Backoff{ Duration: 100 * time.Millisecond, Factor: 1.2, - Steps: 23, + Steps: 24, }, func() (bool, error) { runtimeService, err := kubeletremote.NewRemoteRuntimeService(node.runtimeEndpoint, node.runtimeRequestTimeout) diff --git a/pkg/network/node/sdn_controller.go b/pkg/network/node/sdn_controller.go index 4ffc5e4b5f65..a038224d0a44 100644 --- a/pkg/network/node/sdn_controller.go +++ b/pkg/network/node/sdn_controller.go @@ -30,10 +30,10 @@ func (plugin *OsdnNode) getLocalSubnet() (string, error) { // unexpectedly long though, so give it plenty of time before returning an error // (since that will cause the node process to exit). 
backoff := utilwait.Backoff{ - // A bit over 1 minute total + // ~2 mins total Duration: time.Second, Factor: 1.5, - Steps: 8, + Steps: 11, } err := utilwait.ExponentialBackoff(backoff, func() (bool, error) { var err error @@ -106,10 +106,11 @@ func (plugin *OsdnNode) alreadySetUp(localSubnetGatewayCIDR string, clusterNetwo } func deleteLocalSubnetRoute(device, localSubnetCIDR string) { + // ~1 sec total backoff := utilwait.Backoff{ Duration: 100 * time.Millisecond, Factor: 1.25, - Steps: 6, + Steps: 7, } err := utilwait.ExponentialBackoff(backoff, func() (bool, error) { l, err := netlink.LinkByName(device) diff --git a/pkg/oc/admin/diagnostics/diagnostics/network/run_pod.go b/pkg/oc/admin/diagnostics/diagnostics/network/run_pod.go index 77ac964dbace..7dfb38868d3d 100644 --- a/pkg/oc/admin/diagnostics/diagnostics/network/run_pod.go +++ b/pkg/oc/admin/diagnostics/diagnostics/network/run_pod.go @@ -143,7 +143,7 @@ func (d *NetworkDiagnostic) runNetworkDiagnostic() { return } // Wait for network diagnostic pod completion (timeout: ~3 mins) - backoff := wait.Backoff{Steps: 38, Duration: 500 * time.Millisecond, Factor: 1.1} + backoff := wait.Backoff{Steps: 39, Duration: 500 * time.Millisecond, Factor: 1.1} if err := d.waitForNetworkPod(d.nsName1, util.NetworkDiagPodNamePrefix, backoff, []kapi.PodPhase{kapi.PodSucceeded, kapi.PodFailed}); err != nil { d.res.Error("DNet2007", err, err.Error()) return @@ -164,7 +164,7 @@ func (d *NetworkDiagnostic) runNetworkDiagnostic() { } // Wait for network diagnostic pod to start (timeout: ~5 mins) - backoff = wait.Backoff{Steps: 36, Duration: time.Second, Factor: 1.1} + backoff = wait.Backoff{Steps: 37, Duration: time.Second, Factor: 1.1} if err := d.waitForNetworkPod(d.nsName1, util.NetworkDiagPodNamePrefix, backoff, []kapi.PodPhase{kapi.PodRunning, kapi.PodFailed, kapi.PodSucceeded}); err != nil { d.res.Error("DNet2010", err, err.Error()) // Do not bail out here, collect what ever info is available from all valid nodes diff --git 
a/pkg/oc/admin/diagnostics/diagnostics/network/setup.go b/pkg/oc/admin/diagnostics/diagnostics/network/setup.go index a99df4fa6273..5b7330471d86 100644 --- a/pkg/oc/admin/diagnostics/diagnostics/network/setup.go +++ b/pkg/oc/admin/diagnostics/diagnostics/network/setup.go @@ -155,7 +155,7 @@ func (d *NetworkDiagnostic) waitForTestPodAndService(nsList []string) error { errList := []error{} validPhases := []kapi.PodPhase{kapi.PodRunning, kapi.PodSucceeded, kapi.PodFailed} for _, name := range nsList { - backoff := wait.Backoff{Steps: 36, Duration: time.Second, Factor: 1.1} // timeout: ~5 mins + backoff := wait.Backoff{Steps: 37, Duration: time.Second, Factor: 1.1} // timeout: ~5 mins if err := d.waitForNetworkPod(name, util.NetworkDiagTestPodNamePrefix, backoff, validPhases); err != nil { errList = append(errList, err) }