Skip to content

Commit

Permalink
Fix SDN exponential backoff timeouts
Browse files Browse the repository at this point in the history
- First iteration of wait.ExponentialBackoff() will not wait, so
  the times will be:
  0, a, a * r, a * r^2, ..., a * r^(n-2) [a: duration, r: factor, n: steps]
  Total = a * (r^(n-1) - 1) / (r - 1)
  • Loading branch information
Ravi Sankar Penta committed Dec 13, 2017
1 parent 4d15c88 commit ac0793c
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pkg/network/node/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func (node *OsdnNode) getRuntimeService() (kubeletapi.RuntimeService, error) {
kwait.Backoff{
Duration: 100 * time.Millisecond,
Factor: 1.2,
Steps: 23,
Steps: 24,
},
func() (bool, error) {
runtimeService, err := kubeletremote.NewRemoteRuntimeService(node.runtimeEndpoint, node.runtimeRequestTimeout)
Expand Down
7 changes: 4 additions & 3 deletions pkg/network/node/sdn_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ func (plugin *OsdnNode) getLocalSubnet() (string, error) {
// unexpectedly long though, so give it plenty of time before returning an error
// (since that will cause the node process to exit).
backoff := utilwait.Backoff{
// A bit over 1 minute total
// ~2 mins total
Duration: time.Second,
Factor: 1.5,
Steps: 8,
Steps: 11,
}
err := utilwait.ExponentialBackoff(backoff, func() (bool, error) {
var err error
Expand Down Expand Up @@ -106,10 +106,11 @@ func (plugin *OsdnNode) alreadySetUp(localSubnetGatewayCIDR string, clusterNetwo
}

func deleteLocalSubnetRoute(device, localSubnetCIDR string) {
// ~1 sec total
backoff := utilwait.Backoff{
Duration: 100 * time.Millisecond,
Factor: 1.25,
Steps: 6,
Steps: 7,
}
err := utilwait.ExponentialBackoff(backoff, func() (bool, error) {
l, err := netlink.LinkByName(device)
Expand Down
4 changes: 2 additions & 2 deletions pkg/oc/admin/diagnostics/diagnostics/network/run_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ func (d *NetworkDiagnostic) runNetworkDiagnostic() {
return
}
// Wait for network diagnostic pod completion (timeout: ~3 mins)
backoff := wait.Backoff{Steps: 38, Duration: 500 * time.Millisecond, Factor: 1.1}
backoff := wait.Backoff{Steps: 39, Duration: 500 * time.Millisecond, Factor: 1.1}
if err := d.waitForNetworkPod(d.nsName1, util.NetworkDiagPodNamePrefix, backoff, []kapi.PodPhase{kapi.PodSucceeded, kapi.PodFailed}); err != nil {
d.res.Error("DNet2007", err, err.Error())
return
Expand All @@ -164,7 +164,7 @@ func (d *NetworkDiagnostic) runNetworkDiagnostic() {
}

// Wait for network diagnostic pod to start (timeout: ~5 mins)
backoff = wait.Backoff{Steps: 36, Duration: time.Second, Factor: 1.1}
backoff = wait.Backoff{Steps: 37, Duration: time.Second, Factor: 1.1}
if err := d.waitForNetworkPod(d.nsName1, util.NetworkDiagPodNamePrefix, backoff, []kapi.PodPhase{kapi.PodRunning, kapi.PodFailed, kapi.PodSucceeded}); err != nil {
d.res.Error("DNet2010", err, err.Error())
// Do not bail out here, collect what ever info is available from all valid nodes
Expand Down
2 changes: 1 addition & 1 deletion pkg/oc/admin/diagnostics/diagnostics/network/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ func (d *NetworkDiagnostic) waitForTestPodAndService(nsList []string) error {
errList := []error{}
validPhases := []kapi.PodPhase{kapi.PodRunning, kapi.PodSucceeded, kapi.PodFailed}
for _, name := range nsList {
backoff := wait.Backoff{Steps: 36, Duration: time.Second, Factor: 1.1} // timeout: ~5 mins
backoff := wait.Backoff{Steps: 37, Duration: time.Second, Factor: 1.1} // timeout: ~5 mins
if err := d.waitForNetworkPod(name, util.NetworkDiagTestPodNamePrefix, backoff, validPhases); err != nil {
errList = append(errList, err)
}
Expand Down

0 comments on commit ac0793c

Please sign in to comment.