Skip to content

Commit

Permalink
Merge pull request #13550 from kargakis/retry-longer-on-pending-deplo…
Browse files Browse the repository at this point in the history
…yments

Merged by openshift-bot
  • Loading branch information
OpenShift Bot authored Mar 28, 2017
2 parents 959be87 + df79f89 commit dae0037
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 29 deletions.
31 changes: 27 additions & 4 deletions pkg/deploy/controller/deployment/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,21 @@ import (
"github.com/openshift/origin/pkg/util"
)

// maxRetryCount is the maximum number of times the controller will retry errors.
// The first requeue is after 5ms and subsequent requeues grow exponentially.
// This effectively can extend up to 5^10ms which caps to 1000s:
//
// 5ms, 25ms, 125ms, 625ms, 3s, 16s, 78s, 390s, 1000s, 1000s
//
// The most common errors are:
//
// * failure to delete the deployer pods
// * failure to update the replication controller
// * pod may be missing from the cache once the deployment transitions to Pending.
//
// In most cases, we shouldn't need to retry up to maxRetryCount...
const maxRetryCount = 10

// fatalError is an error which can't be retried.
type fatalError string

Expand Down Expand Up @@ -71,10 +86,10 @@ type DeploymentController struct {
recorder record.EventRecorder
}

// Handle processes deployment and either creates a deployer pod or responds
// handle processes a deployment and either creates a deployer pod or responds
// to a terminal deployment status. Since this controller started using caches,
// the provided rc MUST be deep-copied beforehand (see work() in factory.go).
func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) error {
func (c *DeploymentController) handle(deployment *kapi.ReplicationController, willBeDropped bool) error {
// Copy all the annotations from the deployment.
updatedAnnotations := make(map[string]string)
for key, value := range deployment.Annotations {
Expand Down Expand Up @@ -127,6 +142,7 @@ func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) er
nextStatus = deployapi.DeploymentStatusPending
glog.V(4).Infof("Created deployer pod %s for deployment %s", deploymentPod.Name, deployutil.LabelForDeployment(deployment))

// Most likely dead code since we never get an error different from 404 back from the cache.
case deployerErr != nil:
// If the pod already exists, it's possible that a previous CreatePod
// succeeded but the deployment state update failed and now we're re-
Expand Down Expand Up @@ -163,6 +179,11 @@ func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) er
// If the deployment is cancelled here then we deleted the deployer in a previous
// resync of the deployment.
if !deployutil.IsDeploymentCancelled(deployment) {
// Retry more before setting the deployment to Failed if it's Pending - the pod might not have
// appeared in the cache yet.
if !willBeDropped && currentStatus == deployapi.DeploymentStatusPending {
return deployerErr
}
updatedAnnotations[deployapi.DeploymentStatusReasonAnnotation] = deployapi.DeploymentFailedDeployerPodNoLongerExists
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "Failed", fmt.Sprintf("Deployer pod %q has gone missing", deployerPodName))
deployerErr = fmt.Errorf("Failing deployment %q because its deployer pod %q disappeared", deployutil.LabelForDeployment(deployment), deployerPodName)
Expand Down Expand Up @@ -423,13 +444,15 @@ func (c *DeploymentController) handleErr(err error, key interface{}, deployment
return
}

if c.queue.NumRequeues(key) < 2 {
if c.queue.NumRequeues(key) < maxRetryCount {
c.queue.AddRateLimited(key)
return
}

msg := fmt.Sprintf("About to stop retrying %q: %v", deployment.Name, err)
if _, isActionableErr := err.(actionableError); isActionableErr {
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "FailedRetry", fmt.Sprintf("About to stop retrying %s: %v", deployment.Name, err))
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "FailedRetry", msg)
}
glog.V(2).Infof(msg)
c.queue.Forget(key)
}
78 changes: 54 additions & 24 deletions pkg/deploy/controller/deployment/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func TestHandle_createPodOk(t *testing.T) {

controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -211,7 +211,7 @@ func TestHandle_createPodFail(t *testing.T) {

controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)

err := controller.Handle(deployment)
err := controller.handle(deployment, false)
if err == nil {
t.Fatalf("expected an error")
}
Expand Down Expand Up @@ -278,7 +278,7 @@ func TestHandle_deployerPodAlreadyExists(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, test.podPhase)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
Expand Down Expand Up @@ -317,7 +317,7 @@ func TestHandle_unrelatedPodAlreadyExists(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, false, kapi.PodRunning)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -358,7 +358,7 @@ func TestHandle_unrelatedPodAlreadyExistsTestScaled(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, false, kapi.PodRunning)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -416,7 +416,7 @@ func TestHandle_noop(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, test.podPhase)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
Expand Down Expand Up @@ -451,7 +451,7 @@ func TestHandle_failedTest(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, kapi.PodFailed)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -492,7 +492,7 @@ func TestHandle_cleanupPodOk(t *testing.T) {
controller := okDeploymentController(client, deployment, hookPods, true, kapi.PodSucceeded)
hookPods = append(hookPods, deployment.Name)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -537,7 +537,7 @@ func TestHandle_cleanupPodOkTest(t *testing.T) {
controller := okDeploymentController(client, deployment, hookPods, true, kapi.PodSucceeded)
hookPods = append(hookPods, deployment.Name)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -581,7 +581,7 @@ func TestHandle_cleanupPodNoop(t *testing.T) {
pod.Labels[deployapi.DeployerPodForDeploymentLabel] = "unrelated"
controller.podStore.Indexer.Update(pod)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
Expand Down Expand Up @@ -609,7 +609,7 @@ func TestHandle_cleanupPodFail(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, kapi.PodSucceeded)

err := controller.Handle(deployment)
err := controller.handle(deployment, false)
if err == nil {
t.Fatal("expected an actionable error")
}
Expand Down Expand Up @@ -640,7 +640,7 @@ func TestHandle_cancelNew(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, kapi.PodRunning)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -676,7 +676,7 @@ func TestHandle_cleanupNewWithDeployers(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, kapi.PodRunning)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -755,7 +755,7 @@ func TestHandle_cleanupPostNew(t *testing.T) {

controller := okDeploymentController(client, deployment, hookPods, true, test.podPhase)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
Expand All @@ -767,15 +767,27 @@ func TestHandle_cleanupPostNew(t *testing.T) {
}

// TestHandle_deployerPodDisappeared ensures that a pending/running deployment
// is failed when its deployer pod vanishes.
// is failed when its deployer pod vanishes. Ensure that pending deployments
// wont fail instantly on a missing deployer pod because it may take some time
// for it to appear in the pod cache.
func TestHandle_deployerPodDisappeared(t *testing.T) {
tests := []struct {
name string
phase deployapi.DeploymentStatus
name string
phase deployapi.DeploymentStatus
willBeDropped bool
shouldRetry bool
}{
{
name: "pending",
phase: deployapi.DeploymentStatusPending,
name: "pending - retry",
phase: deployapi.DeploymentStatusPending,
willBeDropped: false,
shouldRetry: true,
},
{
name: "pending - fail",
phase: deployapi.DeploymentStatusPending,
willBeDropped: true,
shouldRetry: false,
},
{
name: "running",
Expand All @@ -801,21 +813,39 @@ func TestHandle_deployerPodDisappeared(t *testing.T) {
continue
}
deployment.Annotations[deployapi.DeploymentStatusAnnotation] = string(test.phase)
updatedDeployment = deployment

controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)

if err := controller.Handle(deployment); err != nil {
err = controller.handle(deployment, test.willBeDropped)
if !test.shouldRetry && err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
if test.shouldRetry && err == nil {
t.Errorf("%s: expected an error so that the deployment can be retried, got none", test.name)
continue
}

if !updateCalled {
if !test.shouldRetry && !updateCalled {
t.Errorf("%s: expected update", test.name)
continue
}

if e, a := deployapi.DeploymentStatusFailed, deployutil.DeploymentStatusFor(updatedDeployment); e != a {
t.Errorf("%s: expected deployment status %q, got %q", test.name, e, a)
if test.shouldRetry && updateCalled {
t.Errorf("%s: unexpected update", test.name)
continue
}

gotStatus := deployutil.DeploymentStatusFor(updatedDeployment)
if !test.shouldRetry && deployapi.DeploymentStatusFailed != gotStatus {
t.Errorf("%s: expected deployment status %q, got %q", test.name, deployapi.DeploymentStatusFailed, gotStatus)
continue
}

if test.shouldRetry && deployapi.DeploymentStatusPending != gotStatus {
t.Errorf("%s: expected deployment status %q, got %q", test.name, deployapi.DeploymentStatusPending, gotStatus)
continue
}
}
}
Expand Down Expand Up @@ -921,7 +951,7 @@ func TestHandle_transitionFromDeployer(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, test.podPhase)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
Expand Down
6 changes: 5 additions & 1 deletion pkg/deploy/controller/deployment/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,11 @@ func (c *DeploymentController) work() bool {
return false
}

err = c.Handle(rc)
// Resist missing deployer pods from the cache in case of a pending deployment.
// Give some room for a possible rc update failure in case we decided to mark it
// failed.
willBeDropped := c.queue.NumRequeues(key) >= maxRetryCount-2
err = c.handle(rc, willBeDropped)
c.handleErr(err, key, rc)

return false
Expand Down

0 comments on commit dae0037

Please sign in to comment.