diff --git a/vendor/k8s.io/kubernetes/pkg/controller/job/job_controller.go b/vendor/k8s.io/kubernetes/pkg/controller/job/job_controller.go index abd5bb02484c..1f24e05e4e8e 100644 --- a/vendor/k8s.io/kubernetes/pkg/controller/job/job_controller.go +++ b/vendor/k8s.io/kubernetes/pkg/controller/job/job_controller.go @@ -48,6 +48,8 @@ import ( "github.com/golang/glog" ) +const statusUpdateRetries = 3 + // controllerKind contains the schema.GroupVersionKind for this controller type. var controllerKind = batch.SchemeGroupVersion.WithKind("Job") @@ -357,10 +359,10 @@ func (jm *JobController) enqueueController(obj interface{}, immediate bool) { return } - if immediate { - jm.queue.Forget(key) + backoff := time.Duration(0) + if !immediate { + backoff = getBackoff(jm.queue, key) } - backoff := getBackoff(jm.queue, key) // TODO: Handle overlapping controllers better. Either disallow them at admission time or // deterministically avoid syncing controllers that fight over pods. Currently, we only @@ -496,7 +498,11 @@ func (jm *JobController) syncJob(key string) (bool, error) { var failureMessage string jobHaveNewFailure := failed > job.Status.Failed - exceedsBackoffLimit := jobHaveNewFailure && (int32(previousRetry)+1 > *job.Spec.BackoffLimit) + // new failures happen when status does not reflect the failures and active + // is different than parallelism, otherwise the previous controller loop + // failed updating status so even if we pick up failure it is not a new one + exceedsBackoffLimit := jobHaveNewFailure && (active != *job.Spec.Parallelism) && + (int32(previousRetry)+1 > *job.Spec.BackoffLimit) if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) { // check if the number of pod restart exceeds backoff (for restart OnFailure only) @@ -814,7 +820,20 @@ func (jm *JobController) manageJob(activePods []*v1.Pod, succeeded int32, job *b } func (jm *JobController) updateJobStatus(job *batch.Job) error { - _, err := jm.kubeClient.BatchV1().Jobs(job.Namespace).UpdateStatus(job) + jobClient := jm.kubeClient.BatchV1().Jobs(job.Namespace) + var err error + for i := 0; i <= statusUpdateRetries; i = i + 1 { + var newJob *batch.Job + newJob, err = jobClient.Get(job.Name, metav1.GetOptions{}) + if err != nil { + break + } + newJob.Status = job.Status + if _, err = jobClient.UpdateStatus(newJob); err == nil { + break + } + } + return err } diff --git a/vendor/k8s.io/kubernetes/pkg/controller/job/job_controller_test.go b/vendor/k8s.io/kubernetes/pkg/controller/job/job_controller_test.go index 466abd703b13..4b7b86999ee1 100644 --- a/vendor/k8s.io/kubernetes/pkg/controller/job/job_controller_test.go +++ b/vendor/k8s.io/kubernetes/pkg/controller/job/job_controller_test.go @@ -412,8 +412,8 @@ func TestSyncJobPastDeadline(t *testing.T) { }, "activeDeadlineSeconds with backofflimit reach": { 1, 1, 1, 10, 0, - 1, 0, 2, - true, 1, 0, 0, 3, "BackoffLimitExceeded", + 0, 0, 1, + true, 0, 0, 0, 1, "BackoffLimitExceeded", }, } diff --git a/vendor/k8s.io/kubernetes/test/e2e/apps/job.go b/vendor/k8s.io/kubernetes/test/e2e/apps/job.go index 8543c29d599a..7fdd2f59440e 100644 --- a/vendor/k8s.io/kubernetes/test/e2e/apps/job.go +++ b/vendor/k8s.io/kubernetes/test/e2e/apps/job.go @@ -179,7 +179,8 @@ var _ = SIGDescribe("Job", func() { It("should exceed backoffLimit", func() { By("Creating a job") - job := framework.NewTestJob("fail", "backofflimit", v1.RestartPolicyNever, 1, 1, nil, 0) + backoff := 1 + job := framework.NewTestJob("fail", "backofflimit", v1.RestartPolicyNever, 1, 1, nil, int32(backoff)) job, err := framework.CreateJob(f.ClientSet, f.Namespace.Name, job) Expect(err).NotTo(HaveOccurred()) By("Ensuring job exceed backofflimit") @@ -187,11 +188,18 @@ var _ = SIGDescribe("Job", func() { err = framework.WaitForJobFailure(f.ClientSet, f.Namespace.Name, job.Name, framework.JobTimeout, "BackoffLimitExceeded") Expect(err).NotTo(HaveOccurred()) - By("Checking that only one pod created and status is failed") + By(fmt.Sprintf("Checking that %d pod created and status is failed", backoff+1)) pods, err := framework.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name) Expect(err).NotTo(HaveOccurred()) - Expect(pods.Items).To(HaveLen(1)) - pod := pods.Items[0] - Expect(pod.Status.Phase).To(Equal(v1.PodFailed)) + // Expect(pods.Items).To(HaveLen(backoff + 1)) + // due to NumRequeus not being stable enough, especially with failed status + // updates we need to allow more than backoff+1 + // TODO revert this back to above when https://github.com/kubernetes/kubernetes/issues/64787 gets fixed + if len(pods.Items) < backoff+1 { + framework.Failf("Not enough pod created expected at least %d, got %#v", backoff+1, pods.Items) + } + for _, pod := range pods.Items { + Expect(pod.Status.Phase).To(Equal(v1.PodFailed)) + } }) })