Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1596440 - surface OOMKilled pod to build #20297

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/build/apis/build/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,9 @@ const (
// range of build failures.
StatusReasonGenericBuildFailed StatusReason = "GenericBuildFailed"

// StatusReasonOutOfMemoryKilled indicates that the build pod was killed for its memory consumption
StatusReasonOutOfMemoryKilled StatusReason = "OutOfMemoryKilled"

// StatusCannotRetrieveServiceAccount is the reason associated with a failure
// to look up the service account associated with the BuildConfig.
StatusReasonCannotRetrieveServiceAccount StatusReason = "CannotRetrieveServiceAccount"
Expand All @@ -540,6 +543,7 @@ const (
StatusMessageNoBuildContainerStatus = "The pod for this build has no container statuses indicating success or failure."
StatusMessageFailedContainer = "The pod for this build has at least one container with a non-zero exit status."
StatusMessageGenericBuildFailed = "Generic Build failure - check logs for details."
StatusMessageOutOfMemoryKilled = "The build pod was killed due to an out of memory condition."
StatusMessageUnresolvableEnvironmentVariable = "Unable to resolve build environment variable reference."
StatusMessageCannotRetrieveServiceAccount = "Unable to look up the service account secrets for this build."
)
Expand Down
26 changes: 25 additions & 1 deletion pkg/build/controller/build/build_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1001,7 +1001,9 @@ func (bc *BuildController) handleActiveBuild(build *buildapi.Build, pod *v1.Pod)
}
}
case v1.PodFailed:
if build.Status.Phase != buildapi.BuildPhaseFailed {
if isOOMKilled(pod) {
update = transitionToPhase(buildapi.BuildPhaseFailed, buildapi.StatusReasonOutOfMemoryKilled, buildapi.StatusMessageOutOfMemoryKilled)
} else if build.Status.Phase != buildapi.BuildPhaseFailed {
// If a DeletionTimestamp has been set, it means that the pod will
// soon be deleted. The build should be transitioned to the Error phase.
if pod.DeletionTimestamp != nil {
Expand All @@ -1014,11 +1016,33 @@ func (bc *BuildController) handleActiveBuild(build *buildapi.Build, pod *v1.Pod)
return update, nil
}

func isOOMKilled(pod *v1.Pod) bool {
if pod.Status.Reason == "OOMKilled" {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you have constant for this right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought since that constant is in builds context, it shouldn't be used to check the status reason for pods (they currently have the same string describing it but potentially this could change, maybe?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah technically they are two different constants. One is the "build failure reason" and the other is the "pod status reason" so i agree that keeping them as separate constants makes sense.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's too bad there's no k8s constant for this that we can reference.

return true
}
for _, c := range pod.Status.InitContainerStatuses {
terminated := c.State.Terminated
if terminated != nil && terminated.Reason == "OOMKilled" {
return true
}
}
for _, c := range pod.Status.ContainerStatuses {
terminated := c.State.Terminated
if terminated != nil && terminated.Reason == "OOMKilled" {
return true
}
}
return false
}

// handleCompletedBuild will only be called on builds that are already in a terminal phase. It is used to setup the
// completion timestamp and failure logsnippet as needed.
func (bc *BuildController) handleCompletedBuild(build *buildapi.Build, pod *v1.Pod) (*buildUpdate, error) {

update := &buildUpdate{}
if isOOMKilled(pod) {
update = transitionToPhase(buildapi.BuildPhaseFailed, buildapi.StatusReasonOutOfMemoryKilled, buildapi.StatusMessageOutOfMemoryKilled)
}
setBuildCompletionData(build, pod, update)

return update, nil
Expand Down
6 changes: 5 additions & 1 deletion pkg/build/registry/build/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,11 @@ func (strategy) PrepareForUpdate(ctx context.Context, obj, old runtime.Object) {
// of the reason and message. This is to prevent the build controller from
// overwriting the reason and message that was set by the builder pod
// when it updated the build's details.
if oldBuild.Status.Phase == buildapi.BuildPhaseFailed {
// Only allow OOMKilled override because various processes in a container
// can get OOMKilled and this confuses builder to prematurely populate
// failure reason
if oldBuild.Status.Phase == buildapi.BuildPhaseFailed &&
newBuild.Status.Reason != buildapi.StatusReasonOutOfMemoryKilled {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bparees I think the API was overriding the reason and message here. With this changed, I don't get any flakes testing this (but both reason and message updates after being set from builder.go)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so without this change the reason was never getting set to OOM, or it only sometimes getting set?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

most of the time it was setting OOM, only sometimes, when builder.go set it first, it wasn't

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well, it's an ugly hack, but i don't have a better idea at the moment :)

newBuild.Status.Reason = oldBuild.Status.Reason
newBuild.Status.Message = oldBuild.Status.Message
}
Expand Down
27 changes: 27 additions & 0 deletions test/extended/builds/failure_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ var _ = g.Describe("[Feature:Builds][Slow] update failure status", func() {
fetchDockerSrc = exutil.FixturePath("testdata", "builds", "statusfail-fetchsourcedocker.yaml")
fetchS2ISrc = exutil.FixturePath("testdata", "builds", "statusfail-fetchsources2i.yaml")
badContextDirS2ISrc = exutil.FixturePath("testdata", "builds", "statusfail-badcontextdirs2i.yaml")
oomkilled = exutil.FixturePath("testdata", "builds", "statusfail-oomkilled.yaml")
builderImageFixture = exutil.FixturePath("testdata", "builds", "statusfail-fetchbuilderimage.yaml")
pushToRegistryFixture = exutil.FixturePath("testdata", "builds", "statusfail-pushtoregistry.yaml")
failedAssembleFixture = exutil.FixturePath("testdata", "builds", "statusfail-failedassemble.yaml")
Expand Down Expand Up @@ -130,6 +131,32 @@ var _ = g.Describe("[Feature:Builds][Slow] update failure status", func() {
})
})

g.Describe("Build status OutOfMemoryKilled", func() {
g.It("should contain OutOfMemoryKilled failure reason and message", func() {
err := oc.Run("create").Args("-f", oomkilled).Execute()
o.Expect(err).NotTo(o.HaveOccurred())

br, err := exutil.StartBuildAndWait(oc, "statusfail-oomkilled", "--build-loglevel=5")
o.Expect(err).NotTo(o.HaveOccurred())
br.AssertFailure()
br.DumpLogs()

var build *buildapi.Build
wait.PollImmediate(200*time.Millisecond, 30*time.Second, func() (bool, error) {
build, err = oc.BuildClient().Build().Builds(oc.Namespace()).Get(br.Build.Name, metav1.GetOptions{})
if build.Status.Reason != buildapi.StatusReasonOutOfMemoryKilled {
return false, nil
}
return true, nil
})
o.Expect(err).NotTo(o.HaveOccurred())
o.Expect(build.Status.Reason).To(o.Equal(buildapi.StatusReasonOutOfMemoryKilled))
o.Expect(build.Status.Message).To(o.Equal(buildapi.StatusMessageOutOfMemoryKilled))

exutil.CheckForBuildEvent(oc.KubeClient().Core(), br.Build, buildapi.BuildFailedEventReason, buildapi.BuildFailedEventMessage)
})
})

g.Describe("Build status S2I bad context dir failure", func() {
g.It("should contain the S2I bad context dir failure reason and message", func() {
err := oc.Run("create").Args("-f", badContextDirS2ISrc).Execute()
Expand Down
37 changes: 37 additions & 0 deletions test/extended/testdata/bindata.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions test/extended/testdata/builds/statusfail-oomkilled.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
kind: BuildConfig
apiVersion: v1
metadata:
name: statusfail-oomkilled
spec:
resources:
limits:
memory: 10Mi
source:
git:
uri: "https://github.com/openshift/ruby-hello-world"
strategy:
type: Source
sourceStrategy:
from:
kind: DockerImage
name: centos/ruby-23-centos7:latest