Skip to content

Commit

Permalink
update
Browse files: browse the repository at this point in the history
Signed-off-by: MohammedAbdi <[email protected]>
  • Loading branch information
mamy-CS committed Feb 10, 2025
1 parent b575ef0 commit 6ac8831
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 13 deletions.
8 changes: 5 additions & 3 deletions config/crd/bases/inference.redhat.com_instaslices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,6 @@ spec:
gpuUUID:
description: gpuUUID represents gpu uuid of selected gpu
type: string
isMetricProcessed:
description: to prevent double metric incrementation
type: boolean
memory:
description: memory represents amount of memory requested by
user workload
Expand Down Expand Up @@ -189,6 +186,11 @@ spec:
It provides runtime information about the resource, such as whether
allocations have been processed.
properties:
isMetricProcessed:
type: boolean
observedGeneration:
format: int64
type: integer
processed:
description: processed represents state of the instaslice object after
daemonset creation
Expand Down
6 changes: 5 additions & 1 deletion internal/controller/instaslice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,11 @@ func calculateUsedSlotsForGPU(instaslice inferencev1alpha1.Instaslice, nodeName,
usedSlots := int32(0)
for _, allocation := range instaslice.Spec.Allocations {
if allocation.Nodename == nodeName && allocation.GPUUUID == gpuID {
usedSlots += allocation.Size
if allocation.Size == 8 { // handle for 7g.40gb profile
usedSlots += 7
} else {
usedSlots += allocation.Size
}
}
}
return usedSlots
Expand Down
5 changes: 5 additions & 0 deletions internal/controller/instaslice_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1016,6 +1016,11 @@ var _ = Describe("Metrics Incrementation", func() {
Expect(instaslice.Status.ObservedGeneration).To(Equal(int64(1)))
Expect(instaslice.Status.IsMetricProcessed).To(BeTrue()) // Ensure metrics are marked as processed

// Check cleanup of incompatible profiles
r.UpdateCompatibleProfilesMetrics(*instaslice, "node-1", map[string]int32{"gpu-1": 0})
Expect(instasliceMetrics.compatibleProfiles.WithLabelValues("1g.5gb", "node-1", "0")).NotTo(BeNil())
Expect(instasliceMetrics.compatibleProfiles.WithLabelValues("2g.10gb", "node-1", "0")).NotTo(BeNil())

// Simulate spec update
instaslice.Generation = 3
r.updateMetrics(ctx, inferencev1alpha1.InstasliceList{Items: []inferencev1alpha1.Instaslice{*instaslice}})
Expand Down
12 changes: 3 additions & 9 deletions internal/controller/prometheus_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,6 @@ func (r *InstasliceReconciler) UpdatePendingSliceRequests(count uint32) error {

// UpdateCompatibleProfilesMetrics updates metrics based on remaining GPU slices and calculates compatible profiles dynamically
func (r *InstasliceReconciler) UpdateCompatibleProfilesMetrics(instasliceObj inferencev1alpha1.Instaslice, nodeName string, remainingSlices map[string]int32) error {
totalRemaining := int32(0)
for _, remaining := range remainingSlices {
totalRemaining += remaining
}
// Reset compatible profiles
instasliceMetrics.compatibleProfiles.Reset()
ctrl.Log.Info("Reset compatible profiles metric")

// profile map with fixed indexes for prometheus
// example for A100
// {
Expand Down Expand Up @@ -178,6 +170,8 @@ func (r *InstasliceReconciler) UpdateCompatibleProfilesMetrics(instasliceObj inf
// Special-case 7g.40gb: it fits exactly once when 7 slices remain, and not at all when more than 7 are reported.
if profileName == "7g.40gb" && remaining == 7 {
gpuFit = 1
} else if profileName == "7g.40gb" && remaining > 7 {
gpuFit = 0
}

// **Accumulate per-GPU fit counts**
Expand All @@ -196,7 +190,7 @@ func (r *InstasliceReconciler) UpdateCompatibleProfilesMetrics(instasliceObj inf
for profileName := range baseProfileSliceMap { // baseProfileSliceMap contains all possible profiles
if _, exists := currentProfiles[profileName]; !exists {
// Profile is no longer compatible; set its value to 0
instasliceMetrics.compatibleProfiles.WithLabelValues(profileName, nodeName, fmt.Sprintf("%d", totalRemaining)).Set(0)
instasliceMetrics.compatibleProfiles.WithLabelValues(profileName, nodeName, fmt.Sprintf("%d", 0)).Set(0)
ctrl.Log.Info("Removed incompatible profile", "profile", profileName, "nodeName", nodeName)
}
}
Expand Down

0 comments on commit 6ac8831

Please sign in to comment.