Skip to content

Commit

Permalink
nits
Browse files Browse the repository at this point in the history
Signed-off-by: MohammedAbdi <[email protected]>
  • Loading branch information
mamy-CS committed Jan 28, 2025
1 parent 74932f6 commit 8aaef23
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 20 deletions.
10 changes: 4 additions & 6 deletions internal/controller/instaslice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -449,11 +449,11 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{}, nil
}

// If no allocations exist, update metrics with all slots free
// updateMetricsAllSlotsFree - If no allocations exist, update metrics with all slots free
func (r *InstasliceReconciler) updateMetricsAllSlotsFree(ctx context.Context, instasliceList inferencev1alpha1.InstasliceList) {
log := logr.FromContext(ctx)
for _, instaslice := range instasliceList.Items {
remainingSlotsPerGPU := map[string]uint32{}
remainingSlotsPerGPU := map[string]int32{}
for gpuID := range instaslice.Spec.MigGPUUUID {
nodeName := instaslice.Name
if instaslice.Spec.Allocations == nil || len(instaslice.Spec.Allocations) == 0 {
Expand All @@ -472,7 +472,7 @@ func (r *InstasliceReconciler) updateMetricsAllSlotsFree(ctx context.Context, in
}
}

// updates UpdateGpuSliceMetrics and UpdateCompatibleProfilesMetrics
// updateMetrics - updates UpdateDeployedPodTotalMetrics, UpdateGpuSliceMetrics and UpdateCompatibleProfilesMetrics
func (r *InstasliceReconciler) updateMetrics(ctx context.Context, instasliceList inferencev1alpha1.InstasliceList) {
log := logr.FromContext(ctx)
for _, instaslice := range instasliceList.Items {
Expand All @@ -484,12 +484,10 @@ func (r *InstasliceReconciler) updateMetrics(ctx context.Context, instasliceList
if instaslice.Spec.Allocations == nil || len(instaslice.Spec.Allocations) == 0 {
log.Info("No allocations found, resetting GPU slice metrics", "node", nodeName, "gpuID", gpuID)
totalSlots, err := r.getTotalGpuSlotsForGPU(instaslice, gpuID)
log.Info("Total slots", totalSlots)
if err != nil {
log.Error(err, "Failed to determine total GPU slots for GPU without allocations", "gpuID", gpuID)
continue
}
log.Info("Updating GPU slice metrics", "node", nodeName, "gpuID", gpuID, "used", 0, "free", totalSlots)
if err := r.UpdateGpuSliceMetrics(nodeName, gpuID, 0, totalSlots); err != nil {
log.Error(err, "Failed to update GPU slice metrics for unallocated GPU", "nodeName", nodeName, "gpuID", gpuID)
}
Expand Down Expand Up @@ -566,7 +564,7 @@ func (r *InstasliceReconciler) getTotalGpuSlotsForGPU(instaslice inferencev1alph
return slotsPerGPU, nil
}

// create the DaemonSet object
// createInstaSliceDaemonSet - create the DaemonSet object
func (r *InstasliceReconciler) createInstaSliceDaemonSet(namespace string) *appsv1.DaemonSet {
emulatorMode := r.Config.EmulatorModeEnable
instasliceDaemonsetImage := r.Config.DaemonsetImage
Expand Down
24 changes: 10 additions & 14 deletions internal/controller/prometheus_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ var (
// Total number of GPU slices
GpuSliceTotal: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "instaslice_gpu_slices_total",
Help: "Total number of GPU slices utilized/ free per gpu in a node.",
Help: "Total number of GPU slices utilized and free per gpu in a node.",
},
[]string{"node", "gpu_id", "slot_status"}), // Labels: node, GPU ID, slot status.
// Total number of GPU slices
// Current deployed pod total
deployedPodTotal: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "instaslice_deployed_pod_total",
Help: "Pods that are deployed on slices.",
Name: "instaslice_current_deployed_pod_total",
Help: "Pods that are deployed currently on slices.",
},
[]string{"node", "gpu_id", "namespace", "podname", "profile"}), // Labels: node, GPU ID, namespace, podname, profile
// Pending GPU slice requests
Expand All @@ -71,16 +71,16 @@ var (
}),
// compatible profiles with remaining gpu slices
compatibleProfiles: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "instaslice_gpu_compatible_profiles",
Name: "instaslice_current_gpu_compatible_profiles",
Help: "Profiles compatible with remaining GPU slices.",
},
[]string{"profile", "node", "remaining_slices"}),
// processed slices
[]string{"profile", "node", "remaining_slices"}), // Labels: profile, node, remaining slices
// total processed slices
processedSlices: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "instaslice_total_processed_gpu_slices",
Help: "Number of total processed GPU slices.",
},
[]string{"node", "gpu_id"}),
[]string{"node", "gpu_id"}), // Labels: node, GPU ID
}

prometheusRegistry *prometheus.Registry
Expand All @@ -107,10 +107,6 @@ func (r *InstasliceReconciler) UpdateGpuSliceMetrics(nodeName, gpuID string, use
nodeName, gpuID, "used").Set(float64(usedSlots))
instasliceMetrics.GpuSliceTotal.WithLabelValues(
nodeName, gpuID, "free").Set(float64(freeSlots))
// log check
// ctrl.Log.Info("GpuSliceTotal metric updated",
// "node", nodeName, "gpuID", gpuID, "used", usedSlots,
// "value", instasliceMetrics.GpuSliceTotal.WithLabelValues(nodeName, gpuID, namespace, podname, profile, "used").Desc())
ctrl.Log.Info(fmt.Sprintf("[UpdateGpuSliceMetrics] Updated GPU Slices: %d used slot/s, %d freeslot for node -> %v, GPUID -> %v", usedSlots, freeSlots, nodeName, gpuID)) // trace
return nil
}
Expand Down Expand Up @@ -140,6 +136,7 @@ func (r *InstasliceReconciler) UpdateCompatibleProfilesMetrics(instasliceObj inf
instasliceMetrics.compatibleProfiles.Reset()
ctrl.Log.Info("Reset compatible profiles metric")

// profile map with fixed indexes for Prometheus
recommendedProfileMap := map[string]int{
"1g.5gb": 1,
"1g.10gb": 2,
Expand Down Expand Up @@ -171,7 +168,7 @@ func (r *InstasliceReconciler) UpdateCompatibleProfilesMetrics(instasliceObj inf
if size <= remaining {
currentProfiles[profileName] = struct{}{}
instasliceMetrics.compatibleProfiles.WithLabelValues(profileName, nodeName, fmt.Sprintf("%d", totalRemaining)).Set(float64(recommendedProfileMap[profileName])) // Indicate compatibility
ctrl.Log.Info("Added compatible profile", "profile", profileName, "size", size, "gpuID", gpuID, "remainingSlices", totalRemaining)
ctrl.Log.Info("[UpdateCompatibleProfilesMetrics] Added compatible profile", "profile", profileName, "size", size, "gpuID", gpuID, "remainingSlices", totalRemaining)
break
}
}
Expand Down Expand Up @@ -228,7 +225,6 @@ func (r *InstasliceReconciler) getPendingGpuRequests(ctx context.Context, client
}
}
}

return pendingSlices, nil
}

Expand Down

0 comments on commit 8aaef23

Please sign in to comment.