openshift · mamy-CS · Dec 16, 2024
diff --git a/api/v1alpha1/instaslice_types.go b/api/v1alpha1/instaslice_types.go
@@ -182,6 +182,9 @@ type InstasliceStatus struct {
 	// nodeResources specifies the discovered resources of the node
 	// +optional
 	NodeResources DiscoveredNodeResources `json:"nodeResources"`
+	// ObservedGeneration tracks the latest generation of the resource that has been observed and acted upon by the controller
+	// +optional
+	ObservedGeneration int64 `json:"observedGeneration,omitempty"`
 }
 
 //+kubebuilder:object:root=true

diff --git a/cmd/controller/main.go b/cmd/controller/main.go
@@ -58,12 +58,31 @@ func init() {
 	//+kubebuilder:scaffold:scheme
 }
 
+// getEnv looks up key in the process environment and returns its value.
+// defval is returned only when the variable is not set at all; a variable
+// that is set to the empty string is reported as present by os.LookupEnv,
+// so "" is returned in that case.
+func getEnv(key, defval string) string {
+	envValue, found := os.LookupEnv(key)
+	if !found {
+		return defval
+	}
+	return envValue
+}
+
 func main() {
+	// Log info before initializing metrics exporter
+	ctrl.Log.Info("[SetupWithManager] Initializing Metrics Exporter.")
+	controller.RegisterMetrics()
+	// Log info after the metrics exporter is initialized
+	ctrl.Log.Info("[SetupWithManager] Metrics Exporter Initialized.")
+	var instaslicePrometheusMetricsUrl string = "http://0.0.0.0:8443"
+
+	// NOTE: the metrics URL can be set via env var or flag; the env value is only the flag's default, so an explicit flag takes precedence
+	instaslicePrometheusMetricsUrlEnv := getEnv("INSTASLICE-PROMETHEUS-METRICS-URL", instaslicePrometheusMetricsUrl)
+
 	var metricsAddr string
 	var enableLeaderElection bool
 	var probeAddr string
 	var secureMetrics bool
 	var enableHTTP2 bool
+	flag.StringVar(&instaslicePrometheusMetricsUrl, "instaslice-prometheus-metrics-url", instaslicePrometheusMetricsUrlEnv,
+		"The URL for the Prometheus metrics where Instaslice exposes metrics")
 	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
 	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
 	flag.BoolVar(&enableLeaderElection, "leader-elect", false,

diff --git a/config/crd/bases/inference.redhat.com_instaslices.yaml b/config/crd/bases/inference.redhat.com_instaslices.yaml
@@ -324,6 +324,11 @@ spec:
                 - nodeGpus
                 - nodeResources
                 type: object
+              observedGeneration:
+                description: ObservedGeneration tracks the latest generation of the
+                  resource that has been observed and acted upon by the controller
+                format: int64
+                type: integer
               podAllocationResults:
                 additionalProperties:
                   properties:

diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml
@@ -78,6 +78,7 @@ spec:
           capabilities:
             drop:
             - "ALL"
+          runAsUser: 1000
         livenessProbe:
           httpGet:
             path: /healthz
@@ -104,5 +105,7 @@ spec:
             value: <IMG_DMST>
           - name: EMULATOR_MODE
             value: "false"
+          - name: INSTASLICE-PROMETHEUS-METRICS-URL
+            value: "http://0.0.0.0:8443"
       serviceAccountName: controller-manager
       terminationGracePeriodSeconds: 10
diff --git a/deploy/instaslice-metrics-service.yaml b/deploy/instaslice-metrics-service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: instaslice-metrics
+  namespace: instaslice-system
+  labels:
+    control-plane: controller-manager
+spec:
+  ports:
+    - name: metrics
+      port: 8443
+      protocol: TCP
+      targetPort: 8443
+  selector:
+    control-plane: controller-manager  # Must match the labels on the controller-manager pods
diff --git a/deploy/instaslice-servicemonitor.yaml b/deploy/instaslice-servicemonitor.yaml
@@ -0,0 +1,23 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: instaslice-monitor
+  namespace: instaslice-monitoring
+  labels:
+    release: prometheus  # Label to match Prometheus serviceMonitorSelector
+spec:
+  selector:
+    matchLabels:
+      control-plane: controller-manager # Match labels of the Service exposing metrics
+  namespaceSelector:
+    matchNames:
+      - instaslice-system  # Namespace where the Service resides
+  endpoints:
+    - port: metrics  # Port name exposed in the Service for kube-rbac-proxy
+      interval: 15s
+      path: /metrics
+      scheme: https
+      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token  # Prometheus authentication
+      honorLabels: true
+      tlsConfig:
+        insecureSkipVerify: true  # Set to false if using a valid CA
diff --git a/deploy/prometheus-role.yaml b/deploy/prometheus-role.yaml
@@ -0,0 +1,16 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: prometheus-metrics-reader
+  namespace: instaslice-system
+rules:
+  - apiGroups: [""]
+    resources: ["services", "endpoints", "pods"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["metrics.k8s.io"]
+    resources: ["pods", "nodes"]
+    verbs: ["get", "list"]
+  - apiGroups: ["authorization.k8s.io"]
+    resources: ["subjectaccessreviews"]
+    verbs: ["create"]
+
diff --git a/deploy/prometheus-rolebinding.yaml b/deploy/prometheus-rolebinding.yaml
@@ -0,0 +1,13 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: prometheus-metrics-binding
+  namespace: instaslice-system
+subjects:
+- kind: ServiceAccount
+  name: prometheus-kube-prometheus-prometheus  # Change this to your Prometheus ServiceAccount
+  namespace: instaslice-monitoring  # Change to Prometheus namespace
+roleRef:
+  kind: Role
+  name: prometheus-metrics-reader
+  apiGroup: rbac.authorization.k8s.io
diff --git a/deploy/prometheus-serviceaccount.yaml b/deploy/prometheus-serviceaccount.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+  namespace: instaslice-monitoring  # namespace where Prometheus is running
diff --git a/deploy/prometheus.yaml b/deploy/prometheus.yaml
@@ -0,0 +1,36 @@
+alertmanager:
+    enabled: false
+kube-state-metrics:
+    enabled: false
+prometheus-node-exporter:
+    enabled: false
+prometheus-pushgateway:
+    enabled: false
+server:
+    name: instaslice
+    service:
+        enabled: true
+        type: NodePort
+        servicePort: 9090
+    persistentVolume:
+        existingClaim: prometheus-instaslice
+        enabled: false
+    securityContext:
+        runAsUser:
+        runAsNonRoot:
+        runAsGroup:
+        fsGroup:
+extraScrapeConfigs: |
+    - job_name: instaslice-metrics
+      honor_labels: true
+      metrics_path: /metrics
+      scheme: https
+      scrape_interval: 15s
+      static_configs:
+          - targets:
+              - instaslice-metrics.instaslice-system.svc.cluster.local:8443
+      tls_config:
+        insecure_skip_verify: true
+serviceMonitorSelector:
+    matchLabels:
+        release: prometheus
diff --git a/internal/controller/constants.go b/internal/controller/constants.go
@@ -37,8 +37,15 @@ const (
 	daemonSetImageName               = "quay.io/amalvank/instaslicev2-daemonset:latest"
 	daemonSetName                    = "daemonset"
 	serviceAccountName               = "instaslice-operator-controller-manager"
+	profile3g20gb                    = "3g.20gb"
+	profile1g10gb                    = "1g.10gb"
 
-	Requeue1sDelay  = 1 * time.Second
-	Requeue2sDelay  = 2 * time.Second
-	requeue10sDelay = 10 * time.Second
+	Requeue1sDelay     = 1 * time.Second
+	Requeue2sDelay     = 2 * time.Second
+	requeue10sDelay    = 10 * time.Second
+	maxSlices7g40gb    = 7
+	EndPosSlices3g20gb = 3
+	EndPosSlices1g10gb = 1
+	EndStartPos3g20gb  = 4
+	EndStartPos1g10gb  = 6
 )
diff --git a/internal/controller/instaslice_controller.go b/internal/controller/instaslice_controller.go
@@ -203,7 +203,7 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 						return ctrl.Result{}, nil
 					}
 					if allocation.AllocationStatus.AllocationStatusDaemonset == inferencev1alpha1.AllocationStatusDeleted {
-						err := r.removeInstasliceAllocation(ctx, instaslice.Name, &allocation)
+						err := r.removeInstasliceAllocation(ctx, &allocation, instaslice, uuid)
 						if err != nil {
 							return ctrl.Result{}, err
 						}
@@ -244,7 +244,7 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 					}
 
 					if allocation.AllocationStatus.AllocationStatusDaemonset == inferencev1alpha1.AllocationStatusDeleted {
-						err := r.removeInstasliceAllocation(ctx, instaslice.Name, &allocation)
+						err := r.removeInstasliceAllocation(ctx, &allocation, instaslice, uuid)
 						if err != nil {
 							return ctrl.Result{}, err
 						}
@@ -264,6 +264,8 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 			}
 			log.Info("finalizer deleted for succeeded ", "pod", pod.Name)
 		}
+		// If no allocations exist, update metrics with all slots free
+		r.updateMetricsAllSlotsFree(ctx, instasliceList)
 		return ctrl.Result{}, nil
 	}
 
@@ -283,7 +285,7 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 					return ctrl.Result{}, nil
 				}
 				if podUuid == pod.UID && allocation.AllocationStatus.AllocationStatusDaemonset == inferencev1alpha1.AllocationStatusDeleted {
-					err := r.removeInstasliceAllocation(ctx, instaslice.Name, &allocation)
+					err := r.removeInstasliceAllocation(ctx, &allocation, instaslice, podUuid)
 					if err != nil {
 						return ctrl.Result{}, err
 					}
@@ -308,8 +310,8 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 			for _, instaslice := range instasliceList.Items {
 				for podUuid, allocation := range instaslice.Status.PodAllocationResults {
 					if podUuid == pod.UID {
+						allocRequest := instaslice.Spec.PodAllocationRequests[podUuid]
 						if allocation.AllocationStatus.AllocationStatusDaemonset == inferencev1alpha1.AllocationStatusDeleted {
-							allocRequest := instaslice.Spec.PodAllocationRequests[podUuid]
 							err := utils.UpdateOrDeleteInstasliceAllocations(ctx, r.Client, instaslice.Name, &allocation, &allocRequest)
 							if err != nil {
 								return ctrl.Result{}, err
@@ -397,22 +399,48 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 				// Sort by Name in ascending order
 				return instasliceList.Items[i].Name < instasliceList.Items[j].Name
 			})
+
+			var successfulAllocRequest *inferencev1alpha1.AllocationRequest
+			var successfulAllocResult *inferencev1alpha1.AllocationResult
+			var instasliceListItemSuccess inferencev1alpha1.Instaslice
 			for _, instaslice := range instasliceList.Items {
 				// find the GPU on the node and the GPU index where the slice can be created
 				allocRequest, allocResult, err := r.findNodeAndDeviceForASlice(ctx, &instaslice, profileName, policy, pod)
 				if err != nil {
 					continue
 				}
 				podHasNodeAllocation = true
-				if podHasNodeAllocation {
-					err := utils.UpdateOrDeleteInstasliceAllocations(ctx, r.Client, instaslice.Name, allocResult, allocRequest)
-					if err != nil {
-						return ctrl.Result{Requeue: true}, nil
+				successfulAllocRequest = allocRequest
+				successfulAllocResult = allocResult
+				instasliceListItemSuccess = instaslice
+				// Break immediately after finding a suitable allocation
+				break
+
+			}
+			if podHasNodeAllocation {
+				err := utils.UpdateOrDeleteInstasliceAllocations(ctx, r.Client, instasliceListItemSuccess.Name, successfulAllocResult, successfulAllocRequest)
+				if err != nil {
+					return ctrl.Result{Requeue: true}, err
+				}
+				// allocation was successful
+				// Update total processed GPU slices metrics
+				// Check if metrics need processing based on ObservedGeneration
+				if instasliceListItemSuccess.Status.ObservedGeneration < instasliceListItemSuccess.Generation {
+					if err := r.IncrementTotalProcessedGpuSliceMetrics(string(successfulAllocResult.Nodename), successfulAllocResult.GPUUUID, successfulAllocResult.MigPlacement.Size, successfulAllocResult.MigPlacement.Start, successfulAllocRequest.Profile); err != nil {
+						log.Error(err, "Failed to update total processed GPU slices metric", "nodeName", successfulAllocResult.Nodename, "gpuID", successfulAllocResult.GPUUUID)
+						return ctrl.Result{Requeue: true}, err
+					}
+					// Mark as processed by updating ObservedGeneration
+					instasliceListItemSuccess.Status.ObservedGeneration = instasliceListItemSuccess.Generation
+					if err := r.Status().Update(ctx, &instasliceListItemSuccess); err != nil {
+						log.Error(err, "Failed to update Instaslice status after processing metrics", "allocation", successfulAllocRequest)
+						return ctrl.Result{Requeue: true}, err
 					}
-					// allocation was successful
-					return ctrl.Result{}, nil
 				}
+
+				return ctrl.Result{}, nil
 			}
+
 		}
 
 		// if the cluster does not have suitable node, requeue request
@@ -425,10 +453,24 @@ func (r *InstasliceReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 
 	}
 
+	// Refresh the GPU slice metrics via UpdateGpuSliceMetrics and UpdateCompatibleProfilesMetrics
+	r.updateMetrics(ctx, instasliceList)
+
+	// Update current pending GPU slice requests metrics
+	pendingCount, err := r.getPendingGpuRequests(ctx, r.Client)
+	if err != nil {
+		log.Error(err, "Failed to count pending GPU slice requests")
+		return ctrl.Result{}, err
+	}
+	if err := r.UpdatePendingSliceRequests(pendingCount); err != nil {
+		log.Error(err, "Failed to update pending GPU slice requests metric")
+		return ctrl.Result{}, err
+	}
+
 	return ctrl.Result{}, nil
 }
 
-// create the DaemonSet object
+// createInstaSliceDaemonSet creates the DaemonSet object.
 func (r *InstasliceReconciler) createInstaSliceDaemonSet(namespace string) *appsv1.DaemonSet {
 	emulatorMode := r.Config.EmulatorModeEnable
 	instasliceDaemonsetImage := r.Config.DaemonsetImage
@@ -596,10 +638,12 @@ func (r *InstasliceReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		return err
 	}
 
-	return ctrl.NewControllerManagedBy(mgr).
+	controllerManager := ctrl.NewControllerManagedBy(mgr).
 		For(&v1.Pod{}).Named("InstaSlice-controller").
 		Watches(&inferencev1alpha1.Instaslice{}, handler.EnqueueRequestsFromMapFunc(r.podMapFunc)).
 		Complete(r)
+
+	return controllerManager
 }
 
 func (r *InstasliceReconciler) unGatePod(podUpdate *v1.Pod) *v1.Pod {
@@ -678,15 +722,28 @@ func (l *RightToLeftPolicy) SetAllocationDetails(profileName string, newStart, s
 	return &inferencev1alpha1.AllocationRequest{}
 }
 
-func (r *InstasliceReconciler) removeInstasliceAllocation(ctx context.Context, instasliceName string, allocation *inferencev1alpha1.AllocationResult) error {
+// removeInstasliceAllocation finishes the removal of a pod's allocation and
+// sets the deployed-pod metric for that pod to 0.
+//
+// When the daemonset status of allocation is AllocationStatusDeleted it calls
+// utils.UpdateOrDeleteInstasliceAllocations for instaslice.Name with a nil
+// result and request (presumably dropping the allocation; confirm against
+// the helper). The pod's allocation request is then read from the caller's
+// snapshot in instaslice, keyed by podUid, to obtain the metric labels.
+//
+// It returns an error when the update/delete call fails or when podUid has no
+// entry in instaslice.Spec.PodAllocationRequests. A failure to update the
+// metric is only logged and does not fail the removal.
+func (r *InstasliceReconciler) removeInstasliceAllocation(ctx context.Context, allocation *inferencev1alpha1.AllocationResult, instaslice inferencev1alpha1.Instaslice, podUid types.UID) error {
+	log := logr.FromContext(ctx)
 	if allocation.AllocationStatus.AllocationStatusDaemonset == inferencev1alpha1.AllocationStatusDeleted {
-		err := utils.UpdateOrDeleteInstasliceAllocations(ctx, r.Client, instasliceName, nil, nil)
+		err := utils.UpdateOrDeleteInstasliceAllocations(ctx, r.Client, instaslice.Name, nil, nil)
 		if err != nil {
 			return err
 		}
 	}
+	allocRequest, exists := instaslice.Spec.PodAllocationRequests[podUid]
+	if !exists {
+		// Without the request there is no pod reference or profile to label the
+		// metric with. The error is returned without also being logged here, so
+		// the failure is reported once by whoever handles the returned error.
+		return fmt.Errorf("podUid %s not found in Instaslice PodAllocationRequests", podUid)
+	}
+
+	// update DeployedPodTotal Metrics by setting value to 0 as pod allocation is deleted
+	if err := r.UpdateDeployedPodTotalMetrics(string(allocation.Nodename), allocation.GPUUUID, allocRequest.PodRef.Namespace, allocRequest.PodRef.Name, allocRequest.Profile, 0); err != nil {
+		log.Error(err, "Failed to update deployed pod metrics", "nodeName", allocation.Nodename)
+	}
 	return nil
 }
+
 func (r *InstasliceReconciler) setInstasliceAllocationToDeleting(ctx context.Context, instasliceName string, allocResult *inferencev1alpha1.AllocationResult, allocRequest *inferencev1alpha1.AllocationRequest) (ctrl.Result, error) {
 	log := logr.FromContext(ctx)
 	allocResult.AllocationStatus.AllocationStatusController = inferencev1alpha1.AllocationStatusDeleting