Skip to content

Commit

Permalink
add custom instaslice metrics
Browse files Browse the repository at this point in the history
Signed-off-by: MohammedAbdi <[email protected]>

update metrics

Signed-off-by: MohammedAbdi <[email protected]>

nit

Signed-off-by: MohammedAbdi <[email protected]>

update metrics

Signed-off-by: MohammedAbdi <[email protected]>

update

Signed-off-by: MohammedAbdi <[email protected]>

update deployed pod total and total processed slices metrics

Signed-off-by: MohammedAbdi <[email protected]>

updateMetricsAllSlotsFree

Signed-off-by: MohammedAbdi <[email protected]>

nits

Signed-off-by: MohammedAbdi <[email protected]>

update prometheus

Signed-off-by: MohammedAbdi <[email protected]>

update deployed pod total metrics call

Signed-off-by: MohammedAbdi <[email protected]>

remove fake capacity file

Signed-off-by: MohammedAbdi <[email protected]>

update profile map extraction automation

Signed-off-by: MohammedAbdi <[email protected]>

update

Signed-off-by: MohammedAbdi <[email protected]>

Track total fit across all GPUs correctly

Signed-off-by: MohammedAbdi <[email protected]>

add unit tests

Signed-off-by: MohammedAbdi <[email protected]>

update metrics url

Signed-off-by: MohammedAbdi <[email protected]>

nit

Signed-off-by: MohammedAbdi <[email protected]>

nit

Signed-off-by: MohammedAbdi <[email protected]>

adjust unit tests

Signed-off-by: MohammedAbdi <[email protected]>

nit

Signed-off-by: MohammedAbdi <[email protected]>

update

Signed-off-by: MohammedAbdi <[email protected]>

update manifests

Signed-off-by: MohammedAbdi <[email protected]>

update test file

Signed-off-by: MohammedAbdi <[email protected]>

update compatible profiles

Signed-off-by: MohammedAbdi <[email protected]>

nit

Signed-off-by: MohammedAbdi <[email protected]>
  • Loading branch information
mamy-CS committed Feb 20, 2025
1 parent 48360d3 commit 20c385e
Show file tree
Hide file tree
Showing 14 changed files with 879 additions and 13 deletions.
6 changes: 6 additions & 0 deletions api/v1alpha1/instaslice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,12 @@ type InstasliceStatus struct {
// nodeResources specifies the discovered resources of the node
// +optional
NodeResources DiscoveredNodeResources `json:"nodeResources"`
// ObservedGeneration tracks the latest generation of the resource that has been observed and acted upon by the controller
// +optional
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
// IsMetricProcessed specifies the metrics is updated
// +optional
IsMetricProcessed bool `json:"isMetricProcessed,omitempty"`
}

//+kubebuilder:object:root=true
Expand Down
21 changes: 21 additions & 0 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,31 @@ func init() {
//+kubebuilder:scaffold:scheme
}

// getEnv returns the value of the environment variable named by key.
// If the variable is not present in the environment, defval is returned.
// A variable that is present but set to the empty string yields "" rather
// than defval, because presence is what is checked, not non-emptiness.
func getEnv(key, defval string) string {
	value, found := os.LookupEnv(key)
	if !found {
		return defval
	}
	return value
}

func main() {
// Log info before initializing metrics exporter
ctrl.Log.Info("[SetupWithManager] Initializing Metrics Exporter.")
controller.RegisterMetrics()
// Log info after the metrics exporter is initialized
ctrl.Log.Info("[SetupWithManager] Metrics Exporter Initialized.")
var instaslicePrometheusMetricsUrl string = "http://0.0.0.0:8443"

// NOTE: these can be set as env or flag, flag takes precedence over env
instaslicePrometheusMetricsUrlEnv := getEnv("INSTASLICE-PROMETHEUS-METRICS-URL", instaslicePrometheusMetricsUrl)

var metricsAddr string
var enableLeaderElection bool
var probeAddr string
var secureMetrics bool
var enableHTTP2 bool
flag.StringVar(&instaslicePrometheusMetricsUrl, "instaslice-prometheus-metrics-url", instaslicePrometheusMetricsUrlEnv,
"The URL for the Prometheus metrics where Instaslice exposes metrics")
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand All @@ -82,6 +101,8 @@ func main() {

ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

//setupLog.Info(("instaslicePrometheusMetricsUrl=" + instaslicePrometheusMetricsUrl))

// if the enable-http2 flag is false (the default), http/2 should be disabled
// due to its vulnerabilities. More specifically, disabling http/2 will
// prevent from being vulnerable to the HTTP/2 Stream Cancellation and
Expand Down
8 changes: 8 additions & 0 deletions config/crd/bases/inference.redhat.com_instaslices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,9 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
isMetricProcessed:
description: IsMetricProcessed specifies the metrics is updated
type: boolean
nodeResources:
description: nodeResources specifies the discovered resources of the
node
Expand Down Expand Up @@ -324,6 +327,11 @@ spec:
- nodeGpus
- nodeResources
type: object
observedGeneration:
description: ObservedGeneration tracks the latest generation of the
resource that has been observed and acted upon by the controller
format: int64
type: integer
podAllocationResults:
additionalProperties:
properties:
Expand Down
3 changes: 3 additions & 0 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ spec:
capabilities:
drop:
- "ALL"
runAsUser: 1000
livenessProbe:
httpGet:
path: /healthz
Expand All @@ -104,5 +105,7 @@ spec:
value: <IMG_DMST>
- name: EMULATOR_MODE
value: "false"
- name: INSTASLICE-PROMETHEUS-METRICS-URL
value: "http://0.0.0.0:8443"
serviceAccountName: controller-manager
terminationGracePeriodSeconds: 10
15 changes: 15 additions & 0 deletions deploy/instaslice-metrics-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
name: instaslice-metrics
namespace: instaslice-system
labels:
control-plane: controller-manager
spec:
ports:
- name: metrics
port: 8443
protocol: TCP
targetPort: 8443
selector:
control-plane: controller-manager # Use the correct label here
23 changes: 23 additions & 0 deletions deploy/instaslice-servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: instaslice-monitor
namespace: instaslice-monitoring
labels:
release: prometheus # Label to match Prometheus serviceMonitorSelector
spec:
selector:
matchLabels:
control-plane: controller-manager # Match labels of the Service exposing metrics
namespaceSelector:
matchNames:
- instaslice-system # Namespace where the Service resides
endpoints:
- port: metrics # Port name exposed in the Service for kube-rbac-proxy
interval: 15s
path: /metrics
scheme: https
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token # Prometheus authentication
honorLabels: true
tlsConfig:
insecureSkipVerify: true # Set to false if using a valid CA
16 changes: 16 additions & 0 deletions deploy/prometheus-role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-metrics-reader
namespace: instaslice-system
rules:
- apiGroups: [""]
resources: ["services", "endpoints", "pods"]
verbs: ["get", "list", "watch"]
- apiGroups: ["metrics.k8s.io"]
resources: ["pods", "nodes"]
verbs: ["get", "list"]
- apiGroups: ["authorization.k8s.io"]
resources: ["subjectaccessreviews"]
verbs: ["create"]

13 changes: 13 additions & 0 deletions deploy/prometheus-rolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-metrics-binding
namespace: instaslice-system
subjects:
- kind: ServiceAccount
name: prometheus-kube-prometheus-prometheus # Change this to your Prometheus ServiceAccount
namespace: instaslice-monitoring # Change to Prometheus namespace
roleRef:
kind: Role
name: prometheus-metrics-reader
apiGroup: rbac.authorization.k8s.io
5 changes: 5 additions & 0 deletions deploy/prometheus-serviceaccount.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: instaslice-monitoring # namespace where Prometheus is running
36 changes: 36 additions & 0 deletions deploy/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
alertmanager:
enabled: false
kube-state-metrics:
enabled: false
prometheus-node-exporter:
enabled: false
prometheus-pushgateway:
enabled: false
server:
name: instaslice
service:
enabled: true
type: NodePort
servicePort: 9090
persistentVolume:
existingClaim: prometheus-instaslice
enabled: false
securityContext:
runAsUser:
runAsNonRoot:
runAsGroup:
fsGroup:
extraScrapeConfigs: |
- job_name: instaslice-metrics
honor_labels: true
metrics_path: /metrics
scheme: https
scrape_interval: 15s
static_configs:
- targets:
- instaslice-metrics.instaslice-system.svc.cluster.local:8443
tls_config:
insecure_skip_verify: true
serviceMonitorSelector:
matchLabels:
release: prometheus
Loading

0 comments on commit 20c385e

Please sign in to comment.