[Bug] GPU optimizer bug fix and document fix #656

Merged: 82 commits merged on Feb 14, 2025

Commits
5cbd968
Bug fix
Dec 10, 2024
11be286
Merge commit '0d40fbd19ba01daf1aa6267515814c18f19aaa09' into jingyuan…
Dec 11, 2024
7adb1b7
Fix configuration for domain podautoscaler
Dec 11, 2024
c159726
Lint fix
Dec 11, 2024
f9f1d99
Add license for new files.
Dec 11, 2024
6f717c5
Lint fix on added unit test.
Dec 11, 2024
5c5225b
Add authorization support
Dec 13, 2024
6a7584d
Support parameterized benchmark
Dec 15, 2024
bd46cc3
Remove next_in parameter
Dec 15, 2024
401be9f
Bug fix
Dec 15, 2024
f39e4b4
Fix typo
Dec 15, 2024
22e7db5
Bug fix
Dec 15, 2024
4296ce1
Apply stream parameter
Dec 16, 2024
2c40b7c
Cleaning up responses.
Dec 16, 2024
17a0798
Bug fix
Dec 16, 2024
4df3b76
If an error is not reported as a temporary error, we will not retry.
Dec 16, 2024
ee494b7
GPU profile now supports TPAT (time per all tokens)
Dec 18, 2024
36cbf87
Debug optimizer
Dec 20, 2024
d59f12a
bird prompt dataset generation
nwangfw Dec 20, 2024
deee544
update benchmark to support prompt dataset loading
nwangfw Dec 20, 2024
7da1be8
Benchmark now supports workload parameter
Dec 20, 2024
32e2ba9
Bug fix
Dec 21, 2024
3d3e929
Log control
Dec 21, 2024
cf47cab
Improve stability and lint fix.
Dec 21, 2024
31b4b0e
Bug fix
Dec 21, 2024
28f2521
switch logs for gpu-optimizer to json format
Dec 23, 2024
2b672cf
added BIRD dataset with Azure timestamp script
nwangfw Dec 23, 2024
e2f58d8
add BIRD burst pattern workload generation
nwangfw Dec 27, 2024
7c2d455
Visualizer now supports workload file
Dec 30, 2024
ab20a58
Print out workload input
Dec 31, 2024
ccd5a40
Bug fix
Dec 31, 2024
b5b14dc
lint fix
Jan 1, 2025
132260d
remove timestamp offset
Jan 1, 2025
3018bb4
Bug fix: call _parse_profiles without parameter out_records will not …
Jan 1, 2025
28b69eb
Using current ts to load profile may be too early, revert to use an interva…
Jan 1, 2025
0b28588
Use the larger of average request rate in window and current request …
Jan 2, 2025
6c26386
Tuning up request rate temporarily.
Jan 2, 2025
ad7fb37
Bug fix
Jan 2, 2025
b014b23
Remove fixed rate
Jan 2, 2025
2908082
changing load profile back
nwangfw Jan 7, 2025
56feb2f
Merge branch 'main' into jingyuan/gpu_optimizer
Jan 8, 2025
beb6de4
Provide compatibility to v3 gateway profiles.
Jan 8, 2025
762a506
Adjust development config
Jan 8, 2025
ba20697
Add config for gateway-plugin development
Jan 8, 2025
a9007b0
delayed scale in deployment added
nwangfw Jan 9, 2025
fa4fbcf
Add trace to benchmark
Jan 9, 2025
a141cd6
rollback to old version without delayed scale in
nwangfw Jan 9, 2025
9d514e4
Merge branches 'jingyuan/gpu_optimizer' and 'jingyuan/gpu_optimizer' …
Jan 9, 2025
152915f
Disregard pending requests for now.
Jan 9, 2025
f13e0e0
Bug fix
Jan 9, 2025
444ca30
Bug fix
Jan 9, 2025
3df5e82
Adapt to latest profile about pending requests and update unittest.
Jan 11, 2025
81cfb63
Output correct timestamp
Jan 13, 2025
cc2ac94
Output pending and total requests from load reader
Jan 14, 2025
4f2b2ee
Ignore pending for now.
Jan 14, 2025
169047c
Add throughput filter.
Jan 15, 2025
cd08af9
bug and lint fix
Jan 15, 2025
5dac822
Fix a bug that when mat_tputs are 0
Jan 15, 2025
1d3f44d
Lint fix
Jan 15, 2025
6edb872
fix benchmark on count num_requests
Jan 16, 2025
8e9d9b0
Optimizer now can adopt deployment changes using "kubectl apply"
Jan 22, 2025
9fda28a
Add comments
Jan 22, 2025
921a9fe
bug fix
Jan 22, 2025
ff22ab9
Make signature prefer higher index when choosing profiles.
Jan 22, 2025
186fcdc
Bug fix, watch ScalingReplicaSet for label changes
Jan 23, 2025
ae27970
Bug fix
Jan 23, 2025
ca0f020
Change back SLO preference.
Jan 23, 2025
e697adc
Merge branch 'main' into jingyuan/gpu_optimizer
Jan 24, 2025
e50a343
Merge branch 'main' into jingyuan/gpu_optimizer
zhangjyr Jan 27, 2025
1911a73
Merge branch 'main' into jingyuan/gpu_optimizer
Jan 28, 2025
6ab316d
Merge branch 'main' into jingyuan/gpu_optimizer
Jan 31, 2025
737dd12
Refine gpu optimizer document and apply more generic default parameters.
Jan 31, 2025
895eccc
Update document to use production vllm configuration example
Jan 31, 2025
ca48d52
Add samples/heterogenous
Jan 31, 2025
a63d28a
Clean up
Jan 31, 2025
524f00d
Modify load reader to support latest workload
Feb 11, 2025
3a8b31c
Merge commit '1d3473a418a044c788c70fcfcf9d79f732893381' into jingyuan…
Feb 12, 2025
d2f45dc
Fix doc and example
Feb 12, 2025
f0088e1
Use 100 instead 1 as scale fraction.
Feb 12, 2025
a2f9b80
remove unnecessary samples
Feb 12, 2025
6d04c8f
Lint fix
Feb 12, 2025
8bea71d
Merge branch 'main' into jingyuan/gpu_optimizer
zhangjyr Feb 13, 2025
@@ -16,4 +16,4 @@ spec:
endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
path: /metrics/default/simulator-llama2-7b-a40
targetMetric: "vllm:deployment_replicas"
targetValue: "1"
targetValue: "100" # For stable workloads. Set to a fraction to tolerate bursts.
Collaborator:

It used to be a static value. Now you mean it can be the same as other autoscalers? I didn't get the KV cache example idea. Could you elaborate a little bit more?

Collaborator (Author):

In the KV cache example, the targetValue is set to 50, meaning that once the average KV cache utilization surpasses 50%, scaling out is triggered.
The targetValue for the GPU optimizer's output metric now behaves similarly. What used to be reported as 1 GPU is now output as a value between 1 and 100. If the targetValue is 70 and the GPU optimizer outputs 80, the PodAutoscaler will scale to 2 pods instead of the previous 1 pod. This gives users the freedom to reserve some buffer pods for short bursts.

Collaborator:

Hmm, there are two questions:

  1. In that case, shouldn't targetValue be a more meaningful number rather than "100"? 100 means it has to be completely full before scaling out, right?

  2. targetValue is paired with targetMetric, and the key is currently vllm:deployment_replicas; if the value ranges from 1-100, this is kind of confusing.

Collaborator (Author):

I think there is some misunderstanding. What was previously reported as "1" is now output as 1-100, while a value of "2" is now output as 101-200. targetValue: "100" still makes sense for a stable workload, but for a real production workload, "50" or "80" may be more reasonable.
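A minimal sketch of the scaling arithmetic described above (assuming the PodAutoscaler computes desired replicas roughly as `ceil(metric / targetValue)`; the helper name and values are hypothetical):

```python
import math

def desired_replicas(optimizer_metric: float, target_value: float) -> int:
    # The GPU optimizer reports ~100 units per recommended replica
    # (1 replica -> 1-100, 2 replicas -> 101-200), so a targetValue
    # below 100 reserves headroom for short bursts.
    return math.ceil(optimizer_metric / target_value)

print(desired_replicas(80, 100))  # 1 pod: no buffer reserved
print(desired_replicas(80, 70))   # 2 pods: the example from the comment above
```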

Collaborator (Author):

The metric is like CPU, not KV cache utilization.

Collaborator:

We had an offline discussion; we can merge this one first. Our short-term goal is to have clear documentation since this will be public soon. You can also consider using other annotations, etc., to make it clearer.

@@ -16,4 +16,4 @@ spec:
endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
path: /metrics/default/simulator-llama2-7b-a100
targetMetric: "vllm:deployment_replicas"
targetValue: "1"
targetValue: "100" # For stable workloads. Set to a fraction to tolerate bursts.
18 changes: 9 additions & 9 deletions docs/source/features/heterogeneous-gpu.rst
@@ -24,7 +24,7 @@ Step 1: Deploy the heterogeneous deployments.

One deployment and corresponding PodAutoscaler should be deployed for each GPU type.
See `sample heterogeneous configuration <https://github.com/aibrix/aibrix/tree/main/samples/heterogeneous>`_ for an example of heterogeneous configuration composed of two GPU types. The following codes
deploy heterogeneous deployments using L20 and A10 GPU.
deploy heterogeneous deployments using L20 and V100 GPU.

.. code-block:: bash

@@ -45,9 +45,10 @@ Incoming requests are routed through the gateway and directed to the optimal pod

kubectl get pods
NAME READY STATUS RESTARTS AGE
deepseek-coder-7b-a10-96667667c-6gjql 2/2 Running 0 33s
deepseek-coder-7b-v100-96667667c-6gjql 2/2 Running 0 33s
deepseek-coder-7b-l20-96667667c-7zj7k 2/2 Running 0 33s

Step 2: Install aibrix python module:

Step 2: Install aibrix python module:

@@ -74,32 +75,31 @@ Step 4: Decide SLO and generate profile, run `aibrix_gen_profile -h` for help.

kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/null 2>&1 &
# Wait for port-forward taking effect.
aibrix_gen_profile deepseek-coder-7b-a10 --cost [cost1] [SLO-metric] [SLO-value] -o "redis://localhost:6379/?model=deepseek-coder-7b"
aibrix_gen_profile deepseek-coder-7b-v100 --cost [cost1] [SLO-metric] [SLO-value] -o "redis://localhost:6379/?model=deepseek-coder-7b"
aibrix_gen_profile deepseek-coder-7b-l20 --cost [cost2] [SLO-metric] [SLO-value] -o "redis://localhost:6379/?model=deepseek-coder-7b"

Now the GPU Optimizer is ready to work. You should observe that the number of workload pods changes in response to the requests sent to the gateway. Once the GPU optimizer finishes the scaling optimization, the output of the GPU optimizer is passed to PodAutoscaler as a metricSource via a designated HTTP endpoint for the final scaling decision. The following is an example of PodAutoscaler spec.

A simple example of PodAutoscaler spec for a10 GPU is as follows:
A simple example of PodAutoscaler spec for v100 GPU is as follows:

.. literalinclude:: ../../../samples/heterogeneous/deepseek-coder-7b-l20-podautoscaler.yaml
.. literalinclude:: ../../../samples/heterogeneous/deepseek-coder-7b-v100-podautoscaler.yaml
:language: yaml


Miscellaneous
-------------

A new label label ``model.aibrix.ai/min_replicas`` is added to specifies the minimum number of replicas to maintain when there is no workload. We recommend setting this to 1 for at least one Deployment spec to ensure there is always one READY pod available. For example, while the GPU optimizer might recommend 0 replicas for an a10 GPU during periods of no activity, setting ``model.aibrix.ai/min_replicas: "1"`` will maintain one a10 replica. This label only affects the system when there is no workload - it is ignored when there are active requests.
A new label ``model.aibrix.ai/min_replicas`` is added to specify the minimum number of replicas to maintain when there is no workload. We recommend setting this to 1 for at least one Deployment spec to ensure there is always one READY pod available. For example, while the GPU optimizer might recommend 0 replicas for a v100 GPU during periods of no activity, setting ``model.aibrix.ai/min_replicas: "1"`` will maintain one v100 replica. This label only affects the system when there is no workload - it is ignored when there are active requests.

.. code-block:: yaml

apiVersion: apps/v1
kind: Deployment
metadata:
name: deepseek-coder-7b-a10
name: deepseek-coder-7b-v100
labels:
model.aibrix.ai/name: "deepseek-coder-7b"
model.aibrix.ai/min_replicas: "1" # min replica for gpu optimizer when no workloads.
... rest yaml deployments

Important: The ``minReplicas`` field in the PodAutoscaler spec must be set to 0 to allow proper scaling behavior. Setting it to any value greater than 0 will interfere with the GPU optimizer's scaling decisions. For instance, if the GPU optimizer determines an optimal configuration of ``{a10: 0, l20: 4}`` but the a10 PodAutoscaler has ``minReplicas: 1``, the system won't be able to scale the a10 down to 0 as recommended.
Important: The ``minReplicas`` field in the PodAutoscaler spec must be set to 0 to allow proper scaling behavior. Setting it to any value greater than 0 will interfere with the GPU optimizer's scaling decisions. For instance, if the GPU optimizer determines an optimal configuration of ``{v100: 0, l20: 4}`` but the v100 PodAutoscaler has ``minReplicas: 1``, the system won't be able to scale the v100 down to 0 as recommended.

12 changes: 11 additions & 1 deletion python/aibrix/aibrix/gpu_optimizer/load_monitor/load_reader.py
@@ -14,6 +14,7 @@

import json
import logging
import math
import re
from datetime import datetime
from typing import Any, List, Optional, Protocol, Tuple, Union
@@ -147,7 +148,12 @@ class WorkloadReader:

def __init__(self, filepath, scale: float = 1.0, interval: int = 10) -> None:
if filepath != unittest_filepath:
self.df = pd.read_json(filepath)
try:
self.df = pd.read_json(filepath)
except Exception:
self.df = pd.read_json(filepath, lines=True)
self.df["Timestamp"] = self.df["timestamp"]
self.df["Requests"] = self.df["requests"]

self.scale = scale
self.interval = interval
@@ -180,6 +186,10 @@ def read(self, ts: float = 0.0) -> Tuple[List[LoadRecord], float]:
self.log2_aggregate(self.tick_df["Prompt Length"] * self.scale, 1),
self.log2_aggregate(self.tick_df["Output Length"] * self.scale, 1),
):
# Unlikely, just in case.
if math.isinf(output_tokens) or math.isinf(input_tokens):
continue

records.append(
LoadRecord(
(self.tick - self.start),
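For readers following the diff, a minimal standalone sketch of the fallback parsing added above (assuming a JSON-lines workload file whose records use lowercase `timestamp`/`requests` keys, as the column rename suggests):

```python
import pandas as pd

def load_workload(filepath: str) -> pd.DataFrame:
    # Try the original JSON-array format first, then fall back to
    # JSON-lines and normalize the lowercase column names.
    try:
        df = pd.read_json(filepath)
    except Exception:
        df = pd.read_json(filepath, lines=True)
        df["Timestamp"] = df["timestamp"]
        df["Requests"] = df["requests"]
    return df
```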
6 changes: 5 additions & 1 deletion python/aibrix/aibrix/gpu_optimizer/load_monitor/monitor.py
@@ -106,6 +106,7 @@ def __init__(
deployment: Optional[DeploymentStates] = None,
namespace: Optional[str] = None,
profile_reader: Optional[ProfileReader] = None,
gpu_fraction: float = 100.0,
debug: bool = False,
):
"""Initialize the model monitor.
@@ -119,6 +120,7 @@ def __init__(
replicas: (optional) The initial number of replicas for the model deployment.
interval: (optional) The interval (in seconds) at which to monitor the model. Defaults to 10 seconds.
window: (optional) The window (in seconds) to consider for clustering. Defaults to 240 seconds.
gpu_fraction: (optional) The number of fractional units one GPU is counted as. Defaults to 100.
debug: (optional) Whether to enable debugging behavior. Defaults to False.
"""
self.model_name = model_name
@@ -129,6 +131,7 @@ def __init__(
self.debug = debug
self.done = False
self.window = float(window)
self.gpu_fraction = gpu_fraction
self._lock = threading.Lock()

# Load reader
@@ -139,7 +142,7 @@ def __init__(

# Optimizer
self._profiles: Dict[str, GPUProfile] = {}
self._optimizer = Optimizer()
self._optimizer = Optimizer(self.gpu_fraction)

# Monitor states
self._centers: Iterable[Centeroid] = Empty_Array
@@ -276,6 +279,7 @@ def load_profiles(self, profile_reader: Optional[ProfileReader] = None) -> bool:

profiles = profile_reader.read()
for profile in profiles:
profile.cost /= self.gpu_fraction
if self._update_profile(profile):
logger.debug(f"Profile of {profile.gpu} updated.")

10 changes: 7 additions & 3 deletions python/aibrix/aibrix/gpu_optimizer/optimizer/optimizer.py
@@ -27,11 +27,14 @@


class Optimizer:
def __init__(self, profiles: Optional[Iterable[GPUProfile]] = None):
def __init__(
self, gpu_fraction: float, profiles: Optional[Iterable[GPUProfile]] = None
):
self._config = MelangConfig()
self._workload_distribution_template: Optional[np.ndarray] = None
self._indexes: Optional[list] = None # Values ticks of tputs columns and rows
self._log_indexes: Optional[list] = None # Cache the log2 value of index
self._gpu_fraction = gpu_fraction
if profiles is not None:
for profile in profiles:
self.set_profile(profile)
@@ -73,7 +76,7 @@ def set_workload_distribution(
self._workload_distribution_template.fill(0)

# Maintain the overall request scale disregard some request are not covered.
self._config.total_request_rate = total_request_rate
self._config.total_request_rate = total_request_rate * self._gpu_fraction
# covered_request_rate is used to calculate the workload distribution.
covered_request_rate = reduce(
lambda cnt, center: cnt + center.rate, profiles, 0.0
@@ -82,7 +85,8 @@
for profile in profiles:
try:
signature = self._validate_workload_signature(profile)
self._workload_distribution_template[signature] = (
# Merge possibly multiple patterns (out-of-range patterns coincident with border patterns)
self._workload_distribution_template[signature] += (
profile.rate / covered_request_rate
) # type: ignore
logger.debug(
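A small illustration of why the `+=` above matters (signatures and rates are hypothetical): two profiles whose out-of-range patterns clamp to the same border signature must have their shares added rather than overwritten:

```python
import numpy as np

template = np.zeros((2, 2))      # hypothetical signature grid
covered_request_rate = 10.0

# Two profiles that end up with the same (clamped) signature.
profiles = [((1, 1), 6.0), ((1, 1), 4.0)]  # (signature, rate)

for signature, rate in profiles:
    template[signature] += rate / covered_request_rate

print(template[1, 1])  # 1.0; with '=', the second share (0.4) would replace the first
```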
@@ -3,7 +3,6 @@ kind: Deployment
metadata:
labels:
adapter.model.aibrix.ai/enabled: "true"
model.aibrix.ai/min_replicas: "4"
model.aibrix.ai/name: deepseek-coder-7b
model.aibrix.ai/port: "8000"
model.aibrix.ai/min_replicas: "1" # min replicas when there is no workload.
@@ -15,8 +15,8 @@ spec:
path: /metrics/default/deepseek-coder-7b-l20
protocolType: http
targetMetric: vllm:deployment_replicas
targetValue: "1"
minReplicas: 1
targetValue: "100" # For stable workloads. Set to a fraction to tolerate bursts.
minReplicas: 0
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
@@ -15,7 +15,7 @@ spec:
path: /metrics/default/deepseek-coder-7b-v100
protocolType: http
targetMetric: vllm:deployment_replicas
targetValue: "1"
targetValue: "100" # For stable workloads. Set to a fraction to tolerate bursts.
minReplicas: 0
scaleTargetRef:
apiVersion: apps/v1
32 changes: 32 additions & 0 deletions samples/heterogeneous/kustomization.yaml
@@ -0,0 +1,32 @@
kind: Kustomization

resources:
- deepseek-coder-7b-service.yaml
- deepseek-coder-7b-l20-deployment.yaml
- deepseek-coder-7b-l20-podautoscaler.yaml
- deepseek-coder-7b-v100-deployment.yaml
- deepseek-coder-7b-v100-podautoscaler.yaml

patches:
- patch: |- # Inline patch: keep at least one v100 replica when there is no workload
apiVersion: apps/v1
kind: Deployment
metadata:
name: deepseek-coder-7b-v100
labels:
model.aibrix.ai/min_replicas: "1"
target:
kind: Deployment
name: deepseek-coder-7b-v100
- patch: |- # Inline patch: allow l20 to scale to zero when there is no workload
apiVersion: apps/v1
kind: Deployment
metadata:
name: deepseek-coder-7b-l20
labels:
model.aibrix.ai/min_replicas: "0"
target:
kind: Deployment
name: deepseek-coder-7b-l20

apiVersion: kustomize.config.k8s.io/v1beta1
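
As a usage sketch (assuming the sample layout above), the whole heterogeneous sample and its patches can be applied together with `kubectl apply -k samples/heterogeneous`, after which the `model.aibrix.ai/min_replicas` labels on the two Deployments should reflect the patched values.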