From 1b97711e3d258e16f986bddb9be52af3f78919e1 Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Sat, 10 Jan 2026 16:03:57 -0500 Subject: [PATCH 1/7] fix --- components/src/dynamo/vllm/handlers.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index b9a7ba6baa5..63b77a52b42 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -1300,16 +1300,21 @@ async def _generate_token_mode(self, request, context, request_id): async def _generate_text_mode(self, request, context, request_id): """Generate text using OpenAI-compatible format (text-in-text-out).""" + # Get text input using InputParamManager input_data = self.input_param_manager.get_input_param( request, use_tokenizer=True ) + multi_modal_data = await self._extract_multimodal_data(request) + # Build prompt for vLLM if isinstance(input_data, list): - prompt = TokensPrompt(prompt_token_ids=input_data) + prompt = TokensPrompt( + prompt_token_ids=input_data, multi_modal_data=multi_modal_data + ) else: - prompt = TextPrompt(prompt=input_data) + prompt = TextPrompt(prompt=input_data, multi_modal_data=multi_modal_data) # Build sampling params from OpenAI-style request sampling_params = build_sampling_params_openai( From e3254ad11b1fbdc23197a33308145d526aca68cf Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Fri, 9 Jan 2026 14:39:47 -0800 Subject: [PATCH 2/7] ups Signed-off-by: Qidong Su From b251962a8505c30e6364fc7e2d714d8874f02aca Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Sun, 11 Jan 2026 00:00:14 -0500 Subject: [PATCH 3/7] upd --- components/src/dynamo/vllm/handlers.py | 56 +++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index 63b77a52b42..e757c7b1429 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -893,6 +893,59 @@ async def _extract_multimodal_data( return vllm_mm_data if vllm_mm_data else None + async def _extract_multimodal_from_openai_messages( + self, request: Dict[str, Any] + ) -> Dict[str, Any] | None: + messages = request.get("messages") + if not messages: + return None + + image_urls = [] + for message in messages: + content = message.get("content") + if not isinstance(content, list): + continue + + for item in content: + if not isinstance(item, dict) or item.get("type") != "image_url": + continue + + image_url_data = item.get("image_url") + if isinstance(image_url_data, dict): + url = image_url_data.get("url") + elif isinstance(image_url_data, str): + url = image_url_data + else: + continue + + if url: + image_urls.append(url) + + if not image_urls: + return None + + if not self.enable_multimodal: + raise ValueError( + "Received multimodal data but multimodal processing is not enabled. " + "Use --enable-multimodal flag to enable multimodal processing." 
+ ) + + images = [] + for url in image_urls: + try: + image = await self.image_loader.load_image(url) + images.append(image) + logger.debug(f"Loaded image from OpenAI message: {url[:80]}...") + except Exception: + logger.exception(f"Failed to load image from {url[:80]}...") + raise + + vllm_mm_data = {"image": images[0] if len(images) == 1 else images} + logger.debug( + f"Extracted {len(images)} image(s) from OpenAI messages for multimodal processing" + ) + return vllm_mm_data + def _build_prompt_from_request( self, request: Dict[str, Any], @@ -1306,7 +1359,8 @@ async def _generate_text_mode(self, request, context, request_id): request, use_tokenizer=True ) - multi_modal_data = await self._extract_multimodal_data(request) + # Extract multimodal data + multi_modal_data = await self._extract_multimodal_from_openai_messages(request) # Build prompt for vLLM if isinstance(input_data, list): From 11db62bab46d3a7c803e1e0b742d8fbfb1affef9 Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Mon, 12 Jan 2026 14:42:04 -0500 Subject: [PATCH 4/7] update --- components/src/dynamo/vllm/handlers.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index e757c7b1429..c3daf0a50ae 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -174,6 +174,7 @@ def build_sampling_params( def build_sampling_params_openai( request: Dict[str, Any], default_sampling_params: Dict[str, Any], + model_max_len: int | None = None, ) -> SamplingParams: """ Build SamplingParams from an OpenAI-compatible request format. @@ -181,7 +182,8 @@ def build_sampling_params_openai( Args: request: The OpenAI-style request dict with parameters like temperature, max_tokens, etc. 
default_sampling_params: Default sampling parameters to initialize with - + model_max_len: Maximum model context length for computing dynamic max_tokens default + Returns: SamplingParams configured from the request """ @@ -210,6 +212,9 @@ def build_sampling_params_openai( # Handle max_tokens if "max_tokens" in request and request["max_tokens"] is not None: sampling_params.max_tokens = request["max_tokens"] + elif model_max_len is not None: + # Match token mode behavior: generate until context limit + sampling_params.max_tokens = model_max_len # Handle stop sequences if "stop" in request and request["stop"] is not None: @@ -1372,7 +1377,7 @@ async def _generate_text_mode(self, request, context, request_id): # Build sampling params from OpenAI-style request sampling_params = build_sampling_params_openai( - request, self.default_sampling_params + request, self.default_sampling_params, self.model_max_len ) dp_rank = request.get("dp_rank", None) From 986f699465f44a0d60945617d08647d35990ed50 Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Mon, 12 Jan 2026 14:58:01 -0500 Subject: [PATCH 5/7] fix --- components/src/dynamo/vllm/handlers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index c3daf0a50ae..b84bcbef039 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -210,8 +210,8 @@ def build_sampling_params_openai( setattr(sampling_params, param_key, request[req_key]) # Handle max_tokens - if "max_tokens" in request and request["max_tokens"] is not None: - sampling_params.max_tokens = request["max_tokens"] + if (provided_max_tokens := request.get("max_tokens")) is not None: + sampling_params.max_tokens = provided_max_tokens elif model_max_len is not None: # Match token mode behavior: generate until context limit sampling_params.max_tokens = model_max_len From c3794c5779751d2b3975a5bd34d7f1e372167bab Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Sun, 1 Feb 2026 22:12:35 -0500 Subject: [PATCH 6/7] upd --- components/src/dynamo/vllm/handlers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index b84bcbef039..c1053f17f16 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -210,11 +210,11 @@ def build_sampling_params_openai( setattr(sampling_params, param_key, request[req_key]) # Handle max_tokens - if (provided_max_tokens := request.get("max_tokens")) is not None: - sampling_params.max_tokens = provided_max_tokens - elif model_max_len is not None: - # Match token mode behavior: generate until context limit - sampling_params.max_tokens = model_max_len + provided_max_tokens = request.get("max_tokens") + model_config_max_tokens = default_sampling_params.get("max_tokens") + + sampling_params.max_tokens = min(filter(lambda x: x is not None, + [provided_max_tokens, model_max_len, model_config_max_tokens])) # Handle stop sequences if "stop" in request and request["stop"] is not None: From ddde01b414c81eeed6f9c452ad005b160887e509 Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Mon, 2 Feb 2026 16:27:22 -0500 Subject: [PATCH 7/7] fix --- components/src/dynamo/vllm/handlers.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index c1053f17f16..b6daf581294 100644 --- 
a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -166,8 +166,12 @@ def build_sampling_params( input_length = len(token_ids) dynamic_default = max(1, model_max_len - input_length) model_config_max_tokens = default_sampling_params.get("max_tokens") - sampling_params.max_tokens = min(filter(lambda x: x is not None, - [provided_max_tokens, dynamic_default, model_config_max_tokens])) + sampling_params.max_tokens = min( + filter( + lambda x: x is not None, + [provided_max_tokens, dynamic_default, model_config_max_tokens], + ) + ) return sampling_params @@ -183,7 +187,7 @@ def build_sampling_params_openai( request: The OpenAI-style request dict with parameters like temperature, max_tokens, etc. default_sampling_params: Default sampling parameters to initialize with model_max_len: Maximum model context length for computing dynamic max_tokens default - + Returns: SamplingParams configured from the request """ @@ -212,9 +216,13 @@ def build_sampling_params_openai( # Handle max_tokens provided_max_tokens = request.get("max_tokens") model_config_max_tokens = default_sampling_params.get("max_tokens") - - sampling_params.max_tokens = min(filter(lambda x: x is not None, - [provided_max_tokens, model_max_len, model_config_max_tokens])) + + sampling_params.max_tokens = min( + filter( + lambda x: x is not None, + [provided_max_tokens, model_max_len, model_config_max_tokens], + ) + ) # Handle stop sequences if "stop" in request and request["stop"] is not None:
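
A minimal standalone sketch (not applied by any of the patches above) of the image extraction that PATCH 3/7 introduces in _extract_multimodal_from_openai_messages: walking OpenAI-style chat messages and collecting image_url entries. The helper name collect_image_urls and the example request are assumptions for illustration only; the real handler additionally loads each URL through self.image_loader and rejects image input unless --enable-multimodal is set.

    from typing import Any, Dict, List

    def collect_image_urls(request: Dict[str, Any]) -> List[str]:
        """Collect every image_url string found in OpenAI-style chat messages."""
        urls: List[str] = []
        for message in request.get("messages") or []:
            content = message.get("content")
            if not isinstance(content, list):
                continue  # plain string content carries no image parts
            for item in content:
                if not isinstance(item, dict) or item.get("type") != "image_url":
                    continue
                image_url = item.get("image_url")
                # The OpenAI schema nests the URL as {"url": ...}; accept a bare string too.
                url = image_url.get("url") if isinstance(image_url, dict) else image_url
                if isinstance(url, str) and url:
                    urls.append(url)
        return urls

    request = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image."},
                    {"type": "image_url",
                     "image_url": {"url": "https://example.com/cat.png"}},
                ],
            }
        ]
    }
    print(collect_image_urls(request))  # ['https://example.com/cat.png']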
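
PATCH 6/7 and 7/7 settle max_tokens by taking the minimum of whichever limits are set: the request's own max_tokens, the model context length (in token mode, the dynamic default max(1, model_max_len - input_length)), and the config-level default. Below is a small sketch of that resolution with a hypothetical helper name; unlike the patched code, which assumes at least one limit is non-None (min() over an empty filter raises ValueError), this sketch falls back to None when nothing is set.

    from typing import Optional

    def resolve_max_tokens(
        provided_max_tokens: Optional[int],
        model_max_len: Optional[int],
        model_config_max_tokens: Optional[int],
    ) -> Optional[int]:
        """Pick the tightest limit among those that are actually set."""
        candidates = [
            limit
            for limit in (provided_max_tokens, model_max_len, model_config_max_tokens)
            if limit is not None
        ]
        return min(candidates) if candidates else None

    print(resolve_max_tokens(256, 4096, 2048))   # 256  -> the request's own cap wins
    print(resolve_max_tokens(None, 4096, 2048))  # 2048 -> config default caps the context limit
    print(resolve_max_tokens(None, None, None))  # None -> nothing to clamp against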