From 1b97711e3d258e16f986bddb9be52af3f78919e1 Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Sat, 10 Jan 2026 16:03:57 -0500 Subject: [PATCH 1/7] fix --- components/src/dynamo/vllm/handlers.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index b9a7ba6baa5..63b77a52b42 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -1300,16 +1300,21 @@ async def _generate_token_mode(self, request, context, request_id): async def _generate_text_mode(self, request, context, request_id): """Generate text using OpenAI-compatible format (text-in-text-out).""" + # Get text input using InputParamManager input_data = self.input_param_manager.get_input_param( request, use_tokenizer=True ) + multi_modal_data = await self._extract_multimodal_data(request) + # Build prompt for vLLM if isinstance(input_data, list): - prompt = TokensPrompt(prompt_token_ids=input_data) + prompt = TokensPrompt( + prompt_token_ids=input_data, multi_modal_data=multi_modal_data + ) else: - prompt = TextPrompt(prompt=input_data) + prompt = TextPrompt(prompt=input_data, multi_modal_data=multi_modal_data) # Build sampling params from OpenAI-style request sampling_params = build_sampling_params_openai( From e3254ad11b1fbdc23197a33308145d526aca68cf Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Fri, 9 Jan 2026 14:39:47 -0800 Subject: [PATCH 2/7] ups Signed-off-by: Qidong Su From b251962a8505c30e6364fc7e2d714d8874f02aca Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Sun, 11 Jan 2026 00:00:14 -0500 Subject: [PATCH 3/7] upd --- components/src/dynamo/vllm/handlers.py | 56 +++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index 63b77a52b42..e757c7b1429 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -893,6 +893,59 @@ async def _extract_multimodal_data( return vllm_mm_data if vllm_mm_data else None + async def _extract_multimodal_from_openai_messages( + self, request: Dict[str, Any] + ) -> Dict[str, Any] | None: + messages = request.get("messages") + if not messages: + return None + + image_urls = [] + for message in messages: + content = message.get("content") + if not isinstance(content, list): + continue + + for item in content: + if not isinstance(item, dict) or item.get("type") != "image_url": + continue + + image_url_data = item.get("image_url") + if isinstance(image_url_data, dict): + url = image_url_data.get("url") + elif isinstance(image_url_data, str): + url = image_url_data + else: + continue + + if url: + image_urls.append(url) + + if not image_urls: + return None + + if not self.enable_multimodal: + raise ValueError( + "Received multimodal data but multimodal processing is not enabled. " + "Use --enable-multimodal flag to enable multimodal processing." 
+ ) + + images = [] + for url in image_urls: + try: + image = await self.image_loader.load_image(url) + images.append(image) + logger.debug(f"Loaded image from OpenAI message: {url[:80]}...") + except Exception: + logger.exception(f"Failed to load image from {url[:80]}...") + raise + + vllm_mm_data = {"image": images[0] if len(images) == 1 else images} + logger.debug( + f"Extracted {len(images)} image(s) from OpenAI messages for multimodal processing" + ) + return vllm_mm_data + def _build_prompt_from_request( self, request: Dict[str, Any], @@ -1306,7 +1359,8 @@ async def _generate_text_mode(self, request, context, request_id): request, use_tokenizer=True ) - multi_modal_data = await self._extract_multimodal_data(request) + # Extract multimodal data + multi_modal_data = await self._extract_multimodal_from_openai_messages(request) # Build prompt for vLLM if isinstance(input_data, list): From 11db62bab46d3a7c803e1e0b742d8fbfb1affef9 Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Mon, 12 Jan 2026 14:42:04 -0500 Subject: [PATCH 4/7] update --- components/src/dynamo/vllm/handlers.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index e757c7b1429..c3daf0a50ae 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -174,6 +174,7 @@ def build_sampling_params( def build_sampling_params_openai( request: Dict[str, Any], default_sampling_params: Dict[str, Any], + model_max_len: int | None = None, ) -> SamplingParams: """ Build SamplingParams from an OpenAI-compatible request format. @@ -181,7 +182,8 @@ def build_sampling_params_openai( Args: request: The OpenAI-style request dict with parameters like temperature, max_tokens, etc. 
default_sampling_params: Default sampling parameters to initialize with - + model_max_len: Maximum model context length for computing dynamic max_tokens default + Returns: SamplingParams configured from the request """ @@ -210,6 +212,9 @@ def build_sampling_params_openai( # Handle max_tokens if "max_tokens" in request and request["max_tokens"] is not None: sampling_params.max_tokens = request["max_tokens"] + elif model_max_len is not None: + # Match token mode behavior: generate until context limit + sampling_params.max_tokens = model_max_len # Handle stop sequences if "stop" in request and request["stop"] is not None: @@ -1372,7 +1377,7 @@ async def _generate_text_mode(self, request, context, request_id): # Build sampling params from OpenAI-style request sampling_params = build_sampling_params_openai( - request, self.default_sampling_params + request, self.default_sampling_params, self.model_max_len ) dp_rank = request.get("dp_rank", None) From 986f699465f44a0d60945617d08647d35990ed50 Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Mon, 12 Jan 2026 14:58:01 -0500 Subject: [PATCH 5/7] fix --- components/src/dynamo/vllm/handlers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index c3daf0a50ae..b84bcbef039 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -210,8 +210,8 @@ def build_sampling_params_openai( setattr(sampling_params, param_key, request[req_key]) # Handle max_tokens - if "max_tokens" in request and request["max_tokens"] is not None: - sampling_params.max_tokens = request["max_tokens"] + if (provided_max_tokens := request.get("max_tokens")) is not None: + sampling_params.max_tokens = provided_max_tokens elif model_max_len is not None: # Match token mode behavior: generate until context limit sampling_params.max_tokens = model_max_len From c3794c5779751d2b3975a5bd34d7f1e372167bab Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Sun, 1 Feb 2026 22:12:35 -0500 Subject: [PATCH 6/7] upd --- components/src/dynamo/vllm/handlers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index b84bcbef039..c1053f17f16 100644 --- a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -210,11 +210,11 @@ def build_sampling_params_openai( setattr(sampling_params, param_key, request[req_key]) # Handle max_tokens - if (provided_max_tokens := request.get("max_tokens")) is not None: - sampling_params.max_tokens = provided_max_tokens - elif model_max_len is not None: - # Match token mode behavior: generate until context limit - sampling_params.max_tokens = model_max_len + provided_max_tokens = request.get("max_tokens") + model_config_max_tokens = default_sampling_params.get("max_tokens") + + sampling_params.max_tokens = min(filter(lambda x: x is not None, + [provided_max_tokens, model_max_len, model_config_max_tokens])) # Handle stop sequences if "stop" in request and request["stop"] is not None: From ddde01b414c81eeed6f9c452ad005b160887e509 Mon Sep 17 00:00:00 2001 From: Qidong Su Date: Mon, 2 Feb 2026 16:27:22 -0500 Subject: [PATCH 7/7] fix --- components/src/dynamo/vllm/handlers.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py index c1053f17f16..b6daf581294 100644 --- 
a/components/src/dynamo/vllm/handlers.py +++ b/components/src/dynamo/vllm/handlers.py @@ -166,8 +166,12 @@ def build_sampling_params( input_length = len(token_ids) dynamic_default = max(1, model_max_len - input_length) model_config_max_tokens = default_sampling_params.get("max_tokens") - sampling_params.max_tokens = min(filter(lambda x: x is not None, - [provided_max_tokens, dynamic_default, model_config_max_tokens])) + sampling_params.max_tokens = min( + filter( + lambda x: x is not None, + [provided_max_tokens, dynamic_default, model_config_max_tokens], + ) + ) return sampling_params @@ -183,7 +187,7 @@ def build_sampling_params_openai( request: The OpenAI-style request dict with parameters like temperature, max_tokens, etc. default_sampling_params: Default sampling parameters to initialize with model_max_len: Maximum model context length for computing dynamic max_tokens default - + Returns: SamplingParams configured from the request """ @@ -212,9 +216,13 @@ def build_sampling_params_openai( # Handle max_tokens provided_max_tokens = request.get("max_tokens") model_config_max_tokens = default_sampling_params.get("max_tokens") - - sampling_params.max_tokens = min(filter(lambda x: x is not None, - [provided_max_tokens, model_max_len, model_config_max_tokens])) + + sampling_params.max_tokens = min( + filter( + lambda x: x is not None, + [provided_max_tokens, model_max_len, model_config_max_tokens], + ) + ) # Handle stop sequences if "stop" in request and request["stop"] is not None:
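
A minimal standalone sketch (not applied by any of the patches above) of the image extraction that PATCH 3/7 introduces in _extract_multimodal_from_openai_messages: walking OpenAI-style chat messages and collecting image_url entries. The helper name collect_image_urls and the example request are assumptions for illustration only; the real handler additionally loads each URL through self.image_loader and rejects image input unless --enable-multimodal is set.

    from typing import Any, Dict, List

    def collect_image_urls(request: Dict[str, Any]) -> List[str]:
        """Collect every image_url string found in OpenAI-style chat messages."""
        urls: List[str] = []
        for message in request.get("messages") or []:
            content = message.get("content")
            if not isinstance(content, list):
                continue  # plain string content carries no image parts
            for item in content:
                if not isinstance(item, dict) or item.get("type") != "image_url":
                    continue
                image_url = item.get("image_url")
                # The OpenAI schema nests the URL as {"url": ...}; accept a bare string too.
                url = image_url.get("url") if isinstance(image_url, dict) else image_url
                if isinstance(url, str) and url:
                    urls.append(url)
        return urls

    request = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image."},
                    {"type": "image_url",
                     "image_url": {"url": "https://example.com/cat.png"}},
                ],
            }
        ]
    }
    print(collect_image_urls(request))  # ['https://example.com/cat.png']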
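
PATCH 6/7 and 7/7 settle max_tokens by taking the minimum of whichever limits are set: the request's own max_tokens, the model context length (in token mode, the dynamic default max(1, model_max_len - input_length)), and the config-level default. Below is a small sketch of that resolution with a hypothetical helper name; unlike the patched code, which assumes at least one limit is non-None (min() over an empty filter raises ValueError), this sketch falls back to None when nothing is set.

    from typing import Optional

    def resolve_max_tokens(
        provided_max_tokens: Optional[int],
        model_max_len: Optional[int],
        model_config_max_tokens: Optional[int],
    ) -> Optional[int]:
        """Pick the tightest limit among those that are actually set."""
        candidates = [
            limit
            for limit in (provided_max_tokens, model_max_len, model_config_max_tokens)
            if limit is not None
        ]
        return min(candidates) if candidates else None

    print(resolve_max_tokens(256, 4096, 2048))   # 256  -> the request's own cap wins
    print(resolve_max_tokens(None, 4096, 2048))  # 2048 -> config default caps the context limit
    print(resolve_max_tokens(None, None, None))  # None -> nothing to clamp against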