diff --git a/README.md b/README.md index 98d26bb4d3..a2e6337499 100644 --- a/README.md +++ b/README.md @@ -409,16 +409,16 @@ nm.deploy() nm.serve() ``` -### Deploy NeMo Multimodal Models Directly with Triton Inference Server +### Deploy Megatron Multimodal Models Directly with Triton Inference Server -You can also deploy NeMo multimodal models directly using Triton Inference Server without exporting to TensorRT-LLM. This provides a simpler deployment path while still leveraging Triton's scalable serving capabilities. +You can also deploy Megatron multimodal models directly using Triton Inference Server without exporting to TensorRT-LLM. This provides a simpler deployment path while still leveraging Triton's scalable serving capabilities. ```python from nemo_deploy import DeployPyTriton -from nemo_deploy.multimodal import NeMoMultimodalDeployable +from nemo_deploy.multimodal import MegatronMultimodalDeployable -model = NeMoMultimodalDeployable( - nemo_checkpoint_filepath="/path/to/model.nemo", +model = MegatronMultimodalDeployable( + megatron_checkpoint_filepath="/path/to/model.nemo", tensor_parallel_size=1, pipeline_parallel_size=1, ) @@ -458,18 +458,17 @@ output = nq.query( print(output) ``` -### Query Directly Deployed NeMo Multimodal Models +### Query Directly Deployed Megatron Multimodal Models -For multimodal models deployed directly with `NeMoMultimodalDeployable`, use the `NemoQueryMultimodalPytorch` class: +For multimodal models deployed directly with `MegatronMultimodalDeployable`, use the `NemoQueryMultimodalPytorch` class: ```python from nemo_deploy.multimodal import NemoQueryMultimodalPytorch -from PIL import Image nq = NemoQueryMultimodalPytorch(url="localhost:8000", model_name="qwen") output = nq.query_multimodal( prompts=["What is in this image?"], - images=[Image.open("/path/to/image.jpg")], + images=["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"], max_length=100, top_k=1, top_p=0.0, diff --git a/nemo_deploy/multimodal/nemo_multimodal_deployable.py b/nemo_deploy/multimodal/megatron_multimodal_deployable.py similarity index 82% rename from nemo_deploy/multimodal/nemo_multimodal_deployable.py rename to nemo_deploy/multimodal/megatron_multimodal_deployable.py index 5bcbb8a05a..3dc60d27c5 100644 --- a/nemo_deploy/multimodal/nemo_multimodal_deployable.py +++ b/nemo_deploy/multimodal/megatron_multimodal_deployable.py @@ -24,19 +24,19 @@ from nemo_deploy import ITritonDeployable from nemo_deploy.utils import cast_output, str_ndarray2list from nemo_export_deploy_common.import_utils import ( - MISSING_NEMO_MSG, + MISSING_MBRIDGE_MSG, MISSING_TRITON_MSG, UnavailableError, null_decorator, ) try: - from nemo.collections.vlm.inference.base import generate, setup_model_and_tokenizer - from nemo.collections.vlm.inference.qwenvl_inference_wrapper import QwenVLInferenceWrapper + from megatron.bridge.inference.vlm.base import generate, setup_model_and_tokenizer + from megatron.bridge.inference.vlm.qwenvl_inference_wrapper import QwenVLInferenceWrapper - HAVE_NEMO = True + HAVE_MBRIDGE = True except (ImportError, ModuleNotFoundError): - HAVE_NEMO = False + HAVE_MBRIDGE = False from typing import Any generate = Any @@ -67,42 +67,46 @@ def dict_to_str(messages): return json.dumps(messages) -class NeMoMultimodalDeployable(ITritonDeployable): - """Triton inference server compatible deploy class for a NeMo multimodal model file. 
+class MegatronMultimodalDeployable(ITritonDeployable): + """Triton inference server compatible deploy class for a Megatron multimodal model file. Args: - nemo_checkpoint_filepath (str): path for the nemo checkpoint. - tensor_parallel_size (int): tensor parallelism. - pipeline_parallel_size (int): pipeline parallelism. + megatron_checkpoint_filepath (str): path for the megatron checkpoint. + tensor_model_parallel_size (int): tensor parallelism. + pipeline_model_parallel_size (int): pipeline parallelism. params_dtype (torch.dtype): data type for model parameters. inference_batch_times_seqlen_threshold (int): sequence threshold. + inference_max_seq_length (int): maximum sequence length for inference. """ def __init__( self, - nemo_checkpoint_filepath: str = None, - tensor_parallel_size: int = 1, - pipeline_parallel_size: int = 1, + megatron_checkpoint_filepath: str, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, params_dtype: torch.dtype = torch.bfloat16, inference_batch_times_seqlen_threshold: int = 1000, + inference_max_seq_length: int = 8192, ): if not HAVE_TRITON: raise UnavailableError(MISSING_TRITON_MSG) - if not HAVE_NEMO: - raise UnavailableError(MISSING_NEMO_MSG) + if not HAVE_MBRIDGE: + raise UnavailableError(MISSING_MBRIDGE_MSG) - self.nemo_checkpoint_filepath = nemo_checkpoint_filepath - self.tensor_parallel_size = tensor_parallel_size - self.pipeline_parallel_size = pipeline_parallel_size + self.megatron_checkpoint_filepath = megatron_checkpoint_filepath + self.tensor_model_parallel_size = tensor_model_parallel_size + self.pipeline_model_parallel_size = pipeline_model_parallel_size self.params_dtype = params_dtype self.inference_batch_times_seqlen_threshold = inference_batch_times_seqlen_threshold + self.inference_max_seq_length = inference_max_seq_length self.inference_wrapped_model, self.processor = setup_model_and_tokenizer( - path=nemo_checkpoint_filepath, - tp_size=tensor_parallel_size, - pp_size=pipeline_parallel_size, + megatron_model_path=megatron_checkpoint_filepath, + tp=tensor_model_parallel_size, + pp=pipeline_model_parallel_size, params_dtype=params_dtype, inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold, + inference_max_seq_length=inference_max_seq_length, ) def generate( @@ -157,8 +161,16 @@ def apply_chat_template(self, messages, add_generation_prompt=True): ) return text - def base64_to_image(self, image_base64): - """Convert base64-encoded image to PIL Image.""" + def process_image_input(self, image_source): + """Process image input from base64-encoded string or HTTP URL. + + Args: + image_source (str): Image source - either base64-encoded image string with data URI prefix + (e.g., "data:image;base64,...") or HTTP/HTTPS URL (e.g., "http://example.com/image.jpg") + + Returns: + Processed image content suitable for model inference. + """ if isinstance(self.inference_wrapped_model, QwenVLInferenceWrapper): from qwen_vl_utils import process_vision_info @@ -166,7 +178,7 @@ def base64_to_image(self, image_base64): { "role": "user", "content": [ - {"type": "image", "image": f"data:image;base64,{image_base64}"}, + {"type": "image", "image": image_source}, ], } ] @@ -259,6 +271,12 @@ def _infer_fn( Returns: dict: sentences. """ + # Handle temperature=0.0 for greedy decoding + if temperature == 0.0: + LOGGER.warning("temperature=0.0 detected. 
Setting top_k=1 for greedy sampling.") + top_k = 1 + top_p = 0.0 + inference_params = CommonInferenceParams( temperature=float(temperature), top_k=int(top_k), @@ -266,7 +284,7 @@ def _infer_fn( num_tokens_to_generate=num_tokens_to_generate, ) - images = [self.base64_to_image(img_b64) for img_b64 in images] + images = [self.process_image_input(image_source) for image_source in images] results = self.generate( prompts, diff --git a/nemo_deploy/multimodal/query_multimodal.py b/nemo_deploy/multimodal/query_multimodal.py index 17d1e49a87..41590ca0a1 100644 --- a/nemo_deploy/multimodal/query_multimodal.py +++ b/nemo_deploy/multimodal/query_multimodal.py @@ -195,9 +195,16 @@ class NemoQueryMultimodalPytorch: nq = NemoQueryMultimodalPytorch(url="localhost", model_name="qwen") - # Encode image to base64 + # Option 1: Use HTTP URL directly + output = nq.query_multimodal( + prompts=["Describe this image"], + images=["http://example.com/image.jpg"], + max_length=100, + ) + + # Option 2: Encode image to base64 with data URI prefix with open("image.jpg", "rb") as f: - image_base64 = base64.b64encode(f.read()).decode('utf-8') + image_base64 = "data:image;base64," + base64.b64encode(f.read()).decode('utf-8') output = nq.query_multimodal( prompts=["Describe this image"], @@ -231,7 +238,8 @@ def query_multimodal( Args: prompts (List[str]): List of input text prompts. - images (List[str]): List of base64-encoded image strings. + images (List[str]): List of image strings - either base64-encoded with data URI prefix + (e.g., "data:image;base64,...") or HTTP/HTTPS URLs (e.g., "http://example.com/image.jpg"). max_length (Optional[int]): Maximum number of tokens to generate. max_batch_size (Optional[int]): Maximum batch size for inference. top_k (Optional[int]): Limits to the top K tokens to consider at each step. diff --git a/nemo_deploy/service/fastapi_interface_to_pytriton_multimodal.py b/nemo_deploy/service/fastapi_interface_to_pytriton_multimodal.py index 3955753ea3..df854e0e59 100644 --- a/nemo_deploy/service/fastapi_interface_to_pytriton_multimodal.py +++ b/nemo_deploy/service/fastapi_interface_to_pytriton_multimodal.py @@ -19,7 +19,7 @@ import numpy as np import requests from fastapi import FastAPI, HTTPException -from pydantic import BaseModel, model_validator +from pydantic import BaseModel from pydantic_settings import BaseSettings from nemo_deploy.multimodal.query_multimodal import NemoQueryMultimodalPytorch @@ -82,18 +82,10 @@ class BaseMultimodalRequest(BaseModel): max_tokens: int = 50 temperature: float = 1.0 top_p: float = 0.0 - top_k: int = 1 + top_k: int = 0 random_seed: Optional[int] = None max_batch_size: int = 4 - @model_validator(mode="after") - def set_greedy_params(self): - """Validate parameters for greedy decoding.""" - if self.temperature == 0 and self.top_p == 0: - logging.warning("Both temperature and top_p are 0. Setting top_k to 1 to ensure greedy sampling.") - self.top_k = 1 - return self - class MultimodalCompletionRequest(BaseMultimodalRequest): """Represents a request for multimodal text completion. @@ -290,12 +282,33 @@ def dict_to_str(messages): @app.post("/v1/chat/completions/") async def chat_completions_v1(request: MultimodalChatCompletionRequest): - """Defines the multimodal chat completions endpoint and queries the model deployed on PyTriton server.""" + """Defines the multimodal chat completions endpoint and queries the model deployed on PyTriton server. + + Supports two image content formats (normalized internally to format 1): + 1. 
{"type": "image", "image": "url_or_base64"} + 2. {"type": "image_url", "image_url": {"url": "url_or_base64"}} (OpenAI-style, converted to format 1) + """ url = f"http://{triton_settings.triton_service_ip}:{triton_settings.triton_service_port}" prompts = request.messages if not isinstance(request.messages, list): prompts = [request.messages] + + # Normalize image_url format to image format for consistent processing + for message in prompts: + for content in message["content"]: + if content["type"] == "image_url": + # Convert OpenAI-style image_url to standard image format + if isinstance(content.get("image_url"), dict): + image_data = content["image_url"]["url"] + else: + image_data = content["image_url"] + # Transform to image format + content["type"] = "image" + content["image"] = image_data + # Remove image_url field + content.pop("image_url", None) + # Serialize the dictionary to a JSON string represnetation to be able to convert to numpy array # (str_list2numpy) and back to list (str_ndarray2list) as required by PyTriton. Using the dictionaries directly # with these methods is not possible as they expect string type. diff --git a/nemo_export_deploy_common/import_utils.py b/nemo_export_deploy_common/import_utils.py index 103b7af05b..1543ab46cc 100644 --- a/nemo_export_deploy_common/import_utils.py +++ b/nemo_export_deploy_common/import_utils.py @@ -39,6 +39,9 @@ MISSING_TENSORRT_LLM_MSG = "tensorrt_llm is not available. Please install it with `pip install tensorrt-llm`." MISSING_TENSORRT_MSG = "tensorrt is not available. Please install it with `pip install nvidia-tensorrt`." MISSING_NEMO_MSG = "nemo is not available. Please install it with `pip install nemo`." +MISSING_MBRIDGE_MSG = ( + "megatron.bridge is not available. Please install it from https://github.com/NVIDIA-NeMo/Megatron-Bridge" +) MISSING_TORCHVISION_MSG = "torchvision is not available. Please install it with `pip install torchvision`." MISSING_MODELOPT_MSG = "modelopt is not available. Please install it with `pip install nvidia-modelopt[torch]`." MISSING_RAY_MSG = "ray is not available. Please install it with `pip install ray`." diff --git a/scripts/deploy/multimodal/deploy_inframework_triton.py b/scripts/deploy/multimodal/deploy_inframework_triton.py index fb11b7a4e2..2106b46e58 100644 --- a/scripts/deploy/multimodal/deploy_inframework_triton.py +++ b/scripts/deploy/multimodal/deploy_inframework_triton.py @@ -31,18 +31,18 @@ multimodal_supported = True try: - from nemo_deploy.multimodal.nemo_multimodal_deployable import NeMoMultimodalDeployable + from nemo_deploy.multimodal.megatron_multimodal_deployable import MegatronMultimodalDeployable except Exception as e: - LOGGER.warning(f"Cannot import NeMoMultimodalDeployable, it will not be available. {type(e).__name__}: {e}") + LOGGER.warning(f"Cannot import MegatronMultimodalDeployable, it will not be available. 
{type(e).__name__}: {e}") multimodal_supported = False def get_args(argv): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="Deploy nemo multimodal models to Triton", + description="Deploy megatron multimodal models to Triton", ) - parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") + parser.add_argument("-mc", "--megatron_checkpoint", type=str, help="Source megatron checkpoint path") parser.add_argument( "-tmn", "--triton_model_name", @@ -88,14 +88,14 @@ def get_args(argv): parser.add_argument( "-tps", - "--tensor_parallel_size", + "--tensor_model_parallel_size", default=1, type=int, help="Tensor parallelism size", ) parser.add_argument( "-pps", - "--pipeline_parallel_size", + "--pipeline_model_parallel_size", default=1, type=int, help="Pipeline parallelism size", @@ -130,6 +130,13 @@ def get_args(argv): type=int, help="Inference batch times sequence length threshold", ) + parser.add_argument( + "-imsl", + "--inference_max_seq_length", + default=8192, + type=int, + help="Maximum sequence length for inference", + ) args = parser.parse_args(argv) return args @@ -147,9 +154,9 @@ def nemo_deploy(argv): LOGGER.info(args) if not multimodal_supported: - raise ValueError("NeMoMultimodalDeployable is not supported in this environment.") + raise ValueError("MegatronMultimodalDeployable is not supported in this environment.") - if args.nemo_checkpoint is None: + if args.megatron_checkpoint is None: raise ValueError("In-Framework deployment requires a checkpoint folder.") # Convert dtype string to torch dtype @@ -160,12 +167,13 @@ def nemo_deploy(argv): } params_dtype = dtype_map[args.params_dtype] - model = NeMoMultimodalDeployable( - nemo_checkpoint_filepath=args.nemo_checkpoint, - tensor_parallel_size=args.tensor_parallel_size, - pipeline_parallel_size=args.pipeline_parallel_size, + model = MegatronMultimodalDeployable( + megatron_checkpoint_filepath=args.megatron_checkpoint, + tensor_model_parallel_size=args.tensor_model_parallel_size, + pipeline_model_parallel_size=args.pipeline_model_parallel_size, params_dtype=params_dtype, inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold, + inference_max_seq_length=args.inference_max_seq_length, ) if torch.distributed.is_initialized(): diff --git a/scripts/deploy/multimodal/query_ray_deployment.py b/scripts/deploy/multimodal/query_fastapi_inframework.py similarity index 85% rename from scripts/deploy/multimodal/query_ray_deployment.py rename to scripts/deploy/multimodal/query_fastapi_inframework.py index 971af56559..1ee17fe3d9 100644 --- a/scripts/deploy/multimodal/query_ray_deployment.py +++ b/scripts/deploy/multimodal/query_fastapi_inframework.py @@ -64,19 +64,16 @@ def load_image_from_path(image_path: str) -> str: image_path: Path to local image file or URL Returns: - Base64-encoded image string + Image string - HTTP URL directly or base64-encoded string for local files """ if image_path.startswith(("http://", "https://")): - LOGGER.info(f"Loading image from URL: {image_path}") - response = requests.get(image_path, timeout=30) - response.raise_for_status() - image_content = response.content + LOGGER.info(f"Using image URL directly: {image_path}") + return image_path else: - LOGGER.info(f"Loading image from local path: {image_path}") + LOGGER.info(f"Loading and encoding image from local path: {image_path}") with open(image_path, "rb") as f: image_content = f.read() - - return base64.b64encode(image_content).decode("utf-8") + return 
"data:image;base64," + base64.b64encode(image_content).decode("utf-8") def test_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None: @@ -114,8 +111,8 @@ def test_completions_endpoint(base_url: str, model_id: str, prompt: str = None, payload["prompt"] = text try: - image_base64 = load_image_from_path(image_source) - payload["image"] = image_base64 + image_data = load_image_from_path(image_source) + payload["image"] = image_data except Exception as e: LOGGER.error(f"Failed to load image: {e}") return @@ -130,7 +127,12 @@ def test_completions_endpoint(base_url: str, model_id: str, prompt: str = None, def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = None, image_source: str = None) -> None: - """Test the chat completions endpoint for multimodal models.""" + """Test the chat completions endpoint for multimodal models. + + Supports two image content formats: + 1. {"type": "image", "image": "url_or_base64"} + 2. {"type": "image_url", "image_url": {"url": "url_or_base64"}} (OpenAI-style) + """ url = f"{base_url}/v1/chat/completions/" # Use provided prompt or default @@ -141,8 +143,10 @@ def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = N content = [] try: - image_base64 = load_image_from_path(image_source) - content.append({"type": "image", "image": image_base64}) + image_data = load_image_from_path(image_source) + # Using format 1: {"type": "image", "image": "url_or_base64"} + # Alternative format 2: {"type": "image_url", "image_url": {"url": "url_or_base64"}} + content.append({"type": "image", "image": image_data}) except Exception as e: LOGGER.error(f"Failed to load image: {e}") return @@ -167,19 +171,6 @@ def test_chat_completions_endpoint(base_url: str, model_id: str, prompt: str = N LOGGER.error(f"Error: {response.text}") -def test_models_endpoint(base_url: str) -> None: - """Test the models endpoint.""" - url = f"{base_url}/v1/models" - - LOGGER.info(f"Testing models endpoint at {url}") - response = requests.get(url) - LOGGER.info(f"Response status code: {response.status_code}") - if response.status_code == 200: - LOGGER.info(f"Response: {json.dumps(response.json(), indent=2)}") - else: - LOGGER.error(f"Error: {response.text}") - - def test_health_endpoint(base_url: str) -> None: """Test the health endpoint.""" url = f"{base_url}/v1/health" @@ -218,7 +209,6 @@ def main(): test_completions_endpoint(base_url, args.model_id, args.prompt, args.image) test_chat_completions_endpoint(base_url, args.model_id, args.prompt, args.image) test_health_endpoint(base_url) - test_models_endpoint(base_url) if __name__ == "__main__": diff --git a/scripts/deploy/multimodal/query_inframework.py b/scripts/deploy/multimodal/query_inframework.py index a7ddf1cc63..24accc0966 100644 --- a/scripts/deploy/multimodal/query_inframework.py +++ b/scripts/deploy/multimodal/query_inframework.py @@ -17,7 +17,6 @@ import logging import time -import requests from transformers import AutoProcessor from nemo_deploy.multimodal.query_multimodal import NemoQueryMultimodalPytorch @@ -32,19 +31,16 @@ def load_image_from_path(image_path: str) -> str: image_path: Path to local image file or URL Returns: - Base64-encoded image string + Image string - HTTP URL directly or base64-encoded string for local files """ if image_path.startswith(("http://", "https://")): - LOGGER.info(f"Loading image from URL: {image_path}") - response = requests.get(image_path, timeout=30) - response.raise_for_status() - image_content = 
response.content + LOGGER.info(f"Using image URL directly: {image_path}") + return image_path else: - LOGGER.info(f"Loading image from local path: {image_path}") + LOGGER.info(f"Loading and encoding image from local path: {image_path}") with open(image_path, "rb") as f: image_content = f.read() - - return base64.b64encode(image_content).decode("utf-8") + return "data:image;base64," + base64.b64encode(image_content).decode("utf-8") def get_args(): @@ -121,7 +117,7 @@ def query(): with open(args.prompt_file, "r") as f: args.prompt = f.read() - image_base64 = load_image_from_path(args.image) + image_source = load_image_from_path(args.image) if "Qwen" in args.processor_name: processor = AutoProcessor.from_pretrained(args.processor_name) @@ -146,7 +142,7 @@ def query(): nemo_query = NemoQueryMultimodalPytorch(args.url, args.model_name) outputs = nemo_query.query_multimodal( prompts=[args.prompt], - images=[image_base64], + images=[image_source], max_length=args.max_output_len, max_batch_size=args.max_batch_size, top_k=args.top_k, diff --git a/tests/unit_tests/deploy/test_fastapi_interface_to_pytriton_multimodal.py b/tests/unit_tests/deploy/test_fastapi_interface_to_pytriton_multimodal.py index 304f811025..2da62db31e 100644 --- a/tests/unit_tests/deploy/test_fastapi_interface_to_pytriton_multimodal.py +++ b/tests/unit_tests/deploy/test_fastapi_interface_to_pytriton_multimodal.py @@ -89,7 +89,7 @@ def test_base_multimodal_request_defaults(self): assert request.max_tokens == 50 assert request.temperature == 1.0 assert request.top_p == 0.0 - assert request.top_k == 1 + assert request.top_k == 0 assert request.random_seed is None assert request.max_batch_size == 4 @@ -112,11 +112,6 @@ def test_base_multimodal_request_custom_values(self): assert request.random_seed == 42 assert request.max_batch_size == 8 - def test_base_multimodal_request_greedy_validation(self): - """Test BaseMultimodalRequest validator for greedy sampling.""" - request = BaseMultimodalRequest(model="test-model", temperature=0, top_p=0, top_k=5) - assert request.top_k == 1 - def test_multimodal_completion_request(self): """Test MultimodalCompletionRequest.""" request = MultimodalCompletionRequest( @@ -274,7 +269,7 @@ def test_completions_with_image(self, client, mock_triton_settings): request_data = { "model": "test-model", "prompt": "Describe this image", - "image": "base64_encoded_image_data", + "image": "data:image;base64,base64_encoded_image_data", "temperature": 0.7, } @@ -291,7 +286,7 @@ def test_completions_with_image(self, client, mock_triton_settings): mock_query.assert_called_once() call_kwargs = mock_query.call_args[1] - assert call_kwargs["images"] == ["base64_encoded_image_data"] + assert call_kwargs["images"] == ["data:image;base64,base64_encoded_image_data"] assert call_kwargs["temperature"] == 0.7 def test_completions_with_custom_params(self, client, mock_triton_settings): @@ -357,7 +352,7 @@ def test_chat_completions_with_image(self, client, mock_triton_settings): "role": "user", "content": [ {"type": "text", "text": "What's in this image?"}, - {"type": "image", "image": "base64_image_data"}, + {"type": "image", "image": "data:image;base64,base64_image_data"}, ], } ] @@ -376,7 +371,7 @@ def test_chat_completions_with_image(self, client, mock_triton_settings): mock_query.assert_called_once() call_kwargs = mock_query.call_args[1] - assert call_kwargs["images"] == ["base64_image_data"] + assert call_kwargs["images"] == ["data:image;base64,base64_image_data"] def test_chat_completions_multiple_images(self, client, 
mock_triton_settings): """Test /v1/chat/completions/ endpoint with multiple images.""" @@ -385,8 +380,8 @@ def test_chat_completions_multiple_images(self, client, mock_triton_settings): "role": "user", "content": [ {"type": "text", "text": "Compare these images"}, - {"type": "image", "image": "base64_image_1"}, - {"type": "image", "image": "base64_image_2"}, + {"type": "image", "image": "data:image;base64,base64_image_1"}, + {"type": "image", "image": "data:image;base64,base64_image_2"}, ], } ] @@ -403,9 +398,64 @@ def test_chat_completions_multiple_images(self, client, mock_triton_settings): mock_query.assert_called_once() call_kwargs = mock_query.call_args[1] - assert call_kwargs["images"] == ["base64_image_1", "base64_image_2"] + assert call_kwargs["images"] == ["data:image;base64,base64_image_1", "data:image;base64,base64_image_2"] assert call_kwargs["max_length"] == 200 + def test_chat_completions_with_image_url_format(self, client, mock_triton_settings): + """Test /v1/chat/completions/ endpoint with OpenAI-style image_url format.""" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}, + ], + } + ] + request_data = {"model": "test-model", "messages": messages} + + mock_output = {"choices": [{"text": [["I see a cat"]]}], "model": "test-model"} + + with patch("nemo_deploy.service.fastapi_interface_to_pytriton_multimodal.query_multimodal_async") as mock_query: + mock_query.return_value = mock_output + + response = client.post("/v1/chat/completions/", json=request_data) + + assert response.status_code == 200 + result = response.json() + assert result["choices"][0]["message"]["content"] == "I see a cat" + + mock_query.assert_called_once() + call_kwargs = mock_query.call_args[1] + assert call_kwargs["images"] == ["https://example.com/image.jpg"] + + def test_chat_completions_with_mixed_image_formats(self, client, mock_triton_settings): + """Test /v1/chat/completions/ endpoint with mixed image and image_url formats.""" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Compare these images"}, + {"type": "image", "image": "data:image;base64,base64_data"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}, + ], + } + ] + request_data = {"model": "test-model", "messages": messages} + + mock_output = {"choices": [{"text": [["Comparison"]]}], "model": "test-model"} + + with patch("nemo_deploy.service.fastapi_interface_to_pytriton_multimodal.query_multimodal_async") as mock_query: + mock_query.return_value = mock_output + + response = client.post("/v1/chat/completions/", json=request_data) + + assert response.status_code == 200 + + mock_query.assert_called_once() + call_kwargs = mock_query.call_args[1] + assert call_kwargs["images"] == ["data:image;base64,base64_data", "https://example.com/image.jpg"] + def test_chat_completions_with_params(self, client, mock_triton_settings): """Test /v1/chat/completions/ endpoint with custom parameters.""" messages = [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}] @@ -452,7 +502,7 @@ def test_helper_fun(self): url="http://localhost:8000", model="test-model", prompts=["test prompt"], - images=["image_data"], + images=["data:image;base64,image_data"], temperature=0.7, top_k=10, top_p=0.9, @@ -465,7 +515,7 @@ def test_helper_fun(self): mock_nq_class.assert_called_once_with(url="http://localhost:8000", model_name="test-model") 
mock_nq.query_multimodal.assert_called_once_with( prompts=["test prompt"], - images=["image_data"], + images=["data:image;base64,image_data"], temperature=0.7, top_k=10, top_p=0.9, diff --git a/tests/unit_tests/deploy/test_nemo_multimodal_deployable.py b/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py similarity index 71% rename from tests/unit_tests/deploy/test_nemo_multimodal_deployable.py rename to tests/unit_tests/deploy/test_megatron_multimodal_deployable.py index 538fe06653..e20e8ee30a 100644 --- a/tests/unit_tests/deploy/test_nemo_multimodal_deployable.py +++ b/tests/unit_tests/deploy/test_megatron_multimodal_deployable.py @@ -21,7 +21,7 @@ from megatron.core.inference.common_inference_params import CommonInferenceParams from PIL import Image -from nemo_deploy.multimodal.nemo_multimodal_deployable import NeMoMultimodalDeployable +from nemo_deploy.multimodal.megatron_multimodal_deployable import MegatronMultimodalDeployable from nemo_export_deploy_common.import_utils import UnavailableError @@ -43,18 +43,18 @@ def __init__(self, generated_text): @pytest.fixture def mock_setup_model_and_tokenizer(): - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.setup_model_and_tokenizer") as mock: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.setup_model_and_tokenizer") as mock: mock.return_value = (MockInferenceWrappedModel(), MockProcessor()) yield mock @pytest.fixture def mock_triton_imports(): - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.HAVE_TRITON", True): - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.HAVE_NEMO", True): - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.batch") as mock_batch: - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.first_value") as mock_first_value: - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.Tensor") as mock_tensor: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.HAVE_TRITON", True): + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.HAVE_MBRIDGE", True): + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.batch") as mock_batch: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.first_value") as mock_first_value: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.Tensor") as mock_tensor: mock_batch.return_value = lambda x: x mock_first_value.return_value = lambda x: x @@ -74,8 +74,8 @@ def create_tensor(**kwargs): @pytest.fixture def mock_utils(): - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.str_ndarray2list") as mock_str2list: - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.cast_output") as mock_cast: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.str_ndarray2list") as mock_str2list: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.cast_output") as mock_cast: mock_str2list.return_value = ["test prompt 1", "test prompt 2"] mock_cast.return_value = np.array([b"Generated text 1", b"Generated text 2"]) yield mock_str2list, mock_cast @@ -94,23 +94,23 @@ def sample_image_base64(): @pytest.fixture def deployable(mock_setup_model_and_tokenizer, mock_triton_imports): - return NeMoMultimodalDeployable( - nemo_checkpoint_filepath="test_checkpoint.nemo", - tensor_parallel_size=1, - pipeline_parallel_size=1, + return MegatronMultimodalDeployable( + megatron_checkpoint_filepath="test_checkpoint.nemo", + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, 
params_dtype=torch.bfloat16, inference_batch_times_seqlen_threshold=1000, ) -class TestNeMoMultimodalDeployable: +class TestMegatronMultimodalDeployable: def test_initialization_success(self, mock_setup_model_and_tokenizer, mock_triton_imports): - """Test successful initialization of NeMoMultimodalDeployable.""" - deployable = NeMoMultimodalDeployable(nemo_checkpoint_filepath="test_checkpoint.nemo") + """Test successful initialization of MegatronMultimodalDeployable.""" + deployable = MegatronMultimodalDeployable(megatron_checkpoint_filepath="test_checkpoint.nemo") - assert deployable.nemo_checkpoint_filepath == "test_checkpoint.nemo" - assert deployable.tensor_parallel_size == 1 - assert deployable.pipeline_parallel_size == 1 + assert deployable.megatron_checkpoint_filepath == "test_checkpoint.nemo" + assert deployable.tensor_model_parallel_size == 1 + assert deployable.pipeline_model_parallel_size == 1 assert deployable.params_dtype == torch.bfloat16 assert deployable.inference_batch_times_seqlen_threshold == 1000 assert deployable.inference_wrapped_model is not None @@ -118,35 +118,37 @@ def test_initialization_success(self, mock_setup_model_and_tokenizer, mock_trito def test_initialization_with_custom_params(self, mock_setup_model_and_tokenizer, mock_triton_imports): """Test initialization with custom parameters.""" - deployable = NeMoMultimodalDeployable( - nemo_checkpoint_filepath="custom_checkpoint.nemo", - tensor_parallel_size=2, - pipeline_parallel_size=2, + deployable = MegatronMultimodalDeployable( + megatron_checkpoint_filepath="custom_checkpoint.nemo", + tensor_model_parallel_size=2, + pipeline_model_parallel_size=2, params_dtype=torch.float16, inference_batch_times_seqlen_threshold=2000, ) - assert deployable.tensor_parallel_size == 2 - assert deployable.pipeline_parallel_size == 2 + assert deployable.tensor_model_parallel_size == 2 + assert deployable.pipeline_model_parallel_size == 2 assert deployable.params_dtype == torch.float16 assert deployable.inference_batch_times_seqlen_threshold == 2000 def test_initialization_calls_setup_model(self, mock_setup_model_and_tokenizer, mock_triton_imports): """Test that initialization calls setup_model_and_tokenizer with correct parameters.""" - NeMoMultimodalDeployable( - nemo_checkpoint_filepath="test_checkpoint.nemo", - tensor_parallel_size=2, - pipeline_parallel_size=2, + MegatronMultimodalDeployable( + megatron_checkpoint_filepath="test_checkpoint.nemo", + tensor_model_parallel_size=2, + pipeline_model_parallel_size=2, params_dtype=torch.float16, inference_batch_times_seqlen_threshold=1500, + inference_max_seq_length=4096, ) mock_setup_model_and_tokenizer.assert_called_once_with( - path="test_checkpoint.nemo", - tp_size=2, - pp_size=2, + megatron_model_path="test_checkpoint.nemo", + tp=2, + pp=2, params_dtype=torch.float16, inference_batch_times_seqlen_threshold=1500, + inference_max_seq_length=4096, ) def test_generate_method(self, deployable, sample_image): @@ -155,7 +157,7 @@ def test_generate_method(self, deployable, sample_image): images = [sample_image, sample_image] inference_params = CommonInferenceParams(temperature=0.7, top_k=10, top_p=0.9, num_tokens_to_generate=100) - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.generate") as mock_generate: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.generate") as mock_generate: with patch.object(deployable, "apply_chat_template", side_effect=lambda x: x): mock_generate.return_value = [MockResult("Generated text 1"), MockResult("Generated text 
2")] @@ -189,7 +191,7 @@ def test_generate_method_default_params(self, deployable, sample_image): prompts = ["Test prompt"] images = [sample_image] - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.generate") as mock_generate: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.generate") as mock_generate: mock_generate.return_value = [MockResult("Generated text")] deployable.generate(prompts=prompts, images=images) @@ -255,10 +257,10 @@ def test_infer_fn(self, deployable, sample_image_base64, sample_image): prompts = ["Test prompt 1", "Test prompt 2"] images = [sample_image_base64, sample_image_base64] - with patch.object(deployable, "base64_to_image") as mock_base64_to_image: + with patch.object(deployable, "process_image_input") as mock_process_image_input: with patch.object(deployable, "generate") as mock_generate: - # Mock base64_to_image to return PIL Images - mock_base64_to_image.return_value = sample_image + # Mock process_image_input to return PIL Images + mock_process_image_input.return_value = sample_image mock_generate.return_value = [MockResult("Generated text 1"), MockResult("Generated text 2")] result = deployable._infer_fn( @@ -272,8 +274,8 @@ def test_infer_fn(self, deployable, sample_image_base64, sample_image): max_batch_size=3, ) - # Check that base64_to_image was called for each image - assert mock_base64_to_image.call_count == 2 + # Check that process_image_input was called for each image + assert mock_process_image_input.call_count == 2 # Check that generate was called with the right parameters assert mock_generate.call_count == 1 @@ -301,16 +303,16 @@ def test_infer_fn_default_params(self, deployable, sample_image_base64, sample_i prompts = ["Test prompt"] images = [sample_image_base64] - with patch.object(deployable, "base64_to_image") as mock_base64_to_image: + with patch.object(deployable, "process_image_input") as mock_process_image_input: with patch.object(deployable, "generate") as mock_generate: - # Mock base64_to_image to return PIL Images - mock_base64_to_image.return_value = sample_image + # Mock process_image_input to return PIL Images + mock_process_image_input.return_value = sample_image mock_generate.return_value = [MockResult("Generated text 1")] result = deployable._infer_fn(prompts=prompts, images=images) - # Check that base64_to_image was called - assert mock_base64_to_image.call_count == 1 + # Check that process_image_input was called + assert mock_process_image_input.call_count == 1 # Check that generate was called with the right parameters assert mock_generate.call_count == 1 @@ -331,9 +333,45 @@ def test_infer_fn_default_params(self, deployable, sample_image_base64, sample_i assert result["sentences"] == ["Generated text 1"] + def test_infer_fn_with_temperature_zero(self, deployable): + """Test _infer_fn with temperature=0.0 for greedy decoding.""" + sample_image = Image.new("RGB", (100, 100)) + sample_image_base64 = "data:image;base64,test_base64_string" + + prompts = ["Test prompt"] + images = [sample_image_base64] + + with patch.object(deployable, "process_image_input") as mock_process_image: + with patch.object(deployable, "generate") as mock_generate: + # Mock process_image_input to return PIL Images + mock_process_image.return_value = sample_image + mock_generate.return_value = [MockResult("Generated text")] + + result = deployable._infer_fn( + prompts=prompts, + images=images, + temperature=0.0, # Should trigger greedy sampling handling + top_k=5, # Should be overridden to 1 + top_p=0.5, # Should be 
overridden to 0.0 + num_tokens_to_generate=100, + ) + + # Check that generate was called with the right parameters + assert mock_generate.call_count == 1 + call_args = mock_generate.call_args + + # Check that inference_params has greedy sampling parameters + assert isinstance(call_args[0][2], CommonInferenceParams) + assert call_args[0][2].temperature == 0.0 # Kept as 0.0 + assert call_args[0][2].top_k == 1 # Overridden for greedy sampling + assert call_args[0][2].top_p == 0.0 # Overridden for greedy sampling + assert call_args[0][2].num_tokens_to_generate == 100 + + assert result["sentences"] == ["Generated text"] + def test_dict_to_str_function(self): """Test the dict_to_str utility function.""" - from nemo_deploy.multimodal.nemo_multimodal_deployable import dict_to_str + from nemo_deploy.multimodal.megatron_multimodal_deployable import dict_to_str test_dict = {"key1": "value1", "key2": "value2"} result = dict_to_str(test_dict) @@ -341,26 +379,29 @@ def test_dict_to_str_function(self): assert isinstance(result, str) assert json.loads(result) == test_dict - @patch("nemo_deploy.multimodal.nemo_multimodal_deployable.HAVE_TRITON", False) + @patch("nemo_deploy.multimodal.megatron_multimodal_deployable.HAVE_TRITON", False) def test_initialization_no_triton(self): """Test that initialization fails when Triton is not available.""" with pytest.raises(UnavailableError): - NeMoMultimodalDeployable(nemo_checkpoint_filepath="test_checkpoint.nemo") - - @patch("nemo_deploy.multimodal.nemo_multimodal_deployable.HAVE_NEMO", False) - def test_initialization_no_nemo(self): - """Test that initialization fails when NeMo is not available.""" - with pytest.raises(UnavailableError, match="nemo is not available. Please install it with `pip install nemo`."): - NeMoMultimodalDeployable(nemo_checkpoint_filepath="test_checkpoint.nemo") - - def test_initialization_missing_checkpoint(self, mock_triton_imports): + MegatronMultimodalDeployable(megatron_checkpoint_filepath="test_checkpoint.nemo") + + @patch("nemo_deploy.multimodal.megatron_multimodal_deployable.HAVE_MBRIDGE", False) + def test_initialization_no_mbridge(self): + """Test that initialization fails when Megatron Bridge is not available.""" + with pytest.raises( + UnavailableError, + match="megatron.bridge is not available. 
Please install it from https://github.com/NVIDIA-NeMo/Megatron-Bridge", + ): + MegatronMultimodalDeployable(megatron_checkpoint_filepath="test_checkpoint.nemo") + + def test_initialization_missing_checkpoint(self, mock_setup_model_and_tokenizer, mock_triton_imports): """Test initialization with missing checkpoint filepath.""" with pytest.raises(TypeError): - NeMoMultimodalDeployable() + MegatronMultimodalDeployable() def test_generate_empty_inputs(self, deployable): """Test generate method with empty inputs.""" - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.generate") as mock_generate: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.generate") as mock_generate: mock_generate.return_value = [] results = deployable.generate(prompts=[], images=[]) @@ -371,7 +412,7 @@ def test_generate_mismatched_inputs(self, deployable, sample_image): prompts = ["prompt1", "prompt2"] images = [sample_image] # Only one image for two prompts - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.generate") as mock_generate: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.generate") as mock_generate: mock_generate.return_value = [MockResult("Generated text 1"), MockResult("Generated text 2")] # This should work as the mock handles it, but in real scenario it might fail @@ -393,8 +434,8 @@ def test_triton_infer_fn_without_decorators(self, deployable, sample_image_base6 "apply_chat_template": np.array([False]), } - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.str_ndarray2list") as mock_str2list: - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.cast_output") as mock_cast: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.str_ndarray2list") as mock_str2list: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.cast_output") as mock_cast: with patch.object(deployable, "_infer_fn") as mock_infer: # Setup mocks mock_str2list.side_effect = [["test prompt 1", "test prompt 2"], ["mock_base64_1", "mock_base64_2"]] @@ -484,8 +525,8 @@ def test_apply_chat_template_without_generation_prompt(self, deployable): ) assert result == expected_text - def test_base64_to_image_with_qwenvl_wrapper(self, deployable): - """Test base64_to_image with QwenVLInferenceWrapper.""" + def test_process_image_input_with_qwenvl_wrapper(self, deployable): + """Test process_image_input with QwenVLInferenceWrapper using base64 image.""" # Create a mock QwenVLInferenceWrapper class mock_qwenvl_class = MagicMock() @@ -493,20 +534,19 @@ def test_base64_to_image_with_qwenvl_wrapper(self, deployable): # Use isinstance check to return True for QwenVLInferenceWrapper deployable.inference_wrapped_model = MagicMock() - image_base64 = ( - "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" - ) + # Image source with data URI prefix (new format) + image_source = "data:image;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" expected_image = Image.new("RGB", (100, 100)) - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.QwenVLInferenceWrapper", mock_qwenvl_class): + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.QwenVLInferenceWrapper", mock_qwenvl_class): # Make isinstance return True for our mock - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.isinstance") as mock_isinstance: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.isinstance") as mock_isinstance: 
mock_isinstance.return_value = True with patch("qwen_vl_utils.process_vision_info") as mock_process: mock_process.return_value = (expected_image, None) - result = deployable.base64_to_image(image_base64) + result = deployable.process_image_input(image_source) # Verify isinstance was called to check the model type mock_isinstance.assert_called_once_with(deployable.inference_wrapped_model, mock_qwenvl_class) @@ -516,27 +556,58 @@ def test_base64_to_image_with_qwenvl_wrapper(self, deployable): assert len(call_args) == 1 assert call_args[0]["role"] == "user" assert call_args[0]["content"][0]["type"] == "image" - assert call_args[0]["content"][0]["image"] == f"data:image;base64,{image_base64}" + assert call_args[0]["content"][0]["image"] == image_source + + assert result == expected_image + + def test_process_image_input_with_http_url(self, deployable): + """Test process_image_input with HTTP URL.""" + # Create a mock QwenVLInferenceWrapper class + mock_qwenvl_class = MagicMock() + + # Make deployable.inference_wrapped_model an instance of the mock class + deployable.inference_wrapped_model = MagicMock() + + # HTTP URL as image source + image_source = "https://example.com/image.jpg" + expected_image = Image.new("RGB", (100, 100)) + + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.QwenVLInferenceWrapper", mock_qwenvl_class): + # Make isinstance return True for our mock + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.isinstance") as mock_isinstance: + mock_isinstance.return_value = True + + with patch("qwen_vl_utils.process_vision_info") as mock_process: + mock_process.return_value = (expected_image, None) + + result = deployable.process_image_input(image_source) + + # Verify process_vision_info was called with URL + call_args = mock_process.call_args[0][0] + assert len(call_args) == 1 + assert call_args[0]["role"] == "user" + assert call_args[0]["content"][0]["type"] == "image" + assert call_args[0]["content"][0]["image"] == image_source assert result == expected_image - def test_base64_to_image_with_unsupported_model(self, deployable): - """Test base64_to_image with unsupported model raises ValueError.""" + def test_process_image_input_with_unsupported_model(self, deployable): + """Test process_image_input with unsupported model raises ValueError.""" # Create a mock QwenVLInferenceWrapper class mock_qwenvl_class = MagicMock() # Make sure the wrapped model is NOT a QwenVLInferenceWrapper deployable.inference_wrapped_model = MagicMock() - image_base64 = "test_base64_string" + image_source = "data:image;base64,test_base64_string" - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.QwenVLInferenceWrapper", mock_qwenvl_class): + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.QwenVLInferenceWrapper", mock_qwenvl_class): # Make isinstance return False for our mock (not a QwenVLInferenceWrapper) - with patch("nemo_deploy.multimodal.nemo_multimodal_deployable.isinstance") as mock_isinstance: + with patch("nemo_deploy.multimodal.megatron_multimodal_deployable.isinstance") as mock_isinstance: mock_isinstance.return_value = False with pytest.raises(ValueError, match="not supported"): - deployable.base64_to_image(image_base64) + deployable.process_image_input(image_source) def test_ray_infer_fn(self, deployable): """Test ray_infer_fn method.""" diff --git a/tests/unit_tests/deploy/test_query_multimodal.py b/tests/unit_tests/deploy/test_query_multimodal.py index e383d05dc0..1a5fa5b249 100644 --- 
a/tests/unit_tests/deploy/test_query_multimodal.py +++ b/tests/unit_tests/deploy/test_query_multimodal.py @@ -138,7 +138,7 @@ def query_multimodal_pytorch(self): @pytest.fixture def mock_images(self): # Create sample base64-encoded image strings for testing - return ["mock_base64_image_1", "mock_base64_image_2"] + return ["data:image;base64,mock_base64_image_1", "data:image;base64,mock_base64_image_2"] @pytest.fixture def mock_prompts(self): @@ -305,7 +305,7 @@ def test_query_multimodal_single_prompt_single_image(self, mock_model_client, qu mock_model_client.return_value.__enter__.return_value = mock_client_instance # Use mock base64 image string - base64_image = "mock_base64_single_image" + base64_image = "data:image;base64,mock_base64_single_image" result = query_multimodal_pytorch.query_multimodal(prompts=["Single prompt"], images=[base64_image])
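
For reference, a minimal client sketch for the `/v1/chat/completions/` endpoint touched by this patch, exercising both image content formats it now accepts (the direct `image` entry and the OpenAI-style `image_url` entry that the server normalizes to the first form). The host, port, model name, and local image path below are placeholders, not values taken from this patch; adjust them to wherever the FastAPI service from `fastapi_interface_to_pytriton_multimodal.py` is actually running.

```python
import base64

import requests

# Assumption: the FastAPI multimodal service is reachable at this address.
BASE_URL = "http://localhost:8080"

# Local files are sent base64-encoded with the data URI prefix; URLs are passed through as-is.
with open("local_image.jpg", "rb") as f:  # placeholder path
    image_b64 = "data:image;base64," + base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "qwen",  # assumption: name used when deploying the model
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images"},
                # Format 1: direct "image" entry (data URI base64 or HTTP/HTTPS URL)
                {"type": "image", "image": image_b64},
                # Format 2: OpenAI-style "image_url" entry, converted server-side to format 1
                {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
            ],
        }
    ],
    "max_tokens": 100,
}

response = requests.post(f"{BASE_URL}/v1/chat/completions/", json=payload)
response.raise_for_status()
print(response.json())
```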