Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
5468273
upd
soodoshll Feb 3, 2026
60557b2
upd
soodoshll Feb 4, 2026
54968a8
Add tensor IPC transfer mechanism for multimodal data
brandonpelfrey Jan 10, 2026
84c8d65
Default to not use Tensor IPC datapath
brandonpelfrey Jan 11, 2026
e104cbd
Update vllm/v1/engine/core.py
brandonpelfrey Jan 11, 2026
cb0893f
Enable/Disable Tensor IPC datapath via args with explicit dest
brandonpelfrey Jan 11, 2026
ba500df
Normalize DP config in engine/core
brandonpelfrey Jan 11, 2026
12fbbf4
Handling TensorIpcHandle for dec_hook
brandonpelfrey Jan 11, 2026
c8c3daf
formatting, type fixes, additional issues from CI review bots
brandonpelfrey Jan 11, 2026
a16a093
Handle orphaned tensors during timeout
brandonpelfrey Jan 11, 2026
2ab0074
remove references to maximum_concurrent_videos
brandonpelfrey Jan 11, 2026
1107bcc
Handle race condition between tensor cleanup and decode threads
brandonpelfrey Jan 12, 2026
d11a8a6
Ensure tensor queue is non-null
brandonpelfrey Jan 12, 2026
2fecb85
SyncMPClient: set target engine for IPC routing
brandonpelfrey Jan 12, 2026
4fbc3c0
Remove video-related options leftover from other PR
brandonpelfrey Jan 12, 2026
1df4745
remove --disable-multimodal-tensor-ipc
brandonpelfrey Jan 12, 2026
c3b7856
multimodal_tensor_ipc = False
brandonpelfrey Jan 12, 2026
500dc8c
rename _decode_ipc_queue_tensor
brandonpelfrey Jan 12, 2026
809fe38
Use encoder_request_context across MP/Async Clients
brandonpelfrey Jan 12, 2026
0140d4f
Symmetric _encode/_decode methods for tensor queues
brandonpelfrey Jan 12, 2026
68e5bc6
Handle _decode_tensor calls for both TensorIpcHandle/dict cases
brandonpelfrey Jan 12, 2026
8bf94c4
remove VLLM_MULTIMODAL_TENSOR_IPC env variable
brandonpelfrey Jan 25, 2026
d03c791
CR comments on request->tensor cleanup
brandonpelfrey Jan 25, 2026
bd4b5ee
Address precommit
brandonpelfrey Jan 25, 2026
e37a2c8
Change config to msgspec|torch instead of boolean
brandonpelfrey Jan 28, 2026
b1f6aa5
remove None typing for multimodal_tensor_ipc
brandonpelfrey Jan 28, 2026
6aa1e3d
Change to direct_rpc and torch_shm, dataclass -> NamedTuple+tuple dat…
brandonpelfrey Jan 30, 2026
8a8a7b8
precommit issues resolved
brandonpelfrey Feb 3, 2026
852e13f
upd
soodoshll Feb 4, 2026
7713eaf
upd
soodoshll Feb 4, 2026
d5c1780
upd
soodoshll Feb 4, 2026
517e7e4
upd
soodoshll Feb 5, 2026
c5e2df1
fix
soodoshll Feb 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion requirements/cuda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytor
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.3
# FA4
flash-attn-cute @ git+https://github.com/Dao-AILab/flash-attention.git@2580b5a4882562640f3cfbffd2bb8d2de9268f9f#subdirectory=flash_attn/cute
flash-attn-cute @ git+https://github.com/Dao-AILab/flash-attention.git@2580b5a4882562640f3cfbffd2bb8d2de9268f9f#subdirectory=flash_attn/cute
# nvimgcodec
nvidia-nvimgcodec-cu13==0.7.0.11
83 changes: 83 additions & 0 deletions tests/v1/test_serial_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,3 +283,86 @@ def test_custom_class_serialization_disallowed_without_pickle():
with pytest.raises(TypeError):
# Attempt to encode the custom class
encoder.encode(obj)


@dataclass
class RequestWithTensor:
    """Mock request with non-multimodal tensor field like EngineCoreRequest."""

    # Optional raw tensor payload, mirroring EngineCoreRequest.prompt_embeds;
    # exercises the tensor-IPC encode/decode path when not None.
    prompt_embeds: torch.Tensor | None
    # Plain (non-tensor) field to confirm ordinary msgpack fields still
    # round-trip alongside the tensor.
    data: str


def test_non_multimodal_tensor_with_ipc():
    """Round-trip a non-multimodal tensor field through IPC-enabled codecs.

    Regression test: with IPC enabled, a field such as
    ``prompt_embeds: torch.Tensor | None`` previously failed to decode
    because ``_decode_tensor`` expected a tuple but was handed a
    TensorIpcHandle dict.
    """
    import torch.multiprocessing as torch_mp

    # Single IPC tensor queue shared between encoder and decoder.
    queues = [torch_mp.Queue()]

    # Encoder routes tensors over the queue instead of inlining them.
    enc = MsgpackEncoder(
        tensor_queues=queues, multimodal_tensor_ipc="torch_shm"
    )
    enc.set_target_engine(0)
    enc.set_request_context("test_request_123")

    # Decoder pulls the out-of-band tensors back off the same queue.
    dec = MsgpackDecoder(RequestWithTensor, tensor_queue=queues[0])

    src = torch.randn(5, 10, dtype=torch.float32)
    payload = enc.encode(
        RequestWithTensor(prompt_embeds=src, data="test_data")
    )
    assert len(payload) > 0

    # Previously raised: TypeError: cannot unpack non-iterable dict object
    roundtripped = dec.decode(payload)

    assert isinstance(roundtripped, RequestWithTensor)
    assert roundtripped.data == "test_data"
    assert roundtripped.prompt_embeds is not None
    assert torch.allclose(roundtripped.prompt_embeds, src), (
        "Decoded tensor does not match the original tensor."
    )


def test_non_multimodal_tensor_with_ipc_none_value():
    """A ``None`` tensor field must round-trip cleanly with IPC enabled."""
    import torch.multiprocessing as torch_mp

    # Same IPC wiring as the non-None case: one shared tensor queue.
    queues = [torch_mp.Queue()]

    enc = MsgpackEncoder(
        tensor_queues=queues, multimodal_tensor_ipc="torch_shm"
    )
    enc.set_target_engine(0)
    enc.set_request_context("test_request_456")

    dec = MsgpackDecoder(RequestWithTensor, tensor_queue=queues[0])

    # None should bypass the tensor-IPC path entirely.
    original = RequestWithTensor(prompt_embeds=None, data="test_data_with_none")
    roundtripped = dec.decode(enc.encode(original))

    assert isinstance(roundtripped, RequestWithTensor)
    assert roundtripped.data == "test_data_with_none"
    assert roundtripped.prompt_embeds is None
Loading