Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
5468273
upd
soodoshll Feb 3, 2026
60557b2
upd
soodoshll Feb 4, 2026
54968a8
Add tensor IPC transfer mechanism for multimodal data
brandonpelfrey Jan 10, 2026
84c8d65
Default to not use Tensor IPC datapath
brandonpelfrey Jan 11, 2026
e104cbd
Update vllm/v1/engine/core.py
brandonpelfrey Jan 11, 2026
cb0893f
Enable/Disable Tensor IPC datapath via args with explicit dest
brandonpelfrey Jan 11, 2026
ba500df
Normalize DP config in engine/core
brandonpelfrey Jan 11, 2026
12fbbf4
Handling TensorIpcHandle for dec_hook
brandonpelfrey Jan 11, 2026
c8c3daf
formatting, type fixes, additional issues from CI review bots
brandonpelfrey Jan 11, 2026
a16a093
Handle orphaned tensors during timeout
brandonpelfrey Jan 11, 2026
2ab0074
remove references to maximum_concurrent_videos
brandonpelfrey Jan 11, 2026
1107bcc
Handle race condition between tensor cleanup and decode threads
brandonpelfrey Jan 12, 2026
d11a8a6
Ensure tensor queue is non-null
brandonpelfrey Jan 12, 2026
2fecb85
SyncMPClient: set target engine for IPC routing
brandonpelfrey Jan 12, 2026
4fbc3c0
Remove video-related options leftover from other PR
brandonpelfrey Jan 12, 2026
1df4745
remove --disable-multimodal-tensor-ipc
brandonpelfrey Jan 12, 2026
c3b7856
multimodal_tensor_ipc = False
brandonpelfrey Jan 12, 2026
500dc8c
rename _decode_ipc_queue_tensor
brandonpelfrey Jan 12, 2026
809fe38
Use encoder_request_context across MP/Async Clients
brandonpelfrey Jan 12, 2026
0140d4f
Symmetric _encode/_decode methods for tensor queues
brandonpelfrey Jan 12, 2026
68e5bc6
Handle _decode_tensor calls for both TensorIpcHandle/dict cases
brandonpelfrey Jan 12, 2026
8bf94c4
remove VLLM_MULTIMODAL_TENSOR_IPC env variable
brandonpelfrey Jan 25, 2026
d03c791
CR comments on request->tensor cleanup
brandonpelfrey Jan 25, 2026
bd4b5ee
Address precommit
brandonpelfrey Jan 25, 2026
e37a2c8
Change config to msgspec|torch instead of boolean
brandonpelfrey Jan 28, 2026
b1f6aa5
remove None typing for multimodal_tensor_ipc
brandonpelfrey Jan 28, 2026
6aa1e3d
Change to direct_rpc and torch_shm, dataclass -> NamedTuple+tuple dat…
brandonpelfrey Jan 30, 2026
8a8a7b8
precommit issues resolved
brandonpelfrey Feb 3, 2026
852e13f
upd
soodoshll Feb 4, 2026
7713eaf
upd
soodoshll Feb 4, 2026
d5c1780
upd
soodoshll Feb 4, 2026
517e7e4
upd
soodoshll Feb 5, 2026
c5e2df1
fix
soodoshll Feb 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion requirements/cuda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytor
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.3
# FA4
flash-attn-cute @ git+https://github.com/Dao-AILab/flash-attention.git@2580b5a4882562640f3cfbffd2bb8d2de9268f9f#subdirectory=flash_attn/cute
flash-attn-cute @ git+https://github.com/Dao-AILab/flash-attention.git@2580b5a4882562640f3cfbffd2bb8d2de9268f9f#subdirectory=flash_attn/cute
# nvimgcodec
nvidia-nvimgcodec-cu13==0.7.0.11
83 changes: 83 additions & 0 deletions tests/v1/test_serial_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,3 +283,86 @@ def test_custom_class_serialization_disallowed_without_pickle():
with pytest.raises(TypeError):
# Attempt to encode the custom class
encoder.encode(obj)


@dataclass
class RequestWithTensor:
    """Mock request with non-multimodal tensor field like EngineCoreRequest."""

    # Optional raw tensor payload, mirroring EngineCoreRequest.prompt_embeds;
    # exercises the tensor-IPC encode/decode path when not None.
    prompt_embeds: torch.Tensor | None
    # Plain (non-tensor) field to confirm ordinary msgpack fields still
    # round-trip alongside the tensor.
    data: str


def test_non_multimodal_tensor_with_ipc():
    """Round-trip a non-multimodal tensor field through IPC-enabled codecs.

    Regression test: with IPC enabled, a field such as
    ``prompt_embeds: torch.Tensor | None`` previously failed to decode
    because ``_decode_tensor`` expected a tuple but was handed a
    TensorIpcHandle dict.
    """
    import torch.multiprocessing as torch_mp

    # Single IPC tensor queue shared between encoder and decoder.
    queues = [torch_mp.Queue()]

    # Encoder routes tensors over the queue instead of inlining them.
    enc = MsgpackEncoder(
        tensor_queues=queues, multimodal_tensor_ipc="torch_shm"
    )
    enc.set_target_engine(0)
    enc.set_request_context("test_request_123")

    # Decoder pulls the out-of-band tensors back off the same queue.
    dec = MsgpackDecoder(RequestWithTensor, tensor_queue=queues[0])

    src = torch.randn(5, 10, dtype=torch.float32)
    payload = enc.encode(
        RequestWithTensor(prompt_embeds=src, data="test_data")
    )
    assert len(payload) > 0

    # Previously raised: TypeError: cannot unpack non-iterable dict object
    roundtripped = dec.decode(payload)

    assert isinstance(roundtripped, RequestWithTensor)
    assert roundtripped.data == "test_data"
    assert roundtripped.prompt_embeds is not None
    assert torch.allclose(roundtripped.prompt_embeds, src), (
        "Decoded tensor does not match the original tensor."
    )


def test_non_multimodal_tensor_with_ipc_none_value():
    """A ``None`` tensor field must round-trip cleanly with IPC enabled."""
    import torch.multiprocessing as torch_mp

    # Same IPC wiring as the non-None case: one shared tensor queue.
    queues = [torch_mp.Queue()]

    enc = MsgpackEncoder(
        tensor_queues=queues, multimodal_tensor_ipc="torch_shm"
    )
    enc.set_target_engine(0)
    enc.set_request_context("test_request_456")

    dec = MsgpackDecoder(RequestWithTensor, tensor_queue=queues[0])

    # None should bypass the tensor-IPC path entirely.
    original = RequestWithTensor(prompt_embeds=None, data="test_data_with_none")
    roundtripped = dec.decode(enc.encode(original))

    assert isinstance(roundtripped, RequestWithTensor)
    assert roundtripped.data == "test_data_with_none"
    assert roundtripped.prompt_embeds is None
Loading