From a997f975db2b1dff1065d7dd2fab5921a73bb568 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Tue, 27 Jan 2026 09:33:03 -0800 Subject: [PATCH 001/189] transfer impl Signed-off-by: Max Hu --- vllm/compilation/piecewise_backend.py | 10 +- vllm/config/vllm.py | 6 + vllm/model_executor/models/qwen3.py | 4 +- vllm/model_executor/models/qwen3_vl.py | 171 +++++++++++++-------- vllm/model_executor/models/qwen3_vl_moe.py | 1 + vllm/v1/worker/gpu_worker.py | 3 + 6 files changed, 121 insertions(+), 74 deletions(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 29d6f89990cd..0a144e728283 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -183,9 +183,9 @@ def __call__(self, *args: Any) -> Any: runtime_shape = args[self.sym_shape_indices[0]] range_entry = self._find_range_for_shape(runtime_shape) - assert range_entry is not None, ( - f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}" - ) + # assert range_entry is not None, ( + # f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}" + # ) - self._maybe_compile_for_range_entry(range_entry, args) - return range_entry.runnable(*args) + self._maybe_compile_for_range_entry(range_entry, args) # type: ignore[arg-type] + return range_entry.runnable(*args) # type: ignore[union-attr] diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 0181cb1f086e..454ea4e9670d 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1249,6 +1249,12 @@ def _set_compile_ranges(self): and x > 1 ): computed_compile_ranges_split_points.append(x) + + # (hack) Add a large number to the compile ranges split points to ensure that + # the last range is always included for vit models. + INT_MAX = 2**63 - 1 + computed_compile_ranges_split_points.append(INT_MAX) + compilation_config.compile_ranges_split_points = sorted( computed_compile_ranges_split_points ) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 707e0ccfd3c5..ef8c56c15bd5 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -144,10 +144,10 @@ def forward( # Add qk-norm q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim) q_by_head = self.q_norm(q_by_head) - q = q_by_head.view(q.shape) + q = q_by_head.flatten(-2) k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim) k_by_head = self.k_norm(k_by_head) - k = k_by_head.view(k.shape) + k = k_by_head.flatten(-2) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 1f1ee2f56219..3e6bad76a5fc 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -52,6 +52,7 @@ from vllm.config import MultiModalConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group, parallel_state +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.conv import Conv3dLayer @@ -65,6 +66,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys +from 
vllm.model_executor.models.vision import should_torch_compile_mm_vit from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.evs import ( compute_mrope_for_media, @@ -136,6 +138,10 @@ BATCH_BUCKETS = [8, 16, 32, 64] +@support_torch_compile( + dynamic_arg_dims={"x": 0}, + enable_if=should_torch_compile_mm_vit, +) class Qwen3_VisionPatchEmbed(nn.Module): def __init__( self, @@ -207,6 +213,17 @@ def forward(self, x: torch.Tensor): return mlp_output +@support_torch_compile( + dynamic_arg_dims={ + "x": 0, + "cu_seqlens": 0, + "rotary_pos_emb_cos": 0, + "rotary_pos_emb_sin": 0, + "max_seqlen": 0, + }, + mark_unbacked_dims={"max_seqlen": 0}, + enable_if=should_torch_compile_mm_vit, +) class Qwen3_VisionBlock(nn.Module): def __init__( self, @@ -266,6 +283,10 @@ def forward( return x +@support_torch_compile( + dynamic_arg_dims={"x": 0}, + enable_if=should_torch_compile_mm_vit, +) class Qwen3_VisionPatchMerger(nn.Module): def __init__( self, @@ -300,6 +321,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.linear_fc1", disable_tp=use_data_parallel, + return_bias=False, ) self.act_fn = nn.GELU() self.linear_fc2 = RowParallelLinear( @@ -309,6 +331,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.linear_fc2", disable_tp=use_data_parallel, + return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -317,9 +340,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: else: x = self.norm(x).view(-1, self.hidden_size) - x_parallel, _ = self.linear_fc1(x) + x_parallel = self.linear_fc1(x) x_parallel = self.act_fn(x_parallel) - out, _ = self.linear_fc2(x_parallel) + out = self.linear_fc2(x_parallel) return out @@ -360,12 +383,15 @@ def __init__( 1 + len(self.deepstack_visual_indexes) ) - self.patch_embed = Qwen3_VisionPatchEmbed( - patch_size=self.patch_size, - temporal_patch_size=self.temporal_patch_size, - in_channels=vision_config.in_channels, - hidden_size=self.hidden_size, - ) + from vllm.compilation.backends import set_model_tag + + with set_model_tag("Qwen3_VisionPatchEmbed"): + self.patch_embed = Qwen3_VisionPatchEmbed( + patch_size=self.patch_size, + temporal_patch_size=self.temporal_patch_size, + in_channels=vision_config.in_channels, + hidden_size=self.hidden_size, + ) self.pos_embed = nn.Embedding(self.num_position_embeddings, self.hidden_size) @@ -378,31 +404,33 @@ def __init__( rope_parameters={"partial_rotary_factor": 0.5}, ) - self.merger = Qwen3_VisionPatchMerger( - d_model=vision_config.out_hidden_size, - context_dim=self.hidden_size, - norm_layer=norm_layer, - spatial_merge_size=self.spatial_merge_size, - quant_config=quant_config, - multimodal_config=multimodal_config, - prefix=f"{prefix}.merger", - ) + with set_model_tag("Qwen3_VisionPatchMerger"): + self.merger = Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + multimodal_config=multimodal_config, + prefix=f"{prefix}.merger", + ) - self.deepstack_merger_list = nn.ModuleList( - [ - Qwen3_VisionPatchMerger( - d_model=vision_config.out_hidden_size, - context_dim=self.hidden_size, - spatial_merge_size=self.spatial_merge_size, - use_postshuffle_norm=True, - norm_layer=norm_layer, - quant_config=quant_config, - multimodal_config=multimodal_config, - prefix=f"{prefix}.deepstack_merger_list.{layer_idx}", - ) - for layer_idx in range(len(self.deepstack_visual_indexes)) - ] - ) + with 
set_model_tag("Qwen3_VisionPatchMerger_postshuffle_norm"): + self.deepstack_merger_list = nn.ModuleList( + [ + Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + spatial_merge_size=self.spatial_merge_size, + use_postshuffle_norm=True, + norm_layer=norm_layer, + quant_config=quant_config, + multimodal_config=multimodal_config, + prefix=f"{prefix}.deepstack_merger_list.{layer_idx}", + ) + for layer_idx in range(len(self.deepstack_visual_indexes)) + ] + ) attn_backend_override = ( multimodal_config.mm_encoder_attn_backend if multimodal_config else None @@ -424,28 +452,29 @@ def __init__( f"Qwen3-VL does not support {self.attn_backend} backend now." ) - workspace_buffer = ( - None - if self.attn_backend != AttentionBackendEnum.FLASHINFER - else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device=self.device) - ) + with set_model_tag("Qwen3_VisionBlock"): + workspace_buffer = ( + None + if self.attn_backend != AttentionBackendEnum.FLASHINFER + else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device=self.device) + ) - self.blocks = nn.ModuleList( - [ - Qwen3_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], - norm_layer=norm_layer, - quant_config=quant_config, - multimodal_config=multimodal_config, - prefix=f"{prefix}.blocks.{layer_idx}", - workspace_buffer=workspace_buffer, - ) - for layer_idx in range(vision_config.depth) - ] - ) + self.blocks = nn.ModuleList( + [ + Qwen3_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + multimodal_config=multimodal_config, + prefix=f"{prefix}.blocks.{layer_idx}", + workspace_buffer=workspace_buffer, + ) + for layer_idx in range(vision_config.depth) + ] + ) @property def dtype(self) -> torch.dtype: @@ -1359,6 +1388,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config + self.vllm_config = vllm_config self.config = config self.multimodal_config = multimodal_config self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" @@ -1373,7 +1403,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): self.visual = None else: self.visual = Qwen3_VisionTransformer( - config.vision_config, + vision_config=config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=quant_config, multimodal_config=multimodal_config, @@ -1510,12 +1540,16 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - if self.use_data_parallel: - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d" - ) - else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + with set_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values, + grid_thw.tolist(), + rope_type="rope_3d", + ) + else: + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) # Split concatenated embeddings for each image item. 
merge_size = self.visual.spatial_merge_size @@ -1534,13 +1568,16 @@ def _process_video_input( pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype ) - if self.use_data_parallel: - grid_thw_list = grid_thw.tolist() - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d" - ) - else: - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + with set_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values_videos, + grid_thw.tolist(), + rope_type="rope_3d", + ) + else: + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 3186804488e5..b37ae9c307f7 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -416,6 +416,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config + self.vllm_config = vllm_config self.config = config self.multimodal_config = multimodal_config self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 013780479743..e714d3c6d6aa 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -432,7 +432,10 @@ def compile_or_warm_up_model(self) -> None: # add the end of the range to ensure compilation/warmup. all_sizes = set(cg_capture_sizes) all_sizes.update([x for x in warmup_sizes if isinstance(x, int)]) + INT_MAX = 2**63 - 1 for compile_range in compile_ranges: + if compile_range.end == INT_MAX: + continue if not any(x in compile_range for x in all_sizes): warmup_sizes.append(compile_range.end) From b5886e9a45e9a50a31d125ae9883380796c63b58 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 27 Jan 2026 18:28:30 -0500 Subject: [PATCH 002/189] add compilation configs for mm encoder cudagraph. --- vllm/config/compilation.py | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 035aa24e33c7..007e27afd87b 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -438,6 +438,46 @@ class CompilationConfig: on selected platforms. Disabled by default until more models are supported/tested to work.""" + # Encoder (ViT) CUDA graph settings + cudagraph_mm_encoder: bool = False + """Whether to enable CUDA graph capture for multimodal encoders (ViT). + When enabled, CUDA graphs are captured for the vision encoder to eliminate + kernel launch overhead. Requires fixed input sizes via bucketing. + Experimental feature - use with caution.""" + + encoder_cudagraph_bucket_sizes: list[int] | None = None + """Bucket sizes for encoder CUDA graph capture. Each size represents the + number of visual tokens (after spatial merge) to capture a graph for. + If None, auto-generates based on common image resolutions: + [64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192]""" + + encoder_cudagraph_grid_configs: list[tuple[int, int, int]] | None = None + """Grid configurations (T, H, W in patch units) to capture encoder CUDA + graphs for. Each tuple represents a specific image dimension configuration. 
+ If None, uses default common configurations for Qwen-VL models. + Example: [(1, 16, 16), (1, 24, 24), (1, 32, 32)] for 448x448, 672x672, + and 896x896 images with patch_size=14 and merge_size=2.""" + + encoder_cudagraph_token_buckets: list[int] | str | None = None + """Token bucket sizes for encoder CUDA graphs with padding support. + Instead of requiring exact grid matches, inputs are padded to the smallest + bucket that fits. This trades some compute (padding overhead) for higher + CUDA graph utilization. + + Can be a list of token counts or a preset name: + - "shopify_fine": [1024, 2048, 3072, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192, 8464] + - "shopify_medium": [1024, 2048, 3072, 4096, 5120, 6144, 7168, 8192, 8464] + - "shopify_coarse": [2048, 4096, 6144, 8192, 8464] + - "shopify_single": [8464] (all images padded to max) + + When set, overrides encoder_cudagraph_grid_configs.""" + + encoder_cudagraph_padded_mode: bool = True + """Whether to use padded execution for encoder CUDA graphs. + When True, inputs smaller than a captured bucket are padded to fit, + enabling higher CUDA graph hit rates at the cost of padding overhead. + When False, only exact grid matches use CUDA graphs.""" + # Inductor capture compile_sizes: list[int | str] | None = None """Sizes to compile for inductor. In addition From ccbeba999cd27e7f5e1e5d28b7a5810a42456800 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 27 Jan 2026 18:29:58 -0500 Subject: [PATCH 003/189] add mm encoder cudagraph manager (exact | bucket). --- vllm/v1/worker/gpu/mm/encoder_runner.py | 244 +++++++++++++++++++++++- 1 file changed, 243 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index f9a0b50f34b4..d6d019f0e827 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -1,14 +1,25 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +from typing import TYPE_CHECKING + import numpy as np import torch +from vllm.logger import init_logger from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItem from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.v1.worker.gpu.buffer_utils import UvaBufferPool from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs +if TYPE_CHECKING: + from vllm.config import VllmConfig + from vllm.v1.worker.gpu.mm.encoder_cudagraph import EncoderCudaGraphManager + +logger = init_logger(__name__) + class EncoderRunner: def __init__( @@ -17,11 +28,13 @@ def __init__( hidden_size: int, dtype: torch.dtype, device: torch.device, + vllm_config: VllmConfig | None = None, ): self.max_num_tokens = max_num_tokens self.hidden_size = hidden_size self.dtype = dtype self.device = device + self.vllm_config = vllm_config self.inputs_embeds = torch.zeros( max_num_tokens, @@ -34,6 +47,79 @@ def __init__( self.tmp_is_mm_embed = UvaBufferPool(max_num_tokens, torch.bool) + # Encoder CUDA graph manager (optional) + self.encoder_cudagraph_manager: EncoderCudaGraphManager | None = None + self.encoder_cudagraph_padded_mode: bool = True + self._init_encoder_cudagraph_manager() + + def _init_encoder_cudagraph_manager(self) -> None: + """Initialize encoder CUDA graph manager if enabled in config.""" + if self.vllm_config is None: + return + + compilation_config = self.vllm_config.compilation_config + if 
compilation_config is None: + return + + if not getattr(compilation_config, 'cudagraph_mm_encoder', False): + return + + # Import here to avoid circular imports + from vllm.v1.worker.gpu.mm.encoder_cudagraph import EncoderCudaGraphManager + + bucket_sizes = getattr( + compilation_config, + 'encoder_cudagraph_bucket_sizes', + None + ) + + # Check if padded mode is enabled + self.encoder_cudagraph_padded_mode = getattr( + compilation_config, + 'encoder_cudagraph_padded_mode', + True # Default to padded mode for better CUDA graph utilization + ) + + self.encoder_cudagraph_manager = EncoderCudaGraphManager( + vllm_config=self.vllm_config, + device=self.device, + dtype=self.dtype, + bucket_sizes=bucket_sizes, + ) + + # Log configuration + grid_configs = self.encoder_cudagraph_manager.grid_configs + logger.info( + "Encoder CUDA graph manager initialized: " + f"padded_mode={self.encoder_cudagraph_padded_mode}, " + f"num_grids={len(grid_configs)}, " + f"grids={grid_configs}" + ) + + def capture_encoder_cudagraphs( + self, + model: SupportsMultiModal, + ) -> None: + """ + Capture CUDA graphs for the encoder. + + Should be called during model warmup after the model is loaded. + """ + if self.encoder_cudagraph_manager is None: + return + + if not hasattr(model, 'visual') or model.visual is None: + logger.warning( + "Model does not have a visual encoder, " + "skipping encoder CUDA graph capture" + ) + return + + self.encoder_cudagraph_manager.capture( + vision_encoder=model.visual, + embed_multimodal_fn=model.embed_multimodal, + ) + def add_request(self, req_id: str, mm_features: list[MultiModalFeatureSpec]): self.req_id_to_mm_features[req_id] = mm_features @@ -59,6 +145,57 @@ def prepare_mm_inputs( mm_kwargs.append(mm_feature.data) return mm_hashes, mm_kwargs + def _get_grid_thw_from_kwargs( + self, + mm_kwargs_group: dict, + modality: str, + ) -> list[list[int]] | None: + """ + Extract grid_thw from mm_kwargs_group. + + Returns None if grid_thw is not available. + """ + if modality not in ("image", "video"): + return None + + # Try to get grid_thw from the kwargs + grid_thw = mm_kwargs_group.get("image_grid_thw") or mm_kwargs_group.get( + "video_grid_thw" + ) + if grid_thw is None: + return None + + # Convert to list if tensor + if hasattr(grid_thw, "tolist"): + grid_thw = grid_thw.tolist() + + return grid_thw + + def _estimate_visual_tokens( + self, + mm_kwargs_group: dict, + modality: str, + ) -> int | None: + """ + Estimate the number of visual tokens for CUDA graph bucket selection. + + Returns None if estimation is not possible. 
+ """ + grid_thw = self._get_grid_thw_from_kwargs(mm_kwargs_group, modality) + if grid_thw is None: + return None + + # Calculate total visual tokens (after spatial merge, assuming 2x2) + # Formula: sum of (T * H/merge * W/merge) for each item + # Note: grid_thw contains [T, H, W] where H and W are already in patch units + spatial_merge_size = 2 # Default for Qwen-VL models + total_tokens = 0 + for t, h, w in grid_thw: + tokens_per_image = t * (h // spatial_merge_size) * (w // spatial_merge_size) + total_tokens += tokens_per_image + + return total_tokens + @torch.inference_mode() def execute_mm_encoder( self, @@ -75,7 +212,20 @@ def execute_mm_encoder( device=self.device, pin_memory=False, ): - curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + # Try to use CUDA graph if available + cudagraph_result = None + if self.encoder_cudagraph_manager is not None: + cudagraph_result = self._execute_with_cudagraph( + model, mm_kwargs_group, modality, num_items + ) + + if cudagraph_result is not None: + # CUDA graph was used successfully + curr_group_outputs = cudagraph_result + else: + # Fall back to eager mode + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + sanity_check_mm_encoder_outputs( curr_group_outputs, expected_num_items=num_items, @@ -87,6 +237,98 @@ def execute_mm_encoder( self.encoder_cache[mm_hash] = output return encoder_outputs + def _execute_with_cudagraph( + self, + model: SupportsMultiModal, + mm_kwargs_group: dict, + modality: str, + num_items: int, + ) -> list[torch.Tensor] | None: + """ + Execute the encoder using CUDA graphs if a matching graph is available. + + Supports two modes: + 1. Exact match: Only use CUDA graph if grid_thw exactly matches + 2. Padded mode: Pad inputs to fit the smallest available bucket + + Args: + model: The multimodal model + mm_kwargs_group: Batched multimodal kwargs + modality: The modality type ("image" or "video") + num_items: Number of items in the batch + + Returns: + List of encoder outputs if CUDA graph was used, None otherwise + """ + if self.encoder_cudagraph_manager is None: + return None + + # Extract grid_thw from kwargs + grid_thw = self._get_grid_thw_from_kwargs(mm_kwargs_group, modality) + if grid_thw is None: + return None + + # Currently only supports single-image batches for CUDA graph + if len(grid_thw) != 1: + logger.debug( + "CUDA graph only supports single-image batches, " + f"got {len(grid_thw)} images. Using eager mode." + ) + return None + + # Extract pixel_values + if modality == "image": + pixel_values = mm_kwargs_group.get("pixel_values") + else: # video + pixel_values = mm_kwargs_group.get("pixel_values_videos") + + if pixel_values is None: + logger.debug("No pixel_values found in kwargs. 
Using eager mode.") + return None + + # Ensure pixel_values is on the correct device + pixel_values = pixel_values.to(device=self.device, dtype=self.dtype) + + # Get spatial merge size for token calculations + spatial_merge_size = getattr(model.visual, 'spatial_merge_size', 2) + t, h, w = grid_thw[0] + num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) + + # Try exact match first + grid_key = self.encoder_cudagraph_manager.get_graph_for_grid(grid_thw) + if grid_key is not None: + # Exact match found - try to run + output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) + if output is not None: + logger.debug( + f"Encoder CUDA graph exact match for grid {grid_key}, " + f"output: {output.shape}" + ) + return [output[:num_output_tokens]] + + # Try padded execution if enabled + if self.encoder_cudagraph_padded_mode: + result = self.encoder_cudagraph_manager.run_padded( + pixel_values, + grid_thw, + num_output_tokens, + spatial_merge_size, + ) + if result is not None: + output, padding_waste = result + logger.debug( + f"Encoder CUDA graph padded execution: " + f"{num_output_tokens} tokens, waste={padding_waste}" + ) + return [output] + + # No CUDA graph available + logger.debug( + f"No CUDA graph for grid {grid_thw[0]} " + f"(padded_mode={self.encoder_cudagraph_padded_mode}). Using eager mode." + ) + return None + def gather_mm_embeddings( self, req_ids: list[str], From 062ceea6f6746d64a7fc8b020fd5e3410439f48d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 27 Jan 2026 18:31:15 -0500 Subject: [PATCH 004/189] add capture mm encoder cudagraph option. --- vllm/v1/worker/gpu/model_runner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index a55519f0fa36..427109bd5c8b 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -107,6 +107,7 @@ def __init__( hidden_size=self.inputs_embeds_size, dtype=self.dtype, device=self.device, + vllm_config=self.vllm_config, ) self.uses_mrope = self.model_config.uses_mrope if self.uses_mrope: @@ -425,6 +426,10 @@ def warmup_for_prefill(self) -> None: self._dummy_run(self.max_num_tokens, skip_attn=False) torch.cuda.synchronize() + # Capture encoder CUDA graphs if enabled + if self.supports_mm_inputs: + self.encoder_runner.capture_encoder_cudagraphs(self.model) + def finish_requests(self, scheduler_output: SchedulerOutput) -> None: if scheduler_output.preempted_req_ids is not None: for req_id in scheduler_output.preempted_req_ids: From 7d70346fca54e03313fd09867ca039604f7cfdc4 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 27 Jan 2026 18:33:39 -0500 Subject: [PATCH 005/189] implement mm encoder cudagraph manager (exact | bucket). --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 826 +++++++++++++++++++++ 1 file changed, 826 insertions(+) create mode 100644 vllm/v1/worker/gpu/mm/encoder_cudagraph.py diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py new file mode 100644 index 000000000000..4b2f68c7aa9b --- /dev/null +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -0,0 +1,826 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +CUDA Graph Manager for Multimodal Encoders (ViT). + +This module provides CUDA graph capture and replay functionality for vision +encoders to eliminate kernel launch overhead and improve GPU utilization. + +Key design principles: +1. 
Capture graphs for specific grid_thw configurations (not just token counts) +2. Only replay when input dimensions exactly match captured configuration +3. Fall back to eager mode for non-matching inputs +4. Track statistics for monitoring and optimization + +Limitations: +- CUDA graphs are only used when input dimensions exactly match captured graphs +- Variable-size images that don't match any captured configuration use eager mode +- Multiple images in a batch are processed sequentially through graph replay +""" + +from __future__ import annotations + +import math +from collections.abc import Callable +from typing import TYPE_CHECKING, Any + +import torch +import torch.nn as nn +from tqdm import tqdm + +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import graph_capture, is_global_first_rank +from vllm.logger import init_logger + +if TYPE_CHECKING: + from vllm.model_executor.models.interfaces import SupportsMultiModal + +logger = init_logger(__name__) + +# Default grid configurations to capture (T, H, W in patch units) +# These are common configurations for Qwen-VL models after smart_resize +# Format: (temporal, height_patches, width_patches) +DEFAULT_ENCODER_GRID_CONFIGS = [ + # Common single-frame image configurations (T=1) + # After smart_resize with factor=28 (patch=14, merge=2), common sizes: + (1, 16, 16), # ~224x224 -> 64 output tokens + (1, 24, 24), # ~336x336 -> 144 output tokens + (1, 32, 32), # ~448x448 -> 256 output tokens + (1, 48, 48), # ~672x672 -> 576 output tokens + (1, 64, 64), # ~896x896 -> 1024 output tokens + (1, 80, 80), # ~1120x1120 -> 1600 output tokens + (1, 96, 96), # ~1344x1344 -> 2304 output tokens +] + +# Optimized grid configurations for MLPerf Shopify dataset +# Based on analysis: 96% of images have 4000-8200 output tokens +# Using square grids that cover the common token ranges with padding +SHOPIFY_OPTIMIZED_GRID_CONFIGS = [ + # Small images (rare, <5% of dataset) + (1, 64, 64), # 1024 tokens - covers up to ~1024 tokens + (1, 80, 80), # 1600 tokens + (1, 96, 96), # 2304 tokens + (1, 112, 112), # 3136 tokens + # Main distribution (95% of dataset: 4000-8200 tokens) + (1, 128, 128), # 4096 tokens - covers P10 (4646) + (1, 144, 144), # 5184 tokens - covers ~P25 (5351) + (1, 160, 160), # 6400 tokens - covers ~P50-P75 (6072-6904) + (1, 176, 176), # 7744 tokens - covers ~P90 (7948) + (1, 184, 184), # 8464 tokens - covers max (8161) +] + +# Alternative: Rectangular grids for better aspect ratio coverage +SHOPIFY_RECTANGULAR_GRID_CONFIGS = [ + # 4:3 and 3:4 aspect ratios for product images + (1, 128, 128), # 4096 tokens (square) + (1, 112, 144), # 4032 tokens (3:4) + (1, 144, 112), # 4032 tokens (4:3) + (1, 144, 144), # 5184 tokens (square) + (1, 128, 160), # 5120 tokens (4:5) + (1, 160, 128), # 5120 tokens (5:4) + (1, 160, 160), # 6400 tokens (square) + (1, 144, 176), # 6336 tokens (9:11) + (1, 176, 144), # 6336 tokens (11:9) + (1, 176, 176), # 7744 tokens (square) + (1, 160, 192), # 7680 tokens (5:6) + (1, 192, 160), # 7680 tokens (6:5) + (1, 184, 184), # 8464 tokens (square, max) +] + +# Legacy bucket sizes for backward compatibility +DEFAULT_ENCODER_CUDAGRAPH_BUCKET_SIZES = [ + 64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192 +] + +# ============================================================================= +# TOKEN BUCKET PRESETS FOR PADDED CUDA GRAPHS +# ============================================================================= +# These define output token buckets. 
Inputs are padded to the smallest bucket +# that fits, trading padding overhead for CUDA graph utilization. +# +# For Shopify dataset analysis: +# - P10=4646, P25=5351, P50=6072, P75=6904, P90=7948, Max=8161 tokens + +# Fine-grained buckets: More buckets = less padding waste, more GPU memory +SHOPIFY_TOKEN_BUCKETS_FINE = [ + 1024, # Small images (<5% of dataset) + 2048, + 3072, + 4096, # ~P10 + 4608, + 5120, # ~P25 + 5632, + 6144, # ~P50 + 6656, + 7168, # ~P75 + 7680, + 8192, # ~P90-P99 + 8464, # Max coverage +] + +# Medium granularity: Balanced tradeoff +SHOPIFY_TOKEN_BUCKETS_MEDIUM = [ + 1024, # Small images + 2048, + 3072, + 4096, # Covers up to P10 + 5120, # Covers P10-P25 + 6144, # Covers P25-P50 + 7168, # Covers P50-P75 + 8192, # Covers P75-P99 + 8464, # Max coverage +] + +# Coarse buckets: Fewer graphs, more padding waste +SHOPIFY_TOKEN_BUCKETS_COARSE = [ + 2048, # Small images + 4096, # Up to ~P10 + 6144, # P10-P50 + 8192, # P50-P99 + 8464, # Max +] + +# Single bucket: Maximum CUDA graph utilization, maximum padding +SHOPIFY_TOKEN_BUCKETS_SINGLE = [ + 8464, # All images padded to max +] + + +def token_bucket_to_grid(token_bucket: int, merge_size: int = 2) -> tuple[int, int, int]: + """ + Convert a token bucket size to a square grid configuration. + + Args: + token_bucket: Number of output tokens (after spatial merge) + merge_size: Spatial merge size (default 2 for Qwen-VL) + + Returns: + Grid config (T, H_patches, W_patches) + """ + # For square grid: tokens = (H/merge)^2, so H = merge * sqrt(tokens) + side = int(math.ceil(math.sqrt(token_bucket))) * merge_size + return (1, side, side) + + +def get_grid_configs_from_token_buckets( + token_buckets: list[int], + merge_size: int = 2, +) -> list[tuple[int, int, int]]: + """Convert token bucket list to grid configurations.""" + return [token_bucket_to_grid(t, merge_size) for t in token_buckets] + + +class EncoderCudaGraphManager: + """ + Manages CUDA graphs for multimodal encoders (e.g., ViT in VLMs). + + The manager captures CUDA graphs for specific grid configurations + (T, H, W in patch units) and replays them during inference when + input dimensions exactly match. 
+ + Design: + - Captures graphs for predefined grid configurations + - Only replays when input exactly matches a captured configuration + - Falls back to eager mode for non-matching inputs + - Tracks statistics for monitoring + + Limitations: + - Requires exact dimension match for graph replay + - Variable-size images may not benefit from CUDA graphs + """ + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + dtype: torch.dtype, + bucket_sizes: list[int] | None = None, + grid_configs: list[tuple[int, int, int]] | None = None, + ): + self.vllm_config = vllm_config + self.device = device + self.dtype = dtype + + # Get grid configs from config or use defaults + if grid_configs is None: + grid_configs = self._get_grid_configs_from_config() + self.grid_configs = grid_configs + + # Legacy bucket sizes (for backward compatibility with bucket-based API) + if bucket_sizes is None: + bucket_sizes = self._get_bucket_sizes_from_config() + self.bucket_sizes = sorted(bucket_sizes) + + # CUDA graph storage - keyed by (t, h, w) tuple + self.graphs: dict[tuple[int, int, int], torch.cuda.CUDAGraph] = {} + self.pool = torch.cuda.graph_pool_handle() + + # Pre-allocated input/output buffers per grid config + # Key: (t, h, w), Value: {"pixel_values": tensor, "grid_thw": list} + self.input_buffers: dict[tuple[int, int, int], dict[str, Any]] = {} + self.output_buffers: dict[tuple[int, int, int], torch.Tensor] = {} + + # Store metadata about captured graphs + self.captured_metadata: dict[tuple[int, int, int], dict[str, Any]] = {} + + # Track if graphs have been captured + self.captured = False + + # Statistics + self.cache_hits = 0 + self.cache_misses = 0 + self.eager_fallbacks = 0 + + def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: + """Get encoder grid configurations from config or use defaults.""" + compilation_config = self.vllm_config.compilation_config + if compilation_config is None: + return DEFAULT_ENCODER_GRID_CONFIGS + + # Check for token bucket config first (new preferred way) + token_buckets = getattr( + compilation_config, + 'encoder_cudagraph_token_buckets', + None + ) + if token_buckets is not None: + # Handle preset names for token buckets + if isinstance(token_buckets, str): + bucket_presets = { + "shopify_fine": SHOPIFY_TOKEN_BUCKETS_FINE, + "shopify_medium": SHOPIFY_TOKEN_BUCKETS_MEDIUM, + "shopify_coarse": SHOPIFY_TOKEN_BUCKETS_COARSE, + "shopify_single": SHOPIFY_TOKEN_BUCKETS_SINGLE, + } + if token_buckets in bucket_presets: + buckets = bucket_presets[token_buckets] + logger.info( + f"Using token bucket preset '{token_buckets}': {buckets}" + ) + return get_grid_configs_from_token_buckets(buckets) + else: + logger.warning( + f"Unknown token bucket preset '{token_buckets}', " + f"available: {list(bucket_presets.keys())}" + ) + elif isinstance(token_buckets, list): + logger.info(f"Using custom token buckets: {token_buckets}") + return get_grid_configs_from_token_buckets(token_buckets) + + # Check for encoder-specific grid config + grid_configs = getattr( + compilation_config, + 'encoder_cudagraph_grid_configs', + None + ) + if grid_configs is not None: + # Handle preset names + if isinstance(grid_configs, str): + if grid_configs == "shopify": + return SHOPIFY_OPTIMIZED_GRID_CONFIGS + elif grid_configs == "shopify_rectangular": + return SHOPIFY_RECTANGULAR_GRID_CONFIGS + elif grid_configs == "default": + return DEFAULT_ENCODER_GRID_CONFIGS + else: + logger.warning( + f"Unknown grid config preset '{grid_configs}', " + "using default" + ) + return 
DEFAULT_ENCODER_GRID_CONFIGS + return [tuple(cfg) for cfg in grid_configs] + + return DEFAULT_ENCODER_GRID_CONFIGS + + def _get_bucket_sizes_from_config(self) -> list[int]: + """Get encoder CUDA graph bucket sizes from config or use defaults.""" + compilation_config = self.vllm_config.compilation_config + if compilation_config is None: + return DEFAULT_ENCODER_CUDAGRAPH_BUCKET_SIZES + + encoder_sizes = getattr( + compilation_config, + 'encoder_cudagraph_bucket_sizes', + None + ) + if encoder_sizes is not None: + return encoder_sizes + + return DEFAULT_ENCODER_CUDAGRAPH_BUCKET_SIZES + + def get_padded_size(self, num_visual_tokens: int) -> int | None: + """ + Find the smallest bucket size >= num_visual_tokens. + + Returns None if the input is larger than all buckets. + Note: This is for backward compatibility. For actual graph lookup, + use get_graph_for_grid() instead. + """ + for bucket_size in self.bucket_sizes: + if num_visual_tokens <= bucket_size: + return bucket_size + return None + + def _grid_to_key(self, grid_thw: list[list[int]]) -> tuple[int, int, int] | None: + """ + Convert a grid_thw list to a hashable key. + + Only supports single-image grids (len(grid_thw) == 1). + Returns None for multi-image batches. + """ + if len(grid_thw) != 1: + return None + t, h, w = grid_thw[0] + return (t, h, w) + + def find_best_grid_for_padding( + self, + grid_thw: list[list[int]], + spatial_merge_size: int = 2, + ) -> tuple[int, int, int] | None: + """ + Find the smallest captured grid that can accommodate the input with padding. + + For CUDA graph compatibility with variable-size inputs, this finds the + smallest captured configuration where: + - T_captured >= T_input + - H_captured >= H_input + - W_captured >= W_input + + Args: + grid_thw: Input grid configuration [[T, H, W]] + spatial_merge_size: Merge size for spatial dimensions (default 2) + + Returns: + The best matching captured grid config, or None if no match found + """ + key = self._grid_to_key(grid_thw) + if key is None: + return None + + t_in, h_in, w_in = key + + # First check for exact match + if key in self.graphs: + return key + + # Find smallest captured grid that can accommodate input + best_match = None + best_waste = float('inf') + + for captured_key in self.graphs.keys(): + t_cap, h_cap, w_cap = captured_key + + # Check if captured grid can accommodate input + if t_cap >= t_in and h_cap >= h_in and w_cap >= w_in: + # Calculate waste (padding overhead) + input_tokens = self._compute_output_tokens(key, spatial_merge_size) + captured_tokens = self._compute_output_tokens( + captured_key, spatial_merge_size + ) + waste = captured_tokens - input_tokens + + if waste < best_waste: + best_waste = waste + best_match = captured_key + + if best_match is not None: + logger.debug( + f"Found padding-compatible grid: input={key} -> captured={best_match} " + f"(waste={best_waste} tokens)" + ) + + return best_match + + def _compute_output_tokens( + self, + grid_thw: tuple[int, int, int], + spatial_merge_size: int, + ) -> int: + """Compute number of output tokens for a grid configuration.""" + t, h, w = grid_thw + # After spatial merge: tokens = T * (H/merge) * (W/merge) + return t * (h // spatial_merge_size) * (w // spatial_merge_size) + + def _prepare_dummy_inputs_for_grid( + self, + grid_config: tuple[int, int, int], + vision_encoder: nn.Module, + ) -> dict[str, Any]: + """ + Prepare dummy inputs for CUDA graph capture with a specific grid config. 
+ + Args: + grid_config: Tuple of (T, H, W) in patch units + vision_encoder: The vision encoder module + + Returns: + Dict with pixel_values, grid_thw, and metadata + """ + t, h, w = grid_config + + # Get vision encoder properties + patch_size = vision_encoder.patch_size + temporal_patch_size = vision_encoder.temporal_patch_size + spatial_merge_size = vision_encoder.spatial_merge_size + in_channels = 3 # RGB + + # Calculate patch input channels + patch_input_channels = ( + temporal_patch_size * patch_size * patch_size * in_channels + ) + + # Calculate number of pixel patches (before patch embedding) + # h, w are in patch units, so num_patches = t * h * w + num_pixel_patches = t * h * w + + # Create dummy pixel values + pixel_values = torch.randn( + num_pixel_patches, + patch_input_channels, + dtype=self.dtype, + device=self.device, + ) + + # Grid THW for this configuration + grid_thw = [[t, h, w]] + + # Calculate output tokens + output_tokens = self._compute_output_tokens( + grid_config, spatial_merge_size + ) + + return { + "pixel_values": pixel_values, + "grid_thw": grid_thw, + "num_output_tokens": output_tokens, + "num_pixel_patches": num_pixel_patches, + "patch_input_channels": patch_input_channels, + } + + def capture_graph_for_grid( + self, + grid_config: tuple[int, int, int], + vision_encoder: nn.Module, + ) -> None: + """ + Capture a CUDA graph for the given grid configuration. + + Args: + grid_config: Tuple of (T, H, W) in patch units + vision_encoder: The vision encoder module + """ + logger.debug(f"Capturing encoder CUDA graph for grid config {grid_config}") + + # Prepare dummy inputs + dummy_inputs = self._prepare_dummy_inputs_for_grid(grid_config, vision_encoder) + pixel_values = dummy_inputs["pixel_values"] + grid_thw = dummy_inputs["grid_thw"] + + # Store input buffer reference + self.input_buffers[grid_config] = { + "pixel_values": pixel_values.clone(), + "grid_thw": grid_thw, + } + + # Store metadata + self.captured_metadata[grid_config] = { + "num_output_tokens": dummy_inputs["num_output_tokens"], + "num_pixel_patches": dummy_inputs["num_pixel_patches"], + "patch_input_channels": dummy_inputs["patch_input_channels"], + } + + # Warmup run (required before capture) + with torch.cuda.stream(torch.cuda.current_stream()): + warmup_output = vision_encoder(pixel_values, grid_thw=grid_thw) + + # Allocate output buffer based on actual output shape + self.output_buffers[grid_config] = torch.empty_like(warmup_output) + + torch.cuda.synchronize() + + # Capture the graph + graph = torch.cuda.CUDAGraph() + + # Get a fresh reference to the input buffer for capture + input_buffer = self.input_buffers[grid_config]["pixel_values"] + + with torch.cuda.graph(graph, pool=self.pool): + output = vision_encoder(input_buffer, grid_thw=grid_thw) + self.output_buffers[grid_config].copy_(output) + + self.graphs[grid_config] = graph + logger.debug( + f"Captured encoder CUDA graph for grid config {grid_config} " + f"-> {dummy_inputs['num_output_tokens']} output tokens" + ) + + @torch.inference_mode() + def capture( + self, + vision_encoder: nn.Module, + embed_multimodal_fn: Callable, + ) -> None: + """ + Capture CUDA graphs for all configured grid configurations. 
+ + Args: + vision_encoder: The vision encoder module (e.g., Qwen3_VisionTransformer) + embed_multimodal_fn: The model's embed_multimodal method (unused but kept for API) + """ + if self.captured: + logger.warning("Encoder CUDA graphs already captured, skipping") + return + + logger.info( + f"Capturing encoder CUDA graphs for {len(self.grid_configs)} " + f"grid configurations" + ) + + # Capture from largest to smallest (more memory efficient) + configs_to_capture = sorted( + self.grid_configs, + key=lambda x: x[0] * x[1] * x[2], + reverse=True + ) + + if is_global_first_rank(): + configs_to_capture = tqdm( + configs_to_capture, + desc="Capturing encoder CUDA graphs" + ) + + with graph_capture(device=self.device): + for grid_config in configs_to_capture: + try: + self.capture_graph_for_grid( + grid_config, + vision_encoder, + ) + except Exception as e: + logger.warning( + f"Failed to capture encoder CUDA graph for grid config " + f"{grid_config}: {e}. Will use eager mode." + ) + + self.captured = True + logger.info( + f"Captured {len(self.graphs)} encoder CUDA graphs " + f"(configs: {sorted(self.graphs.keys())})" + ) + + def can_use_graph(self, num_visual_tokens: int) -> bool: + """ + Check if a CUDA graph might be available for the given token count. + + Note: This is a heuristic check. Actual graph usage depends on + exact grid_thw match via get_graph_for_grid(). + """ + if not self.captured: + return False + padded_size = self.get_padded_size(num_visual_tokens) + return padded_size is not None + + def get_graph_for_grid( + self, + grid_thw: list[list[int]], + ) -> tuple[int, int, int] | None: + """ + Check if a CUDA graph is available for the given grid configuration. + + Args: + grid_thw: List of [T, H, W] for each image + + Returns: + The grid config key if a matching graph exists, None otherwise + """ + key = self._grid_to_key(grid_thw) + if key is None: + return None + return key if key in self.graphs else None + + def find_bucket_for_tokens( + self, + num_tokens: int, + spatial_merge_size: int = 2, + ) -> tuple[int, int, int] | None: + """ + Find the smallest captured grid that can fit the given token count. + + This enables padded execution where inputs smaller than a bucket + are padded to match the bucket size. + + Args: + num_tokens: Number of output tokens needed + spatial_merge_size: Merge size (default 2) + + Returns: + Grid config (T, H, W) of the best bucket, or None if too large + """ + best_grid = None + best_bucket_tokens = float('inf') + + for grid_key in self.graphs.keys(): + bucket_tokens = self._compute_output_tokens(grid_key, spatial_merge_size) + if bucket_tokens >= num_tokens and bucket_tokens < best_bucket_tokens: + best_bucket_tokens = bucket_tokens + best_grid = grid_key + + return best_grid + + def run( + self, + pixel_values: torch.Tensor, + grid_thw: list[list[int]], + ) -> torch.Tensor | None: + """ + Run the vision encoder using a captured CUDA graph if available. + + Args: + pixel_values: Input pixel values [num_patches, patch_channels] + grid_thw: List of [T, H, W] for each image + + Returns: + Vision encoder output tensor if graph was used, None if no matching graph + """ + grid_key = self.get_graph_for_grid(grid_thw) + + if grid_key is None: + self.cache_misses += 1 + return None + + # Verify input dimensions match + input_buffer = self.input_buffers[grid_key]["pixel_values"] + if pixel_values.shape != input_buffer.shape: + logger.warning( + f"Pixel values shape mismatch: expected {input_buffer.shape}, " + f"got {pixel_values.shape}. 
Falling back to eager mode." + ) + self.eager_fallbacks += 1 + return None + + self.cache_hits += 1 + + # Copy input to the captured buffer + input_buffer.copy_(pixel_values) + + # Replay the graph + self.graphs[grid_key].replay() + + # Return a clone of the output to avoid issues with buffer reuse + return self.output_buffers[grid_key].clone() + + def run_padded( + self, + pixel_values: torch.Tensor, + grid_thw: list[list[int]], + num_output_tokens: int, + spatial_merge_size: int = 2, + ) -> tuple[torch.Tensor, int] | None: + """ + Run the vision encoder with padding to fit a captured bucket. + + This method pads the input to match a captured CUDA graph bucket, + executes the graph, and returns the trimmed output. + + Args: + pixel_values: Input pixel values [num_patches, patch_channels] + grid_thw: List of [T, H, W] for each image (only single image supported) + num_output_tokens: Expected number of output tokens for the input + spatial_merge_size: Spatial merge size (default 2) + + Returns: + Tuple of (output tensor trimmed to actual size, padding_waste_tokens) + or None if no suitable bucket found + """ + if len(grid_thw) != 1: + logger.debug("Padded mode only supports single-image inputs") + return None + + # Find the smallest bucket that fits + bucket_grid = self.find_bucket_for_tokens(num_output_tokens, spatial_merge_size) + if bucket_grid is None: + self.cache_misses += 1 + logger.debug( + f"No bucket found for {num_output_tokens} tokens, " + f"max available: {max(self._compute_output_tokens(g, spatial_merge_size) for g in self.graphs.keys()) if self.graphs else 0}" + ) + return None + + bucket_tokens = self._compute_output_tokens(bucket_grid, spatial_merge_size) + padding_waste = bucket_tokens - num_output_tokens + + # Get the input buffer for this bucket + input_buffer = self.input_buffers[bucket_grid]["pixel_values"] + num_input_patches = pixel_values.shape[0] + bucket_input_patches = input_buffer.shape[0] + + if num_input_patches > bucket_input_patches: + logger.warning( + f"Input patches ({num_input_patches}) exceed bucket capacity " + f"({bucket_input_patches}). This shouldn't happen." + ) + self.eager_fallbacks += 1 + return None + + self.cache_hits += 1 + + # Zero the buffer first (for clean padding) + input_buffer.zero_() + + # Copy actual input to the beginning of the buffer + input_buffer[:num_input_patches].copy_(pixel_values) + + # Replay the graph (uses the bucket's grid_thw for position embeddings) + self.graphs[bucket_grid].replay() + + # Get output and trim to actual size + full_output = self.output_buffers[bucket_grid] + trimmed_output = full_output[:num_output_tokens].clone() + + logger.debug( + f"Padded execution: {num_output_tokens} -> {bucket_tokens} tokens " + f"(waste: {padding_waste}, {padding_waste/bucket_tokens*100:.1f}%)" + ) + + return trimmed_output, padding_waste + + def get_stats(self) -> dict[str, Any]: + """Get cache statistics.""" + total = self.cache_hits + self.cache_misses + self.eager_fallbacks + hit_rate = self.cache_hits / total if total > 0 else 0.0 + return { + "cache_hits": self.cache_hits, + "cache_misses": self.cache_misses, + "eager_fallbacks": self.eager_fallbacks, + "hit_rate": hit_rate, + "num_graphs": len(self.graphs), + "captured_configs": sorted(self.graphs.keys()), + } + + +def get_encoder_cudagraph_bucket_sizes( + max_visual_tokens: int, + min_bucket: int = 64, + growth_factor: float = 1.5, +) -> list[int]: + """ + Generate bucket sizes for encoder CUDA graphs. 
+ + Uses exponential growth to cover the range [min_bucket, max_visual_tokens] + with reasonable granularity. + + Args: + max_visual_tokens: Maximum number of visual tokens to support + min_bucket: Minimum bucket size + growth_factor: Multiplier for each successive bucket + + Returns: + List of bucket sizes + """ + buckets = [] + current = min_bucket + + while current <= max_visual_tokens: + buckets.append(int(current)) + current = int(current * growth_factor) + + # Ensure max is included + if buckets[-1] < max_visual_tokens: + buckets.append(max_visual_tokens) + + return buckets + + +def generate_grid_configs_for_resolution_range( + min_size: int = 448, + max_size: int = 1344, + step: int = 224, + patch_size: int = 14, + temporal_values: list[int] | None = None, +) -> list[tuple[int, int, int]]: + """ + Generate grid configurations for a range of image resolutions. + + Args: + min_size: Minimum image dimension in pixels + max_size: Maximum image dimension in pixels + step: Step size in pixels + patch_size: Patch size of the vision encoder + temporal_values: List of temporal dimensions to include (default [1]) + + Returns: + List of (T, H, W) tuples in patch units + """ + if temporal_values is None: + temporal_values = [1] + + configs = [] + for h_pixels in range(min_size, max_size + 1, step): + for w_pixels in range(min_size, max_size + 1, step): + h_patches = h_pixels // patch_size + w_patches = w_pixels // patch_size + for t in temporal_values: + configs.append((t, h_patches, w_patches)) + + return configs From e1019e318b1ccc649ecd4c470dca4c2c289ec3b4 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 00:22:34 -0500 Subject: [PATCH 006/189] precompute pos_embeds, rotary embeddings, cu_seqlens. --- vllm/model_executor/models/qwen3_vl.py | 106 +++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 3e6bad76a5fc..f413b2d8e5dc 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -712,6 +712,112 @@ def forward( ) # [seq_len, hidden_size * (1 + depth_of_deepstack)] return hidden_states + def forward_cudagraph( + self, + x: torch.Tensor, + pos_embeds: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor, + ) -> torch.Tensor: + """ + Forward pass optimized for CUDA graph capture/replay. + + This method accepts pre-computed position embeddings, rotary embeddings, + and cumulative sequence lengths to avoid CPU operations during CUDA graph + replay. All tensor arguments must be on the correct device. 
+ + Args: + x: Input pixel values [num_patches, patch_channels] + pos_embeds: Pre-computed position embeddings [num_patches, hidden_size] + rotary_pos_emb_cos: Pre-computed rotary cosine embeddings + rotary_pos_emb_sin: Pre-computed rotary sine embeddings + cu_seqlens: Pre-computed cumulative sequence lengths (on GPU) + max_seqlen: Pre-computed max sequence length (scalar tensor on GPU) + + Returns: + Vision encoder output tensor + """ + # Patch embedding (GPU operation) + hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True) + hidden_states = self.patch_embed(hidden_states) + + # Add pre-computed position embeddings + hidden_states = hidden_states + pos_embeds + + hidden_states = hidden_states.unsqueeze(1) + + # Run through transformer blocks with pre-computed values + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + max_seqlen=max_seqlen, + ) + if layer_num in self.deepstack_visual_indexes: + deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num) + deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx]( + hidden_states + ) + deepstack_feature_lists.append(deepstack_feature) + + hidden_states = self.merger(hidden_states) + hidden_states = torch.cat( + [hidden_states] + deepstack_feature_lists, dim=1 + ) + return hidden_states + + def precompute_for_cudagraph( + self, + grid_thw: list[list[int]], + ) -> dict[str, torch.Tensor]: + """ + Pre-compute all grid-dependent tensors for CUDA graph capture. + + This method computes position embeddings, rotary embeddings, and + cumulative sequence lengths that are fixed for a given grid configuration. + These can be cached and reused during CUDA graph replay. + + Args: + grid_thw: List of [T, H, W] for each image + + Returns: + Dict containing pre-computed tensors: + - pos_embeds: Position embeddings + - rotary_pos_emb_cos: Rotary cosine embeddings + - rotary_pos_emb_sin: Rotary sine embeddings + - cu_seqlens: Cumulative sequence lengths (on GPU) + - max_seqlen: Maximum sequence length (scalar tensor on GPU) + """ + # Compute position embeddings + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + + # Compute rotary embeddings + rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw) + + # Compute cumulative sequence lengths + grid_thw_np = np.array(grid_thw, dtype=np.int32) + cu_seqlens = np.repeat( + grid_thw_np[:, 1] * grid_thw_np[:, 2], grid_thw_np[:, 0] + ).cumsum(axis=0, dtype=np.int32) + cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + cu_seqlens = torch.from_numpy(cu_seqlens).to(self.device, non_blocking=True) + + # Compute max sequence length + max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) + + return { + "pos_embeds": pos_embeds, + "rotary_pos_emb_cos": rotary_pos_emb_cos, + "rotary_pos_emb_sin": rotary_pos_emb_sin, + "cu_seqlens": cu_seqlens, + "max_seqlen": max_seqlen, + } + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) From b31bca94ca1e1f1ed11440b612633d719a765959 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 00:25:31 -0500 Subject: [PATCH 007/189] use forward_cudagraph() with precomputed tensors. 
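
Note (editorial sketch): the capture path in EncoderCudaGraphManager.capture_graph_for_grid()
now calls precompute_for_cudagraph() once per grid bucket and captures forward_cudagraph()
with those cached tensors, so no CPU-side position/rotary/cu_seqlens work is left at replay
time. A minimal sketch of the intended capture/replay flow, assuming `encoder` is a
Qwen3_VisionTransformer instance and `dtype`, `device`, and `new_pixel_values` are
illustrative placeholders (not names from this series):

    import torch

    grid_thw = [[1, 128, 128]]  # example bucket: 1 frame, 128x128 patches
    t, h, w = grid_thw[0]
    in_ch = encoder.temporal_patch_size * encoder.patch_size * encoder.patch_size * 3
    static_input = torch.randn(t * h * w, in_ch, dtype=dtype, device=device)

    # Grid-dependent tensors are computed once and reused for every replay.
    cached = encoder.precompute_for_cudagraph(grid_thw)

    # Warm up, then capture the pure-GPU forward.
    static_output = torch.empty_like(encoder.forward_cudagraph(static_input, **cached))
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_output.copy_(encoder.forward_cudagraph(static_input, **cached))

    # Replay: copy new pixels into the static buffer and read back the static output.
    static_input.copy_(new_pixel_values)
    graph.replay()
    image_embeds = static_output.clone()

In the manager itself, each bucket keeps its own static input/output buffers and graph,
and all graphs share one pool (torch.cuda.graph_pool_handle()); the sketch above omits
the shared pool and the graph_capture() context for brevity.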
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 80 ++++++++++++++++++---- 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 4b2f68c7aa9b..4561de65cbc4 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -220,6 +220,10 @@ def __init__( self.input_buffers: dict[tuple[int, int, int], dict[str, Any]] = {} self.output_buffers: dict[tuple[int, int, int], torch.Tensor] = {} + # Cached pre-computed tensors for CUDA graph replay + # Key: (t, h, w), Value: dict with pos_embeds, rotary embeddings, cu_seqlens, etc. + self.cached_tensors: dict[tuple[int, int, int], dict[str, torch.Tensor]] = {} + # Store metadata about captured graphs self.captured_metadata: dict[tuple[int, int, int], dict[str, Any]] = {} @@ -466,6 +470,10 @@ def capture_graph_for_grid( """ Capture a CUDA graph for the given grid configuration. + This method pre-computes and caches all grid-dependent tensors + (position embeddings, rotary embeddings, cu_seqlens) to eliminate + CPU operations during CUDA graph replay. + Args: grid_config: Tuple of (T, H, W) in patch units vision_encoder: The vision encoder module @@ -490,29 +498,75 @@ def capture_graph_for_grid( "patch_input_channels": dummy_inputs["patch_input_channels"], } - # Warmup run (required before capture) - with torch.cuda.stream(torch.cuda.current_stream()): - warmup_output = vision_encoder(pixel_values, grid_thw=grid_thw) + # Check if vision encoder supports optimized CUDA graph forward + has_cudagraph_forward = hasattr(vision_encoder, 'forward_cudagraph') and \ + hasattr(vision_encoder, 'precompute_for_cudagraph') + + if has_cudagraph_forward: + # Pre-compute and cache all grid-dependent tensors + cached = vision_encoder.precompute_for_cudagraph(grid_thw) + self.cached_tensors[grid_config] = cached + logger.debug( + f"Pre-computed cached tensors for grid config {grid_config}: " + f"pos_embeds={cached['pos_embeds'].shape}, " + f"cu_seqlens={cached['cu_seqlens'].shape}" + ) - # Allocate output buffer based on actual output shape - self.output_buffers[grid_config] = torch.empty_like(warmup_output) + # Warmup run with cached tensors + with torch.cuda.stream(torch.cuda.current_stream()): + warmup_output = vision_encoder.forward_cudagraph( + pixel_values, + pos_embeds=cached["pos_embeds"], + rotary_pos_emb_cos=cached["rotary_pos_emb_cos"], + rotary_pos_emb_sin=cached["rotary_pos_emb_sin"], + cu_seqlens=cached["cu_seqlens"], + max_seqlen=cached["max_seqlen"], + ) + self.output_buffers[grid_config] = torch.empty_like(warmup_output) + + torch.cuda.synchronize() + + # Capture the graph with cached tensors + graph = torch.cuda.CUDAGraph() + input_buffer = self.input_buffers[grid_config]["pixel_values"] + + with torch.cuda.graph(graph, pool=self.pool): + output = vision_encoder.forward_cudagraph( + input_buffer, + pos_embeds=cached["pos_embeds"], + rotary_pos_emb_cos=cached["rotary_pos_emb_cos"], + rotary_pos_emb_sin=cached["rotary_pos_emb_sin"], + cu_seqlens=cached["cu_seqlens"], + max_seqlen=cached["max_seqlen"], + ) + self.output_buffers[grid_config].copy_(output) + else: + # Fallback to original forward (will have CPU gaps) + logger.warning( + f"Vision encoder does not support forward_cudagraph, " + f"using standard forward (will have CPU gaps)" + ) - torch.cuda.synchronize() + # Warmup run (required before capture) + with torch.cuda.stream(torch.cuda.current_stream()): + warmup_output = vision_encoder(pixel_values, 
grid_thw=grid_thw) + self.output_buffers[grid_config] = torch.empty_like(warmup_output) - # Capture the graph - graph = torch.cuda.CUDAGraph() + torch.cuda.synchronize() - # Get a fresh reference to the input buffer for capture - input_buffer = self.input_buffers[grid_config]["pixel_values"] + # Capture the graph + graph = torch.cuda.CUDAGraph() + input_buffer = self.input_buffers[grid_config]["pixel_values"] - with torch.cuda.graph(graph, pool=self.pool): - output = vision_encoder(input_buffer, grid_thw=grid_thw) - self.output_buffers[grid_config].copy_(output) + with torch.cuda.graph(graph, pool=self.pool): + output = vision_encoder(input_buffer, grid_thw=grid_thw) + self.output_buffers[grid_config].copy_(output) self.graphs[grid_config] = graph logger.debug( f"Captured encoder CUDA graph for grid config {grid_config} " f"-> {dummy_inputs['num_output_tokens']} output tokens" + f"{' (with cached tensors)' if has_cudagraph_forward else ''}" ) @torch.inference_mode() From d0d63e3e847c82caf7fe2fb20fcce1722f029033 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 12:18:07 -0500 Subject: [PATCH 008/189] add more grids. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 75 ++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 4561de65cbc4..dae158b455cd 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -145,6 +145,76 @@ 8464, # All images padded to max ] +# ============================================================================= +# OPTIMIZED GRID-ALIGNED TOKEN BUCKETS +# ============================================================================= +# These buckets are perfect squares (n^2) which align exactly with actual grid +# token counts. This eliminates any mismatch between bucket size and the tokens +# produced by the grid configuration. +# +# Formula: tokens = (side / merge_size)^2 where side must be even +# So valid token counts are: 16, 36, 64, 100, 144, 196, 256, 324, 400, 484, 576... 
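+# (Worked example: side = 128 with merge_size = 2 gives (128/2)^2 = 4096 output
+# tokens, i.e. grid (1, 128, 128) in the bucket lists below.)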
+# +# Analysis on Shopify dataset (12,754 samples): +# - Token range: 63-8161 +# - P5=4170, P50=6072, P95=8005 +# - 96% of images have 4000-8200 tokens +# +# Comparison with previous presets: +# - shopify_coarse (5 buckets): 13.3% padding waste +# - shopify_medium (9 buckets): 7.3% padding waste +# - shopify_fine (13 buckets): 4.1% padding waste +# - optimized (10 buckets): 5.0% padding waste <- better efficiency per bucket + +# Optimized buckets for main distribution (4000-8500 tokens, 96% of dataset) +# 10 buckets with 5.0% padding waste +SHOPIFY_TOKEN_BUCKETS_OPTIMIZED = [ + 4096, # 64^2, grid (1,128,128) - covers up to 4096 tokens + 4489, # 67^2, grid (1,134,134) - covers 4097-4489 + 4900, # 70^2, grid (1,140,140) - covers 4490-4900 + 5329, # 73^2, grid (1,146,146) - covers 4901-5329 + 5776, # 76^2, grid (1,152,152) - covers 5330-5776 + 6241, # 79^2, grid (1,158,158) - covers 5777-6241 + 6724, # 82^2, grid (1,164,164) - covers 6242-6724 + 7225, # 85^2, grid (1,170,170) - covers 6725-7225 + 7744, # 88^2, grid (1,176,176) - covers 7226-7744 + 8464, # 92^2, grid (1,184,184) - covers 7745-8464 (max) +] + +# Full range including small images (adds 6 buckets for <4096 tokens) +# 16 buckets with 4.2% padding waste +SHOPIFY_TOKEN_BUCKETS_OPTIMIZED_FULL = [ + # Small images (<4% of dataset) + 256, # 16^2, grid (1,32,32) + 576, # 24^2, grid (1,48,48) + 1024, # 32^2, grid (1,64,64) + 1600, # 40^2, grid (1,80,80) + 2304, # 48^2, grid (1,96,96) + 3136, # 56^2, grid (1,112,112) + # Main distribution (96% of dataset) + 4096, # 64^2, grid (1,128,128) + 4489, # 67^2, grid (1,134,134) + 4900, # 70^2, grid (1,140,140) + 5329, # 73^2, grid (1,146,146) + 5776, # 76^2, grid (1,152,152) + 6241, # 79^2, grid (1,158,158) + 6724, # 82^2, grid (1,164,164) + 7225, # 85^2, grid (1,170,170) + 7744, # 88^2, grid (1,176,176) + 8464, # 92^2, grid (1,184,184) +] + +# Compact optimized (6 buckets, ~6.5% padding waste) +# Good balance between memory usage and padding overhead +SHOPIFY_TOKEN_BUCKETS_OPTIMIZED_COMPACT = [ + 4096, # 64^2, grid (1,128,128) + 5041, # 71^2, grid (1,142,142) + 5929, # 77^2, grid (1,154,154) + 6724, # 82^2, grid (1,164,164) + 7569, # 87^2, grid (1,174,174) + 8464, # 92^2, grid (1,184,184) +] + def token_bucket_to_grid(token_bucket: int, merge_size: int = 2) -> tuple[int, int, int]: """ @@ -251,10 +321,15 @@ def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: # Handle preset names for token buckets if isinstance(token_buckets, str): bucket_presets = { + # Legacy presets (non-grid-aligned) "shopify_fine": SHOPIFY_TOKEN_BUCKETS_FINE, "shopify_medium": SHOPIFY_TOKEN_BUCKETS_MEDIUM, "shopify_coarse": SHOPIFY_TOKEN_BUCKETS_COARSE, "shopify_single": SHOPIFY_TOKEN_BUCKETS_SINGLE, + # Optimized grid-aligned presets (recommended) + "optimized": SHOPIFY_TOKEN_BUCKETS_OPTIMIZED, + "optimized_full": SHOPIFY_TOKEN_BUCKETS_OPTIMIZED_FULL, + "optimized_compact": SHOPIFY_TOKEN_BUCKETS_OPTIMIZED_COMPACT, } if token_buckets in bucket_presets: buckets = bucket_presets[token_buckets] From bb32c23b422bbac630ea416f179896ffada8db11 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 12:23:11 -0500 Subject: [PATCH 009/189] add encoder cudagraph manager in v1. 
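The model runner now owns an EncoderCudaGraphManager and consults it before running the encoder eagerly. The order implemented below is: exact-grid replay, then padded replay (when encoder_cudagraph_padded_mode is on), then eager embed_multimodal(). A condensed sketch of that flow, using the names in this patch (None-checks, output trimming, and logging elided):

    if manager.get_graph_for_grid(grid_thw) is not None:
        output = manager.run(pixel_values, grid_thw)              # exact match
    elif padded_mode:
        output, waste = manager.run_padded(
            pixel_values, grid_thw, num_output_tokens, spatial_merge_size
        )                                                         # pad to bucket
    else:
        output = model.embed_multimodal(**mm_kwargs_group)        # eager fallback
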
--- vllm/v1/worker/gpu_model_runner.py | 196 +++++++++++++++++++++++++++-- 1 file changed, 188 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 23d5bac75d00..ca20abbbe617 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -159,6 +159,7 @@ from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.v1.worker.gpu.mm.encoder_cudagraph import EncoderCudaGraphManager from vllm.v1.worker.ubatch_utils import ( UBatchSlices, check_ubatch_thresholds, @@ -425,6 +426,11 @@ def __init__( # mm_hash -> encoder_output self.encoder_cache: dict[str, torch.Tensor] = {} + # Encoder CUDA graph manager for ViT + self.encoder_cudagraph_manager: EncoderCudaGraphManager | None = None + self.encoder_cudagraph_padded_mode: bool = True + self._init_encoder_cudagraph_manager() + self.use_aux_hidden_state_outputs = False # Set up speculative decoding. # NOTE(Jiayi): currently we put the entire draft model on @@ -680,6 +686,43 @@ def __init__( self.kv_connector_output: KVConnectorOutput | None = None self.layerwise_nvtx_hooks_registered = False + def _init_encoder_cudagraph_manager(self) -> None: + """Initialize encoder CUDA graph manager if enabled in config.""" + if self.compilation_config is None: + return + + if not getattr(self.compilation_config, 'cudagraph_mm_encoder', False): + return + + bucket_sizes = getattr( + self.compilation_config, + 'encoder_cudagraph_bucket_sizes', + None + ) + + # Check if padded mode is enabled + self.encoder_cudagraph_padded_mode = getattr( + self.compilation_config, + 'encoder_cudagraph_padded_mode', + True # Default to padded mode for better CUDA graph utilization + ) + + self.encoder_cudagraph_manager = EncoderCudaGraphManager( + vllm_config=self.vllm_config, + device=self.device, + dtype=self.dtype, + bucket_sizes=bucket_sizes, + ) + + # Log configuration + grid_configs = self.encoder_cudagraph_manager.grid_configs + logger.info( + "Encoder CUDA graph manager initialized: " + f"padded_mode={self.encoder_cudagraph_padded_mode}, " + f"num_grids={len(grid_configs)}, " + f"grids={grid_configs}" + ) + def update_max_model_len(self, max_model_len: int) -> None: self.max_model_len = max_model_len if self.speculative_config: @@ -2303,14 +2346,26 @@ def _execute_mm_encoder( curr_group_outputs = curr_group_outputs_lst else: - # Run the encoder. - # `curr_group_outputs` is either of the following: - # 1. A tensor of shape (num_items, feature_size, hidden_size) - # in case feature_size is fixed across all multimodal items. - # 2. A list or tuple (length: num_items) of tensors, - # each of shape (feature_size, hidden_size) in case the feature - # size is dynamic depending on the input multimodal items. - curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + # Try to use CUDA graph if available + cudagraph_result = None + if self.encoder_cudagraph_manager is not None: + cudagraph_result = self._execute_with_encoder_cudagraph( + model, mm_kwargs_group, modality, num_items + ) + + if cudagraph_result is not None: + # CUDA graph was used successfully + curr_group_outputs = cudagraph_result + else: + # Fall back to eager mode. + # Run the encoder. + # `curr_group_outputs` is either of the following: + # 1. 
A tensor of shape (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all multimodal items. + # 2. A list or tuple (length: num_items) of tensors, + # each of shape (feature_size, hidden_size) in case the feature + # size is dynamic depending on the input multimodal items. + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -2326,6 +2381,108 @@ def _execute_mm_encoder( return encoder_outputs + def _execute_with_encoder_cudagraph( + self, + model: "SupportsMultiModal", + mm_kwargs_group: dict, + modality: str, + num_items: int, + ) -> list[torch.Tensor] | None: + """ + Execute the encoder using CUDA graphs if a matching graph is available. + + Supports two modes: + 1. Exact match: Only use CUDA graph if grid_thw exactly matches + 2. Padded mode: Pad inputs to fit the smallest available bucket + + Args: + model: The multimodal model + mm_kwargs_group: Batched multimodal kwargs + modality: The modality type ("image" or "video") + num_items: Number of items in the batch + + Returns: + List of encoder outputs if CUDA graph was used, None otherwise + """ + if self.encoder_cudagraph_manager is None: + return None + + # Only support image/video modalities + if modality not in ("image", "video"): + return None + + # Extract grid_thw from kwargs + grid_thw = mm_kwargs_group.get("image_grid_thw") or mm_kwargs_group.get( + "video_grid_thw" + ) + if grid_thw is None: + return None + + # Convert to list if tensor + if hasattr(grid_thw, "tolist"): + grid_thw = grid_thw.tolist() + + # Currently only supports single-image batches for CUDA graph + if len(grid_thw) != 1: + logger.debug( + "Encoder CUDA graph only supports single-image batches, " + f"got {len(grid_thw)} images. Using eager mode." + ) + return None + + # Extract pixel_values + if modality == "image": + pixel_values = mm_kwargs_group.get("pixel_values") + else: # video + pixel_values = mm_kwargs_group.get("pixel_values_videos") + + if pixel_values is None: + logger.debug("No pixel_values found in kwargs. Using eager mode.") + return None + + # Ensure pixel_values is on the correct device + pixel_values = pixel_values.to(device=self.device, dtype=self.dtype) + + # Get spatial merge size for token calculations + spatial_merge_size = getattr(model.visual, 'spatial_merge_size', 2) + t, h, w = grid_thw[0] + num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) + + # Try exact match first + grid_key = self.encoder_cudagraph_manager.get_graph_for_grid(grid_thw) + if grid_key is not None: + # Exact match found - try to run + output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) + if output is not None: + logger.debug( + f"Encoder CUDA graph exact match for grid {grid_key}, " + f"output: {output.shape}" + ) + return [output[:num_output_tokens]] + + # Try padded execution if enabled + if self.encoder_cudagraph_padded_mode: + result = self.encoder_cudagraph_manager.run_padded( + pixel_values, + grid_thw, + num_output_tokens, + spatial_merge_size, + ) + if result is not None: + output, padding_waste = result + logger.debug( + f"Encoder CUDA graph padded execution: " + f"{num_output_tokens} tokens, waste={padding_waste}" + ) + return [output] + + # No CUDA graph available + logger.debug( + f"No encoder CUDA graph for grid {grid_thw[0]} " + f"(padded_mode={self.encoder_cudagraph_padded_mode}). Using eager mode." 
+ ) + return None + def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", @@ -4797,6 +4954,11 @@ def freeze_gc(): set_cudagraph_capturing_enabled(True) with freeze_gc(), graph_capture(device=self.device): start_free_gpu_memory = torch.cuda.mem_get_info()[0] + + # Capture encoder CUDA graphs first (if enabled) + if self.encoder_cudagraph_manager is not None: + self._capture_encoder_cudagraphs() + cudagraph_mode = self.compilation_config.cudagraph_mode assert cudagraph_mode is not None @@ -4869,6 +5031,24 @@ def freeze_gc(): ) return cuda_graph_size + def _capture_encoder_cudagraphs(self) -> None: + """Capture CUDA graphs for the vision encoder.""" + if self.encoder_cudagraph_manager is None: + return + + model = self.model + if not hasattr(model, 'visual') or model.visual is None: + logger.warning( + "Model does not have a visual encoder, " + "skipping encoder CUDA graph capture" + ) + return + + self.encoder_cudagraph_manager.capture( + vision_encoder=model.visual, + embed_multimodal_fn=model.embed_multimodal, + ) + def _capture_cudagraphs( self, compilation_cases: list[tuple[int, bool]], From bcc72a4bec7c53cab7beb5df2d849a707b792575 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 12:30:42 -0500 Subject: [PATCH 010/189] keep assertion. --- vllm/compilation/piecewise_backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 0a144e728283..ee6779bffa55 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -183,9 +183,9 @@ def __call__(self, *args: Any) -> Any: runtime_shape = args[self.sym_shape_indices[0]] range_entry = self._find_range_for_shape(runtime_shape) - # assert range_entry is not None, ( - # f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}" - # ) + assert range_entry is not None, ( + f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}" + ) self._maybe_compile_for_range_entry(range_entry, args) # type: ignore[arg-type] return range_entry.runnable(*args) # type: ignore[union-attr] From 35becaa1d3155cf74e1490370f3737c0a5bcdee8 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 19:12:22 -0500 Subject: [PATCH 011/189] distinguish encoder and lm for graph capture range. 
--- vllm/model_executor/models/qwen3_vl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index f413b2d8e5dc..133198896a5a 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -385,7 +385,7 @@ def __init__( from vllm.compilation.backends import set_model_tag - with set_model_tag("Qwen3_VisionPatchEmbed"): + with set_model_tag("Qwen3_VisionPatchEmbed", is_encoder=True): self.patch_embed = Qwen3_VisionPatchEmbed( patch_size=self.patch_size, temporal_patch_size=self.temporal_patch_size, @@ -404,7 +404,7 @@ def __init__( rope_parameters={"partial_rotary_factor": 0.5}, ) - with set_model_tag("Qwen3_VisionPatchMerger"): + with set_model_tag("Qwen3_VisionPatchMerger", is_encoder=True): self.merger = Qwen3_VisionPatchMerger( d_model=vision_config.out_hidden_size, context_dim=self.hidden_size, @@ -415,7 +415,7 @@ def __init__( prefix=f"{prefix}.merger", ) - with set_model_tag("Qwen3_VisionPatchMerger_postshuffle_norm"): + with set_model_tag("Qwen3_VisionPatchMerger_postshuffle_norm", is_encoder=True): self.deepstack_merger_list = nn.ModuleList( [ Qwen3_VisionPatchMerger( @@ -452,7 +452,7 @@ def __init__( f"Qwen3-VL does not support {self.attn_backend} backend now." ) - with set_model_tag("Qwen3_VisionBlock"): + with set_model_tag("Qwen3_VisionBlock", is_encoder=True): workspace_buffer = ( None if self.attn_backend != AttentionBackendEnum.FLASHINFER From 9a7e47b70ecf1762994fb2148616d8ddde997538 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 19:13:35 -0500 Subject: [PATCH 012/189] fix ambiguous tensor. --- vllm/v1/worker/gpu/mm/encoder_runner.py | 6 +++--- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index d6d019f0e827..9ea462681d1d 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -159,9 +159,9 @@ def _get_grid_thw_from_kwargs( return None # Try to get grid_thw from the kwargs - grid_thw = mm_kwargs_group.get("image_grid_thw") or mm_kwargs_group.get( - "video_grid_thw" - ) + grid_thw = mm_kwargs_group.get("image_grid_thw") + if grid_thw is None: + grid_thw = mm_kwargs_group.get("video_grid_thw") if grid_thw is None: return None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ca20abbbe617..75e5c6e7e9c3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2412,9 +2412,9 @@ def _execute_with_encoder_cudagraph( return None # Extract grid_thw from kwargs - grid_thw = mm_kwargs_group.get("image_grid_thw") or mm_kwargs_group.get( - "video_grid_thw" - ) + grid_thw = mm_kwargs_group.get("image_grid_thw") + if grid_thw is None: + grid_thw = mm_kwargs_group.get("video_grid_thw") if grid_thw is None: return None From f7af48a1033ad278c8c9bd5f3736f9b1029bc1b2 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 19:33:48 -0500 Subject: [PATCH 013/189] add log for grid_thw. 
--- vllm/v1/worker/gpu_model_runner.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 75e5c6e7e9c3..1614a7f454ed 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2447,6 +2447,14 @@ def _execute_with_encoder_cudagraph( spatial_merge_size = getattr(model.visual, 'spatial_merge_size', 2) t, h, w = grid_thw[0] num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) + num_input_patches = pixel_values.shape[0] + + # Log the exact size needed for bucket analysis + logger.info( + f"ViT input: grid_thw=({t}, {h}, {w}), " + f"input_patches={num_input_patches}, " + f"output_tokens={num_output_tokens}" + ) # Try exact match first grid_key = self.encoder_cudagraph_manager.get_graph_for_grid(grid_thw) From a516b151ec4d074064189449d2579a484cc82f35 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 21:35:26 -0500 Subject: [PATCH 014/189] disable assertion for now, but warn. --- vllm/compilation/piecewise_backend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index ee6779bffa55..df9c2f6cc00d 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -61,10 +61,14 @@ def __init__( # to set the upper bound of the compile ranges max_int32 = 2**31 - 1 last_compile_range = self.compile_ranges[-1] + if last_compile_range.end != vllm_config.scheduler_config.max_num_batched_tokens: + print(f'WARNING: last_compile_range.end={last_compile_range.end}, max_num_batched_tokens={vllm_config.scheduler_config.max_num_batched_tokens}') + """ assert ( last_compile_range.end == vllm_config.scheduler_config.max_num_batched_tokens ) + """ self.compile_ranges[-1] = Range( start=last_compile_range.start, end=max_int32 ) From 3490b832a3d4985ba3e8e760cf8b0c0efa7e72e7 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 22:36:19 -0500 Subject: [PATCH 015/189] compute embeddings with exact, unpadded grid thw. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 103 +++++++++++++++++---- 1 file changed, 84 insertions(+), 19 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index dae158b455cd..c09fb0b1df98 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -290,13 +290,20 @@ def __init__( self.input_buffers: dict[tuple[int, int, int], dict[str, Any]] = {} self.output_buffers: dict[tuple[int, int, int], torch.Tensor] = {} - # Cached pre-computed tensors for CUDA graph replay + # Cached pre-computed tensors for CUDA graph replay (used for exact match mode) # Key: (t, h, w), Value: dict with pos_embeds, rotary embeddings, cu_seqlens, etc. 
self.cached_tensors: dict[tuple[int, int, int], dict[str, torch.Tensor]] = {} + # Input buffers for embeddings (used for padded mode with runtime computation) + # Key: (t, h, w), Value: dict with pos_embeds, rotary_cos, rotary_sin, cu_seqlens buffers + self.embedding_buffers: dict[tuple[int, int, int], dict[str, torch.Tensor]] = {} + # Store metadata about captured graphs self.captured_metadata: dict[tuple[int, int, int], dict[str, Any]] = {} + # Reference to vision encoder for runtime embedding computation (set during capture) + self.vision_encoder = None + # Track if graphs have been captured self.captured = False @@ -573,12 +580,15 @@ def capture_graph_for_grid( "patch_input_channels": dummy_inputs["patch_input_channels"], } + # Store vision encoder reference for runtime embedding computation + self.vision_encoder = vision_encoder + # Check if vision encoder supports optimized CUDA graph forward has_cudagraph_forward = hasattr(vision_encoder, 'forward_cudagraph') and \ hasattr(vision_encoder, 'precompute_for_cudagraph') if has_cudagraph_forward: - # Pre-compute and cache all grid-dependent tensors + # Pre-compute tensors for the bucket grid (used for exact match mode) cached = vision_encoder.precompute_for_cudagraph(grid_thw) self.cached_tensors[grid_config] = cached logger.debug( @@ -587,32 +597,44 @@ def capture_graph_for_grid( f"cu_seqlens={cached['cu_seqlens'].shape}" ) - # Warmup run with cached tensors + # Create INPUT BUFFERS for embeddings (for padded mode with runtime computation) + # These buffers can be updated at runtime before graph replay + self.embedding_buffers[grid_config] = { + "pos_embeds": cached["pos_embeds"].clone(), + "rotary_pos_emb_cos": cached["rotary_pos_emb_cos"].clone(), + "rotary_pos_emb_sin": cached["rotary_pos_emb_sin"].clone(), + "cu_seqlens": cached["cu_seqlens"].clone(), + "max_seqlen": cached["max_seqlen"].clone(), + } + embed_buffers = self.embedding_buffers[grid_config] + + # Warmup run with embedding buffers with torch.cuda.stream(torch.cuda.current_stream()): warmup_output = vision_encoder.forward_cudagraph( pixel_values, - pos_embeds=cached["pos_embeds"], - rotary_pos_emb_cos=cached["rotary_pos_emb_cos"], - rotary_pos_emb_sin=cached["rotary_pos_emb_sin"], - cu_seqlens=cached["cu_seqlens"], - max_seqlen=cached["max_seqlen"], + pos_embeds=embed_buffers["pos_embeds"], + rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], + rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], + cu_seqlens=embed_buffers["cu_seqlens"], + max_seqlen=embed_buffers["max_seqlen"], ) self.output_buffers[grid_config] = torch.empty_like(warmup_output) torch.cuda.synchronize() - # Capture the graph with cached tensors + # Capture the graph with embedding BUFFERS (not constants) + # This allows updating embeddings at runtime for padded mode graph = torch.cuda.CUDAGraph() input_buffer = self.input_buffers[grid_config]["pixel_values"] with torch.cuda.graph(graph, pool=self.pool): output = vision_encoder.forward_cudagraph( input_buffer, - pos_embeds=cached["pos_embeds"], - rotary_pos_emb_cos=cached["rotary_pos_emb_cos"], - rotary_pos_emb_sin=cached["rotary_pos_emb_sin"], - cu_seqlens=cached["cu_seqlens"], - max_seqlen=cached["max_seqlen"], + pos_embeds=embed_buffers["pos_embeds"], + rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], + rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], + cu_seqlens=embed_buffers["cu_seqlens"], + max_seqlen=embed_buffers["max_seqlen"], ) self.output_buffers[grid_config].copy_(output) else: @@ -793,6 +815,16 @@ def run( # Copy input 
to the captured buffer input_buffer.copy_(pixel_values) + # For exact match, restore cached embeddings (may have been modified by run_padded) + if grid_key in self.embedding_buffers and grid_key in self.cached_tensors: + embed_buffers = self.embedding_buffers[grid_key] + cached = self.cached_tensors[grid_key] + embed_buffers["pos_embeds"].copy_(cached["pos_embeds"]) + embed_buffers["rotary_pos_emb_cos"].copy_(cached["rotary_pos_emb_cos"]) + embed_buffers["rotary_pos_emb_sin"].copy_(cached["rotary_pos_emb_sin"]) + embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"]) + embed_buffers["max_seqlen"].copy_(cached["max_seqlen"]) + # Replay the graph self.graphs[grid_key].replay() @@ -809,8 +841,9 @@ def run_padded( """ Run the vision encoder with padding to fit a captured bucket. - This method pads the input to match a captured CUDA graph bucket, - executes the graph, and returns the trimmed output. + This method computes embeddings for the ACTUAL input grid, pads them + to match the bucket size, then replays the CUDA graph. This ensures + correct position embeddings while still benefiting from CUDA graphs. Args: pixel_values: Input pixel values [num_patches, patch_channels] @@ -826,6 +859,11 @@ def run_padded( logger.debug("Padded mode only supports single-image inputs") return None + # Check if vision encoder is available for embedding computation + if self.vision_encoder is None or not hasattr(self.vision_encoder, 'precompute_for_cudagraph'): + logger.debug("Vision encoder not available for padded mode") + return None + # Find the smallest bucket that fits bucket_grid = self.find_bucket_for_tokens(num_output_tokens, spatial_merge_size) if bucket_grid is None: @@ -836,6 +874,11 @@ def run_padded( ) return None + # Check if we have embedding buffers for this bucket + if bucket_grid not in self.embedding_buffers: + logger.debug(f"No embedding buffers for bucket {bucket_grid}") + return None + bucket_tokens = self._compute_output_tokens(bucket_grid, spatial_merge_size) padding_waste = bucket_tokens - num_output_tokens @@ -854,13 +897,35 @@ def run_padded( self.cache_hits += 1 - # Zero the buffer first (for clean padding) + # === KEY FIX: Compute embeddings for ACTUAL grid, then pad === + # This ensures correct position embeddings for the actual input size + actual_embeds = self.vision_encoder.precompute_for_cudagraph(grid_thw) + + # Get embedding buffers for the bucket + embed_buffers = self.embedding_buffers[bucket_grid] + + # Zero the buffers first (for clean padding) input_buffer.zero_() + embed_buffers["pos_embeds"].zero_() + embed_buffers["rotary_pos_emb_cos"].zero_() + embed_buffers["rotary_pos_emb_sin"].zero_() - # Copy actual input to the beginning of the buffer + # Copy actual pixel values to the beginning of the buffer input_buffer[:num_input_patches].copy_(pixel_values) - # Replay the graph (uses the bucket's grid_thw for position embeddings) + # Copy actual embeddings to the beginning of the buffers (pad with zeros) + actual_num_patches = actual_embeds["pos_embeds"].shape[0] + embed_buffers["pos_embeds"][:actual_num_patches].copy_(actual_embeds["pos_embeds"]) + embed_buffers["rotary_pos_emb_cos"][:actual_num_patches].copy_(actual_embeds["rotary_pos_emb_cos"]) + embed_buffers["rotary_pos_emb_sin"][:actual_num_patches].copy_(actual_embeds["rotary_pos_emb_sin"]) + + # Update cu_seqlens and max_seqlen to actual values + # cu_seqlens shape is [num_images + 1], for single image it's [2]: [0, num_patches] + # We copy the actual values so flash attention processes only the real tokens + 
embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"]) + embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"]) + + # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() # Get output and trim to actual size From 6aea3290953122a413d2f4ceea51305f56f47eae Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 22:38:24 -0500 Subject: [PATCH 016/189] log vit cudagraph mode. --- vllm/v1/worker/gpu/mm/encoder_runner.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 9ea462681d1d..90166f44a3b2 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -300,9 +300,9 @@ def _execute_with_cudagraph( # Exact match found - try to run output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) if output is not None: - logger.debug( - f"Encoder CUDA graph exact match for grid {grid_key}, " - f"output: {output.shape}" + logger.info( + f"ViT CUDA graph EXACT: grid=({t}, {h}, {w}), " + f"tokens={num_output_tokens}" ) return [output[:num_output_tokens]] @@ -316,16 +316,15 @@ def _execute_with_cudagraph( ) if result is not None: output, padding_waste = result - logger.debug( - f"Encoder CUDA graph padded execution: " - f"{num_output_tokens} tokens, waste={padding_waste}" + logger.info( + f"ViT CUDA graph PADDED: grid=({t}, {h}, {w}), " + f"tokens={num_output_tokens}, waste={padding_waste}" ) return [output] # No CUDA graph available - logger.debug( - f"No CUDA graph for grid {grid_thw[0]} " - f"(padded_mode={self.encoder_cudagraph_padded_mode}). Using eager mode." + logger.info( + f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens}" ) return None From 4c1f1a0485b06329bd5cbe70b777c35c390e924a Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 22:40:29 -0500 Subject: [PATCH 017/189] add custom grid config. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index c09fb0b1df98..171159f80e7e 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -52,6 +52,39 @@ (1, 96, 96), # ~1344x1344 -> 2304 output tokens ] +CUSTOM_GRID_CONFIGS = [ + (1, 62, 62), + (1, 94, 94), + (1, 50, 50), + (1, 124, 124), + (1, 32, 32), + (1, 76, 76), + (1, 100, 100), + (1, 64, 64), + (1, 38, 38), + (1, 188, 188), + (1, 68, 68), + (1, 128, 128), + (1, 250, 250), + (1, 44, 44), + (1, 112, 112), + (1, 80, 80), + (1, 46, 46), + (1, 160, 160), + (1, 42, 42), + (1, 24, 24), + (1, 56, 56), + (1, 16, 16), + (1, 256, 256), + (1, 208, 312), + (1, 188, 252), + (1, 156, 156), + (1, 252, 188), + (1, 88, 88), + (1, 120, 120), + (1, 40, 40), +] + # Optimized grid configurations for MLPerf Shopify dataset # Based on analysis: 96% of images have 4000-8200 output tokens # Using square grids that cover the common token ranges with padding @@ -366,6 +399,8 @@ def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: return SHOPIFY_OPTIMIZED_GRID_CONFIGS elif grid_configs == "shopify_rectangular": return SHOPIFY_RECTANGULAR_GRID_CONFIGS + elif grid_configs == "custom": + return CUSTOM_GRID_CONFIGS elif grid_configs == "default": return DEFAULT_ENCODER_GRID_CONFIGS else: From 7ca3136b435020a219e379494fc3b8bfbc71e532 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 28 Jan 2026 23:10:01 -0500 Subject: [PATCH 018/189] update custom grid config. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 171159f80e7e..ba514bfc4898 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -56,8 +56,8 @@ (1, 62, 62), (1, 94, 94), (1, 50, 50), - (1, 124, 124), (1, 32, 32), + (1, 124, 124), (1, 76, 76), (1, 100, 100), (1, 64, 64), @@ -65,24 +65,24 @@ (1, 188, 188), (1, 68, 68), (1, 128, 128), - (1, 250, 250), (1, 44, 44), - (1, 112, 112), - (1, 80, 80), - (1, 46, 46), - (1, 160, 160), + (1, 250, 250), + (1, 256, 256), (1, 42, 42), (1, 24, 24), - (1, 56, 56), + (1, 160, 160), + (1, 46, 46), + (1, 80, 80), + (1, 112, 112), (1, 16, 16), - (1, 256, 256), + (1, 56, 56), (1, 208, 312), (1, 188, 252), (1, 156, 156), + (1, 40, 40), (1, 252, 188), - (1, 88, 88), (1, 120, 120), - (1, 40, 40), + (1, 218, 218), ] # Optimized grid configurations for MLPerf Shopify dataset From cf04736856f07fc4be81a1eee6fcdecf3fe4ea62 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 00:59:19 -0500 Subject: [PATCH 019/189] update comment for cudagraph related compilation config. --- vllm/config/compilation.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 007e27afd87b..1981713f4d1c 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -446,17 +446,16 @@ class CompilationConfig: Experimental feature - use with caution.""" encoder_cudagraph_bucket_sizes: list[int] | None = None - """Bucket sizes for encoder CUDA graph capture. Each size represents the - number of visual tokens (after spatial merge) to capture a graph for. 
- If None, auto-generates based on common image resolutions: - [64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192]""" - - encoder_cudagraph_grid_configs: list[tuple[int, int, int]] | None = None - """Grid configurations (T, H, W in patch units) to capture encoder CUDA - graphs for. Each tuple represents a specific image dimension configuration. - If None, uses default common configurations for Qwen-VL models. - Example: [(1, 16, 16), (1, 24, 24), (1, 32, 32)] for 448x448, 672x672, - and 896x896 images with patch_size=14 and merge_size=2.""" + """Square grid side lengths for padded CUDA graph execution. Each size N + creates a bucket grid (1, N, N). Inputs with max(H, W) <= N are padded to + fit the bucket. Example: [32, 64, 94, 128, 188, 256, 312] captures grids + (1, 32, 32), (1, 64, 64), etc. Used with encoder_cudagraph_padded_mode=True.""" + + encoder_cudagraph_grid_configs: list[tuple[int, int, int]] | str | None = None + """Grid configurations (T, H, W in patch units) for exact-match CUDA graph + capture. Can be a list of tuples or a preset name: "default", "custom", + "shopify", "shopify_rectangular". The "custom" preset contains top 30 grids + from MLPerf dataset (58.9% exact match coverage). If None, uses default.""" encoder_cudagraph_token_buckets: list[int] | str | None = None """Token bucket sizes for encoder CUDA graphs with padding support. @@ -474,8 +473,10 @@ class CompilationConfig: encoder_cudagraph_padded_mode: bool = True """Whether to use padded execution for encoder CUDA graphs. - When True, inputs smaller than a captured bucket are padded to fit, - enabling higher CUDA graph hit rates at the cost of padding overhead. + When True, inputs smaller than a captured bucket are padded to fit. + Padded: pixel_values, pos_embeds, rotary_embeds (with zeros). + NOT padded: cu_seqlens, max_seqlen (set to actual values so flash + attention only processes real tokens). Output is trimmed to actual size. When False, only exact grid matches use CUDA graphs.""" # Inductor capture From 545e4786a70561983a9729997fb3e4cbbf0bd08c Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 01:10:01 -0500 Subject: [PATCH 020/189] clean up. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 325 +-------------------- 1 file changed, 11 insertions(+), 314 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index ba514bfc4898..c870790b1868 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -20,7 +20,6 @@ from __future__ import annotations -import math from collections.abc import Callable from typing import TYPE_CHECKING, Any @@ -37,21 +36,8 @@ logger = init_logger(__name__) -# Default grid configurations to capture (T, H, W in patch units) -# These are common configurations for Qwen-VL models after smart_resize -# Format: (temporal, height_patches, width_patches) -DEFAULT_ENCODER_GRID_CONFIGS = [ - # Common single-frame image configurations (T=1) - # After smart_resize with factor=28 (patch=14, merge=2), common sizes: - (1, 16, 16), # ~224x224 -> 64 output tokens - (1, 24, 24), # ~336x336 -> 144 output tokens - (1, 32, 32), # ~448x448 -> 256 output tokens - (1, 48, 48), # ~672x672 -> 576 output tokens - (1, 64, 64), # ~896x896 -> 1024 output tokens - (1, 80, 80), # ~1120x1120 -> 1600 output tokens - (1, 96, 96), # ~1344x1344 -> 2304 output tokens -] - +# Grid configurations for CUDA graph capture (T, H, W in patch units) +# Top 30 most common grids from MLPerf dataset analysis (58.9% exact match coverage) CUSTOM_GRID_CONFIGS = [ (1, 62, 62), (1, 94, 94), @@ -85,193 +71,6 @@ (1, 218, 218), ] -# Optimized grid configurations for MLPerf Shopify dataset -# Based on analysis: 96% of images have 4000-8200 output tokens -# Using square grids that cover the common token ranges with padding -SHOPIFY_OPTIMIZED_GRID_CONFIGS = [ - # Small images (rare, <5% of dataset) - (1, 64, 64), # 1024 tokens - covers up to ~1024 tokens - (1, 80, 80), # 1600 tokens - (1, 96, 96), # 2304 tokens - (1, 112, 112), # 3136 tokens - # Main distribution (95% of dataset: 4000-8200 tokens) - (1, 128, 128), # 4096 tokens - covers P10 (4646) - (1, 144, 144), # 5184 tokens - covers ~P25 (5351) - (1, 160, 160), # 6400 tokens - covers ~P50-P75 (6072-6904) - (1, 176, 176), # 7744 tokens - covers ~P90 (7948) - (1, 184, 184), # 8464 tokens - covers max (8161) -] - -# Alternative: Rectangular grids for better aspect ratio coverage -SHOPIFY_RECTANGULAR_GRID_CONFIGS = [ - # 4:3 and 3:4 aspect ratios for product images - (1, 128, 128), # 4096 tokens (square) - (1, 112, 144), # 4032 tokens (3:4) - (1, 144, 112), # 4032 tokens (4:3) - (1, 144, 144), # 5184 tokens (square) - (1, 128, 160), # 5120 tokens (4:5) - (1, 160, 128), # 5120 tokens (5:4) - (1, 160, 160), # 6400 tokens (square) - (1, 144, 176), # 6336 tokens (9:11) - (1, 176, 144), # 6336 tokens (11:9) - (1, 176, 176), # 7744 tokens (square) - (1, 160, 192), # 7680 tokens (5:6) - (1, 192, 160), # 7680 tokens (6:5) - (1, 184, 184), # 8464 tokens (square, max) -] - -# Legacy bucket sizes for backward compatibility -DEFAULT_ENCODER_CUDAGRAPH_BUCKET_SIZES = [ - 64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192 -] - -# ============================================================================= -# TOKEN BUCKET PRESETS FOR PADDED CUDA GRAPHS -# ============================================================================= -# These define output token buckets. Inputs are padded to the smallest bucket -# that fits, trading padding overhead for CUDA graph utilization. 
-# -# For Shopify dataset analysis: -# - P10=4646, P25=5351, P50=6072, P75=6904, P90=7948, Max=8161 tokens - -# Fine-grained buckets: More buckets = less padding waste, more GPU memory -SHOPIFY_TOKEN_BUCKETS_FINE = [ - 1024, # Small images (<5% of dataset) - 2048, - 3072, - 4096, # ~P10 - 4608, - 5120, # ~P25 - 5632, - 6144, # ~P50 - 6656, - 7168, # ~P75 - 7680, - 8192, # ~P90-P99 - 8464, # Max coverage -] - -# Medium granularity: Balanced tradeoff -SHOPIFY_TOKEN_BUCKETS_MEDIUM = [ - 1024, # Small images - 2048, - 3072, - 4096, # Covers up to P10 - 5120, # Covers P10-P25 - 6144, # Covers P25-P50 - 7168, # Covers P50-P75 - 8192, # Covers P75-P99 - 8464, # Max coverage -] - -# Coarse buckets: Fewer graphs, more padding waste -SHOPIFY_TOKEN_BUCKETS_COARSE = [ - 2048, # Small images - 4096, # Up to ~P10 - 6144, # P10-P50 - 8192, # P50-P99 - 8464, # Max -] - -# Single bucket: Maximum CUDA graph utilization, maximum padding -SHOPIFY_TOKEN_BUCKETS_SINGLE = [ - 8464, # All images padded to max -] - -# ============================================================================= -# OPTIMIZED GRID-ALIGNED TOKEN BUCKETS -# ============================================================================= -# These buckets are perfect squares (n^2) which align exactly with actual grid -# token counts. This eliminates any mismatch between bucket size and the tokens -# produced by the grid configuration. -# -# Formula: tokens = (side / merge_size)^2 where side must be even -# So valid token counts are: 16, 36, 64, 100, 144, 196, 256, 324, 400, 484, 576... -# -# Analysis on Shopify dataset (12,754 samples): -# - Token range: 63-8161 -# - P5=4170, P50=6072, P95=8005 -# - 96% of images have 4000-8200 tokens -# -# Comparison with previous presets: -# - shopify_coarse (5 buckets): 13.3% padding waste -# - shopify_medium (9 buckets): 7.3% padding waste -# - shopify_fine (13 buckets): 4.1% padding waste -# - optimized (10 buckets): 5.0% padding waste <- better efficiency per bucket - -# Optimized buckets for main distribution (4000-8500 tokens, 96% of dataset) -# 10 buckets with 5.0% padding waste -SHOPIFY_TOKEN_BUCKETS_OPTIMIZED = [ - 4096, # 64^2, grid (1,128,128) - covers up to 4096 tokens - 4489, # 67^2, grid (1,134,134) - covers 4097-4489 - 4900, # 70^2, grid (1,140,140) - covers 4490-4900 - 5329, # 73^2, grid (1,146,146) - covers 4901-5329 - 5776, # 76^2, grid (1,152,152) - covers 5330-5776 - 6241, # 79^2, grid (1,158,158) - covers 5777-6241 - 6724, # 82^2, grid (1,164,164) - covers 6242-6724 - 7225, # 85^2, grid (1,170,170) - covers 6725-7225 - 7744, # 88^2, grid (1,176,176) - covers 7226-7744 - 8464, # 92^2, grid (1,184,184) - covers 7745-8464 (max) -] - -# Full range including small images (adds 6 buckets for <4096 tokens) -# 16 buckets with 4.2% padding waste -SHOPIFY_TOKEN_BUCKETS_OPTIMIZED_FULL = [ - # Small images (<4% of dataset) - 256, # 16^2, grid (1,32,32) - 576, # 24^2, grid (1,48,48) - 1024, # 32^2, grid (1,64,64) - 1600, # 40^2, grid (1,80,80) - 2304, # 48^2, grid (1,96,96) - 3136, # 56^2, grid (1,112,112) - # Main distribution (96% of dataset) - 4096, # 64^2, grid (1,128,128) - 4489, # 67^2, grid (1,134,134) - 4900, # 70^2, grid (1,140,140) - 5329, # 73^2, grid (1,146,146) - 5776, # 76^2, grid (1,152,152) - 6241, # 79^2, grid (1,158,158) - 6724, # 82^2, grid (1,164,164) - 7225, # 85^2, grid (1,170,170) - 7744, # 88^2, grid (1,176,176) - 8464, # 92^2, grid (1,184,184) -] - -# Compact optimized (6 buckets, ~6.5% padding waste) -# Good balance between memory usage and padding overhead 
-SHOPIFY_TOKEN_BUCKETS_OPTIMIZED_COMPACT = [ - 4096, # 64^2, grid (1,128,128) - 5041, # 71^2, grid (1,142,142) - 5929, # 77^2, grid (1,154,154) - 6724, # 82^2, grid (1,164,164) - 7569, # 87^2, grid (1,174,174) - 8464, # 92^2, grid (1,184,184) -] - - -def token_bucket_to_grid(token_bucket: int, merge_size: int = 2) -> tuple[int, int, int]: - """ - Convert a token bucket size to a square grid configuration. - - Args: - token_bucket: Number of output tokens (after spatial merge) - merge_size: Spatial merge size (default 2 for Qwen-VL) - - Returns: - Grid config (T, H_patches, W_patches) - """ - # For square grid: tokens = (H/merge)^2, so H = merge * sqrt(tokens) - side = int(math.ceil(math.sqrt(token_bucket))) * merge_size - return (1, side, side) - - -def get_grid_configs_from_token_buckets( - token_buckets: list[int], - merge_size: int = 2, -) -> list[tuple[int, int, int]]: - """Convert token bucket list to grid configurations.""" - return [token_bucket_to_grid(t, merge_size) for t in token_buckets] - class EncoderCudaGraphManager: """ @@ -349,42 +148,7 @@ def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: """Get encoder grid configurations from config or use defaults.""" compilation_config = self.vllm_config.compilation_config if compilation_config is None: - return DEFAULT_ENCODER_GRID_CONFIGS - - # Check for token bucket config first (new preferred way) - token_buckets = getattr( - compilation_config, - 'encoder_cudagraph_token_buckets', - None - ) - if token_buckets is not None: - # Handle preset names for token buckets - if isinstance(token_buckets, str): - bucket_presets = { - # Legacy presets (non-grid-aligned) - "shopify_fine": SHOPIFY_TOKEN_BUCKETS_FINE, - "shopify_medium": SHOPIFY_TOKEN_BUCKETS_MEDIUM, - "shopify_coarse": SHOPIFY_TOKEN_BUCKETS_COARSE, - "shopify_single": SHOPIFY_TOKEN_BUCKETS_SINGLE, - # Optimized grid-aligned presets (recommended) - "optimized": SHOPIFY_TOKEN_BUCKETS_OPTIMIZED, - "optimized_full": SHOPIFY_TOKEN_BUCKETS_OPTIMIZED_FULL, - "optimized_compact": SHOPIFY_TOKEN_BUCKETS_OPTIMIZED_COMPACT, - } - if token_buckets in bucket_presets: - buckets = bucket_presets[token_buckets] - logger.info( - f"Using token bucket preset '{token_buckets}': {buckets}" - ) - return get_grid_configs_from_token_buckets(buckets) - else: - logger.warning( - f"Unknown token bucket preset '{token_buckets}', " - f"available: {list(bucket_presets.keys())}" - ) - elif isinstance(token_buckets, list): - logger.info(f"Using custom token buckets: {token_buckets}") - return get_grid_configs_from_token_buckets(token_buckets) + return CUSTOM_GRID_CONFIGS # Check for encoder-specific grid config grid_configs = getattr( @@ -393,54 +157,32 @@ def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: None ) if grid_configs is not None: - # Handle preset names + # Handle preset name or custom list if isinstance(grid_configs, str): - if grid_configs == "shopify": - return SHOPIFY_OPTIMIZED_GRID_CONFIGS - elif grid_configs == "shopify_rectangular": - return SHOPIFY_RECTANGULAR_GRID_CONFIGS - elif grid_configs == "custom": + if grid_configs == "custom": return CUSTOM_GRID_CONFIGS - elif grid_configs == "default": - return DEFAULT_ENCODER_GRID_CONFIGS else: logger.warning( f"Unknown grid config preset '{grid_configs}', " - "using default" + "using 'custom'" ) - return DEFAULT_ENCODER_GRID_CONFIGS + return CUSTOM_GRID_CONFIGS return [tuple(cfg) for cfg in grid_configs] - return DEFAULT_ENCODER_GRID_CONFIGS + return CUSTOM_GRID_CONFIGS def 
_get_bucket_sizes_from_config(self) -> list[int]: - """Get encoder CUDA graph bucket sizes from config or use defaults.""" + """Get encoder CUDA graph bucket sizes from config.""" compilation_config = self.vllm_config.compilation_config if compilation_config is None: - return DEFAULT_ENCODER_CUDAGRAPH_BUCKET_SIZES + return [] encoder_sizes = getattr( compilation_config, 'encoder_cudagraph_bucket_sizes', None ) - if encoder_sizes is not None: - return encoder_sizes - - return DEFAULT_ENCODER_CUDAGRAPH_BUCKET_SIZES - - def get_padded_size(self, num_visual_tokens: int) -> int | None: - """ - Find the smallest bucket size >= num_visual_tokens. - - Returns None if the input is larger than all buckets. - Note: This is for backward compatibility. For actual graph lookup, - use get_graph_for_grid() instead. - """ - for bucket_size in self.bucket_sizes: - if num_visual_tokens <= bucket_size: - return bucket_size - return None + return encoder_sizes if encoder_sizes is not None else [] def _grid_to_key(self, grid_thw: list[list[int]]) -> tuple[int, int, int] | None: """ @@ -755,18 +497,6 @@ def capture( f"(configs: {sorted(self.graphs.keys())})" ) - def can_use_graph(self, num_visual_tokens: int) -> bool: - """ - Check if a CUDA graph might be available for the given token count. - - Note: This is a heuristic check. Actual graph usage depends on - exact grid_thw match via get_graph_for_grid(). - """ - if not self.captured: - return False - padded_size = self.get_padded_size(num_visual_tokens) - return padded_size is not None - def get_graph_for_grid( self, grid_thw: list[list[int]], @@ -988,39 +718,6 @@ def get_stats(self) -> dict[str, Any]: } -def get_encoder_cudagraph_bucket_sizes( - max_visual_tokens: int, - min_bucket: int = 64, - growth_factor: float = 1.5, -) -> list[int]: - """ - Generate bucket sizes for encoder CUDA graphs. - - Uses exponential growth to cover the range [min_bucket, max_visual_tokens] - with reasonable granularity. - - Args: - max_visual_tokens: Maximum number of visual tokens to support - min_bucket: Minimum bucket size - growth_factor: Multiplier for each successive bucket - - Returns: - List of bucket sizes - """ - buckets = [] - current = min_bucket - - while current <= max_visual_tokens: - buckets.append(int(current)) - current = int(current * growth_factor) - - # Ensure max is included - if buckets[-1] < max_visual_tokens: - buckets.append(max_visual_tokens) - - return buckets - - def generate_grid_configs_for_resolution_range( min_size: int = 448, max_size: int = 1344, From a2b474d63736851d991cc5254fd09bdb244bed41 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 01:12:25 -0500 Subject: [PATCH 021/189] clean up. --- vllm/config/compilation.py | 19 ++----------------- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 2 +- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 1981713f4d1c..dd639ae15430 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -453,23 +453,8 @@ class CompilationConfig: encoder_cudagraph_grid_configs: list[tuple[int, int, int]] | str | None = None """Grid configurations (T, H, W in patch units) for exact-match CUDA graph - capture. Can be a list of tuples or a preset name: "default", "custom", - "shopify", "shopify_rectangular". The "custom" preset contains top 30 grids - from MLPerf dataset (58.9% exact match coverage). 
If None, uses default.""" - - encoder_cudagraph_token_buckets: list[int] | str | None = None - """Token bucket sizes for encoder CUDA graphs with padding support. - Instead of requiring exact grid matches, inputs are padded to the smallest - bucket that fits. This trades some compute (padding overhead) for higher - CUDA graph utilization. - - Can be a list of token counts or a preset name: - - "shopify_fine": [1024, 2048, 3072, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192, 8464] - - "shopify_medium": [1024, 2048, 3072, 4096, 5120, 6144, 7168, 8192, 8464] - - "shopify_coarse": [2048, 4096, 6144, 8192, 8464] - - "shopify_single": [8464] (all images padded to max) - - When set, overrides encoder_cudagraph_grid_configs.""" + capture. Can be a list of tuples or preset "custom" (top 30 most common grids, + 58.9% exact match coverage). If None, uses "custom" as default.""" encoder_cudagraph_padded_mode: bool = True """Whether to use padded execution for encoder CUDA graphs. diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index c870790b1868..d6fee56ec806 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -37,7 +37,7 @@ logger = init_logger(__name__) # Grid configurations for CUDA graph capture (T, H, W in patch units) -# Top 30 most common grids from MLPerf dataset analysis (58.9% exact match coverage) +# Top 30 most common grids (58.9% exact match coverage) CUSTOM_GRID_CONFIGS = [ (1, 62, 62), (1, 94, 94), From c3a025f1abd8f3a45d72858f3261e6fa0ba4fe87 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 01:16:46 -0500 Subject: [PATCH 022/189] update comment for encoder cudagraph. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 24 ++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index d6fee56ec806..33c1fa8cc7d6 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -6,16 +6,24 @@ This module provides CUDA graph capture and replay functionality for vision encoders to eliminate kernel launch overhead and improve GPU utilization. +Two execution modes: +1. Exact match mode: Replay CUDA graph when input grid_thw exactly matches + a captured configuration. No padding overhead. +2. Padded mode: Pad inputs to fit the smallest captured bucket that can + accommodate them. Enables higher CUDA graph utilization at the cost of + padding compute overhead. + +Padded mode details: +- Padded with zeros: pixel_values, pos_embeds, rotary_pos_emb_cos/sin +- NOT padded (set to actual values): cu_seqlens, max_seqlen +- This ensures flash attention only processes real tokens (via cu_seqlens) +- Output is trimmed to actual size after graph replay + Key design principles: -1. Capture graphs for specific grid_thw configurations (not just token counts) -2. Only replay when input dimensions exactly match captured configuration -3. Fall back to eager mode for non-matching inputs +1. Capture graphs for specific grid_thw configurations +2. Support both exact match and padded execution +3. Fall back to eager mode when no suitable graph is available 4. 
Track statistics for monitoring and optimization - -Limitations: -- CUDA graphs are only used when input dimensions exactly match captured graphs -- Variable-size images that don't match any captured configuration use eager mode -- Multiple images in a batch are processed sequentially through graph replay """ from __future__ import annotations From 722ff9df86c9c83efbd7d58d3207e31b9e0bbc14 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 01:19:42 -0500 Subject: [PATCH 023/189] log encoder cudagraph stats. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 33c1fa8cc7d6..15238aaa0a9f 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -713,10 +713,10 @@ def run_padded( return trimmed_output, padding_waste def get_stats(self) -> dict[str, Any]: - """Get cache statistics.""" + """Get and log cache statistics.""" total = self.cache_hits + self.cache_misses + self.eager_fallbacks hit_rate = self.cache_hits / total if total > 0 else 0.0 - return { + stats = { "cache_hits": self.cache_hits, "cache_misses": self.cache_misses, "eager_fallbacks": self.eager_fallbacks, @@ -724,6 +724,13 @@ def get_stats(self) -> dict[str, Any]: "num_graphs": len(self.graphs), "captured_configs": sorted(self.graphs.keys()), } + logger.info( + f"Encoder CUDA graph stats: " + f"hits={self.cache_hits}, misses={self.cache_misses}, " + f"eager={self.eager_fallbacks}, hit_rate={hit_rate:.1%}, " + f"num_graphs={len(self.graphs)}" + ) + return stats def generate_grid_configs_for_resolution_range( From c98190440d11c51fd6a5df426b94ee6f76f7fbcb Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 01:59:23 -0500 Subject: [PATCH 024/189] get bucket size from config for padding mode. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 15238aaa0a9f..f3906dd6f6e5 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -111,15 +111,20 @@ def __init__( self.device = device self.dtype = dtype - # Get grid configs from config or use defaults + # Get grid configs from config or use defaults (for exact match) if grid_configs is None: grid_configs = self._get_grid_configs_from_config() - self.grid_configs = grid_configs - # Legacy bucket sizes (for backward compatibility with bucket-based API) + # Get bucket sizes from config (for padded mode) if bucket_sizes is None: bucket_sizes = self._get_bucket_sizes_from_config() - self.bucket_sizes = sorted(bucket_sizes) + + # Merge: grid_configs (exact match) + bucket_sizes (padded mode square grids) + # Bucket sizes create square grids (1, size, size) for padded mode + grid_set = set(grid_configs) + for size in bucket_sizes: + grid_set.add((1, size, size)) + self.grid_configs = list(grid_set) # CUDA graph storage - keyed by (t, h, w) tuple self.graphs: dict[tuple[int, int, int], torch.cuda.CUDAGraph] = {} From b4796272dc6aed64ecc2e9c02d74d59d13546bbc Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 02:00:44 -0500 Subject: [PATCH 025/189] eliminate dead code. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 91 ---------------------- 1 file changed, 91 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index f3906dd6f6e5..0fc02ebc8619 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -209,65 +209,6 @@ def _grid_to_key(self, grid_thw: list[list[int]]) -> tuple[int, int, int] | None t, h, w = grid_thw[0] return (t, h, w) - def find_best_grid_for_padding( - self, - grid_thw: list[list[int]], - spatial_merge_size: int = 2, - ) -> tuple[int, int, int] | None: - """ - Find the smallest captured grid that can accommodate the input with padding. - - For CUDA graph compatibility with variable-size inputs, this finds the - smallest captured configuration where: - - T_captured >= T_input - - H_captured >= H_input - - W_captured >= W_input - - Args: - grid_thw: Input grid configuration [[T, H, W]] - spatial_merge_size: Merge size for spatial dimensions (default 2) - - Returns: - The best matching captured grid config, or None if no match found - """ - key = self._grid_to_key(grid_thw) - if key is None: - return None - - t_in, h_in, w_in = key - - # First check for exact match - if key in self.graphs: - return key - - # Find smallest captured grid that can accommodate input - best_match = None - best_waste = float('inf') - - for captured_key in self.graphs.keys(): - t_cap, h_cap, w_cap = captured_key - - # Check if captured grid can accommodate input - if t_cap >= t_in and h_cap >= h_in and w_cap >= w_in: - # Calculate waste (padding overhead) - input_tokens = self._compute_output_tokens(key, spatial_merge_size) - captured_tokens = self._compute_output_tokens( - captured_key, spatial_merge_size - ) - waste = captured_tokens - input_tokens - - if waste < best_waste: - best_waste = waste - best_match = captured_key - - if best_match is not None: - logger.debug( - f"Found padding-compatible grid: input={key} -> captured={best_match} " - f"(waste={best_waste} tokens)" - ) - - return best_match - def _compute_output_tokens( self, grid_thw: tuple[int, int, int], @@ -738,35 +679,3 @@ def get_stats(self) -> dict[str, Any]: return stats -def generate_grid_configs_for_resolution_range( - min_size: int = 448, - max_size: int = 1344, - step: int = 224, - patch_size: int = 14, - temporal_values: list[int] | None = None, -) -> list[tuple[int, int, int]]: - """ - Generate grid configurations for a range of image resolutions. - - Args: - min_size: Minimum image dimension in pixels - max_size: Maximum image dimension in pixels - step: Step size in pixels - patch_size: Patch size of the vision encoder - temporal_values: List of temporal dimensions to include (default [1]) - - Returns: - List of (T, H, W) tuples in patch units - """ - if temporal_values is None: - temporal_values = [1] - - configs = [] - for h_pixels in range(min_size, max_size + 1, step): - for w_pixels in range(min_size, max_size + 1, step): - h_patches = h_pixels // patch_size - w_patches = w_pixels // patch_size - for t in temporal_values: - configs.append((t, h_patches, w_patches)) - - return configs From 14ec7d5c858882029490fe867ad3411ccf4f308c Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 15:43:44 -0500 Subject: [PATCH 026/189] log encoder cudagraph stats for every call. 
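With this change, get_stats() (which logs since patch 023) is called at the end of every execute_mm_encoder() pass, so each multimodal batch emits a line of the form (values illustrative):

    Encoder CUDA graph stats: hits=12, misses=3, eager=1, hit_rate=75.0%, num_graphs=30
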
--- vllm/v1/worker/gpu/mm/encoder_runner.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 90166f44a3b2..973ad5773023 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -50,6 +50,7 @@ def __init__( # Encoder CUDA graph manager (optional) self.encoder_cudagraph_manager: EncoderCudaGraphManager | None = None self.encoder_cudagraph_padded_mode: bool = True + self._encoder_call_count: int = 0 self._init_encoder_cudagraph_manager() def _init_encoder_cudagraph_manager(self) -> None: @@ -235,6 +236,12 @@ def execute_mm_encoder( # Cache the encoder outputs by mm_hash for mm_hash, output in zip(mm_hashes, encoder_outputs): self.encoder_cache[mm_hash] = output + + # Log encoder CUDA graph stats + self._encoder_call_count += 1 + if self.encoder_cudagraph_manager is not None: + self.encoder_cudagraph_manager.get_stats() + return encoder_outputs def _execute_with_cudagraph( From fa3242c464420de1b32dc4bfbef39c208ce0d782 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 15:55:43 -0500 Subject: [PATCH 027/189] use set_forward_context for encoder cudagraph. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 70 ++++++++++++++-------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 0fc02ebc8619..9c6ff813833e 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -37,6 +37,7 @@ from vllm.config import VllmConfig from vllm.distributed.parallel_state import graph_capture, is_global_first_rank +from vllm.forward_context import set_forward_context from vllm.logger import init_logger if TYPE_CHECKING: @@ -340,16 +341,21 @@ def capture_graph_for_grid( embed_buffers = self.embedding_buffers[grid_config] # Warmup run with embedding buffers - with torch.cuda.stream(torch.cuda.current_stream()): - warmup_output = vision_encoder.forward_cudagraph( - pixel_values, - pos_embeds=embed_buffers["pos_embeds"], - rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], - rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], - cu_seqlens=embed_buffers["cu_seqlens"], - max_seqlen=embed_buffers["max_seqlen"], - ) - self.output_buffers[grid_config] = torch.empty_like(warmup_output) + # Use set_forward_context to provide vllm_config for torch.compile + with set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + ): + with torch.cuda.stream(torch.cuda.current_stream()): + warmup_output = vision_encoder.forward_cudagraph( + pixel_values, + pos_embeds=embed_buffers["pos_embeds"], + rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], + rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], + cu_seqlens=embed_buffers["cu_seqlens"], + max_seqlen=embed_buffers["max_seqlen"], + ) + self.output_buffers[grid_config] = torch.empty_like(warmup_output) torch.cuda.synchronize() @@ -358,16 +364,20 @@ def capture_graph_for_grid( graph = torch.cuda.CUDAGraph() input_buffer = self.input_buffers[grid_config]["pixel_values"] - with torch.cuda.graph(graph, pool=self.pool): - output = vision_encoder.forward_cudagraph( - input_buffer, - pos_embeds=embed_buffers["pos_embeds"], - rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], - rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], - cu_seqlens=embed_buffers["cu_seqlens"], - max_seqlen=embed_buffers["max_seqlen"], - ) - 
self.output_buffers[grid_config].copy_(output) + with set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + ): + with torch.cuda.graph(graph, pool=self.pool): + output = vision_encoder.forward_cudagraph( + input_buffer, + pos_embeds=embed_buffers["pos_embeds"], + rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], + rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], + cu_seqlens=embed_buffers["cu_seqlens"], + max_seqlen=embed_buffers["max_seqlen"], + ) + self.output_buffers[grid_config].copy_(output) else: # Fallback to original forward (will have CPU gaps) logger.warning( @@ -376,9 +386,13 @@ def capture_graph_for_grid( ) # Warmup run (required before capture) - with torch.cuda.stream(torch.cuda.current_stream()): - warmup_output = vision_encoder(pixel_values, grid_thw=grid_thw) - self.output_buffers[grid_config] = torch.empty_like(warmup_output) + with set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + ): + with torch.cuda.stream(torch.cuda.current_stream()): + warmup_output = vision_encoder(pixel_values, grid_thw=grid_thw) + self.output_buffers[grid_config] = torch.empty_like(warmup_output) torch.cuda.synchronize() @@ -386,9 +400,13 @@ def capture_graph_for_grid( graph = torch.cuda.CUDAGraph() input_buffer = self.input_buffers[grid_config]["pixel_values"] - with torch.cuda.graph(graph, pool=self.pool): - output = vision_encoder(input_buffer, grid_thw=grid_thw) - self.output_buffers[grid_config].copy_(output) + with set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + ): + with torch.cuda.graph(graph, pool=self.pool): + output = vision_encoder(input_buffer, grid_thw=grid_thw) + self.output_buffers[grid_config].copy_(output) self.graphs[grid_config] = graph logger.debug( From 11bdbe6233a7e6d6773c214d7af6a164f0dff070 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 16:12:17 -0500 Subject: [PATCH 028/189] use zero instead of randn as dummy for warmup/capture. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 9c6ff813833e..bf4e1ae638b4 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -252,8 +252,8 @@ def _prepare_dummy_inputs_for_grid( # h, w are in patch units, so num_patches = t * h * w num_pixel_patches = t * h * w - # Create dummy pixel values - pixel_values = torch.randn( + # Create dummy pixel values (zeros are fine for warmup/capture) + pixel_values = torch.zeros( num_pixel_patches, patch_input_channels, dtype=self.dtype, From 7558fcf0bc1bc506d54e27b92fc663ab281a08d7 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 16:14:11 -0500 Subject: [PATCH 029/189] adjust decorator for forward context and graph capture. 
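Warmup and capture now run under one combined `with` for the forward context and torch.cuda.graph, instead of nesting them and wrapping the warmup in an explicit stream plus synchronize. The shape of the pattern, reduced to a standalone sketch (contextlib.nullcontext stands in for set_forward_context, which needs a real VllmConfig):

    import contextlib

    import torch

    def capture(fn, example_input, pool=None):
        forward_ctx = contextlib.nullcontext()  # stand-in for set_forward_context(...)
        with forward_ctx:
            out = fn(example_input)             # warmup run, also gives the output shape
        output_buffer = torch.empty_like(out)

        graph = torch.cuda.CUDAGraph()
        with forward_ctx, torch.cuda.graph(graph, pool):
            output_buffer.copy_(fn(example_input))
        return graph, output_buffer

    if torch.cuda.is_available():
        x = torch.randn(8, 8, device="cuda")
        graph, out = capture(lambda t: t @ t, x)
        graph.replay()
        print(out.shape)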
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 70 +++++++++++----------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index bf4e1ae638b4..a6f8cf1b3b68 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -346,38 +346,37 @@ def capture_graph_for_grid( attn_metadata=None, vllm_config=self.vllm_config, ): - with torch.cuda.stream(torch.cuda.current_stream()): - warmup_output = vision_encoder.forward_cudagraph( - pixel_values, - pos_embeds=embed_buffers["pos_embeds"], - rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], - rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], - cu_seqlens=embed_buffers["cu_seqlens"], - max_seqlen=embed_buffers["max_seqlen"], - ) - self.output_buffers[grid_config] = torch.empty_like(warmup_output) - - torch.cuda.synchronize() + warmup_output = vision_encoder.forward_cudagraph( + pixel_values, + pos_embeds=embed_buffers["pos_embeds"], + rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], + rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], + cu_seqlens=embed_buffers["cu_seqlens"], + max_seqlen=embed_buffers["max_seqlen"], + ) + self.output_buffers[grid_config] = torch.empty_like(warmup_output) # Capture the graph with embedding BUFFERS (not constants) # This allows updating embeddings at runtime for padded mode graph = torch.cuda.CUDAGraph() input_buffer = self.input_buffers[grid_config]["pixel_values"] - with set_forward_context( - attn_metadata=None, - vllm_config=self.vllm_config, + with ( + set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + ), + torch.cuda.graph(graph, self.pool), ): - with torch.cuda.graph(graph, pool=self.pool): - output = vision_encoder.forward_cudagraph( - input_buffer, - pos_embeds=embed_buffers["pos_embeds"], - rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], - rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], - cu_seqlens=embed_buffers["cu_seqlens"], - max_seqlen=embed_buffers["max_seqlen"], - ) - self.output_buffers[grid_config].copy_(output) + output = vision_encoder.forward_cudagraph( + input_buffer, + pos_embeds=embed_buffers["pos_embeds"], + rotary_pos_emb_cos=embed_buffers["rotary_pos_emb_cos"], + rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], + cu_seqlens=embed_buffers["cu_seqlens"], + max_seqlen=embed_buffers["max_seqlen"], + ) + self.output_buffers[grid_config].copy_(output) else: # Fallback to original forward (will have CPU gaps) logger.warning( @@ -390,23 +389,22 @@ def capture_graph_for_grid( attn_metadata=None, vllm_config=self.vllm_config, ): - with torch.cuda.stream(torch.cuda.current_stream()): - warmup_output = vision_encoder(pixel_values, grid_thw=grid_thw) - self.output_buffers[grid_config] = torch.empty_like(warmup_output) - - torch.cuda.synchronize() + warmup_output = vision_encoder(pixel_values, grid_thw=grid_thw) + self.output_buffers[grid_config] = torch.empty_like(warmup_output) # Capture the graph graph = torch.cuda.CUDAGraph() input_buffer = self.input_buffers[grid_config]["pixel_values"] - with set_forward_context( - attn_metadata=None, - vllm_config=self.vllm_config, + with ( + set_forward_context( + attn_metadata=None, + vllm_config=self.vllm_config, + ), + torch.cuda.graph(graph, self.pool), ): - with torch.cuda.graph(graph, pool=self.pool): - output = vision_encoder(input_buffer, grid_thw=grid_thw) - self.output_buffers[grid_config].copy_(output) + output = 
vision_encoder(input_buffer, grid_thw=grid_thw) + self.output_buffers[grid_config].copy_(output) self.graphs[grid_config] = graph logger.debug( From 99db6e012bdb64effb35e7dfc4a775e2eeba6dd0 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 16:27:09 -0500 Subject: [PATCH 030/189] try capture each graph in its own context. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index a6f8cf1b3b68..4c247999ebff 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -448,18 +448,20 @@ def capture( desc="Capturing encoder CUDA graphs" ) - with graph_capture(device=self.device): - for grid_config in configs_to_capture: - try: + # Capture each graph in its own graph_capture context to isolate failures. + # If one capture fails, the pool state won't affect subsequent captures. + for grid_config in configs_to_capture: + try: + with graph_capture(device=self.device): self.capture_graph_for_grid( grid_config, vision_encoder, ) - except Exception as e: - logger.warning( - f"Failed to capture encoder CUDA graph for grid config " - f"{grid_config}: {e}. Will use eager mode." - ) + except Exception as e: + logger.warning( + f"Failed to capture encoder CUDA graph for grid config " + f"{grid_config}: {e}. Will use eager mode." + ) self.captured = True logger.info( From cc493e2fca0049cb5e59a92d1cfe88e8093184e4 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 16:30:51 -0500 Subject: [PATCH 031/189] use int max_seqlen instead of tensor. --- vllm/model_executor/models/qwen3_vl.py | 7 +++++-- vllm/v1/attention/ops/vit_attn_wrappers.py | 22 ++++++++++++++++++---- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 7 ++++--- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 133198896a5a..e08adf8c5446 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -807,8 +807,11 @@ def precompute_for_cudagraph( cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) cu_seqlens = torch.from_numpy(cu_seqlens).to(self.device, non_blocking=True) - # Compute max sequence length - max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) + # Compute max sequence length as Python int (not tensor) + # This is important for CUDA graph capture: .item() must happen BEFORE + # capture, not during. Passing as int avoids .item() calls in FA4 wrapper. 
+ max_seqlen_tensor = self.compute_attn_mask_seqlen(cu_seqlens) + max_seqlen = int(max_seqlen_tensor.item()) return { "pos_embeds": pos_embeds, diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index 84b1438fb1b0..076e00c739a5 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -29,7 +29,7 @@ def flash_attn_maxseqlen_wrapper( fa_version: int | None, scale: float | None = None, cu_seqlens: torch.Tensor | None = None, - max_seqlen: torch.Tensor | None = None, + max_seqlen: torch.Tensor | int | None = None, ) -> torch.Tensor: kwargs = {} if is_rocm_aiter: @@ -45,7 +45,14 @@ def flash_attn_maxseqlen_wrapper( cu_seqlens = torch.arange( 0, (batch_size + 1) * q_len, step=q_len, dtype=torch.int32, device=q.device ) - max_seqlen = q_len if max_seqlen is None else max_seqlen.item() + # Handle max_seqlen as int (for CUDA graph) or tensor (for eager mode) + # Using int avoids .item() call which breaks CUDA graph capture + if max_seqlen is None: + max_seqlen = q_len + elif isinstance(max_seqlen, int): + pass # already an int + else: + max_seqlen = max_seqlen.item() q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) output = flash_attn_varlen_func( @@ -117,7 +124,7 @@ def fa4_flash_attn_maxseqlen_wrapper( batch_size: int, scale: float | None = None, cu_seqlens: torch.Tensor | None = None, - max_seqlen: torch.Tensor | None = None, + max_seqlen: torch.Tensor | int | None = None, ) -> torch.Tensor: """FA4 (flash_attn.cute) wrapper for ViT attention. @@ -132,7 +139,14 @@ def fa4_flash_attn_maxseqlen_wrapper( cu_seqlens = torch.arange( 0, (batch_size + 1) * q_len, step=q_len, dtype=torch.int32, device=q.device ) - max_seqlen_int = q_len if max_seqlen is None else max_seqlen.item() + # Handle max_seqlen as int (for CUDA graph) or tensor (for eager mode) + # Using int avoids .item() call which breaks CUDA graph capture + if max_seqlen is None: + max_seqlen_int = q_len + elif isinstance(max_seqlen, int): + max_seqlen_int = max_seqlen + else: + max_seqlen_int = max_seqlen.item() q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) output = fa4_flash_attn_varlen_func( diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 4c247999ebff..4dd04cfd160d 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -331,12 +331,13 @@ def capture_graph_for_grid( # Create INPUT BUFFERS for embeddings (for padded mode with runtime computation) # These buffers can be updated at runtime before graph replay + # Note: max_seqlen is stored as int (not tensor) to avoid .item() during capture self.embedding_buffers[grid_config] = { "pos_embeds": cached["pos_embeds"].clone(), "rotary_pos_emb_cos": cached["rotary_pos_emb_cos"].clone(), "rotary_pos_emb_sin": cached["rotary_pos_emb_sin"].clone(), "cu_seqlens": cached["cu_seqlens"].clone(), - "max_seqlen": cached["max_seqlen"].clone(), + "max_seqlen": cached["max_seqlen"], # int, not tensor } embed_buffers = self.embedding_buffers[grid_config] @@ -560,7 +561,7 @@ def run( embed_buffers["rotary_pos_emb_cos"].copy_(cached["rotary_pos_emb_cos"]) embed_buffers["rotary_pos_emb_sin"].copy_(cached["rotary_pos_emb_sin"]) embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"]) - embed_buffers["max_seqlen"].copy_(cached["max_seqlen"]) + embed_buffers["max_seqlen"] = cached["max_seqlen"] # int, not tensor # Replay the graph self.graphs[grid_key].replay() @@ -660,7 +661,7 @@ def run_padded( # cu_seqlens shape is [num_images + 1], for single image it's [2]: [0, num_patches] # We copy the actual values so flash attention processes only the real tokens embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"]) - embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"]) + embed_buffers["max_seqlen"] = actual_embeds["max_seqlen"] # int, not tensor # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() From 74879cdbdc5888c28a0c910f9fa9a061750f6777 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 16:39:00 -0500 Subject: [PATCH 032/189] revert type annotation for now. --- vllm/v1/attention/ops/vit_attn_wrappers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index 076e00c739a5..1e0ff930a0bd 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -29,7 +29,7 @@ def flash_attn_maxseqlen_wrapper( fa_version: int | None, scale: float | None = None, cu_seqlens: torch.Tensor | None = None, - max_seqlen: torch.Tensor | int | None = None, + max_seqlen: torch.Tensor | None = None, # Also accepts int at runtime ) -> torch.Tensor: kwargs = {} if is_rocm_aiter: @@ -124,7 +124,7 @@ def fa4_flash_attn_maxseqlen_wrapper( batch_size: int, scale: float | None = None, cu_seqlens: torch.Tensor | None = None, - max_seqlen: torch.Tensor | int | None = None, + max_seqlen: torch.Tensor | None = None, # Also accepts int at runtime ) -> torch.Tensor: """FA4 (flash_attn.cute) wrapper for ViT attention. From fbdfca804365f51b3986b1120dec9e7bd7f68ef7 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 16:49:28 -0500 Subject: [PATCH 033/189] make max_seqlen cpu tensor. 
--- vllm/model_executor/models/qwen3_vl.py | 10 +++++----- vllm/v1/attention/ops/vit_attn_wrappers.py | 12 ++++++------ vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 8 ++++---- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index e08adf8c5446..8f57169f2f43 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -807,11 +807,11 @@ def precompute_for_cudagraph( cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) cu_seqlens = torch.from_numpy(cu_seqlens).to(self.device, non_blocking=True) - # Compute max sequence length as Python int (not tensor) - # This is important for CUDA graph capture: .item() must happen BEFORE - # capture, not during. Passing as int avoids .item() calls in FA4 wrapper. - max_seqlen_tensor = self.compute_attn_mask_seqlen(cu_seqlens) - max_seqlen = int(max_seqlen_tensor.item()) + # Compute max sequence length as CPU scalar tensor + # Using CPU tensor is important for CUDA graph capture: .item() on CPU + # tensor doesn't trigger GPU sync, so it won't invalidate capture. + max_seqlen_gpu = self.compute_attn_mask_seqlen(cu_seqlens) + max_seqlen = max_seqlen_gpu.cpu() # Move to CPU to avoid GPU sync on .item() return { "pos_embeds": pos_embeds, diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index 1e0ff930a0bd..eda852f04603 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -45,14 +45,14 @@ def flash_attn_maxseqlen_wrapper( cu_seqlens = torch.arange( 0, (batch_size + 1) * q_len, step=q_len, dtype=torch.int32, device=q.device ) - # Handle max_seqlen as int (for CUDA graph) or tensor (for eager mode) - # Using int avoids .item() call which breaks CUDA graph capture + # Handle max_seqlen: can be None, int, or tensor + # For CUDA graph capture, use CPU tensor so .item() doesn't trigger GPU sync if max_seqlen is None: max_seqlen = q_len elif isinstance(max_seqlen, int): pass # already an int else: - max_seqlen = max_seqlen.item() + max_seqlen = max_seqlen.item() # CPU tensor .item() is safe during capture q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) output = flash_attn_varlen_func( @@ -139,14 +139,14 @@ def fa4_flash_attn_maxseqlen_wrapper( cu_seqlens = torch.arange( 0, (batch_size + 1) * q_len, step=q_len, dtype=torch.int32, device=q.device ) - # Handle max_seqlen as int (for CUDA graph) or tensor (for eager mode) - # Using int avoids .item() call which breaks CUDA graph capture + # Handle max_seqlen: can be None, int, or tensor + # For CUDA graph capture, use CPU tensor so .item() doesn't trigger GPU sync if max_seqlen is None: max_seqlen_int = q_len elif isinstance(max_seqlen, int): max_seqlen_int = max_seqlen else: - max_seqlen_int = max_seqlen.item() + max_seqlen_int = max_seqlen.item() # CPU tensor .item() is safe during capture q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) output = fa4_flash_attn_varlen_func( diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 4dd04cfd160d..b209896b552e 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -331,13 +331,13 @@ def capture_graph_for_grid( # Create INPUT BUFFERS for embeddings (for padded mode with runtime computation) # These buffers can be updated at runtime before graph replay - # Note: max_seqlen is stored as int (not tensor) to avoid .item() during capture + # Note: max_seqlen is a CPU scalar tensor to avoid GPU sync on .item() self.embedding_buffers[grid_config] = { "pos_embeds": cached["pos_embeds"].clone(), "rotary_pos_emb_cos": cached["rotary_pos_emb_cos"].clone(), "rotary_pos_emb_sin": cached["rotary_pos_emb_sin"].clone(), "cu_seqlens": cached["cu_seqlens"].clone(), - "max_seqlen": cached["max_seqlen"], # int, not tensor + "max_seqlen": cached["max_seqlen"].clone(), } embed_buffers = self.embedding_buffers[grid_config] @@ -561,7 +561,7 @@ def run( embed_buffers["rotary_pos_emb_cos"].copy_(cached["rotary_pos_emb_cos"]) embed_buffers["rotary_pos_emb_sin"].copy_(cached["rotary_pos_emb_sin"]) embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"]) - embed_buffers["max_seqlen"] = cached["max_seqlen"] # int, not tensor + embed_buffers["max_seqlen"].copy_(cached["max_seqlen"]) # Replay the graph self.graphs[grid_key].replay() @@ -661,7 +661,7 @@ def run_padded( # cu_seqlens shape is [num_images + 1], for single image it's [2]: [0, num_patches] # We copy the actual values so flash attention processes only the real tokens embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"]) - embed_buffers["max_seqlen"] = actual_embeds["max_seqlen"] # int, not tensor + embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"]) # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() From b8dc719ea44e7dbf46c7bd0b9c450b648f1aa68e Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 17:02:29 -0500 Subject: [PATCH 034/189] reduce to 15 grids. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index b209896b552e..7a45cb3c0438 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -46,9 +46,10 @@ logger = init_logger(__name__) # Grid configurations for CUDA graph capture (T, H, W in patch units) -# Top 30 most common grids (58.9% exact match coverage) +# Top 15 most common grids by frequency (reduced from 30 to save memory) +# Excludes very large grids (>128) to reduce CUDA graph memory footprint CUSTOM_GRID_CONFIGS = [ - (1, 62, 62), + (1, 62, 62), # Most common (1, 94, 94), (1, 50, 50), (1, 32, 32), @@ -57,27 +58,12 @@ (1, 100, 100), (1, 64, 64), (1, 38, 38), - (1, 188, 188), (1, 68, 68), (1, 128, 128), (1, 44, 44), - (1, 250, 250), - (1, 256, 256), (1, 42, 42), (1, 24, 24), - (1, 160, 160), (1, 46, 46), - (1, 80, 80), - (1, 112, 112), - (1, 16, 16), - (1, 56, 56), - (1, 208, 312), - (1, 188, 252), - (1, 156, 156), - (1, 40, 40), - (1, 252, 188), - (1, 120, 120), - (1, 218, 218), ] From d164dfa198cb5c74eda3c1abf68a79f4d9aebbe2 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 17:38:04 -0500 Subject: [PATCH 035/189] change log level to info. 
--- vllm/v1/worker/gpu_model_runner.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1614a7f454ed..1e9406133832 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2379,6 +2379,10 @@ def _execute_mm_encoder( logger.debug("Finish execute for mm hash %s", mm_hash) self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash) + # Log encoder CUDA graph stats periodically + if self.encoder_cudagraph_manager is not None: + self.encoder_cudagraph_manager.get_stats() + return encoder_outputs def _execute_with_encoder_cudagraph( @@ -2462,9 +2466,9 @@ def _execute_with_encoder_cudagraph( # Exact match found - try to run output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) if output is not None: - logger.debug( - f"Encoder CUDA graph exact match for grid {grid_key}, " - f"output: {output.shape}" + logger.info( + f"ViT CUDA graph EXACT: grid={grid_key}, " + f"output={output.shape}" ) return [output[:num_output_tokens]] @@ -2478,16 +2482,16 @@ def _execute_with_encoder_cudagraph( ) if result is not None: output, padding_waste = result - logger.debug( - f"Encoder CUDA graph padded execution: " - f"{num_output_tokens} tokens, waste={padding_waste}" + logger.info( + f"ViT CUDA graph PADDED: grid=({t}, {h}, {w}), " + f"tokens={num_output_tokens}, waste={padding_waste}" ) return [output] # No CUDA graph available - logger.debug( - f"No encoder CUDA graph for grid {grid_thw[0]} " - f"(padded_mode={self.encoder_cudagraph_padded_mode}). Using eager mode." + logger.info( + f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens} " + f"(padded_mode={self.encoder_cudagraph_padded_mode})" ) return None From 28517322e131da059dedde2dd5c53b2a31611354 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 18:43:34 -0500 Subject: [PATCH 036/189] fix encoder cudagraph stats. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 12 +++++++++-- vllm/v1/worker/gpu_model_runner.py | 24 ++++++++++------------ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 7a45cb3c0438..cf726aee6434 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -521,7 +521,7 @@ def run( grid_key = self.get_graph_for_grid(grid_thw) if grid_key is None: - self.cache_misses += 1 + # Don't count miss here - caller may try run_padded() next return None # Verify input dimensions match @@ -591,7 +591,7 @@ def run_padded( # Find the smallest bucket that fits bucket_grid = self.find_bucket_for_tokens(num_output_tokens, spatial_merge_size) if bucket_grid is None: - self.cache_misses += 1 + # Don't count miss here - caller will count it when falling back to eager logger.debug( f"No bucket found for {num_output_tokens} tokens, " f"max available: {max(self._compute_output_tokens(g, spatial_merge_size) for g in self.graphs.keys()) if self.graphs else 0}" @@ -663,6 +663,14 @@ def run_padded( return trimmed_output, padding_waste + def count_miss(self) -> None: + """Count a cache miss when falling back to eager mode. + + This should be called by the caller when neither run() nor run_padded() + succeeded and eager execution is used. 
+ """ + self.cache_misses += 1 + def get_stats(self) -> dict[str, Any]: """Get and log cache statistics.""" total = self.cache_hits + self.cache_misses + self.eager_fallbacks diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1e9406133832..be27fc452880 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2460,19 +2460,16 @@ def _execute_with_encoder_cudagraph( f"output_tokens={num_output_tokens}" ) - # Try exact match first - grid_key = self.encoder_cudagraph_manager.get_graph_for_grid(grid_thw) - if grid_key is not None: - # Exact match found - try to run - output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) - if output is not None: - logger.info( - f"ViT CUDA graph EXACT: grid={grid_key}, " - f"output={output.shape}" - ) - return [output[:num_output_tokens]] + # Try exact match first via run() - counts hits internally + output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) + if output is not None: + logger.info( + f"ViT CUDA graph EXACT: grid=({t}, {h}, {w}), " + f"output={output.shape}" + ) + return [output[:num_output_tokens]] - # Try padded execution if enabled + # Try padded execution if enabled (run_padded counts hits internally) if self.encoder_cudagraph_padded_mode: result = self.encoder_cudagraph_manager.run_padded( pixel_values, @@ -2488,7 +2485,8 @@ def _execute_with_encoder_cudagraph( ) return [output] - # No CUDA graph available + # No CUDA graph available - count the miss and fall back to eager mode + self.encoder_cudagraph_manager.count_miss() logger.info( f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens} " f"(padded_mode={self.encoder_cudagraph_padded_mode})" From 6c2b3adb8ae306852d7f0205661b29f0ac48f752 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 18:54:21 -0500 Subject: [PATCH 037/189] add separate memory pool for encoder cudagraph. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 43 +++++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 52 +++++++++++++++++++--- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index cf726aee6434..b4a5de63bdc4 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -46,9 +46,11 @@ logger = init_logger(__name__) # Grid configurations for CUDA graph capture (T, H, W in patch units) -# Top 15 most common grids by frequency (reduced from 30 to save memory) -# Excludes very large grids (>128) to reduce CUDA graph memory footprint +# Top 30 most common grids by frequency, sorted by occurrence count +# With dedicated encoder graph pool, we can capture more grids without +# competing with decoder CUDA graphs for memory CUSTOM_GRID_CONFIGS = [ + # Top 15 most common (high priority) (1, 62, 62), # Most common (1, 94, 94), (1, 50, 50), @@ -64,6 +66,22 @@ (1, 42, 42), (1, 24, 24), (1, 46, 46), + # Additional 15 grids (medium priority) + (1, 56, 56), + (1, 82, 82), + (1, 70, 70), + (1, 88, 88), + (1, 106, 106), + (1, 112, 112), + (1, 118, 118), + (1, 52, 52), + (1, 58, 58), + (1, 74, 74), + (1, 80, 80), + (1, 86, 86), + (1, 92, 92), + (1, 96, 96), + (1, 102, 102), ] @@ -93,6 +111,7 @@ def __init__( dtype: torch.dtype, bucket_sizes: list[int] | None = None, grid_configs: list[tuple[int, int, int]] | None = None, + graph_pool: Any | None = None, ): self.vllm_config = vllm_config self.device = device @@ -115,7 +134,9 @@ def __init__( # CUDA graph storage - keyed by (t, h, w) tuple self.graphs: dict[tuple[int, int, int], torch.cuda.CUDAGraph] = {} - self.pool = torch.cuda.graph_pool_handle() + # Use provided pool or create a dedicated encoder pool + # Using a separate pool from decoder allows independent memory management + self.pool = graph_pool if graph_pool is not None else torch.cuda.graph_pool_handle() # Pre-allocated input/output buffers per grid config # Key: (t, h, w), Value: {"pixel_values": tensor, "grid_thw": list} @@ -417,9 +438,13 @@ def capture( logger.warning("Encoder CUDA graphs already captured, skipping") return + # Log initial memory state + free_mem_before, total_mem = torch.cuda.mem_get_info(self.device) + used_mem_before = total_mem - free_mem_before logger.info( f"Capturing encoder CUDA graphs for {len(self.grid_configs)} " - f"grid configurations" + f"grid configurations (GPU memory: {used_mem_before / 1024**3:.2f} GiB used, " + f"{free_mem_before / 1024**3:.2f} GiB free)" ) # Capture from largest to smallest (more memory efficient) @@ -451,9 +476,17 @@ def capture( ) self.captured = True + + # Log final memory state + free_mem_after, _ = torch.cuda.mem_get_info(self.device) + used_mem_after = total_mem - free_mem_after + encoder_graph_mem = used_mem_after - used_mem_before logger.info( f"Captured {len(self.graphs)} encoder CUDA graphs " - f"(configs: {sorted(self.graphs.keys())})" + f"(configs: {sorted(self.graphs.keys())}). 
" + f"Encoder graph memory: {encoder_graph_mem / 1024**3:.2f} GiB " + f"(GPU: {used_mem_after / 1024**3:.2f} GiB used, " + f"{free_mem_after / 1024**3:.2f} GiB free)" ) def get_graph_for_grid( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index be27fc452880..e0b1faa6a96d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -707,11 +707,19 @@ def _init_encoder_cudagraph_manager(self) -> None: True # Default to padded mode for better CUDA graph utilization ) + # Create a dedicated graph pool for encoder CUDA graphs + # This keeps encoder and decoder graph memory separate for: + # 1. Better memory isolation and predictability + # 2. Independent memory management for each subsystem + # 3. Easier debugging of memory usage + encoder_graph_pool = torch.cuda.graph_pool_handle() + self.encoder_cudagraph_manager = EncoderCudaGraphManager( vllm_config=self.vllm_config, device=self.device, dtype=self.dtype, bucket_sizes=bucket_sizes, + graph_pool=encoder_graph_pool, ) # Log configuration @@ -720,7 +728,8 @@ def _init_encoder_cudagraph_manager(self) -> None: "Encoder CUDA graph manager initialized: " f"padded_mode={self.encoder_cudagraph_padded_mode}, " f"num_grids={len(grid_configs)}, " - f"grids={grid_configs}" + f"grids={grid_configs}, " + f"using dedicated encoder graph pool" ) def update_max_model_len(self, max_model_len: int) -> None: @@ -4962,12 +4971,32 @@ def freeze_gc(): # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. set_cudagraph_capturing_enabled(True) - with freeze_gc(), graph_capture(device=self.device): - start_free_gpu_memory = torch.cuda.mem_get_info()[0] - # Capture encoder CUDA graphs first (if enabled) - if self.encoder_cudagraph_manager is not None: + start_free_gpu_memory = torch.cuda.mem_get_info()[0] + start_total_memory = torch.cuda.mem_get_info()[1] + logger.info( + f"Starting CUDA graph capture: " + f"{(start_total_memory - start_free_gpu_memory) / 1024**3:.2f} GiB used, " + f"{start_free_gpu_memory / 1024**3:.2f} GiB free" + ) + + # Capture encoder CUDA graphs first (if enabled) + # Encoder uses a dedicated graph pool separate from decoder, + # captured outside the decoder's graph_capture context for clean isolation + if self.encoder_cudagraph_manager is not None: + with freeze_gc(): self._capture_encoder_cudagraphs() + after_encoder_free = torch.cuda.mem_get_info()[0] + encoder_mem = start_free_gpu_memory - after_encoder_free + logger.info( + f"Encoder CUDA graphs captured: " + f"{encoder_mem / 1024**3:.2f} GiB used by encoder graphs, " + f"{after_encoder_free / 1024**3:.2f} GiB free" + ) + + # Capture decoder/LM CUDA graphs in their own context with global pool + with freeze_gc(), graph_capture(device=self.device): + before_decoder_free = torch.cuda.mem_get_info()[0] cudagraph_mode = self.compilation_config.cudagraph_mode assert cudagraph_mode is not None @@ -5017,6 +5046,19 @@ def freeze_gc(): torch.cuda.synchronize() end_free_gpu_memory = torch.cuda.mem_get_info()[0] + decoder_mem = before_decoder_free - end_free_gpu_memory + logger.info( + f"Decoder CUDA graphs captured: " + f"{decoder_mem / 1024**3:.2f} GiB used by decoder graphs, " + f"{end_free_gpu_memory / 1024**3:.2f} GiB free" + ) + + total_cudagraph_mem = start_free_gpu_memory - end_free_gpu_memory + logger.info( + f"CUDA graph capture complete: " + f"total {total_cudagraph_mem / 1024**3:.2f} GiB for all graphs, " + f"{end_free_gpu_memory / 1024**3:.2f} GiB free" + 
) # Disable cudagraph capturing globally, so any unexpected cudagraph # capturing will be detected and raise an error after here. From 618df682e499202765d0018ac216ce616b0db200 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 19:04:08 -0500 Subject: [PATCH 038/189] switch back to 30 grids. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 34 ++++++++++------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index b4a5de63bdc4..a8ad87c5a273 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -46,12 +46,11 @@ logger = init_logger(__name__) # Grid configurations for CUDA graph capture (T, H, W in patch units) -# Top 30 most common grids by frequency, sorted by occurrence count +# Top 30 most common grids (58.9% exact match coverage) # With dedicated encoder graph pool, we can capture more grids without # competing with decoder CUDA graphs for memory CUSTOM_GRID_CONFIGS = [ - # Top 15 most common (high priority) - (1, 62, 62), # Most common + (1, 62, 62), (1, 94, 94), (1, 50, 50), (1, 32, 32), @@ -60,28 +59,27 @@ (1, 100, 100), (1, 64, 64), (1, 38, 38), + (1, 188, 188), (1, 68, 68), (1, 128, 128), (1, 44, 44), + (1, 250, 250), + (1, 256, 256), (1, 42, 42), (1, 24, 24), + (1, 160, 160), (1, 46, 46), - # Additional 15 grids (medium priority) - (1, 56, 56), - (1, 82, 82), - (1, 70, 70), - (1, 88, 88), - (1, 106, 106), - (1, 112, 112), - (1, 118, 118), - (1, 52, 52), - (1, 58, 58), - (1, 74, 74), (1, 80, 80), - (1, 86, 86), - (1, 92, 92), - (1, 96, 96), - (1, 102, 102), + (1, 112, 112), + (1, 16, 16), + (1, 56, 56), + (1, 208, 312), + (1, 188, 252), + (1, 156, 156), + (1, 40, 40), + (1, 252, 188), + (1, 120, 120), + (1, 218, 218), ] From f803a2269d1530b443b917a6a5842e58dfacd594 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 19:54:39 -0500 Subject: [PATCH 039/189] limit max grid size. capture from small to large. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 49 ++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index a8ad87c5a273..616574cec6da 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -128,7 +128,27 @@ def __init__( grid_set = set(grid_configs) for size in bucket_sizes: grid_set.add((1, size, size)) - self.grid_configs = list(grid_set) + + # Filter out grids that are too large to capture efficiently + # Large grids (e.g., 256x256+) consume massive memory (~14+ GiB each) + # and are better served by eager mode or padded execution + max_grid_size = self._get_max_grid_size_from_config() + filtered_grids = [] + skipped_grids = [] + for grid in grid_set: + t, h, w = grid + if h <= max_grid_size and w <= max_grid_size: + filtered_grids.append(grid) + else: + skipped_grids.append(grid) + + if skipped_grids: + logger.info( + f"Skipping {len(skipped_grids)} grids exceeding max_grid_size={max_grid_size}: " + f"{sorted(skipped_grids, key=lambda x: x[1]*x[2], reverse=True)[:5]}..." 
+ ) + + self.grid_configs = filtered_grids # CUDA graph storage - keyed by (t, h, w) tuple self.graphs: dict[tuple[int, int, int], torch.cuda.CUDAGraph] = {} @@ -203,6 +223,28 @@ def _get_bucket_sizes_from_config(self) -> list[int]: ) return encoder_sizes if encoder_sizes is not None else [] + def _get_max_grid_size_from_config(self) -> int: + """Get maximum grid size for encoder CUDA graph capture. + + Large grids (e.g., 256x256+) consume massive GPU memory per graph: + - 128x128: ~3.5 GiB + - 188x188: ~7.7 GiB + - 256x256: ~14 GiB + - 512x512: ~57 GiB + + Default is 128 to allow capturing ~15-20 useful grids on typical hardware. + """ + compilation_config = self.vllm_config.compilation_config + if compilation_config is None: + return 128 # Conservative default + + max_size = getattr( + compilation_config, + 'encoder_cudagraph_max_grid_size', + 128 # Default: max 128x128 grids + ) + return max_size + def _grid_to_key(self, grid_thw: list[list[int]]) -> tuple[int, int, int] | None: """ Convert a grid_thw list to a hashable key. @@ -445,11 +487,12 @@ def capture( f"{free_mem_before / 1024**3:.2f} GiB free)" ) - # Capture from largest to smallest (more memory efficient) + # Capture from smallest to largest so that common smaller grids are + # captured first. If we run out of memory, only large grids will fail. configs_to_capture = sorted( self.grid_configs, key=lambda x: x[0] * x[1] * x[2], - reverse=True + reverse=False # Smallest first ) if is_global_first_rank(): From d5dc1246dbd89bccb160e406dc9cc1a490315073 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 20:31:24 -0500 Subject: [PATCH 040/189] add max_grid_size compilation config. --- vllm/config/compilation.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index dd639ae15430..3377db22c95b 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -464,6 +464,16 @@ class CompilationConfig: attention only processes real tokens). Output is trimmed to actual size. When False, only exact grid matches use CUDA graphs.""" + encoder_cudagraph_max_grid_size: int = 256 + """Maximum grid dimension (H or W) for encoder CUDA graph capture. + Grids with H > max or W > max are skipped to limit GPU memory usage. + Memory scales roughly with H*W: + - 128x128: ~0.8 GiB + - 188x188: ~1.7 GiB + - 256x256: ~3.2 GiB + Set lower (e.g., 128, 188, 218) on memory-constrained systems. + Default 256 captures all grids in CUSTOM_GRID_CONFIGS.""" + # Inductor capture compile_sizes: list[int | str] | None = None """Sizes to compile for inductor. In addition From 70345f05a2e59dab27cb32bd8edd1224fd079eef Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 29 Jan 2026 21:27:59 -0500 Subject: [PATCH 041/189] add cli to control encoder cugraph log level. --- vllm/config/compilation.py | 6 +++ vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 21 +++++---- vllm/v1/worker/gpu_model_runner.py | 54 ++++++++++++++-------- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 3377db22c95b..18649f60c36a 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -474,6 +474,12 @@ class CompilationConfig: Set lower (e.g., 128, 188, 218) on memory-constrained systems. Default 256 captures all grids in CUSTOM_GRID_CONFIGS.""" + encoder_cudagraph_verbose: bool = False + """Enable verbose logging for encoder CUDA graph execution. 
+ When True, logs each ViT input size and CUDA graph hit/miss/padded status. + Useful for debugging and analyzing CUDA graph utilization. + When False, only logs summary stats at the end of execution.""" + # Inductor capture compile_sizes: list[int | str] | None = None """Sizes to compile for inductor. In addition diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 616574cec6da..af6f4f21499c 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -745,8 +745,12 @@ def count_miss(self) -> None: """ self.cache_misses += 1 - def get_stats(self) -> dict[str, Any]: - """Get and log cache statistics.""" + def get_stats(self, verbose: bool = True) -> dict[str, Any]: + """Get and optionally log cache statistics. + + Args: + verbose: If True, log stats to INFO level. If False, only return stats dict. + """ total = self.cache_hits + self.cache_misses + self.eager_fallbacks hit_rate = self.cache_hits / total if total > 0 else 0.0 stats = { @@ -757,12 +761,13 @@ def get_stats(self) -> dict[str, Any]: "num_graphs": len(self.graphs), "captured_configs": sorted(self.graphs.keys()), } - logger.info( - f"Encoder CUDA graph stats: " - f"hits={self.cache_hits}, misses={self.cache_misses}, " - f"eager={self.eager_fallbacks}, hit_rate={hit_rate:.1%}, " - f"num_graphs={len(self.graphs)}" - ) + if verbose: + logger.info( + f"Encoder CUDA graph stats: " + f"hits={self.cache_hits}, misses={self.cache_misses}, " + f"eager={self.eager_fallbacks}, hit_rate={hit_rate:.1%}, " + f"num_graphs={len(self.graphs)}" + ) return stats diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e0b1faa6a96d..f803e9983ec9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -429,6 +429,7 @@ def __init__( # Encoder CUDA graph manager for ViT self.encoder_cudagraph_manager: EncoderCudaGraphManager | None = None self.encoder_cudagraph_padded_mode: bool = True + self.encoder_cudagraph_verbose: bool = False self._init_encoder_cudagraph_manager() self.use_aux_hidden_state_outputs = False @@ -707,6 +708,13 @@ def _init_encoder_cudagraph_manager(self) -> None: True # Default to padded mode for better CUDA graph utilization ) + # Check if verbose logging is enabled + self.encoder_cudagraph_verbose = getattr( + self.compilation_config, + 'encoder_cudagraph_verbose', + False # Default to quiet mode + ) + # Create a dedicated graph pool for encoder CUDA graphs # This keeps encoder and decoder graph memory separate for: # 1. 
Better memory isolation and predictability @@ -2388,9 +2396,11 @@ def _execute_mm_encoder( logger.debug("Finish execute for mm hash %s", mm_hash) self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash) - # Log encoder CUDA graph stats periodically + # Log encoder CUDA graph stats periodically (verbose only) if self.encoder_cudagraph_manager is not None: - self.encoder_cudagraph_manager.get_stats() + self.encoder_cudagraph_manager.get_stats( + verbose=self.encoder_cudagraph_verbose + ) return encoder_outputs @@ -2462,20 +2472,22 @@ def _execute_with_encoder_cudagraph( num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) num_input_patches = pixel_values.shape[0] - # Log the exact size needed for bucket analysis - logger.info( - f"ViT input: grid_thw=({t}, {h}, {w}), " - f"input_patches={num_input_patches}, " - f"output_tokens={num_output_tokens}" - ) + # Log the exact size needed for bucket analysis (verbose only) + if self.encoder_cudagraph_verbose: + logger.info( + f"ViT input: grid_thw=({t}, {h}, {w}), " + f"input_patches={num_input_patches}, " + f"output_tokens={num_output_tokens}" + ) # Try exact match first via run() - counts hits internally output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) if output is not None: - logger.info( - f"ViT CUDA graph EXACT: grid=({t}, {h}, {w}), " - f"output={output.shape}" - ) + if self.encoder_cudagraph_verbose: + logger.info( + f"ViT CUDA graph EXACT: grid=({t}, {h}, {w}), " + f"output={output.shape}" + ) return [output[:num_output_tokens]] # Try padded execution if enabled (run_padded counts hits internally) @@ -2488,18 +2500,20 @@ def _execute_with_encoder_cudagraph( ) if result is not None: output, padding_waste = result - logger.info( - f"ViT CUDA graph PADDED: grid=({t}, {h}, {w}), " - f"tokens={num_output_tokens}, waste={padding_waste}" - ) + if self.encoder_cudagraph_verbose: + logger.info( + f"ViT CUDA graph PADDED: grid=({t}, {h}, {w}), " + f"tokens={num_output_tokens}, waste={padding_waste}" + ) return [output] # No CUDA graph available - count the miss and fall back to eager mode self.encoder_cudagraph_manager.count_miss() - logger.info( - f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens} " - f"(padded_mode={self.encoder_cudagraph_padded_mode})" - ) + if self.encoder_cudagraph_verbose: + logger.info( + f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens} " + f"(padded_mode={self.encoder_cudagraph_padded_mode})" + ) return None def _gather_mm_embeddings( From 27a43c986e50c9dcf26f3f08771142fb3c5e838d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 10:44:33 -0500 Subject: [PATCH 042/189] process image one by one if multiple images are batched. 
--- vllm/v1/worker/gpu_model_runner.py | 69 ++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f803e9983ec9..6efd7008b261 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2364,25 +2364,58 @@ def _execute_mm_encoder( curr_group_outputs = curr_group_outputs_lst else: # Try to use CUDA graph if available - cudagraph_result = None - if self.encoder_cudagraph_manager is not None: - cudagraph_result = self._execute_with_encoder_cudagraph( - model, mm_kwargs_group, modality, num_items - ) - - if cudagraph_result is not None: - # CUDA graph was used successfully - curr_group_outputs = cudagraph_result + # When CUDA graphs are enabled and we have multiple items, + # process them one at a time since CUDA graphs only support + # single-image batches + if (self.encoder_cudagraph_manager is not None + and num_items > 1 + and modality in ("image", "video")): + # Process each image individually for CUDA graph support + curr_group_outputs_lst = [] + for mm_item in filter( + lambda item: item.modality == modality, mm_kwargs + ): + _, _, single_mm_inputs = next( + group_mm_kwargs_by_modality( + [mm_item], + device=self.device, + pin_memory=self.pin_memory, + ) + ) + # Try CUDA graph for this single image + single_result = self._execute_with_encoder_cudagraph( + model, single_mm_inputs, modality, 1 + ) + if single_result is not None: + curr_group_outputs_lst.extend(single_result) + else: + # Fall back to eager for this image + single_output = model.embed_multimodal( + **single_mm_inputs + ) + curr_group_outputs_lst.extend(single_output) + curr_group_outputs = curr_group_outputs_lst else: - # Fall back to eager mode. - # Run the encoder. - # `curr_group_outputs` is either of the following: - # 1. A tensor of shape (num_items, feature_size, hidden_size) - # in case feature_size is fixed across all multimodal items. - # 2. A list or tuple (length: num_items) of tensors, - # each of shape (feature_size, hidden_size) in case the feature - # size is dynamic depending on the input multimodal items. - curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + # Single item or no CUDA graph manager - try CUDA graph + cudagraph_result = None + if self.encoder_cudagraph_manager is not None: + cudagraph_result = self._execute_with_encoder_cudagraph( + model, mm_kwargs_group, modality, num_items + ) + + if cudagraph_result is not None: + # CUDA graph was used successfully + curr_group_outputs = cudagraph_result + else: + # Fall back to eager mode. + # Run the encoder. + # `curr_group_outputs` is either of the following: + # 1. A tensor of shape (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all multimodal items. + # 2. A list or tuple (length: num_items) of tensors, + # each of shape (feature_size, hidden_size) in case the feature + # size is dynamic depending on the input multimodal items. + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) sanity_check_mm_encoder_outputs( curr_group_outputs, From 05154eb78c61f389070f4e1bb6a9d09b1b61ec91 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 12:20:07 -0500 Subject: [PATCH 043/189] increment eager fallback count. 
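The dispatch introduced here, stripped of the mm_kwargs plumbing: every image is offered to the CUDA graph path on its own, and only the ones that miss fall back to eager. run_graph and run_eager below are placeholders, not vLLM APIs:

    def encode_items(items, run_graph, run_eager):
        outputs = []
        for item in items:
            out = run_graph(item)                    # None means no graph was available
            outputs.append(out if out is not None else run_eager(item))
        return outputs

    captured = {(1, 32, 32), (1, 64, 64)}

    def run_graph(grid):
        return f"graph{grid}" if grid in captured else None

    def run_eager(grid):
        return f"eager{grid}"

    print(encode_items([(1, 32, 32), (1, 50, 50)], run_graph, run_eager))
    # ['graph(1, 32, 32)', 'eager(1, 50, 50)']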
--- vllm/v1/worker/gpu_model_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6efd7008b261..c81113a430f8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2472,6 +2472,7 @@ def _execute_with_encoder_cudagraph( if grid_thw is None: grid_thw = mm_kwargs_group.get("video_grid_thw") if grid_thw is None: + self.encoder_cudagraph_manager.count_miss() return None # Convert to list if tensor @@ -2484,6 +2485,7 @@ def _execute_with_encoder_cudagraph( "Encoder CUDA graph only supports single-image batches, " f"got {len(grid_thw)} images. Using eager mode." ) + self.encoder_cudagraph_manager.count_miss() return None # Extract pixel_values @@ -2494,6 +2496,7 @@ def _execute_with_encoder_cudagraph( if pixel_values is None: logger.debug("No pixel_values found in kwargs. Using eager mode.") + self.encoder_cudagraph_manager.count_miss() return None # Ensure pixel_values is on the correct device From f7c9c08eb88a125d52fa6f9a71904497cb8adbe5 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 12:20:36 -0500 Subject: [PATCH 044/189] remove duplicate counts of miss and eager. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index af6f4f21499c..104bfd9af6bf 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -180,7 +180,6 @@ def __init__( # Statistics self.cache_hits = 0 - self.cache_misses = 0 self.eager_fallbacks = 0 def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: @@ -738,12 +737,12 @@ def run_padded( return trimmed_output, padding_waste def count_miss(self) -> None: - """Count a cache miss when falling back to eager mode. + """Count when falling back to eager mode. This should be called by the caller when neither run() nor run_padded() succeeded and eager execution is used. """ - self.cache_misses += 1 + self.eager_fallbacks += 1 def get_stats(self, verbose: bool = True) -> dict[str, Any]: """Get and optionally log cache statistics. @@ -751,11 +750,10 @@ def get_stats(self, verbose: bool = True) -> dict[str, Any]: Args: verbose: If True, log stats to INFO level. If False, only return stats dict. """ - total = self.cache_hits + self.cache_misses + self.eager_fallbacks + total = self.cache_hits + self.eager_fallbacks hit_rate = self.cache_hits / total if total > 0 else 0.0 stats = { "cache_hits": self.cache_hits, - "cache_misses": self.cache_misses, "eager_fallbacks": self.eager_fallbacks, "hit_rate": hit_rate, "num_graphs": len(self.graphs), @@ -764,9 +762,8 @@ def get_stats(self, verbose: bool = True) -> dict[str, Any]: if verbose: logger.info( f"Encoder CUDA graph stats: " - f"hits={self.cache_hits}, misses={self.cache_misses}, " - f"eager={self.eager_fallbacks}, hit_rate={hit_rate:.1%}, " - f"num_graphs={len(self.graphs)}" + f"hits={self.cache_hits}, eager={self.eager_fallbacks}, " + f"hit_rate={hit_rate:.1%}, num_graphs={len(self.graphs)}" ) return stats From d98928c2d923166fe70bac441dc586cb2c85a462 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 12:51:07 -0500 Subject: [PATCH 045/189] use non_blocking copy. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 31 +++++++++++++--------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 104bfd9af6bf..a5f8b9f89ed1 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -609,18 +609,20 @@ def run( self.cache_hits += 1 - # Copy input to the captured buffer - input_buffer.copy_(pixel_values) + # Copy input to the captured buffer (non-blocking for better overlap) + input_buffer.copy_(pixel_values, non_blocking=True) # For exact match, restore cached embeddings (may have been modified by run_padded) if grid_key in self.embedding_buffers and grid_key in self.cached_tensors: embed_buffers = self.embedding_buffers[grid_key] cached = self.cached_tensors[grid_key] - embed_buffers["pos_embeds"].copy_(cached["pos_embeds"]) - embed_buffers["rotary_pos_emb_cos"].copy_(cached["rotary_pos_emb_cos"]) - embed_buffers["rotary_pos_emb_sin"].copy_(cached["rotary_pos_emb_sin"]) - embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"]) - embed_buffers["max_seqlen"].copy_(cached["max_seqlen"]) + embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) + embed_buffers["rotary_pos_emb_cos"].copy_( + cached["rotary_pos_emb_cos"], non_blocking=True) + embed_buffers["rotary_pos_emb_sin"].copy_( + cached["rotary_pos_emb_sin"], non_blocking=True) + embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) + embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) # Replay the graph self.graphs[grid_key].replay() @@ -708,19 +710,22 @@ def run_padded( embed_buffers["rotary_pos_emb_sin"].zero_() # Copy actual pixel values to the beginning of the buffer - input_buffer[:num_input_patches].copy_(pixel_values) + input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=True) # Copy actual embeddings to the beginning of the buffers (pad with zeros) actual_num_patches = actual_embeds["pos_embeds"].shape[0] - embed_buffers["pos_embeds"][:actual_num_patches].copy_(actual_embeds["pos_embeds"]) - embed_buffers["rotary_pos_emb_cos"][:actual_num_patches].copy_(actual_embeds["rotary_pos_emb_cos"]) - embed_buffers["rotary_pos_emb_sin"][:actual_num_patches].copy_(actual_embeds["rotary_pos_emb_sin"]) + embed_buffers["pos_embeds"][:actual_num_patches].copy_( + actual_embeds["pos_embeds"], non_blocking=True) + embed_buffers["rotary_pos_emb_cos"][:actual_num_patches].copy_( + actual_embeds["rotary_pos_emb_cos"], non_blocking=True) + embed_buffers["rotary_pos_emb_sin"][:actual_num_patches].copy_( + actual_embeds["rotary_pos_emb_sin"], non_blocking=True) # Update cu_seqlens and max_seqlen to actual values # cu_seqlens shape is [num_images + 1], for single image it's [2]: [0, num_patches] # We copy the actual values so flash attention processes only the real tokens - embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"]) - embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"]) + embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=True) + embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=True) # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() From 0a14bf4dd5a0d4263b9168e8e02e5343a520a05c Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 12:51:27 -0500 Subject: [PATCH 046/189] slice from batched data to reduce copy overhead. 
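Why the non_blocking copies are safe here: the buffer updates and graph.replay() are enqueued on the same CUDA stream, so replay observes the new contents without an explicit synchronize, and for host-to-device copies non_blocking only actually overlaps when the source is pinned. Small sketch of the ordering (guarded so it is a no-op without a GPU):

    import torch

    if torch.cuda.is_available():
        buf = torch.zeros(4, device="cuda")
        src = torch.ones(4, pin_memory=True)   # pinned host tensor
        buf.copy_(src, non_blocking=True)      # async H2D copy, ordered on the stream
        # graph.replay() would be enqueued next on the same stream
        torch.cuda.current_stream().synchronize()
        print(buf)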
--- vllm/v1/worker/gpu_model_runner.py | 77 +++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c81113a430f8..43cfdad04c7e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2371,30 +2371,63 @@ def _execute_mm_encoder( and num_items > 1 and modality in ("image", "video")): # Process each image individually for CUDA graph support + # Extract batched data and slice per-image to avoid + # re-calling group_mm_kwargs_by_modality overhead curr_group_outputs_lst = [] - for mm_item in filter( - lambda item: item.modality == modality, mm_kwargs - ): - _, _, single_mm_inputs = next( - group_mm_kwargs_by_modality( - [mm_item], - device=self.device, - pin_memory=self.pin_memory, - ) - ) - # Try CUDA graph for this single image - single_result = self._execute_with_encoder_cudagraph( - model, single_mm_inputs, modality, 1 - ) - if single_result is not None: - curr_group_outputs_lst.extend(single_result) - else: - # Fall back to eager for this image - single_output = model.embed_multimodal( - **single_mm_inputs + + # Get batched pixel_values and grid_thw + if modality == "image": + batched_pixel_values = mm_kwargs_group.get("pixel_values") + grid_thw_list = mm_kwargs_group.get("image_grid_thw") + grid_key = "image_grid_thw" + pixel_key = "pixel_values" + else: # video + batched_pixel_values = mm_kwargs_group.get( + "pixel_values_videos") + grid_thw_list = mm_kwargs_group.get("video_grid_thw") + grid_key = "video_grid_thw" + pixel_key = "pixel_values_videos" + + if batched_pixel_values is not None and grid_thw_list is not None: + # Convert grid_thw to list if tensor + if hasattr(grid_thw_list, "tolist"): + grid_thw_list = grid_thw_list.tolist() + + # Calculate patch boundaries for slicing + patch_offset = 0 + for grid_thw in grid_thw_list: + t, h, w = grid_thw + num_patches = t * h * w + + # Slice pixel_values for this image + single_pixel_values = batched_pixel_values[ + patch_offset:patch_offset + num_patches] + patch_offset += num_patches + + # Build single-image kwargs + single_mm_inputs = { + pixel_key: single_pixel_values, + grid_key: [grid_thw], + } + + # Try CUDA graph for this single image + single_result = self._execute_with_encoder_cudagraph( + model, single_mm_inputs, modality, 1 ) - curr_group_outputs_lst.extend(single_output) - curr_group_outputs = curr_group_outputs_lst + if single_result is not None: + curr_group_outputs_lst.extend(single_result) + else: + # Fall back to eager for this image + single_output = model.embed_multimodal( + **single_mm_inputs + ) + curr_group_outputs_lst.extend(single_output) + + curr_group_outputs = curr_group_outputs_lst + else: + # Fallback to eager if data extraction fails + curr_group_outputs = model.embed_multimodal( + **mm_kwargs_group) else: # Single item or no CUDA graph manager - try CUDA graph cudagraph_result = None From 36926b19b20160b90376348059a084743c00d698 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 13:02:19 -0500 Subject: [PATCH 047/189] debug segfault. 
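The checks added below are defensive: copying a tensor with a mismatched device, dtype, or layout into a graph-captured static buffer is an easy way to corrupt memory that the replayed kernels will later read. Minimal sketch of the guard pattern only (the helper name is hypothetical and not part of vLLM):

    import torch

    def try_fill(static_buf: torch.Tensor, x: torch.Tensor) -> bool:
        # Returning False tells the caller to fall back to eager execution.
        if x.device != static_buf.device or x.dtype != static_buf.dtype:
            return False
        if x.shape != static_buf.shape:
            return False
        static_buf.copy_(x.contiguous(), non_blocking=True)
        return True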
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 42 ++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 5 +-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index a5f8b9f89ed1..f991fb8d9995 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -607,8 +607,29 @@ def run( self.eager_fallbacks += 1 return None + # Verify device and dtype match + if pixel_values.device != input_buffer.device: + logger.warning( + f"Device mismatch: expected {input_buffer.device}, " + f"got {pixel_values.device}. Falling back to eager mode." + ) + self.eager_fallbacks += 1 + return None + + if pixel_values.dtype != input_buffer.dtype: + logger.warning( + f"Dtype mismatch: expected {input_buffer.dtype}, " + f"got {pixel_values.dtype}. Falling back to eager mode." + ) + self.eager_fallbacks += 1 + return None + self.cache_hits += 1 + # Ensure contiguous memory layout for safe copy + if not pixel_values.is_contiguous(): + pixel_values = pixel_values.contiguous() + # Copy input to the captured buffer (non-blocking for better overlap) input_buffer.copy_(pixel_values, non_blocking=True) @@ -694,6 +715,27 @@ def run_padded( self.eager_fallbacks += 1 return None + # Verify device and dtype match + if pixel_values.device != input_buffer.device: + logger.warning( + f"Device mismatch: expected {input_buffer.device}, " + f"got {pixel_values.device}. Falling back to eager mode." + ) + self.eager_fallbacks += 1 + return None + + if pixel_values.dtype != input_buffer.dtype: + logger.warning( + f"Dtype mismatch: expected {input_buffer.dtype}, " + f"got {pixel_values.dtype}. Falling back to eager mode." + ) + self.eager_fallbacks += 1 + return None + + # Ensure contiguous memory layout for safe copy + if not pixel_values.is_contiguous(): + pixel_values = pixel_values.contiguous() + self.cache_hits += 1 # === KEY FIX: Compute embeddings for ACTUAL grid, then pad === diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 43cfdad04c7e..2d0bd92eb23f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2532,8 +2532,9 @@ def _execute_with_encoder_cudagraph( self.encoder_cudagraph_manager.count_miss() return None - # Ensure pixel_values is on the correct device - pixel_values = pixel_values.to(device=self.device, dtype=self.dtype) + # Ensure pixel_values is on the correct device and contiguous + # Contiguity is important for CUDA graph replay to avoid memory issues + pixel_values = pixel_values.to(device=self.device, dtype=self.dtype).contiguous() # Get spatial merge size for token calculations spatial_merge_size = getattr(model.visual, 'spatial_merge_size', 2) From 2e8af6e8a030e187a050908287588e470bd54b8f Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 13:31:17 -0500 Subject: [PATCH 048/189] pass grid_thw as tensor not list. 
--- vllm/v1/worker/gpu_model_runner.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2d0bd92eb23f..e095daddda43 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2404,22 +2404,31 @@ def _execute_mm_encoder( patch_offset:patch_offset + num_patches] patch_offset += num_patches - # Build single-image kwargs - single_mm_inputs = { + # Build single-image kwargs for CUDA graph (list format) + single_mm_inputs_for_cudagraph = { pixel_key: single_pixel_values, grid_key: [grid_thw], } # Try CUDA graph for this single image single_result = self._execute_with_encoder_cudagraph( - model, single_mm_inputs, modality, 1 + model, single_mm_inputs_for_cudagraph, modality, 1 ) if single_result is not None: curr_group_outputs_lst.extend(single_result) else: # Fall back to eager for this image + # Model expects grid_thw as tensor, not list + single_mm_inputs_for_eager = { + pixel_key: single_pixel_values, + grid_key: torch.tensor( + [grid_thw], + dtype=torch.int64, + device=self.device, + ), + } single_output = model.embed_multimodal( - **single_mm_inputs + **single_mm_inputs_for_eager ) curr_group_outputs_lst.extend(single_output) From a62d1312191c3cf53a99eda0ad31915dd4877575 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 13:36:34 -0500 Subject: [PATCH 049/189] debug segfault. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index f991fb8d9995..c8e0835e2c0f 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -645,6 +645,9 @@ def run( embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) + # Sync before replay to ensure all copies are complete (debug) + torch.cuda.synchronize() + # Replay the graph self.graphs[grid_key].replay() @@ -769,6 +772,9 @@ def run_padded( embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=True) embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=True) + # Sync before replay to ensure all copies are complete (debug) + torch.cuda.synchronize() + # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() From 18aab182ddbbf392e6cb09e919cc2d13b5b78712 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 13:37:05 -0500 Subject: [PATCH 050/189] add log for replay. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 17 +++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 10 +++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index c8e0835e2c0f..b9f4f63e0ffd 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -648,9 +648,17 @@ def run( # Sync before replay to ensure all copies are complete (debug) torch.cuda.synchronize() + # Debug: log before replay to identify crash pattern + logger.info( + f"DEBUG run(): About to replay graph for grid_key={grid_key}, " + f"input_shape={pixel_values.shape}, buffer_shape={input_buffer.shape}" + ) + # Replay the graph self.graphs[grid_key].replay() + logger.info(f"DEBUG run(): Replay completed for grid_key={grid_key}") + # Return a clone of the output to avoid issues with buffer reuse return self.output_buffers[grid_key].clone() @@ -775,9 +783,18 @@ def run_padded( # Sync before replay to ensure all copies are complete (debug) torch.cuda.synchronize() + # Debug: log before replay to identify crash pattern + logger.info( + f"DEBUG run_padded(): About to replay graph for bucket_grid={bucket_grid}, " + f"actual_grid={grid_thw[0]}, input_patches={num_input_patches}, " + f"bucket_patches={bucket_input_patches}" + ) + # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() + logger.info(f"DEBUG run_padded(): Replay completed for bucket_grid={bucket_grid}") + # Get output and trim to actual size full_output = self.output_buffers[bucket_grid] trimmed_output = full_output[:num_output_tokens].clone() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e095daddda43..b7e18d3dacd6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2395,7 +2395,15 @@ def _execute_mm_encoder( # Calculate patch boundaries for slicing patch_offset = 0 - for grid_thw in grid_thw_list: + logger.info( + f"DEBUG: Processing {len(grid_thw_list)} images " + f"one-at-a-time, grids={grid_thw_list}" + ) + for img_idx, grid_thw in enumerate(grid_thw_list): + logger.info( + f"DEBUG: Processing image {img_idx+1}/{len(grid_thw_list)}, " + f"grid={grid_thw}" + ) t, h, w = grid_thw num_patches = t * h * w From 046127dfe76504bf12c01f91682a411263786750 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 14:00:12 -0500 Subject: [PATCH 051/189] make grid_thw cpu tensor. --- vllm/v1/worker/gpu_model_runner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b7e18d3dacd6..16f5a6929a40 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2426,14 +2426,13 @@ def _execute_mm_encoder( curr_group_outputs_lst.extend(single_result) else: # Fall back to eager for this image - # Model expects grid_thw as tensor, not list + # Model expects grid_thw as CPU tensor (it calls .numpy()) single_mm_inputs_for_eager = { pixel_key: single_pixel_values, grid_key: torch.tensor( [grid_thw], dtype=torch.int64, - device=self.device, - ), + ), # Keep on CPU } single_output = model.embed_multimodal( **single_mm_inputs_for_eager From c5021d1ca547bed8adc824c066d48e7d17e42cc4 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 14:33:46 -0500 Subject: [PATCH 052/189] debug segfault by disabling multiple image batch processing. 
--- vllm/v1/worker/gpu_model_runner.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 16f5a6929a40..e38d65d0900d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2367,9 +2367,15 @@ def _execute_mm_encoder( # When CUDA graphs are enabled and we have multiple items, # process them one at a time since CUDA graphs only support # single-image batches + # Set VLLM_DISABLE_ENCODER_ONEBYONE=1 to disable one-at-a-time + # processing for debugging + import os + disable_onebyone = os.environ.get( + "VLLM_DISABLE_ENCODER_ONEBYONE", "0") == "1" if (self.encoder_cudagraph_manager is not None and num_items > 1 - and modality in ("image", "video")): + and modality in ("image", "video") + and not disable_onebyone): # Process each image individually for CUDA graph support # Extract batched data and slice per-image to avoid # re-calling group_mm_kwargs_by_modality overhead @@ -2448,6 +2454,9 @@ def _execute_mm_encoder( # Single item or no CUDA graph manager - try CUDA graph cudagraph_result = None if self.encoder_cudagraph_manager is not None: + logger.info( + f"DEBUG: Processing single item, modality={modality}" + ) cudagraph_result = self._execute_with_encoder_cudagraph( model, mm_kwargs_group, modality, num_items ) From 86efb8af0878aa82074596186953e3bc0f65a4cb Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 14:49:59 -0500 Subject: [PATCH 053/189] disable non_blocking copy. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index b9f4f63e0ffd..29d3862dac48 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -631,19 +631,19 @@ def run( pixel_values = pixel_values.contiguous() # Copy input to the captured buffer (non-blocking for better overlap) - input_buffer.copy_(pixel_values, non_blocking=True) + input_buffer.copy_(pixel_values, non_blocking=False) # For exact match, restore cached embeddings (may have been modified by run_padded) if grid_key in self.embedding_buffers and grid_key in self.cached_tensors: embed_buffers = self.embedding_buffers[grid_key] cached = self.cached_tensors[grid_key] - embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) + embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=False) embed_buffers["rotary_pos_emb_cos"].copy_( - cached["rotary_pos_emb_cos"], non_blocking=True) + cached["rotary_pos_emb_cos"], non_blocking=False) embed_buffers["rotary_pos_emb_sin"].copy_( - cached["rotary_pos_emb_sin"], non_blocking=True) - embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) - embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) + cached["rotary_pos_emb_sin"], non_blocking=False) + embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=False) + embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=False) # Sync before replay to ensure all copies are complete (debug) torch.cuda.synchronize() @@ -763,22 +763,22 @@ def run_padded( embed_buffers["rotary_pos_emb_sin"].zero_() # Copy actual pixel values to the beginning of the buffer - input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=True) + input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=False) # Copy 
actual embeddings to the beginning of the buffers (pad with zeros) actual_num_patches = actual_embeds["pos_embeds"].shape[0] embed_buffers["pos_embeds"][:actual_num_patches].copy_( - actual_embeds["pos_embeds"], non_blocking=True) + actual_embeds["pos_embeds"], non_blocking=False) embed_buffers["rotary_pos_emb_cos"][:actual_num_patches].copy_( - actual_embeds["rotary_pos_emb_cos"], non_blocking=True) + actual_embeds["rotary_pos_emb_cos"], non_blocking=False) embed_buffers["rotary_pos_emb_sin"][:actual_num_patches].copy_( - actual_embeds["rotary_pos_emb_sin"], non_blocking=True) + actual_embeds["rotary_pos_emb_sin"], non_blocking=False) # Update cu_seqlens and max_seqlen to actual values # cu_seqlens shape is [num_images + 1], for single image it's [2]: [0, num_patches] # We copy the actual values so flash attention processes only the real tokens - embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=True) - embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=True) + embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=False) + embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=False) # Sync before replay to ensure all copies are complete (debug) torch.cuda.synchronize() From 53f0cd8fa8cc0092c735d62ceb8b145b16fcb6b3 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 15:58:19 -0500 Subject: [PATCH 054/189] debug cuda graph mem pool. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 29d3862dac48..f5a8532a99d7 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -154,7 +154,13 @@ def __init__( self.graphs: dict[tuple[int, int, int], torch.cuda.CUDAGraph] = {} # Use provided pool or create a dedicated encoder pool # Using a separate pool from decoder allows independent memory management - self.pool = graph_pool if graph_pool is not None else torch.cuda.graph_pool_handle() + # Set VLLM_ENCODER_NO_POOL=1 to disable shared pool (debug for segfault) + import os + if os.environ.get("VLLM_ENCODER_NO_POOL", "0") == "1": + self.pool = None # Each graph uses private memory + logger.info("Encoder CUDA graphs: using private pools (no shared pool)") + else: + self.pool = graph_pool if graph_pool is not None else torch.cuda.graph_pool_handle() # Pre-allocated input/output buffers per grid config # Key: (t, h, w), Value: {"pixel_values": tensor, "grid_thw": list} From 936c90271cbdcb564955638a3d5a989b497d86b4 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 17:02:15 -0500 Subject: [PATCH 055/189] make private pool the default. 
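For context, this only changes which memory pool captured graphs allocate their intermediates from: a shared pool handle saves memory but couples the graphs' allocations, while pool=None gives every capture its own private pool. Minimal sketch of the two modes in plain PyTorch (shapes are arbitrary; this is not the vLLM capture code):

    import torch

    x = torch.zeros(16, device="cuda")

    # Shared pool: both captures draw intermediate allocations from one handle.
    shared = torch.cuda.graph_pool_handle()
    g1, g2 = torch.cuda.CUDAGraph(), torch.cuda.CUDAGraph()
    with torch.cuda.graph(g1, pool=shared):
        y1 = x * 2
    with torch.cuda.graph(g2, pool=shared):
        y2 = x + 1

    # Private pool: pass pool=None so the capture owns its memory,
    # trading extra GPU memory for isolation between replays.
    g3 = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g3, pool=None):
        y3 = x - 1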
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index f5a8532a99d7..689f885dbed1 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -152,15 +152,16 @@ def __init__( # CUDA graph storage - keyed by (t, h, w) tuple self.graphs: dict[tuple[int, int, int], torch.cuda.CUDAGraph] = {} - # Use provided pool or create a dedicated encoder pool - # Using a separate pool from decoder allows independent memory management - # Set VLLM_ENCODER_NO_POOL=1 to disable shared pool (debug for segfault) + # Use private pools by default to avoid segfaults with rapid back-to-back + # graph replays during one-by-one multi-image processing. + # Set VLLM_ENCODER_SHARED_POOL=1 to use shared pool (saves memory but + # may cause issues with rapid replays) import os - if os.environ.get("VLLM_ENCODER_NO_POOL", "0") == "1": - self.pool = None # Each graph uses private memory - logger.info("Encoder CUDA graphs: using private pools (no shared pool)") - else: + if os.environ.get("VLLM_ENCODER_SHARED_POOL", "0") == "1": self.pool = graph_pool if graph_pool is not None else torch.cuda.graph_pool_handle() + logger.info("Encoder CUDA graphs: using shared pool") + else: + self.pool = None # Each graph uses private memory (default) # Pre-allocated input/output buffers per grid config # Key: (t, h, w), Value: {"pixel_values": tensor, "grid_thw": list} From eb46592094387cdf26df8cf6b518cb72e8ea64f6 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 17:05:28 -0500 Subject: [PATCH 056/189] make debug message controlled through verbose cli. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 34 +++++++++------------- vllm/v1/worker/gpu_model_runner.py | 23 ++++----------- 2 files changed, 19 insertions(+), 38 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 689f885dbed1..d6035842440f 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -110,10 +110,12 @@ def __init__( bucket_sizes: list[int] | None = None, grid_configs: list[tuple[int, int, int]] | None = None, graph_pool: Any | None = None, + verbose: bool = False, ): self.vllm_config = vllm_config self.device = device self.dtype = dtype + self.verbose = verbose # Get grid configs from config or use defaults (for exact match) if grid_configs is None: @@ -652,20 +654,15 @@ def run( embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=False) embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=False) - # Sync before replay to ensure all copies are complete (debug) - torch.cuda.synchronize() - - # Debug: log before replay to identify crash pattern - logger.info( - f"DEBUG run(): About to replay graph for grid_key={grid_key}, " - f"input_shape={pixel_values.shape}, buffer_shape={input_buffer.shape}" - ) + if self.verbose: + logger.info( + f"run(): grid_key={grid_key}, " + f"input_shape={pixel_values.shape}, buffer_shape={input_buffer.shape}" + ) # Replay the graph self.graphs[grid_key].replay() - logger.info(f"DEBUG run(): Replay completed for grid_key={grid_key}") - # Return a clone of the output to avoid issues with buffer reuse return self.output_buffers[grid_key].clone() @@ -787,21 +784,16 @@ def run_padded( embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=False) 
embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=False) - # Sync before replay to ensure all copies are complete (debug) - torch.cuda.synchronize() - - # Debug: log before replay to identify crash pattern - logger.info( - f"DEBUG run_padded(): About to replay graph for bucket_grid={bucket_grid}, " - f"actual_grid={grid_thw[0]}, input_patches={num_input_patches}, " - f"bucket_patches={bucket_input_patches}" - ) + if self.verbose: + logger.info( + f"run_padded(): bucket_grid={bucket_grid}, " + f"actual_grid={grid_thw[0]}, input_patches={num_input_patches}, " + f"bucket_patches={bucket_input_patches}" + ) # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() - logger.info(f"DEBUG run_padded(): Replay completed for bucket_grid={bucket_grid}") - # Get output and trim to actual size full_output = self.output_buffers[bucket_grid] trimmed_output = full_output[:num_output_tokens].clone() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e38d65d0900d..0dc43b411a26 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -728,6 +728,7 @@ def _init_encoder_cudagraph_manager(self) -> None: dtype=self.dtype, bucket_sizes=bucket_sizes, graph_pool=encoder_graph_pool, + verbose=self.encoder_cudagraph_verbose, ) # Log configuration @@ -2367,15 +2368,9 @@ def _execute_mm_encoder( # When CUDA graphs are enabled and we have multiple items, # process them one at a time since CUDA graphs only support # single-image batches - # Set VLLM_DISABLE_ENCODER_ONEBYONE=1 to disable one-at-a-time - # processing for debugging - import os - disable_onebyone = os.environ.get( - "VLLM_DISABLE_ENCODER_ONEBYONE", "0") == "1" if (self.encoder_cudagraph_manager is not None and num_items > 1 - and modality in ("image", "video") - and not disable_onebyone): + and modality in ("image", "video")): # Process each image individually for CUDA graph support # Extract batched data and slice per-image to avoid # re-calling group_mm_kwargs_by_modality overhead @@ -2401,15 +2396,12 @@ def _execute_mm_encoder( # Calculate patch boundaries for slicing patch_offset = 0 - logger.info( - f"DEBUG: Processing {len(grid_thw_list)} images " - f"one-at-a-time, grids={grid_thw_list}" - ) - for img_idx, grid_thw in enumerate(grid_thw_list): + if self.encoder_cudagraph_verbose: logger.info( - f"DEBUG: Processing image {img_idx+1}/{len(grid_thw_list)}, " - f"grid={grid_thw}" + f"Processing {len(grid_thw_list)} images " + f"one-at-a-time, grids={grid_thw_list}" ) + for img_idx, grid_thw in enumerate(grid_thw_list): t, h, w = grid_thw num_patches = t * h * w @@ -2454,9 +2446,6 @@ def _execute_mm_encoder( # Single item or no CUDA graph manager - try CUDA graph cudagraph_result = None if self.encoder_cudagraph_manager is not None: - logger.info( - f"DEBUG: Processing single item, modality={modality}" - ) cudagraph_result = self._execute_with_encoder_cudagraph( model, mm_kwargs_group, modality, num_items ) From 4d1752240b6a5a12de41c038beabda48a7a29994 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 17:06:11 -0500 Subject: [PATCH 057/189] use non blocking copy. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index d6035842440f..39e18ae2a668 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -640,19 +640,19 @@ def run( pixel_values = pixel_values.contiguous() # Copy input to the captured buffer (non-blocking for better overlap) - input_buffer.copy_(pixel_values, non_blocking=False) + input_buffer.copy_(pixel_values, non_blocking=True) # For exact match, restore cached embeddings (may have been modified by run_padded) if grid_key in self.embedding_buffers and grid_key in self.cached_tensors: embed_buffers = self.embedding_buffers[grid_key] cached = self.cached_tensors[grid_key] - embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=False) + embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) embed_buffers["rotary_pos_emb_cos"].copy_( - cached["rotary_pos_emb_cos"], non_blocking=False) + cached["rotary_pos_emb_cos"], non_blocking=True) embed_buffers["rotary_pos_emb_sin"].copy_( - cached["rotary_pos_emb_sin"], non_blocking=False) - embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=False) - embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=False) + cached["rotary_pos_emb_sin"], non_blocking=True) + embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) + embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) if self.verbose: logger.info( @@ -767,22 +767,22 @@ def run_padded( embed_buffers["rotary_pos_emb_sin"].zero_() # Copy actual pixel values to the beginning of the buffer - input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=False) + input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=True) # Copy actual embeddings to the beginning of the buffers (pad with zeros) actual_num_patches = actual_embeds["pos_embeds"].shape[0] embed_buffers["pos_embeds"][:actual_num_patches].copy_( - actual_embeds["pos_embeds"], non_blocking=False) + actual_embeds["pos_embeds"], non_blocking=True) embed_buffers["rotary_pos_emb_cos"][:actual_num_patches].copy_( - actual_embeds["rotary_pos_emb_cos"], non_blocking=False) + actual_embeds["rotary_pos_emb_cos"], non_blocking=True) embed_buffers["rotary_pos_emb_sin"][:actual_num_patches].copy_( - actual_embeds["rotary_pos_emb_sin"], non_blocking=False) + actual_embeds["rotary_pos_emb_sin"], non_blocking=True) # Update cu_seqlens and max_seqlen to actual values # cu_seqlens shape is [num_images + 1], for single image it's [2]: [0, num_patches] # We copy the actual values so flash attention processes only the real tokens - embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=False) - embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=False) + embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=True) + embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=True) if self.verbose: logger.info( From a1ac7b91b793bfea5e9a64a3681fc47a9aac0e82 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 18:01:49 -0500 Subject: [PATCH 058/189] sync stream before replay. 
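The intent is the usual capture/replay contract: every write into the graph's static input buffers must have completed (or at least be ordered on the replaying stream) before replay() is issued, otherwise the captured kernels can read partially written data. Condensed sketch of the sequence this patch enforces, using placeholder tensors rather than the actual encoder buffers:

    import torch

    g = torch.cuda.CUDAGraph()
    static_input = torch.zeros(1024, 1176, device="cuda")
    with torch.cuda.graph(g):                       # capture runs on a side stream
        static_output = static_input.sum(dim=-1)    # stand-in for the captured encoder

    static_input.copy_(torch.randn_like(static_input), non_blocking=True)
    torch.cuda.current_stream().synchronize()       # copies must land before replay
    g.replay()
    out = static_output.clone()                     # snapshot before buffers are reused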
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 39e18ae2a668..3690b000f6ae 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -791,6 +791,13 @@ def run_padded( f"bucket_patches={bucket_input_patches}" ) + # Synchronize before replay to ensure precompute_for_cudagraph() and all + # non-blocking copies have completed. This is necessary because we're + # running fresh GPU operations (precompute) before replaying the graph, + # unlike run() which only copies from pre-existing cached tensors. + # Without this sync, rapid back-to-back calls can cause memory corruption. + torch.cuda.current_stream().synchronize() + # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() From a1ddd01dec843f3641b66ebd85628f02912a9681 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 18:31:49 -0500 Subject: [PATCH 059/189] adjust grid config. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 90 +++++++++++----------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 3690b000f6ae..6d2d10be4d4a 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -46,42 +46,42 @@ logger = init_logger(__name__) # Grid configurations for CUDA graph capture (T, H, W in patch units) -# Top 30 most common grids (58.9% exact match coverage) -# With dedicated encoder graph pool, we can capture more grids without -# competing with decoder CUDA graphs for memory +# +# Strategy: Prioritize small grids where kernel launch overhead dominates. +# For larger grids, computation time dominates and CUDA graph benefit is minimal. +# +# Grids larger than max_grid_size (default 96) should use padded mode or eager. CUSTOM_GRID_CONFIGS = [ - (1, 62, 62), - (1, 94, 94), - (1, 50, 50), - (1, 32, 32), - (1, 124, 124), - (1, 76, 76), - (1, 100, 100), - (1, 64, 64), - (1, 38, 38), - (1, 188, 188), - (1, 68, 68), - (1, 128, 128), - (1, 44, 44), - (1, 250, 250), - (1, 256, 256), - (1, 42, 42), - (1, 24, 24), - (1, 160, 160), - (1, 46, 46), - (1, 80, 80), - (1, 112, 112), - (1, 16, 16), - (1, 56, 56), - (1, 208, 312), - (1, 188, 252), - (1, 156, 156), - (1, 40, 40), - (1, 252, 188), - (1, 120, 120), - (1, 218, 218), + # === Tier 1: Very small grids (<=32) === + (1, 16, 16), # 256 patches + (1, 24, 24), # 576 patches + (1, 32, 32), # 1024 patches + + # === Tier 2: Small grids (33-50) === + (1, 38, 38), # 1444 patches + (1, 40, 40), # 1600 patches + (1, 42, 42), # 1764 patches + (1, 44, 44), # 1936 patches + (1, 46, 46), # 2116 patches + (1, 50, 50), # 2500 patches + + # === Tier 3: Medium-small grids (51-70) === + (1, 56, 56), # 3136 patches + (1, 62, 62), # 3844 patches + (1, 64, 64), # 4096 patches + (1, 68, 68), # 4624 patches + + # === Tier 4: Medium grids (71-96) === + (1, 76, 76), # 5776 patches + (1, 80, 80), # 6400 patches + (1, 94, 94), # 8836 patches ] +# Default bucket sizes for padded mode (creates square grids) +# These cover medium-large grids that are too big for exact match capture +# but still benefit from CUDA graphs via padding. 
+DEFAULT_PADDED_BUCKET_SIZES = [100, 128] + class EncoderCudaGraphManager: """ @@ -219,37 +219,39 @@ def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: return CUSTOM_GRID_CONFIGS def _get_bucket_sizes_from_config(self) -> list[int]: - """Get encoder CUDA graph bucket sizes from config.""" + """Get encoder CUDA graph bucket sizes from config. + + Bucket sizes enable padded mode for grids that don't have exact matches. + Default buckets (100, 128) cover medium-large grids efficiently. + """ compilation_config = self.vllm_config.compilation_config if compilation_config is None: - return [] + return DEFAULT_PADDED_BUCKET_SIZES encoder_sizes = getattr( compilation_config, 'encoder_cudagraph_bucket_sizes', None ) - return encoder_sizes if encoder_sizes is not None else [] + return encoder_sizes if encoder_sizes is not None else DEFAULT_PADDED_BUCKET_SIZES def _get_max_grid_size_from_config(self) -> int: """Get maximum grid size for encoder CUDA graph capture. - Large grids (e.g., 256x256+) consume massive GPU memory per graph: - - 128x128: ~3.5 GiB - - 188x188: ~7.7 GiB - - 256x256: ~14 GiB - - 512x512: ~57 GiB + Large grids consume massive GPU memory per graph and provide minimal + benefit since computation time dominates over launch overhead. - Default is 128 to allow capturing ~15-20 useful grids on typical hardware. + Default is 96 to focus memory on small grids where benefit is highest. + Grids larger than this will use padded mode (if buckets configured) or eager. """ compilation_config = self.vllm_config.compilation_config if compilation_config is None: - return 128 # Conservative default + return 96 # Focus on small grids where benefit is highest max_size = getattr( compilation_config, 'encoder_cudagraph_max_grid_size', - 128 # Default: max 128x128 grids + 96 # Default: max 96x96 grids for exact match ) return max_size From 864a17235bac588b460a23e9cba106af91a9f315 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 18:35:52 -0500 Subject: [PATCH 060/189] remove sync stream. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 6d2d10be4d4a..26ba677de410 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -793,13 +793,6 @@ def run_padded( f"bucket_patches={bucket_input_patches}" ) - # Synchronize before replay to ensure precompute_for_cudagraph() and all - # non-blocking copies have completed. This is necessary because we're - # running fresh GPU operations (precompute) before replaying the graph, - # unlike run() which only copies from pre-existing cached tensors. - # Without this sync, rapid back-to-back calls can cause memory corruption. - torch.cuda.current_stream().synchronize() - # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() From 0a8d84cd30f2fde4a5f667340461acebc1281a9d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 30 Jan 2026 18:37:00 -0500 Subject: [PATCH 061/189] skip padding mode when there's multiple images in the batch. 
--- vllm/v1/worker/gpu_model_runner.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0dc43b411a26..b6350135a288 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2417,8 +2417,11 @@ def _execute_mm_encoder( } # Try CUDA graph for this single image + # Skip padded mode to avoid segfaults with rapid + # back-to-back graph replays during one-by-one processing single_result = self._execute_with_encoder_cudagraph( - model, single_mm_inputs_for_cudagraph, modality, 1 + model, single_mm_inputs_for_cudagraph, modality, 1, + skip_padded_mode=True, ) if single_result is not None: curr_group_outputs_lst.extend(single_result) @@ -2490,6 +2493,7 @@ def _execute_with_encoder_cudagraph( mm_kwargs_group: dict, modality: str, num_items: int, + skip_padded_mode: bool = False, ) -> list[torch.Tensor] | None: """ Execute the encoder using CUDA graphs if a matching graph is available. @@ -2503,6 +2507,8 @@ def _execute_with_encoder_cudagraph( mm_kwargs_group: Batched multimodal kwargs modality: The modality type ("image" or "video") num_items: Number of items in the batch + skip_padded_mode: If True, skip padded mode even if enabled. + Used during one-by-one processing to avoid segfaults. Returns: List of encoder outputs if CUDA graph was used, None otherwise @@ -2575,7 +2581,8 @@ def _execute_with_encoder_cudagraph( return [output[:num_output_tokens]] # Try padded execution if enabled (run_padded counts hits internally) - if self.encoder_cudagraph_padded_mode: + # Skip padded mode during one-by-one processing to avoid segfaults + if self.encoder_cudagraph_padded_mode and not skip_padded_mode: result = self.encoder_cudagraph_manager.run_padded( pixel_values, grid_thw, From 85012a6bcb1071474041525ac01203fedad526ad Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sat, 31 Jan 2026 15:52:55 -0500 Subject: [PATCH 062/189] sync before run_padded replay. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 26ba677de410..febc1db89212 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -793,6 +793,11 @@ def run_padded( f"bucket_patches={bucket_input_patches}" ) + # Sync before replay: graph was captured on a separate stream, but buffer + # modifications (zero, copy) happen on the default stream. Without sync, + # replay may start before copies complete, reading zeros/partial data. + torch.cuda.synchronize() + # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() From a7040d83108915808fed75f3aa07c153d70a3c74 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sat, 31 Jan 2026 16:34:02 -0500 Subject: [PATCH 063/189] test pad mode for multi-image processing. 
--- vllm/v1/worker/gpu_model_runner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b6350135a288..ca6885b21822 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2417,11 +2417,8 @@ def _execute_mm_encoder( } # Try CUDA graph for this single image - # Skip padded mode to avoid segfaults with rapid - # back-to-back graph replays during one-by-one processing single_result = self._execute_with_encoder_cudagraph( model, single_mm_inputs_for_cudagraph, modality, 1, - skip_padded_mode=True, ) if single_result is not None: curr_group_outputs_lst.extend(single_result) From ed906f2b6b21cbc4c19eac5f8ee679f9d8ffa9a1 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sat, 31 Jan 2026 17:02:36 -0500 Subject: [PATCH 064/189] sync at the beginning of run_padded. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index febc1db89212..d96f46bc36d2 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -755,6 +755,11 @@ def run_padded( self.cache_hits += 1 + # Sync before modifying buffers: ensure any previous graph replay + # (from a prior call) has completed. Without this, we could zero/modify + # buffers while a previous replay is still reading them. + torch.cuda.synchronize() + # === KEY FIX: Compute embeddings for ACTUAL grid, then pad === # This ensures correct position embeddings for the actual input size actual_embeds = self.vision_encoder.precompute_for_cudagraph(grid_thw) From 661e2a3960d898df8303eb278b7fefdb0237ef8b Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sat, 31 Jan 2026 17:28:56 -0500 Subject: [PATCH 065/189] add sync in run(). --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index d96f46bc36d2..05717c6fe55a 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -637,6 +637,11 @@ def run( self.cache_hits += 1 + # Sync before modifying buffers: ensure any previous graph replay + # (from a prior call) has completed. Without this, we could modify + # buffers while a previous replay is still reading them. + torch.cuda.synchronize() + # Ensure contiguous memory layout for safe copy if not pixel_values.is_contiguous(): pixel_values = pixel_values.contiguous() @@ -662,6 +667,11 @@ def run( f"input_shape={pixel_values.shape}, buffer_shape={input_buffer.shape}" ) + # Sync before replay: graph was captured on a separate stream, but buffer + # modifications (copy) happen on the default stream. Without sync, + # replay may start before copies complete. + torch.cuda.synchronize() + # Replay the graph self.graphs[grid_key].replay() From 7ed88104cb7682bc8a3c688c48f29496f703ade0 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sat, 31 Jan 2026 19:05:28 -0500 Subject: [PATCH 066/189] debug msg. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 05717c6fe55a..55675a3dd0a0 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -640,13 +640,16 @@ def run( # Sync before modifying buffers: ensure any previous graph replay # (from a prior call) has completed. Without this, we could modify # buffers while a previous replay is still reading them. + print(f"[EXACT] sync before modify, grid={grid_key}", file=sys.stderr, flush=True) torch.cuda.synchronize() + print(f"[EXACT] sync done", file=sys.stderr, flush=True) # Ensure contiguous memory layout for safe copy if not pixel_values.is_contiguous(): pixel_values = pixel_values.contiguous() # Copy input to the captured buffer (non-blocking for better overlap) + print(f"[EXACT] copying buffers", file=sys.stderr, flush=True) input_buffer.copy_(pixel_values, non_blocking=True) # For exact match, restore cached embeddings (may have been modified by run_padded) @@ -660,6 +663,7 @@ def run( cached["rotary_pos_emb_sin"], non_blocking=True) embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) + print(f"[EXACT] copying done", file=sys.stderr, flush=True) if self.verbose: logger.info( @@ -670,10 +674,13 @@ def run( # Sync before replay: graph was captured on a separate stream, but buffer # modifications (copy) happen on the default stream. Without sync, # replay may start before copies complete. + print(f"[EXACT] sync before replay", file=sys.stderr, flush=True) torch.cuda.synchronize() + print(f"[EXACT] sync done, about to replay", file=sys.stderr, flush=True) # Replay the graph self.graphs[grid_key].replay() + print(f"[EXACT] replay done", file=sys.stderr, flush=True) # Return a clone of the output to avoid issues with buffer reuse return self.output_buffers[grid_key].clone() @@ -768,22 +775,29 @@ def run_padded( # Sync before modifying buffers: ensure any previous graph replay # (from a prior call) has completed. Without this, we could zero/modify # buffers while a previous replay is still reading them. 
+ print(f"[PADDED] sync before modify, bucket={bucket_grid}", file=sys.stderr, flush=True) torch.cuda.synchronize() + print(f"[PADDED] sync done", file=sys.stderr, flush=True) # === KEY FIX: Compute embeddings for ACTUAL grid, then pad === # This ensures correct position embeddings for the actual input size + print(f"[PADDED] precompute start", file=sys.stderr, flush=True) actual_embeds = self.vision_encoder.precompute_for_cudagraph(grid_thw) + print(f"[PADDED] precompute done", file=sys.stderr, flush=True) # Get embedding buffers for the bucket embed_buffers = self.embedding_buffers[bucket_grid] # Zero the buffers first (for clean padding) + print(f"[PADDED] zeroing buffers", file=sys.stderr, flush=True) input_buffer.zero_() embed_buffers["pos_embeds"].zero_() embed_buffers["rotary_pos_emb_cos"].zero_() embed_buffers["rotary_pos_emb_sin"].zero_() + print(f"[PADDED] zeroing done", file=sys.stderr, flush=True) # Copy actual pixel values to the beginning of the buffer + print(f"[PADDED] copying buffers", file=sys.stderr, flush=True) input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=True) # Copy actual embeddings to the beginning of the buffers (pad with zeros) @@ -800,6 +814,7 @@ def run_padded( # We copy the actual values so flash attention processes only the real tokens embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=True) embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=True) + print(f"[PADDED] copying done", file=sys.stderr, flush=True) if self.verbose: logger.info( @@ -811,10 +826,13 @@ def run_padded( # Sync before replay: graph was captured on a separate stream, but buffer # modifications (zero, copy) happen on the default stream. Without sync, # replay may start before copies complete, reading zeros/partial data. + print(f"[PADDED] sync before replay", file=sys.stderr, flush=True) torch.cuda.synchronize() + print(f"[PADDED] sync done, about to replay", file=sys.stderr, flush=True) # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() + print(f"[PADDED] replay done", file=sys.stderr, flush=True) # Get output and trim to actual size full_output = self.output_buffers[bucket_grid] From 7e6dbea6ce06bf03af5633f48855622f37ddb02b Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sat, 31 Jan 2026 19:52:35 -0500 Subject: [PATCH 067/189] sync after replay. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 55675a3dd0a0..825062d9100d 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -682,6 +682,11 @@ def run( self.graphs[grid_key].replay() print(f"[EXACT] replay done", file=sys.stderr, flush=True) + # Sync after replay: ensure graph execution completes before we read output + # (replay is on capture stream, clone is on default stream) + torch.cuda.synchronize() + print(f"[EXACT] post-replay sync done", file=sys.stderr, flush=True) + # Return a clone of the output to avoid issues with buffer reuse return self.output_buffers[grid_key].clone() @@ -834,6 +839,11 @@ def run_padded( self.graphs[bucket_grid].replay() print(f"[PADDED] replay done", file=sys.stderr, flush=True) + # Sync after replay: ensure graph execution completes before we read output + # (replay is on capture stream, clone is on default stream) + torch.cuda.synchronize() + print(f"[PADDED] post-replay sync done", file=sys.stderr, flush=True) + # Get output and trim to actual size full_output = self.output_buffers[bucket_grid] trimmed_output = full_output[:num_output_tokens].clone() From c702983f6b7c2548f790be2c4c20d86a5389687d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sat, 31 Jan 2026 20:54:00 -0500 Subject: [PATCH 068/189] clean up. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 24 ++-------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 825062d9100d..06c750d29ffd 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -640,16 +640,13 @@ def run( # Sync before modifying buffers: ensure any previous graph replay # (from a prior call) has completed. Without this, we could modify # buffers while a previous replay is still reading them. - print(f"[EXACT] sync before modify, grid={grid_key}", file=sys.stderr, flush=True) torch.cuda.synchronize() - print(f"[EXACT] sync done", file=sys.stderr, flush=True) # Ensure contiguous memory layout for safe copy if not pixel_values.is_contiguous(): pixel_values = pixel_values.contiguous() # Copy input to the captured buffer (non-blocking for better overlap) - print(f"[EXACT] copying buffers", file=sys.stderr, flush=True) input_buffer.copy_(pixel_values, non_blocking=True) # For exact match, restore cached embeddings (may have been modified by run_padded) @@ -663,7 +660,6 @@ def run( cached["rotary_pos_emb_sin"], non_blocking=True) embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) - print(f"[EXACT] copying done", file=sys.stderr, flush=True) if self.verbose: logger.info( @@ -674,18 +670,14 @@ def run( # Sync before replay: graph was captured on a separate stream, but buffer # modifications (copy) happen on the default stream. Without sync, # replay may start before copies complete. 
- print(f"[EXACT] sync before replay", file=sys.stderr, flush=True) torch.cuda.synchronize() - print(f"[EXACT] sync done, about to replay", file=sys.stderr, flush=True) # Replay the graph self.graphs[grid_key].replay() - print(f"[EXACT] replay done", file=sys.stderr, flush=True) # Sync after replay: ensure graph execution completes before we read output # (replay is on capture stream, clone is on default stream) torch.cuda.synchronize() - print(f"[EXACT] post-replay sync done", file=sys.stderr, flush=True) # Return a clone of the output to avoid issues with buffer reuse return self.output_buffers[grid_key].clone() @@ -780,29 +772,22 @@ def run_padded( # Sync before modifying buffers: ensure any previous graph replay # (from a prior call) has completed. Without this, we could zero/modify # buffers while a previous replay is still reading them. - print(f"[PADDED] sync before modify, bucket={bucket_grid}", file=sys.stderr, flush=True) torch.cuda.synchronize() - print(f"[PADDED] sync done", file=sys.stderr, flush=True) - # === KEY FIX: Compute embeddings for ACTUAL grid, then pad === - # This ensures correct position embeddings for the actual input size - print(f"[PADDED] precompute start", file=sys.stderr, flush=True) + # Compute embeddings for ACTUAL grid, then pad to bucket size. + # This ensures correct position embeddings for the actual input size. actual_embeds = self.vision_encoder.precompute_for_cudagraph(grid_thw) - print(f"[PADDED] precompute done", file=sys.stderr, flush=True) # Get embedding buffers for the bucket embed_buffers = self.embedding_buffers[bucket_grid] # Zero the buffers first (for clean padding) - print(f"[PADDED] zeroing buffers", file=sys.stderr, flush=True) input_buffer.zero_() embed_buffers["pos_embeds"].zero_() embed_buffers["rotary_pos_emb_cos"].zero_() embed_buffers["rotary_pos_emb_sin"].zero_() - print(f"[PADDED] zeroing done", file=sys.stderr, flush=True) # Copy actual pixel values to the beginning of the buffer - print(f"[PADDED] copying buffers", file=sys.stderr, flush=True) input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=True) # Copy actual embeddings to the beginning of the buffers (pad with zeros) @@ -819,7 +804,6 @@ def run_padded( # We copy the actual values so flash attention processes only the real tokens embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=True) embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=True) - print(f"[PADDED] copying done", file=sys.stderr, flush=True) if self.verbose: logger.info( @@ -831,18 +815,14 @@ def run_padded( # Sync before replay: graph was captured on a separate stream, but buffer # modifications (zero, copy) happen on the default stream. Without sync, # replay may start before copies complete, reading zeros/partial data. 
- print(f"[PADDED] sync before replay", file=sys.stderr, flush=True) torch.cuda.synchronize() - print(f"[PADDED] sync done, about to replay", file=sys.stderr, flush=True) # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() - print(f"[PADDED] replay done", file=sys.stderr, flush=True) # Sync after replay: ensure graph execution completes before we read output # (replay is on capture stream, clone is on default stream) torch.cuda.synchronize() - print(f"[PADDED] post-replay sync done", file=sys.stderr, flush=True) # Get output and trim to actual size full_output = self.output_buffers[bucket_grid] From 23d1d6a90240e493c117ebd396a870a1f20dcbfa Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 00:10:38 -0500 Subject: [PATCH 069/189] add compilation config to control multi image processing. --- vllm/config/compilation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 18649f60c36a..9cf9dbf7baca 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -480,6 +480,15 @@ class CompilationConfig: Useful for debugging and analyzing CUDA graph utilization. When False, only logs summary stats at the end of execution.""" + encoder_cudagraph_one_by_one: bool = True + """Enable one-by-one image processing for multi-image batches. + When True (default), multi-image batches are processed individually to + maximize CUDA graph hit rate. + When False, multi-image batches are processed together in eager mode, + which may be faster when CUDA graph overhead (sync, memory) outweighs + the kernel launch savings. + Set to False if you observe throughput regression with encoder CUDA graphs.""" + # Inductor capture compile_sizes: list[int | str] | None = None """Sizes to compile for inductor. In addition From cd3f613a19a87c516775b36f62d8ab3be0f788bd Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 00:13:07 -0500 Subject: [PATCH 070/189] replace torch.cuda.sync by stream sync. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 63 ++++++++++++++-------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 06c750d29ffd..3aa92b3049b4 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -191,6 +191,12 @@ def __init__( self.cache_hits = 0 self.eager_fallbacks = 0 + # CUDA event for lightweight synchronization + # Instead of torch.cuda.synchronize() which waits for ALL GPU work, + # we use an event to track only the last replay completion. + # This allows better overlap between encoder and other GPU work. + self.replay_done_event: torch.cuda.Event | None = None + def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: """Get encoder grid configurations from config or use defaults.""" compilation_config = self.vllm_config.compilation_config @@ -637,10 +643,11 @@ def run( self.cache_hits += 1 - # Sync before modifying buffers: ensure any previous graph replay - # (from a prior call) has completed. Without this, we could modify - # buffers while a previous replay is still reading them. - torch.cuda.synchronize() + # Wait for any previous graph replay to complete before modifying buffers. + # Using event.synchronize() is lighter than torch.cuda.synchronize() + # which waits for ALL GPU work across all streams. 
+ if self.replay_done_event is not None: + self.replay_done_event.synchronize() # Ensure contiguous memory layout for safe copy if not pixel_values.is_contiguous(): @@ -667,17 +674,25 @@ def run( f"input_shape={pixel_values.shape}, buffer_shape={input_buffer.shape}" ) - # Sync before replay: graph was captured on a separate stream, but buffer - # modifications (copy) happen on the default stream. Without sync, - # replay may start before copies complete. - torch.cuda.synchronize() + # Sync current stream before replay: graph was captured on a separate stream, + # but buffer copies happen on the default stream. We need copies to complete + # before replay reads from those buffers. + # Using current_stream().synchronize() is lighter than torch.cuda.synchronize() + torch.cuda.current_stream().synchronize() # Replay the graph self.graphs[grid_key].replay() - # Sync after replay: ensure graph execution completes before we read output - # (replay is on capture stream, clone is on default stream) - torch.cuda.synchronize() + # Record event after replay for lightweight sync in next call. + # Create event lazily on first use. + if self.replay_done_event is None: + self.replay_done_event = torch.cuda.Event() + self.replay_done_event.record() + + # Sync to ensure output is ready before clone. + # TODO: Could eliminate this if we return a view and defer sync to caller, + # but that would require careful handling of buffer reuse. + self.replay_done_event.synchronize() # Return a clone of the output to avoid issues with buffer reuse return self.output_buffers[grid_key].clone() @@ -769,10 +784,10 @@ def run_padded( self.cache_hits += 1 - # Sync before modifying buffers: ensure any previous graph replay - # (from a prior call) has completed. Without this, we could zero/modify - # buffers while a previous replay is still reading them. - torch.cuda.synchronize() + # Wait for any previous graph replay to complete before modifying buffers. + # Using event.synchronize() is lighter than torch.cuda.synchronize() + if self.replay_done_event is not None: + self.replay_done_event.synchronize() # Compute embeddings for ACTUAL grid, then pad to bucket size. # This ensures correct position embeddings for the actual input size. @@ -812,17 +827,21 @@ def run_padded( f"bucket_patches={bucket_input_patches}" ) - # Sync before replay: graph was captured on a separate stream, but buffer - # modifications (zero, copy) happen on the default stream. Without sync, - # replay may start before copies complete, reading zeros/partial data. - torch.cuda.synchronize() + # Sync current stream before replay: graph was captured on a separate stream, + # but buffer modifications (zero, copy) happen on the default stream. + # Using current_stream().synchronize() is lighter than torch.cuda.synchronize() + torch.cuda.current_stream().synchronize() # Replay the graph with updated embedding buffers self.graphs[bucket_grid].replay() - # Sync after replay: ensure graph execution completes before we read output - # (replay is on capture stream, clone is on default stream) - torch.cuda.synchronize() + # Record event after replay for lightweight sync in next call. + if self.replay_done_event is None: + self.replay_done_event = torch.cuda.Event() + self.replay_done_event.record() + + # Sync to ensure output is ready before clone. 
+ self.replay_done_event.synchronize() # Get output and trim to actual size full_output = self.output_buffers[bucket_grid] From b1836d32b9c104809a7e8721019c43d89c442484 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 00:13:48 -0500 Subject: [PATCH 071/189] add option to control multi image processing. --- vllm/v1/worker/gpu_model_runner.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ca6885b21822..19da83466f20 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -430,6 +430,7 @@ def __init__( self.encoder_cudagraph_manager: EncoderCudaGraphManager | None = None self.encoder_cudagraph_padded_mode: bool = True self.encoder_cudagraph_verbose: bool = False + self.encoder_cudagraph_one_by_one: bool = True self._init_encoder_cudagraph_manager() self.use_aux_hidden_state_outputs = False @@ -715,6 +716,13 @@ def _init_encoder_cudagraph_manager(self) -> None: False # Default to quiet mode ) + # Check if one-by-one processing is enabled for multi-image batches + self.encoder_cudagraph_one_by_one = getattr( + self.compilation_config, + 'encoder_cudagraph_one_by_one', + True # Default to one-by-one for higher CUDA graph hit rate + ) + # Create a dedicated graph pool for encoder CUDA graphs # This keeps encoder and decoder graph memory separate for: # 1. Better memory isolation and predictability @@ -736,6 +744,7 @@ def _init_encoder_cudagraph_manager(self) -> None: logger.info( "Encoder CUDA graph manager initialized: " f"padded_mode={self.encoder_cudagraph_padded_mode}, " + f"one_by_one={self.encoder_cudagraph_one_by_one}, " f"num_grids={len(grid_configs)}, " f"grids={grid_configs}, " f"using dedicated encoder graph pool" @@ -2367,8 +2376,10 @@ def _execute_mm_encoder( # Try to use CUDA graph if available # When CUDA graphs are enabled and we have multiple items, # process them one at a time since CUDA graphs only support - # single-image batches + # single-image batches. This can be disabled via config if + # the sync overhead outweighs the CUDA graph benefits. if (self.encoder_cudagraph_manager is not None + and self.encoder_cudagraph_one_by_one and num_items > 1 and modality in ("image", "video")): # Process each image individually for CUDA graph support From 97dbf861716cf495334d365c89802c04c74b03de Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 17:06:57 -0500 Subject: [PATCH 072/189] remove warning about last compile range. 
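PATCH 070 above swaps the full-device torch.cuda.synchronize() calls for a recorded CUDA event plus a current-stream sync. The following is a minimal standalone sketch of that synchronization scheme; the ReplaySync class and its method names are illustrative only and are not part of vLLM.

import torch


class ReplaySync:
    """Tracks completion of the most recent CUDA graph replay with an event."""

    def __init__(self) -> None:
        self.replay_done_event: torch.cuda.Event | None = None

    def wait_for_previous_replay(self) -> None:
        # Lighter than torch.cuda.synchronize(): blocks only until the last
        # recorded replay has finished, not until all GPU work on all streams.
        if self.replay_done_event is not None:
            self.replay_done_event.synchronize()

    def replay(self, graph: torch.cuda.CUDAGraph) -> None:
        # Buffer copies run on the default stream while the graph was captured
        # on a separate stream, so drain the current stream before replaying.
        torch.cuda.current_stream().synchronize()
        graph.replay()
        if self.replay_done_event is None:
            self.replay_done_event = torch.cuda.Event()
        self.replay_done_event.record()

PATCH 087 later in this series goes one step further on single-GPU setups by capturing the graph on the current stream, which removes the need for these syncs altogether.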
--- vllm/compilation/piecewise_backend.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index df9c2f6cc00d..48caaa4865ca 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -61,14 +61,6 @@ def __init__( # to set the upper bound of the compile ranges max_int32 = 2**31 - 1 last_compile_range = self.compile_ranges[-1] - if last_compile_range.end != vllm_config.scheduler_config.max_num_batched_tokens: - print(f'WARNING: last_compile_range.end={last_compile_range.end}, max_num_batched_tokens={vllm_config.scheduler_config.max_num_batched_tokens}') - """ - assert ( - last_compile_range.end - == vllm_config.scheduler_config.max_num_batched_tokens - ) - """ self.compile_ranges[-1] = Range( start=last_compile_range.start, end=max_int32 ) From 124b893bae95f22b2d7f16eea69cfce400ddb0a5 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 17:09:45 -0500 Subject: [PATCH 073/189] only report padding waste when verbose. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 3aa92b3049b4..3b8dac527cfa 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -847,10 +847,11 @@ def run_padded( full_output = self.output_buffers[bucket_grid] trimmed_output = full_output[:num_output_tokens].clone() - logger.debug( - f"Padded execution: {num_output_tokens} -> {bucket_tokens} tokens " - f"(waste: {padding_waste}, {padding_waste/bucket_tokens*100:.1f}%)" - ) + if self.verbose: + logger.debug( + f"Padded execution: {num_output_tokens} -> {bucket_tokens} tokens " + f"(waste: {padding_waste}, {padding_waste/bucket_tokens*100:.1f}%)" + ) return trimmed_output, padding_waste From 4fa0971b3e7f30847c147e429e71d23c822fa832 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 18:28:55 -0500 Subject: [PATCH 074/189] format. --- vllm/model_executor/models/qwen3_vl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 8f57169f2f43..b8351e1dbb8a 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -456,7 +456,9 @@ def __init__( workspace_buffer = ( None if self.attn_backend != AttentionBackendEnum.FLASHINFER - else torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device=self.device) + else torch.zeros( + 128 * 1024 * 1024, dtype=torch.uint8, device=self.device + ) ) self.blocks = nn.ModuleList( @@ -766,9 +768,7 @@ def forward_cudagraph( deepstack_feature_lists.append(deepstack_feature) hidden_states = self.merger(hidden_states) - hidden_states = torch.cat( - [hidden_states] + deepstack_feature_lists, dim=1 - ) + hidden_states = torch.cat([hidden_states] + deepstack_feature_lists, dim=1) return hidden_states def precompute_for_cudagraph( From f5292d2596755093f229e9b562b43f902a322686 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 18:29:06 -0500 Subject: [PATCH 075/189] format. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 114 +++++++++++---------- 1 file changed, 59 insertions(+), 55 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 3b8dac527cfa..700c1091fe59 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -41,7 +41,7 @@ from vllm.logger import init_logger if TYPE_CHECKING: - from vllm.model_executor.models.interfaces import SupportsMultiModal + pass logger = init_logger(__name__) @@ -53,28 +53,25 @@ # Grids larger than max_grid_size (default 96) should use padded mode or eager. CUSTOM_GRID_CONFIGS = [ # === Tier 1: Very small grids (<=32) === - (1, 16, 16), # 256 patches - (1, 24, 24), # 576 patches - (1, 32, 32), # 1024 patches - + (1, 16, 16), # 256 patches + (1, 24, 24), # 576 patches + (1, 32, 32), # 1024 patches # === Tier 2: Small grids (33-50) === - (1, 38, 38), # 1444 patches - (1, 40, 40), # 1600 patches - (1, 42, 42), # 1764 patches - (1, 44, 44), # 1936 patches - (1, 46, 46), # 2116 patches - (1, 50, 50), # 2500 patches - + (1, 38, 38), # 1444 patches + (1, 40, 40), # 1600 patches + (1, 42, 42), # 1764 patches + (1, 44, 44), # 1936 patches + (1, 46, 46), # 2116 patches + (1, 50, 50), # 2500 patches # === Tier 3: Medium-small grids (51-70) === - (1, 56, 56), # 3136 patches - (1, 62, 62), # 3844 patches - (1, 64, 64), # 4096 patches - (1, 68, 68), # 4624 patches - + (1, 56, 56), # 3136 patches + (1, 62, 62), # 3844 patches + (1, 64, 64), # 4096 patches + (1, 68, 68), # 4624 patches # === Tier 4: Medium grids (71-96) === - (1, 76, 76), # 5776 patches - (1, 80, 80), # 6400 patches - (1, 94, 94), # 8836 patches + (1, 76, 76), # 5776 patches + (1, 80, 80), # 6400 patches + (1, 94, 94), # 8836 patches ] # Default bucket sizes for padded mode (creates square grids) @@ -147,7 +144,7 @@ def __init__( if skipped_grids: logger.info( f"Skipping {len(skipped_grids)} grids exceeding max_grid_size={max_grid_size}: " - f"{sorted(skipped_grids, key=lambda x: x[1]*x[2], reverse=True)[:5]}..." + f"{sorted(skipped_grids, key=lambda x: x[1] * x[2], reverse=True)[:5]}..." 
) self.grid_configs = filtered_grids @@ -159,8 +156,11 @@ def __init__( # Set VLLM_ENCODER_SHARED_POOL=1 to use shared pool (saves memory but # may cause issues with rapid replays) import os + if os.environ.get("VLLM_ENCODER_SHARED_POOL", "0") == "1": - self.pool = graph_pool if graph_pool is not None else torch.cuda.graph_pool_handle() + self.pool = ( + graph_pool if graph_pool is not None else torch.cuda.graph_pool_handle() + ) logger.info("Encoder CUDA graphs: using shared pool") else: self.pool = None # Each graph uses private memory (default) @@ -205,9 +205,7 @@ def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: # Check for encoder-specific grid config grid_configs = getattr( - compilation_config, - 'encoder_cudagraph_grid_configs', - None + compilation_config, "encoder_cudagraph_grid_configs", None ) if grid_configs is not None: # Handle preset name or custom list @@ -216,8 +214,7 @@ def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: return CUSTOM_GRID_CONFIGS else: logger.warning( - f"Unknown grid config preset '{grid_configs}', " - "using 'custom'" + f"Unknown grid config preset '{grid_configs}', using 'custom'" ) return CUSTOM_GRID_CONFIGS return [tuple(cfg) for cfg in grid_configs] @@ -235,11 +232,11 @@ def _get_bucket_sizes_from_config(self) -> list[int]: return DEFAULT_PADDED_BUCKET_SIZES encoder_sizes = getattr( - compilation_config, - 'encoder_cudagraph_bucket_sizes', - None + compilation_config, "encoder_cudagraph_bucket_sizes", None + ) + return ( + encoder_sizes if encoder_sizes is not None else DEFAULT_PADDED_BUCKET_SIZES ) - return encoder_sizes if encoder_sizes is not None else DEFAULT_PADDED_BUCKET_SIZES def _get_max_grid_size_from_config(self) -> int: """Get maximum grid size for encoder CUDA graph capture. 
@@ -256,8 +253,8 @@ def _get_max_grid_size_from_config(self) -> int: max_size = getattr( compilation_config, - 'encoder_cudagraph_max_grid_size', - 96 # Default: max 96x96 grids for exact match + "encoder_cudagraph_max_grid_size", + 96, # Default: max 96x96 grids for exact match ) return max_size @@ -327,9 +324,7 @@ def _prepare_dummy_inputs_for_grid( grid_thw = [[t, h, w]] # Calculate output tokens - output_tokens = self._compute_output_tokens( - grid_config, spatial_merge_size - ) + output_tokens = self._compute_output_tokens(grid_config, spatial_merge_size) return { "pixel_values": pixel_values, @@ -379,8 +374,9 @@ def capture_graph_for_grid( self.vision_encoder = vision_encoder # Check if vision encoder supports optimized CUDA graph forward - has_cudagraph_forward = hasattr(vision_encoder, 'forward_cudagraph') and \ - hasattr(vision_encoder, 'precompute_for_cudagraph') + has_cudagraph_forward = hasattr( + vision_encoder, "forward_cudagraph" + ) and hasattr(vision_encoder, "precompute_for_cudagraph") if has_cudagraph_forward: # Pre-compute tensors for the bucket grid (used for exact match mode) @@ -444,8 +440,8 @@ def capture_graph_for_grid( else: # Fallback to original forward (will have CPU gaps) logger.warning( - f"Vision encoder does not support forward_cudagraph, " - f"using standard forward (will have CPU gaps)" + "Vision encoder does not support forward_cudagraph, " + "using standard forward (will have CPU gaps)" ) # Warmup run (required before capture) @@ -508,13 +504,12 @@ def capture( configs_to_capture = sorted( self.grid_configs, key=lambda x: x[0] * x[1] * x[2], - reverse=False # Smallest first + reverse=False, # Smallest first ) if is_global_first_rank(): configs_to_capture = tqdm( - configs_to_capture, - desc="Capturing encoder CUDA graphs" + configs_to_capture, desc="Capturing encoder CUDA graphs" ) # Capture each graph in its own graph_capture context to isolate failures. 
@@ -583,7 +578,7 @@ def find_bucket_for_tokens( Grid config (T, H, W) of the best bucket, or None if too large """ best_grid = None - best_bucket_tokens = float('inf') + best_bucket_tokens = float("inf") for grid_key in self.graphs.keys(): bucket_tokens = self._compute_output_tokens(grid_key, spatial_merge_size) @@ -662,9 +657,11 @@ def run( cached = self.cached_tensors[grid_key] embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) embed_buffers["rotary_pos_emb_cos"].copy_( - cached["rotary_pos_emb_cos"], non_blocking=True) + cached["rotary_pos_emb_cos"], non_blocking=True + ) embed_buffers["rotary_pos_emb_sin"].copy_( - cached["rotary_pos_emb_sin"], non_blocking=True) + cached["rotary_pos_emb_sin"], non_blocking=True + ) embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) @@ -726,7 +723,9 @@ def run_padded( return None # Check if vision encoder is available for embedding computation - if self.vision_encoder is None or not hasattr(self.vision_encoder, 'precompute_for_cudagraph'): + if self.vision_encoder is None or not hasattr( + self.vision_encoder, "precompute_for_cudagraph" + ): logger.debug("Vision encoder not available for padded mode") return None @@ -808,17 +807,24 @@ def run_padded( # Copy actual embeddings to the beginning of the buffers (pad with zeros) actual_num_patches = actual_embeds["pos_embeds"].shape[0] embed_buffers["pos_embeds"][:actual_num_patches].copy_( - actual_embeds["pos_embeds"], non_blocking=True) + actual_embeds["pos_embeds"], non_blocking=True + ) embed_buffers["rotary_pos_emb_cos"][:actual_num_patches].copy_( - actual_embeds["rotary_pos_emb_cos"], non_blocking=True) + actual_embeds["rotary_pos_emb_cos"], non_blocking=True + ) embed_buffers["rotary_pos_emb_sin"][:actual_num_patches].copy_( - actual_embeds["rotary_pos_emb_sin"], non_blocking=True) + actual_embeds["rotary_pos_emb_sin"], non_blocking=True + ) # Update cu_seqlens and max_seqlen to actual values # cu_seqlens shape is [num_images + 1], for single image it's [2]: [0, num_patches] # We copy the actual values so flash attention processes only the real tokens - embed_buffers["cu_seqlens"].copy_(actual_embeds["cu_seqlens"], non_blocking=True) - embed_buffers["max_seqlen"].copy_(actual_embeds["max_seqlen"], non_blocking=True) + embed_buffers["cu_seqlens"].copy_( + actual_embeds["cu_seqlens"], non_blocking=True + ) + embed_buffers["max_seqlen"].copy_( + actual_embeds["max_seqlen"], non_blocking=True + ) if self.verbose: logger.info( @@ -850,7 +856,7 @@ def run_padded( if self.verbose: logger.debug( f"Padded execution: {num_output_tokens} -> {bucket_tokens} tokens " - f"(waste: {padding_waste}, {padding_waste/bucket_tokens*100:.1f}%)" + f"(waste: {padding_waste}, {padding_waste / bucket_tokens * 100:.1f}%)" ) return trimmed_output, padding_waste @@ -885,5 +891,3 @@ def get_stats(self, verbose: bool = True) -> dict[str, Any]: f"hit_rate={hit_rate:.1%}, num_graphs={len(self.graphs)}" ) return stats - - From 9eec91df78b88e53c60f286a5d42321bf15ba6c2 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 18:30:27 -0500 Subject: [PATCH 076/189] format. 
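Besides whitespace changes, the reformat above (PATCH 075) standardizes on lazy %-style logger arguments instead of f-strings, so messages are only formatted when the corresponding log level is actually enabled. A trivial illustration with made-up values:

import logging

logger = logging.getLogger(__name__)
grid, tokens = (1, 64, 64), 1024

# Eager: the f-string is built even when DEBUG logging is disabled.
logger.debug(f"Padded execution: grid={grid}, tokens={tokens}")

# Lazy: formatting is deferred to the logging framework.
logger.debug("Padded execution: grid=%s, tokens=%d", grid, tokens)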
--- vllm/v1/worker/gpu/mm/encoder_runner.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 973ad5773023..905e3ef142c9 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -62,23 +62,21 @@ def _init_encoder_cudagraph_manager(self) -> None: if compilation_config is None: return - if not getattr(compilation_config, 'cudagraph_mm_encoder', False): + if not getattr(compilation_config, "cudagraph_mm_encoder", False): return # Import here to avoid circular imports from vllm.v1.worker.gpu.mm.encoder_cudagraph import EncoderCudaGraphManager bucket_sizes = getattr( - compilation_config, - 'encoder_cudagraph_bucket_sizes', - None + compilation_config, "encoder_cudagraph_bucket_sizes", None ) # Check if padded mode is enabled self.encoder_cudagraph_padded_mode = getattr( compilation_config, - 'encoder_cudagraph_padded_mode', - True # Default to padded mode for better CUDA graph utilization + "encoder_cudagraph_padded_mode", + True, # Default to padded mode for better CUDA graph utilization ) self.encoder_cudagraph_manager = EncoderCudaGraphManager( @@ -109,7 +107,7 @@ def capture_encoder_cudagraphs( if self.encoder_cudagraph_manager is None: return - if not hasattr(model, 'visual') or model.visual is None: + if not hasattr(model, "visual") or model.visual is None: logger.warning( "Model does not have a visual encoder, " "skipping encoder CUDA graph capture" @@ -297,7 +295,7 @@ def _execute_with_cudagraph( pixel_values = pixel_values.to(device=self.device, dtype=self.dtype) # Get spatial merge size for token calculations - spatial_merge_size = getattr(model.visual, 'spatial_merge_size', 2) + spatial_merge_size = getattr(model.visual, "spatial_merge_size", 2) t, h, w = grid_thw[0] num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) @@ -330,9 +328,7 @@ def _execute_with_cudagraph( return [output] # No CUDA graph available - logger.info( - f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens}" - ) + logger.info(f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens}") return None def gather_mm_embeddings( From b0feebd69999c52ecf716394af6c3b106cf0ab03 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 18:31:00 -0500 Subject: [PATCH 077/189] format. 
--- vllm/v1/worker/gpu_model_runner.py | 51 ++++++++++++++++-------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 19da83466f20..e8088ac786c1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -155,11 +155,11 @@ from vllm.v1.worker.cp_utils import check_attention_cp_compatibility from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin +from vllm.v1.worker.gpu.mm.encoder_cudagraph import EncoderCudaGraphManager from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from vllm.v1.worker.gpu.mm.encoder_cudagraph import EncoderCudaGraphManager from vllm.v1.worker.ubatch_utils import ( UBatchSlices, check_ubatch_thresholds, @@ -693,34 +693,32 @@ def _init_encoder_cudagraph_manager(self) -> None: if self.compilation_config is None: return - if not getattr(self.compilation_config, 'cudagraph_mm_encoder', False): + if not getattr(self.compilation_config, "cudagraph_mm_encoder", False): return bucket_sizes = getattr( - self.compilation_config, - 'encoder_cudagraph_bucket_sizes', - None + self.compilation_config, "encoder_cudagraph_bucket_sizes", None ) # Check if padded mode is enabled self.encoder_cudagraph_padded_mode = getattr( self.compilation_config, - 'encoder_cudagraph_padded_mode', - True # Default to padded mode for better CUDA graph utilization + "encoder_cudagraph_padded_mode", + True, # Default to padded mode for better CUDA graph utilization ) # Check if verbose logging is enabled self.encoder_cudagraph_verbose = getattr( self.compilation_config, - 'encoder_cudagraph_verbose', - False # Default to quiet mode + "encoder_cudagraph_verbose", + False, # Default to quiet mode ) # Check if one-by-one processing is enabled for multi-image batches self.encoder_cudagraph_one_by_one = getattr( self.compilation_config, - 'encoder_cudagraph_one_by_one', - True # Default to one-by-one for higher CUDA graph hit rate + "encoder_cudagraph_one_by_one", + True, # Default to one-by-one for higher CUDA graph hit rate ) # Create a dedicated graph pool for encoder CUDA graphs @@ -2378,10 +2376,12 @@ def _execute_mm_encoder( # process them one at a time since CUDA graphs only support # single-image batches. This can be disabled via config if # the sync overhead outweighs the CUDA graph benefits. 
- if (self.encoder_cudagraph_manager is not None + if ( + self.encoder_cudagraph_manager is not None and self.encoder_cudagraph_one_by_one and num_items > 1 - and modality in ("image", "video")): + and modality in ("image", "video") + ): # Process each image individually for CUDA graph support # Extract batched data and slice per-image to avoid # re-calling group_mm_kwargs_by_modality overhead @@ -2395,7 +2395,8 @@ def _execute_mm_encoder( pixel_key = "pixel_values" else: # video batched_pixel_values = mm_kwargs_group.get( - "pixel_values_videos") + "pixel_values_videos" + ) grid_thw_list = mm_kwargs_group.get("video_grid_thw") grid_key = "video_grid_thw" pixel_key = "pixel_values_videos" @@ -2418,7 +2419,8 @@ def _execute_mm_encoder( # Slice pixel_values for this image single_pixel_values = batched_pixel_values[ - patch_offset:patch_offset + num_patches] + patch_offset : patch_offset + num_patches + ] patch_offset += num_patches # Build single-image kwargs for CUDA graph (list format) @@ -2429,7 +2431,10 @@ def _execute_mm_encoder( # Try CUDA graph for this single image single_result = self._execute_with_encoder_cudagraph( - model, single_mm_inputs_for_cudagraph, modality, 1, + model, + single_mm_inputs_for_cudagraph, + modality, + 1, ) if single_result is not None: curr_group_outputs_lst.extend(single_result) @@ -2451,8 +2456,7 @@ def _execute_mm_encoder( curr_group_outputs = curr_group_outputs_lst else: # Fallback to eager if data extraction fails - curr_group_outputs = model.embed_multimodal( - **mm_kwargs_group) + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) else: # Single item or no CUDA graph manager - try CUDA graph cudagraph_result = None @@ -2562,10 +2566,12 @@ def _execute_with_encoder_cudagraph( # Ensure pixel_values is on the correct device and contiguous # Contiguity is important for CUDA graph replay to avoid memory issues - pixel_values = pixel_values.to(device=self.device, dtype=self.dtype).contiguous() + pixel_values = pixel_values.to( + device=self.device, dtype=self.dtype + ).contiguous() # Get spatial merge size for token calculations - spatial_merge_size = getattr(model.visual, 'spatial_merge_size', 2) + spatial_merge_size = getattr(model.visual, "spatial_merge_size", 2) t, h, w = grid_thw[0] num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) num_input_patches = pixel_values.shape[0] @@ -2583,8 +2589,7 @@ def _execute_with_encoder_cudagraph( if output is not None: if self.encoder_cudagraph_verbose: logger.info( - f"ViT CUDA graph EXACT: grid=({t}, {h}, {w}), " - f"output={output.shape}" + f"ViT CUDA graph EXACT: grid=({t}, {h}, {w}), output={output.shape}" ) return [output[:num_output_tokens]] @@ -5202,7 +5207,7 @@ def _capture_encoder_cudagraphs(self) -> None: return model = self.model - if not hasattr(model, 'visual') or model.visual is None: + if not hasattr(model, "visual") or model.visual is None: logger.warning( "Model does not have a visual encoder, " "skipping encoder CUDA graph capture" From 3458b7b67de5f876922faa28d8d3b9f466977e20 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 18:35:52 -0500 Subject: [PATCH 078/189] format. 
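PATCH 077 above reformats the one-by-one encoder path, which slices the flattened pixel-value tensor per image before invoking the CUDA graph. The sketch below shows the slicing idea in isolation; the grid sizes, merge size, and patch feature width are made-up example values, and num_patches = t * h * w is an assumption that matches the Qwen-VL patch layout.

import torch

spatial_merge_size = 2
patch_dim = 1176  # example feature width per patch
grid_thw_list = [[1, 32, 32], [1, 16, 16]]  # (t, h, w) per image, in patches
total_patches = sum(t * h * w for t, h, w in grid_thw_list)
batched_pixel_values = torch.randn(total_patches, patch_dim)

patch_offset = 0
for t, h, w in grid_thw_list:
    num_patches = t * h * w
    single_pixel_values = batched_pixel_values[patch_offset : patch_offset + num_patches]
    patch_offset += num_patches
    # Output tokens after spatial merging; used to pick (or pad to) a graph bucket.
    num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size)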
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 700c1091fe59..13839dd2fe07 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -320,7 +320,7 @@ def _prepare_dummy_inputs_for_grid( device=self.device, ) - # Grid THW for this configuration + # Grid (temporal, height, width) for this configuration grid_thw = [[t, h, w]] # Calculate output tokens From f2a8f3d72b2a3db6f203c4b68f6ce2c0b51646f9 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 18:51:55 -0500 Subject: [PATCH 079/189] format. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 156 +++++++++++++-------- vllm/v1/worker/gpu/mm/encoder_runner.py | 29 ++-- vllm/v1/worker/gpu_model_runner.py | 84 ++++++----- 3 files changed, 172 insertions(+), 97 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 13839dd2fe07..c39290f6ac89 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -142,9 +142,14 @@ def __init__( skipped_grids.append(grid) if skipped_grids: + top_skipped = sorted( + skipped_grids, key=lambda x: x[1] * x[2], reverse=True + )[:5] logger.info( - f"Skipping {len(skipped_grids)} grids exceeding max_grid_size={max_grid_size}: " - f"{sorted(skipped_grids, key=lambda x: x[1] * x[2], reverse=True)[:5]}..." + "Skipping %d grids exceeding max_grid_size=%d: %s...", + len(skipped_grids), + max_grid_size, + top_skipped, ) self.grid_configs = filtered_grids @@ -170,18 +175,18 @@ def __init__( self.input_buffers: dict[tuple[int, int, int], dict[str, Any]] = {} self.output_buffers: dict[tuple[int, int, int], torch.Tensor] = {} - # Cached pre-computed tensors for CUDA graph replay (used for exact match mode) - # Key: (t, h, w), Value: dict with pos_embeds, rotary embeddings, cu_seqlens, etc. + # Cached pre-computed tensors for CUDA graph replay (exact match mode) + # Key: (t, h, w), Value: dict with pos_embeds, rotary embeddings, etc. 
self.cached_tensors: dict[tuple[int, int, int], dict[str, torch.Tensor]] = {} - # Input buffers for embeddings (used for padded mode with runtime computation) - # Key: (t, h, w), Value: dict with pos_embeds, rotary_cos, rotary_sin, cu_seqlens buffers + # Input buffers for embeddings (padded mode with runtime computation) + # Key: (t, h, w), Value: dict with pos_embeds, rotary_cos/sin, cu_seqlens self.embedding_buffers: dict[tuple[int, int, int], dict[str, torch.Tensor]] = {} # Store metadata about captured graphs self.captured_metadata: dict[tuple[int, int, int], dict[str, Any]] = {} - # Reference to vision encoder for runtime embedding computation (set during capture) + # Vision encoder reference for runtime embedding computation (set at capture) self.vision_encoder = None # Track if graphs have been captured @@ -214,7 +219,8 @@ def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: return CUSTOM_GRID_CONFIGS else: logger.warning( - f"Unknown grid config preset '{grid_configs}', using 'custom'" + "Unknown grid config preset '%s', using 'custom'", + grid_configs, ) return CUSTOM_GRID_CONFIGS return [tuple(cfg) for cfg in grid_configs] @@ -350,7 +356,7 @@ def capture_graph_for_grid( grid_config: Tuple of (T, H, W) in patch units vision_encoder: The vision encoder module """ - logger.debug(f"Capturing encoder CUDA graph for grid config {grid_config}") + logger.debug("Capturing encoder CUDA graph for grid config %s", grid_config) # Prepare dummy inputs dummy_inputs = self._prepare_dummy_inputs_for_grid(grid_config, vision_encoder) @@ -383,12 +389,14 @@ def capture_graph_for_grid( cached = vision_encoder.precompute_for_cudagraph(grid_thw) self.cached_tensors[grid_config] = cached logger.debug( - f"Pre-computed cached tensors for grid config {grid_config}: " - f"pos_embeds={cached['pos_embeds'].shape}, " - f"cu_seqlens={cached['cu_seqlens'].shape}" + "Pre-computed cached tensors for grid config %s: " + "pos_embeds=%s, cu_seqlens=%s", + grid_config, + cached["pos_embeds"].shape, + cached["cu_seqlens"].shape, ) - # Create INPUT BUFFERS for embeddings (for padded mode with runtime computation) + # Create INPUT BUFFERS for embeddings (padded mode runtime computation) # These buffers can be updated at runtime before graph replay # Note: max_seqlen is a CPU scalar tensor to avoid GPU sync on .item() self.embedding_buffers[grid_config] = { @@ -467,10 +475,12 @@ def capture_graph_for_grid( self.output_buffers[grid_config].copy_(output) self.graphs[grid_config] = graph + cached_suffix = " (with cached tensors)" if has_cudagraph_forward else "" logger.debug( - f"Captured encoder CUDA graph for grid config {grid_config} " - f"-> {dummy_inputs['num_output_tokens']} output tokens" - f"{' (with cached tensors)' if has_cudagraph_forward else ''}" + "Captured encoder CUDA graph for grid config %s -> %d output tokens%s", + grid_config, + dummy_inputs["num_output_tokens"], + cached_suffix, ) @torch.inference_mode() @@ -484,7 +494,7 @@ def capture( Args: vision_encoder: The vision encoder module (e.g., Qwen3_VisionTransformer) - embed_multimodal_fn: The model's embed_multimodal method (unused but kept for API) + embed_multimodal_fn: The model's embed_multimodal method (unused) """ if self.captured: logger.warning("Encoder CUDA graphs already captured, skipping") @@ -494,9 +504,11 @@ def capture( free_mem_before, total_mem = torch.cuda.mem_get_info(self.device) used_mem_before = total_mem - free_mem_before logger.info( - f"Capturing encoder CUDA graphs for {len(self.grid_configs)} " - f"grid 
configurations (GPU memory: {used_mem_before / 1024**3:.2f} GiB used, " - f"{free_mem_before / 1024**3:.2f} GiB free)" + "Capturing encoder CUDA graphs for %d grid configurations " + "(GPU memory: %.2f GiB used, %.2f GiB free)", + len(self.grid_configs), + used_mem_before / 1024**3, + free_mem_before / 1024**3, ) # Capture from smallest to largest so that common smaller grids are @@ -523,8 +535,10 @@ def capture( ) except Exception as e: logger.warning( - f"Failed to capture encoder CUDA graph for grid config " - f"{grid_config}: {e}. Will use eager mode." + "Failed to capture encoder CUDA graph for grid config " + "%s: %s. Will use eager mode.", + grid_config, + e, ) self.captured = True @@ -534,11 +548,13 @@ def capture( used_mem_after = total_mem - free_mem_after encoder_graph_mem = used_mem_after - used_mem_before logger.info( - f"Captured {len(self.graphs)} encoder CUDA graphs " - f"(configs: {sorted(self.graphs.keys())}). " - f"Encoder graph memory: {encoder_graph_mem / 1024**3:.2f} GiB " - f"(GPU: {used_mem_after / 1024**3:.2f} GiB used, " - f"{free_mem_after / 1024**3:.2f} GiB free)" + "Captured %d encoder CUDA graphs (configs: %s). " + "Encoder graph memory: %.2f GiB (GPU: %.2f GiB used, %.2f GiB free)", + len(self.graphs), + sorted(self.graphs.keys()), + encoder_graph_mem / 1024**3, + used_mem_after / 1024**3, + free_mem_after / 1024**3, ) def get_graph_for_grid( @@ -580,7 +596,7 @@ def find_bucket_for_tokens( best_grid = None best_bucket_tokens = float("inf") - for grid_key in self.graphs.keys(): + for grid_key in self.graphs: bucket_tokens = self._compute_output_tokens(grid_key, spatial_merge_size) if bucket_tokens >= num_tokens and bucket_tokens < best_bucket_tokens: best_bucket_tokens = bucket_tokens @@ -613,8 +629,10 @@ def run( input_buffer = self.input_buffers[grid_key]["pixel_values"] if pixel_values.shape != input_buffer.shape: logger.warning( - f"Pixel values shape mismatch: expected {input_buffer.shape}, " - f"got {pixel_values.shape}. Falling back to eager mode." + "Pixel values shape mismatch: expected %s, got %s. " + "Falling back to eager mode.", + input_buffer.shape, + pixel_values.shape, ) self.eager_fallbacks += 1 return None @@ -622,16 +640,18 @@ def run( # Verify device and dtype match if pixel_values.device != input_buffer.device: logger.warning( - f"Device mismatch: expected {input_buffer.device}, " - f"got {pixel_values.device}. Falling back to eager mode." + "Device mismatch: expected %s, got %s. Falling back to eager mode.", + input_buffer.device, + pixel_values.device, ) self.eager_fallbacks += 1 return None if pixel_values.dtype != input_buffer.dtype: logger.warning( - f"Dtype mismatch: expected {input_buffer.dtype}, " - f"got {pixel_values.dtype}. Falling back to eager mode." + "Dtype mismatch: expected %s, got %s. 
Falling back to eager mode.", + input_buffer.dtype, + pixel_values.dtype, ) self.eager_fallbacks += 1 return None @@ -651,7 +671,7 @@ def run( # Copy input to the captured buffer (non-blocking for better overlap) input_buffer.copy_(pixel_values, non_blocking=True) - # For exact match, restore cached embeddings (may have been modified by run_padded) + # For exact match, restore cached embeddings (may have been modified) if grid_key in self.embedding_buffers and grid_key in self.cached_tensors: embed_buffers = self.embedding_buffers[grid_key] cached = self.cached_tensors[grid_key] @@ -667,8 +687,10 @@ def run( if self.verbose: logger.info( - f"run(): grid_key={grid_key}, " - f"input_shape={pixel_values.shape}, buffer_shape={input_buffer.shape}" + "run(): grid_key=%s, input_shape=%s, buffer_shape=%s", + grid_key, + pixel_values.shape, + input_buffer.shape, ) # Sync current stream before replay: graph was captured on a separate stream, @@ -733,15 +755,24 @@ def run_padded( bucket_grid = self.find_bucket_for_tokens(num_output_tokens, spatial_merge_size) if bucket_grid is None: # Don't count miss here - caller will count it when falling back to eager + max_available = ( + max( + self._compute_output_tokens(g, spatial_merge_size) + for g in self.graphs + ) + if self.graphs + else 0 + ) logger.debug( - f"No bucket found for {num_output_tokens} tokens, " - f"max available: {max(self._compute_output_tokens(g, spatial_merge_size) for g in self.graphs.keys()) if self.graphs else 0}" + "No bucket found for %d tokens, max available: %d", + num_output_tokens, + max_available, ) return None # Check if we have embedding buffers for this bucket if bucket_grid not in self.embedding_buffers: - logger.debug(f"No embedding buffers for bucket {bucket_grid}") + logger.debug("No embedding buffers for bucket %s", bucket_grid) return None bucket_tokens = self._compute_output_tokens(bucket_grid, spatial_merge_size) @@ -754,8 +785,10 @@ def run_padded( if num_input_patches > bucket_input_patches: logger.warning( - f"Input patches ({num_input_patches}) exceed bucket capacity " - f"({bucket_input_patches}). This shouldn't happen." + "Input patches (%d) exceed bucket capacity (%d). " + "This shouldn't happen.", + num_input_patches, + bucket_input_patches, ) self.eager_fallbacks += 1 return None @@ -763,16 +796,18 @@ def run_padded( # Verify device and dtype match if pixel_values.device != input_buffer.device: logger.warning( - f"Device mismatch: expected {input_buffer.device}, " - f"got {pixel_values.device}. Falling back to eager mode." + "Device mismatch: expected %s, got %s. Falling back to eager mode.", + input_buffer.device, + pixel_values.device, ) self.eager_fallbacks += 1 return None if pixel_values.dtype != input_buffer.dtype: logger.warning( - f"Dtype mismatch: expected {input_buffer.dtype}, " - f"got {pixel_values.dtype}. Falling back to eager mode." + "Dtype mismatch: expected %s, got %s. 
Falling back to eager mode.", + input_buffer.dtype, + pixel_values.dtype, ) self.eager_fallbacks += 1 return None @@ -817,8 +852,8 @@ def run_padded( ) # Update cu_seqlens and max_seqlen to actual values - # cu_seqlens shape is [num_images + 1], for single image it's [2]: [0, num_patches] - # We copy the actual values so flash attention processes only the real tokens + # cu_seqlens shape is [num_images + 1], for single image: [0, num_patches] + # We copy actual values so flash attention processes only the real tokens embed_buffers["cu_seqlens"].copy_( actual_embeds["cu_seqlens"], non_blocking=True ) @@ -828,9 +863,12 @@ def run_padded( if self.verbose: logger.info( - f"run_padded(): bucket_grid={bucket_grid}, " - f"actual_grid={grid_thw[0]}, input_patches={num_input_patches}, " - f"bucket_patches={bucket_input_patches}" + "run_padded(): bucket_grid=%s, actual_grid=%s, " + "input_patches=%d, bucket_patches=%d", + bucket_grid, + grid_thw[0], + num_input_patches, + bucket_input_patches, ) # Sync current stream before replay: graph was captured on a separate stream, @@ -855,8 +893,11 @@ def run_padded( if self.verbose: logger.debug( - f"Padded execution: {num_output_tokens} -> {bucket_tokens} tokens " - f"(waste: {padding_waste}, {padding_waste / bucket_tokens * 100:.1f}%)" + "Padded execution: %d -> %d tokens (waste: %d, %.1f%%)", + num_output_tokens, + bucket_tokens, + padding_waste, + padding_waste / bucket_tokens * 100, ) return trimmed_output, padding_waste @@ -886,8 +927,11 @@ def get_stats(self, verbose: bool = True) -> dict[str, Any]: } if verbose: logger.info( - f"Encoder CUDA graph stats: " - f"hits={self.cache_hits}, eager={self.eager_fallbacks}, " - f"hit_rate={hit_rate:.1%}, num_graphs={len(self.graphs)}" + "Encoder CUDA graph stats: hits=%d, eager=%d, " + "hit_rate=%.1f%%, num_graphs=%d", + self.cache_hits, + self.eager_fallbacks, + hit_rate * 100, + len(self.graphs), ) return stats diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 905e3ef142c9..2558cca8147b 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -90,9 +90,10 @@ def _init_encoder_cudagraph_manager(self) -> None: grid_configs = self.encoder_cudagraph_manager.grid_configs logger.info( "Encoder CUDA graph manager initialized: " - f"padded_mode={self.encoder_cudagraph_padded_mode}, " - f"num_grids={len(grid_configs)}, " - f"grids={grid_configs}" + "padded_mode=%s, num_grids=%d, grids=%s", + self.encoder_cudagraph_padded_mode, + len(grid_configs), + grid_configs, ) def capture_encoder_cudagraphs( @@ -277,7 +278,8 @@ def _execute_with_cudagraph( if len(grid_thw) != 1: logger.debug( "CUDA graph only supports single-image batches, " - f"got {len(grid_thw)} images. Using eager mode." + "got %d images. 
Using eager mode.", + len(grid_thw), ) return None @@ -306,8 +308,11 @@ def _execute_with_cudagraph( output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) if output is not None: logger.info( - f"ViT CUDA graph EXACT: grid=({t}, {h}, {w}), " - f"tokens={num_output_tokens}" + "ViT CUDA graph EXACT: grid=(%d, %d, %d), tokens=%d", + t, + h, + w, + num_output_tokens, ) return [output[:num_output_tokens]] @@ -322,13 +327,19 @@ def _execute_with_cudagraph( if result is not None: output, padding_waste = result logger.info( - f"ViT CUDA graph PADDED: grid=({t}, {h}, {w}), " - f"tokens={num_output_tokens}, waste={padding_waste}" + "ViT CUDA graph PADDED: grid=(%d, %d, %d), tokens=%d, waste=%d", + t, + h, + w, + num_output_tokens, + padding_waste, ) return [output] # No CUDA graph available - logger.info(f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens}") + logger.info( + "ViT EAGER: grid=(%d, %d, %d), tokens=%d", t, h, w, num_output_tokens + ) return None def gather_mm_embeddings( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e8088ac786c1..ce07009abdef 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -741,11 +741,12 @@ def _init_encoder_cudagraph_manager(self) -> None: grid_configs = self.encoder_cudagraph_manager.grid_configs logger.info( "Encoder CUDA graph manager initialized: " - f"padded_mode={self.encoder_cudagraph_padded_mode}, " - f"one_by_one={self.encoder_cudagraph_one_by_one}, " - f"num_grids={len(grid_configs)}, " - f"grids={grid_configs}, " - f"using dedicated encoder graph pool" + "padded_mode=%s, one_by_one=%s, num_grids=%d, grids=%s, " + "using dedicated encoder graph pool", + self.encoder_cudagraph_padded_mode, + self.encoder_cudagraph_one_by_one, + len(grid_configs), + grid_configs, ) def update_max_model_len(self, max_model_len: int) -> None: @@ -781,7 +782,7 @@ def init_fp8_kv_scales(self) -> None: attn_layers = self.compilation_config.static_forward_context for name, module in attn_layers.items(): - if isinstance(module, (Attention, MLAAttention)): + if isinstance(module, Attention | MLAAttention): # TODO: Generally, scale is 1.0 if user uses on-the-fly fp8 # kvcache quant. However, to get better accuracy, compression # frameworks like llm-compressors allow users to tune the @@ -2410,8 +2411,9 @@ def _execute_mm_encoder( patch_offset = 0 if self.encoder_cudagraph_verbose: logger.info( - f"Processing {len(grid_thw_list)} images " - f"one-at-a-time, grids={grid_thw_list}" + "Processing %d images one-at-a-time, grids=%s", + len(grid_thw_list), + grid_thw_list, ) for img_idx, grid_thw in enumerate(grid_thw_list): t, h, w = grid_thw @@ -2440,7 +2442,7 @@ def _execute_mm_encoder( curr_group_outputs_lst.extend(single_result) else: # Fall back to eager for this image - # Model expects grid_thw as CPU tensor (it calls .numpy()) + # Model expects grid_thw as CPU tensor (.numpy()) single_mm_inputs_for_eager = { pixel_key: single_pixel_values, grid_key: torch.tensor( @@ -2548,7 +2550,8 @@ def _execute_with_encoder_cudagraph( if len(grid_thw) != 1: logger.debug( "Encoder CUDA graph only supports single-image batches, " - f"got {len(grid_thw)} images. Using eager mode." + "got %d images. 
Using eager mode.", + len(grid_thw), ) self.encoder_cudagraph_manager.count_miss() return None @@ -2579,9 +2582,12 @@ def _execute_with_encoder_cudagraph( # Log the exact size needed for bucket analysis (verbose only) if self.encoder_cudagraph_verbose: logger.info( - f"ViT input: grid_thw=({t}, {h}, {w}), " - f"input_patches={num_input_patches}, " - f"output_tokens={num_output_tokens}" + "ViT input: grid_thw=(%d, %d, %d), input_patches=%d, output_tokens=%d", + t, + h, + w, + num_input_patches, + num_output_tokens, ) # Try exact match first via run() - counts hits internally @@ -2589,7 +2595,11 @@ def _execute_with_encoder_cudagraph( if output is not None: if self.encoder_cudagraph_verbose: logger.info( - f"ViT CUDA graph EXACT: grid=({t}, {h}, {w}), output={output.shape}" + "ViT CUDA graph EXACT: grid=(%d, %d, %d), output=%s", + t, + h, + w, + output.shape, ) return [output[:num_output_tokens]] @@ -2606,8 +2616,12 @@ def _execute_with_encoder_cudagraph( output, padding_waste = result if self.encoder_cudagraph_verbose: logger.info( - f"ViT CUDA graph PADDED: grid=({t}, {h}, {w}), " - f"tokens={num_output_tokens}, waste={padding_waste}" + "ViT CUDA graph PADDED: grid=(%d, %d, %d), tokens=%d, waste=%d", + t, + h, + w, + num_output_tokens, + padding_waste, ) return [output] @@ -2615,8 +2629,12 @@ def _execute_with_encoder_cudagraph( self.encoder_cudagraph_manager.count_miss() if self.encoder_cudagraph_verbose: logger.info( - f"ViT EAGER: grid=({t}, {h}, {w}), tokens={num_output_tokens} " - f"(padded_mode={self.encoder_cudagraph_padded_mode})" + "ViT EAGER: grid=(%d, %d, %d), tokens=%d (padded_mode=%s)", + t, + h, + w, + num_output_tokens, + self.encoder_cudagraph_padded_mode, ) return None @@ -2725,7 +2743,7 @@ def _gather_mm_embeddings( def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. - if isinstance(self.model, (CUDAGraphWrapper, UBatchWrapper)): + if isinstance(self.model, CUDAGraphWrapper | UBatchWrapper): return self.model.unwrap() return self.model @@ -4288,7 +4306,7 @@ def _get_eagle3_aux_layers_from_config(self) -> tuple[int, ...] 
| None: return None layer_ids = hf_config.eagle_aux_hidden_state_layer_ids - if layer_ids and isinstance(layer_ids, (list, tuple)): + if layer_ids and isinstance(layer_ids, list | tuple): return tuple(layer_ids) return None @@ -5093,9 +5111,9 @@ def freeze_gc(): start_free_gpu_memory = torch.cuda.mem_get_info()[0] start_total_memory = torch.cuda.mem_get_info()[1] logger.info( - f"Starting CUDA graph capture: " - f"{(start_total_memory - start_free_gpu_memory) / 1024**3:.2f} GiB used, " - f"{start_free_gpu_memory / 1024**3:.2f} GiB free" + "Starting CUDA graph capture: %.2f GiB used, %.2f GiB free", + (start_total_memory - start_free_gpu_memory) / 1024**3, + start_free_gpu_memory / 1024**3, ) # Capture encoder CUDA graphs first (if enabled) @@ -5107,9 +5125,10 @@ def freeze_gc(): after_encoder_free = torch.cuda.mem_get_info()[0] encoder_mem = start_free_gpu_memory - after_encoder_free logger.info( - f"Encoder CUDA graphs captured: " - f"{encoder_mem / 1024**3:.2f} GiB used by encoder graphs, " - f"{after_encoder_free / 1024**3:.2f} GiB free" + "Encoder CUDA graphs captured: %.2f GiB used by encoder graphs, " + "%.2f GiB free", + encoder_mem / 1024**3, + after_encoder_free / 1024**3, ) # Capture decoder/LM CUDA graphs in their own context with global pool @@ -5166,16 +5185,17 @@ def freeze_gc(): end_free_gpu_memory = torch.cuda.mem_get_info()[0] decoder_mem = before_decoder_free - end_free_gpu_memory logger.info( - f"Decoder CUDA graphs captured: " - f"{decoder_mem / 1024**3:.2f} GiB used by decoder graphs, " - f"{end_free_gpu_memory / 1024**3:.2f} GiB free" + "Decoder CUDA graphs captured: %.2f GiB used by decoder graphs, " + "%.2f GiB free", + decoder_mem / 1024**3, + end_free_gpu_memory / 1024**3, ) total_cudagraph_mem = start_free_gpu_memory - end_free_gpu_memory logger.info( - f"CUDA graph capture complete: " - f"total {total_cudagraph_mem / 1024**3:.2f} GiB for all graphs, " - f"{end_free_gpu_memory / 1024**3:.2f} GiB free" + "CUDA graph capture complete: total %.2f GiB for all graphs, %.2f GiB free", + total_cudagraph_mem / 1024**3, + end_free_gpu_memory / 1024**3, ) # Disable cudagraph capturing globally, so any unexpected cudagraph From 48fb275e1391c4e86602f43cffea8a4b2f2ca0f2 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 18:55:07 -0500 Subject: [PATCH 080/189] check grid_thw_list type. --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ce07009abdef..96a429604da1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2404,7 +2404,7 @@ def _execute_mm_encoder( if batched_pixel_values is not None and grid_thw_list is not None: # Convert grid_thw to list if tensor - if hasattr(grid_thw_list, "tolist"): + if isinstance(grid_thw_list, torch.Tensor): grid_thw_list = grid_thw_list.tolist() # Calculate patch boundaries for slicing From 83a0380e1952c8f82eabc5aae8511f7f3e0e1909 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 19:08:51 -0500 Subject: [PATCH 081/189] fix mypy type check. 
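PATCH 080 above replaces a hasattr-based duck-typing check with an explicit isinstance test against torch.Tensor, and the preceding reformat switches isinstance calls to PEP 604 unions (e.g. Attention | MLAAttention), which Python 3.10+ accepts directly. A small sketch of both ideas; normalize_grid_thw is a hypothetical helper, not vLLM code.

import torch


def normalize_grid_thw(grid_thw) -> list[list[int]]:
    # Explicit type check instead of hasattr(grid_thw, "tolist"):
    # tensors are converted, anything else is assumed to already be a list.
    if isinstance(grid_thw, torch.Tensor):
        return grid_thw.tolist()
    return grid_thw


# PEP 604 unions work directly inside isinstance on Python 3.10+.
assert isinstance(3, int | float)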
--- vllm/v1/worker/gpu/mm/encoder_runner.py | 5 +++-- vllm/v1/worker/gpu_model_runner.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 2558cca8147b..99893315fbb2 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -224,7 +224,7 @@ def execute_mm_encoder( curr_group_outputs = cudagraph_result else: # Fall back to eager mode - curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + curr_group_outputs = list(model.embed_multimodal(**mm_kwargs_group)) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -297,7 +297,8 @@ def _execute_with_cudagraph( pixel_values = pixel_values.to(device=self.device, dtype=self.dtype) # Get spatial merge size for token calculations - spatial_merge_size = getattr(model.visual, "spatial_merge_size", 2) + visual = getattr(model, "visual", None) + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) t, h, w = grid_thw[0] num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 96a429604da1..b4d56052ca25 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2574,7 +2574,8 @@ def _execute_with_encoder_cudagraph( ).contiguous() # Get spatial merge size for token calculations - spatial_merge_size = getattr(model.visual, "spatial_merge_size", 2) + visual = getattr(model, "visual", None) + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) t, h, w = grid_thw[0] num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) num_input_patches = pixel_values.shape[0] From b81d3cea9cbb3559aee00a4da3c6e17a07d95b68 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 19:30:28 -0500 Subject: [PATCH 082/189] fix mypy call checks. --- vllm/v1/attention/backends/fa_utils.py | 6 +++--- vllm/v1/attention/backends/mla/aiter_triton_mla.py | 2 +- vllm/v1/attention/backends/mla/rocm_aiter_mla.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py index ea1a10f6ac9b..d66bf92d0827 100644 --- a/vllm/v1/attention/backends/fa_utils.py +++ b/vllm/v1/attention/backends/fa_utils.py @@ -17,15 +17,15 @@ from vllm._ipex_ops import ipex_ops reshape_and_cache_flash = ipex_ops.reshape_and_cache_flash - flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func + flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func # type: ignore[assignment] get_scheduler_metadata = ipex_ops.get_scheduler_metadata elif current_platform.is_rocm(): try: - from flash_attn import flash_attn_varlen_func # noqa: F401 + from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] # noqa: F401 except ImportError: - def flash_attn_varlen_func(*args, **kwargs): + def flash_attn_varlen_func(*args, **kwargs): # type: ignore[misc] raise ImportError( "ROCm platform requires upstream flash-attn " "to be installed. Please install flash-attn first." 
diff --git a/vllm/v1/attention/backends/mla/aiter_triton_mla.py b/vllm/v1/attention/backends/mla/aiter_triton_mla.py index b164bb7b2ecd..5b6ecb65c243 100644 --- a/vllm/v1/attention/backends/mla/aiter_triton_mla.py +++ b/vllm/v1/attention/backends/mla/aiter_triton_mla.py @@ -49,7 +49,7 @@ def __init__( def _flash_attn_varlen_diff_headdims( self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs ): - result = self.flash_attn_varlen_func( + result = self.flash_attn_varlen_func( # type: ignore[call-arg] q, k, v, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 46ca97cac670..3abf8ad309d3 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -230,7 +230,7 @@ def __init__( def _flash_attn_varlen_diff_headdims( self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs ): - output = self.flash_attn_varlen_func( + output = self.flash_attn_varlen_func( # type: ignore[call-arg] q=q, k=k, v=v, From cccd01d9803f85541932ed38e790270a70193dc6 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 19:35:10 -0500 Subject: [PATCH 083/189] format. --- vllm/v1/attention/backends/fa_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py index d66bf92d0827..fa005a2e07e6 100644 --- a/vllm/v1/attention/backends/fa_utils.py +++ b/vllm/v1/attention/backends/fa_utils.py @@ -22,7 +22,9 @@ elif current_platform.is_rocm(): try: - from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] # noqa: F401 + from flash_attn import ( + flash_attn_varlen_func, # type: ignore[no-redef] # noqa: F401 + ) except ImportError: def flash_attn_varlen_func(*args, **kwargs): # type: ignore[misc] From f595666fa86bb3a4f5b96a8d17f8365b9eb6360f Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Sun, 1 Feb 2026 19:37:31 -0500 Subject: [PATCH 084/189] fix mypy check. 
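Taken together, PATCHes 082-084 above leave the optional flash-attn import in fa_utils in the usual optional-dependency shape: import the real function when the package is present, otherwise bind a stub of the same name that raises only when called, with the type: ignore and noqa comments placed where mypy and ruff expect them. Condensed from the hunks above, roughly:

try:
    from flash_attn import (  # type: ignore[no-redef]
        flash_attn_varlen_func,  # noqa: F401
    )
except ImportError:

    def flash_attn_varlen_func(*args, **kwargs):  # type: ignore[misc]
        raise ImportError(
            "ROCm platform requires upstream flash-attn to be installed. "
            "Please install flash-attn first."
        )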
--- vllm/v1/attention/backends/fa_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py index fa005a2e07e6..dd26c3548c78 100644 --- a/vllm/v1/attention/backends/fa_utils.py +++ b/vllm/v1/attention/backends/fa_utils.py @@ -22,8 +22,8 @@ elif current_platform.is_rocm(): try: - from flash_attn import ( - flash_attn_varlen_func, # type: ignore[no-redef] # noqa: F401 + from flash_attn import ( # type: ignore[no-redef] + flash_attn_varlen_func, # noqa: F401 ) except ImportError: From f987139799f2ab2821617f7b7a99d7731c85cde5 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 2 Feb 2026 13:54:28 -0800 Subject: [PATCH 085/189] remove hacks Signed-off-by: Max Hu --- vllm/compilation/piecewise_backend.py | 4 ++++ vllm/config/vllm.py | 5 ----- vllm/v1/worker/gpu_worker.py | 3 --- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 48caaa4865ca..ee6779bffa55 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -61,6 +61,10 @@ def __init__( # to set the upper bound of the compile ranges max_int32 = 2**31 - 1 last_compile_range = self.compile_ranges[-1] + assert ( + last_compile_range.end + == vllm_config.scheduler_config.max_num_batched_tokens + ) self.compile_ranges[-1] = Range( start=last_compile_range.start, end=max_int32 ) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 454ea4e9670d..c99f8009c565 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1250,11 +1250,6 @@ def _set_compile_ranges(self): ): computed_compile_ranges_split_points.append(x) - # (hack) Add a large number to the compile ranges split points to ensure that - # the last range is always included for vit models. - INT_MAX = 2**63 - 1 - computed_compile_ranges_split_points.append(INT_MAX) - compilation_config.compile_ranges_split_points = sorted( computed_compile_ranges_split_points ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index e714d3c6d6aa..013780479743 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -432,10 +432,7 @@ def compile_or_warm_up_model(self) -> None: # add the end of the range to ensure compilation/warmup. all_sizes = set(cg_capture_sizes) all_sizes.update([x for x in warmup_sizes if isinstance(x, int)]) - INT_MAX = 2**63 - 1 for compile_range in compile_ranges: - if compile_range.end == INT_MAX: - continue if not any(x in compile_range for x in all_sizes): warmup_sizes.append(compile_range.end) From a7650be94d3905df5ab10ae99ebf8faff199bb0c Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Mon, 2 Feb 2026 18:11:28 -0500 Subject: [PATCH 086/189] add sequence_lengths in vit cudagraph. --- vllm/model_executor/models/qwen3_vl.py | 9 +++++++++ vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index b8351e1dbb8a..9ed02513b505 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -722,6 +722,7 @@ def forward_cudagraph( rotary_pos_emb_sin: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: torch.Tensor, + sequence_lengths: torch.Tensor, ) -> torch.Tensor: """ Forward pass optimized for CUDA graph capture/replay. 
@@ -737,6 +738,7 @@ def forward_cudagraph( rotary_pos_emb_sin: Pre-computed rotary sine embeddings cu_seqlens: Pre-computed cumulative sequence lengths (on GPU) max_seqlen: Pre-computed max sequence length (scalar tensor on GPU) + sequence_lengths: Pre-computed sequence lengths (for FlashInfer CuDNN) Returns: Vision encoder output tensor @@ -759,6 +761,7 @@ def forward_cudagraph( rotary_pos_emb_cos=rotary_pos_emb_cos, rotary_pos_emb_sin=rotary_pos_emb_sin, max_seqlen=max_seqlen, + sequence_lengths=sequence_lengths, ) if layer_num in self.deepstack_visual_indexes: deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num) @@ -813,12 +816,18 @@ def precompute_for_cudagraph( max_seqlen_gpu = self.compute_attn_mask_seqlen(cu_seqlens) max_seqlen = max_seqlen_gpu.cpu() # Move to CPU to avoid GPU sync on .item() + # Compute sequence_lengths (individual sequence lengths from cu_seqlens) + # This is used by FlashInfer CuDNN backend + sequence_lengths = cu_seqlens[1:] - cu_seqlens[:-1] + sequence_lengths = sequence_lengths.to(self.device, non_blocking=True) + return { "pos_embeds": pos_embeds, "rotary_pos_emb_cos": rotary_pos_emb_cos, "rotary_pos_emb_sin": rotary_pos_emb_sin, "cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen, + "sequence_lengths": sequence_lengths, } def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index c39290f6ac89..d914f90b2c4e 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -405,6 +405,7 @@ def capture_graph_for_grid( "rotary_pos_emb_sin": cached["rotary_pos_emb_sin"].clone(), "cu_seqlens": cached["cu_seqlens"].clone(), "max_seqlen": cached["max_seqlen"].clone(), + "sequence_lengths": cached["sequence_lengths"].clone(), } embed_buffers = self.embedding_buffers[grid_config] @@ -421,6 +422,7 @@ def capture_graph_for_grid( rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], cu_seqlens=embed_buffers["cu_seqlens"], max_seqlen=embed_buffers["max_seqlen"], + sequence_lengths=embed_buffers["sequence_lengths"], ) self.output_buffers[grid_config] = torch.empty_like(warmup_output) @@ -443,6 +445,7 @@ def capture_graph_for_grid( rotary_pos_emb_sin=embed_buffers["rotary_pos_emb_sin"], cu_seqlens=embed_buffers["cu_seqlens"], max_seqlen=embed_buffers["max_seqlen"], + sequence_lengths=embed_buffers["sequence_lengths"], ) self.output_buffers[grid_config].copy_(output) else: @@ -684,6 +687,9 @@ def run( ) embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) + embed_buffers["sequence_lengths"].copy_( + cached["sequence_lengths"], non_blocking=True + ) if self.verbose: logger.info( @@ -860,6 +866,9 @@ def run_padded( embed_buffers["max_seqlen"].copy_( actual_embeds["max_seqlen"], non_blocking=True ) + embed_buffers["sequence_lengths"].copy_( + actual_embeds["sequence_lengths"], non_blocking=True + ) if self.verbose: logger.info( From 432ad03bfd984d88672a7fedc492acaf67470862 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Mon, 2 Feb 2026 01:26:32 -0500 Subject: [PATCH 087/189] optimize sync for single gpu. 
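PATCH 086 above derives sequence_lengths from cu_seqlens for the FlashInfer/cuDNN path. Since cu_seqlens stores cumulative patch counts per image, adjacent differences recover the individual per-image lengths. A tiny worked example with made-up counts:

import torch

cu_seqlens = torch.tensor([0, 1024, 1600, 4096])  # three images, cumulative
sequence_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
print(sequence_lengths)  # tensor([1024,  576, 2496])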
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 117 +++++++++++++-------- 1 file changed, 76 insertions(+), 41 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index d914f90b2c4e..bcba515fbcc0 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -202,6 +202,21 @@ def __init__( # This allows better overlap between encoder and other GPU work. self.replay_done_event: torch.cuda.Event | None = None + # Single-GPU optimization: when TP=1, PP=1, DP=1, we can capture graphs + # on the current stream instead of a separate stream. This eliminates + # the need for stream synchronization before replay. + parallel_config = vllm_config.parallel_config + self.is_single_gpu = ( + parallel_config.tensor_parallel_size == 1 + and parallel_config.pipeline_parallel_size == 1 + and parallel_config.data_parallel_size == 1 + ) + if self.is_single_gpu: + logger.info( + "Encoder CUDA graphs: single-GPU mode enabled " + "(TP=1, PP=1, DP=1), using optimized sync scheme" + ) + def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: """Get encoder grid configurations from config or use defaults.""" compilation_config = self.vllm_config.compilation_config @@ -527,15 +542,24 @@ def capture( configs_to_capture, desc="Capturing encoder CUDA graphs" ) - # Capture each graph in its own graph_capture context to isolate failures. - # If one capture fails, the pool state won't affect subsequent captures. + # Capture each graph. For single-GPU mode, capture directly on current stream + # to avoid stream synchronization overhead at replay time. + # For multi-GPU mode, use graph_capture() context to coordinate with TP/PP. for grid_config in configs_to_capture: try: - with graph_capture(device=self.device): + if self.is_single_gpu: + # Single-GPU: capture on current stream (no separate stream) self.capture_graph_for_grid( grid_config, vision_encoder, ) + else: + # Multi-GPU: use graph_capture() for TP/PP coordination + with graph_capture(device=self.device): + self.capture_graph_for_grid( + grid_config, + vision_encoder, + ) except Exception as e: logger.warning( "Failed to capture encoder CUDA graph for grid config " @@ -662,9 +686,10 @@ def run( self.cache_hits += 1 # Wait for any previous graph replay to complete before modifying buffers. - # Using event.synchronize() is lighter than torch.cuda.synchronize() - # which waits for ALL GPU work across all streams. - if self.replay_done_event is not None: + # For single-GPU mode, this is not needed because everything is on the same + # stream and CUDA guarantees ordering. For multi-GPU mode, we need this + # because the graph runs on a different stream. + if not self.is_single_gpu and self.replay_done_event is not None: self.replay_done_event.synchronize() # Ensure contiguous memory layout for safe copy @@ -699,28 +724,30 @@ def run( input_buffer.shape, ) - # Sync current stream before replay: graph was captured on a separate stream, - # but buffer copies happen on the default stream. We need copies to complete - # before replay reads from those buffers. - # Using current_stream().synchronize() is lighter than torch.cuda.synchronize() - torch.cuda.current_stream().synchronize() + if self.is_single_gpu: + # Single-GPU optimized path: graph was captured on current stream, + # so buffer copies and replay are on the same stream - no sync needed. + # Return view directly; caller must use output before next run() call. 
+ self.graphs[grid_key].replay() + return self.output_buffers[grid_key] + else: + # Multi-GPU path: graph was captured on a separate stream. + # Sync current stream before replay to ensure buffer copies complete. + torch.cuda.current_stream().synchronize() - # Replay the graph - self.graphs[grid_key].replay() + # Replay the graph + self.graphs[grid_key].replay() - # Record event after replay for lightweight sync in next call. - # Create event lazily on first use. - if self.replay_done_event is None: - self.replay_done_event = torch.cuda.Event() - self.replay_done_event.record() + # Record event after replay for lightweight sync in next call. + if self.replay_done_event is None: + self.replay_done_event = torch.cuda.Event() + self.replay_done_event.record() - # Sync to ensure output is ready before clone. - # TODO: Could eliminate this if we return a view and defer sync to caller, - # but that would require careful handling of buffer reuse. - self.replay_done_event.synchronize() + # Sync to ensure output is ready before clone. + self.replay_done_event.synchronize() - # Return a clone of the output to avoid issues with buffer reuse - return self.output_buffers[grid_key].clone() + # Return a clone of the output to avoid issues with buffer reuse + return self.output_buffers[grid_key].clone() def run_padded( self, @@ -825,8 +852,9 @@ def run_padded( self.cache_hits += 1 # Wait for any previous graph replay to complete before modifying buffers. - # Using event.synchronize() is lighter than torch.cuda.synchronize() - if self.replay_done_event is not None: + # For single-GPU mode, this is not needed because everything is on the same + # stream and CUDA guarantees ordering. + if not self.is_single_gpu and self.replay_done_event is not None: self.replay_done_event.synchronize() # Compute embeddings for ACTUAL grid, then pad to bucket size. @@ -880,25 +908,32 @@ def run_padded( bucket_input_patches, ) - # Sync current stream before replay: graph was captured on a separate stream, - # but buffer modifications (zero, copy) happen on the default stream. - # Using current_stream().synchronize() is lighter than torch.cuda.synchronize() - torch.cuda.current_stream().synchronize() + if self.is_single_gpu: + # Single-GPU optimized path: graph was captured on current stream, + # so buffer modifications and replay are on the same stream - no sync needed. + # Return view directly; caller must use output before next run() call. + self.graphs[bucket_grid].replay() + full_output = self.output_buffers[bucket_grid] + trimmed_output = full_output[:num_output_tokens] + else: + # Multi-GPU path: graph was captured on a separate stream. + # Sync current stream before replay to ensure buffer modifications complete. + torch.cuda.current_stream().synchronize() - # Replay the graph with updated embedding buffers - self.graphs[bucket_grid].replay() + # Replay the graph with updated embedding buffers + self.graphs[bucket_grid].replay() - # Record event after replay for lightweight sync in next call. - if self.replay_done_event is None: - self.replay_done_event = torch.cuda.Event() - self.replay_done_event.record() + # Record event after replay for lightweight sync in next call. + if self.replay_done_event is None: + self.replay_done_event = torch.cuda.Event() + self.replay_done_event.record() - # Sync to ensure output is ready before clone. - self.replay_done_event.synchronize() + # Sync to ensure output is ready before clone. 
+ self.replay_done_event.synchronize() - # Get output and trim to actual size - full_output = self.output_buffers[bucket_grid] - trimmed_output = full_output[:num_output_tokens].clone() + # Get output and trim to actual size + full_output = self.output_buffers[bucket_grid] + trimmed_output = full_output[:num_output_tokens].clone() if self.verbose: logger.debug( From 7356f51aa5667730216051a2afdba058700c185a Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Mon, 2 Feb 2026 14:39:12 -0500 Subject: [PATCH 088/189] update buffer only if run_padded modified it. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index bcba515fbcc0..6ec6c00c6f61 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -217,6 +217,10 @@ def __init__( "(TP=1, PP=1, DP=1), using optimized sync scheme" ) + # Track which grids have had their embedding buffers modified by run_padded(). + # This allows run() to skip restoring cached tensors when not needed. + self.modified_grids: set[tuple[int, int, int]] = set() + def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: """Get encoder grid configurations from config or use defaults.""" compilation_config = self.vllm_config.compilation_config @@ -699,8 +703,9 @@ def run( # Copy input to the captured buffer (non-blocking for better overlap) input_buffer.copy_(pixel_values, non_blocking=True) - # For exact match, restore cached embeddings (may have been modified) - if grid_key in self.embedding_buffers and grid_key in self.cached_tensors: + # For exact match, restore cached embeddings only if modified by run_padded(). + # This avoids 6 unnecessary tensor copies when only using exact-match mode. 
+ if grid_key in self.modified_grids: embed_buffers = self.embedding_buffers[grid_key] cached = self.cached_tensors[grid_key] embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) @@ -715,6 +720,7 @@ def run( embed_buffers["sequence_lengths"].copy_( cached["sequence_lengths"], non_blocking=True ) + self.modified_grids.discard(grid_key) if self.verbose: logger.info( @@ -898,6 +904,9 @@ def run_padded( actual_embeds["sequence_lengths"], non_blocking=True ) + # Mark this grid as modified so run() knows to restore cached tensors + self.modified_grids.add(bucket_grid) + if self.verbose: logger.info( "run_padded(): bucket_grid=%s, actual_grid=%s, " From 86cb2f701870d6412a071c9bfb290deb710d4132 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 2 Feb 2026 20:50:53 -0800 Subject: [PATCH 089/189] fix cudnn Signed-off-by: Max Hu --- vllm/model_executor/models/qwen3_vl.py | 25 ++++++++++++++++------ vllm/v1/attention/ops/vit_attn_wrappers.py | 3 ++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 9ed02513b505..b8251c498e00 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -220,6 +220,7 @@ def forward(self, x: torch.Tensor): "rotary_pos_emb_cos": 0, "rotary_pos_emb_sin": 0, "max_seqlen": 0, + "sequence_lengths": 0, # Batch dimension is dynamic }, mark_unbacked_dims={"max_seqlen": 0}, enable_if=should_torch_compile_mm_vit, @@ -808,19 +809,31 @@ def precompute_for_cudagraph( grid_thw_np[:, 1] * grid_thw_np[:, 2], grid_thw_np[:, 0] ).cumsum(axis=0, dtype=np.int32) cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + sequence_lengths = cu_seqlens[1:] - cu_seqlens[:-1] + if self.attn_backend == AttentionBackendEnum.FLASHINFER: + sequence_lengths = self.add_padding_to_fi_seqlens( + sequence_lengths, len(sequence_lengths), 0 + ) + cu_seqlens = self.compute_flashinfer_cu_seqlens( + cu_seqlens, rotary_pos_emb_cos, rotary_pos_emb_sin + ) cu_seqlens = torch.from_numpy(cu_seqlens).to(self.device, non_blocking=True) + sequence_lengths = torch.from_numpy(sequence_lengths).to( + self.device, non_blocking=True + ) # Compute max sequence length as CPU scalar tensor # Using CPU tensor is important for CUDA graph capture: .item() on CPU # tensor doesn't trigger GPU sync, so it won't invalidate capture. 
- max_seqlen_gpu = self.compute_attn_mask_seqlen(cu_seqlens) + max_seqlen_gpu = ( + torch.tensor(128 * 1024, device=self.device) + # setting to 128k to avoid cudnn recompilation + # TODO: use the real max_seqlen once cudnn compilation is optimized + if self.attn_backend == AttentionBackendEnum.FLASHINFER + else self.compute_attn_mask_seqlen(cu_seqlens) + ) max_seqlen = max_seqlen_gpu.cpu() # Move to CPU to avoid GPU sync on .item() - # Compute sequence_lengths (individual sequence lengths from cu_seqlens) - # This is used by FlashInfer CuDNN backend - sequence_lengths = cu_seqlens[1:] - cu_seqlens[:-1] - sequence_lengths = sequence_lengths.to(self.device, non_blocking=True) - return { "pos_embeds": pos_embeds, "rotary_pos_emb_cos": rotary_pos_emb_cos, diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index eda852f04603..e678d9e6d534 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -320,6 +320,7 @@ def flashinfer_wrapper( batch_offsets_k=batch_offsets_qk, batch_offsets_v=batch_offsets_v, batch_offsets_o=batch_offsets_o, + is_cuda_graph_compatible=True, ) if is_reshaped: @@ -338,7 +339,7 @@ def vit_flashinfer_wrapper_fake( max_seqlen: torch.Tensor | None = None, sequence_lengths: torch.Tensor | None = None, ) -> torch.Tensor: - return torch.empty_like(q) + return torch.empty_like(q, memory_format=torch.contiguous_format) direct_register_custom_op( From 0aae8fbff8ffffaa6fd85faed9c674c205a9e6a5 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Mon, 2 Feb 2026 21:03:33 -0800 Subject: [PATCH 090/189] fix Signed-off-by: Max Hu --- vllm/v1/attention/ops/vit_attn_wrappers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index e678d9e6d534..edc118c9add8 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -339,7 +339,7 @@ def vit_flashinfer_wrapper_fake( max_seqlen: torch.Tensor | None = None, sequence_lengths: torch.Tensor | None = None, ) -> torch.Tensor: - return torch.empty_like(q, memory_format=torch.contiguous_format) + return torch.empty_like(q) direct_register_custom_op( From 5b6176b5b02fa2c18dbe7a03f32782233a235545 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 13:17:23 -0500 Subject: [PATCH 091/189] fix line too long. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 6ec6c00c6f61..ae67e47461a0 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -919,7 +919,7 @@ def run_padded( if self.is_single_gpu: # Single-GPU optimized path: graph was captured on current stream, - # so buffer modifications and replay are on the same stream - no sync needed. + # so buffer modifications and replay are on same stream - no sync needed. # Return view directly; caller must use output before next run() call. self.graphs[bucket_grid].replay() full_output = self.output_buffers[bucket_grid] From ce5a1ed345877526bc6b43d2d9029a6698a6ba31 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 14:54:36 -0500 Subject: [PATCH 092/189] add encoder split ops. 
--- vllm/compilation/backends.py | 4 ++++ vllm/config/compilation.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index f06047be61b9..d75bc1bd4772 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -716,6 +716,10 @@ def __call__( if self.compilation_config.use_inductor_graph_partition: # Let Inductor decide partitioning; avoid FX-level pre-splitting. fx_split_ops: list[str] = [] + elif self.is_encoder: + # For encoder compilation, use encoder-specific splitting ops + # to enable piecewise cudagraph (attention in eager, rest in graph) + fx_split_ops = self.compilation_config.get_encoder_splitting_ops() else: fx_split_ops = self.compilation_config.splitting_ops or [] diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 9cf9dbf7baca..caea22868903 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -673,6 +673,15 @@ class CompilationConfig: "vllm::sparse_attn_indexer", ] + # Encoder (ViT) attention ops; used for piecewise cudagraphs on encoders + # These ops depend on batch structure (cu_seqlens), so they must be + # excluded from cudagraph capture to allow batching multiple images. + _encoder_attention_ops: ClassVar[list[str]] = [ + "vllm::flash_attn_maxseqlen_wrapper", + "vllm::fa4_flash_attn_maxseqlen_wrapper", + "vllm::flashinfer_wrapper", + ] + def compute_hash(self) -> str: """ Provide a hash that uniquely identifies all the configs @@ -1074,6 +1083,15 @@ def splitting_ops_contain_attention(self) -> bool: op in self.splitting_ops for op in self._attention_ops ) + def get_encoder_splitting_ops(self) -> list[str]: + """Get splitting ops for encoder (ViT) compilation. + + For piecewise cudagraph on encoders, we split at attention ops + so that non-attention ops (norm, MLP) can be captured in cudagraphs + while attention runs in eager mode with batched images. + """ + return list(self._encoder_attention_ops) + def is_attention_compiled_piecewise(self) -> bool: if not self.splitting_ops_contain_attention(): return False From 727cb8843279916d0d167d5978796bb049f4c31c Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 14:55:21 -0500 Subject: [PATCH 093/189] add encoder piecewise cudagraph option. --- vllm/config/compilation.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index caea22868903..0123ffdc236d 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -489,6 +489,16 @@ class CompilationConfig: the kernel launch savings. Set to False if you observe throughput regression with encoder CUDA graphs.""" + encoder_cudagraph_piecewise: bool = False + """Enable piecewise CUDA graph mode for encoder (ViT). + When True, torch.compile splits the encoder graph at attention ops, so: + - Non-attention ops (norm, MLP, patch_embed, merger) are captured in CUDA graphs + - Attention ops run in eager mode with original batch structure + This allows batching multiple images together while still benefiting from + CUDA graphs for the non-attention parts. More efficient than one-by-one + processing when batch sizes vary. + Requires compile_mm_encoder=True. Mutually exclusive with cudagraph_mm_encoder.""" + # Inductor capture compile_sizes: list[int | str] | None = None """Sizes to compile for inductor. 
In addition From 7df605453f812fe159ee46ce14709a657be208fb Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 14:56:29 -0500 Subject: [PATCH 094/189] check if piecewise encoder cudagraph is enabled. --- vllm/v1/worker/gpu_model_runner.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b4d56052ca25..e8bfe0d33273 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -693,6 +693,28 @@ def _init_encoder_cudagraph_manager(self) -> None: if self.compilation_config is None: return + # Check if piecewise encoder cudagraph mode is enabled + # In piecewise mode, torch.compile handles graph splitting at attention ops, + # so we don't need the full EncoderCudaGraphManager + encoder_cudagraph_piecewise = getattr( + self.compilation_config, "encoder_cudagraph_piecewise", False + ) + if encoder_cudagraph_piecewise: + compile_mm_encoder = getattr( + self.compilation_config, "compile_mm_encoder", False + ) + if not compile_mm_encoder: + logger.warning( + "encoder_cudagraph_piecewise=True requires compile_mm_encoder=True. " + "Piecewise encoder cudagraph will not be effective." + ) + else: + logger.info( + "Piecewise encoder CUDA graph mode enabled. " + "torch.compile will handle graph splitting at attention ops." + ) + return + if not getattr(self.compilation_config, "cudagraph_mm_encoder", False): return From f87337091bbfbd0cb9da35adaa3f11272da3115e Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 14:58:14 -0500 Subject: [PATCH 095/189] check if compile is enabled for piecewise. --- vllm/v1/worker/gpu_model_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e8bfe0d33273..b3bb4b1cc3f3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -705,8 +705,9 @@ def _init_encoder_cudagraph_manager(self) -> None: ) if not compile_mm_encoder: logger.warning( - "encoder_cudagraph_piecewise=True requires compile_mm_encoder=True. " - "Piecewise encoder cudagraph will not be effective." + "encoder_cudagraph_piecewise=True requires " + "compile_mm_encoder=True. Piecewise encoder cudagraph " + "will not be effective." ) else: logger.info( From 226a42bb4836c9d887fda4d2281e6280215d22af Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 15:11:19 -0500 Subject: [PATCH 096/189] add encoder cudagraph capture sizes. --- vllm/config/compilation.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 0123ffdc236d..d8abe9673e5c 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -499,6 +499,16 @@ class CompilationConfig: processing when batch sizes vary. Requires compile_mm_encoder=True. Mutually exclusive with cudagraph_mm_encoder.""" + encoder_cudagraph_capture_sizes: list[int] | None = None + """CUDA graph capture sizes (token counts) for encoder piecewise mode. + These are the total token counts at which CUDA graphs are captured. 
+ For Qwen3-VL with spatial_merge_size=2: + - (1, 32, 32) grid → 1024 patches → 256 output tokens + - (1, 64, 64) grid → 4096 patches → 1024 output tokens + - (1, 94, 94) grid → 8836 patches → 2209 output tokens + Example: [256, 512, 1024, 2048, 4096, 8192, 16384] + If None, encoder piecewise mode will use compile_ranges only (no cudagraph).""" + # Inductor capture compile_sizes: list[int | str] | None = None """Sizes to compile for inductor. In addition From e165ef4709d1db99e07a41e4cf467cbecd3d7f2d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 15:12:19 -0500 Subject: [PATCH 097/189] use encoder specific cudagraph capture size. --- vllm/compilation/piecewise_backend.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index ee6779bffa55..852f2aabbab0 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -72,8 +72,15 @@ def __init__( log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}" logger.debug_once(log_string) - self.compile_sizes = self.compilation_config.compile_sizes - log_string = f"PiecewiseBackend: compile_sizes: {self.compile_sizes}" + # Use encoder-specific capture sizes for encoder compilation + if self.is_encoder_compilation: + self.compile_sizes = self.compilation_config.encoder_cudagraph_capture_sizes + else: + self.compile_sizes = self.compilation_config.compile_sizes + log_string = ( + f"PiecewiseBackend: compile_sizes: {self.compile_sizes} " + f"(is_encoder={self.is_encoder_compilation})" + ) logger.debug_once(log_string) self.sym_shape_indices = sym_shape_indices From 26b1b8c8340d0d77d50e50fc185ccc4df6d69564 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 15:54:18 -0500 Subject: [PATCH 098/189] add padding mode for piecewise vit cudagraph. --- vllm/v1/worker/gpu_model_runner.py | 175 +++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b3bb4b1cc3f3..926b0ef9af4d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2662,6 +2662,181 @@ def _execute_with_encoder_cudagraph( ) return None + def _find_nearest_encoder_capture_size( + self, num_tokens: int + ) -> int | None: + """Find the smallest capture size >= num_tokens for piecewise mode. + + Args: + num_tokens: The actual number of output tokens + + Returns: + The nearest capture size, or None if no suitable size found + """ + if self.compilation_config is None: + return None + + capture_sizes = getattr( + self.compilation_config, "encoder_cudagraph_capture_sizes", None + ) + if capture_sizes is None or len(capture_sizes) == 0: + return None + + # Find smallest size >= num_tokens + for size in sorted(capture_sizes): + if size >= num_tokens: + return size + + # num_tokens exceeds all capture sizes + return None + + def _execute_encoder_piecewise_padded( + self, + model: "SupportsMultiModal", + mm_kwargs_group: dict, + modality: str, + ) -> list[torch.Tensor] | None: + """Execute encoder with padding for piecewise cudagraph mode. + + Pads inputs to the nearest capture size so that the compiled encoder + can use cudagraph. Trims output to actual size after execution. 
+ + Args: + model: The multimodal model + mm_kwargs_group: Batched multimodal kwargs + modality: The modality type ("image" or "video") + + Returns: + List of encoder outputs if padding was applied, None otherwise + """ + # Only support image/video modalities + if modality not in ("image", "video"): + return None + + # Extract grid_thw and pixel_values + grid_thw = mm_kwargs_group.get("image_grid_thw") + pixel_key = "pixel_values" + if grid_thw is None: + grid_thw = mm_kwargs_group.get("video_grid_thw") + pixel_key = "pixel_values_videos" + if grid_thw is None: + return None + + pixel_values = mm_kwargs_group.get(pixel_key) + if pixel_values is None: + return None + + # Convert to list if tensor + if hasattr(grid_thw, "tolist"): + grid_thw_list = grid_thw.tolist() + else: + grid_thw_list = list(grid_thw) + + # Get spatial merge size from model + visual = getattr(model, "visual", None) + if visual is None: + return None + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) + + # Calculate actual output tokens + actual_num_patches = sum(t * h * w for t, h, w in grid_thw_list) + actual_output_tokens = actual_num_patches // (spatial_merge_size ** 2) + + # Find nearest capture size + capture_size = self._find_nearest_encoder_capture_size(actual_output_tokens) + if capture_size is None: + # No suitable capture size, fall back to non-padded execution + return None + + if capture_size == actual_output_tokens: + # Exact match, no padding needed + return None + + # Calculate padding needed + padding_tokens = capture_size - actual_output_tokens + padding_patches = padding_tokens * (spatial_merge_size ** 2) + + # Pad pixel_values with zeros + # pixel_values shape: [num_patches, patch_channels] + num_input_patches = pixel_values.shape[0] + padded_num_patches = num_input_patches + padding_patches + + padded_pixel_values = torch.zeros( + (padded_num_patches, pixel_values.shape[1]), + dtype=pixel_values.dtype, + device=pixel_values.device, + ) + padded_pixel_values[:num_input_patches] = pixel_values + + # Extend the last image's grid to include padding patches + # This ensures cu_seqlens is computed correctly + padded_grid_thw = [list(g) for g in grid_thw_list] + if len(padded_grid_thw) > 0: + # Add padding to the last image's H dimension + # We need to add padding_patches to the last image + last_t, last_h, last_w = padded_grid_thw[-1] + # Calculate new dimensions that accommodate padding + last_patches = last_t * last_h * last_w + new_last_patches = last_patches + padding_patches + # Keep T and W the same, adjust H + new_last_h = new_last_patches // (last_t * last_w) + if new_last_h * last_t * last_w != new_last_patches: + # Can't evenly divide, adjust W instead + new_last_w = new_last_patches // (last_t * last_h) + if new_last_w * last_t * last_h != new_last_patches: + # Use square-ish dimensions + import math + total = new_last_patches // last_t + new_last_h = int(math.ceil(math.sqrt(total))) + new_last_w = int(math.ceil(total / new_last_h)) + # Adjust padding to match + actual_new_patches = last_t * new_last_h * new_last_w + if actual_new_patches != new_last_patches: + # Recalculate padded pixel values + extra_padding = actual_new_patches - last_patches + padded_num_patches = num_input_patches + extra_padding + padded_pixel_values = torch.zeros( + (padded_num_patches, pixel_values.shape[1]), + dtype=pixel_values.dtype, + device=pixel_values.device, + ) + padded_pixel_values[:num_input_patches] = pixel_values + else: + new_last_h = last_h + else: + new_last_w = last_w + padded_grid_thw[-1] = 
[last_t, new_last_h, new_last_w] + + # Create padded kwargs + padded_kwargs = dict(mm_kwargs_group) + padded_kwargs[pixel_key] = padded_pixel_values + if modality == "image": + padded_kwargs["image_grid_thw"] = padded_grid_thw + else: + padded_kwargs["video_grid_thw"] = padded_grid_thw + + # Execute encoder with padded inputs + padded_outputs = model.embed_multimodal(**padded_kwargs) + + # Trim outputs to actual size + trimmed_outputs = [] + for output in padded_outputs: + if isinstance(output, torch.Tensor): + trimmed_outputs.append(output[:actual_output_tokens]) + else: + trimmed_outputs.append(output) + + if self.encoder_cudagraph_verbose: + logger.info( + "Piecewise padded execution: actual_tokens=%d, " + "capture_size=%d, padding=%d", + actual_output_tokens, + capture_size, + capture_size - actual_output_tokens, + ) + + return trimmed_outputs + def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", From 992dd7ef2fa094802e5cd7983daf5c383f5d8d37 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 15:55:56 -0500 Subject: [PATCH 099/189] use padding if enabled for piecewise. --- vllm/v1/worker/gpu_model_runner.py | 42 +++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 926b0ef9af4d..19d802ac2a4d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2494,15 +2494,39 @@ def _execute_mm_encoder( # CUDA graph was used successfully curr_group_outputs = cudagraph_result else: - # Fall back to eager mode. - # Run the encoder. - # `curr_group_outputs` is either of the following: - # 1. A tensor of shape (num_items, feature_size, hidden_size) - # in case feature_size is fixed across all multimodal items. - # 2. A list or tuple (length: num_items) of tensors, - # each of shape (feature_size, hidden_size) in case the feature - # size is dynamic depending on the input multimodal items. - curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + # Try piecewise padded execution if enabled + piecewise_result = None + if ( + self.compilation_config is not None + and getattr( + self.compilation_config, + "encoder_cudagraph_piecewise", + False, + ) + ): + piecewise_result = ( + self._execute_encoder_piecewise_padded( + model, mm_kwargs_group, modality + ) + ) + + if piecewise_result is not None: + curr_group_outputs = piecewise_result + else: + # Fall back to non-padded execution. + # Run the encoder. + # `curr_group_outputs` is either of the following: + # 1. A tensor of shape + # (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all + # multimodal items. + # 2. A list or tuple (length: num_items) of tensors, + # each of shape (feature_size, hidden_size) in + # case the feature size is dynamic depending on + # the input multimodal items. + curr_group_outputs = model.embed_multimodal( + **mm_kwargs_group + ) sanity_check_mm_encoder_outputs( curr_group_outputs, From 240306edcff50cab92e8e272a08cf51addb52b37 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 16:00:20 -0500 Subject: [PATCH 100/189] use compilation config to control padded mode. 
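The padded piecewise path is now opt-out as well as opt-in: it only runs when
encoder_cudagraph_piecewise is enabled and encoder_cudagraph_padded_mode has not been
turned off. A rough configuration example, assuming these knobs are set through
CompilationConfig as in the earlier patches (the exact CLI plumbing is not shown in this
series, and the capture sizes are the example values from the field's docstring):

    from vllm.config import CompilationConfig

    compilation_config = CompilationConfig(
        compile_mm_encoder=True,
        encoder_cudagraph_piecewise=True,
        encoder_cudagraph_capture_sizes=[256, 512, 1024, 2048, 4096, 8192, 16384],
    )
    # encoder_cudagraph_padded_mode is not declared as a field in this series;
    # it is read via getattr() with a default of True, so padding stays on
    # unless the attribute is explicitly set to False.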
--- vllm/v1/worker/gpu_model_runner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 19d802ac2a4d..ce844c95dcdb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2503,6 +2503,11 @@ def _execute_mm_encoder( "encoder_cudagraph_piecewise", False, ) + and getattr( + self.compilation_config, + "encoder_cudagraph_padded_mode", + True, + ) ): piecewise_result = ( self._execute_encoder_piecewise_padded( From 1fcd767fd348d2b0173e51367d53c840a687c3c3 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 16:06:52 -0500 Subject: [PATCH 101/189] format. --- vllm/v1/worker/gpu_model_runner.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ce844c95dcdb..0cb8bddadf41 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2509,10 +2509,8 @@ def _execute_mm_encoder( True, ) ): - piecewise_result = ( - self._execute_encoder_piecewise_padded( - model, mm_kwargs_group, modality - ) + piecewise_result = self._execute_encoder_piecewise_padded( + model, mm_kwargs_group, modality ) if piecewise_result is not None: @@ -2691,9 +2689,7 @@ def _execute_with_encoder_cudagraph( ) return None - def _find_nearest_encoder_capture_size( - self, num_tokens: int - ) -> int | None: + def _find_nearest_encoder_capture_size(self, num_tokens: int) -> int | None: """Find the smallest capture size >= num_tokens for piecewise mode. Args: @@ -2769,7 +2765,7 @@ def _execute_encoder_piecewise_padded( # Calculate actual output tokens actual_num_patches = sum(t * h * w for t, h, w in grid_thw_list) - actual_output_tokens = actual_num_patches // (spatial_merge_size ** 2) + actual_output_tokens = actual_num_patches // (spatial_merge_size**2) # Find nearest capture size capture_size = self._find_nearest_encoder_capture_size(actual_output_tokens) @@ -2783,7 +2779,7 @@ def _execute_encoder_piecewise_padded( # Calculate padding needed padding_tokens = capture_size - actual_output_tokens - padding_patches = padding_tokens * (spatial_merge_size ** 2) + padding_patches = padding_tokens * (spatial_merge_size**2) # Pad pixel_values with zeros # pixel_values shape: [num_patches, patch_channels] @@ -2815,6 +2811,7 @@ def _execute_encoder_piecewise_padded( if new_last_w * last_t * last_h != new_last_patches: # Use square-ish dimensions import math + total = new_last_patches // last_t new_last_h = int(math.ceil(math.sqrt(total))) new_last_w = int(math.ceil(total / new_last_h)) From e63d608d2118e3c571491ed3016be790ad023f8b Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 17:02:45 -0500 Subject: [PATCH 102/189] use tensor for grid_thw instead of list. 
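The multimodal kwargs validation expects grid_thw to expose attributes like .ndim, which
a plain Python list does not have, so the padded grid is converted to an int32 tensor
before being handed back to embed_multimodal. Minimal illustration (values are arbitrary,
not taken from the patch):

    import torch

    padded_grid_thw = [[1, 64, 64], [1, 32, 32]]  # (T, H, W) per image
    grid_thw = torch.tensor(padded_grid_thw, dtype=torch.int32)
    assert grid_thw.ndim == 2

The two follow-up patches below revisit where this value should live (a numpy array, then
a CPU tensor) so the visual encoder can turn it into a list or numpy array without an
extra device round-trip.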
--- vllm/v1/worker/gpu_model_runner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0cb8bddadf41..ef4478980d08 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2836,10 +2836,14 @@ def _execute_encoder_piecewise_padded( # Create padded kwargs padded_kwargs = dict(mm_kwargs_group) padded_kwargs[pixel_key] = padded_pixel_values + # Convert padded_grid_thw to tensor (model expects tensor, not list) + padded_grid_thw_tensor = torch.tensor( + padded_grid_thw, dtype=torch.int32, device=pixel_values.device + ) if modality == "image": - padded_kwargs["image_grid_thw"] = padded_grid_thw + padded_kwargs["image_grid_thw"] = padded_grid_thw_tensor else: - padded_kwargs["video_grid_thw"] = padded_grid_thw + padded_kwargs["video_grid_thw"] = padded_grid_thw_tensor # Execute encoder with padded inputs padded_outputs = model.embed_multimodal(**padded_kwargs) From 0328e2217112ba010f9b5693b4ff17b714d967af Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 17:30:04 -0500 Subject: [PATCH 103/189] use numpy array or cpu tensor for grid_thw. --- vllm/model_executor/models/qwen2_vl.py | 4 +++- vllm/model_executor/models/qwen3_vl.py | 4 +++- vllm/v1/worker/gpu_model_runner.py | 10 ++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 4f8e694d75cb..7495d14102ce 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -686,9 +686,11 @@ def forward( if isinstance(grid_thw, list): grid_thw_list = grid_thw grid_thw = np.array(grid_thw, dtype=np.int32) + elif isinstance(grid_thw, np.ndarray): + grid_thw_list = grid_thw.tolist() else: grid_thw_list = grid_thw.tolist() - grid_thw = grid_thw.numpy() + grid_thw = grid_thw.cpu().numpy() # compute position embedding rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index b8251c498e00..03aacbf53a13 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -660,9 +660,11 @@ def forward( if isinstance(grid_thw, list): grid_thw_list = grid_thw grid_thw = np.array(grid_thw, dtype=np.int32) + elif isinstance(grid_thw, np.ndarray): + grid_thw_list = grid_thw.tolist() else: grid_thw_list = grid_thw.tolist() - grid_thw = grid_thw.numpy() + grid_thw = grid_thw.cpu().numpy() pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ef4478980d08..92a83e63a429 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2836,14 +2836,12 @@ def _execute_encoder_piecewise_padded( # Create padded kwargs padded_kwargs = dict(mm_kwargs_group) padded_kwargs[pixel_key] = padded_pixel_values - # Convert padded_grid_thw to tensor (model expects tensor, not list) - padded_grid_thw_tensor = torch.tensor( - padded_grid_thw, dtype=torch.int32, device=pixel_values.device - ) + # Use numpy array directly (has .ndim, avoids tensor/device overhead) + padded_grid_thw_np = np.array(padded_grid_thw, dtype=np.int32) if modality == "image": - padded_kwargs["image_grid_thw"] = padded_grid_thw_tensor + padded_kwargs["image_grid_thw"] = padded_grid_thw_np else: - 
padded_kwargs["video_grid_thw"] = padded_grid_thw_tensor + padded_kwargs["video_grid_thw"] = padded_grid_thw_np # Execute encoder with padded inputs padded_outputs = model.embed_multimodal(**padded_kwargs) From f98c7d03adaed69cdc15212298cf0a07af64c281 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 18:29:30 -0500 Subject: [PATCH 104/189] use cpu tensor for grid_thw. --- vllm/v1/worker/gpu_model_runner.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 92a83e63a429..3b39938dc30d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2836,12 +2836,14 @@ def _execute_encoder_piecewise_padded( # Create padded kwargs padded_kwargs = dict(mm_kwargs_group) padded_kwargs[pixel_key] = padded_pixel_values - # Use numpy array directly (has .ndim, avoids tensor/device overhead) - padded_grid_thw_np = np.array(padded_grid_thw, dtype=np.int32) + # Use CPU tensor (has .ndim for model, accepted by tensor schema) + padded_grid_thw_tensor = torch.tensor( + padded_grid_thw, dtype=torch.int32, device="cpu" + ) if modality == "image": - padded_kwargs["image_grid_thw"] = padded_grid_thw_np + padded_kwargs["image_grid_thw"] = padded_grid_thw_tensor else: - padded_kwargs["video_grid_thw"] = padded_grid_thw_np + padded_kwargs["video_grid_thw"] = padded_grid_thw_tensor # Execute encoder with padded inputs padded_outputs = model.embed_multimodal(**padded_kwargs) From 87480b82155d9d8d00b4746851e920a83a1090ba Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 18:55:16 -0500 Subject: [PATCH 105/189] add dummy image instead of pad each image. --- vllm/v1/worker/gpu_model_runner.py | 75 +++++++++++------------------- 1 file changed, 27 insertions(+), 48 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3b39938dc30d..474486a7b26f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4,6 +4,7 @@ import functools import gc import itertools +import math import time from collections import defaultdict from collections.abc import Iterator, Sequence @@ -2793,45 +2794,27 @@ def _execute_encoder_piecewise_padded( ) padded_pixel_values[:num_input_patches] = pixel_values - # Extend the last image's grid to include padding patches - # This ensures cu_seqlens is computed correctly + # Add a dummy image entry for padding patches (don't modify existing grids) + # This preserves position embeddings for real images padded_grid_thw = [list(g) for g in grid_thw_list] - if len(padded_grid_thw) > 0: - # Add padding to the last image's H dimension - # We need to add padding_patches to the last image - last_t, last_h, last_w = padded_grid_thw[-1] - # Calculate new dimensions that accommodate padding - last_patches = last_t * last_h * last_w - new_last_patches = last_patches + padding_patches - # Keep T and W the same, adjust H - new_last_h = new_last_patches // (last_t * last_w) - if new_last_h * last_t * last_w != new_last_patches: - # Can't evenly divide, adjust W instead - new_last_w = new_last_patches // (last_t * last_h) - if new_last_w * last_t * last_h != new_last_patches: - # Use square-ish dimensions - import math - - total = new_last_patches // last_t - new_last_h = int(math.ceil(math.sqrt(total))) - new_last_w = int(math.ceil(total / new_last_h)) - # Adjust padding to match - actual_new_patches = last_t * new_last_h * new_last_w - if actual_new_patches != new_last_patches: 
- # Recalculate padded pixel values - extra_padding = actual_new_patches - last_patches - padded_num_patches = num_input_patches + extra_padding - padded_pixel_values = torch.zeros( - (padded_num_patches, pixel_values.shape[1]), - dtype=pixel_values.dtype, - device=pixel_values.device, - ) - padded_pixel_values[:num_input_patches] = pixel_values - else: - new_last_h = last_h - else: - new_last_w = last_w - padded_grid_thw[-1] = [last_t, new_last_h, new_last_w] + if padding_patches > 0: + # Create a square-ish dummy grid for padding patches + dummy_side = int(math.ceil(math.sqrt(padding_patches))) + # Ensure it's even (required by spatial_merge_size=2) + if dummy_side % 2 != 0: + dummy_side += 1 + actual_dummy_patches = dummy_side * dummy_side + # Update padding to match actual dummy grid + if actual_dummy_patches != padding_patches: + padded_num_patches = num_input_patches + actual_dummy_patches + padded_pixel_values = torch.zeros( + (padded_num_patches, pixel_values.shape[1]), + dtype=pixel_values.dtype, + device=pixel_values.device, + ) + padded_pixel_values[:num_input_patches] = pixel_values + # Add dummy image entry: T=1, H=dummy_side, W=dummy_side + padded_grid_thw.append([1, dummy_side, dummy_side]) # Create padded kwargs padded_kwargs = dict(mm_kwargs_group) @@ -2848,24 +2831,20 @@ def _execute_encoder_piecewise_padded( # Execute encoder with padded inputs padded_outputs = model.embed_multimodal(**padded_kwargs) - # Trim outputs to actual size - trimmed_outputs = [] - for output in padded_outputs: - if isinstance(output, torch.Tensor): - trimmed_outputs.append(output[:actual_output_tokens]) - else: - trimmed_outputs.append(output) + # Return only real image outputs (exclude dummy image at the end) + num_real_images = len(grid_thw_list) + real_outputs = list(padded_outputs[:num_real_images]) if self.encoder_cudagraph_verbose: logger.info( "Piecewise padded execution: actual_tokens=%d, " - "capture_size=%d, padding=%d", + "capture_size=%d, num_real_images=%d", actual_output_tokens, capture_size, - capture_size - actual_output_tokens, + num_real_images, ) - return trimmed_outputs + return real_outputs def _gather_mm_embeddings( self, From f8f5e252b5cf42dc1be2fa30df237f018f52d980 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 20:22:31 -0500 Subject: [PATCH 106/189] add custom forward function for piecewise. --- vllm/model_executor/models/qwen3_vl.py | 66 ++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 03aacbf53a13..cb712c5942f1 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -845,6 +845,72 @@ def precompute_for_cudagraph( "sequence_lengths": sequence_lengths, } + def forward_piecewise( + self, + x: torch.Tensor, + pos_embeds: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor, + sequence_lengths: torch.Tensor, + ) -> torch.Tensor: + """ + Forward pass optimized for piecewise CUDA graph mode with batched images. + + This method accepts pre-computed position embeddings, rotary embeddings, + and cumulative sequence lengths. Unlike forward_cudagraph which processes + one image at a time, this method handles batched images with padding for + piecewise cudagraph optimization. 
+ + The key difference from the regular forward() is that all grid-dependent + computations (position embeddings, rotary embeddings, cu_seqlens) are + pre-computed outside the compiled graph, allowing padding to be applied + to match cudagraph capture sizes. + + Args: + x: Input pixel values [num_patches, patch_channels] + pos_embeds: Pre-computed position embeddings [num_patches, hidden_size] + rotary_pos_emb_cos: Pre-computed rotary cosine embeddings + rotary_pos_emb_sin: Pre-computed rotary sine embeddings + cu_seqlens: Pre-computed cumulative sequence lengths (on GPU) + max_seqlen: Pre-computed max sequence length (scalar tensor on GPU) + sequence_lengths: Pre-computed sequence lengths (for FlashInfer CuDNN) + + Returns: + Vision encoder output tensor [num_output_tokens, hidden_size] + """ + # Patch embedding (GPU operation) + hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True) + hidden_states = self.patch_embed(hidden_states) + + # Add pre-computed position embeddings + hidden_states = hidden_states + pos_embeds + + hidden_states = hidden_states.unsqueeze(1) + + # Run through transformer blocks with pre-computed values + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + max_seqlen=max_seqlen, + sequence_lengths=sequence_lengths, + ) + if layer_num in self.deepstack_visual_indexes: + deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num) + deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx]( + hidden_states + ) + deepstack_feature_lists.append(deepstack_feature) + + hidden_states = self.merger(hidden_states) + hidden_states = torch.cat([hidden_states] + deepstack_feature_lists, dim=1) + return hidden_states + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) From 34cccff18d5117e5c3ada0844a750316fda7bfe9 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 20:22:57 -0500 Subject: [PATCH 107/189] precompute embeddings and use piecewise. --- vllm/v1/worker/gpu_model_runner.py | 149 +++++++++++++++++++---------- 1 file changed, 97 insertions(+), 52 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 474486a7b26f..01d81d72b439 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2724,8 +2724,13 @@ def _execute_encoder_piecewise_padded( ) -> list[torch.Tensor] | None: """Execute encoder with padding for piecewise cudagraph mode. - Pads inputs to the nearest capture size so that the compiled encoder - can use cudagraph. Trims output to actual size after execution. + Pre-computes embeddings outside the compiled graph, pads all tensors + to the nearest capture size, then calls forward_piecewise. This allows + cudagraph capture at fixed sizes while handling variable batch sizes. + + The key insight is that position embeddings depend on grid dimensions + and must be computed OUTSIDE the compiled graph. By pre-computing them + and padding, we can achieve cudagraph hits for the compiled regions. 
Args: model: The multimodal model @@ -2758,13 +2763,18 @@ def _execute_encoder_piecewise_padded( else: grid_thw_list = list(grid_thw) - # Get spatial merge size from model + # Get visual encoder and check for forward_piecewise support visual = getattr(model, "visual", None) if visual is None: return None + + # Check if forward_piecewise is available + if not hasattr(visual, "forward_piecewise"): + return None + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) - # Calculate actual output tokens + # Calculate actual tokens actual_num_patches = sum(t * h * w for t, h, w in grid_thw_list) actual_output_tokens = actual_num_patches // (spatial_merge_size**2) @@ -2774,19 +2784,25 @@ def _execute_encoder_piecewise_padded( # No suitable capture size, fall back to non-padded execution return None - if capture_size == actual_output_tokens: - # Exact match, no padding needed - return None - # Calculate padding needed - padding_tokens = capture_size - actual_output_tokens - padding_patches = padding_tokens * (spatial_merge_size**2) - - # Pad pixel_values with zeros - # pixel_values shape: [num_patches, patch_channels] + padding_output_tokens = capture_size - actual_output_tokens + padding_patches = padding_output_tokens * (spatial_merge_size**2) + + # Pre-compute embeddings for real images (OUTSIDE compiled graph) + # This is the key to making piecewise padding work + precomputed = visual.precompute_for_cudagraph(grid_thw_list) + pos_embeds = precomputed["pos_embeds"] + rotary_pos_emb_cos = precomputed["rotary_pos_emb_cos"] + rotary_pos_emb_sin = precomputed["rotary_pos_emb_sin"] + cu_seqlens = precomputed["cu_seqlens"] + max_seqlen = precomputed["max_seqlen"] + sequence_lengths = precomputed["sequence_lengths"] + + # Pad all tensors to capture_size num_input_patches = pixel_values.shape[0] padded_num_patches = num_input_patches + padding_patches + # Pad pixel_values padded_pixel_values = torch.zeros( (padded_num_patches, pixel_values.shape[1]), dtype=pixel_values.dtype, @@ -2794,54 +2810,83 @@ def _execute_encoder_piecewise_padded( ) padded_pixel_values[:num_input_patches] = pixel_values - # Add a dummy image entry for padding patches (don't modify existing grids) - # This preserves position embeddings for real images - padded_grid_thw = [list(g) for g in grid_thw_list] - if padding_patches > 0: - # Create a square-ish dummy grid for padding patches - dummy_side = int(math.ceil(math.sqrt(padding_patches))) - # Ensure it's even (required by spatial_merge_size=2) - if dummy_side % 2 != 0: - dummy_side += 1 - actual_dummy_patches = dummy_side * dummy_side - # Update padding to match actual dummy grid - if actual_dummy_patches != padding_patches: - padded_num_patches = num_input_patches + actual_dummy_patches - padded_pixel_values = torch.zeros( - (padded_num_patches, pixel_values.shape[1]), - dtype=pixel_values.dtype, - device=pixel_values.device, - ) - padded_pixel_values[:num_input_patches] = pixel_values - # Add dummy image entry: T=1, H=dummy_side, W=dummy_side - padded_grid_thw.append([1, dummy_side, dummy_side]) - - # Create padded kwargs - padded_kwargs = dict(mm_kwargs_group) - padded_kwargs[pixel_key] = padded_pixel_values - # Use CPU tensor (has .ndim for model, accepted by tensor schema) - padded_grid_thw_tensor = torch.tensor( - padded_grid_thw, dtype=torch.int32, device="cpu" + # Pad position embeddings + padded_pos_embeds = torch.zeros( + (padded_num_patches, pos_embeds.shape[1]), + dtype=pos_embeds.dtype, + device=pos_embeds.device, ) - if modality == "image": - 
padded_kwargs["image_grid_thw"] = padded_grid_thw_tensor - else: - padded_kwargs["video_grid_thw"] = padded_grid_thw_tensor + padded_pos_embeds[:num_input_patches] = pos_embeds - # Execute encoder with padded inputs - padded_outputs = model.embed_multimodal(**padded_kwargs) + # Pad rotary embeddings + padded_rotary_cos = torch.zeros( + (padded_num_patches, rotary_pos_emb_cos.shape[1]), + dtype=rotary_pos_emb_cos.dtype, + device=rotary_pos_emb_cos.device, + ) + padded_rotary_cos[:num_input_patches] = rotary_pos_emb_cos + + padded_rotary_sin = torch.zeros( + (padded_num_patches, rotary_pos_emb_sin.shape[1]), + dtype=rotary_pos_emb_sin.dtype, + device=rotary_pos_emb_sin.device, + ) + padded_rotary_sin[:num_input_patches] = rotary_pos_emb_sin + + # Update cu_seqlens to include padding as a separate sequence + # Original cu_seqlens ends at actual_num_patches + # Add padding patches as one sequence at the end + if padding_patches > 0: + # Append padding sequence boundary + padding_end = cu_seqlens[-1] + padding_patches + padded_cu_seqlens = torch.cat([ + cu_seqlens, + torch.tensor([padding_end], dtype=cu_seqlens.dtype, + device=cu_seqlens.device) + ]) + # Add padding sequence length + padded_sequence_lengths = torch.cat([ + sequence_lengths, + torch.tensor([padding_patches], dtype=sequence_lengths.dtype, + device=sequence_lengths.device) + ]) + else: + padded_cu_seqlens = cu_seqlens + padded_sequence_lengths = sequence_lengths + + # Update max_seqlen if padding sequence is larger + if padding_patches > max_seqlen.item(): + max_seqlen = torch.tensor(padding_patches, dtype=max_seqlen.dtype, + device=max_seqlen.device) + + # Convert pixel_values to visual encoder dtype + padded_pixel_values = padded_pixel_values.type(visual.dtype) + + # Call forward_piecewise directly with pre-computed and padded tensors + with set_forward_context(None, self.vllm_config): + encoder_output = visual.forward_piecewise( + x=padded_pixel_values, + pos_embeds=padded_pos_embeds, + rotary_pos_emb_cos=padded_rotary_cos, + rotary_pos_emb_sin=padded_rotary_sin, + cu_seqlens=padded_cu_seqlens, + max_seqlen=max_seqlen, + sequence_lengths=padded_sequence_lengths, + ) - # Return only real image outputs (exclude dummy image at the end) - num_real_images = len(grid_thw_list) - real_outputs = list(padded_outputs[:num_real_images]) + # Split output by actual token counts for each image (exclude padding) + merge_size_sq = spatial_merge_size ** 2 + sizes = [t * h * w // merge_size_sq for t, h, w in grid_thw_list] + real_outputs = list(encoder_output[:actual_output_tokens].split(sizes)) if self.encoder_cudagraph_verbose: logger.info( "Piecewise padded execution: actual_tokens=%d, " - "capture_size=%d, num_real_images=%d", + "capture_size=%d, padding=%d, num_images=%d", actual_output_tokens, capture_size, - num_real_images, + padding_output_tokens, + len(grid_thw_list), ) return real_outputs From 6473b921ffa72363b5c9da16bcf88de467df5cf8 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 20:30:54 -0500 Subject: [PATCH 108/189] check if max_seqlen is on cpu. 
--- vllm/model_executor/models/qwen3_vl.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index cb712c5942f1..09972ca7fb4c 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -874,7 +874,7 @@ def forward_piecewise( rotary_pos_emb_cos: Pre-computed rotary cosine embeddings rotary_pos_emb_sin: Pre-computed rotary sine embeddings cu_seqlens: Pre-computed cumulative sequence lengths (on GPU) - max_seqlen: Pre-computed max sequence length (scalar tensor on GPU) + max_seqlen: Pre-computed max sequence length (scalar tensor, can be CPU) sequence_lengths: Pre-computed sequence lengths (for FlashInfer CuDNN) Returns: @@ -884,6 +884,10 @@ def forward_piecewise( hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True) hidden_states = self.patch_embed(hidden_states) + # Ensure max_seqlen is on GPU for attention kernels + if max_seqlen.device.type == "cpu": + max_seqlen = max_seqlen.to(self.device, non_blocking=True) + # Add pre-computed position embeddings hidden_states = hidden_states + pos_embeds From 171466a9f2ec38b9fcfea96c4d6d4b5bc24e064b Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 22:14:15 -0500 Subject: [PATCH 109/189] log piecewise cudagraph stats. --- vllm/v1/worker/gpu_model_runner.py | 106 +++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 01d81d72b439..20b62b9ed4cc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2497,7 +2497,7 @@ def _execute_mm_encoder( else: # Try piecewise padded execution if enabled piecewise_result = None - if ( + piecewise_enabled = ( self.compilation_config is not None and getattr( self.compilation_config, @@ -2509,7 +2509,22 @@ def _execute_mm_encoder( "encoder_cudagraph_padded_mode", True, ) - ): + ) + + if self.encoder_cudagraph_verbose: + logger.info( + "ViT: cudagraph_result=None, piecewise_enabled=%s " + "(piecewise=%s, padded_mode=%s)", + piecewise_enabled, + getattr(self.compilation_config, + "encoder_cudagraph_piecewise", False) + if self.compilation_config else None, + getattr(self.compilation_config, + "encoder_cudagraph_padded_mode", True) + if self.compilation_config else None, + ) + + if piecewise_enabled: piecewise_result = self._execute_encoder_piecewise_padded( model, mm_kwargs_group, modality ) @@ -2716,6 +2731,66 @@ def _find_nearest_encoder_capture_size(self, num_tokens: int) -> int | None: # num_tokens exceeds all capture sizes return None + # Class-level counters for piecewise padded mode statistics + _piecewise_stats: dict = {} + + @classmethod + def _init_piecewise_stats(cls): + if not cls._piecewise_stats: + cls._piecewise_stats = { + "calls": 0, + "executions": 0, + "total_actual_tokens": 0, + "total_padded_tokens": 0, + "capture_size_hits": {}, # capture_size -> count + "fallback_reasons": {}, # reason -> count + } + + def _record_piecewise_fallback(self, reason: str): + self._init_piecewise_stats() + self._piecewise_stats["calls"] += 1 + self._piecewise_stats["fallback_reasons"][reason] = ( + self._piecewise_stats["fallback_reasons"].get(reason, 0) + 1 + ) + if self.encoder_cudagraph_verbose: + logger.info("ViT PIECEWISE fallback: %s", reason) + + @classmethod + def _record_piecewise_execution(cls, actual_tokens: int, capture_size: int): + cls._init_piecewise_stats() + 
cls._piecewise_stats["calls"] += 1 + cls._piecewise_stats["executions"] += 1 + cls._piecewise_stats["total_actual_tokens"] += actual_tokens + cls._piecewise_stats["total_padded_tokens"] += capture_size + cls._piecewise_stats["capture_size_hits"][capture_size] = ( + cls._piecewise_stats["capture_size_hits"].get(capture_size, 0) + 1 + ) + + @classmethod + def get_piecewise_stats_summary(cls) -> str: + cls._init_piecewise_stats() + stats = cls._piecewise_stats + if stats["calls"] == 0: + return "Piecewise padded: no calls" + + total_actual = stats["total_actual_tokens"] + total_padded = stats["total_padded_tokens"] + waste_pct = ( + (total_padded - total_actual) / total_padded * 100 + if total_padded > 0 else 0 + ) + + lines = [ + f"Piecewise padded stats:", + f" Calls: {stats['calls']}, Executions: {stats['executions']}", + f" Total actual tokens: {total_actual}", + f" Total padded tokens: {total_padded}", + f" Padding waste: {waste_pct:.1f}%", + f" Capture size hits: {stats['capture_size_hits']}", + f" Fallback reasons: {stats['fallback_reasons']}", + ] + return "\n".join(lines) + def _execute_encoder_piecewise_padded( self, model: "SupportsMultiModal", @@ -2740,8 +2815,13 @@ def _execute_encoder_piecewise_padded( Returns: List of encoder outputs if padding was applied, None otherwise """ + if self.encoder_cudagraph_verbose: + logger.info("ViT PIECEWISE: _execute_encoder_piecewise_padded called, " + "modality=%s", modality) + # Only support image/video modalities if modality not in ("image", "video"): + self._record_piecewise_fallback(f"unsupported_modality:{modality}") return None # Extract grid_thw and pixel_values @@ -2751,10 +2831,12 @@ def _execute_encoder_piecewise_padded( grid_thw = mm_kwargs_group.get("video_grid_thw") pixel_key = "pixel_values_videos" if grid_thw is None: + self._record_piecewise_fallback("no_grid_thw") return None pixel_values = mm_kwargs_group.get(pixel_key) if pixel_values is None: + self._record_piecewise_fallback("no_pixel_values") return None # Convert to list if tensor @@ -2766,10 +2848,12 @@ def _execute_encoder_piecewise_padded( # Get visual encoder and check for forward_piecewise support visual = getattr(model, "visual", None) if visual is None: + self._record_piecewise_fallback("no_visual_encoder") return None # Check if forward_piecewise is available if not hasattr(visual, "forward_piecewise"): + self._record_piecewise_fallback("no_forward_piecewise_method") return None spatial_merge_size = getattr(visual, "spatial_merge_size", 2) @@ -2781,7 +2865,9 @@ def _execute_encoder_piecewise_padded( # Find nearest capture size capture_size = self._find_nearest_encoder_capture_size(actual_output_tokens) if capture_size is None: - # No suitable capture size, fall back to non-padded execution + self._record_piecewise_fallback( + f"no_capture_size_for_{actual_output_tokens}_tokens" + ) return None # Calculate padding needed @@ -2879,15 +2965,25 @@ def _execute_encoder_piecewise_padded( sizes = [t * h * w // merge_size_sq for t, h, w in grid_thw_list] real_outputs = list(encoder_output[:actual_output_tokens].split(sizes)) + # Record statistics + self._record_piecewise_execution(actual_output_tokens, capture_size) + if self.encoder_cudagraph_verbose: + waste_pct = padding_output_tokens / capture_size * 100 logger.info( - "Piecewise padded execution: actual_tokens=%d, " - "capture_size=%d, padding=%d, num_images=%d", + "ViT PIECEWISE PADDED: actual_tokens=%d, capture_size=%d, " + "padding=%d (%.1f%% waste), num_images=%d, grids=%s", actual_output_tokens, capture_size, 
padding_output_tokens, + waste_pct, len(grid_thw_list), + grid_thw_list, ) + # Periodically log overall stats + stats = self._piecewise_stats + if stats["executions"] % 100 == 0: + logger.info(self.get_piecewise_stats_summary()) return real_outputs From 571c0f4d7d08dd53612f56899c72b7ae5492f71b Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 22:27:49 -0500 Subject: [PATCH 110/189] log every call. --- vllm/v1/worker/gpu_model_runner.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 20b62b9ed4cc..25c457d585f5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2970,20 +2970,27 @@ def _execute_encoder_piecewise_padded( if self.encoder_cudagraph_verbose: waste_pct = padding_output_tokens / capture_size * 100 + stats = self._piecewise_stats + total_waste_pct = ( + (stats["total_padded_tokens"] - stats["total_actual_tokens"]) + / stats["total_padded_tokens"] * 100 + if stats["total_padded_tokens"] > 0 else 0 + ) logger.info( - "ViT PIECEWISE PADDED: actual_tokens=%d, capture_size=%d, " - "padding=%d (%.1f%% waste), num_images=%d, grids=%s", + "ViT PIECEWISE PADDED: actual=%d, capture_size=%d, " + "padding=%d (%.1f%%), num_images=%d | " + "cumulative: executions=%d, total_actual=%d, total_padded=%d, " + "waste=%.1f%%", actual_output_tokens, capture_size, padding_output_tokens, waste_pct, len(grid_thw_list), - grid_thw_list, + stats["executions"], + stats["total_actual_tokens"], + stats["total_padded_tokens"], + total_waste_pct, ) - # Periodically log overall stats - stats = self._piecewise_stats - if stats["executions"] % 100 == 0: - logger.info(self.get_piecewise_stats_summary()) return real_outputs From 142745b3dee11c04f8d0be2d673713cc7cceb3e3 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 22:51:20 -0500 Subject: [PATCH 111/189] set verbose flag. --- vllm/v1/worker/gpu_model_runner.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 25c457d585f5..a174f4c93c9d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -694,6 +694,13 @@ def _init_encoder_cudagraph_manager(self) -> None: if self.compilation_config is None: return + # Always check verbose logging first (applies to all modes) + self.encoder_cudagraph_verbose = getattr( + self.compilation_config, + "encoder_cudagraph_verbose", + False, + ) + # Check if piecewise encoder cudagraph mode is enabled # In piecewise mode, torch.compile handles graph splitting at attention ops, # so we don't need the full EncoderCudaGraphManager From 46e4431c110cf9183f3a979759a483cccd5da29c Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 22:56:47 -0500 Subject: [PATCH 112/189] preallocate buffer for embeddings etc. 
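The padded shapes are fixed per capture size, so the staging tensors can be
cached in a dict keyed by capture size and reused; each call then only copies
the real rows in place instead of allocating and zero-filling fresh tensors. A
minimal sketch of the reuse pattern, with illustrative names rather than the
actual runner fields:

    import torch

    _buffers: dict[int, torch.Tensor] = {}          # keyed by padded row count

    def stage(x: torch.Tensor, padded_rows: int) -> torch.Tensor:
        buf = _buffers.get(padded_rows)
        if buf is None:                              # allocate once per capture size
            buf = torch.empty(padded_rows, x.shape[1], dtype=x.dtype, device=x.device)
            _buffers[padded_rows] = buf
        buf[: x.shape[0]].copy_(x)                   # in-place copy, no per-call allocation
        return buf

A side benefit is that the staging tensors keep stable device addresses, which
matters once the padded call is replayed under CUDA graphs later in this series.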
--- vllm/v1/worker/gpu_model_runner.py | 126 +++++++++++++++++------------ 1 file changed, 76 insertions(+), 50 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a174f4c93c9d..96f5966769a8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -432,6 +432,9 @@ def __init__( self.encoder_cudagraph_padded_mode: bool = True self.encoder_cudagraph_verbose: bool = False self.encoder_cudagraph_one_by_one: bool = True + # Pre-allocated buffers for piecewise padded mode (lazily initialized) + # Key: capture_size (output tokens), Value: dict of buffers + self._piecewise_buffers: dict[int, dict[str, torch.Tensor]] = {} self._init_encoder_cudagraph_manager() self.use_aux_hidden_state_outputs = False @@ -2880,6 +2883,7 @@ def _execute_encoder_piecewise_padded( # Calculate padding needed padding_output_tokens = capture_size - actual_output_tokens padding_patches = padding_output_tokens * (spatial_merge_size**2) + padded_num_patches = capture_size * (spatial_merge_size**2) # Pre-compute embeddings for real images (OUTSIDE compiled graph) # This is the key to making piecewise padding work @@ -2891,70 +2895,92 @@ def _execute_encoder_piecewise_padded( max_seqlen = precomputed["max_seqlen"] sequence_lengths = precomputed["sequence_lengths"] - # Pad all tensors to capture_size num_input_patches = pixel_values.shape[0] - padded_num_patches = num_input_patches + padding_patches - # Pad pixel_values - padded_pixel_values = torch.zeros( - (padded_num_patches, pixel_values.shape[1]), - dtype=pixel_values.dtype, - device=pixel_values.device, - ) - padded_pixel_values[:num_input_patches] = pixel_values + # Get or create pre-allocated buffers for this capture_size + # This avoids allocation and zeros kernels on every call + buffers = self._piecewise_buffers.get(capture_size) + if buffers is None: + # Lazily allocate buffers on first use for this capture_size + # Using torch.empty avoids the zeros kernel + buffers = { + "pixel_values": torch.empty( + (padded_num_patches, pixel_values.shape[1]), + dtype=visual.dtype, + device=pixel_values.device, + ), + "pos_embeds": torch.empty( + (padded_num_patches, pos_embeds.shape[1]), + dtype=pos_embeds.dtype, + device=pos_embeds.device, + ), + "rotary_cos": torch.empty( + (padded_num_patches, rotary_pos_emb_cos.shape[1]), + dtype=rotary_pos_emb_cos.dtype, + device=rotary_pos_emb_cos.device, + ), + "rotary_sin": torch.empty( + (padded_num_patches, rotary_pos_emb_sin.shape[1]), + dtype=rotary_pos_emb_sin.dtype, + device=rotary_pos_emb_sin.device, + ), + # Pre-allocate cu_seqlens with max possible entries + # (assuming max ~1000 images per batch is more than enough) + "cu_seqlens": torch.empty( + (1001,), dtype=cu_seqlens.dtype, device=cu_seqlens.device + ), + "sequence_lengths": torch.empty( + (1000,), dtype=sequence_lengths.dtype, + device=sequence_lengths.device + ), + } + self._piecewise_buffers[capture_size] = buffers + if self.encoder_cudagraph_verbose: + logger.info( + "ViT PIECEWISE: Allocated buffers for capture_size=%d " + "(patches=%d)", + capture_size, padded_num_patches + ) - # Pad position embeddings - padded_pos_embeds = torch.zeros( - (padded_num_patches, pos_embeds.shape[1]), - dtype=pos_embeds.dtype, - device=pos_embeds.device, - ) - padded_pos_embeds[:num_input_patches] = pos_embeds + # Copy data into pre-allocated buffers (no allocation, no zeros kernel) + padded_pixel_values = buffers["pixel_values"] + padded_pixel_values[:num_input_patches].copy_( + 
pixel_values.type(visual.dtype)) - # Pad rotary embeddings - padded_rotary_cos = torch.zeros( - (padded_num_patches, rotary_pos_emb_cos.shape[1]), - dtype=rotary_pos_emb_cos.dtype, - device=rotary_pos_emb_cos.device, - ) - padded_rotary_cos[:num_input_patches] = rotary_pos_emb_cos + padded_pos_embeds = buffers["pos_embeds"] + padded_pos_embeds[:num_input_patches].copy_(pos_embeds) - padded_rotary_sin = torch.zeros( - (padded_num_patches, rotary_pos_emb_sin.shape[1]), - dtype=rotary_pos_emb_sin.dtype, - device=rotary_pos_emb_sin.device, - ) - padded_rotary_sin[:num_input_patches] = rotary_pos_emb_sin + padded_rotary_cos = buffers["rotary_cos"] + padded_rotary_cos[:num_input_patches].copy_(rotary_pos_emb_cos) + + padded_rotary_sin = buffers["rotary_sin"] + padded_rotary_sin[:num_input_patches].copy_(rotary_pos_emb_sin) # Update cu_seqlens to include padding as a separate sequence - # Original cu_seqlens ends at actual_num_patches - # Add padding patches as one sequence at the end + num_seqs = cu_seqlens.shape[0] + padded_cu_seqlens = buffers["cu_seqlens"] + padded_cu_seqlens[:num_seqs].copy_(cu_seqlens) if padding_patches > 0: - # Append padding sequence boundary - padding_end = cu_seqlens[-1] + padding_patches - padded_cu_seqlens = torch.cat([ - cu_seqlens, - torch.tensor([padding_end], dtype=cu_seqlens.dtype, - device=cu_seqlens.device) - ]) - # Add padding sequence length - padded_sequence_lengths = torch.cat([ - sequence_lengths, - torch.tensor([padding_patches], dtype=sequence_lengths.dtype, - device=sequence_lengths.device) - ]) - else: - padded_cu_seqlens = cu_seqlens - padded_sequence_lengths = sequence_lengths + # Add padding sequence boundary + padded_cu_seqlens[num_seqs] = cu_seqlens[-1] + padding_patches + num_seqs += 1 + + num_seq_lens = sequence_lengths.shape[0] + padded_sequence_lengths = buffers["sequence_lengths"] + padded_sequence_lengths[:num_seq_lens].copy_(sequence_lengths) + if padding_patches > 0: + padded_sequence_lengths[num_seq_lens] = padding_patches + num_seq_lens += 1 + + # Slice to actual size needed + padded_cu_seqlens = padded_cu_seqlens[:num_seqs] + padded_sequence_lengths = padded_sequence_lengths[:num_seq_lens] # Update max_seqlen if padding sequence is larger if padding_patches > max_seqlen.item(): max_seqlen = torch.tensor(padding_patches, dtype=max_seqlen.dtype, device=max_seqlen.device) - # Convert pixel_values to visual encoder dtype - padded_pixel_values = padded_pixel_values.type(visual.dtype) - # Call forward_piecewise directly with pre-computed and padded tensors with set_forward_context(None, self.vllm_config): encoder_output = visual.forward_piecewise( From ab88f9bb6b939a47dcee127cc878b353a3e1062b Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Tue, 3 Feb 2026 23:20:37 -0500 Subject: [PATCH 113/189] use zeros instead of empty. 
--- vllm/v1/worker/gpu_model_runner.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 96f5966769a8..a6e9a738ec14 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2902,34 +2902,35 @@ def _execute_encoder_piecewise_padded( buffers = self._piecewise_buffers.get(capture_size) if buffers is None: # Lazily allocate buffers on first use for this capture_size - # Using torch.empty avoids the zeros kernel + # Using torch.zeros ensures padding region is valid (not garbage) + # The zeros kernel only runs once per capture_size, not per call buffers = { - "pixel_values": torch.empty( + "pixel_values": torch.zeros( (padded_num_patches, pixel_values.shape[1]), dtype=visual.dtype, device=pixel_values.device, ), - "pos_embeds": torch.empty( + "pos_embeds": torch.zeros( (padded_num_patches, pos_embeds.shape[1]), dtype=pos_embeds.dtype, device=pos_embeds.device, ), - "rotary_cos": torch.empty( + "rotary_cos": torch.zeros( (padded_num_patches, rotary_pos_emb_cos.shape[1]), dtype=rotary_pos_emb_cos.dtype, device=rotary_pos_emb_cos.device, ), - "rotary_sin": torch.empty( + "rotary_sin": torch.zeros( (padded_num_patches, rotary_pos_emb_sin.shape[1]), dtype=rotary_pos_emb_sin.dtype, device=rotary_pos_emb_sin.device, ), # Pre-allocate cu_seqlens with max possible entries # (assuming max ~1000 images per batch is more than enough) - "cu_seqlens": torch.empty( + "cu_seqlens": torch.zeros( (1001,), dtype=cu_seqlens.dtype, device=cu_seqlens.device ), - "sequence_lengths": torch.empty( + "sequence_lengths": torch.zeros( (1000,), dtype=sequence_lengths.dtype, device=sequence_lengths.device ), From 86c96fe887b40c81ed2189bee08368335280f368 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 00:13:20 -0500 Subject: [PATCH 114/189] log piecewise capture and replay. --- vllm/compilation/piecewise_backend.py | 53 ++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 852f2aabbab0..3a3ac9c01ac6 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -150,13 +150,24 @@ def _maybe_compile_for_range_entry( range_entry.compiled = True self.to_be_compiled_ranges.remove(range_entry.compile_range) + is_exact_size = range_entry.compile_range.is_single_size() + logger.info( + "PIECEWISE CAPTURE: graph_index=%d/%d, range=%s, " + "is_exact_size=%s, is_encoder=%s", + self.piecewise_compile_index, + self.total_piecewise_compiles, + range_entry.compile_range, + is_exact_size, + self.is_encoder_compilation, + ) + # args are real arguments # fakify for range, real args for concrete size. # For concrete size, we clear the shape env in # compiler_manager.compile() so no need to fakify. args_list = ( self._fakify_args(args) - if not range_entry.compile_range.is_single_size() + if not is_exact_size else list(args) ) range_entry.runnable = self.vllm_backend.compiler_manager.compile( @@ -176,14 +187,36 @@ def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None: # If not found, we search for the range entry # that contains the runtime shape. 
if self.compile_sizes is None: + logger.debug( + "PIECEWISE: compile_sizes is None, shape=%d, is_encoder=%s", + runtime_shape, self.is_encoder_compilation + ) return None if runtime_shape in self.compile_sizes: + # Exact match with capture size - will use cudagraph + logger.debug( + "PIECEWISE: exact match shape=%d in compile_sizes, is_encoder=%s", + runtime_shape, self.is_encoder_compilation + ) return self.range_entries[Range(start=runtime_shape, end=runtime_shape)] else: + # No exact match - fall back to compile_ranges (no cudagraph) for range in self.compile_ranges: if runtime_shape in range: + logger.debug( + "PIECEWISE: shape=%d not in compile_sizes, " + "using compile_range=%s (NO CUDAGRAPH), is_encoder=%s", + runtime_shape, range, self.is_encoder_compilation + ) return self.range_entries[range] + # Shape not in any range - will cause assertion error + logger.warning( + "PIECEWISE: shape=%d not in compile_sizes=%s or " + "compile_ranges=%s, is_encoder=%s", + runtime_shape, self.compile_sizes, self.compile_ranges, + self.is_encoder_compilation + ) return None def __call__(self, *args: Any) -> Any: @@ -194,5 +227,23 @@ def __call__(self, *args: Any) -> Any: f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}" ) + # Log capture vs replay + is_capture = not range_entry.compiled + is_exact_size = range_entry.compile_range.is_single_size() + self._maybe_compile_for_range_entry(range_entry, args) # type: ignore[arg-type] + + # Log replay (capture is logged inside _maybe_compile_for_range_entry) + if not is_capture: + logger.debug( + "PIECEWISE REPLAY: graph_index=%d/%d, shape=%d, range=%s, " + "is_exact_size=%s, is_encoder=%s", + self.piecewise_compile_index, + self.total_piecewise_compiles, + runtime_shape, + range_entry.compile_range, + is_exact_size, + self.is_encoder_compilation, + ) + return range_entry.runnable(*args) # type: ignore[union-attr] From 67d828a456f270b80c3c09fe8ba507697f9d3b9d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 00:37:27 -0500 Subject: [PATCH 115/189] add runtime/sym shape check. --- vllm/compilation/piecewise_backend.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 3a3ac9c01ac6..d0f432eb1d2b 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -221,6 +221,24 @@ def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None: def __call__(self, *args: Any) -> Any: runtime_shape = args[self.sym_shape_indices[0]] + + # Debug logging for encoder to understand shape mismatch + if self.is_encoder_compilation: + arg_shapes = [] + for i, a in enumerate(args[:5]): # First 5 args + if hasattr(a, 'shape'): + arg_shapes.append(f"arg[{i}].shape={tuple(a.shape)}") + else: + arg_shapes.append(f"arg[{i}]={type(a).__name__}") + logger.info( + "PIECEWISE ENCODER __call__: runtime_shape=%s, " + "sym_shape_indices=%s, compile_sizes=%s, %s", + runtime_shape, + self.sym_shape_indices, + self.compile_sizes, + ", ".join(arg_shapes), + ) + range_entry = self._find_range_for_shape(runtime_shape) assert range_entry is not None, ( From 0e61942ad3a0142b64b16cde4a7e9761c83e3f2e Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 00:47:37 -0500 Subject: [PATCH 116/189] convert capture size from # tokens to # patches. 
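encoder_cudagraph_capture_sizes is specified in merged output tokens, while the
symbolic shape the piecewise backend dispatches on is the number of input
patches, so the two differ by a factor of spatial_merge_size**2. A small
illustration with assumed values (merge size 2, as in the Qwen-VL family):

    spatial_merge_size = 2
    encoder_cudagraph_capture_sizes = [256, 1024, 4096]        # output tokens
    merge_size_sq = spatial_merge_size ** 2
    compile_sizes = [s * merge_size_sq for s in encoder_cudagraph_capture_sizes]
    assert compile_sizes == [1024, 4096, 16384]                 # input patches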
--- vllm/compilation/piecewise_backend.py | 15 ++++++++++++++- vllm/config/compilation.py | 6 ++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index d0f432eb1d2b..4f6c98b6ec74 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -74,7 +74,20 @@ def __init__( # Use encoder-specific capture sizes for encoder compilation if self.is_encoder_compilation: - self.compile_sizes = self.compilation_config.encoder_cudagraph_capture_sizes + encoder_capture_sizes = self.compilation_config.encoder_cudagraph_capture_sizes + if encoder_capture_sizes is not None: + # Convert from output tokens to input patches + # encoder_cudagraph_capture_sizes is specified in output tokens + # but runtime_shape (from sym_shape_indices) is in input patches + merge_size_sq = self.compilation_config.encoder_spatial_merge_size ** 2 + self.compile_sizes = [size * merge_size_sq for size in encoder_capture_sizes] + logger.debug_once( + "PiecewiseBackend: converted encoder capture sizes from " + "output tokens %s to input patches %s (merge_size²=%d)", + encoder_capture_sizes, self.compile_sizes, merge_size_sq + ) + else: + self.compile_sizes = None else: self.compile_sizes = self.compilation_config.compile_sizes log_string = ( diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index d8abe9673e5c..bc99e5eed4bc 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -509,6 +509,12 @@ class CompilationConfig: Example: [256, 512, 1024, 2048, 4096, 8192, 16384] If None, encoder piecewise mode will use compile_ranges only (no cudagraph).""" + encoder_spatial_merge_size: int = 2 + """Spatial merge size for vision encoder (e.g., 2 for Qwen3-VL). + This converts encoder_cudagraph_capture_sizes (output tokens) to input patches. + Input patches = output tokens * spatial_merge_size². + Default is 2, which is common for Qwen-VL family models.""" + # Inductor capture compile_sizes: list[int | str] | None = None """Sizes to compile for inductor. In addition From e87d4d81c8e74123a41a984540c3e98cc2610608 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 00:54:23 -0500 Subject: [PATCH 117/189] fix type error. --- vllm/compilation/piecewise_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 4f6c98b6ec74..f184b3f0bfd8 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -84,7 +84,7 @@ def __init__( logger.debug_once( "PiecewiseBackend: converted encoder capture sizes from " "output tokens %s to input patches %s (merge_size²=%d)", - encoder_capture_sizes, self.compile_sizes, merge_size_sq + tuple(encoder_capture_sizes), tuple(self.compile_sizes), merge_size_sq ) else: self.compile_sizes = None From 78a2eb46820a43e3cd3d0a5c88dcd3590a8c8be8 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 01:15:29 -0500 Subject: [PATCH 118/189] warmup encoder cudagraph. 
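The warmup builds dummy inputs for each configured capture size and pushes them
through forward_piecewise from the worker, so torch.compile does not trigger on
the first real request. Roughly what one dummy batch looks like, assuming the
Qwen3-VL defaults used as fallbacks below (3 input channels, temporal patch
size 2, patch size 14, merge size 2, hidden size 1152, 16 heads); the numbers
are illustrative:

    import torch

    capture_size = 1024                                    # output tokens
    num_patches = capture_size * 2 ** 2                    # 4096 input patches
    pixel_values = torch.zeros(num_patches, 3 * 2 * 14 * 14)          # (4096, 1176)
    pos_embeds = torch.zeros(num_patches, 1152)
    rotary_dim = 1152 // 16 // 2                           # 36
    rotary_cos = torch.zeros(num_patches, rotary_dim)
    rotary_sin = torch.zeros(num_patches, rotary_dim)
    cu_seqlens = torch.tensor([0, num_patches], dtype=torch.int32)    # one synthetic image
    max_seqlen = torch.tensor(num_patches)
    sequence_lengths = torch.tensor([num_patches], dtype=torch.int32)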
--- vllm/v1/worker/gpu_model_runner.py | 106 +++++++++++++++++++++++++++++ vllm/v1/worker/gpu_worker.py | 6 ++ 2 files changed, 112 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a6e9a738ec14..ac61eef60180 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3028,6 +3028,112 @@ def _execute_encoder_piecewise_padded( return real_outputs + def warmup_encoder_piecewise(self) -> None: + """Pre-capture all encoder piecewise cudagraph sizes. + + This should be called during model warmup to compile all capture sizes + upfront, avoiding compilation latency during actual execution. + """ + if not getattr(self.compilation_config, "encoder_cudagraph_piecewise", False): + return + + capture_sizes = getattr( + self.compilation_config, "encoder_cudagraph_capture_sizes", None + ) + if capture_sizes is None or len(capture_sizes) == 0: + return + + # Get visual encoder + model = self.model + visual = getattr(model, "visual", None) + if visual is None or not hasattr(visual, "forward_piecewise"): + return + + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) + merge_size_sq = spatial_merge_size ** 2 + + logger.info( + "Warming up encoder piecewise for %d capture sizes: %s", + len(capture_sizes), capture_sizes + ) + + for capture_size in sorted(capture_sizes): + # Convert output tokens to input patches + num_patches = capture_size * merge_size_sq + + # Create dummy inputs matching the expected shapes + # pixel_values: [num_patches, patch_channels] + patch_channels = getattr(visual, "patch_embed", None) + if patch_channels is not None: + in_channels = getattr(patch_channels, "in_channels", + getattr(patch_channels, "proj", None)) + if in_channels is not None and hasattr(in_channels, "in_channels"): + in_channels = in_channels.in_channels + else: + in_channels = 3 * 14 * 14 # Default for Qwen3-VL + else: + in_channels = 3 * 14 * 14 + + pixel_values = torch.zeros( + (num_patches, in_channels), + dtype=visual.dtype, + device=self.device, + ) + + # Get hidden size from visual encoder + hidden_size = getattr(visual, "hidden_size", + getattr(visual, "embed_dim", 1152)) + + pos_embeds = torch.zeros( + (num_patches, hidden_size), + dtype=visual.dtype, + device=self.device, + ) + + # Rotary embeddings - get actual dim from model + rotary_dim = hidden_size // getattr(visual, "num_heads", 16) // 2 + rotary_cos = torch.zeros( + (num_patches, rotary_dim), + dtype=visual.dtype, + device=self.device, + ) + rotary_sin = torch.zeros( + (num_patches, rotary_dim), + dtype=visual.dtype, + device=self.device, + ) + + # cu_seqlens for single "image" covering all patches + cu_seqlens = torch.tensor( + [0, num_patches], dtype=torch.int32, device=self.device + ) + max_seqlen = torch.tensor(num_patches, device=self.device) + sequence_lengths = torch.tensor( + [num_patches], dtype=torch.int32, device=self.device + ) + + logger.info( + "Warming up encoder piecewise: capture_size=%d (patches=%d)", + capture_size, num_patches + ) + + # Call forward_piecewise to trigger compilation + with set_forward_context(None, self.vllm_config): + _ = visual.forward_piecewise( + x=pixel_values, + pos_embeds=pos_embeds, + rotary_pos_emb_cos=rotary_cos, + rotary_pos_emb_sin=rotary_sin, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + sequence_lengths=sequence_lengths, + ) + + # Clear CUDA cache after each warmup to free intermediate memory + torch.cuda.empty_cache() + + logger.info("Encoder piecewise warmup complete for %d sizes", 
len(capture_sizes)) + def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 013780479743..a15c547a7053 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -446,6 +446,12 @@ def compile_or_warm_up_model(self) -> None: # cuda graph capture. kernel_warmup(self) + # Warmup encoder piecewise cudagraph if enabled + # This pre-captures all encoder capture sizes to avoid compilation + # latency during actual execution + if hasattr(self.model_runner, "warmup_encoder_piecewise"): + self.model_runner.warmup_encoder_piecewise() + cuda_graph_memory_bytes = 0 if not self.model_config.enforce_eager: cuda_graph_memory_bytes = self.model_runner.capture_model() From 7571794071798fd3c0f5ea330d08a1ce235fe778 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 01:23:15 -0500 Subject: [PATCH 119/189] change log level to info. --- vllm/compilation/piecewise_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index f184b3f0bfd8..3e9984955dc2 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -266,7 +266,7 @@ def __call__(self, *args: Any) -> Any: # Log replay (capture is logged inside _maybe_compile_for_range_entry) if not is_capture: - logger.debug( + logger.info( "PIECEWISE REPLAY: graph_index=%d/%d, shape=%d, range=%s, " "is_exact_size=%s, is_encoder=%s", self.piecewise_compile_index, From 9fd17c79243d548371d53419db428d1d2c3c144c Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 01:32:01 -0500 Subject: [PATCH 120/189] fix tensor shape in warmup. --- vllm/v1/worker/gpu_model_runner.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ac61eef60180..b9a2f9362aea 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3062,20 +3062,27 @@ def warmup_encoder_piecewise(self) -> None: num_patches = capture_size * merge_size_sq # Create dummy inputs matching the expected shapes - # pixel_values: [num_patches, patch_channels] - patch_channels = getattr(visual, "patch_embed", None) - if patch_channels is not None: - in_channels = getattr(patch_channels, "in_channels", - getattr(patch_channels, "proj", None)) - if in_channels is not None and hasattr(in_channels, "in_channels"): - in_channels = in_channels.in_channels + # pixel_values: [num_patches, C] where C = in_chans * temporal * H * W + # For Qwen3-VL: C = 3 * 2 * 14 * 14 = 1176 + patch_embed = getattr(visual, "patch_embed", None) + if patch_embed is not None: + # Get the actual dimensions from patch_embed + temporal_patch_size = getattr(patch_embed, "temporal_patch_size", 2) + patch_size = getattr(patch_embed, "patch_size", 14) + # in_channels is typically 3 (RGB) + proj = getattr(patch_embed, "proj", None) + if proj is not None: + raw_in_channels = getattr(proj, "in_channels", 3) else: - in_channels = 3 * 14 * 14 # Default for Qwen3-VL + raw_in_channels = 3 + # Total input channels = in_chans * temporal * patch_h * patch_w + input_channels = raw_in_channels * temporal_patch_size * patch_size * patch_size else: - in_channels = 3 * 14 * 14 + # Default for Qwen3-VL: 3 * 2 * 14 * 14 = 1176 + input_channels = 3 * 2 * 14 * 14 pixel_values = torch.zeros( - (num_patches, in_channels), + (num_patches, input_channels), 
dtype=visual.dtype, device=self.device, ) From 7978645235fd464e9e23eae3c904175d7b5ada65 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 01:44:41 -0500 Subject: [PATCH 121/189] add error catching in warmup. --- vllm/v1/worker/gpu_model_runner.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b9a2f9362aea..db902fdf9cad 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3125,15 +3125,26 @@ def warmup_encoder_piecewise(self) -> None: ) # Call forward_piecewise to trigger compilation - with set_forward_context(None, self.vllm_config): - _ = visual.forward_piecewise( - x=pixel_values, - pos_embeds=pos_embeds, - rotary_pos_emb_cos=rotary_cos, - rotary_pos_emb_sin=rotary_sin, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - sequence_lengths=sequence_lengths, + # Wrap in try-except to handle PyTorch compile cache errors gracefully + # (e.g., aot_autograd_artifacts assertion errors) + try: + with set_forward_context(None, self.vllm_config): + _ = visual.forward_piecewise( + x=pixel_values, + pos_embeds=pos_embeds, + rotary_pos_emb_cos=rotary_cos, + rotary_pos_emb_sin=rotary_sin, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + sequence_lengths=sequence_lengths, + ) + except (AssertionError, RuntimeError) as e: + # PyTorch compile cache may fail in some edge cases + # The compilation itself may have succeeded, continue with warmup + logger.warning( + "Encoder piecewise warmup for capture_size=%d hit an error " + "(compilation may still work at runtime): %s", + capture_size, str(e)[:200] ) # Clear CUDA cache after each warmup to free intermediate memory From fd11d38451fd3b4163a8a37b4134b051e5d915f2 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 02:07:33 -0500 Subject: [PATCH 122/189] warmup with range, then capture. --- vllm/v1/worker/gpu_model_runner.py | 106 ++++++++++++++++++----------- 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index db902fdf9cad..900f4c5f1119 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3029,10 +3029,15 @@ def _execute_encoder_piecewise_padded( return real_outputs def warmup_encoder_piecewise(self) -> None: - """Pre-capture all encoder piecewise cudagraph sizes. + """Warmup encoder piecewise compilation using compile_ranges. - This should be called during model warmup to compile all capture sizes - upfront, avoiding compilation latency during actual execution. + This mimics LM warmup approach: + 1. Warmup with sizes in compile_ranges (NOT exact capture_sizes) + 2. This triggers compilation with fake tensors (is_exact_size=False) + 3. Exact capture_sizes compile lazily during actual execution + + This avoids PyTorch AOT autograd cache issues that occur when + compiling simple graphs (like patch_embed) with real tensors. 
""" if not getattr(self.compilation_config, "encoder_cudagraph_piecewise", False): return @@ -3052,33 +3057,61 @@ def warmup_encoder_piecewise(self) -> None: spatial_merge_size = getattr(visual, "spatial_merge_size", 2) merge_size_sq = spatial_merge_size ** 2 + # Convert capture_sizes to patches for comparison + capture_sizes_patches = set(size * merge_size_sq for size in capture_sizes) + + # Get compile_ranges and find warmup sizes that are NOT exact capture_sizes + # This ensures we use fake tensors (is_exact_size=False) during compilation + compile_ranges = self.compilation_config.get_compile_ranges() + warmup_sizes = [] + for compile_range in compile_ranges: + # Use the end of each compile_range for warmup + # (similar to LM warmup approach) + warmup_size = compile_range.end * merge_size_sq # Convert to patches + if warmup_size not in capture_sizes_patches: + warmup_sizes.append(warmup_size) + + # If all compile_range ends are capture_sizes, use a size slightly off + if not warmup_sizes and compile_ranges: + # Use a size that's in the largest compile_range but not a capture_size + largest_range = compile_ranges[-1] + # Try the middle of the range + warmup_size = ((largest_range.start + largest_range.end) // 2) * merge_size_sq + if warmup_size not in capture_sizes_patches: + warmup_sizes.append(warmup_size) + else: + # Use range.end - 1 if possible + warmup_size = (largest_range.end - 1) * merge_size_sq + if warmup_size > 0 and warmup_size not in capture_sizes_patches: + warmup_sizes.append(warmup_size) + + if not warmup_sizes: + logger.info( + "Encoder piecewise warmup: no suitable warmup sizes found, " + "compilation will happen lazily during execution" + ) + return + logger.info( - "Warming up encoder piecewise for %d capture sizes: %s", - len(capture_sizes), capture_sizes + "Warming up encoder piecewise with %d compile_range sizes " + "(exact capture_sizes will compile lazily): %s", + len(warmup_sizes), [s // merge_size_sq for s in warmup_sizes] ) - for capture_size in sorted(capture_sizes): - # Convert output tokens to input patches - num_patches = capture_size * merge_size_sq - + for num_patches in sorted(warmup_sizes, reverse=True): # Create dummy inputs matching the expected shapes # pixel_values: [num_patches, C] where C = in_chans * temporal * H * W - # For Qwen3-VL: C = 3 * 2 * 14 * 14 = 1176 patch_embed = getattr(visual, "patch_embed", None) if patch_embed is not None: - # Get the actual dimensions from patch_embed temporal_patch_size = getattr(patch_embed, "temporal_patch_size", 2) patch_size = getattr(patch_embed, "patch_size", 14) - # in_channels is typically 3 (RGB) proj = getattr(patch_embed, "proj", None) if proj is not None: raw_in_channels = getattr(proj, "in_channels", 3) else: raw_in_channels = 3 - # Total input channels = in_chans * temporal * patch_h * patch_w input_channels = raw_in_channels * temporal_patch_size * patch_size * patch_size else: - # Default for Qwen3-VL: 3 * 2 * 14 * 14 = 1176 input_channels = 3 * 2 * 14 * 14 pixel_values = torch.zeros( @@ -3087,7 +3120,6 @@ def warmup_encoder_piecewise(self) -> None: device=self.device, ) - # Get hidden size from visual encoder hidden_size = getattr(visual, "hidden_size", getattr(visual, "embed_dim", 1152)) @@ -3097,7 +3129,6 @@ def warmup_encoder_piecewise(self) -> None: device=self.device, ) - # Rotary embeddings - get actual dim from model rotary_dim = hidden_size // getattr(visual, "num_heads", 16) // 2 rotary_cos = torch.zeros( (num_patches, rotary_dim), @@ -3110,7 +3141,6 @@ def 
warmup_encoder_piecewise(self) -> None: device=self.device, ) - # cu_seqlens for single "image" covering all patches cu_seqlens = torch.tensor( [0, num_patches], dtype=torch.int32, device=self.device ) @@ -3120,37 +3150,31 @@ def warmup_encoder_piecewise(self) -> None: ) logger.info( - "Warming up encoder piecewise: capture_size=%d (patches=%d)", - capture_size, num_patches + "Warming up encoder piecewise: patches=%d (tokens=%d)", + num_patches, num_patches // merge_size_sq ) - # Call forward_piecewise to trigger compilation - # Wrap in try-except to handle PyTorch compile cache errors gracefully - # (e.g., aot_autograd_artifacts assertion errors) - try: - with set_forward_context(None, self.vllm_config): - _ = visual.forward_piecewise( - x=pixel_values, - pos_embeds=pos_embeds, - rotary_pos_emb_cos=rotary_cos, - rotary_pos_emb_sin=rotary_sin, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - sequence_lengths=sequence_lengths, - ) - except (AssertionError, RuntimeError) as e: - # PyTorch compile cache may fail in some edge cases - # The compilation itself may have succeeded, continue with warmup - logger.warning( - "Encoder piecewise warmup for capture_size=%d hit an error " - "(compilation may still work at runtime): %s", - capture_size, str(e)[:200] + # Call forward_piecewise to trigger compilation with fake tensors + # (is_exact_size=False because size is NOT in capture_sizes) + with set_forward_context(None, self.vllm_config): + _ = visual.forward_piecewise( + x=pixel_values, + pos_embeds=pos_embeds, + rotary_pos_emb_cos=rotary_cos, + rotary_pos_emb_sin=rotary_sin, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + sequence_lengths=sequence_lengths, ) # Clear CUDA cache after each warmup to free intermediate memory torch.cuda.empty_cache() - logger.info("Encoder piecewise warmup complete for %d sizes", len(capture_sizes)) + logger.info( + "Encoder piecewise warmup complete. Compile_ranges warmed up, " + "exact capture_sizes (%d) will compile on first use.", + len(capture_sizes) + ) def _gather_mm_embeddings( self, From 186406c93c371c733f9674995befe31aa1bda896 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 02:25:44 -0500 Subject: [PATCH 123/189] fix warmup range. --- vllm/v1/worker/gpu_model_runner.py | 41 ++++++++++++++---------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 900f4c5f1119..6ba068a99318 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3060,29 +3060,26 @@ def warmup_encoder_piecewise(self) -> None: # Convert capture_sizes to patches for comparison capture_sizes_patches = set(size * merge_size_sq for size in capture_sizes) - # Get compile_ranges and find warmup sizes that are NOT exact capture_sizes - # This ensures we use fake tensors (is_exact_size=False) during compilation - compile_ranges = self.compilation_config.get_compile_ranges() + # For encoder warmup, we want to use a size that: + # 1. Is NOT in capture_sizes (so is_exact_size=False, uses fake tensors) + # 2. Is reasonable for memory (not using LM's large compile_range.end) + # 3. 
Is within the valid compile_range + # + # Strategy: Use max(capture_sizes) + 1 (in output tokens) as warmup size + # This ensures we compile the range without OOM from huge sizes + max_capture_size = max(capture_sizes) # In output tokens + warmup_size_tokens = max_capture_size + 1 # Slightly larger, not exact match + warmup_size = warmup_size_tokens * merge_size_sq # Convert to patches + warmup_sizes = [] - for compile_range in compile_ranges: - # Use the end of each compile_range for warmup - # (similar to LM warmup approach) - warmup_size = compile_range.end * merge_size_sq # Convert to patches - if warmup_size not in capture_sizes_patches: - warmup_sizes.append(warmup_size) - - # If all compile_range ends are capture_sizes, use a size slightly off - if not warmup_sizes and compile_ranges: - # Use a size that's in the largest compile_range but not a capture_size - largest_range = compile_ranges[-1] - # Try the middle of the range - warmup_size = ((largest_range.start + largest_range.end) // 2) * merge_size_sq - if warmup_size not in capture_sizes_patches: - warmup_sizes.append(warmup_size) - else: - # Use range.end - 1 if possible - warmup_size = (largest_range.end - 1) * merge_size_sq - if warmup_size > 0 and warmup_size not in capture_sizes_patches: + if warmup_size not in capture_sizes_patches: + warmup_sizes.append(warmup_size) + else: + # Try max - 1 if max + 1 happens to be a capture_size + warmup_size_tokens = max_capture_size - 1 + if warmup_size_tokens > 0: + warmup_size = warmup_size_tokens * merge_size_sq + if warmup_size not in capture_sizes_patches: warmup_sizes.append(warmup_size) if not warmup_sizes: From 2b28ae9c8d42b091c2d530d4c9b613617a409308 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 02:52:22 -0500 Subject: [PATCH 124/189] capture before execution. --- vllm/v1/worker/gpu_model_runner.py | 120 +++++++++++++++-------------- 1 file changed, 64 insertions(+), 56 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6ba068a99318..6fe1a35759df 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3029,15 +3029,13 @@ def _execute_encoder_piecewise_padded( return real_outputs def warmup_encoder_piecewise(self) -> None: - """Warmup encoder piecewise compilation using compile_ranges. + """Warmup and capture encoder piecewise compilation. - This mimics LM warmup approach: - 1. Warmup with sizes in compile_ranges (NOT exact capture_sizes) - 2. This triggers compilation with fake tensors (is_exact_size=False) - 3. Exact capture_sizes compile lazily during actual execution + This mimics LM's two-phase approach: + 1. Warmup phase: Compile ranges with fake tensors (is_exact_size=False) + 2. Capture phase: Compile all exact capture_sizes upfront (is_exact_size=True) - This avoids PyTorch AOT autograd cache issues that occur when - compiling simple graphs (like patch_embed) with real tensors. + This ensures no compilation happens during execution. """ if not getattr(self.compilation_config, "encoder_cudagraph_piecewise", False): return @@ -3057,47 +3055,14 @@ def warmup_encoder_piecewise(self) -> None: spatial_merge_size = getattr(visual, "spatial_merge_size", 2) merge_size_sq = spatial_merge_size ** 2 - # Convert capture_sizes to patches for comparison - capture_sizes_patches = set(size * merge_size_sq for size in capture_sizes) - - # For encoder warmup, we want to use a size that: - # 1. Is NOT in capture_sizes (so is_exact_size=False, uses fake tensors) - # 2. 
Is reasonable for memory (not using LM's large compile_range.end) - # 3. Is within the valid compile_range - # - # Strategy: Use max(capture_sizes) + 1 (in output tokens) as warmup size - # This ensures we compile the range without OOM from huge sizes - max_capture_size = max(capture_sizes) # In output tokens - warmup_size_tokens = max_capture_size + 1 # Slightly larger, not exact match - warmup_size = warmup_size_tokens * merge_size_sq # Convert to patches - - warmup_sizes = [] - if warmup_size not in capture_sizes_patches: - warmup_sizes.append(warmup_size) - else: - # Try max - 1 if max + 1 happens to be a capture_size - warmup_size_tokens = max_capture_size - 1 - if warmup_size_tokens > 0: - warmup_size = warmup_size_tokens * merge_size_sq - if warmup_size not in capture_sizes_patches: - warmup_sizes.append(warmup_size) - - if not warmup_sizes: - logger.info( - "Encoder piecewise warmup: no suitable warmup sizes found, " - "compilation will happen lazily during execution" - ) - return - - logger.info( - "Warming up encoder piecewise with %d compile_range sizes " - "(exact capture_sizes will compile lazily): %s", - len(warmup_sizes), [s // merge_size_sq for s in warmup_sizes] + # Convert capture_sizes to patches + capture_sizes_patches = sorted( + [size * merge_size_sq for size in capture_sizes], + reverse=True # Largest first like LM ) - for num_patches in sorted(warmup_sizes, reverse=True): - # Create dummy inputs matching the expected shapes - # pixel_values: [num_patches, C] where C = in_chans * temporal * H * W + # Helper to create dummy inputs for a given num_patches + def create_dummy_inputs(num_patches: int): patch_embed = getattr(visual, "patch_embed", None) if patch_embed is not None: temporal_patch_size = getattr(patch_embed, "temporal_patch_size", 2) @@ -3107,7 +3072,8 @@ def warmup_encoder_piecewise(self) -> None: raw_in_channels = getattr(proj, "in_channels", 3) else: raw_in_channels = 3 - input_channels = raw_in_channels * temporal_patch_size * patch_size * patch_size + input_channels = (raw_in_channels * temporal_patch_size + * patch_size * patch_size) else: input_channels = 3 * 2 * 14 * 14 @@ -3146,13 +3112,13 @@ def warmup_encoder_piecewise(self) -> None: [num_patches], dtype=torch.int32, device=self.device ) - logger.info( - "Warming up encoder piecewise: patches=%d (tokens=%d)", - num_patches, num_patches // merge_size_sq - ) + return (pixel_values, pos_embeds, rotary_cos, rotary_sin, + cu_seqlens, max_seqlen, sequence_lengths) + + def run_forward(num_patches: int): + (pixel_values, pos_embeds, rotary_cos, rotary_sin, + cu_seqlens, max_seqlen, sequence_lengths) = create_dummy_inputs(num_patches) - # Call forward_piecewise to trigger compilation with fake tensors - # (is_exact_size=False because size is NOT in capture_sizes) with set_forward_context(None, self.vllm_config): _ = visual.forward_piecewise( x=pixel_values, @@ -3164,12 +3130,54 @@ def warmup_encoder_piecewise(self) -> None: sequence_lengths=sequence_lengths, ) - # Clear CUDA cache after each warmup to free intermediate memory + # ============================================================ + # Phase 1: Warmup compile_ranges with fake tensors + # ============================================================ + # Use a size that's NOT in capture_sizes to trigger range compilation + # with fake tensors (is_exact_size=False) + max_capture_size = max(capture_sizes) # In output tokens + warmup_size_tokens = max_capture_size + 1 + warmup_size = warmup_size_tokens * merge_size_sq + + # Make sure warmup_size is not 
an exact capture_size + capture_sizes_patches_set = set(capture_sizes_patches) + if warmup_size in capture_sizes_patches_set: + warmup_size_tokens = max_capture_size + 2 + warmup_size = warmup_size_tokens * merge_size_sq + + logger.info( + "Phase 1: Warming up encoder piecewise compile_ranges " + "with patches=%d (tokens=%d)", + warmup_size, warmup_size_tokens + ) + + run_forward(warmup_size) + torch.cuda.empty_cache() + + logger.info("Phase 1 complete: compile_ranges warmed up") + + # ============================================================ + # Phase 2: Capture all exact sizes upfront (like LM capture_model) + # ============================================================ + logger.info( + "Phase 2: Capturing encoder piecewise for %d exact sizes: %s", + len(capture_sizes_patches), + [s // merge_size_sq for s in capture_sizes_patches] + ) + + for i, num_patches in enumerate(capture_sizes_patches): + num_tokens = num_patches // merge_size_sq + logger.info( + "Capturing encoder piecewise %d/%d: patches=%d (tokens=%d)", + i + 1, len(capture_sizes_patches), num_patches, num_tokens + ) + + run_forward(num_patches) torch.cuda.empty_cache() logger.info( - "Encoder piecewise warmup complete. Compile_ranges warmed up, " - "exact capture_sizes (%d) will compile on first use.", + "Encoder piecewise warmup and capture complete. " + "All %d capture_sizes compiled upfront.", len(capture_sizes) ) From 9a957c95fb22befd87c9942abef80e4c659f573d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 02:53:35 -0500 Subject: [PATCH 125/189] clean up log. --- vllm/compilation/piecewise_backend.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 3e9984955dc2..b32b47f96bb6 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -235,23 +235,6 @@ def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None: def __call__(self, *args: Any) -> Any: runtime_shape = args[self.sym_shape_indices[0]] - # Debug logging for encoder to understand shape mismatch - if self.is_encoder_compilation: - arg_shapes = [] - for i, a in enumerate(args[:5]): # First 5 args - if hasattr(a, 'shape'): - arg_shapes.append(f"arg[{i}].shape={tuple(a.shape)}") - else: - arg_shapes.append(f"arg[{i}]={type(a).__name__}") - logger.info( - "PIECEWISE ENCODER __call__: runtime_shape=%s, " - "sym_shape_indices=%s, compile_sizes=%s, %s", - runtime_shape, - self.sym_shape_indices, - self.compile_sizes, - ", ".join(arg_shapes), - ) - range_entry = self._find_range_for_shape(runtime_shape) assert range_entry is not None, ( From 6323e3eb649b0fa008beadd322f5fba0224c1ae6 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 03:14:06 -0500 Subject: [PATCH 126/189] remove capture in advance. 
--- vllm/v1/worker/gpu_model_runner.py | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6fe1a35759df..f907af026392 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3154,32 +3154,15 @@ def run_forward(num_patches: int): run_forward(warmup_size) torch.cuda.empty_cache() - logger.info("Phase 1 complete: compile_ranges warmed up") - - # ============================================================ - # Phase 2: Capture all exact sizes upfront (like LM capture_model) - # ============================================================ - logger.info( - "Phase 2: Capturing encoder piecewise for %d exact sizes: %s", - len(capture_sizes_patches), - [s // merge_size_sq for s in capture_sizes_patches] - ) - - for i, num_patches in enumerate(capture_sizes_patches): - num_tokens = num_patches // merge_size_sq - logger.info( - "Capturing encoder piecewise %d/%d: patches=%d (tokens=%d)", - i + 1, len(capture_sizes_patches), num_patches, num_tokens - ) - - run_forward(num_patches) - torch.cuda.empty_cache() - logger.info( - "Encoder piecewise warmup and capture complete. " - "All %d capture_sizes compiled upfront.", + "Encoder piecewise warmup complete. Compile_ranges warmed up, " + "exact capture_sizes (%d) will compile lazily during execution.", len(capture_sizes) ) + # NOTE: We skip upfront capture of exact sizes because encoder's simple + # graphs (e.g., patch_embed with Conv3d only) don't produce AOT autograd + # artifacts, causing cache assertion errors. Exact sizes compile lazily + # during execution instead. def _gather_mm_embeddings( self, From ff74d0b72d9d5b144e63f77917a8759daae53039 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 03:20:43 -0500 Subject: [PATCH 127/189] clean up logs. --- vllm/compilation/piecewise_backend.py | 26 -------------------------- vllm/v1/worker/gpu_model_runner.py | 16 ---------------- 2 files changed, 42 deletions(-) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index b32b47f96bb6..754698b72f74 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -164,15 +164,6 @@ def _maybe_compile_for_range_entry( self.to_be_compiled_ranges.remove(range_entry.compile_range) is_exact_size = range_entry.compile_range.is_single_size() - logger.info( - "PIECEWISE CAPTURE: graph_index=%d/%d, range=%s, " - "is_exact_size=%s, is_encoder=%s", - self.piecewise_compile_index, - self.total_piecewise_compiles, - range_entry.compile_range, - is_exact_size, - self.is_encoder_compilation, - ) # args are real arguments # fakify for range, real args for concrete size. 
@@ -241,23 +232,6 @@ def __call__(self, *args: Any) -> Any: f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}" ) - # Log capture vs replay - is_capture = not range_entry.compiled - is_exact_size = range_entry.compile_range.is_single_size() - self._maybe_compile_for_range_entry(range_entry, args) # type: ignore[arg-type] - # Log replay (capture is logged inside _maybe_compile_for_range_entry) - if not is_capture: - logger.info( - "PIECEWISE REPLAY: graph_index=%d/%d, shape=%d, range=%s, " - "is_exact_size=%s, is_encoder=%s", - self.piecewise_compile_index, - self.total_piecewise_compiles, - runtime_shape, - range_entry.compile_range, - is_exact_size, - self.is_encoder_compilation, - ) - return range_entry.runnable(*args) # type: ignore[union-attr] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f907af026392..ef52d383fe62 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3145,25 +3145,9 @@ def run_forward(num_patches: int): warmup_size_tokens = max_capture_size + 2 warmup_size = warmup_size_tokens * merge_size_sq - logger.info( - "Phase 1: Warming up encoder piecewise compile_ranges " - "with patches=%d (tokens=%d)", - warmup_size, warmup_size_tokens - ) - run_forward(warmup_size) torch.cuda.empty_cache() - logger.info( - "Encoder piecewise warmup complete. Compile_ranges warmed up, " - "exact capture_sizes (%d) will compile lazily during execution.", - len(capture_sizes) - ) - # NOTE: We skip upfront capture of exact sizes because encoder's simple - # graphs (e.g., patch_embed with Conv3d only) don't produce AOT autograd - # artifacts, causing cache assertion errors. Exact sizes compile lazily - # during execution instead. - def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", From 96a1dd68de0a184f22a76d00c1173424d5b5374e Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 03:34:06 -0500 Subject: [PATCH 128/189] set cudagraph mode in forward context. --- vllm/v1/worker/gpu_model_runner.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ef52d383fe62..2bf46a19c0dd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2983,7 +2983,14 @@ def _execute_encoder_piecewise_padded( device=max_seqlen.device) # Call forward_piecewise directly with pre-computed and padded tensors - with set_forward_context(None, self.vllm_config): + # Enable CUDA graph capture/replay by setting the proper forward context + batch_desc = BatchDescriptor(num_tokens=padded_num_patches) + with set_forward_context( + None, + self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + batch_descriptor=batch_desc, + ): encoder_output = visual.forward_piecewise( x=padded_pixel_values, pos_embeds=padded_pos_embeds, From ee90e736a0ac55418165274173bdcb4fe6e547fa Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 04:00:25 -0500 Subject: [PATCH 129/189] capture encoder piecewise cudagraph in capture_model(). 
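Hooking the encoder into capture_model() lets its graphs be recorded during the
same warmup phase as the decoder's. Underneath, the wrapper follows the usual
warm-up-then-record pattern; a generic torch-level illustration of that pattern
(raw torch.cuda graph API, not vLLM's CUDAGraphWrapper; requires a CUDA device):

    import torch

    layer = torch.nn.Linear(64, 64, device="cuda")
    static_in = torch.zeros(128, 64, device="cuda")        # fixed-address input buffer

    # warmup on a side stream, as the PyTorch CUDA-graph docs recommend
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        layer(static_in)
    torch.cuda.current_stream().wait_stream(s)

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):                              # record the graph
        static_out = layer(static_in)

    static_in.copy_(torch.randn(128, 64, device="cuda"))   # refill the same buffer
    g.replay()                                             # static_out now holds new results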
--- vllm/v1/worker/gpu_model_runner.py | 105 +++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2bf46a19c0dd..2c305f5b19ad 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3155,6 +3155,106 @@ def run_forward(num_patches: int): run_forward(warmup_size) torch.cuda.empty_cache() + def _capture_encoder_piecewise_cudagraphs(self) -> None: + """Capture encoder piecewise CUDA graphs for all capture sizes. + + Called during capture_model() when cudagraph capturing is enabled. + This triggers CUDAGraphWrapper to capture graphs for each size. + """ + capture_sizes = getattr( + self.compilation_config, "encoder_cudagraph_capture_sizes", None + ) + if capture_sizes is None or len(capture_sizes) == 0: + return + + model = self.model + visual = getattr(model, "visual", None) + if visual is None or not hasattr(visual, "forward_piecewise"): + return + + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) + merge_size_sq = spatial_merge_size ** 2 + + # Convert capture_sizes to patches, largest first + capture_sizes_patches = sorted( + [size * merge_size_sq for size in capture_sizes], + reverse=True + ) + + logger.info( + "Capturing encoder piecewise CUDA graphs for %d sizes", + len(capture_sizes_patches) + ) + + for num_patches in capture_sizes_patches: + # Create dummy inputs + patch_embed = getattr(visual, "patch_embed", None) + if patch_embed is not None: + temporal_patch_size = getattr(patch_embed, "temporal_patch_size", 2) + patch_size = getattr(patch_embed, "patch_size", 14) + proj = getattr(patch_embed, "proj", None) + raw_in_channels = getattr(proj, "in_channels", 3) if proj else 3 + input_channels = (raw_in_channels * temporal_patch_size + * patch_size * patch_size) + else: + input_channels = 3 * 2 * 14 * 14 + + pixel_values = torch.zeros( + (num_patches, input_channels), + dtype=visual.dtype, + device=self.device, + ) + + hidden_size = getattr(visual, "hidden_size", + getattr(visual, "embed_dim", 1152)) + pos_embeds = torch.zeros( + (num_patches, hidden_size), + dtype=visual.dtype, + device=self.device, + ) + + rotary_dim = hidden_size // getattr(visual, "num_heads", 16) // 2 + rotary_cos = torch.zeros( + (num_patches, rotary_dim), + dtype=visual.dtype, + device=self.device, + ) + rotary_sin = torch.zeros( + (num_patches, rotary_dim), + dtype=visual.dtype, + device=self.device, + ) + + cu_seqlens = torch.tensor( + [0, num_patches], dtype=torch.int32, device=self.device + ) + max_seqlen = torch.tensor(num_patches, device=self.device) + sequence_lengths = torch.tensor( + [num_patches], dtype=torch.int32, device=self.device + ) + + # Call with PIECEWISE mode to trigger CUDAGraphWrapper capture + batch_desc = BatchDescriptor(num_tokens=num_patches) + with set_forward_context( + None, + self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + batch_descriptor=batch_desc, + ): + _ = visual.forward_piecewise( + x=pixel_values, + pos_embeds=pos_embeds, + rotary_pos_emb_cos=rotary_cos, + rotary_pos_emb_sin=rotary_sin, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + sequence_lengths=sequence_lengths, + ) + + torch.cuda.empty_cache() + + logger.info("Encoder piecewise CUDA graph capture complete") + def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", @@ -5648,6 +5748,11 @@ def freeze_gc(): after_encoder_free / 1024**3, ) + # Capture encoder piecewise CUDA graphs (if enabled) + if getattr(self.compilation_config, 
"encoder_cudagraph_piecewise", False): + with freeze_gc(): + self._capture_encoder_piecewise_cudagraphs() + # Capture decoder/LM CUDA graphs in their own context with global pool with freeze_gc(), graph_capture(device=self.device): before_decoder_free = torch.cuda.mem_get_info()[0] From dba634a92e7d1909a13d7534485687f764a209eb Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 04:10:11 -0500 Subject: [PATCH 130/189] try separate compile and capture. --- vllm/v1/worker/gpu_model_runner.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2c305f5b19ad..c22e6f285453 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3233,7 +3233,21 @@ def _capture_encoder_piecewise_cudagraphs(self) -> None: [num_patches], dtype=torch.int32, device=self.device ) - # Call with PIECEWISE mode to trigger CUDAGraphWrapper capture + # Two-pass capture like LM: + # Pass 1: NONE mode - triggers torch.compile without CUDA graph capture + with set_forward_context(None, self.vllm_config): + _ = visual.forward_piecewise( + x=pixel_values, + pos_embeds=pos_embeds, + rotary_pos_emb_cos=rotary_cos, + rotary_pos_emb_sin=rotary_sin, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + sequence_lengths=sequence_lengths, + ) + + # Pass 2: PIECEWISE mode - triggers CUDAGraphWrapper capture + # (compilation already done in pass 1) batch_desc = BatchDescriptor(num_tokens=num_patches) with set_forward_context( None, From 14e8b152edb61a63e9afcea4b79eb835f98d494f Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 04:19:18 -0500 Subject: [PATCH 131/189] try disable torch cache. --- vllm/compilation/backends.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index d75bc1bd4772..eae2c2fad543 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -555,6 +555,12 @@ def __init__( # in future we need PostGradPassManager.uuid() to be executed # only at compile time. self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config) + + # Disable cache for encoder compilation to avoid assertion errors + # with simple graphs (e.g., Conv3d) that don't produce AOT artifacts + if self.is_encoder: + self.inductor_config["force_disable_caches"] = True + # `torch.compile` is JIT compiled, so we don't need to # do anything here From ea90f030b68f317c233097d6fbe3e3a974d2cb1f Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 13:20:37 -0500 Subject: [PATCH 132/189] try only disable torch aot cache. --- vllm/compilation/backends.py | 5 ----- vllm/compilation/compiler_interface.py | 4 ++++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index eae2c2fad543..5cf57a34dd5c 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -556,11 +556,6 @@ def __init__( # only at compile time. 
self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config) - # Disable cache for encoder compilation to avoid assertion errors - # with simple graphs (e.g., Conv3d) that don't produce AOT artifacts - if self.is_encoder: - self.inductor_config["force_disable_caches"] = True - # `torch.compile` is JIT compiled, so we don't need to # do anything here diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index bb478fceb125..4b89f8c40892 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -620,6 +620,10 @@ def set_inductor_config(config: dict[str, Any], compile_range: Range) -> None: def set_functorch_config() -> None: torch._functorch.config.bundled_autograd_cache = False + # Disable AOT autograd cache to avoid assertion errors with simple graphs + # (e.g., Conv3d in vision encoders) that don't produce AOT artifacts. + # This matches what InductorAdaptor does with context manager. + torch._functorch.config.enable_autograd_cache = False class EagerAdaptor(CompilerInterface): From d0f9807779d3d0d6b38749938552a3d415965054 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 13:46:20 -0500 Subject: [PATCH 133/189] try disable fx graph cache. --- vllm/compilation/compiler_interface.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 4b89f8c40892..581601f40354 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -624,6 +624,9 @@ def set_functorch_config() -> None: # (e.g., Conv3d in vision encoders) that don't produce AOT artifacts. # This matches what InductorAdaptor does with context manager. torch._functorch.config.enable_autograd_cache = False + # Disable fx_graph_cache to avoid assertion errors during cache save + # for graphs that don't produce AOT autograd artifacts. + torch._inductor.config.fx_graph_cache = False class EagerAdaptor(CompilerInterface): From 00c03e358c63a7b2b8deca431b54832a2bce69f8 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 13:54:34 -0500 Subject: [PATCH 134/189] disable encoder torch cache. --- vllm/compilation/backends.py | 6 ++++++ vllm/compilation/compiler_interface.py | 7 ------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 5cf57a34dd5c..97a968a98972 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -556,6 +556,12 @@ def __init__( # only at compile time. self.inductor_config = deepcopy(self.compilation_config.inductor_compile_config) + # Disable cache for encoder compilation to avoid assertion errors + # with simple graphs (e.g., Conv3d) that don't produce AOT artifacts. + # This skips the save in InductorStandaloneAdaptor.compile(). 
+ if self.is_encoder: + self.inductor_config["force_disable_caches"] = True + # `torch.compile` is JIT compiled, so we don't need to # do anything here diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 581601f40354..bb478fceb125 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -620,13 +620,6 @@ def set_inductor_config(config: dict[str, Any], compile_range: Range) -> None: def set_functorch_config() -> None: torch._functorch.config.bundled_autograd_cache = False - # Disable AOT autograd cache to avoid assertion errors with simple graphs - # (e.g., Conv3d in vision encoders) that don't produce AOT artifacts. - # This matches what InductorAdaptor does with context manager. - torch._functorch.config.enable_autograd_cache = False - # Disable fx_graph_cache to avoid assertion errors during cache save - # for graphs that don't produce AOT autograd artifacts. - torch._inductor.config.fx_graph_cache = False class EagerAdaptor(CompilerInterface): From 41ff60460ab02af5dc34643a2b0df46ad932e72a Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 15:52:53 -0500 Subject: [PATCH 135/189] skip piecewise if not enabled in cli. --- vllm/compilation/backends.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 97a968a98972..d690c61ad56e 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -412,6 +412,25 @@ def call_module( i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] + # Check if we should use piecewise backend for this compilation + # For encoder with encoder_cudagraph_piecewise=False, skip piecewise + # backend entirely to avoid shape tracking issues. The encoder will + # use torch.compile directly and EncoderCudaGraphManager handles + # full cudagraph capture separately. + encoder_skip_piecewise = ( + self.vllm_backend.is_encoder + and not getattr( + self.compilation_config, "encoder_cudagraph_piecewise", False + ) + ) + + if encoder_skip_piecewise: + # For encoder without piecewise mode, just use the compiled + # submodule directly. EncoderCudaGraphManager will capture + # the full graph later. + self.module.__dict__[target] = submod + return output + # Lazy import here to avoid circular import from .piecewise_backend import PiecewiseBackend @@ -424,10 +443,13 @@ def call_module( self.vllm_backend, ) - if ( + # Check if we should use piecewise cudagraphs for this compilation + use_piecewise_cudagraph = ( self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs() and not self.compilation_config.use_inductor_graph_partition - ): + ) + + if use_piecewise_cudagraph: # We're using Dynamo-based piecewise splitting, so we wrap # the whole subgraph with a static graph wrapper. from .cuda_graph import CUDAGraphOptions From f918ee963ff3171c9e29dc4fa28b48d77efb3b58 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 17:48:50 -0500 Subject: [PATCH 136/189] add encoder cudagraph batch sizes. --- vllm/config/compilation.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index bc99e5eed4bc..050fc0d7fa0b 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -489,6 +489,14 @@ class CompilationConfig: the kernel launch savings. 
Set to False if you observe throughput regression with encoder CUDA graphs.""" + encoder_cudagraph_batch_sizes: list[int] | None = None + """Batch sizes for grouped batched CUDA graph capture. + When set (e.g., [4]), captures graphs for processing multiple images + together. Images are grouped by similar grid sizes and padded to the + largest grid in each group. Single graph replay for the whole group. + Example: [4] captures batch_size=4 graphs only (1-3 images use eager). + Default None uses legacy one-by-one mode (batch_size=1 per image).""" + encoder_cudagraph_piecewise: bool = False """Enable piecewise CUDA graph mode for encoder (ViT). When True, torch.compile splits the encoder graph at attention ops, so: From 3296099467f9866f5d3e7b45aa5cfacd8089760e Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 17:49:39 -0500 Subject: [PATCH 137/189] support batch size > 1 in encoder cudagraph. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 397 ++++++++++++++++----- 1 file changed, 299 insertions(+), 98 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index ae67e47461a0..80952a514a55 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -114,6 +114,9 @@ def __init__( self.dtype = dtype self.verbose = verbose + # Get batch sizes from config (for grouped batched mode) + self.batch_sizes = self._get_batch_sizes_from_config() + # Get grid configs from config or use defaults (for exact match) if grid_configs is None: grid_configs = self._get_grid_configs_from_config() @@ -154,8 +157,9 @@ def __init__( self.grid_configs = filtered_grids - # CUDA graph storage - keyed by (t, h, w) tuple - self.graphs: dict[tuple[int, int, int], torch.cuda.CUDAGraph] = {} + # CUDA graph storage - keyed by (batch_size, t, h, w) tuple + # For legacy mode (batch_sizes=None), key is (1, t, h, w) + self.graphs: dict[tuple[int, int, int, int], torch.cuda.CUDAGraph] = {} # Use private pools by default to avoid segfaults with rapid back-to-back # graph replays during one-by-one multi-image processing. # Set VLLM_ENCODER_SHARED_POOL=1 to use shared pool (saves memory but @@ -170,21 +174,21 @@ def __init__( else: self.pool = None # Each graph uses private memory (default) - # Pre-allocated input/output buffers per grid config - # Key: (t, h, w), Value: {"pixel_values": tensor, "grid_thw": list} - self.input_buffers: dict[tuple[int, int, int], dict[str, Any]] = {} - self.output_buffers: dict[tuple[int, int, int], torch.Tensor] = {} + # Pre-allocated input/output buffers per graph config + # Key: (batch_size, t, h, w), Value: {"pixel_values": tensor, "grid_thw": list} + self.input_buffers: dict[tuple[int, int, int, int], dict[str, Any]] = {} + self.output_buffers: dict[tuple[int, int, int, int], torch.Tensor] = {} # Cached pre-computed tensors for CUDA graph replay (exact match mode) - # Key: (t, h, w), Value: dict with pos_embeds, rotary embeddings, etc. - self.cached_tensors: dict[tuple[int, int, int], dict[str, torch.Tensor]] = {} + # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary embeddings, etc. 
+ self.cached_tensors: dict[tuple[int, int, int, int], dict[str, torch.Tensor]] = {} # Input buffers for embeddings (padded mode with runtime computation) - # Key: (t, h, w), Value: dict with pos_embeds, rotary_cos/sin, cu_seqlens - self.embedding_buffers: dict[tuple[int, int, int], dict[str, torch.Tensor]] = {} + # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary_cos/sin, cu_seqlens + self.embedding_buffers: dict[tuple[int, int, int, int], dict[str, torch.Tensor]] = {} # Store metadata about captured graphs - self.captured_metadata: dict[tuple[int, int, int], dict[str, Any]] = {} + self.captured_metadata: dict[tuple[int, int, int, int], dict[str, Any]] = {} # Vision encoder reference for runtime embedding computation (set at capture) self.vision_encoder = None @@ -283,6 +287,26 @@ def _get_max_grid_size_from_config(self) -> int: ) return max_size + def _get_batch_sizes_from_config(self) -> list[int]: + """Get batch sizes for grouped batched CUDA graph capture. + + When set (e.g., [4]), captures graphs for processing multiple images + together with the same grid size. Images are grouped by grid size and + padded to the largest in each group. + + Default is [1] for legacy one-by-one mode. + """ + compilation_config = self.vllm_config.compilation_config + if compilation_config is None: + return [1] + + batch_sizes = getattr( + compilation_config, "encoder_cudagraph_batch_sizes", None + ) + if batch_sizes is None: + return [1] # Legacy mode: batch_size=1 only + return sorted(batch_sizes) + def _grid_to_key(self, grid_thw: list[list[int]]) -> tuple[int, int, int] | None: """ Convert a grid_thw list to a hashable key. @@ -309,6 +333,7 @@ def _prepare_dummy_inputs_for_grid( self, grid_config: tuple[int, int, int], vision_encoder: nn.Module, + batch_size: int = 1, ) -> dict[str, Any]: """ Prepare dummy inputs for CUDA graph capture with a specific grid config. 
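For reference, a self-contained sketch of how the new graph keys behave: the key now carries the batch size in front of the (t, h, w) grid, so legacy single-image graphs live at batch_size=1 and grouped graphs at the sizes listed in encoder_cudagraph_batch_sizes. Illustrative only; the function mirrors the lookup logic but is not the patch's code.

def graph_key(grid_thw: list[list[int]]) -> tuple[int, int, int, int] | None:
    # All images in a batch must share one grid for an exact-match graph.
    if not grid_thw:
        return None
    t, h, w = grid_thw[0]
    if any(grid != [t, h, w] for grid in grid_thw[1:]):
        return None  # mixed grids fall back to padded/eager paths
    return (len(grid_thw), t, h, w)


assert graph_key([[1, 32, 32]]) == (1, 1, 32, 32)       # legacy one-by-one mode
assert graph_key([[1, 32, 32]] * 4) == (4, 1, 32, 32)   # grouped batch of four
assert graph_key([[1, 32, 32], [1, 16, 16]]) is None    # mixed grids: no exact match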
@@ -316,6 +341,7 @@ def _prepare_dummy_inputs_for_grid( Args: grid_config: Tuple of (T, H, W) in patch units vision_encoder: The vision encoder module + batch_size: Number of images in the batch (all same grid) Returns: Dict with pixel_values, grid_thw, and metadata @@ -333,39 +359,47 @@ def _prepare_dummy_inputs_for_grid( temporal_patch_size * patch_size * patch_size * in_channels ) - # Calculate number of pixel patches (before patch embedding) + # Calculate number of pixel patches per image # h, w are in patch units, so num_patches = t * h * w - num_pixel_patches = t * h * w + num_pixel_patches_per_image = t * h * w + total_pixel_patches = num_pixel_patches_per_image * batch_size - # Create dummy pixel values (zeros are fine for warmup/capture) + # Create dummy pixel values for batch (zeros are fine for warmup/capture) pixel_values = torch.zeros( - num_pixel_patches, + total_pixel_patches, patch_input_channels, dtype=self.dtype, device=self.device, ) - # Grid (temporal, height, width) for this configuration - grid_thw = [[t, h, w]] + # Grid (temporal, height, width) for each image in batch + grid_thw = [[t, h, w]] * batch_size - # Calculate output tokens - output_tokens = self._compute_output_tokens(grid_config, spatial_merge_size) + # Calculate output tokens per image and total + output_tokens_per_image = self._compute_output_tokens( + grid_config, spatial_merge_size + ) + total_output_tokens = output_tokens_per_image * batch_size return { "pixel_values": pixel_values, "grid_thw": grid_thw, - "num_output_tokens": output_tokens, - "num_pixel_patches": num_pixel_patches, + "num_output_tokens": total_output_tokens, + "num_output_tokens_per_image": output_tokens_per_image, + "num_pixel_patches": total_pixel_patches, + "num_pixel_patches_per_image": num_pixel_patches_per_image, "patch_input_channels": patch_input_channels, + "batch_size": batch_size, } def capture_graph_for_grid( self, grid_config: tuple[int, int, int], vision_encoder: nn.Module, + batch_size: int = 1, ) -> None: """ - Capture a CUDA graph for the given grid configuration. + Capture a CUDA graph for the given grid configuration and batch size. 
This method pre-computes and caches all grid-dependent tensors (position embeddings, rotary embeddings, cu_seqlens) to eliminate @@ -374,25 +408,36 @@ def capture_graph_for_grid( Args: grid_config: Tuple of (T, H, W) in patch units vision_encoder: The vision encoder module + batch_size: Number of images with same grid (default 1) """ - logger.debug("Capturing encoder CUDA graph for grid config %s", grid_config) + t, h, w = grid_config + graph_key = (batch_size, t, h, w) + logger.debug( + "Capturing encoder CUDA graph for key %s (batch_size=%d, grid=%s)", + graph_key, batch_size, grid_config + ) - # Prepare dummy inputs - dummy_inputs = self._prepare_dummy_inputs_for_grid(grid_config, vision_encoder) + # Prepare dummy inputs for batch + dummy_inputs = self._prepare_dummy_inputs_for_grid( + grid_config, vision_encoder, batch_size + ) pixel_values = dummy_inputs["pixel_values"] grid_thw = dummy_inputs["grid_thw"] - # Store input buffer reference - self.input_buffers[grid_config] = { + # Store input buffer reference with new key format + self.input_buffers[graph_key] = { "pixel_values": pixel_values.clone(), "grid_thw": grid_thw, } # Store metadata - self.captured_metadata[grid_config] = { + self.captured_metadata[graph_key] = { "num_output_tokens": dummy_inputs["num_output_tokens"], + "num_output_tokens_per_image": dummy_inputs["num_output_tokens_per_image"], "num_pixel_patches": dummy_inputs["num_pixel_patches"], + "num_pixel_patches_per_image": dummy_inputs["num_pixel_patches_per_image"], "patch_input_channels": dummy_inputs["patch_input_channels"], + "batch_size": batch_size, } # Store vision encoder reference for runtime embedding computation @@ -404,13 +449,13 @@ def capture_graph_for_grid( ) and hasattr(vision_encoder, "precompute_for_cudagraph") if has_cudagraph_forward: - # Pre-compute tensors for the bucket grid (used for exact match mode) + # Pre-compute tensors for the batched grid (used for exact match mode) cached = vision_encoder.precompute_for_cudagraph(grid_thw) - self.cached_tensors[grid_config] = cached + self.cached_tensors[graph_key] = cached logger.debug( - "Pre-computed cached tensors for grid config %s: " + "Pre-computed cached tensors for key %s: " "pos_embeds=%s, cu_seqlens=%s", - grid_config, + graph_key, cached["pos_embeds"].shape, cached["cu_seqlens"].shape, ) @@ -418,7 +463,7 @@ def capture_graph_for_grid( # Create INPUT BUFFERS for embeddings (padded mode runtime computation) # These buffers can be updated at runtime before graph replay # Note: max_seqlen is a CPU scalar tensor to avoid GPU sync on .item() - self.embedding_buffers[grid_config] = { + self.embedding_buffers[graph_key] = { "pos_embeds": cached["pos_embeds"].clone(), "rotary_pos_emb_cos": cached["rotary_pos_emb_cos"].clone(), "rotary_pos_emb_sin": cached["rotary_pos_emb_sin"].clone(), @@ -426,7 +471,7 @@ def capture_graph_for_grid( "max_seqlen": cached["max_seqlen"].clone(), "sequence_lengths": cached["sequence_lengths"].clone(), } - embed_buffers = self.embedding_buffers[grid_config] + embed_buffers = self.embedding_buffers[graph_key] # Warmup run with embedding buffers # Use set_forward_context to provide vllm_config for torch.compile @@ -443,12 +488,12 @@ def capture_graph_for_grid( max_seqlen=embed_buffers["max_seqlen"], sequence_lengths=embed_buffers["sequence_lengths"], ) - self.output_buffers[grid_config] = torch.empty_like(warmup_output) + self.output_buffers[graph_key] = torch.empty_like(warmup_output) # Capture the graph with embedding BUFFERS (not constants) # This allows updating 
embeddings at runtime for padded mode graph = torch.cuda.CUDAGraph() - input_buffer = self.input_buffers[grid_config]["pixel_values"] + input_buffer = self.input_buffers[graph_key]["pixel_values"] with ( set_forward_context( @@ -466,7 +511,7 @@ def capture_graph_for_grid( max_seqlen=embed_buffers["max_seqlen"], sequence_lengths=embed_buffers["sequence_lengths"], ) - self.output_buffers[grid_config].copy_(output) + self.output_buffers[graph_key].copy_(output) else: # Fallback to original forward (will have CPU gaps) logger.warning( @@ -480,11 +525,11 @@ def capture_graph_for_grid( vllm_config=self.vllm_config, ): warmup_output = vision_encoder(pixel_values, grid_thw=grid_thw) - self.output_buffers[grid_config] = torch.empty_like(warmup_output) + self.output_buffers[graph_key] = torch.empty_like(warmup_output) # Capture the graph graph = torch.cuda.CUDAGraph() - input_buffer = self.input_buffers[grid_config]["pixel_values"] + input_buffer = self.input_buffers[graph_key]["pixel_values"] with ( set_forward_context( @@ -494,13 +539,13 @@ def capture_graph_for_grid( torch.cuda.graph(graph, self.pool), ): output = vision_encoder(input_buffer, grid_thw=grid_thw) - self.output_buffers[grid_config].copy_(output) + self.output_buffers[graph_key].copy_(output) - self.graphs[grid_config] = graph + self.graphs[graph_key] = graph cached_suffix = " (with cached tensors)" if has_cudagraph_forward else "" logger.debug( - "Captured encoder CUDA graph for grid config %s -> %d output tokens%s", - grid_config, + "Captured encoder CUDA graph for key %s -> %d output tokens%s", + graph_key, dummy_inputs["num_output_tokens"], cached_suffix, ) @@ -512,7 +557,7 @@ def capture( embed_multimodal_fn: Callable, ) -> None: """ - Capture CUDA graphs for all configured grid configurations. + Capture CUDA graphs for all configured grid and batch size combinations. Args: vision_encoder: The vision encoder module (e.g., Qwen3_VisionTransformer) @@ -522,12 +567,21 @@ def capture( logger.warning("Encoder CUDA graphs already captured, skipping") return + # Build list of (batch_size, grid_config) combinations to capture + capture_combinations = [] + for batch_size in self.batch_sizes: + for grid_config in self.grid_configs: + capture_combinations.append((batch_size, grid_config)) + # Log initial memory state free_mem_before, total_mem = torch.cuda.mem_get_info(self.device) used_mem_before = total_mem - free_mem_before logger.info( - "Capturing encoder CUDA graphs for %d grid configurations " + "Capturing encoder CUDA graphs for %d combinations " + "(batch_sizes=%s, grids=%d) " "(GPU memory: %.2f GiB used, %.2f GiB free)", + len(capture_combinations), + self.batch_sizes, len(self.grid_configs), used_mem_before / 1024**3, free_mem_before / 1024**3, @@ -535,27 +589,28 @@ def capture( # Capture from smallest to largest so that common smaller grids are # captured first. If we run out of memory, only large grids will fail. - configs_to_capture = sorted( - self.grid_configs, - key=lambda x: x[0] * x[1] * x[2], + capture_combinations = sorted( + capture_combinations, + key=lambda x: x[0] * x[1][0] * x[1][1] * x[1][2], # batch * t * h * w reverse=False, # Smallest first ) if is_global_first_rank(): - configs_to_capture = tqdm( - configs_to_capture, desc="Capturing encoder CUDA graphs" + capture_combinations = tqdm( + capture_combinations, desc="Capturing encoder CUDA graphs" ) # Capture each graph. For single-GPU mode, capture directly on current stream # to avoid stream synchronization overhead at replay time. 
# For multi-GPU mode, use graph_capture() context to coordinate with TP/PP. - for grid_config in configs_to_capture: + for batch_size, grid_config in capture_combinations: try: if self.is_single_gpu: # Single-GPU: capture on current stream (no separate stream) self.capture_graph_for_grid( grid_config, vision_encoder, + batch_size=batch_size, ) else: # Multi-GPU: use graph_capture() for TP/PP coordination @@ -563,11 +618,13 @@ def capture( self.capture_graph_for_grid( grid_config, vision_encoder, + batch_size=batch_size, ) except Exception as e: logger.warning( - "Failed to capture encoder CUDA graph for grid config " - "%s: %s. Will use eager mode.", + "Failed to capture encoder CUDA graph for " + "batch_size=%d, grid=%s: %s. Will use eager mode.", + batch_size, grid_config, e, ) @@ -591,26 +648,34 @@ def capture( def get_graph_for_grid( self, grid_thw: list[list[int]], - ) -> tuple[int, int, int] | None: + batch_size: int = 1, + ) -> tuple[int, int, int, int] | None: """ - Check if a CUDA graph is available for the given grid configuration. + Check if a CUDA graph is available for the given grid and batch size. Args: - grid_thw: List of [T, H, W] for each image + grid_thw: List of [T, H, W] for each image (must all be same grid) + batch_size: Number of images (default 1 for legacy mode) Returns: - The grid config key if a matching graph exists, None otherwise + The graph key (batch_size, t, h, w) if matching graph exists, None otherwise """ - key = self._grid_to_key(grid_thw) - if key is None: + if len(grid_thw) < 1: return None + # All images must have the same grid for batched mode + t, h, w = grid_thw[0] + for grid in grid_thw[1:]: + if grid != [t, h, w]: + return None # Mixed grids not supported + key = (batch_size, t, h, w) return key if key in self.graphs else None def find_bucket_for_tokens( self, num_tokens: int, spatial_merge_size: int = 2, - ) -> tuple[int, int, int] | None: + batch_size: int = 1, + ) -> tuple[int, int, int, int] | None: """ Find the smallest captured grid that can fit the given token count. @@ -618,46 +683,53 @@ def find_bucket_for_tokens( are padded to match the bucket size. Args: - num_tokens: Number of output tokens needed + num_tokens: Number of output tokens needed (per image) spatial_merge_size: Merge size (default 2) + batch_size: Required batch size (default 1) Returns: - Grid config (T, H, W) of the best bucket, or None if too large + Graph key (batch_size, T, H, W) of the best bucket, or None if too large """ - best_grid = None + best_key = None best_bucket_tokens = float("inf") - for grid_key in self.graphs: - bucket_tokens = self._compute_output_tokens(grid_key, spatial_merge_size) + for graph_key in self.graphs: + key_batch_size, t, h, w = graph_key + if key_batch_size != batch_size: + continue # Skip graphs with wrong batch size + grid = (t, h, w) + bucket_tokens = self._compute_output_tokens(grid, spatial_merge_size) if bucket_tokens >= num_tokens and bucket_tokens < best_bucket_tokens: best_bucket_tokens = bucket_tokens - best_grid = grid_key + best_key = graph_key - return best_grid + return best_key def run( self, pixel_values: torch.Tensor, grid_thw: list[list[int]], + batch_size: int = 1, ) -> torch.Tensor | None: """ Run the vision encoder using a captured CUDA graph if available. 
Args: pixel_values: Input pixel values [num_patches, patch_channels] - grid_thw: List of [T, H, W] for each image + grid_thw: List of [T, H, W] for each image (all must be same grid) + batch_size: Number of images in batch (default 1 for legacy mode) Returns: Vision encoder output tensor if graph was used, None if no matching graph """ - grid_key = self.get_graph_for_grid(grid_thw) + graph_key = self.get_graph_for_grid(grid_thw, batch_size=batch_size) - if grid_key is None: + if graph_key is None: # Don't count miss here - caller may try run_padded() next return None # Verify input dimensions match - input_buffer = self.input_buffers[grid_key]["pixel_values"] + input_buffer = self.input_buffers[graph_key]["pixel_values"] if pixel_values.shape != input_buffer.shape: logger.warning( "Pixel values shape mismatch: expected %s, got %s. " @@ -705,9 +777,9 @@ def run( # For exact match, restore cached embeddings only if modified by run_padded(). # This avoids 6 unnecessary tensor copies when only using exact-match mode. - if grid_key in self.modified_grids: - embed_buffers = self.embedding_buffers[grid_key] - cached = self.cached_tensors[grid_key] + if graph_key in self.modified_grids: + embed_buffers = self.embedding_buffers[graph_key] + cached = self.cached_tensors[graph_key] embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) embed_buffers["rotary_pos_emb_cos"].copy_( cached["rotary_pos_emb_cos"], non_blocking=True @@ -720,12 +792,12 @@ def run( embed_buffers["sequence_lengths"].copy_( cached["sequence_lengths"], non_blocking=True ) - self.modified_grids.discard(grid_key) + self.modified_grids.discard(graph_key) if self.verbose: logger.info( - "run(): grid_key=%s, input_shape=%s, buffer_shape=%s", - grid_key, + "run(): graph_key=%s, input_shape=%s, buffer_shape=%s", + graph_key, pixel_values.shape, input_buffer.shape, ) @@ -734,15 +806,15 @@ def run( # Single-GPU optimized path: graph was captured on current stream, # so buffer copies and replay are on the same stream - no sync needed. # Return view directly; caller must use output before next run() call. - self.graphs[grid_key].replay() - return self.output_buffers[grid_key] + self.graphs[graph_key].replay() + return self.output_buffers[graph_key] else: # Multi-GPU path: graph was captured on a separate stream. # Sync current stream before replay to ensure buffer copies complete. torch.cuda.current_stream().synchronize() # Replay the graph - self.graphs[grid_key].replay() + self.graphs[graph_key].replay() # Record event after replay for lightweight sync in next call. 
if self.replay_done_event is None: @@ -753,7 +825,7 @@ def run( self.replay_done_event.synchronize() # Return a clone of the output to avoid issues with buffer reuse - return self.output_buffers[grid_key].clone() + return self.output_buffers[graph_key].clone() def run_padded( self, @@ -790,14 +862,17 @@ def run_padded( logger.debug("Vision encoder not available for padded mode") return None - # Find the smallest bucket that fits - bucket_grid = self.find_bucket_for_tokens(num_output_tokens, spatial_merge_size) - if bucket_grid is None: + # Find the smallest bucket that fits (for batch_size=1) + graph_key = self.find_bucket_for_tokens( + num_output_tokens, spatial_merge_size, batch_size=1 + ) + if graph_key is None: # Don't count miss here - caller will count it when falling back to eager max_available = ( max( - self._compute_output_tokens(g, spatial_merge_size) - for g in self.graphs + self._compute_output_tokens((t, h, w), spatial_merge_size) + for (bs, t, h, w) in self.graphs + if bs == 1 # Only consider batch_size=1 graphs ) if self.graphs else 0 @@ -810,15 +885,17 @@ def run_padded( return None # Check if we have embedding buffers for this bucket - if bucket_grid not in self.embedding_buffers: - logger.debug("No embedding buffers for bucket %s", bucket_grid) + if graph_key not in self.embedding_buffers: + logger.debug("No embedding buffers for bucket %s", graph_key) return None - bucket_tokens = self._compute_output_tokens(bucket_grid, spatial_merge_size) + # Extract grid from graph_key for _compute_output_tokens + _, t, h, w = graph_key + bucket_tokens = self._compute_output_tokens((t, h, w), spatial_merge_size) padding_waste = bucket_tokens - num_output_tokens # Get the input buffer for this bucket - input_buffer = self.input_buffers[bucket_grid]["pixel_values"] + input_buffer = self.input_buffers[graph_key]["pixel_values"] num_input_patches = pixel_values.shape[0] bucket_input_patches = input_buffer.shape[0] @@ -868,7 +945,7 @@ def run_padded( actual_embeds = self.vision_encoder.precompute_for_cudagraph(grid_thw) # Get embedding buffers for the bucket - embed_buffers = self.embedding_buffers[bucket_grid] + embed_buffers = self.embedding_buffers[graph_key] # Zero the buffers first (for clean padding) input_buffer.zero_() @@ -905,13 +982,13 @@ def run_padded( ) # Mark this grid as modified so run() knows to restore cached tensors - self.modified_grids.add(bucket_grid) + self.modified_grids.add(graph_key) if self.verbose: logger.info( - "run_padded(): bucket_grid=%s, actual_grid=%s, " + "run_padded(): graph_key=%s, actual_grid=%s, " "input_patches=%d, bucket_patches=%d", - bucket_grid, + graph_key, grid_thw[0], num_input_patches, bucket_input_patches, @@ -921,8 +998,8 @@ def run_padded( # Single-GPU optimized path: graph was captured on current stream, # so buffer modifications and replay are on same stream - no sync needed. # Return view directly; caller must use output before next run() call. - self.graphs[bucket_grid].replay() - full_output = self.output_buffers[bucket_grid] + self.graphs[graph_key].replay() + full_output = self.output_buffers[graph_key] trimmed_output = full_output[:num_output_tokens] else: # Multi-GPU path: graph was captured on a separate stream. @@ -930,7 +1007,7 @@ def run_padded( torch.cuda.current_stream().synchronize() # Replay the graph with updated embedding buffers - self.graphs[bucket_grid].replay() + self.graphs[graph_key].replay() # Record event after replay for lightweight sync in next call. 
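For reference, the bucket-and-trim idea behind run_padded(), reduced to plain tensors: pick the smallest captured bucket that fits, zero-pad the real data up to it, replay, then trim the output back to the actual token count. Illustrative only; `replay` stands in for the buffer copy, graph.replay() and output-buffer read, and the position/rotary embedding updates are elided.

import torch


def pad_run_trim(x: torch.Tensor, bucket_sizes: list[int], replay) -> torch.Tensor | None:
    n = x.shape[0]
    fitting = [b for b in bucket_sizes if b >= n]
    if not fitting:
        return None                     # larger than every bucket -> eager fallback
    bucket = min(fitting)               # smallest fitting bucket wastes the least padding
    padded = torch.zeros(bucket, *x.shape[1:], dtype=x.dtype, device=x.device)
    padded[:n] = x                      # real data first, zero padding behind it
    out = replay(padded)
    return out[:n]                      # drop the padded tail of the output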
if self.replay_done_event is None: @@ -941,7 +1018,7 @@ def run_padded( self.replay_done_event.synchronize() # Get output and trim to actual size - full_output = self.output_buffers[bucket_grid] + full_output = self.output_buffers[graph_key] trimmed_output = full_output[:num_output_tokens].clone() if self.verbose: @@ -955,6 +1032,130 @@ def run_padded( return trimmed_output, padding_waste + def run_batched( + self, + pixel_values: torch.Tensor, + grid_thw: list[list[int]], + batch_size: int, + ) -> torch.Tensor | None: + """ + Run the vision encoder for a batch of images with the same grid size. + + This is used for grouped batching where multiple images are processed + together with a single CUDA graph replay. + + Args: + pixel_values: Concatenated pixel values [total_patches, patch_channels] + grid_thw: List of [T, H, W] for each image (all must be same grid) + batch_size: Number of images in the batch + + Returns: + Concatenated output tensor for all images, or None if no matching graph + """ + if len(grid_thw) != batch_size: + logger.warning( + "grid_thw length (%d) doesn't match batch_size (%d)", + len(grid_thw), batch_size + ) + return None + + # All images must have the same grid + if len(grid_thw) < 1: + return None + base_grid = grid_thw[0] + for grid in grid_thw[1:]: + if grid != base_grid: + logger.warning( + "run_batched requires all images to have same grid, " + "got %s and %s", base_grid, grid + ) + return None + + # Look up the graph for this batch_size and grid + graph_key = self.get_graph_for_grid(grid_thw, batch_size=batch_size) + if graph_key is None: + return None + + # Verify input dimensions match + input_buffer = self.input_buffers[graph_key]["pixel_values"] + if pixel_values.shape != input_buffer.shape: + logger.warning( + "Pixel values shape mismatch: expected %s, got %s. " + "Falling back to eager mode.", + input_buffer.shape, + pixel_values.shape, + ) + self.eager_fallbacks += 1 + return None + + # Verify device and dtype match + if pixel_values.device != input_buffer.device: + logger.warning( + "Device mismatch: expected %s, got %s. Falling back to eager mode.", + input_buffer.device, + pixel_values.device, + ) + self.eager_fallbacks += 1 + return None + + if pixel_values.dtype != input_buffer.dtype: + logger.warning( + "Dtype mismatch: expected %s, got %s. Falling back to eager mode.", + input_buffer.dtype, + pixel_values.dtype, + ) + self.eager_fallbacks += 1 + return None + + self.cache_hits += 1 + + # Wait for any previous graph replay to complete before modifying buffers. 
+ if not self.is_single_gpu and self.replay_done_event is not None: + self.replay_done_event.synchronize() + + # Ensure contiguous memory layout for safe copy + if not pixel_values.is_contiguous(): + pixel_values = pixel_values.contiguous() + + # Copy input to the captured buffer + input_buffer.copy_(pixel_values, non_blocking=True) + + # For batched exact match, restore cached embeddings if modified + if graph_key in self.modified_grids: + embed_buffers = self.embedding_buffers[graph_key] + cached = self.cached_tensors[graph_key] + embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) + embed_buffers["rotary_pos_emb_cos"].copy_( + cached["rotary_pos_emb_cos"], non_blocking=True + ) + embed_buffers["rotary_pos_emb_sin"].copy_( + cached["rotary_pos_emb_sin"], non_blocking=True + ) + embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) + embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) + embed_buffers["sequence_lengths"].copy_( + cached["sequence_lengths"], non_blocking=True + ) + self.modified_grids.discard(graph_key) + + if self.verbose: + logger.info( + "run_batched(): graph_key=%s, batch_size=%d, input_shape=%s", + graph_key, batch_size, pixel_values.shape, + ) + + if self.is_single_gpu: + self.graphs[graph_key].replay() + return self.output_buffers[graph_key] + else: + torch.cuda.current_stream().synchronize() + self.graphs[graph_key].replay() + if self.replay_done_event is None: + self.replay_done_event = torch.cuda.Event() + self.replay_done_event.record() + self.replay_done_event.synchronize() + return self.output_buffers[graph_key].clone() + def count_miss(self) -> None: """Count when falling back to eager mode. From cd453be62e926a950409e7d3a02396ba67b1b47c Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 17:50:40 -0500 Subject: [PATCH 138/189] encoder cudagraph batching logic. --- vllm/v1/worker/gpu_model_runner.py | 194 ++++++++++++++++++++++++++++- 1 file changed, 190 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c22e6f285453..151831a09a5f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -755,6 +755,13 @@ def _init_encoder_cudagraph_manager(self) -> None: True, # Default to one-by-one for higher CUDA graph hit rate ) + # Get batch sizes for grouped batched mode + self.encoder_cudagraph_batch_sizes = getattr( + self.compilation_config, + "encoder_cudagraph_batch_sizes", + None, # Default None means legacy mode (batch_size=1 only) + ) + # Create a dedicated graph pool for encoder CUDA graphs # This keeps encoder and decoder graph memory separate for: # 1. Better memory isolation and predictability @@ -2407,16 +2414,58 @@ def _execute_mm_encoder( curr_group_outputs = curr_group_outputs_lst else: # Try to use CUDA graph if available - # When CUDA graphs are enabled and we have multiple items, - # process them one at a time since CUDA graphs only support - # single-image batches. This can be disabled via config if - # the sync overhead outweighs the CUDA graph benefits. 
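For reference, the shape contract that run_batched() (added in the patch above) enforces before replaying a grouped graph, worked through for four identical grids with spatial_merge_size=2. Illustrative only; the numbers are example values, not values from the patch.

batch_size, (t, h, w), merge = 4, (1, 32, 32), 2

patches_per_image = t * h * w                        # 1024 input patches per image
tokens_per_image = t * (h // merge) * (w // merge)   # 256 output tokens per image

total_patches = batch_size * patches_per_image       # 4096 rows in the pixel buffer
total_tokens = batch_size * tokens_per_image         # 1024 rows in the output buffer

# pixel_values handed to run_batched() must match the captured input buffer
# exactly, i.e. shape (total_patches, patch_input_channels); any mismatch in
# shape, dtype or device falls back to eager mode.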
+ # First try grouped batched mode if configured (batch_size > 1) + # Then fall back to one-by-one mode + grouped_batched_result = None if ( + self.encoder_cudagraph_manager is not None + and self.encoder_cudagraph_batch_sizes is not None + and num_items > 1 + and modality in ("image", "video") + ): + # Try grouped batched mode + if modality == "image": + batched_pixel_values = mm_kwargs_group.get("pixel_values") + grid_thw_list = mm_kwargs_group.get("image_grid_thw") + else: # video + batched_pixel_values = mm_kwargs_group.get( + "pixel_values_videos" + ) + grid_thw_list = mm_kwargs_group.get("video_grid_thw") + + if batched_pixel_values is not None and grid_thw_list is not None: + if isinstance(grid_thw_list, torch.Tensor): + grid_thw_list = grid_thw_list.tolist() + + # Find largest batch size that fits + target_batch_size = max( + bs for bs in self.encoder_cudagraph_batch_sizes + if bs <= num_items + ) if any(bs <= num_items for bs in self.encoder_cudagraph_batch_sizes) else None + + if target_batch_size is not None and target_batch_size > 1: + if self.encoder_cudagraph_verbose: + logger.info( + "Trying grouped batch: %d images, target_bs=%d", + num_items, target_batch_size + ) + grouped_batched_result = self._execute_grouped_batched_encoder( + model, + batched_pixel_values, + grid_thw_list, + modality, + target_batch_size, + ) + + if grouped_batched_result is not None: + curr_group_outputs = grouped_batched_result + elif ( self.encoder_cudagraph_manager is not None and self.encoder_cudagraph_one_by_one and num_items > 1 and modality in ("image", "video") ): + # Fall back to one-by-one processing # Process each image individually for CUDA graph support # Extract batched data and slice per-image to avoid # re-calling group_mm_kwargs_by_modality overhead @@ -2715,6 +2764,143 @@ def _execute_with_encoder_cudagraph( ) return None + def _execute_grouped_batched_encoder( + self, + model: "SupportsMultiModal", + batched_pixel_values: torch.Tensor, + grid_thw_list: list[list[int]], + modality: str, + target_batch_size: int, + ) -> list[torch.Tensor] | None: + """ + Execute encoder using grouped batched CUDA graphs. + + Groups images by grid size, pads to largest in group, and uses + batched CUDA graph for groups of target_batch_size. 
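For reference, the batch-size selection used just above when dispatching a multi-image group, as a standalone sketch: take the largest configured size that does not exceed the number of images, or give up if none fits. Illustrative only.

def pick_target_batch_size(batch_sizes: list[int], num_items: int) -> int | None:
    fitting = [bs for bs in batch_sizes if bs <= num_items]
    return max(fitting) if fitting else None


assert pick_target_batch_size([4], 6) == 4      # one group of 4; the rest go one-by-one or eager
assert pick_target_batch_size([4], 3) is None   # fewer images than any configured size
assert pick_target_batch_size([2, 4], 3) == 2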
+ + Args: + model: The multimodal model + batched_pixel_values: Concatenated pixel values for all images + grid_thw_list: List of [T, H, W] for each image + modality: "image" or "video" + target_batch_size: Target batch size for grouping (e.g., 4) + + Returns: + List of output tensors, or None if batched mode not available + """ + if self.encoder_cudagraph_manager is None: + return None + + num_images = len(grid_thw_list) + if num_images < target_batch_size: + # Not enough images for a full batch, fall back to eager + return None + + # Get spatial merge size for output token calculation + visual = getattr(model, "visual", None) + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) + + # Sort images by grid size (patch count) for efficient grouping + # Keep track of original indices for reordering output + indexed_grids = [(i, grid, grid[0] * grid[1] * grid[2]) + for i, grid in enumerate(grid_thw_list)] + sorted_grids = sorted(indexed_grids, key=lambda x: x[2]) + + # Calculate patch offsets for slicing + patch_offsets = [0] + for grid in grid_thw_list: + t, h, w = grid + patch_offsets.append(patch_offsets[-1] + t * h * w) + + outputs = [None] * num_images + processed = 0 + + # Process full batches + while processed + target_batch_size <= num_images: + # Get the next batch of images (sorted by size) + batch_indices = [sorted_grids[processed + i][0] + for i in range(target_batch_size)] + batch_grids = [grid_thw_list[i] for i in batch_indices] + + # Find the largest grid in this batch (for padding) + max_patches = max(g[0] * g[1] * g[2] for g in batch_grids) + max_grid = None + for g in batch_grids: + if g[0] * g[1] * g[2] == max_patches: + max_grid = g + break + + # Check if we have a graph for this batch_size and grid + graph_key = self.encoder_cudagraph_manager.get_graph_for_grid( + [max_grid] * target_batch_size, batch_size=target_batch_size + ) + if graph_key is None: + # No graph for this configuration, skip to eager + processed += target_batch_size + continue + + # Pad all images in batch to max_grid size + padded_pixels_list = [] + for idx in batch_indices: + start = patch_offsets[idx] + end = patch_offsets[idx + 1] + img_pixels = batched_pixel_values[start:end] + + actual_patches = img_pixels.shape[0] + if actual_patches < max_patches: + # Pad with zeros + padding = torch.zeros( + max_patches - actual_patches, + img_pixels.shape[1], + dtype=img_pixels.dtype, + device=img_pixels.device, + ) + img_pixels = torch.cat([img_pixels, padding], dim=0) + padded_pixels_list.append(img_pixels) + + # Concatenate for batched execution + batched_input = torch.cat(padded_pixels_list, dim=0) + + # Run batched CUDA graph + result = self.encoder_cudagraph_manager.run_batched( + batched_input, + [max_grid] * target_batch_size, + batch_size=target_batch_size, + ) + + if result is not None: + # Split output by image and trim to actual size + output_tokens_per_image = ( + max_grid[0] + * (max_grid[1] // spatial_merge_size) + * (max_grid[2] // spatial_merge_size) + ) + for i, idx in enumerate(batch_indices): + actual_grid = grid_thw_list[idx] + actual_tokens = ( + actual_grid[0] + * (actual_grid[1] // spatial_merge_size) + * (actual_grid[2] // spatial_merge_size) + ) + start = i * output_tokens_per_image + # Trim to actual output size + outputs[idx] = result[start:start + actual_tokens].clone() + + if self.encoder_cudagraph_verbose: + logger.info( + "Grouped batch: batch_size=%d, max_grid=%s, processed=%d", + target_batch_size, max_grid, target_batch_size + ) + + processed += target_batch_size + + 
# Check if all images were processed + if any(o is None for o in outputs): + # Some images not processed, fall back to full eager + return None + + return outputs + def _find_nearest_encoder_capture_size(self, num_tokens: int) -> int | None: """Find the smallest capture size >= num_tokens for piecewise mode. From a13707b5cb7a8179e06d773378cd3133a36a3bb3 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 19:17:54 -0500 Subject: [PATCH 139/189] batch with padding at the end. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 192 +++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 80952a514a55..9c42426d65c4 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -1156,6 +1156,198 @@ def run_batched( self.replay_done_event.synchronize() return self.output_buffers[graph_key].clone() + def run_batched_contiguous( + self, + pixel_values: torch.Tensor, + grid_thw_list: list[list[int]], + graph_key: tuple[int, int, int, int], + spatial_merge_size: int = 2, + ) -> torch.Tensor | None: + """ + Run batched CUDA graph with contiguous packing and end padding. + + This method packs images contiguously in the buffer (no interleaved padding), + computes actual cu_seqlens at runtime, and pads only at the end. This ensures + flash attention reads correct data for each sequence. + + Memory layout: + Buffer: [img0][img1][img2][img3][PADDING at end] + cu_seqlens: [0, size0, size0+size1, ..., total_actual] + + Flash attention uses cu_seqlens to process only actual tokens; padding at + the end is outside all sequence boundaries and is ignored. + + Args: + pixel_values: Contiguously packed pixel values (no padding between images) + grid_thw_list: List of [T, H, W] for each image (can be different grids) + graph_key: The bucket graph key (batch_size, t, h, w) to use + spatial_merge_size: Spatial merge size (default 2) + + Returns: + Full output tensor from the bucket, or None if failed. + Caller should use cu_seqlens to extract per-image outputs. 
+ """ + if graph_key not in self.graphs: + logger.debug("No graph for key %s", graph_key) + return None + + batch_size = graph_key[0] + if len(grid_thw_list) != batch_size: + logger.warning( + "grid_thw_list length (%d) doesn't match graph batch_size (%d)", + len(grid_thw_list), batch_size + ) + return None + + # Check if vision encoder is available for embedding computation + if self.vision_encoder is None or not hasattr( + self.vision_encoder, "precompute_for_cudagraph" + ): + logger.debug("Vision encoder not available for batched contiguous mode") + return None + + # Check if we have embedding buffers for this bucket + if graph_key not in self.embedding_buffers: + logger.debug("No embedding buffers for bucket %s", graph_key) + return None + + # Get the input buffer for this bucket + input_buffer = self.input_buffers[graph_key]["pixel_values"] + actual_input_patches = pixel_values.shape[0] + bucket_input_patches = input_buffer.shape[0] + + if actual_input_patches > bucket_input_patches: + logger.warning( + "Input patches (%d) exceed bucket capacity (%d).", + actual_input_patches, bucket_input_patches, + ) + self.eager_fallbacks += 1 + return None + + # Verify device and dtype match + if pixel_values.device != input_buffer.device: + logger.warning( + "Device mismatch: expected %s, got %s.", + input_buffer.device, pixel_values.device, + ) + self.eager_fallbacks += 1 + return None + + if pixel_values.dtype != input_buffer.dtype: + logger.warning( + "Dtype mismatch: expected %s, got %s.", + input_buffer.dtype, pixel_values.dtype, + ) + self.eager_fallbacks += 1 + return None + + # Ensure contiguous memory layout + if not pixel_values.is_contiguous(): + pixel_values = pixel_values.contiguous() + + self.cache_hits += 1 + + # Wait for any previous graph replay to complete + if not self.is_single_gpu and self.replay_done_event is not None: + self.replay_done_event.synchronize() + + # Get embedding buffers for the bucket + embed_buffers = self.embedding_buffers[graph_key] + + # Zero the buffers first (for clean padding at end) + input_buffer.zero_() + embed_buffers["pos_embeds"].zero_() + embed_buffers["rotary_pos_emb_cos"].zero_() + embed_buffers["rotary_pos_emb_sin"].zero_() + + # Copy actual pixel values to the beginning of the buffer (contiguous) + input_buffer[:actual_input_patches].copy_(pixel_values, non_blocking=True) + + # Compute embeddings for each actual grid and pack contiguously + # Also build cu_seqlens from actual cumulative sizes + pos_embeds_list = [] + rotary_cos_list = [] + rotary_sin_list = [] + sequence_lengths = [] + + for grid in grid_thw_list: + # Compute embeddings for this actual grid + actual_embeds = self.vision_encoder.precompute_for_cudagraph([grid]) + pos_embeds_list.append(actual_embeds["pos_embeds"]) + rotary_cos_list.append(actual_embeds["rotary_pos_emb_cos"]) + rotary_sin_list.append(actual_embeds["rotary_pos_emb_sin"]) + # Output tokens for this image + t, h, w = grid + output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) + sequence_lengths.append(output_tokens) + + # Concatenate embeddings (contiguous packing) + packed_pos_embeds = torch.cat(pos_embeds_list, dim=0) + packed_rotary_cos = torch.cat(rotary_cos_list, dim=0) + packed_rotary_sin = torch.cat(rotary_sin_list, dim=0) + + # Copy packed embeddings to buffer (padding remains zero at end) + actual_output_tokens = packed_pos_embeds.shape[0] + embed_buffers["pos_embeds"][:actual_output_tokens].copy_( + packed_pos_embeds, non_blocking=True + ) + 
embed_buffers["rotary_pos_emb_cos"][:actual_output_tokens].copy_( + packed_rotary_cos, non_blocking=True + ) + embed_buffers["rotary_pos_emb_sin"][:actual_output_tokens].copy_( + packed_rotary_sin, non_blocking=True + ) + + # Build cu_seqlens from actual cumulative sizes + # cu_seqlens = [0, size0, size0+size1, ..., total] + cu_seqlens_list = [0] + for length in sequence_lengths: + cu_seqlens_list.append(cu_seqlens_list[-1] + length) + + cu_seqlens_tensor = torch.tensor( + cu_seqlens_list, dtype=torch.int32, device=self.device + ) + max_seqlen = max(sequence_lengths) + max_seqlen_tensor = torch.tensor( + max_seqlen, dtype=torch.int32, device="cpu" + ) + sequence_lengths_tensor = torch.tensor( + sequence_lengths, dtype=torch.int32, device=self.device + ) + + # Update cu_seqlens buffer - need to handle size mismatch + # The captured buffer may be larger, so we update only the actual part + embed_buffers["cu_seqlens"][:len(cu_seqlens_list)].copy_( + cu_seqlens_tensor, non_blocking=True + ) + embed_buffers["max_seqlen"].copy_(max_seqlen_tensor, non_blocking=True) + embed_buffers["sequence_lengths"][:batch_size].copy_( + sequence_lengths_tensor, non_blocking=True + ) + + # Mark this grid as modified so run() knows to restore cached tensors + self.modified_grids.add(graph_key) + + if self.verbose: + logger.info( + "run_batched_contiguous(): graph_key=%s, grids=%s, " + "actual_patches=%d, bucket_patches=%d, cu_seqlens=%s", + graph_key, grid_thw_list, actual_input_patches, + bucket_input_patches, cu_seqlens_list, + ) + + if self.is_single_gpu: + self.graphs[graph_key].replay() + return self.output_buffers[graph_key] + else: + torch.cuda.current_stream().synchronize() + self.graphs[graph_key].replay() + if self.replay_done_event is None: + self.replay_done_event = torch.cuda.Event() + self.replay_done_event.record() + self.replay_done_event.synchronize() + return self.output_buffers[graph_key].clone() + def count_miss(self) -> None: """Count when falling back to eager mode. From e4524025f7b4ed772ad117bbd5710434919a6700 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 19:18:45 -0500 Subject: [PATCH 140/189] use batching with padding at the end. --- vllm/v1/worker/gpu_model_runner.py | 104 +++++++++++++---------------- 1 file changed, 47 insertions(+), 57 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 151831a09a5f..f8f394a58e5d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2773,10 +2773,17 @@ def _execute_grouped_batched_encoder( target_batch_size: int, ) -> list[torch.Tensor] | None: """ - Execute encoder using grouped batched CUDA graphs. + Execute encoder using grouped batched CUDA graphs with contiguous packing. - Groups images by grid size, pads to largest in group, and uses - batched CUDA graph for groups of target_batch_size. + Groups images by output token count, packs them contiguously (no interleaved + padding), and uses a bucket-based CUDA graph for groups of target_batch_size. + + Memory layout (contiguous packing): + Buffer: [img0][img1][img2][img3][PADDING at end] + cu_seqlens: [0, size0, size0+size1, ..., total_actual] + + This ensures flash attention reads correct data for each sequence, as + cu_seqlens reflects actual boundaries. Padding at end is ignored. 
Args: model: The multimodal model @@ -2793,20 +2800,24 @@ def _execute_grouped_batched_encoder( num_images = len(grid_thw_list) if num_images < target_batch_size: - # Not enough images for a full batch, fall back to eager + # Not enough images for a full batch, fall back to other modes return None # Get spatial merge size for output token calculation visual = getattr(model, "visual", None) spatial_merge_size = getattr(visual, "spatial_merge_size", 2) - # Sort images by grid size (patch count) for efficient grouping + def compute_output_tokens(grid: list[int]) -> int: + t, h, w = grid + return t * (h // spatial_merge_size) * (w // spatial_merge_size) + + # Sort images by output token count for efficient grouping # Keep track of original indices for reordering output - indexed_grids = [(i, grid, grid[0] * grid[1] * grid[2]) + indexed_grids = [(i, grid, compute_output_tokens(grid)) for i, grid in enumerate(grid_thw_list)] sorted_grids = sorted(indexed_grids, key=lambda x: x[2]) - # Calculate patch offsets for slicing + # Calculate patch offsets for slicing pixel_values patch_offsets = [0] for grid in grid_thw_list: t, h, w = grid @@ -2822,81 +2833,60 @@ def _execute_grouped_batched_encoder( for i in range(target_batch_size)] batch_grids = [grid_thw_list[i] for i in batch_indices] - # Find the largest grid in this batch (for padding) - max_patches = max(g[0] * g[1] * g[2] for g in batch_grids) - max_grid = None - for g in batch_grids: - if g[0] * g[1] * g[2] == max_patches: - max_grid = g - break + # Calculate max output tokens needed in this batch + max_output_tokens = max(compute_output_tokens(g) for g in batch_grids) - # Check if we have a graph for this batch_size and grid - graph_key = self.encoder_cudagraph_manager.get_graph_for_grid( - [max_grid] * target_batch_size, batch_size=target_batch_size + # Find a bucket that can fit max_output_tokens for this batch_size + graph_key = self.encoder_cudagraph_manager.find_bucket_for_tokens( + max_output_tokens, spatial_merge_size, batch_size=target_batch_size ) if graph_key is None: - # No graph for this configuration, skip to eager + # No suitable bucket, skip this batch processed += target_batch_size continue - # Pad all images in batch to max_grid size - padded_pixels_list = [] + # Pack pixel values contiguously (no interleaved padding) + pixels_list = [] for idx in batch_indices: start = patch_offsets[idx] end = patch_offsets[idx + 1] img_pixels = batched_pixel_values[start:end] + pixels_list.append(img_pixels) - actual_patches = img_pixels.shape[0] - if actual_patches < max_patches: - # Pad with zeros - padding = torch.zeros( - max_patches - actual_patches, - img_pixels.shape[1], - dtype=img_pixels.dtype, - device=img_pixels.device, - ) - img_pixels = torch.cat([img_pixels, padding], dim=0) - padded_pixels_list.append(img_pixels) - - # Concatenate for batched execution - batched_input = torch.cat(padded_pixels_list, dim=0) + # Concatenate contiguously - NO padding between images + contiguous_pixels = torch.cat(pixels_list, dim=0) - # Run batched CUDA graph - result = self.encoder_cudagraph_manager.run_batched( - batched_input, - [max_grid] * target_batch_size, - batch_size=target_batch_size, + # Run batched CUDA graph with contiguous packing + result = self.encoder_cudagraph_manager.run_batched_contiguous( + contiguous_pixels, + batch_grids, + graph_key, + spatial_merge_size=spatial_merge_size, ) if result is not None: - # Split output by image and trim to actual size - output_tokens_per_image = ( - max_grid[0] - * (max_grid[1] // 
spatial_merge_size) - * (max_grid[2] // spatial_merge_size) - ) + # Extract outputs using cumulative sizes (contiguous layout) + # cu_seqlens = [0, size0, size0+size1, ..., total] + output_offset = 0 for i, idx in enumerate(batch_indices): - actual_grid = grid_thw_list[idx] - actual_tokens = ( - actual_grid[0] - * (actual_grid[1] // spatial_merge_size) - * (actual_grid[2] // spatial_merge_size) - ) - start = i * output_tokens_per_image - # Trim to actual output size - outputs[idx] = result[start:start + actual_tokens].clone() + actual_tokens = compute_output_tokens(batch_grids[i]) + outputs[idx] = result[ + output_offset:output_offset + actual_tokens + ].clone() + output_offset += actual_tokens if self.encoder_cudagraph_verbose: logger.info( - "Grouped batch: batch_size=%d, max_grid=%s, processed=%d", - target_batch_size, max_grid, target_batch_size + "Grouped batch (contiguous): batch_size=%d, " + "grids=%s, graph_key=%s", + target_batch_size, batch_grids, graph_key ) processed += target_batch_size # Check if all images were processed if any(o is None for o in outputs): - # Some images not processed, fall back to full eager + # Some images not processed, fall back to other modes return None return outputs From f93a3d64d23014aa09e834d9b71900e3e279530d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 19:19:49 -0500 Subject: [PATCH 141/189] track buffer modified by padding. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 9c42426d65c4..0381914a8ad7 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -221,9 +221,10 @@ def __init__( "(TP=1, PP=1, DP=1), using optimized sync scheme" ) - # Track which grids have had their embedding buffers modified by run_padded(). - # This allows run() to skip restoring cached tensors when not needed. - self.modified_grids: set[tuple[int, int, int]] = set() + # Track which grids have had their embedding buffers modified by run_padded() + # or run_batched_contiguous(). This allows run() to skip restoring cached + # tensors when not needed. Keys are (batch_size, t, h, w). + self.modified_grids: set[tuple[int, int, int, int]] = set() def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: """Get encoder grid configurations from config or use defaults.""" From e06be3866f9e2e006b0f8cd43d2909242d380a66 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 21:06:17 -0500 Subject: [PATCH 142/189] fix max() over empty list. 
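Note on the contiguous-packing scheme from PATCH 140 above: images are packed back to back with padding only at the tail, cu_seqlens is rebuilt from the actual per-image sizes, and outputs are sliced back out by cumulative offsets. A minimal self-contained sketch of that layout, using hypothetical helper names and plain torch rather than the patch's actual code:

import torch


def pack_contiguous(
    images: list[torch.Tensor], bucket_size: int
) -> tuple[torch.Tensor, torch.Tensor, list[int]]:
    # Pack variable-length per-image patch tensors back to back; padding sits
    # only at the end of the buffer, never between images.
    feat_dim = images[0].shape[1]
    buf = images[0].new_zeros((bucket_size, feat_dim))
    lengths: list[int] = []
    offset = 0
    for img in images:
        n = img.shape[0]
        buf[offset : offset + n] = img
        offset += n
        lengths.append(n)
    # cu_seqlens records only the real boundaries, so attention over the packed
    # buffer never mixes tokens from different images or reads the tail padding.
    cu = [0]
    for n in lengths:
        cu.append(cu[-1] + n)
    return buf, torch.tensor(cu, dtype=torch.int32), lengths


def split_outputs(out: torch.Tensor, lengths: list[int]) -> list[torch.Tensor]:
    # Recover per-image outputs by cumulative offsets; the tail padding is dropped.
    results: list[torch.Tensor] = []
    offset = 0
    for n in lengths:
        results.append(out[offset : offset + n].clone())
        offset += n
    return results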
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 0381914a8ad7..432743f07ddb 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -869,17 +869,15 @@ def run_padded( ) if graph_key is None: # Don't count miss here - caller will count it when falling back to eager - max_available = ( - max( - self._compute_output_tokens((t, h, w), spatial_merge_size) - for (bs, t, h, w) in self.graphs - if bs == 1 # Only consider batch_size=1 graphs - ) - if self.graphs - else 0 - ) + # Calculate max available tokens from batch_size=1 graphs (if any) + bs1_tokens = [ + self._compute_output_tokens((t, h, w), spatial_merge_size) + for (bs, t, h, w) in self.graphs + if bs == 1 + ] + max_available = max(bs1_tokens) if bs1_tokens else 0 logger.debug( - "No bucket found for %d tokens, max available: %d", + "No bucket found for %d tokens (batch_size=1), max available: %d", num_output_tokens, max_available, ) From f1db17ea89380ca9126279cb7f4efe8d2153aa16 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 22:47:44 -0500 Subject: [PATCH 143/189] format. --- vllm/compilation/backends.py | 7 +- vllm/compilation/piecewise_backend.py | 36 +++-- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 56 ++++--- vllm/v1/worker/gpu_model_runner.py | 164 +++++++++++++-------- 4 files changed, 163 insertions(+), 100 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index d690c61ad56e..c79dce86224f 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -417,11 +417,8 @@ def call_module( # backend entirely to avoid shape tracking issues. The encoder will # use torch.compile directly and EncoderCudaGraphManager handles # full cudagraph capture separately. 
- encoder_skip_piecewise = ( - self.vllm_backend.is_encoder - and not getattr( - self.compilation_config, "encoder_cudagraph_piecewise", False - ) + encoder_skip_piecewise = self.vllm_backend.is_encoder and not getattr( + self.compilation_config, "encoder_cudagraph_piecewise", False ) if encoder_skip_piecewise: diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 754698b72f74..36a5528d34ad 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -74,17 +74,23 @@ def __init__( # Use encoder-specific capture sizes for encoder compilation if self.is_encoder_compilation: - encoder_capture_sizes = self.compilation_config.encoder_cudagraph_capture_sizes + encoder_capture_sizes = ( + self.compilation_config.encoder_cudagraph_capture_sizes + ) if encoder_capture_sizes is not None: # Convert from output tokens to input patches # encoder_cudagraph_capture_sizes is specified in output tokens # but runtime_shape (from sym_shape_indices) is in input patches - merge_size_sq = self.compilation_config.encoder_spatial_merge_size ** 2 - self.compile_sizes = [size * merge_size_sq for size in encoder_capture_sizes] + merge_size_sq = self.compilation_config.encoder_spatial_merge_size**2 + self.compile_sizes = [ + size * merge_size_sq for size in encoder_capture_sizes + ] logger.debug_once( "PiecewiseBackend: converted encoder capture sizes from " "output tokens %s to input patches %s (merge_size²=%d)", - tuple(encoder_capture_sizes), tuple(self.compile_sizes), merge_size_sq + tuple(encoder_capture_sizes), + tuple(self.compile_sizes), + merge_size_sq, ) else: self.compile_sizes = None @@ -169,11 +175,7 @@ def _maybe_compile_for_range_entry( # fakify for range, real args for concrete size. # For concrete size, we clear the shape env in # compiler_manager.compile() so no need to fakify. 
- args_list = ( - self._fakify_args(args) - if not is_exact_size - else list(args) - ) + args_list = self._fakify_args(args) if not is_exact_size else list(args) range_entry.runnable = self.vllm_backend.compiler_manager.compile( self.graph, args_list, @@ -193,7 +195,8 @@ def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None: if self.compile_sizes is None: logger.debug( "PIECEWISE: compile_sizes is None, shape=%d, is_encoder=%s", - runtime_shape, self.is_encoder_compilation + runtime_shape, + self.is_encoder_compilation, ) return None @@ -201,7 +204,8 @@ def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None: # Exact match with capture size - will use cudagraph logger.debug( "PIECEWISE: exact match shape=%d in compile_sizes, is_encoder=%s", - runtime_shape, self.is_encoder_compilation + runtime_shape, + self.is_encoder_compilation, ) return self.range_entries[Range(start=runtime_shape, end=runtime_shape)] else: @@ -211,15 +215,19 @@ def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None: logger.debug( "PIECEWISE: shape=%d not in compile_sizes, " "using compile_range=%s (NO CUDAGRAPH), is_encoder=%s", - runtime_shape, range, self.is_encoder_compilation + runtime_shape, + range, + self.is_encoder_compilation, ) return self.range_entries[range] # Shape not in any range - will cause assertion error logger.warning( "PIECEWISE: shape=%d not in compile_sizes=%s or " "compile_ranges=%s, is_encoder=%s", - runtime_shape, self.compile_sizes, self.compile_ranges, - self.is_encoder_compilation + runtime_shape, + self.compile_sizes, + self.compile_ranges, + self.is_encoder_compilation, ) return None diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 432743f07ddb..2ebdec584bd7 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -181,11 +181,15 @@ def __init__( # Cached pre-computed tensors for CUDA graph replay (exact match mode) # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary embeddings, etc. 
- self.cached_tensors: dict[tuple[int, int, int, int], dict[str, torch.Tensor]] = {} + self.cached_tensors: dict[ + tuple[int, int, int, int], dict[str, torch.Tensor] + ] = {} # Input buffers for embeddings (padded mode with runtime computation) # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary_cos/sin, cu_seqlens - self.embedding_buffers: dict[tuple[int, int, int, int], dict[str, torch.Tensor]] = {} + self.embedding_buffers: dict[ + tuple[int, int, int, int], dict[str, torch.Tensor] + ] = {} # Store metadata about captured graphs self.captured_metadata: dict[tuple[int, int, int, int], dict[str, Any]] = {} @@ -301,9 +305,7 @@ def _get_batch_sizes_from_config(self) -> list[int]: if compilation_config is None: return [1] - batch_sizes = getattr( - compilation_config, "encoder_cudagraph_batch_sizes", None - ) + batch_sizes = getattr(compilation_config, "encoder_cudagraph_batch_sizes", None) if batch_sizes is None: return [1] # Legacy mode: batch_size=1 only return sorted(batch_sizes) @@ -415,7 +417,9 @@ def capture_graph_for_grid( graph_key = (batch_size, t, h, w) logger.debug( "Capturing encoder CUDA graph for key %s (batch_size=%d, grid=%s)", - graph_key, batch_size, grid_config + graph_key, + batch_size, + grid_config, ) # Prepare dummy inputs for batch @@ -454,8 +458,7 @@ def capture_graph_for_grid( cached = vision_encoder.precompute_for_cudagraph(grid_thw) self.cached_tensors[graph_key] = cached logger.debug( - "Pre-computed cached tensors for key %s: " - "pos_embeds=%s, cu_seqlens=%s", + "Pre-computed cached tensors for key %s: pos_embeds=%s, cu_seqlens=%s", graph_key, cached["pos_embeds"].shape, cached["cu_seqlens"].shape, @@ -1054,7 +1057,8 @@ def run_batched( if len(grid_thw) != batch_size: logger.warning( "grid_thw length (%d) doesn't match batch_size (%d)", - len(grid_thw), batch_size + len(grid_thw), + batch_size, ) return None @@ -1065,8 +1069,9 @@ def run_batched( for grid in grid_thw[1:]: if grid != base_grid: logger.warning( - "run_batched requires all images to have same grid, " - "got %s and %s", base_grid, grid + "run_batched requires all images to have same grid, got %s and %s", + base_grid, + grid, ) return None @@ -1140,7 +1145,9 @@ def run_batched( if self.verbose: logger.info( "run_batched(): graph_key=%s, batch_size=%d, input_shape=%s", - graph_key, batch_size, pixel_values.shape, + graph_key, + batch_size, + pixel_values.shape, ) if self.is_single_gpu: @@ -1194,7 +1201,8 @@ def run_batched_contiguous( if len(grid_thw_list) != batch_size: logger.warning( "grid_thw_list length (%d) doesn't match graph batch_size (%d)", - len(grid_thw_list), batch_size + len(grid_thw_list), + batch_size, ) return None @@ -1218,7 +1226,8 @@ def run_batched_contiguous( if actual_input_patches > bucket_input_patches: logger.warning( "Input patches (%d) exceed bucket capacity (%d).", - actual_input_patches, bucket_input_patches, + actual_input_patches, + bucket_input_patches, ) self.eager_fallbacks += 1 return None @@ -1227,7 +1236,8 @@ def run_batched_contiguous( if pixel_values.device != input_buffer.device: logger.warning( "Device mismatch: expected %s, got %s.", - input_buffer.device, pixel_values.device, + input_buffer.device, + pixel_values.device, ) self.eager_fallbacks += 1 return None @@ -1235,7 +1245,8 @@ def run_batched_contiguous( if pixel_values.dtype != input_buffer.dtype: logger.warning( "Dtype mismatch: expected %s, got %s.", - input_buffer.dtype, pixel_values.dtype, + input_buffer.dtype, + pixel_values.dtype, ) self.eager_fallbacks += 1 return None @@ 
-1307,16 +1318,14 @@ def run_batched_contiguous( cu_seqlens_list, dtype=torch.int32, device=self.device ) max_seqlen = max(sequence_lengths) - max_seqlen_tensor = torch.tensor( - max_seqlen, dtype=torch.int32, device="cpu" - ) + max_seqlen_tensor = torch.tensor(max_seqlen, dtype=torch.int32, device="cpu") sequence_lengths_tensor = torch.tensor( sequence_lengths, dtype=torch.int32, device=self.device ) # Update cu_seqlens buffer - need to handle size mismatch # The captured buffer may be larger, so we update only the actual part - embed_buffers["cu_seqlens"][:len(cu_seqlens_list)].copy_( + embed_buffers["cu_seqlens"][: len(cu_seqlens_list)].copy_( cu_seqlens_tensor, non_blocking=True ) embed_buffers["max_seqlen"].copy_(max_seqlen_tensor, non_blocking=True) @@ -1331,8 +1340,11 @@ def run_batched_contiguous( logger.info( "run_batched_contiguous(): graph_key=%s, grids=%s, " "actual_patches=%d, bucket_patches=%d, cu_seqlens=%s", - graph_key, grid_thw_list, actual_input_patches, - bucket_input_patches, cu_seqlens_list, + graph_key, + grid_thw_list, + actual_input_patches, + bucket_input_patches, + cu_seqlens_list, ) if self.is_single_gpu: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f8f394a58e5d..2f361f4e0aa7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4,7 +4,6 @@ import functools import gc import itertools -import math import time from collections import defaultdict from collections.abc import Iterator, Sequence @@ -2438,23 +2437,34 @@ def _execute_mm_encoder( grid_thw_list = grid_thw_list.tolist() # Find largest batch size that fits - target_batch_size = max( - bs for bs in self.encoder_cudagraph_batch_sizes - if bs <= num_items - ) if any(bs <= num_items for bs in self.encoder_cudagraph_batch_sizes) else None + target_batch_size = ( + max( + bs + for bs in self.encoder_cudagraph_batch_sizes + if bs <= num_items + ) + if any( + bs <= num_items + for bs in self.encoder_cudagraph_batch_sizes + ) + else None + ) if target_batch_size is not None and target_batch_size > 1: if self.encoder_cudagraph_verbose: logger.info( "Trying grouped batch: %d images, target_bs=%d", - num_items, target_batch_size + num_items, + target_batch_size, + ) + grouped_batched_result = ( + self._execute_grouped_batched_encoder( + model, + batched_pixel_values, + grid_thw_list, + modality, + target_batch_size, ) - grouped_batched_result = self._execute_grouped_batched_encoder( - model, - batched_pixel_values, - grid_thw_list, - modality, - target_batch_size, ) if grouped_batched_result is not None: @@ -2575,12 +2585,20 @@ def _execute_mm_encoder( "ViT: cudagraph_result=None, piecewise_enabled=%s " "(piecewise=%s, padded_mode=%s)", piecewise_enabled, - getattr(self.compilation_config, - "encoder_cudagraph_piecewise", False) - if self.compilation_config else None, - getattr(self.compilation_config, - "encoder_cudagraph_padded_mode", True) - if self.compilation_config else None, + getattr( + self.compilation_config, + "encoder_cudagraph_piecewise", + False, + ) + if self.compilation_config + else None, + getattr( + self.compilation_config, + "encoder_cudagraph_padded_mode", + True, + ) + if self.compilation_config + else None, ) if piecewise_enabled: @@ -2813,8 +2831,10 @@ def compute_output_tokens(grid: list[int]) -> int: # Sort images by output token count for efficient grouping # Keep track of original indices for reordering output - indexed_grids = [(i, grid, compute_output_tokens(grid)) - for i, grid in 
enumerate(grid_thw_list)] + indexed_grids = [ + (i, grid, compute_output_tokens(grid)) + for i, grid in enumerate(grid_thw_list) + ] sorted_grids = sorted(indexed_grids, key=lambda x: x[2]) # Calculate patch offsets for slicing pixel_values @@ -2829,8 +2849,9 @@ def compute_output_tokens(grid: list[int]) -> int: # Process full batches while processed + target_batch_size <= num_images: # Get the next batch of images (sorted by size) - batch_indices = [sorted_grids[processed + i][0] - for i in range(target_batch_size)] + batch_indices = [ + sorted_grids[processed + i][0] for i in range(target_batch_size) + ] batch_grids = [grid_thw_list[i] for i in batch_indices] # Calculate max output tokens needed in this batch @@ -2871,7 +2892,7 @@ def compute_output_tokens(grid: list[int]) -> int: for i, idx in enumerate(batch_indices): actual_tokens = compute_output_tokens(batch_grids[i]) outputs[idx] = result[ - output_offset:output_offset + actual_tokens + output_offset : output_offset + actual_tokens ].clone() output_offset += actual_tokens @@ -2879,7 +2900,9 @@ def compute_output_tokens(grid: list[int]) -> int: logger.info( "Grouped batch (contiguous): batch_size=%d, " "grids=%s, graph_key=%s", - target_batch_size, batch_grids, graph_key + target_batch_size, + batch_grids, + graph_key, ) processed += target_batch_size @@ -2929,7 +2952,7 @@ def _init_piecewise_stats(cls): "total_actual_tokens": 0, "total_padded_tokens": 0, "capture_size_hits": {}, # capture_size -> count - "fallback_reasons": {}, # reason -> count + "fallback_reasons": {}, # reason -> count } def _record_piecewise_fallback(self, reason: str): @@ -2963,11 +2986,12 @@ def get_piecewise_stats_summary(cls) -> str: total_padded = stats["total_padded_tokens"] waste_pct = ( (total_padded - total_actual) / total_padded * 100 - if total_padded > 0 else 0 + if total_padded > 0 + else 0 ) lines = [ - f"Piecewise padded stats:", + "Piecewise padded stats:", f" Calls: {stats['calls']}, Executions: {stats['executions']}", f" Total actual tokens: {total_actual}", f" Total padded tokens: {total_padded}", @@ -3002,8 +3026,10 @@ def _execute_encoder_piecewise_padded( List of encoder outputs if padding was applied, None otherwise """ if self.encoder_cudagraph_verbose: - logger.info("ViT PIECEWISE: _execute_encoder_piecewise_padded called, " - "modality=%s", modality) + logger.info( + "ViT PIECEWISE: _execute_encoder_piecewise_padded called, modality=%s", + modality, + ) # Only support image/video modalities if modality not in ("image", "video"): @@ -3107,22 +3133,22 @@ def _execute_encoder_piecewise_padded( (1001,), dtype=cu_seqlens.dtype, device=cu_seqlens.device ), "sequence_lengths": torch.zeros( - (1000,), dtype=sequence_lengths.dtype, - device=sequence_lengths.device + (1000,), + dtype=sequence_lengths.dtype, + device=sequence_lengths.device, ), } self._piecewise_buffers[capture_size] = buffers if self.encoder_cudagraph_verbose: logger.info( - "ViT PIECEWISE: Allocated buffers for capture_size=%d " - "(patches=%d)", - capture_size, padded_num_patches + "ViT PIECEWISE: Allocated buffers for capture_size=%d (patches=%d)", + capture_size, + padded_num_patches, ) # Copy data into pre-allocated buffers (no allocation, no zeros kernel) padded_pixel_values = buffers["pixel_values"] - padded_pixel_values[:num_input_patches].copy_( - pixel_values.type(visual.dtype)) + padded_pixel_values[:num_input_patches].copy_(pixel_values.type(visual.dtype)) padded_pos_embeds = buffers["pos_embeds"] padded_pos_embeds[:num_input_patches].copy_(pos_embeds) @@ -3155,8 
+3181,9 @@ def _execute_encoder_piecewise_padded( # Update max_seqlen if padding sequence is larger if padding_patches > max_seqlen.item(): - max_seqlen = torch.tensor(padding_patches, dtype=max_seqlen.dtype, - device=max_seqlen.device) + max_seqlen = torch.tensor( + padding_patches, dtype=max_seqlen.dtype, device=max_seqlen.device + ) # Call forward_piecewise directly with pre-computed and padded tensors # Enable CUDA graph capture/replay by setting the proper forward context @@ -3178,7 +3205,7 @@ def _execute_encoder_piecewise_padded( ) # Split output by actual token counts for each image (exclude padding) - merge_size_sq = spatial_merge_size ** 2 + merge_size_sq = spatial_merge_size**2 sizes = [t * h * w // merge_size_sq for t, h, w in grid_thw_list] real_outputs = list(encoder_output[:actual_output_tokens].split(sizes)) @@ -3190,8 +3217,10 @@ def _execute_encoder_piecewise_padded( stats = self._piecewise_stats total_waste_pct = ( (stats["total_padded_tokens"] - stats["total_actual_tokens"]) - / stats["total_padded_tokens"] * 100 - if stats["total_padded_tokens"] > 0 else 0 + / stats["total_padded_tokens"] + * 100 + if stats["total_padded_tokens"] > 0 + else 0 ) logger.info( "ViT PIECEWISE PADDED: actual=%d, capture_size=%d, " @@ -3236,12 +3265,12 @@ def warmup_encoder_piecewise(self) -> None: return spatial_merge_size = getattr(visual, "spatial_merge_size", 2) - merge_size_sq = spatial_merge_size ** 2 + merge_size_sq = spatial_merge_size**2 # Convert capture_sizes to patches capture_sizes_patches = sorted( [size * merge_size_sq for size in capture_sizes], - reverse=True # Largest first like LM + reverse=True, # Largest first like LM ) # Helper to create dummy inputs for a given num_patches @@ -3255,8 +3284,9 @@ def create_dummy_inputs(num_patches: int): raw_in_channels = getattr(proj, "in_channels", 3) else: raw_in_channels = 3 - input_channels = (raw_in_channels * temporal_patch_size - * patch_size * patch_size) + input_channels = ( + raw_in_channels * temporal_patch_size * patch_size * patch_size + ) else: input_channels = 3 * 2 * 14 * 14 @@ -3266,8 +3296,9 @@ def create_dummy_inputs(num_patches: int): device=self.device, ) - hidden_size = getattr(visual, "hidden_size", - getattr(visual, "embed_dim", 1152)) + hidden_size = getattr( + visual, "hidden_size", getattr(visual, "embed_dim", 1152) + ) pos_embeds = torch.zeros( (num_patches, hidden_size), @@ -3295,12 +3326,26 @@ def create_dummy_inputs(num_patches: int): [num_patches], dtype=torch.int32, device=self.device ) - return (pixel_values, pos_embeds, rotary_cos, rotary_sin, - cu_seqlens, max_seqlen, sequence_lengths) + return ( + pixel_values, + pos_embeds, + rotary_cos, + rotary_sin, + cu_seqlens, + max_seqlen, + sequence_lengths, + ) def run_forward(num_patches: int): - (pixel_values, pos_embeds, rotary_cos, rotary_sin, - cu_seqlens, max_seqlen, sequence_lengths) = create_dummy_inputs(num_patches) + ( + pixel_values, + pos_embeds, + rotary_cos, + rotary_sin, + cu_seqlens, + max_seqlen, + sequence_lengths, + ) = create_dummy_inputs(num_patches) with set_forward_context(None, self.vllm_config): _ = visual.forward_piecewise( @@ -3349,17 +3394,16 @@ def _capture_encoder_piecewise_cudagraphs(self) -> None: return spatial_merge_size = getattr(visual, "spatial_merge_size", 2) - merge_size_sq = spatial_merge_size ** 2 + merge_size_sq = spatial_merge_size**2 # Convert capture_sizes to patches, largest first capture_sizes_patches = sorted( - [size * merge_size_sq for size in capture_sizes], - reverse=True + [size * merge_size_sq for size 
in capture_sizes], reverse=True ) logger.info( "Capturing encoder piecewise CUDA graphs for %d sizes", - len(capture_sizes_patches) + len(capture_sizes_patches), ) for num_patches in capture_sizes_patches: @@ -3370,8 +3414,9 @@ def _capture_encoder_piecewise_cudagraphs(self) -> None: patch_size = getattr(patch_embed, "patch_size", 14) proj = getattr(patch_embed, "proj", None) raw_in_channels = getattr(proj, "in_channels", 3) if proj else 3 - input_channels = (raw_in_channels * temporal_patch_size - * patch_size * patch_size) + input_channels = ( + raw_in_channels * temporal_patch_size * patch_size * patch_size + ) else: input_channels = 3 * 2 * 14 * 14 @@ -3381,8 +3426,9 @@ def _capture_encoder_piecewise_cudagraphs(self) -> None: device=self.device, ) - hidden_size = getattr(visual, "hidden_size", - getattr(visual, "embed_dim", 1152)) + hidden_size = getattr( + visual, "hidden_size", getattr(visual, "embed_dim", 1152) + ) pos_embeds = torch.zeros( (num_patches, hidden_size), dtype=visual.dtype, From 6b647076e4d9bc9fd8dc7e4c1c7bd508a7df4012 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 22:57:49 -0500 Subject: [PATCH 144/189] format. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 2ebdec584bd7..ba34d0b98a83 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -180,13 +180,13 @@ def __init__( self.output_buffers: dict[tuple[int, int, int, int], torch.Tensor] = {} # Cached pre-computed tensors for CUDA graph replay (exact match mode) - # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary embeddings, etc. + # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary, etc. self.cached_tensors: dict[ tuple[int, int, int, int], dict[str, torch.Tensor] ] = {} # Input buffers for embeddings (padded mode with runtime computation) - # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary_cos/sin, cu_seqlens + # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary, cu_seqlens self.embedding_buffers: dict[ tuple[int, int, int, int], dict[str, torch.Tensor] ] = {} From 4fe20611acc2a100e20797bb09ea63034e4888ba Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:04:16 -0500 Subject: [PATCH 145/189] fix mypy. 
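The mypy fixes in this and the later patches all use the same idiom: mypy does not carry narrowing from an enclosing "if visual is None: return" check into nested helper functions, so each closure re-asserts. A hedged sketch of the pattern with hypothetical names:

import torch.nn as nn


def warmup(visual: nn.Module | None) -> None:
    if visual is None:
        return

    def make_inputs() -> None:
        # mypy does not propagate the earlier narrowing into this nested scope,
        # so the Optional type is re-narrowed with an explicit assert.
        assert visual is not None
        _ = visual.parameters()

    make_inputs()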
--- vllm/compilation/piecewise_backend.py | 1 + vllm/v1/worker/gpu_model_runner.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 36a5528d34ad..e01c88e45aa4 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -73,6 +73,7 @@ def __init__( logger.debug_once(log_string) # Use encoder-specific capture sizes for encoder compilation + self.compile_sizes: list[Any] | None = None if self.is_encoder_compilation: encoder_capture_sizes = ( self.compilation_config.encoder_cudagraph_capture_sizes diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2f361f4e0aa7..906d51e84f05 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3264,6 +3264,9 @@ def warmup_encoder_piecewise(self) -> None: if visual is None or not hasattr(visual, "forward_piecewise"): return + # Assert for mypy - visual is not None after the check above + assert visual is not None + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) merge_size_sq = spatial_merge_size**2 From fbd14d1ea0d1522bdc04cdf0fdb51703158fa713 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:08:04 -0500 Subject: [PATCH 146/189] add log for batching. --- vllm/v1/worker/gpu_model_runner.py | 44 ++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 906d51e84f05..f9515416cdf2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2583,8 +2583,15 @@ def _execute_mm_encoder( if self.encoder_cudagraph_verbose: logger.info( "ViT: cudagraph_result=None, piecewise_enabled=%s " - "(piecewise=%s, padded_mode=%s)", + "(full_cudagraph=%s, piecewise=%s, padded_mode=%s)", piecewise_enabled, + getattr( + self.compilation_config, + "cudagraph_mm_encoder", + False, + ) + if self.compilation_config + else None, getattr( self.compilation_config, "encoder_cudagraph_piecewise", @@ -2843,8 +2850,26 @@ def compute_output_tokens(grid: list[int]) -> int: t, h, w = grid patch_offsets.append(patch_offsets[-1] + t * h * w) + # Calculate how many full batches and remainder + num_full_batches = num_images // target_batch_size + num_grouped = num_full_batches * target_batch_size + num_remainder = num_images - num_grouped + + if self.encoder_cudagraph_verbose: + logger.info( + "Processing %d images: %d in %d group(s) of %d, " + "%d remainder (eager), grids=%s", + num_images, + num_grouped, + num_full_batches, + target_batch_size, + num_remainder, + grid_thw_list, + ) + outputs = [None] * num_images processed = 0 + cudagraph_processed = 0 # Process full batches while processed + target_batch_size <= num_images: @@ -2895,12 +2920,12 @@ def compute_output_tokens(grid: list[int]) -> int: output_offset : output_offset + actual_tokens ].clone() output_offset += actual_tokens + cudagraph_processed += target_batch_size if self.encoder_cudagraph_verbose: logger.info( - "Grouped batch (contiguous): batch_size=%d, " - "grids=%s, graph_key=%s", - target_batch_size, + " Group %d: grids=%s, graph_key=%s", + processed // target_batch_size + 1, batch_grids, graph_key, ) @@ -2908,7 +2933,16 @@ def compute_output_tokens(grid: list[int]) -> int: processed += target_batch_size # Check if all images were processed - if any(o is None for o in outputs): + num_eager = sum(1 for o in outputs if o is None) + if num_eager > 0: + if 
self.encoder_cudagraph_verbose: + logger.info( + "Grouped batch incomplete: %d/%d with cudagraph, " + "%d fallback to eager", + cudagraph_processed, + num_images, + num_eager, + ) # Some images not processed, fall back to other modes return None From 3d2b60e2812f80f80bcb69edc596ccea09f73878 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:10:46 -0500 Subject: [PATCH 147/189] format. --- vllm/v1/worker/gpu_model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f9515416cdf2..e9ce1f31f9b8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3312,6 +3312,7 @@ def warmup_encoder_piecewise(self) -> None: # Helper to create dummy inputs for a given num_patches def create_dummy_inputs(num_patches: int): + assert visual is not None # for mypy patch_embed = getattr(visual, "patch_embed", None) if patch_embed is not None: temporal_patch_size = getattr(patch_embed, "temporal_patch_size", 2) @@ -3374,6 +3375,7 @@ def create_dummy_inputs(num_patches: int): ) def run_forward(num_patches: int): + assert visual is not None # for mypy ( pixel_values, pos_embeds, From 24bfc22fcde82a39f2bb7c77061b042cf7ad9e91 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:19:13 -0500 Subject: [PATCH 148/189] fix duplciate image processing. --- vllm/v1/worker/gpu_model_runner.py | 72 +++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e9ce1f31f9b8..12cf9ee28b1a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2467,7 +2467,18 @@ def _execute_mm_encoder( ) ) - if grouped_batched_result is not None: + # Check if grouped batch returned partial results + has_partial_results = ( + grouped_batched_result is not None + and any(r is not None for r in grouped_batched_result) + ) + all_complete = ( + grouped_batched_result is not None + and all(r is not None for r in grouped_batched_result) + ) + + if all_complete: + # All images processed by grouped batch curr_group_outputs = grouped_batched_result elif ( self.encoder_cudagraph_manager is not None @@ -2475,11 +2486,15 @@ def _execute_mm_encoder( and num_items > 1 and modality in ("image", "video") ): - # Fall back to one-by-one processing + # Fall back to one-by-one processing for remaining images # Process each image individually for CUDA graph support # Extract batched data and slice per-image to avoid # re-calling group_mm_kwargs_by_modality overhead - curr_group_outputs_lst = [] + curr_group_outputs_lst = ( + list(grouped_batched_result) + if has_partial_results + else [None] * num_items + ) # Get batched pixel_values and grid_thw if modality == "image": @@ -2502,11 +2517,21 @@ def _execute_mm_encoder( # Calculate patch boundaries for slicing patch_offset = 0 - if self.encoder_cudagraph_verbose: + # Count how many need one-by-one processing + num_remaining = sum( + 1 for o in curr_group_outputs_lst if o is None + ) + if self.encoder_cudagraph_verbose and num_remaining > 0: + remaining_grids = [ + grid_thw_list[i] + for i, o in enumerate(curr_group_outputs_lst) + if o is None + ] logger.info( - "Processing %d images one-at-a-time, grids=%s", - len(grid_thw_list), - grid_thw_list, + "Processing %d remaining images one-at-a-time, " + "grids=%s", + num_remaining, + remaining_grids, ) for img_idx, grid_thw in enumerate(grid_thw_list): t, h, w = grid_thw @@ 
-2518,6 +2543,10 @@ def _execute_mm_encoder( ] patch_offset += num_patches + # Skip if already processed by grouped batch + if curr_group_outputs_lst[img_idx] is not None: + continue + # Build single-image kwargs for CUDA graph (list format) single_mm_inputs_for_cudagraph = { pixel_key: single_pixel_values, @@ -2532,7 +2561,7 @@ def _execute_mm_encoder( 1, ) if single_result is not None: - curr_group_outputs_lst.extend(single_result) + curr_group_outputs_lst[img_idx] = single_result[0] else: # Fall back to eager for this image # Model expects grid_thw as CPU tensor (.numpy()) @@ -2546,7 +2575,7 @@ def _execute_mm_encoder( single_output = model.embed_multimodal( **single_mm_inputs_for_eager ) - curr_group_outputs_lst.extend(single_output) + curr_group_outputs_lst[img_idx] = single_output[0] curr_group_outputs = curr_group_outputs_lst else: @@ -2932,18 +2961,19 @@ def compute_output_tokens(grid: list[int]) -> int: processed += target_batch_size - # Check if all images were processed - num_eager = sum(1 for o in outputs if o is None) - if num_eager > 0: - if self.encoder_cudagraph_verbose: - logger.info( - "Grouped batch incomplete: %d/%d with cudagraph, " - "%d fallback to eager", - cudagraph_processed, - num_images, - num_eager, - ) - # Some images not processed, fall back to other modes + # Log summary + num_unprocessed = sum(1 for o in outputs if o is None) + if self.encoder_cudagraph_verbose and num_unprocessed > 0: + logger.info( + "Grouped batch: %d/%d with cudagraph, %d remainder", + cudagraph_processed, + num_images, + num_unprocessed, + ) + + # Return partial results - caller will handle None entries + # Return None only if no images were processed at all + if cudagraph_processed == 0: return None return outputs From df710a82a1d049428218d4cdbb2110969d331e79 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:20:17 -0500 Subject: [PATCH 149/189] format. --- vllm/v1/worker/gpu_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 12cf9ee28b1a..85853fd868b4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2825,7 +2825,7 @@ def _execute_grouped_batched_encoder( grid_thw_list: list[list[int]], modality: str, target_batch_size: int, - ) -> list[torch.Tensor] | None: + ) -> list[torch.Tensor | None] | None: """ Execute encoder using grouped batched CUDA graphs with contiguous packing. @@ -2847,7 +2847,9 @@ def _execute_grouped_batched_encoder( target_batch_size: Target batch size for grouping (e.g., 4) Returns: - List of output tensors, or None if batched mode not available + List of output tensors (may contain None for unprocessed images), + or None if no images could be processed with batched mode. + Caller should handle None entries by processing them separately. """ if self.encoder_cudagraph_manager is None: return None From f5142a2ae6e47ffcb56c10c7334be1958f085243 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:31:50 -0500 Subject: [PATCH 150/189] format. 
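PATCH 148 above changes the grouped path to return partial results (None entries for the remainder) and makes the one-by-one path fill only those gaps, which is what removes the duplicate processing. The control flow, sketched with hypothetical names:

import torch


def encode_all(images, try_grouped, encode_one) -> list[torch.Tensor]:
    # The grouped path may return None (nothing fit a full batch) or a list
    # that still has None entries for the remainder images.
    outputs = try_grouped(images)
    if outputs is None:
        outputs = [None] * len(images)
    for i, out in enumerate(outputs):
        if out is None:
            # Only unprocessed images take the per-image fallback, so nothing
            # is encoded twice.
            outputs[i] = encode_one(images[i])
    return outputs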
--- vllm/v1/worker/gpu_model_runner.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 85853fd868b4..10fb365c81c1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2468,13 +2468,11 @@ def _execute_mm_encoder( ) # Check if grouped batch returned partial results - has_partial_results = ( - grouped_batched_result is not None - and any(r is not None for r in grouped_batched_result) + has_partial_results = grouped_batched_result is not None and any( + r is not None for r in grouped_batched_result ) - all_complete = ( - grouped_batched_result is not None - and all(r is not None for r in grouped_batched_result) + all_complete = grouped_batched_result is not None and all( + r is not None for r in grouped_batched_result ) if all_complete: @@ -2490,11 +2488,12 @@ def _execute_mm_encoder( # Process each image individually for CUDA graph support # Extract batched data and slice per-image to avoid # re-calling group_mm_kwargs_by_modality overhead - curr_group_outputs_lst = ( - list(grouped_batched_result) - if has_partial_results - else [None] * num_items - ) + if has_partial_results and grouped_batched_result is not None: + curr_group_outputs_lst: list[torch.Tensor | None] = list( + grouped_batched_result + ) + else: + curr_group_outputs_lst = [None] * num_items # Get batched pixel_values and grid_thw if modality == "image": @@ -2664,6 +2663,7 @@ def _execute_mm_encoder( curr_group_outputs, expected_num_items=num_items, ) + assert curr_group_outputs is not None # sanity_check ensures this encoder_outputs.extend(curr_group_outputs) # Cache the encoder outputs by mm_hash From 36c346b9b0be5d1ab17c521c6107a087993f6aa7 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:36:33 -0500 Subject: [PATCH 151/189] fix mypy. --- vllm/v1/worker/gpu_model_runner.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 10fb365c81c1..e1b8728e5160 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2477,7 +2477,10 @@ def _execute_mm_encoder( if all_complete: # All images processed by grouped batch - curr_group_outputs = grouped_batched_result + # all_complete ensures no None entries + curr_group_outputs = cast( + list[torch.Tensor], grouped_batched_result + ) elif ( self.encoder_cudagraph_manager is not None and self.encoder_cudagraph_one_by_one @@ -2488,12 +2491,16 @@ def _execute_mm_encoder( # Process each image individually for CUDA graph support # Extract batched data and slice per-image to avoid # re-calling group_mm_kwargs_by_modality overhead + # Note: list may contain None for unprocessed images; + # these will be filled in by one-by-one processing below if has_partial_results and grouped_batched_result is not None: - curr_group_outputs_lst: list[torch.Tensor | None] = list( - grouped_batched_result + curr_group_outputs_lst = cast( + list[torch.Tensor], list(grouped_batched_result) ) else: - curr_group_outputs_lst = [None] * num_items + curr_group_outputs_lst = cast( + list[torch.Tensor], [None] * num_items + ) # Get batched pixel_values and grid_thw if modality == "image": From acdd2564bab0064d032cb7f06d9c7b790e656dae Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:52:25 -0500 Subject: [PATCH 152/189] fix hit rate calculation. 
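PATCH 152 below changes hit accounting to count images rather than graph replays, so the hit rate stays comparable with the per-image eager fallbacks. Roughly the bookkeeping it implies (hypothetical class, not the manager's real API):

class HitRateCounter:
    """Per-image accounting for CUDA-graph usage."""

    def __init__(self) -> None:
        self.cache_hits = 0
        self.eager_fallbacks = 0

    def record_replay(self, batch_size: int) -> None:
        # One replay serves batch_size images; counting images keeps the rate
        # consistent with misses, which are recorded per image.
        self.cache_hits += batch_size

    def record_fallback(self) -> None:
        self.eager_fallbacks += 1

    def hit_rate(self) -> float:
        total = self.cache_hits + self.eager_fallbacks
        return self.cache_hits / total if total else 0.0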
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index ba34d0b98a83..6a81da5047aa 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -1111,7 +1111,8 @@ def run_batched( self.eager_fallbacks += 1 return None - self.cache_hits += 1 + # Count images processed, not replay count (for accurate hit rate) + self.cache_hits += batch_size # Wait for any previous graph replay to complete before modifying buffers. if not self.is_single_gpu and self.replay_done_event is not None: @@ -1255,7 +1256,8 @@ def run_batched_contiguous( if not pixel_values.is_contiguous(): pixel_values = pixel_values.contiguous() - self.cache_hits += 1 + # Count images processed, not replay count (for accurate hit rate) + self.cache_hits += batch_size # Wait for any previous graph replay to complete if not self.is_single_gpu and self.replay_done_event is not None: From 24eb4a5becab67b9b6417024a4011c533d8c5a2a Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:52:44 -0500 Subject: [PATCH 153/189] calculate padding waste. --- vllm/v1/worker/gpu_model_runner.py | 44 +++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e1b8728e5160..54d511418b11 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2894,15 +2894,57 @@ def compute_output_tokens(grid: list[int]) -> int: num_remainder = num_images - num_grouped if self.encoder_cudagraph_verbose: + # Pre-compute padding waste estimate for logging + total_actual_tokens = sum(x[2] for x in sorted_grids) + total_bucket_tokens = 0 + waste_per_batch = [] + + # Assert for mypy (already checked at function start) + assert self.encoder_cudagraph_manager is not None + temp_idx = 0 + while temp_idx + target_batch_size <= num_images: + batch_grids = [ + grid_thw_list[sorted_grids[temp_idx + i][0]] + for i in range(target_batch_size) + ] + max_tokens = max(compute_output_tokens(g) for g in batch_grids) + actual_tokens = sum(compute_output_tokens(g) for g in batch_grids) + + # Find bucket for this batch + graph_key = self.encoder_cudagraph_manager.find_bucket_for_tokens( + max_tokens, spatial_merge_size, batch_size=target_batch_size + ) + if graph_key is not None: + _, t, h, w = graph_key + bucket_tokens = ( + t * (h // spatial_merge_size) * (w // spatial_merge_size) + ) + # Bucket capacity * batch_size vs sum of actual tokens + batch_bucket_total = bucket_tokens * target_batch_size + batch_waste = batch_bucket_total - actual_tokens + total_bucket_tokens += batch_bucket_total + waste_per_batch.append(batch_waste) + temp_idx += target_batch_size + + total_waste = sum(waste_per_batch) if waste_per_batch else 0 + waste_pct = ( + (total_waste / total_bucket_tokens * 100) + if total_bucket_tokens > 0 + else 0.0 + ) + logger.info( "Processing %d images: %d in %d group(s) of %d, " - "%d remainder (eager), grids=%s", + "%d remainder (eager), grids=%s, " + "padding_waste=%d tokens (%.1f%%)", num_images, num_grouped, num_full_batches, target_batch_size, num_remainder, grid_thw_list, + total_waste, + waste_pct, ) outputs = [None] * num_images From 735d36b50c7a90ec2fc2660dbb1b0a88a6ad48e2 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Wed, 4 Feb 2026 23:55:12 -0500 Subject: [PATCH 154/189] format. 
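The waste estimate added in PATCH 153 compares the bucket capacity reserved for a batch against the tokens the batch actually produces. A toy recreation with made-up numbers (hypothetical helper, not the runner's code):

def padding_waste(actual_tokens: list[int], bucket_tokens: int) -> tuple[int, float]:
    # The captured graph reserves bucket_tokens per image in the batch, so the
    # waste is that reserved capacity minus the tokens actually produced.
    bucket_total = bucket_tokens * len(actual_tokens)
    waste = bucket_total - sum(actual_tokens)
    pct = 100.0 * waste / bucket_total if bucket_total else 0.0
    return waste, pct


# Three images of 900/700/400 output tokens sharing a 1024-token bucket:
# padding_waste([900, 700, 400], 1024) -> (1072, ~34.9)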
--- vllm/v1/worker/gpu_model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 54d511418b11..7a1b80bfc096 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2895,7 +2895,6 @@ def compute_output_tokens(grid: list[int]) -> int: if self.encoder_cudagraph_verbose: # Pre-compute padding waste estimate for logging - total_actual_tokens = sum(x[2] for x in sorted_grids) total_bucket_tokens = 0 waste_per_batch = [] From 9441d5be1a86b1be0db846f77e89186ff85a37cb Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 00:21:30 -0500 Subject: [PATCH 155/189] log failed batching. --- vllm/v1/worker/gpu_model_runner.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7a1b80bfc096..a0284184891f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2967,6 +2967,13 @@ def compute_output_tokens(grid: list[int]) -> int: ) if graph_key is None: # No suitable bucket, skip this batch + if self.encoder_cudagraph_verbose: + logger.info( + " SKIP batch %d: no bucket for max_tokens=%d, grids=%s", + processed // target_batch_size + 1, + max_output_tokens, + batch_grids, + ) processed += target_batch_size continue @@ -3008,6 +3015,18 @@ def compute_output_tokens(grid: list[int]) -> int: batch_grids, graph_key, ) + else: + # run_batched_contiguous returned None - log why + if self.encoder_cudagraph_verbose: + total_patches = contiguous_pixels.shape[0] + logger.info( + " FAIL batch %d: run_batched_contiguous returned None, " + "graph_key=%s, total_patches=%d, grids=%s", + processed // target_batch_size + 1, + graph_key, + total_patches, + batch_grids, + ) processed += target_batch_size From f639b1bb15c90e2fb3805a042a5478a1d8ea93bc Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 13:47:10 -0500 Subject: [PATCH 156/189] cache embedding and reuse if possible. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 131 ++++++++++++++++----- 1 file changed, 104 insertions(+), 27 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 6a81da5047aa..02f6cca00494 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -230,6 +230,13 @@ def __init__( # tensors when not needed. Keys are (batch_size, t, h, w). 
self.modified_grids: set[tuple[int, int, int, int]] = set() + # Per-grid embedding cache for batched contiguous mode + # Key: (t, h, w), Value: dict with pos_embeds, rotary_cos, rotary_sin + # This avoids recomputing embeddings at runtime - just look up and concat + self.grid_embedding_cache: dict[ + tuple[int, int, int], dict[str, torch.Tensor] + ] = {} + def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: """Get encoder grid configurations from config or use defaults.""" compilation_config = self.vllm_config.compilation_config @@ -464,6 +471,23 @@ def capture_graph_for_grid( cached["cu_seqlens"].shape, ) + # Cache per-grid embeddings for batched contiguous mode + # This avoids recomputing embeddings at runtime - just lookup and concat + grid_key = (t, h, w) + if grid_key not in self.grid_embedding_cache: + # Compute embeddings for a single image of this grid size + single_cached = vision_encoder.precompute_for_cudagraph([[t, h, w]]) + self.grid_embedding_cache[grid_key] = { + "pos_embeds": single_cached["pos_embeds"], + "rotary_pos_emb_cos": single_cached["rotary_pos_emb_cos"], + "rotary_pos_emb_sin": single_cached["rotary_pos_emb_sin"], + } + logger.debug( + "Cached per-grid embeddings for grid %s: pos_embeds=%s", + grid_key, + single_cached["pos_embeds"].shape, + ) + # Create INPUT BUFFERS for embeddings (padded mode runtime computation) # These buffers can be updated at runtime before graph replay # Note: max_seqlen is a CPU scalar tensor to avoid GPU sync on .item() @@ -942,9 +966,33 @@ def run_padded( if not self.is_single_gpu and self.replay_done_event is not None: self.replay_done_event.synchronize() - # Compute embeddings for ACTUAL grid, then pad to bucket size. - # This ensures correct position embeddings for the actual input size. 
- actual_embeds = self.vision_encoder.precompute_for_cudagraph(grid_thw) + # Look up cached embeddings for this grid, or compute if not cached + t, h, w = grid_thw[0] + grid_key = (t, h, w) + + if grid_key in self.grid_embedding_cache: + # Use cached embeddings (fast path - no computation) + cached = self.grid_embedding_cache[grid_key] + pos_embeds = cached["pos_embeds"] + rotary_cos = cached["rotary_pos_emb_cos"] + rotary_sin = cached["rotary_pos_emb_sin"] + else: + # Cache miss - compute and cache for future use + if self.vision_encoder is None: + logger.warning("Grid %s not cached and no vision encoder", grid_key) + return None + actual_embeds = self.vision_encoder.precompute_for_cudagraph(grid_thw) + pos_embeds = actual_embeds["pos_embeds"] + rotary_cos = actual_embeds["rotary_pos_emb_cos"] + rotary_sin = actual_embeds["rotary_pos_emb_sin"] + # Cache for future use + self.grid_embedding_cache[grid_key] = { + "pos_embeds": pos_embeds, + "rotary_pos_emb_cos": rotary_cos, + "rotary_pos_emb_sin": rotary_sin, + } + if self.verbose: + logger.info("Embedding cache miss for grid %s (now cached)", grid_key) # Get embedding buffers for the bucket embed_buffers = self.embedding_buffers[graph_key] @@ -958,31 +1006,32 @@ def run_padded( # Copy actual pixel values to the beginning of the buffer input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=True) - # Copy actual embeddings to the beginning of the buffers (pad with zeros) - actual_num_patches = actual_embeds["pos_embeds"].shape[0] + # Copy cached/computed embeddings to the beginning of the buffers + actual_num_patches = pos_embeds.shape[0] embed_buffers["pos_embeds"][:actual_num_patches].copy_( - actual_embeds["pos_embeds"], non_blocking=True + pos_embeds, non_blocking=True ) embed_buffers["rotary_pos_emb_cos"][:actual_num_patches].copy_( - actual_embeds["rotary_pos_emb_cos"], non_blocking=True + rotary_cos, non_blocking=True ) embed_buffers["rotary_pos_emb_sin"][:actual_num_patches].copy_( - actual_embeds["rotary_pos_emb_sin"], non_blocking=True + rotary_sin, non_blocking=True ) - # Update cu_seqlens and max_seqlen to actual values - # cu_seqlens shape is [num_images + 1], for single image: [0, num_patches] - # We copy actual values so flash attention processes only the real tokens - embed_buffers["cu_seqlens"].copy_( - actual_embeds["cu_seqlens"], non_blocking=True - ) - embed_buffers["max_seqlen"].copy_( - actual_embeds["max_seqlen"], non_blocking=True + # Compute cu_seqlens for single image (simple: [0, num_output_tokens]) + cu_seqlens = torch.tensor( + [0, num_output_tokens], dtype=torch.int32, device=self.device ) - embed_buffers["sequence_lengths"].copy_( - actual_embeds["sequence_lengths"], non_blocking=True + max_seqlen = torch.tensor(num_output_tokens, dtype=torch.int32, device="cpu") + sequence_lengths = torch.tensor( + [num_output_tokens], dtype=torch.int32, device=self.device ) + # Update cu_seqlens and max_seqlen to actual values + embed_buffers["cu_seqlens"][:2].copy_(cu_seqlens, non_blocking=True) + embed_buffers["max_seqlen"].copy_(max_seqlen, non_blocking=True) + embed_buffers["sequence_lengths"][:1].copy_(sequence_lengths, non_blocking=True) + # Mark this grid as modified so run() knows to restore cached tensors self.modified_grids.add(graph_key) @@ -1275,25 +1324,53 @@ def run_batched_contiguous( # Copy actual pixel values to the beginning of the buffer (contiguous) input_buffer[:actual_input_patches].copy_(pixel_values, non_blocking=True) - # Compute embeddings for each actual grid and pack contiguously - # 
Also build cu_seqlens from actual cumulative sizes + # Look up cached embeddings for each grid and pack contiguously + # This avoids expensive per-image precompute_for_cudagraph calls pos_embeds_list = [] rotary_cos_list = [] rotary_sin_list = [] sequence_lengths = [] + cache_miss = False for grid in grid_thw_list: - # Compute embeddings for this actual grid - actual_embeds = self.vision_encoder.precompute_for_cudagraph([grid]) - pos_embeds_list.append(actual_embeds["pos_embeds"]) - rotary_cos_list.append(actual_embeds["rotary_pos_emb_cos"]) - rotary_sin_list.append(actual_embeds["rotary_pos_emb_sin"]) - # Output tokens for this image t, h, w = grid + grid_key = (t, h, w) output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) sequence_lengths.append(output_tokens) - # Concatenate embeddings (contiguous packing) + # Try to use cached embeddings (populated during graph capture) + if grid_key in self.grid_embedding_cache: + cached = self.grid_embedding_cache[grid_key] + pos_embeds_list.append(cached["pos_embeds"]) + rotary_cos_list.append(cached["rotary_pos_emb_cos"]) + rotary_sin_list.append(cached["rotary_pos_emb_sin"]) + else: + # Cache miss - need to compute (should be rare after warmup) + cache_miss = True + if self.vision_encoder is not None: + actual_embeds = self.vision_encoder.precompute_for_cudagraph([grid]) + pos_embeds_list.append(actual_embeds["pos_embeds"]) + rotary_cos_list.append(actual_embeds["rotary_pos_emb_cos"]) + rotary_sin_list.append(actual_embeds["rotary_pos_emb_sin"]) + # Cache for future use + self.grid_embedding_cache[grid_key] = { + "pos_embeds": actual_embeds["pos_embeds"], + "rotary_pos_emb_cos": actual_embeds["rotary_pos_emb_cos"], + "rotary_pos_emb_sin": actual_embeds["rotary_pos_emb_sin"], + } + else: + logger.warning("Grid %s not cached and no vision encoder", grid_key) + return None + + if cache_miss and self.verbose: + uncached_grids = [ + g for g in grid_thw_list if tuple(g) not in self.grid_embedding_cache + ] + logger.info( + "Embedding cache miss for grids: %s (now cached)", uncached_grids + ) + + # Concatenate cached embeddings (just tensor concat, no computation) packed_pos_embeds = torch.cat(pos_embeds_list, dim=0) packed_rotary_cos = torch.cat(rotary_cos_list, dim=0) packed_rotary_sin = torch.cat(rotary_sin_list, dim=0) From 9a2ff979c069807c891332c6ce61f60ad1242990 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 13:47:10 -0500 Subject: [PATCH 157/189] cache embedding and reuse if possible. --- vllm/model_executor/models/qwen3_vl.py | 63 +++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 09972ca7fb4c..c18c3d3564c1 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -433,6 +433,10 @@ def __init__( ] ) + # Per-grid embedding cache for eager mode optimization + # Key: (t, h, w), Value: dict with pos_embeds, rotary_cos, rotary_sin + self._embedding_cache: dict[tuple[int, int, int], dict[str, torch.Tensor]] = {} + attn_backend_override = ( multimodal_config.mm_encoder_attn_backend if multimodal_config else None ) @@ -649,6 +653,59 @@ def compute_flashinfer_cu_seqlens( ) return np.concatenate([cu_seqlens_qk, cu_seqlens_v, cu_seqlens_o]) + def _get_cached_embeddings( + self, grid_thw_list: list[list[int]] + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Get position and rotary embeddings with per-grid caching. 
+ + This method caches embeddings per grid configuration (t, h, w) to avoid + redundant computation when the same grid sizes are encountered repeatedly. + + Args: + grid_thw_list: List of [T, H, W] for each image + + Returns: + Tuple of (pos_embeds, rotary_cos, rotary_sin) + """ + pos_embeds_list: list[torch.Tensor] = [] + rotary_cos_list: list[torch.Tensor] = [] + rotary_sin_list: list[torch.Tensor] = [] + + for grid in grid_thw_list: + t, h, w = grid + grid_key = (t, h, w) + + if grid_key in self._embedding_cache: + # Cache hit - use cached embeddings + cached = self._embedding_cache[grid_key] + pos_embeds_list.append(cached["pos_embeds"]) + rotary_cos_list.append(cached["rotary_cos"]) + rotary_sin_list.append(cached["rotary_sin"]) + else: + # Cache miss - compute and cache + single_grid = [[t, h, w]] + pos_embed = self.fast_pos_embed_interpolate(single_grid) + rotary_cos, rotary_sin = self.rot_pos_emb(single_grid) + + # Cache for future use + self._embedding_cache[grid_key] = { + "pos_embeds": pos_embed, + "rotary_cos": rotary_cos, + "rotary_sin": rotary_sin, + } + + pos_embeds_list.append(pos_embed) + rotary_cos_list.append(rotary_cos) + rotary_sin_list.append(rotary_sin) + + # Concatenate all embeddings + pos_embeds = torch.cat(pos_embeds_list, dim=0) + rotary_pos_emb_cos = torch.cat(rotary_cos_list, dim=0) + rotary_pos_emb_sin = torch.cat(rotary_sin_list, dim=0) + + return pos_embeds, rotary_pos_emb_cos, rotary_pos_emb_sin + def forward( self, x: torch.Tensor, @@ -666,9 +723,11 @@ def forward( grid_thw_list = grid_thw.tolist() grid_thw = grid_thw.cpu().numpy() - pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) + # Get embeddings with caching for eager mode optimization + pos_embeds, rotary_pos_emb_cos, rotary_pos_emb_sin = ( + self._get_cached_embeddings(grid_thw_list) + ) hidden_states = hidden_states + pos_embeds - rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( axis=0, dtype=np.int32 From f7a9864f1e0e2855175e9a7b5ca47546a88b6026 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 15:36:09 -0500 Subject: [PATCH 158/189] add embedding cache warmup grids. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 162 +++++++++++++++++++++ 1 file changed, 162 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 02f6cca00494..b9de84c604ac 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -79,6 +79,115 @@ # but still benefit from CUDA graphs via padding. DEFAULT_PADDED_BUCKET_SIZES = [100, 128] +# Top 100 most common grids for embedding cache pre-warming. +# Pre-warming these grids at startup avoids cold-start embedding computation +# at runtime, eliminating ~20 small kernel launches per grid on first encounter. +# Based on MLPerf VLM dataset analysis (~71% coverage with top 100 grids). 
+EMBEDDING_WARMUP_GRIDS: list[tuple[int, int, int]] = [ + # Top 50 grids (sorted by frequency) + (1, 62, 62), + (1, 32, 32), + (1, 50, 50), + (1, 38, 38), + (1, 76, 76), + (1, 94, 94), + (1, 64, 64), + (1, 124, 124), + (1, 68, 68), + (1, 100, 100), + (1, 16, 16), + (1, 24, 24), + (1, 46, 46), + (1, 44, 44), + (1, 42, 42), + (1, 40, 40), + (1, 56, 56), + (1, 128, 128), + (1, 18, 18), + (1, 28, 28), + (1, 34, 34), + (1, 80, 80), + (1, 30, 30), + (1, 38, 50), + (1, 22, 22), + (1, 112, 112), + (1, 36, 36), + (1, 34, 50), + (1, 188, 188), + (1, 14, 20), + (1, 90, 90), + (1, 44, 42), + (1, 16, 18), + (1, 54, 54), + (1, 48, 48), + (1, 40, 42), + (1, 60, 60), + (1, 88, 88), + (1, 26, 26), + (1, 156, 156), + (1, 94, 62), + (1, 30, 38), + (1, 24, 38), + (1, 20, 20), + (1, 24, 16), + (1, 18, 16), + (1, 120, 120), + (1, 60, 80), + (1, 52, 52), + (1, 66, 66), + # Next 50 grids + (1, 20, 14), + (1, 24, 32), + (1, 160, 160), + (1, 28, 38), + (1, 30, 40), + (1, 38, 42), + (1, 58, 58), + (1, 20, 32), + (1, 50, 38), + (1, 48, 64), + (1, 78, 78), + (1, 24, 20), + (1, 42, 62), + (1, 62, 94), + (1, 36, 42), + (1, 32, 20), + (1, 150, 150), + (1, 50, 42), + (1, 50, 76), + (1, 72, 72), + (1, 32, 24), + (1, 46, 42), + (1, 92, 94), + (1, 82, 82), + (1, 32, 38), + (1, 90, 94), + (1, 14, 22), + (1, 76, 100), + (1, 94, 92), + (1, 24, 18), + (1, 54, 42), + (1, 38, 32), + (1, 18, 24), + (1, 28, 32), + (1, 30, 42), + (1, 56, 76), + (1, 62, 42), + (1, 28, 50), + (1, 32, 42), + (1, 36, 50), + (1, 38, 24), + (1, 108, 82), + (1, 16, 20), + (1, 26, 38), + (1, 38, 36), + (1, 34, 42), + (1, 76, 50), + (1, 38, 56), + (1, 48, 42), + (1, 30, 32), +] + class EncoderCudaGraphManager: """ @@ -673,6 +782,59 @@ def capture( free_mem_after / 1024**3, ) + # Pre-warm embedding cache for common grids + self._prewarm_embedding_cache(vision_encoder) + + def _prewarm_embedding_cache(self, vision_encoder: nn.Module) -> None: + """ + Pre-warm the embedding cache for common grid configurations. + + This avoids cold-start embedding computation at runtime by pre-computing + embeddings for the top 100 most common grids. Each grid that would + otherwise trigger ~20 small kernel launches on first encounter will + instead hit the cache. 
+ + Args: + vision_encoder: The vision encoder module with precompute_for_cudagraph + """ + if not hasattr(vision_encoder, "precompute_for_cudagraph"): + logger.debug( + "Vision encoder lacks precompute_for_cudagraph, skipping warmup" + ) + return + + # Filter out grids that are already cached (from graph capture) + grids_to_warm = [ + g for g in EMBEDDING_WARMUP_GRIDS if g not in self.grid_embedding_cache + ] + + if not grids_to_warm: + logger.debug("All warmup grids already cached") + return + + logger.info( + "Pre-warming embedding cache for %d grids (%d already cached)", + len(grids_to_warm), + len(EMBEDDING_WARMUP_GRIDS) - len(grids_to_warm), + ) + + for grid in grids_to_warm: + t, h, w = grid + try: + cached = vision_encoder.precompute_for_cudagraph([[t, h, w]]) + self.grid_embedding_cache[grid] = { + "pos_embeds": cached["pos_embeds"], + "rotary_pos_emb_cos": cached["rotary_pos_emb_cos"], + "rotary_pos_emb_sin": cached["rotary_pos_emb_sin"], + } + except Exception as e: + logger.debug("Failed to pre-warm grid %s: %s", grid, e) + + logger.info( + "Embedding cache warmed: %d grids total", + len(self.grid_embedding_cache), + ) + def get_graph_for_grid( self, grid_thw: list[list[int]], From e1c218a0359505f362bad6e7957c2eb24725ae58 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 15:37:57 -0500 Subject: [PATCH 159/189] log embedding cache mem usage. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index b9de84c604ac..1a8adaad4cd9 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -830,11 +830,28 @@ def _prewarm_embedding_cache(self, vision_encoder: nn.Module) -> None: except Exception as e: logger.debug("Failed to pre-warm grid %s: %s", grid, e) + # Calculate embedding cache memory consumption + cache_memory_bytes = self._compute_embedding_cache_memory() logger.info( - "Embedding cache warmed: %d grids total", + "Embedding cache warmed: %d grids total, memory: %.2f MiB", len(self.grid_embedding_cache), + cache_memory_bytes / (1024 * 1024), ) + def _compute_embedding_cache_memory(self) -> int: + """ + Compute the total GPU memory consumption of the embedding cache. + + Returns: + Total memory in bytes used by all cached embeddings. + """ + total_bytes = 0 + for grid, cached in self.grid_embedding_cache.items(): + for key, tensor in cached.items(): + if isinstance(tensor, torch.Tensor): + total_bytes += tensor.numel() * tensor.element_size() + return total_bytes + def get_graph_for_grid( self, grid_thw: list[list[int]], From c849ab59604fc3c6b9f079aaf06adfb50c84b647 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 15:38:39 -0500 Subject: [PATCH 160/189] control log through verbose cli. --- vllm/model_executor/models/qwen3_vl.py | 14 ++++++++++++ vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 26 ++++++++++++---------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c18c3d3564c1..33fa2879e183 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -706,6 +706,20 @@ def _get_cached_embeddings( return pos_embeds, rotary_pos_emb_cos, rotary_pos_emb_sin + def get_embedding_cache_memory(self) -> int: + """ + Compute the total GPU memory consumption of the embedding cache. 
+ + Returns: + Total memory in bytes used by all cached embeddings. + """ + total_bytes = 0 + for grid, cached in self._embedding_cache.items(): + for key, tensor in cached.items(): + if isinstance(tensor, torch.Tensor): + total_bytes += tensor.numel() * tensor.element_size() + return total_bytes + def forward( self, x: torch.Tensor, diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 1a8adaad4cd9..ea3d651a8eba 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -812,11 +812,12 @@ def _prewarm_embedding_cache(self, vision_encoder: nn.Module) -> None: logger.debug("All warmup grids already cached") return - logger.info( - "Pre-warming embedding cache for %d grids (%d already cached)", - len(grids_to_warm), - len(EMBEDDING_WARMUP_GRIDS) - len(grids_to_warm), - ) + if self.verbose: + logger.info( + "Pre-warming embedding cache for %d grids (%d already cached)", + len(grids_to_warm), + len(EMBEDDING_WARMUP_GRIDS) - len(grids_to_warm), + ) for grid in grids_to_warm: t, h, w = grid @@ -830,13 +831,14 @@ def _prewarm_embedding_cache(self, vision_encoder: nn.Module) -> None: except Exception as e: logger.debug("Failed to pre-warm grid %s: %s", grid, e) - # Calculate embedding cache memory consumption - cache_memory_bytes = self._compute_embedding_cache_memory() - logger.info( - "Embedding cache warmed: %d grids total, memory: %.2f MiB", - len(self.grid_embedding_cache), - cache_memory_bytes / (1024 * 1024), - ) + # Calculate and log embedding cache memory consumption + if self.verbose: + cache_memory_bytes = self._compute_embedding_cache_memory() + logger.info( + "Embedding cache warmed: %d grids total, memory: %.2f MiB", + len(self.grid_embedding_cache), + cache_memory_bytes / (1024 * 1024), + ) def _compute_embedding_cache_memory(self) -> int: """ From d869610732d5233f082266838e8f68c28de4f709 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 16:32:23 -0500 Subject: [PATCH 161/189] cache embed for only specified grid sizes. --- vllm/model_executor/models/qwen3_vl.py | 25 ++++++++++++++-------- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 14 ++++++------ 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 33fa2879e183..2fb3be4cf601 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -94,6 +94,7 @@ from vllm.sequence import IntermediateTensors from vllm.utils.collection_utils import is_list_of from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm.v1.worker.gpu.mm.encoder_cudagraph import EMBEDDING_WARMUP_GRIDS from .interfaces import ( MultiModalEmbeddings, @@ -137,6 +138,9 @@ # This avoids creating a new graph for each unique batch size at runtime BATCH_BUCKETS = [8, 16, 32, 64] +# Set of pre-warmed grids for O(1) lookup in embedding cache +_EMBEDDING_WARMUP_GRIDS_SET: set[tuple[int, int, int]] = set(EMBEDDING_WARMUP_GRIDS) + @support_torch_compile( dynamic_arg_dims={"x": 0}, @@ -659,8 +663,9 @@ def _get_cached_embeddings( """ Get position and rotary embeddings with per-grid caching. - This method caches embeddings per grid configuration (t, h, w) to avoid - redundant computation when the same grid sizes are encountered repeatedly. + This method caches embeddings only for grids in EMBEDDING_WARMUP_GRIDS + to avoid unbounded memory growth. 
Grids not in the warmup set are + computed on-the-fly without caching. Args: grid_thw_list: List of [T, H, W] for each image @@ -683,17 +688,19 @@ def _get_cached_embeddings( rotary_cos_list.append(cached["rotary_cos"]) rotary_sin_list.append(cached["rotary_sin"]) else: - # Cache miss - compute and cache + # Cache miss - compute embeddings single_grid = [[t, h, w]] pos_embed = self.fast_pos_embed_interpolate(single_grid) rotary_cos, rotary_sin = self.rot_pos_emb(single_grid) - # Cache for future use - self._embedding_cache[grid_key] = { - "pos_embeds": pos_embed, - "rotary_cos": rotary_cos, - "rotary_sin": rotary_sin, - } + # Only cache if grid is in pre-warmed set to prevent OOM. + # Caching at runtime causes unbounded memory growth. + if grid_key in _EMBEDDING_WARMUP_GRIDS_SET: + self._embedding_cache[grid_key] = { + "pos_embeds": pos_embed, + "rotary_cos": rotary_cos, + "rotary_sin": rotary_sin, + } pos_embeds_list.append(pos_embed) rotary_cos_list.append(rotary_cos) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index ea3d651a8eba..c78d7ccd78f0 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -1158,7 +1158,9 @@ def run_padded( rotary_cos = cached["rotary_pos_emb_cos"] rotary_sin = cached["rotary_pos_emb_sin"] else: - # Cache miss - compute and cache for future use + # Cache miss - compute on-the-fly but do NOT cache at runtime. + # Caching at runtime causes OOM due to unbounded memory growth. + # Only pre-warmed grids from EMBEDDING_WARMUP_GRIDS are cached. if self.vision_encoder is None: logger.warning("Grid %s not cached and no vision encoder", grid_key) return None @@ -1166,14 +1168,10 @@ def run_padded( pos_embeds = actual_embeds["pos_embeds"] rotary_cos = actual_embeds["rotary_pos_emb_cos"] rotary_sin = actual_embeds["rotary_pos_emb_sin"] - # Cache for future use - self.grid_embedding_cache[grid_key] = { - "pos_embeds": pos_embeds, - "rotary_pos_emb_cos": rotary_cos, - "rotary_pos_emb_sin": rotary_sin, - } if self.verbose: - logger.info("Embedding cache miss for grid %s (now cached)", grid_key) + logger.info( + "Embedding cache miss for grid %s (computed on-the-fly)", grid_key + ) # Get embedding buffers for the bucket embed_buffers = self.embedding_buffers[graph_key] From 41f6a4ff32785c0a4093689dae2a19aaccbdc983 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 19:58:45 -0500 Subject: [PATCH 162/189] add token budget and max bs configs. --- vllm/config/compilation.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 050fc0d7fa0b..d8d61dc79d80 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -497,6 +497,20 @@ class CompilationConfig: Example: [4] captures batch_size=4 graphs only (1-3 images use eager). Default None uses legacy one-by-one mode (batch_size=1 per image).""" + encoder_cudagraph_token_budgets: list[int] | None = None + """List of total output token budget levels for budget batch CUDA graphs. + E.g., [2048, 4096, 8192]. For each budget, one graph is captured with + max_images_per_batch image slots. At runtime, images are sorted + smallest-first and greedily packed; the smallest fitting budget graph is + selected. Works with FA2 and FA4 attention backends only. + Requires encoder_cudagraph_max_images_per_batch to also be set.""" + + encoder_cudagraph_max_images_per_batch: int | None = None + """Maximum number of images per budget batch. 
The captured CUDA graph + has fixed cu_seqlens of size max_images_per_batch + 1. Empty slots use + zero-length sequences (no-op in flash attention). Used together with + encoder_cudagraph_token_budgets.""" + encoder_cudagraph_piecewise: bool = False """Enable piecewise CUDA graph mode for encoder (ViT). When True, torch.compile splits the encoder graph at attention ops, so: From 1d0fd70ec70a28333bef5afadb96846ac2a4f869 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 19:59:33 -0500 Subject: [PATCH 163/189] read budget, max bs from configs. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index c78d7ccd78f0..d3d375a7d2d3 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -346,6 +346,54 @@ def __init__( tuple[int, int, int], dict[str, torch.Tensor] ] = {} + # Budget batching config + # Maps token_budget -> graph_key for budget batch CUDA graphs + self.budget_graph_keys: dict[int, tuple[int, int, int, int]] = {} + self.token_budgets: list[int] = [] + self.max_images_per_batch: int = 0 + self._read_budget_config() + + def _read_budget_config(self) -> None: + """Read budget batching configuration from compilation config.""" + compilation_config = self.vllm_config.compilation_config + if compilation_config is None: + return + + token_budgets = getattr( + compilation_config, "encoder_cudagraph_token_budgets", None + ) + max_images = getattr( + compilation_config, "encoder_cudagraph_max_images_per_batch", None + ) + + if token_budgets is None and max_images is None: + return + + if (token_budgets is None) != (max_images is None): + logger.warning( + "encoder_cudagraph_token_budgets and " + "encoder_cudagraph_max_images_per_batch must both be set. " + "Budget batching disabled." + ) + return + + if max_images <= 0: + logger.warning( + "encoder_cudagraph_max_images_per_batch must be positive. " + "Budget batching disabled." + ) + return + + self.token_budgets = sorted(token_budgets) + self.max_images_per_batch = max_images + + logger.info( + "Budget batching configured: token_budgets=%s, " + "max_images_per_batch=%d", + self.token_budgets, + self.max_images_per_batch, + ) + def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: """Get encoder grid configurations from config or use defaults.""" compilation_config = self.vllm_config.compilation_config From 23c003bc8aef49d74fd8ee95df1887e9c3d4415f Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 20:00:06 -0500 Subject: [PATCH 164/189] capture for token budget, max bs. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 96 ++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index d3d375a7d2d3..417849ca5ad4 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -735,6 +735,98 @@ def capture_graph_for_grid( cached_suffix, ) + def capture_budget_graphs(self, vision_encoder: nn.Module) -> None: + """ + Capture CUDA graphs for budget batching mode. + + For each configured token_budget, captures a graph with + max_images_per_batch image slots. The graph uses a synthetic grid + that produces the right tensor shapes. At runtime, embedding buffers + are overwritten with actual per-image values from grid_embedding_cache. 
+ + Args: + vision_encoder: The vision encoder module + """ + if not self.token_budgets or self.max_images_per_batch <= 0: + return + + merge = getattr(vision_encoder, "spatial_merge_size", 2) + + for token_budget in self.token_budgets: + per_image_output = token_budget // self.max_images_per_batch + if per_image_output <= 0: + logger.warning( + "token_budget=%d too small for max_images=%d, skipping", + token_budget, + self.max_images_per_batch, + ) + continue + + # Synthetic grid: (1, merge, per_image_output * merge) + # Output tokens per image = 1 * (merge/merge) * (per_image_output*merge/merge) + # = 1 * 1 * per_image_output = per_image_output + # Total output = max_images * per_image_output = token_budget + grid_config = (1, merge, per_image_output * merge) + + try: + if self.is_single_gpu: + self.capture_graph_for_grid( + grid_config, + vision_encoder, + batch_size=self.max_images_per_batch, + ) + else: + with graph_capture(device=self.device): + self.capture_graph_for_grid( + grid_config, + vision_encoder, + batch_size=self.max_images_per_batch, + ) + + graph_key = ( + self.max_images_per_batch, + 1, + merge, + per_image_output * merge, + ) + self.budget_graph_keys[token_budget] = graph_key + logger.info( + "Captured budget graph: token_budget=%d, " + "max_images=%d, graph_key=%s", + token_budget, + self.max_images_per_batch, + graph_key, + ) + except Exception as e: + logger.warning( + "Failed to capture budget graph for token_budget=%d: %s", + token_budget, + e, + ) + + def find_budget_graph( + self, + total_output_tokens: int, + ) -> tuple[int, int, int, int] | None: + """ + Find the smallest budget graph that fits the given total output tokens. + + Args: + total_output_tokens: Total output tokens for the packed batch + + Returns: + Graph key (batch_size, t, h, w) or None if no budget fits + """ + best_key = None + best_budget = float("inf") + + for budget, graph_key in self.budget_graph_keys.items(): + if budget >= total_output_tokens and budget < best_budget: + best_budget = budget + best_key = graph_key + + return best_key + @torch.inference_mode() def capture( self, @@ -833,6 +925,10 @@ def capture( # Pre-warm embedding cache for common grids self._prewarm_embedding_cache(vision_encoder) + # Capture budget batch graphs if configured + if self.token_budgets and self.max_images_per_batch > 0: + self.capture_budget_graphs(vision_encoder) + def _prewarm_embedding_cache(self, vision_encoder: nn.Module) -> None: """ Pre-warm the embedding cache for common grid configurations. From 5c84f2e28bb327e16267f809f4047964444bd937 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 20:01:12 -0500 Subject: [PATCH 165/189] check num images against max bs. 
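Budget graphs are captured with a fixed number of image slots
(max_images_per_batch). A packed batch may fill fewer slots, since unused
slots become zero-length sequences, but it can never carry more images than
the graph was captured with. Illustrative numbers: with
max_images_per_batch=8, a 5-image batch replays the 8-slot graph with 3
empty slots, while a 9-image batch cannot be replayed and must fall back to
eager.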
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 417849ca5ad4..8a0d57c41747 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -1571,10 +1571,22 @@ def run_batched_contiguous( return None batch_size = graph_key[0] - if len(grid_thw_list) != batch_size: + num_actual_images = len(grid_thw_list) + is_budget_graph = graph_key in self.budget_graph_keys.values() + + if num_actual_images > batch_size: + logger.warning( + "grid_thw_list length (%d) exceeds graph batch_size (%d)", + num_actual_images, + batch_size, + ) + return None + + if num_actual_images != batch_size and not is_budget_graph: logger.warning( - "grid_thw_list length (%d) doesn't match graph batch_size (%d)", - len(grid_thw_list), + "grid_thw_list length (%d) doesn't match graph batch_size (%d)" + " and not a budget graph", + num_actual_images, batch_size, ) return None From 37a068081e97a1fe8540fb12f7190d4de6b7206a Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 20:01:51 -0500 Subject: [PATCH 166/189] update cache hit. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 8a0d57c41747..e1b7b93d8499 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -1640,8 +1640,8 @@ def run_batched_contiguous( if not pixel_values.is_contiguous(): pixel_values = pixel_values.contiguous() - # Count images processed, not replay count (for accurate hit rate) - self.cache_hits += batch_size + # Count actual images processed (for accurate hit rate) + self.cache_hits += num_actual_images # Wait for any previous graph replay to complete if not self.is_single_gpu and self.replay_done_event is not None: From 772dbbb70e3f4e7decff8eca4047acb0276e821f Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 20:02:37 -0500 Subject: [PATCH 167/189] pad cu_seqlens. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 41 ++++++++++++++++------ 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index e1b7b93d8499..b2398adc06a5 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -1723,29 +1723,50 @@ def run_batched_contiguous( ) # Build cu_seqlens from actual cumulative sizes - # cu_seqlens = [0, size0, size0+size1, ..., total] + # cu_seqlens = [0, size0, size0+size1, ..., total_actual] cu_seqlens_list = [0] for length in sequence_lengths: cu_seqlens_list.append(cu_seqlens_list[-1] + length) + # For budget graphs: pad cu_seqlens to batch_size + 1 by repeating + # the last value. This creates zero-length sequences for empty slots + # that flash attention skips (no-op). 
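+        # Example (budget graph): lengths [4, 9, 16] with batch_size=8 give
+        # cu_seqlens [0, 4, 13, 29], padded to
+        # [0, 4, 13, 29, 29, 29, 29, 29, 29] (length batch_size + 1).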
+ if is_budget_graph and len(cu_seqlens_list) < batch_size + 1: + last_val = cu_seqlens_list[-1] + while len(cu_seqlens_list) < batch_size + 1: + cu_seqlens_list.append(last_val) + + # For budget graphs: pad sequence_lengths with zeros for empty slots + if is_budget_graph and len(sequence_lengths) < batch_size: + sequence_lengths = list(sequence_lengths) + [0] * ( + batch_size - len(sequence_lengths) + ) + cu_seqlens_tensor = torch.tensor( cu_seqlens_list, dtype=torch.int32, device=self.device ) - max_seqlen = max(sequence_lengths) + max_seqlen = max(s for s in sequence_lengths if s > 0) if sequence_lengths else 0 max_seqlen_tensor = torch.tensor(max_seqlen, dtype=torch.int32, device="cpu") sequence_lengths_tensor = torch.tensor( sequence_lengths, dtype=torch.int32, device=self.device ) - # Update cu_seqlens buffer - need to handle size mismatch - # The captured buffer may be larger, so we update only the actual part - embed_buffers["cu_seqlens"][: len(cu_seqlens_list)].copy_( - cu_seqlens_tensor, non_blocking=True - ) + # Copy full cu_seqlens and sequence_lengths to buffers + # For budget graphs, sizes match exactly (padded to batch_size + 1). + # For non-budget graphs, copy only the actual part. + cu_seqlens_buf = embed_buffers["cu_seqlens"] + seq_len_buf = embed_buffers["sequence_lengths"] + if is_budget_graph: + cu_seqlens_buf.copy_(cu_seqlens_tensor, non_blocking=True) + seq_len_buf.copy_(sequence_lengths_tensor, non_blocking=True) + else: + cu_seqlens_buf[: len(cu_seqlens_list)].copy_( + cu_seqlens_tensor, non_blocking=True + ) + seq_len_buf[:batch_size].copy_( + sequence_lengths_tensor, non_blocking=True + ) embed_buffers["max_seqlen"].copy_(max_seqlen_tensor, non_blocking=True) - embed_buffers["sequence_lengths"][:batch_size].copy_( - sequence_lengths_tensor, non_blocking=True - ) # Mark this grid as modified so run() knows to restore cached tensors self.modified_grids.add(graph_key) From dcef68ab7302760bafd5810b026c2359c9c538d8 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 20:03:02 -0500 Subject: [PATCH 168/189] update logs for batching and padding. --- vllm/v1/worker/gpu/mm/encoder_runner.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 99893315fbb2..eb5ce415ea08 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -86,12 +86,19 @@ def _init_encoder_cudagraph_manager(self) -> None: bucket_sizes=bucket_sizes, ) + # Check if budget batching is configured + self.encoder_cudagraph_budget_mode = bool( + self.encoder_cudagraph_manager.token_budgets + and self.encoder_cudagraph_manager.max_images_per_batch > 0 + ) + # Log configuration grid_configs = self.encoder_cudagraph_manager.grid_configs logger.info( "Encoder CUDA graph manager initialized: " - "padded_mode=%s, num_grids=%d, grids=%s", + "padded_mode=%s, budget_mode=%s, num_grids=%d, grids=%s", self.encoder_cudagraph_padded_mode, + self.encoder_cudagraph_budget_mode, len(grid_configs), grid_configs, ) From da172108cd30c89fec44e58b37287c5630a67289 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 20:05:23 -0500 Subject: [PATCH 169/189] execute budget based batching, greedy pack. 
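Images are sorted by output-token count (ascending) and packed greedily: an
image joins the current batch while the running token total stays within the
largest captured budget and the image count stays within
max_images_per_batch; otherwise the batch is closed and a new one starts.
Each closed batch then replays the smallest budget graph that fits its total.

Illustrative packing (assuming budgets [2048, 4096, 8192] and
max_images_per_batch=8): per-image output tokens [256, 400, 900, 961, 3600,
7000] pack into [256, 400, 900, 961, 3600] (6117 tokens -> 8192 graph) and
[7000] (7000 tokens -> 8192 graph). A single image larger than 8192 tokens
makes its batch unfittable, and the whole group falls back to eager.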
--- vllm/v1/worker/gpu/mm/encoder_runner.py | 156 ++++++++++++++++++++++-- 1 file changed, 144 insertions(+), 12 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index eb5ce415ea08..9d03a7037abe 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -260,9 +260,10 @@ def _execute_with_cudagraph( """ Execute the encoder using CUDA graphs if a matching graph is available. - Supports two modes: - 1. Exact match: Only use CUDA graph if grid_thw exactly matches - 2. Padded mode: Pad inputs to fit the smallest available bucket + Supports three modes: + 1. Budget batching: Pack multiple images into budget-sized graphs + 2. Exact match: Only use CUDA graph if grid_thw exactly matches + 3. Padded mode: Pad inputs to fit the smallest available bucket Args: model: The multimodal model @@ -281,15 +282,6 @@ def _execute_with_cudagraph( if grid_thw is None: return None - # Currently only supports single-image batches for CUDA graph - if len(grid_thw) != 1: - logger.debug( - "CUDA graph only supports single-image batches, " - "got %d images. Using eager mode.", - len(grid_thw), - ) - return None - # Extract pixel_values if modality == "image": pixel_values = mm_kwargs_group.get("pixel_values") @@ -306,6 +298,24 @@ def _execute_with_cudagraph( # Get spatial merge size for token calculations visual = getattr(model, "visual", None) spatial_merge_size = getattr(visual, "spatial_merge_size", 2) + + # Try budget batching for all images (single or multi) + if self.encoder_cudagraph_budget_mode: + result = self._execute_budget_batch( + pixel_values, grid_thw, spatial_merge_size + ) + if result is not None: + return result + + # Fall back to single-image path + if len(grid_thw) != 1: + logger.debug( + "CUDA graph single-image path: got %d images. " + "Using eager mode.", + len(grid_thw), + ) + return None + t, h, w = grid_thw[0] num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) @@ -350,6 +360,128 @@ def _execute_with_cudagraph( ) return None + def _execute_budget_batch( + self, + pixel_values: torch.Tensor, + grid_thw: list[list[int]], + spatial_merge_size: int, + ) -> list[torch.Tensor] | None: + """ + Execute images using budget batch CUDA graphs. + + Sorts images by output token count (smallest first), greedily packs + them into budget-sized batches, and replays the appropriate CUDA graph. 
+ + Args: + pixel_values: Concatenated pixel values for all images + grid_thw: List of [T, H, W] for each image + spatial_merge_size: Spatial merge size (e.g., 2) + + Returns: + List of per-image output tensors in original order, or None + """ + manager = self.encoder_cudagraph_manager + if manager is None or not manager.budget_graph_keys: + return None + + max_budget = max(manager.budget_graph_keys.keys()) + max_images = manager.max_images_per_batch + + # Compute per-image info: (output_tokens, input_patches, original_idx) + image_info: list[tuple[int, int, int]] = [] + for i, (t, h, w) in enumerate(grid_thw): + out_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) + in_patches = t * h * w + image_info.append((out_tokens, in_patches, i)) + + # Sort by output tokens ascending (small first) + sorted_images = sorted(image_info, key=lambda x: x[0]) + + # Compute pixel_values offsets for each original image + patch_offsets = [0] + for t, h, w in grid_thw: + patch_offsets.append(patch_offsets[-1] + t * h * w) + + # Greedy packing into budget batches + batches: list[list[tuple[int, int, int]]] = [] + current_batch: list[tuple[int, int, int]] = [] + current_tokens = 0 + + for out_tokens, in_patches, orig_idx in sorted_images: + if ( + current_tokens + out_tokens <= max_budget + and len(current_batch) < max_images + ): + current_batch.append((out_tokens, in_patches, orig_idx)) + current_tokens += out_tokens + else: + if current_batch: + batches.append(current_batch) + current_batch = [(out_tokens, in_patches, orig_idx)] + current_tokens = out_tokens + + if current_batch: + batches.append(current_batch) + + # Execute each packed batch + outputs: list[torch.Tensor | None] = [None] * len(grid_thw) + + for batch in batches: + total_out_tokens = sum(out_tok for out_tok, _, _ in batch) + + # Find smallest budget graph that fits + graph_key = manager.find_budget_graph(total_out_tokens) + if graph_key is None: + # No budget fits - fall back entirely + logger.debug( + "No budget graph for %d tokens, falling back to eager", + total_out_tokens, + ) + return None + + # Concatenate pixel values in sorted order + pv_slices = [] + batch_grids = [] + for _, _, orig_idx in batch: + start = patch_offsets[orig_idx] + end = patch_offsets[orig_idx + 1] + pv_slices.append(pixel_values[start:end]) + batch_grids.append(grid_thw[orig_idx]) + + packed_pv = torch.cat(pv_slices, dim=0) + + # Run the budget graph + output = manager.run_batched_contiguous( + packed_pv, batch_grids, graph_key, spatial_merge_size + ) + if output is None: + logger.debug( + "Budget graph replay failed for key %s, " + "falling back to eager", + graph_key, + ) + return None + + # Split output by per-image output token counts + offset = 0 + for out_tokens, _, orig_idx in batch: + outputs[orig_idx] = output[offset : offset + out_tokens].clone() + offset += out_tokens + + if self.encoder_cudagraph_manager.verbose: + logger.info( + "ViT BUDGET BATCH: %d images, %d tokens, graph_key=%s", + len(batch), + total_out_tokens, + graph_key, + ) + + # Check all images were processed + if any(o is None for o in outputs): + return None + + return outputs # type: ignore[return-value] + def gather_mm_embeddings( self, req_ids: list[str], From 5b2dd9e4e1d6028e29c6658c542db41b5239a845 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 20:18:35 -0500 Subject: [PATCH 170/189] check if token budget is divisible by max bs. 
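Each budget graph is captured with a synthetic grid giving
per_image_output = token_budget // max_images_per_batch output tokens per
slot, so the graph's real capacity is max_images_per_batch * per_image_output.
If the budget is not an exact multiple, that capacity falls slightly below
the nominal budget, and a batch packed against the nominal value could
overflow the captured buffers: e.g. budget=13825 with max_images_per_batch=8
captures only 8 * 1728 = 13824 output tokens. Reject such configs up front.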
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index b2398adc06a5..01fab60974b7 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -384,6 +384,16 @@ def _read_budget_config(self) -> None: ) return + bad_budgets = [b for b in token_budgets if b % max_images != 0] + if bad_budgets: + logger.warning( + "encoder_cudagraph_token_budgets values %s are not divisible " + "by max_images_per_batch=%d. Budget batching disabled.", + bad_budgets, + max_images, + ) + return + self.token_budgets = sorted(token_budgets) self.max_images_per_batch = max_images @@ -1665,7 +1675,7 @@ def run_batched_contiguous( rotary_cos_list = [] rotary_sin_list = [] sequence_lengths = [] - cache_miss = False + cache_miss_grids: list[tuple[int, int, int]] = [] for grid in grid_thw_list: t, h, w = grid @@ -1681,7 +1691,7 @@ def run_batched_contiguous( rotary_sin_list.append(cached["rotary_pos_emb_sin"]) else: # Cache miss - need to compute (should be rare after warmup) - cache_miss = True + cache_miss_grids.append(grid_key) if self.vision_encoder is not None: actual_embeds = self.vision_encoder.precompute_for_cudagraph([grid]) pos_embeds_list.append(actual_embeds["pos_embeds"]) @@ -1697,10 +1707,8 @@ def run_batched_contiguous( logger.warning("Grid %s not cached and no vision encoder", grid_key) return None - if cache_miss and self.verbose: - uncached_grids = [ - g for g in grid_thw_list if tuple(g) not in self.grid_embedding_cache - ] + if cache_miss_grids and self.verbose: + uncached_grids = cache_miss_grids logger.info( "Embedding cache miss for grids: %s (now cached)", uncached_grids ) From c32327da380fb5388d62c026e50508ecffc25ab9 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 21:02:02 -0500 Subject: [PATCH 171/189] rename var. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 01fab60974b7..11e155c17604 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -1719,14 +1719,14 @@ def run_batched_contiguous( packed_rotary_sin = torch.cat(rotary_sin_list, dim=0) # Copy packed embeddings to buffer (padding remains zero at end) - actual_output_tokens = packed_pos_embeds.shape[0] - embed_buffers["pos_embeds"][:actual_output_tokens].copy_( + actual_embed_len = packed_pos_embeds.shape[0] + embed_buffers["pos_embeds"][:actual_embed_len].copy_( packed_pos_embeds, non_blocking=True ) - embed_buffers["rotary_pos_emb_cos"][:actual_output_tokens].copy_( + embed_buffers["rotary_pos_emb_cos"][:actual_embed_len].copy_( packed_rotary_cos, non_blocking=True ) - embed_buffers["rotary_pos_emb_sin"][:actual_output_tokens].copy_( + embed_buffers["rotary_pos_emb_sin"][:actual_embed_len].copy_( packed_rotary_sin, non_blocking=True ) From 047e490011facc029f88db80e0f2eb0da8da6b76 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 21:24:13 -0500 Subject: [PATCH 172/189] format. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 19 +++++++++++-------- vllm/v1/worker/gpu/mm/encoder_runner.py | 14 ++++++-------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 11e155c17604..85a96b7caebf 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -377,6 +377,9 @@ def _read_budget_config(self) -> None: ) return + if token_budgets is None or max_images is None: + return + if max_images <= 0: logger.warning( "encoder_cudagraph_max_images_per_batch must be positive. " @@ -398,8 +401,7 @@ def _read_budget_config(self) -> None: self.max_images_per_batch = max_images logger.info( - "Budget batching configured: token_budgets=%s, " - "max_images_per_batch=%d", + "Budget batching configured: token_budgets=%s, max_images_per_batch=%d", self.token_budgets, self.max_images_per_batch, ) @@ -773,8 +775,9 @@ def capture_budget_graphs(self, vision_encoder: nn.Module) -> None: continue # Synthetic grid: (1, merge, per_image_output * merge) - # Output tokens per image = 1 * (merge/merge) * (per_image_output*merge/merge) - # = 1 * 1 * per_image_output = per_image_output + # Output tokens per image: + # 1 * (merge/merge) * (per_image_output*merge/merge) + # = per_image_output # Total output = max_images * per_image_output = token_budget grid_config = (1, merge, per_image_output * merge) @@ -1753,7 +1756,9 @@ def run_batched_contiguous( cu_seqlens_tensor = torch.tensor( cu_seqlens_list, dtype=torch.int32, device=self.device ) - max_seqlen = max(s for s in sequence_lengths if s > 0) if sequence_lengths else 0 + max_seqlen = ( + max(s for s in sequence_lengths if s > 0) if sequence_lengths else 0 + ) max_seqlen_tensor = torch.tensor(max_seqlen, dtype=torch.int32, device="cpu") sequence_lengths_tensor = torch.tensor( sequence_lengths, dtype=torch.int32, device=self.device @@ -1771,9 +1776,7 @@ def run_batched_contiguous( cu_seqlens_buf[: len(cu_seqlens_list)].copy_( cu_seqlens_tensor, non_blocking=True ) - seq_len_buf[:batch_size].copy_( - sequence_lengths_tensor, non_blocking=True - ) + seq_len_buf[:batch_size].copy_(sequence_lengths_tensor, non_blocking=True) embed_buffers["max_seqlen"].copy_(max_seqlen_tensor, non_blocking=True) # Mark this grid as modified so run() knows to restore cached tensors diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 9d03a7037abe..2de2819b8319 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -310,8 +310,7 @@ def _execute_with_cudagraph( # Fall back to single-image path if len(grid_thw) != 1: logger.debug( - "CUDA graph single-image path: got %d images. " - "Using eager mode.", + "CUDA graph single-image path: got %d images. 
Using eager mode.", len(grid_thw), ) return None @@ -336,14 +335,14 @@ def _execute_with_cudagraph( # Try padded execution if enabled if self.encoder_cudagraph_padded_mode: - result = self.encoder_cudagraph_manager.run_padded( + padded_result = self.encoder_cudagraph_manager.run_padded( pixel_values, grid_thw, num_output_tokens, spatial_merge_size, ) - if result is not None: - output, padding_waste = result + if padded_result is not None: + output, padding_waste = padded_result logger.info( "ViT CUDA graph PADDED: grid=(%d, %d, %d), tokens=%d, waste=%d", t, @@ -456,8 +455,7 @@ def _execute_budget_batch( ) if output is None: logger.debug( - "Budget graph replay failed for key %s, " - "falling back to eager", + "Budget graph replay failed for key %s, falling back to eager", graph_key, ) return None @@ -468,7 +466,7 @@ def _execute_budget_batch( outputs[orig_idx] = output[offset : offset + out_tokens].clone() offset += out_tokens - if self.encoder_cudagraph_manager.verbose: + if manager.verbose: logger.info( "ViT BUDGET BATCH: %d images, %d tokens, graph_key=%s", len(batch), From 96500f8af400d3524066a2b50e97b2eaec08e2bd Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 22:20:26 -0500 Subject: [PATCH 173/189] clean up grid based batching. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 169 ++------------------- vllm/v1/worker/gpu/mm/encoder_runner.py | 119 ++------------- 2 files changed, 25 insertions(+), 263 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 85a96b7caebf..cd252d2e411a 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -6,22 +6,22 @@ This module provides CUDA graph capture and replay functionality for vision encoders to eliminate kernel launch overhead and improve GPU utilization. -Two execution modes: -1. Exact match mode: Replay CUDA graph when input grid_thw exactly matches - a captured configuration. No padding overhead. -2. Padded mode: Pad inputs to fit the smallest captured bucket that can - accommodate them. Enables higher CUDA graph utilization at the cost of - padding compute overhead. - -Padded mode details: -- Padded with zeros: pixel_values, pos_embeds, rotary_pos_emb_cos/sin -- NOT padded (set to actual values): cu_seqlens, max_seqlen -- This ensures flash attention only processes real tokens (via cu_seqlens) -- Output is trimmed to actual size after graph replay +Primary execution mode - Budget Batching: +- Captures CUDA graphs for multiple token budget levels (e.g., [2048, 4096, + 8192, 13824]), each with a fixed max_images_per_batch. +- At runtime, images are sorted smallest-first and greedily packed into + budget-sized batches. The smallest fitting budget graph is selected. +- cu_seqlens is padded to max_images_per_batch + 1 by repeating the last + value, creating zero-length sequences for empty slots (no-op in FA2/FA4). +- Works with any number of images (1 or many) and any grid sizes. + +Legacy modes (used by gpu_model_runner.py): +- Exact match: Replay when grid_thw exactly matches a captured config. +- Padded: Pad inputs to fit the smallest captured bucket. Key design principles: -1. Capture graphs for specific grid_thw configurations -2. Support both exact match and padded execution +1. Capture graphs based on token budgets, not grid sizes +2. Reuse one graph for any batch where total tokens fit the budget 3. Fall back to eager mode when no suitable graph is available 4. 
Track statistics for monitoring and optimization """ @@ -486,18 +486,6 @@ def _get_batch_sizes_from_config(self) -> list[int]: return [1] # Legacy mode: batch_size=1 only return sorted(batch_sizes) - def _grid_to_key(self, grid_thw: list[list[int]]) -> tuple[int, int, int] | None: - """ - Convert a grid_thw list to a hashable key. - - Only supports single-image grids (len(grid_thw) == 1). - Returns None for multi-image batches. - """ - if len(grid_thw) != 1: - return None - t, h, w = grid_thw[0] - return (t, h, w) - def _compute_output_tokens( self, grid_thw: tuple[int, int, int], @@ -1419,135 +1407,6 @@ def run_padded( return trimmed_output, padding_waste - def run_batched( - self, - pixel_values: torch.Tensor, - grid_thw: list[list[int]], - batch_size: int, - ) -> torch.Tensor | None: - """ - Run the vision encoder for a batch of images with the same grid size. - - This is used for grouped batching where multiple images are processed - together with a single CUDA graph replay. - - Args: - pixel_values: Concatenated pixel values [total_patches, patch_channels] - grid_thw: List of [T, H, W] for each image (all must be same grid) - batch_size: Number of images in the batch - - Returns: - Concatenated output tensor for all images, or None if no matching graph - """ - if len(grid_thw) != batch_size: - logger.warning( - "grid_thw length (%d) doesn't match batch_size (%d)", - len(grid_thw), - batch_size, - ) - return None - - # All images must have the same grid - if len(grid_thw) < 1: - return None - base_grid = grid_thw[0] - for grid in grid_thw[1:]: - if grid != base_grid: - logger.warning( - "run_batched requires all images to have same grid, got %s and %s", - base_grid, - grid, - ) - return None - - # Look up the graph for this batch_size and grid - graph_key = self.get_graph_for_grid(grid_thw, batch_size=batch_size) - if graph_key is None: - return None - - # Verify input dimensions match - input_buffer = self.input_buffers[graph_key]["pixel_values"] - if pixel_values.shape != input_buffer.shape: - logger.warning( - "Pixel values shape mismatch: expected %s, got %s. " - "Falling back to eager mode.", - input_buffer.shape, - pixel_values.shape, - ) - self.eager_fallbacks += 1 - return None - - # Verify device and dtype match - if pixel_values.device != input_buffer.device: - logger.warning( - "Device mismatch: expected %s, got %s. Falling back to eager mode.", - input_buffer.device, - pixel_values.device, - ) - self.eager_fallbacks += 1 - return None - - if pixel_values.dtype != input_buffer.dtype: - logger.warning( - "Dtype mismatch: expected %s, got %s. Falling back to eager mode.", - input_buffer.dtype, - pixel_values.dtype, - ) - self.eager_fallbacks += 1 - return None - - # Count images processed, not replay count (for accurate hit rate) - self.cache_hits += batch_size - - # Wait for any previous graph replay to complete before modifying buffers. 
- if not self.is_single_gpu and self.replay_done_event is not None: - self.replay_done_event.synchronize() - - # Ensure contiguous memory layout for safe copy - if not pixel_values.is_contiguous(): - pixel_values = pixel_values.contiguous() - - # Copy input to the captured buffer - input_buffer.copy_(pixel_values, non_blocking=True) - - # For batched exact match, restore cached embeddings if modified - if graph_key in self.modified_grids: - embed_buffers = self.embedding_buffers[graph_key] - cached = self.cached_tensors[graph_key] - embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) - embed_buffers["rotary_pos_emb_cos"].copy_( - cached["rotary_pos_emb_cos"], non_blocking=True - ) - embed_buffers["rotary_pos_emb_sin"].copy_( - cached["rotary_pos_emb_sin"], non_blocking=True - ) - embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) - embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) - embed_buffers["sequence_lengths"].copy_( - cached["sequence_lengths"], non_blocking=True - ) - self.modified_grids.discard(graph_key) - - if self.verbose: - logger.info( - "run_batched(): graph_key=%s, batch_size=%d, input_shape=%s", - graph_key, - batch_size, - pixel_values.shape, - ) - - if self.is_single_gpu: - self.graphs[graph_key].replay() - return self.output_buffers[graph_key] - else: - torch.cuda.current_stream().synchronize() - self.graphs[graph_key].replay() - if self.replay_done_event is None: - self.replay_done_event = torch.cuda.Event() - self.replay_done_event.record() - self.replay_done_event.synchronize() - return self.output_buffers[graph_key].clone() - def run_batched_contiguous( self, pixel_values: torch.Tensor, diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 2de2819b8319..696b26eaf167 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -49,7 +49,7 @@ def __init__( # Encoder CUDA graph manager (optional) self.encoder_cudagraph_manager: EncoderCudaGraphManager | None = None - self.encoder_cudagraph_padded_mode: bool = True + self.encoder_cudagraph_budget_mode: bool = False self._encoder_call_count: int = 0 self._init_encoder_cudagraph_manager() @@ -68,22 +68,10 @@ def _init_encoder_cudagraph_manager(self) -> None: # Import here to avoid circular imports from vllm.v1.worker.gpu.mm.encoder_cudagraph import EncoderCudaGraphManager - bucket_sizes = getattr( - compilation_config, "encoder_cudagraph_bucket_sizes", None - ) - - # Check if padded mode is enabled - self.encoder_cudagraph_padded_mode = getattr( - compilation_config, - "encoder_cudagraph_padded_mode", - True, # Default to padded mode for better CUDA graph utilization - ) - self.encoder_cudagraph_manager = EncoderCudaGraphManager( vllm_config=self.vllm_config, device=self.device, dtype=self.dtype, - bucket_sizes=bucket_sizes, ) # Check if budget batching is configured @@ -92,15 +80,9 @@ def _init_encoder_cudagraph_manager(self) -> None: and self.encoder_cudagraph_manager.max_images_per_batch > 0 ) - # Log configuration - grid_configs = self.encoder_cudagraph_manager.grid_configs logger.info( - "Encoder CUDA graph manager initialized: " - "padded_mode=%s, budget_mode=%s, num_grids=%d, grids=%s", - self.encoder_cudagraph_padded_mode, + "Encoder CUDA graph manager initialized: budget_mode=%s", self.encoder_cudagraph_budget_mode, - len(grid_configs), - grid_configs, ) def capture_encoder_cudagraphs( @@ -178,31 +160,6 @@ def _get_grid_thw_from_kwargs( return grid_thw - 
def _estimate_visual_tokens( - self, - mm_kwargs_group: dict, - modality: str, - ) -> int | None: - """ - Estimate the number of visual tokens for CUDA graph bucket selection. - - Returns None if estimation is not possible. - """ - grid_thw = self._get_grid_thw_from_kwargs(mm_kwargs_group, modality) - if grid_thw is None: - return None - - # Calculate total visual tokens (after spatial merge, assuming 2x2) - # Formula: sum of (T * H/merge * W/merge) for each item - # Note: grid_thw contains [T, H, W] where H and W are already in patch units - spatial_merge_size = 2 # Default for Qwen-VL models - total_tokens = 0 - for t, h, w in grid_thw: - tokens_per_image = t * (h // spatial_merge_size) * (w // spatial_merge_size) - total_tokens += tokens_per_image - - return total_tokens - @torch.inference_mode() def execute_mm_encoder( self, @@ -258,12 +215,11 @@ def _execute_with_cudagraph( num_items: int, ) -> list[torch.Tensor] | None: """ - Execute the encoder using CUDA graphs if a matching graph is available. + Execute the encoder using budget batch CUDA graphs. - Supports three modes: - 1. Budget batching: Pack multiple images into budget-sized graphs - 2. Exact match: Only use CUDA graph if grid_thw exactly matches - 3. Padded mode: Pad inputs to fit the smallest available bucket + Packs images (sorted smallest-first) into budget-sized batches + and replays the smallest fitting CUDA graph. Falls back to eager + if no budget graph fits. Args: model: The multimodal model @@ -277,6 +233,9 @@ def _execute_with_cudagraph( if self.encoder_cudagraph_manager is None: return None + if not self.encoder_cudagraph_budget_mode: + return None + # Extract grid_thw from kwargs grid_thw = self._get_grid_thw_from_kwargs(mm_kwargs_group, modality) if grid_thw is None: @@ -299,65 +258,9 @@ def _execute_with_cudagraph( visual = getattr(model, "visual", None) spatial_merge_size = getattr(visual, "spatial_merge_size", 2) - # Try budget batching for all images (single or multi) - if self.encoder_cudagraph_budget_mode: - result = self._execute_budget_batch( - pixel_values, grid_thw, spatial_merge_size - ) - if result is not None: - return result - - # Fall back to single-image path - if len(grid_thw) != 1: - logger.debug( - "CUDA graph single-image path: got %d images. 
Using eager mode.", - len(grid_thw), - ) - return None - - t, h, w = grid_thw[0] - num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) - - # Try exact match first - grid_key = self.encoder_cudagraph_manager.get_graph_for_grid(grid_thw) - if grid_key is not None: - # Exact match found - try to run - output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) - if output is not None: - logger.info( - "ViT CUDA graph EXACT: grid=(%d, %d, %d), tokens=%d", - t, - h, - w, - num_output_tokens, - ) - return [output[:num_output_tokens]] - - # Try padded execution if enabled - if self.encoder_cudagraph_padded_mode: - padded_result = self.encoder_cudagraph_manager.run_padded( - pixel_values, - grid_thw, - num_output_tokens, - spatial_merge_size, - ) - if padded_result is not None: - output, padding_waste = padded_result - logger.info( - "ViT CUDA graph PADDED: grid=(%d, %d, %d), tokens=%d, waste=%d", - t, - h, - w, - num_output_tokens, - padding_waste, - ) - return [output] - - # No CUDA graph available - logger.info( - "ViT EAGER: grid=(%d, %d, %d), tokens=%d", t, h, w, num_output_tokens + return self._execute_budget_batch( + pixel_values, grid_thw, spatial_merge_size ) - return None def _execute_budget_batch( self, From c16c9ae4e6587cf80dd9a90336e34ceee3015d90 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 22:23:00 -0500 Subject: [PATCH 174/189] format. --- vllm/v1/worker/gpu/mm/encoder_runner.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_runner.py b/vllm/v1/worker/gpu/mm/encoder_runner.py index 696b26eaf167..db090eb164b1 100644 --- a/vllm/v1/worker/gpu/mm/encoder_runner.py +++ b/vllm/v1/worker/gpu/mm/encoder_runner.py @@ -258,9 +258,7 @@ def _execute_with_cudagraph( visual = getattr(model, "visual", None) spatial_merge_size = getattr(visual, "spatial_merge_size", 2) - return self._execute_budget_batch( - pixel_values, grid_thw, spatial_merge_size - ) + return self._execute_budget_batch(pixel_values, grid_thw, spatial_merge_size) def _execute_budget_batch( self, From c37908ad1372851fb433d112e114c21081e21d0f Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 22:34:10 -0500 Subject: [PATCH 175/189] execute budget batching. 
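This mirrors the budget-batching path already added to encoder_runner.py,
wired into _execute_mm_encoder in gpu_model_runner.py. A rough sketch of the
settings involved (illustrative values; how the compilation config is
supplied, and whether cudagraph_mm_encoder is the enabling flag, follows the
flags referenced elsewhere in this series and may differ):

    compilation_config = {
        "cudagraph_mm_encoder": True,  # enable encoder CUDA graphs
        "encoder_cudagraph_token_budgets": [2048, 4096, 8192, 13824],
        "encoder_cudagraph_max_images_per_batch": 8,
    }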
--- vllm/v1/worker/gpu_model_runner.py | 195 ++++++++++++++++++++++++++++- 1 file changed, 191 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a0284184891f..479cf4349896 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -431,6 +431,7 @@ def __init__( self.encoder_cudagraph_padded_mode: bool = True self.encoder_cudagraph_verbose: bool = False self.encoder_cudagraph_one_by_one: bool = True + self.encoder_cudagraph_budget_mode: bool = False # Pre-allocated buffers for piecewise padded mode (lazily initialized) # Key: capture_size (output tokens), Value: dict of buffers self._piecewise_buffers: dict[int, dict[str, torch.Tensor]] = {} @@ -777,14 +778,22 @@ def _init_encoder_cudagraph_manager(self) -> None: verbose=self.encoder_cudagraph_verbose, ) + # Check if budget batching is configured + self.encoder_cudagraph_budget_mode = bool( + self.encoder_cudagraph_manager.token_budgets + and self.encoder_cudagraph_manager.max_images_per_batch > 0 + ) + # Log configuration grid_configs = self.encoder_cudagraph_manager.grid_configs logger.info( "Encoder CUDA graph manager initialized: " - "padded_mode=%s, one_by_one=%s, num_grids=%d, grids=%s, " + "padded_mode=%s, one_by_one=%s, budget_mode=%s, " + "num_grids=%d, grids=%s, " "using dedicated encoder graph pool", self.encoder_cudagraph_padded_mode, self.encoder_cudagraph_one_by_one, + self.encoder_cudagraph_budget_mode, len(grid_configs), grid_configs, ) @@ -2411,10 +2420,29 @@ def _execute_mm_encoder( curr_group_outputs_lst.extend(micro_batch_outputs) curr_group_outputs = curr_group_outputs_lst + elif self.encoder_cudagraph_budget_mode: + # Budget batch mode: replaces grouped batch, one-by-one, + # exact match, and padded modes + budget_result = self._execute_budget_batch( + model, mm_kwargs_group, modality, num_items + ) + if budget_result is not None: + curr_group_outputs = budget_result + else: + # Fall back to eager + if self.encoder_cudagraph_verbose: + logger.info( + "ViT BUDGET BATCH fallback to eager: " + "modality=%s, num_items=%d", + modality, + num_items, + ) + curr_group_outputs = model.embed_multimodal( + **mm_kwargs_group + ) else: - # Try to use CUDA graph if available - # First try grouped batched mode if configured (batch_size > 1) - # Then fall back to one-by-one mode + # Legacy mode: grouped batch -> one-by-one -> single -> + # piecewise -> eager grouped_batched_result = None if ( self.encoder_cudagraph_manager is not None @@ -2825,6 +2853,165 @@ def _execute_with_encoder_cudagraph( ) return None + def _execute_budget_batch( + self, + model: "SupportsMultiModal", + mm_kwargs_group: dict, + modality: str, + num_items: int, + ) -> list[torch.Tensor] | None: + """ + Execute the encoder using budget batch CUDA graphs. + + Sorts images by output token count (smallest first), greedily packs + them into budget-sized batches, and replays the smallest fitting + CUDA graph. Falls back to None (eager) if any batch can't find a + fitting budget graph. 
+ + Args: + model: The multimodal model + mm_kwargs_group: Batched multimodal kwargs + modality: The modality type ("image" or "video") + num_items: Number of items in the batch + + Returns: + List of encoder outputs if CUDA graph was used, None otherwise + """ + manager = self.encoder_cudagraph_manager + if manager is None or not manager.budget_graph_keys: + return None + + if modality not in ("image", "video"): + return None + + # Extract grid_thw + grid_thw = mm_kwargs_group.get("image_grid_thw") + if grid_thw is None: + grid_thw = mm_kwargs_group.get("video_grid_thw") + if grid_thw is None: + return None + + # Convert to list if tensor + if hasattr(grid_thw, "tolist"): + grid_thw = grid_thw.tolist() + + # Extract pixel_values + if modality == "image": + pixel_values = mm_kwargs_group.get("pixel_values") + else: # video + pixel_values = mm_kwargs_group.get("pixel_values_videos") + + if pixel_values is None: + return None + + # Ensure pixel_values is on the correct device + pixel_values = pixel_values.to( + device=self.device, dtype=self.dtype + ).contiguous() + + # Get spatial merge size + visual = getattr(model, "visual", None) + spatial_merge_size = getattr(visual, "spatial_merge_size", 2) + + max_budget = max(manager.budget_graph_keys.keys()) + max_images = manager.max_images_per_batch + + # Compute per-image info: (output_tokens, input_patches, orig_idx) + image_info: list[tuple[int, int, int]] = [] + for i, (t, h, w) in enumerate(grid_thw): + out_tokens = ( + t * (h // spatial_merge_size) * (w // spatial_merge_size) + ) + in_patches = t * h * w + image_info.append((out_tokens, in_patches, i)) + + # Sort by output tokens ascending (small first) + sorted_images = sorted(image_info, key=lambda x: x[0]) + + # Compute pixel_values offsets for each original image + patch_offsets = [0] + for t, h, w in grid_thw: + patch_offsets.append(patch_offsets[-1] + t * h * w) + + # Greedy packing into budget batches + batches: list[list[tuple[int, int, int]]] = [] + current_batch: list[tuple[int, int, int]] = [] + current_tokens = 0 + + for out_tokens, in_patches, orig_idx in sorted_images: + if ( + current_tokens + out_tokens <= max_budget + and len(current_batch) < max_images + ): + current_batch.append((out_tokens, in_patches, orig_idx)) + current_tokens += out_tokens + else: + if current_batch: + batches.append(current_batch) + current_batch = [(out_tokens, in_patches, orig_idx)] + current_tokens = out_tokens + + if current_batch: + batches.append(current_batch) + + # Execute each packed batch + outputs: list[torch.Tensor | None] = [None] * len(grid_thw) + + for batch in batches: + total_out_tokens = sum(out_tok for out_tok, _, _ in batch) + + # Find smallest budget graph that fits + graph_key = manager.find_budget_graph(total_out_tokens) + if graph_key is None: + logger.debug( + "No budget graph for %d tokens, falling back to eager", + total_out_tokens, + ) + return None + + # Concatenate pixel values in sorted order + pv_slices = [] + batch_grids = [] + for _, _, orig_idx in batch: + start = patch_offsets[orig_idx] + end = patch_offsets[orig_idx + 1] + pv_slices.append(pixel_values[start:end]) + batch_grids.append(grid_thw[orig_idx]) + + packed_pv = torch.cat(pv_slices, dim=0) + + # Run the budget graph + output = manager.run_batched_contiguous( + packed_pv, batch_grids, graph_key, spatial_merge_size + ) + if output is None: + logger.debug( + "Budget graph replay failed for key %s, " + "falling back to eager", + graph_key, + ) + return None + + # Split output by per-image output token counts + 
offset = 0 + for out_tokens, _, orig_idx in batch: + outputs[orig_idx] = output[offset:offset + out_tokens].clone() + offset += out_tokens + + if self.encoder_cudagraph_verbose: + logger.info( + "ViT BUDGET BATCH: %d images, %d tokens, graph_key=%s", + len(batch), + total_out_tokens, + graph_key, + ) + + # Check all images were processed + if any(o is None for o in outputs): + return None + + return outputs # type: ignore[return-value] + def _execute_grouped_batched_encoder( self, model: "SupportsMultiModal", From 9595e20e44c444ef4def157917d5f06a085ac68f Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 22:38:07 -0500 Subject: [PATCH 176/189] clean up grid based batching. --- vllm/v1/worker/gpu_model_runner.py | 292 ++--------------------------- 1 file changed, 18 insertions(+), 274 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 479cf4349896..685cb4b32746 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -428,9 +428,7 @@ def __init__( # Encoder CUDA graph manager for ViT self.encoder_cudagraph_manager: EncoderCudaGraphManager | None = None - self.encoder_cudagraph_padded_mode: bool = True self.encoder_cudagraph_verbose: bool = False - self.encoder_cudagraph_one_by_one: bool = True self.encoder_cudagraph_budget_mode: bool = False # Pre-allocated buffers for piecewise padded mode (lazily initialized) # Key: capture_size (output tokens), Value: dict of buffers @@ -734,13 +732,6 @@ def _init_encoder_cudagraph_manager(self) -> None: self.compilation_config, "encoder_cudagraph_bucket_sizes", None ) - # Check if padded mode is enabled - self.encoder_cudagraph_padded_mode = getattr( - self.compilation_config, - "encoder_cudagraph_padded_mode", - True, # Default to padded mode for better CUDA graph utilization - ) - # Check if verbose logging is enabled self.encoder_cudagraph_verbose = getattr( self.compilation_config, @@ -748,20 +739,6 @@ def _init_encoder_cudagraph_manager(self) -> None: False, # Default to quiet mode ) - # Check if one-by-one processing is enabled for multi-image batches - self.encoder_cudagraph_one_by_one = getattr( - self.compilation_config, - "encoder_cudagraph_one_by_one", - True, # Default to one-by-one for higher CUDA graph hit rate - ) - - # Get batch sizes for grouped batched mode - self.encoder_cudagraph_batch_sizes = getattr( - self.compilation_config, - "encoder_cudagraph_batch_sizes", - None, # Default None means legacy mode (batch_size=1 only) - ) - # Create a dedicated graph pool for encoder CUDA graphs # This keeps encoder and decoder graph memory separate for: # 1. 
Better memory isolation and predictability @@ -788,11 +765,8 @@ def _init_encoder_cudagraph_manager(self) -> None: grid_configs = self.encoder_cudagraph_manager.grid_configs logger.info( "Encoder CUDA graph manager initialized: " - "padded_mode=%s, one_by_one=%s, budget_mode=%s, " - "num_grids=%d, grids=%s, " + "budget_mode=%s, num_grids=%d, grids=%s, " "using dedicated encoder graph pool", - self.encoder_cudagraph_padded_mode, - self.encoder_cudagraph_one_by_one, self.encoder_cudagraph_budget_mode, len(grid_configs), grid_configs, @@ -2441,258 +2415,28 @@ def _execute_mm_encoder( **mm_kwargs_group ) else: - # Legacy mode: grouped batch -> one-by-one -> single -> - # piecewise -> eager - grouped_batched_result = None - if ( - self.encoder_cudagraph_manager is not None - and self.encoder_cudagraph_batch_sizes is not None - and num_items > 1 - and modality in ("image", "video") - ): - # Try grouped batched mode - if modality == "image": - batched_pixel_values = mm_kwargs_group.get("pixel_values") - grid_thw_list = mm_kwargs_group.get("image_grid_thw") - else: # video - batched_pixel_values = mm_kwargs_group.get( - "pixel_values_videos" - ) - grid_thw_list = mm_kwargs_group.get("video_grid_thw") - - if batched_pixel_values is not None and grid_thw_list is not None: - if isinstance(grid_thw_list, torch.Tensor): - grid_thw_list = grid_thw_list.tolist() - - # Find largest batch size that fits - target_batch_size = ( - max( - bs - for bs in self.encoder_cudagraph_batch_sizes - if bs <= num_items - ) - if any( - bs <= num_items - for bs in self.encoder_cudagraph_batch_sizes - ) - else None - ) - - if target_batch_size is not None and target_batch_size > 1: - if self.encoder_cudagraph_verbose: - logger.info( - "Trying grouped batch: %d images, target_bs=%d", - num_items, - target_batch_size, - ) - grouped_batched_result = ( - self._execute_grouped_batched_encoder( - model, - batched_pixel_values, - grid_thw_list, - modality, - target_batch_size, - ) - ) - - # Check if grouped batch returned partial results - has_partial_results = grouped_batched_result is not None and any( - r is not None for r in grouped_batched_result - ) - all_complete = grouped_batched_result is not None and all( - r is not None for r in grouped_batched_result + # No budget mode: try piecewise -> eager + piecewise_result = None + piecewise_enabled = ( + self.compilation_config is not None + and getattr( + self.compilation_config, + "encoder_cudagraph_piecewise", + False, + ) ) - if all_complete: - # All images processed by grouped batch - # all_complete ensures no None entries - curr_group_outputs = cast( - list[torch.Tensor], grouped_batched_result + if piecewise_enabled: + piecewise_result = self._execute_encoder_piecewise_padded( + model, mm_kwargs_group, modality ) - elif ( - self.encoder_cudagraph_manager is not None - and self.encoder_cudagraph_one_by_one - and num_items > 1 - and modality in ("image", "video") - ): - # Fall back to one-by-one processing for remaining images - # Process each image individually for CUDA graph support - # Extract batched data and slice per-image to avoid - # re-calling group_mm_kwargs_by_modality overhead - # Note: list may contain None for unprocessed images; - # these will be filled in by one-by-one processing below - if has_partial_results and grouped_batched_result is not None: - curr_group_outputs_lst = cast( - list[torch.Tensor], list(grouped_batched_result) - ) - else: - curr_group_outputs_lst = cast( - list[torch.Tensor], [None] * num_items - ) - # Get batched pixel_values and 
grid_thw - if modality == "image": - batched_pixel_values = mm_kwargs_group.get("pixel_values") - grid_thw_list = mm_kwargs_group.get("image_grid_thw") - grid_key = "image_grid_thw" - pixel_key = "pixel_values" - else: # video - batched_pixel_values = mm_kwargs_group.get( - "pixel_values_videos" - ) - grid_thw_list = mm_kwargs_group.get("video_grid_thw") - grid_key = "video_grid_thw" - pixel_key = "pixel_values_videos" - - if batched_pixel_values is not None and grid_thw_list is not None: - # Convert grid_thw to list if tensor - if isinstance(grid_thw_list, torch.Tensor): - grid_thw_list = grid_thw_list.tolist() - - # Calculate patch boundaries for slicing - patch_offset = 0 - # Count how many need one-by-one processing - num_remaining = sum( - 1 for o in curr_group_outputs_lst if o is None - ) - if self.encoder_cudagraph_verbose and num_remaining > 0: - remaining_grids = [ - grid_thw_list[i] - for i, o in enumerate(curr_group_outputs_lst) - if o is None - ] - logger.info( - "Processing %d remaining images one-at-a-time, " - "grids=%s", - num_remaining, - remaining_grids, - ) - for img_idx, grid_thw in enumerate(grid_thw_list): - t, h, w = grid_thw - num_patches = t * h * w - - # Slice pixel_values for this image - single_pixel_values = batched_pixel_values[ - patch_offset : patch_offset + num_patches - ] - patch_offset += num_patches - - # Skip if already processed by grouped batch - if curr_group_outputs_lst[img_idx] is not None: - continue - - # Build single-image kwargs for CUDA graph (list format) - single_mm_inputs_for_cudagraph = { - pixel_key: single_pixel_values, - grid_key: [grid_thw], - } - - # Try CUDA graph for this single image - single_result = self._execute_with_encoder_cudagraph( - model, - single_mm_inputs_for_cudagraph, - modality, - 1, - ) - if single_result is not None: - curr_group_outputs_lst[img_idx] = single_result[0] - else: - # Fall back to eager for this image - # Model expects grid_thw as CPU tensor (.numpy()) - single_mm_inputs_for_eager = { - pixel_key: single_pixel_values, - grid_key: torch.tensor( - [grid_thw], - dtype=torch.int64, - ), # Keep on CPU - } - single_output = model.embed_multimodal( - **single_mm_inputs_for_eager - ) - curr_group_outputs_lst[img_idx] = single_output[0] - - curr_group_outputs = curr_group_outputs_lst - else: - # Fallback to eager if data extraction fails - curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + if piecewise_result is not None: + curr_group_outputs = piecewise_result else: - # Single item or no CUDA graph manager - try CUDA graph - cudagraph_result = None - if self.encoder_cudagraph_manager is not None: - cudagraph_result = self._execute_with_encoder_cudagraph( - model, mm_kwargs_group, modality, num_items - ) - - if cudagraph_result is not None: - # CUDA graph was used successfully - curr_group_outputs = cudagraph_result - else: - # Try piecewise padded execution if enabled - piecewise_result = None - piecewise_enabled = ( - self.compilation_config is not None - and getattr( - self.compilation_config, - "encoder_cudagraph_piecewise", - False, - ) - and getattr( - self.compilation_config, - "encoder_cudagraph_padded_mode", - True, - ) - ) - - if self.encoder_cudagraph_verbose: - logger.info( - "ViT: cudagraph_result=None, piecewise_enabled=%s " - "(full_cudagraph=%s, piecewise=%s, padded_mode=%s)", - piecewise_enabled, - getattr( - self.compilation_config, - "cudagraph_mm_encoder", - False, - ) - if self.compilation_config - else None, - getattr( - self.compilation_config, - 
"encoder_cudagraph_piecewise", - False, - ) - if self.compilation_config - else None, - getattr( - self.compilation_config, - "encoder_cudagraph_padded_mode", - True, - ) - if self.compilation_config - else None, - ) - - if piecewise_enabled: - piecewise_result = self._execute_encoder_piecewise_padded( - model, mm_kwargs_group, modality - ) - - if piecewise_result is not None: - curr_group_outputs = piecewise_result - else: - # Fall back to non-padded execution. - # Run the encoder. - # `curr_group_outputs` is either of the following: - # 1. A tensor of shape - # (num_items, feature_size, hidden_size) - # in case feature_size is fixed across all - # multimodal items. - # 2. A list or tuple (length: num_items) of tensors, - # each of shape (feature_size, hidden_size) in - # case the feature size is dynamic depending on - # the input multimodal items. - curr_group_outputs = model.embed_multimodal( - **mm_kwargs_group - ) + curr_group_outputs = model.embed_multimodal( + **mm_kwargs_group + ) sanity_check_mm_encoder_outputs( curr_group_outputs, From 49d04f3bd030677ff50eb2929f40854afd8038b8 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 22:38:53 -0500 Subject: [PATCH 177/189] clean up exact and padded. --- vllm/v1/worker/gpu_model_runner.py | 138 ----------------------------- 1 file changed, 138 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 685cb4b32746..364a57980d60 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2459,144 +2459,6 @@ def _execute_mm_encoder( return encoder_outputs - def _execute_with_encoder_cudagraph( - self, - model: "SupportsMultiModal", - mm_kwargs_group: dict, - modality: str, - num_items: int, - skip_padded_mode: bool = False, - ) -> list[torch.Tensor] | None: - """ - Execute the encoder using CUDA graphs if a matching graph is available. - - Supports two modes: - 1. Exact match: Only use CUDA graph if grid_thw exactly matches - 2. Padded mode: Pad inputs to fit the smallest available bucket - - Args: - model: The multimodal model - mm_kwargs_group: Batched multimodal kwargs - modality: The modality type ("image" or "video") - num_items: Number of items in the batch - skip_padded_mode: If True, skip padded mode even if enabled. - Used during one-by-one processing to avoid segfaults. - - Returns: - List of encoder outputs if CUDA graph was used, None otherwise - """ - if self.encoder_cudagraph_manager is None: - return None - - # Only support image/video modalities - if modality not in ("image", "video"): - return None - - # Extract grid_thw from kwargs - grid_thw = mm_kwargs_group.get("image_grid_thw") - if grid_thw is None: - grid_thw = mm_kwargs_group.get("video_grid_thw") - if grid_thw is None: - self.encoder_cudagraph_manager.count_miss() - return None - - # Convert to list if tensor - if hasattr(grid_thw, "tolist"): - grid_thw = grid_thw.tolist() - - # Currently only supports single-image batches for CUDA graph - if len(grid_thw) != 1: - logger.debug( - "Encoder CUDA graph only supports single-image batches, " - "got %d images. Using eager mode.", - len(grid_thw), - ) - self.encoder_cudagraph_manager.count_miss() - return None - - # Extract pixel_values - if modality == "image": - pixel_values = mm_kwargs_group.get("pixel_values") - else: # video - pixel_values = mm_kwargs_group.get("pixel_values_videos") - - if pixel_values is None: - logger.debug("No pixel_values found in kwargs. 
Using eager mode.") - self.encoder_cudagraph_manager.count_miss() - return None - - # Ensure pixel_values is on the correct device and contiguous - # Contiguity is important for CUDA graph replay to avoid memory issues - pixel_values = pixel_values.to( - device=self.device, dtype=self.dtype - ).contiguous() - - # Get spatial merge size for token calculations - visual = getattr(model, "visual", None) - spatial_merge_size = getattr(visual, "spatial_merge_size", 2) - t, h, w = grid_thw[0] - num_output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) - num_input_patches = pixel_values.shape[0] - - # Log the exact size needed for bucket analysis (verbose only) - if self.encoder_cudagraph_verbose: - logger.info( - "ViT input: grid_thw=(%d, %d, %d), input_patches=%d, output_tokens=%d", - t, - h, - w, - num_input_patches, - num_output_tokens, - ) - - # Try exact match first via run() - counts hits internally - output = self.encoder_cudagraph_manager.run(pixel_values, grid_thw) - if output is not None: - if self.encoder_cudagraph_verbose: - logger.info( - "ViT CUDA graph EXACT: grid=(%d, %d, %d), output=%s", - t, - h, - w, - output.shape, - ) - return [output[:num_output_tokens]] - - # Try padded execution if enabled (run_padded counts hits internally) - # Skip padded mode during one-by-one processing to avoid segfaults - if self.encoder_cudagraph_padded_mode and not skip_padded_mode: - result = self.encoder_cudagraph_manager.run_padded( - pixel_values, - grid_thw, - num_output_tokens, - spatial_merge_size, - ) - if result is not None: - output, padding_waste = result - if self.encoder_cudagraph_verbose: - logger.info( - "ViT CUDA graph PADDED: grid=(%d, %d, %d), tokens=%d, waste=%d", - t, - h, - w, - num_output_tokens, - padding_waste, - ) - return [output] - - # No CUDA graph available - count the miss and fall back to eager mode - self.encoder_cudagraph_manager.count_miss() - if self.encoder_cudagraph_verbose: - logger.info( - "ViT EAGER: grid=(%d, %d, %d), tokens=%d (padded_mode=%s)", - t, - h, - w, - num_output_tokens, - self.encoder_cudagraph_padded_mode, - ) - return None - def _execute_budget_batch( self, model: "SupportsMultiModal", From bab71d0027015576f5c038a7f5652d856bb43c24 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 22:39:43 -0500 Subject: [PATCH 178/189] clean up grid based batching. --- vllm/v1/worker/gpu_model_runner.py | 222 ----------------------------- 1 file changed, 222 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 364a57980d60..0460ad5a585a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2618,228 +2618,6 @@ def _execute_budget_batch( return outputs # type: ignore[return-value] - def _execute_grouped_batched_encoder( - self, - model: "SupportsMultiModal", - batched_pixel_values: torch.Tensor, - grid_thw_list: list[list[int]], - modality: str, - target_batch_size: int, - ) -> list[torch.Tensor | None] | None: - """ - Execute encoder using grouped batched CUDA graphs with contiguous packing. - - Groups images by output token count, packs them contiguously (no interleaved - padding), and uses a bucket-based CUDA graph for groups of target_batch_size. - - Memory layout (contiguous packing): - Buffer: [img0][img1][img2][img3][PADDING at end] - cu_seqlens: [0, size0, size0+size1, ..., total_actual] - - This ensures flash attention reads correct data for each sequence, as - cu_seqlens reflects actual boundaries. Padding at end is ignored. 
- - Args: - model: The multimodal model - batched_pixel_values: Concatenated pixel values for all images - grid_thw_list: List of [T, H, W] for each image - modality: "image" or "video" - target_batch_size: Target batch size for grouping (e.g., 4) - - Returns: - List of output tensors (may contain None for unprocessed images), - or None if no images could be processed with batched mode. - Caller should handle None entries by processing them separately. - """ - if self.encoder_cudagraph_manager is None: - return None - - num_images = len(grid_thw_list) - if num_images < target_batch_size: - # Not enough images for a full batch, fall back to other modes - return None - - # Get spatial merge size for output token calculation - visual = getattr(model, "visual", None) - spatial_merge_size = getattr(visual, "spatial_merge_size", 2) - - def compute_output_tokens(grid: list[int]) -> int: - t, h, w = grid - return t * (h // spatial_merge_size) * (w // spatial_merge_size) - - # Sort images by output token count for efficient grouping - # Keep track of original indices for reordering output - indexed_grids = [ - (i, grid, compute_output_tokens(grid)) - for i, grid in enumerate(grid_thw_list) - ] - sorted_grids = sorted(indexed_grids, key=lambda x: x[2]) - - # Calculate patch offsets for slicing pixel_values - patch_offsets = [0] - for grid in grid_thw_list: - t, h, w = grid - patch_offsets.append(patch_offsets[-1] + t * h * w) - - # Calculate how many full batches and remainder - num_full_batches = num_images // target_batch_size - num_grouped = num_full_batches * target_batch_size - num_remainder = num_images - num_grouped - - if self.encoder_cudagraph_verbose: - # Pre-compute padding waste estimate for logging - total_bucket_tokens = 0 - waste_per_batch = [] - - # Assert for mypy (already checked at function start) - assert self.encoder_cudagraph_manager is not None - temp_idx = 0 - while temp_idx + target_batch_size <= num_images: - batch_grids = [ - grid_thw_list[sorted_grids[temp_idx + i][0]] - for i in range(target_batch_size) - ] - max_tokens = max(compute_output_tokens(g) for g in batch_grids) - actual_tokens = sum(compute_output_tokens(g) for g in batch_grids) - - # Find bucket for this batch - graph_key = self.encoder_cudagraph_manager.find_bucket_for_tokens( - max_tokens, spatial_merge_size, batch_size=target_batch_size - ) - if graph_key is not None: - _, t, h, w = graph_key - bucket_tokens = ( - t * (h // spatial_merge_size) * (w // spatial_merge_size) - ) - # Bucket capacity * batch_size vs sum of actual tokens - batch_bucket_total = bucket_tokens * target_batch_size - batch_waste = batch_bucket_total - actual_tokens - total_bucket_tokens += batch_bucket_total - waste_per_batch.append(batch_waste) - temp_idx += target_batch_size - - total_waste = sum(waste_per_batch) if waste_per_batch else 0 - waste_pct = ( - (total_waste / total_bucket_tokens * 100) - if total_bucket_tokens > 0 - else 0.0 - ) - - logger.info( - "Processing %d images: %d in %d group(s) of %d, " - "%d remainder (eager), grids=%s, " - "padding_waste=%d tokens (%.1f%%)", - num_images, - num_grouped, - num_full_batches, - target_batch_size, - num_remainder, - grid_thw_list, - total_waste, - waste_pct, - ) - - outputs = [None] * num_images - processed = 0 - cudagraph_processed = 0 - - # Process full batches - while processed + target_batch_size <= num_images: - # Get the next batch of images (sorted by size) - batch_indices = [ - sorted_grids[processed + i][0] for i in range(target_batch_size) - ] - batch_grids = 
[grid_thw_list[i] for i in batch_indices] - - # Calculate max output tokens needed in this batch - max_output_tokens = max(compute_output_tokens(g) for g in batch_grids) - - # Find a bucket that can fit max_output_tokens for this batch_size - graph_key = self.encoder_cudagraph_manager.find_bucket_for_tokens( - max_output_tokens, spatial_merge_size, batch_size=target_batch_size - ) - if graph_key is None: - # No suitable bucket, skip this batch - if self.encoder_cudagraph_verbose: - logger.info( - " SKIP batch %d: no bucket for max_tokens=%d, grids=%s", - processed // target_batch_size + 1, - max_output_tokens, - batch_grids, - ) - processed += target_batch_size - continue - - # Pack pixel values contiguously (no interleaved padding) - pixels_list = [] - for idx in batch_indices: - start = patch_offsets[idx] - end = patch_offsets[idx + 1] - img_pixels = batched_pixel_values[start:end] - pixels_list.append(img_pixels) - - # Concatenate contiguously - NO padding between images - contiguous_pixels = torch.cat(pixels_list, dim=0) - - # Run batched CUDA graph with contiguous packing - result = self.encoder_cudagraph_manager.run_batched_contiguous( - contiguous_pixels, - batch_grids, - graph_key, - spatial_merge_size=spatial_merge_size, - ) - - if result is not None: - # Extract outputs using cumulative sizes (contiguous layout) - # cu_seqlens = [0, size0, size0+size1, ..., total] - output_offset = 0 - for i, idx in enumerate(batch_indices): - actual_tokens = compute_output_tokens(batch_grids[i]) - outputs[idx] = result[ - output_offset : output_offset + actual_tokens - ].clone() - output_offset += actual_tokens - cudagraph_processed += target_batch_size - - if self.encoder_cudagraph_verbose: - logger.info( - " Group %d: grids=%s, graph_key=%s", - processed // target_batch_size + 1, - batch_grids, - graph_key, - ) - else: - # run_batched_contiguous returned None - log why - if self.encoder_cudagraph_verbose: - total_patches = contiguous_pixels.shape[0] - logger.info( - " FAIL batch %d: run_batched_contiguous returned None, " - "graph_key=%s, total_patches=%d, grids=%s", - processed // target_batch_size + 1, - graph_key, - total_patches, - batch_grids, - ) - - processed += target_batch_size - - # Log summary - num_unprocessed = sum(1 for o in outputs if o is None) - if self.encoder_cudagraph_verbose and num_unprocessed > 0: - logger.info( - "Grouped batch: %d/%d with cudagraph, %d remainder", - cudagraph_processed, - num_images, - num_unprocessed, - ) - - # Return partial results - caller will handle None entries - # Return None only if no images were processed at all - if cudagraph_processed == 0: - return None - - return outputs - def _find_nearest_encoder_capture_size(self, num_tokens: int) -> int | None: """Find the smallest capture size >= num_tokens for piecewise mode. From 444bc6b07513ef95dc6ef2b6d3e108bcb6440acf Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 23:39:31 -0500 Subject: [PATCH 179/189] cache specified common embeds only, not only the fly. 
--- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index cd252d2e411a..f37b3d0b5ed9 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -1552,19 +1552,14 @@ def run_batched_contiguous( rotary_cos_list.append(cached["rotary_pos_emb_cos"]) rotary_sin_list.append(cached["rotary_pos_emb_sin"]) else: - # Cache miss - need to compute (should be rare after warmup) + # Cache miss - compute on-the-fly but don't cache + # (avoids unbounded GPU memory growth at runtime) cache_miss_grids.append(grid_key) if self.vision_encoder is not None: actual_embeds = self.vision_encoder.precompute_for_cudagraph([grid]) pos_embeds_list.append(actual_embeds["pos_embeds"]) rotary_cos_list.append(actual_embeds["rotary_pos_emb_cos"]) rotary_sin_list.append(actual_embeds["rotary_pos_emb_sin"]) - # Cache for future use - self.grid_embedding_cache[grid_key] = { - "pos_embeds": actual_embeds["pos_embeds"], - "rotary_pos_emb_cos": actual_embeds["rotary_pos_emb_cos"], - "rotary_pos_emb_sin": actual_embeds["rotary_pos_emb_sin"], - } else: logger.warning("Grid %s not cached and no vision encoder", grid_key) return None @@ -1572,7 +1567,8 @@ def run_batched_contiguous( if cache_miss_grids and self.verbose: uncached_grids = cache_miss_grids logger.info( - "Embedding cache miss for grids: %s (now cached)", uncached_grids + "Embedding cache miss for grids: %s (computed on-the-fly)", + uncached_grids, ) # Concatenate cached embeddings (just tensor concat, no computation) From 3efffeb823a24835efafa3e84daf759db6b5b74e Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 23:53:13 -0500 Subject: [PATCH 180/189] remove grid configs and bucket sizes. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 240 +-------------------- vllm/v1/worker/gpu_model_runner.py | 21 +- 2 files changed, 5 insertions(+), 256 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index f37b3d0b5ed9..642e30f5c55b 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -47,38 +47,6 @@ # Grid configurations for CUDA graph capture (T, H, W in patch units) # -# Strategy: Prioritize small grids where kernel launch overhead dominates. -# For larger grids, computation time dominates and CUDA graph benefit is minimal. -# -# Grids larger than max_grid_size (default 96) should use padded mode or eager. -CUSTOM_GRID_CONFIGS = [ - # === Tier 1: Very small grids (<=32) === - (1, 16, 16), # 256 patches - (1, 24, 24), # 576 patches - (1, 32, 32), # 1024 patches - # === Tier 2: Small grids (33-50) === - (1, 38, 38), # 1444 patches - (1, 40, 40), # 1600 patches - (1, 42, 42), # 1764 patches - (1, 44, 44), # 1936 patches - (1, 46, 46), # 2116 patches - (1, 50, 50), # 2500 patches - # === Tier 3: Medium-small grids (51-70) === - (1, 56, 56), # 3136 patches - (1, 62, 62), # 3844 patches - (1, 64, 64), # 4096 patches - (1, 68, 68), # 4624 patches - # === Tier 4: Medium grids (71-96) === - (1, 76, 76), # 5776 patches - (1, 80, 80), # 6400 patches - (1, 94, 94), # 8836 patches -] - -# Default bucket sizes for padded mode (creates square grids) -# These cover medium-large grids that are too big for exact match capture -# but still benefit from CUDA graphs via padding. 
-DEFAULT_PADDED_BUCKET_SIZES = [100, 128] - # Top 100 most common grids for embedding cache pre-warming. # Pre-warming these grids at startup avoids cold-start embedding computation # at runtime, eliminating ~20 small kernel launches per grid on first encounter. @@ -213,8 +181,6 @@ def __init__( vllm_config: VllmConfig, device: torch.device, dtype: torch.dtype, - bucket_sizes: list[int] | None = None, - grid_configs: list[tuple[int, int, int]] | None = None, graph_pool: Any | None = None, verbose: bool = False, ): @@ -223,51 +189,7 @@ def __init__( self.dtype = dtype self.verbose = verbose - # Get batch sizes from config (for grouped batched mode) - self.batch_sizes = self._get_batch_sizes_from_config() - - # Get grid configs from config or use defaults (for exact match) - if grid_configs is None: - grid_configs = self._get_grid_configs_from_config() - - # Get bucket sizes from config (for padded mode) - if bucket_sizes is None: - bucket_sizes = self._get_bucket_sizes_from_config() - - # Merge: grid_configs (exact match) + bucket_sizes (padded mode square grids) - # Bucket sizes create square grids (1, size, size) for padded mode - grid_set = set(grid_configs) - for size in bucket_sizes: - grid_set.add((1, size, size)) - - # Filter out grids that are too large to capture efficiently - # Large grids (e.g., 256x256+) consume massive memory (~14+ GiB each) - # and are better served by eager mode or padded execution - max_grid_size = self._get_max_grid_size_from_config() - filtered_grids = [] - skipped_grids = [] - for grid in grid_set: - t, h, w = grid - if h <= max_grid_size and w <= max_grid_size: - filtered_grids.append(grid) - else: - skipped_grids.append(grid) - - if skipped_grids: - top_skipped = sorted( - skipped_grids, key=lambda x: x[1] * x[2], reverse=True - )[:5] - logger.info( - "Skipping %d grids exceeding max_grid_size=%d: %s...", - len(skipped_grids), - max_grid_size, - top_skipped, - ) - - self.grid_configs = filtered_grids - # CUDA graph storage - keyed by (batch_size, t, h, w) tuple - # For legacy mode (batch_sizes=None), key is (1, t, h, w) self.graphs: dict[tuple[int, int, int, int], torch.cuda.CUDAGraph] = {} # Use private pools by default to avoid segfaults with rapid back-to-back # graph replays during one-by-one multi-image processing. @@ -406,86 +328,6 @@ def _read_budget_config(self) -> None: self.max_images_per_batch, ) - def _get_grid_configs_from_config(self) -> list[tuple[int, int, int]]: - """Get encoder grid configurations from config or use defaults.""" - compilation_config = self.vllm_config.compilation_config - if compilation_config is None: - return CUSTOM_GRID_CONFIGS - - # Check for encoder-specific grid config - grid_configs = getattr( - compilation_config, "encoder_cudagraph_grid_configs", None - ) - if grid_configs is not None: - # Handle preset name or custom list - if isinstance(grid_configs, str): - if grid_configs == "custom": - return CUSTOM_GRID_CONFIGS - else: - logger.warning( - "Unknown grid config preset '%s', using 'custom'", - grid_configs, - ) - return CUSTOM_GRID_CONFIGS - return [tuple(cfg) for cfg in grid_configs] - - return CUSTOM_GRID_CONFIGS - - def _get_bucket_sizes_from_config(self) -> list[int]: - """Get encoder CUDA graph bucket sizes from config. - - Bucket sizes enable padded mode for grids that don't have exact matches. - Default buckets (100, 128) cover medium-large grids efficiently. 
- """ - compilation_config = self.vllm_config.compilation_config - if compilation_config is None: - return DEFAULT_PADDED_BUCKET_SIZES - - encoder_sizes = getattr( - compilation_config, "encoder_cudagraph_bucket_sizes", None - ) - return ( - encoder_sizes if encoder_sizes is not None else DEFAULT_PADDED_BUCKET_SIZES - ) - - def _get_max_grid_size_from_config(self) -> int: - """Get maximum grid size for encoder CUDA graph capture. - - Large grids consume massive GPU memory per graph and provide minimal - benefit since computation time dominates over launch overhead. - - Default is 96 to focus memory on small grids where benefit is highest. - Grids larger than this will use padded mode (if buckets configured) or eager. - """ - compilation_config = self.vllm_config.compilation_config - if compilation_config is None: - return 96 # Focus on small grids where benefit is highest - - max_size = getattr( - compilation_config, - "encoder_cudagraph_max_grid_size", - 96, # Default: max 96x96 grids for exact match - ) - return max_size - - def _get_batch_sizes_from_config(self) -> list[int]: - """Get batch sizes for grouped batched CUDA graph capture. - - When set (e.g., [4]), captures graphs for processing multiple images - together with the same grid size. Images are grouped by grid size and - padded to the largest in each group. - - Default is [1] for legacy one-by-one mode. - """ - compilation_config = self.vllm_config.compilation_config - if compilation_config is None: - return [1] - - batch_sizes = getattr(compilation_config, "encoder_cudagraph_batch_sizes", None) - if batch_sizes is None: - return [1] # Legacy mode: batch_size=1 only - return sorted(batch_sizes) - def _compute_output_tokens( self, grid_thw: tuple[int, int, int], @@ -845,91 +687,15 @@ def capture( logger.warning("Encoder CUDA graphs already captured, skipping") return - # Build list of (batch_size, grid_config) combinations to capture - capture_combinations = [] - for batch_size in self.batch_sizes: - for grid_config in self.grid_configs: - capture_combinations.append((batch_size, grid_config)) - - # Log initial memory state - free_mem_before, total_mem = torch.cuda.mem_get_info(self.device) - used_mem_before = total_mem - free_mem_before - logger.info( - "Capturing encoder CUDA graphs for %d combinations " - "(batch_sizes=%s, grids=%d) " - "(GPU memory: %.2f GiB used, %.2f GiB free)", - len(capture_combinations), - self.batch_sizes, - len(self.grid_configs), - used_mem_before / 1024**3, - free_mem_before / 1024**3, - ) - - # Capture from smallest to largest so that common smaller grids are - # captured first. If we run out of memory, only large grids will fail. - capture_combinations = sorted( - capture_combinations, - key=lambda x: x[0] * x[1][0] * x[1][1] * x[1][2], # batch * t * h * w - reverse=False, # Smallest first - ) - - if is_global_first_rank(): - capture_combinations = tqdm( - capture_combinations, desc="Capturing encoder CUDA graphs" - ) - - # Capture each graph. For single-GPU mode, capture directly on current stream - # to avoid stream synchronization overhead at replay time. - # For multi-GPU mode, use graph_capture() context to coordinate with TP/PP. 
- for batch_size, grid_config in capture_combinations: - try: - if self.is_single_gpu: - # Single-GPU: capture on current stream (no separate stream) - self.capture_graph_for_grid( - grid_config, - vision_encoder, - batch_size=batch_size, - ) - else: - # Multi-GPU: use graph_capture() for TP/PP coordination - with graph_capture(device=self.device): - self.capture_graph_for_grid( - grid_config, - vision_encoder, - batch_size=batch_size, - ) - except Exception as e: - logger.warning( - "Failed to capture encoder CUDA graph for " - "batch_size=%d, grid=%s: %s. Will use eager mode.", - batch_size, - grid_config, - e, - ) - - self.captured = True - - # Log final memory state - free_mem_after, _ = torch.cuda.mem_get_info(self.device) - used_mem_after = total_mem - free_mem_after - encoder_graph_mem = used_mem_after - used_mem_before - logger.info( - "Captured %d encoder CUDA graphs (configs: %s). " - "Encoder graph memory: %.2f GiB (GPU: %.2f GiB used, %.2f GiB free)", - len(self.graphs), - sorted(self.graphs.keys()), - encoder_graph_mem / 1024**3, - used_mem_after / 1024**3, - free_mem_after / 1024**3, - ) - # Pre-warm embedding cache for common grids self._prewarm_embedding_cache(vision_encoder) - # Capture budget batch graphs if configured + # Capture budget batch graphs if self.token_budgets and self.max_images_per_batch > 0: self.capture_budget_graphs(vision_encoder) + self.captured = True + def _prewarm_embedding_cache(self, vision_encoder: nn.Module) -> None: """ Pre-warm the embedding cache for common grid configurations. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0460ad5a585a..71e77b245215 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -728,29 +728,18 @@ def _init_encoder_cudagraph_manager(self) -> None: if not getattr(self.compilation_config, "cudagraph_mm_encoder", False): return - bucket_sizes = getattr( - self.compilation_config, "encoder_cudagraph_bucket_sizes", None - ) - - # Check if verbose logging is enabled self.encoder_cudagraph_verbose = getattr( self.compilation_config, "encoder_cudagraph_verbose", - False, # Default to quiet mode + False, ) - # Create a dedicated graph pool for encoder CUDA graphs - # This keeps encoder and decoder graph memory separate for: - # 1. Better memory isolation and predictability - # 2. Independent memory management for each subsystem - # 3. Easier debugging of memory usage encoder_graph_pool = torch.cuda.graph_pool_handle() self.encoder_cudagraph_manager = EncoderCudaGraphManager( vllm_config=self.vllm_config, device=self.device, dtype=self.dtype, - bucket_sizes=bucket_sizes, graph_pool=encoder_graph_pool, verbose=self.encoder_cudagraph_verbose, ) @@ -761,15 +750,9 @@ def _init_encoder_cudagraph_manager(self) -> None: and self.encoder_cudagraph_manager.max_images_per_batch > 0 ) - # Log configuration - grid_configs = self.encoder_cudagraph_manager.grid_configs logger.info( - "Encoder CUDA graph manager initialized: " - "budget_mode=%s, num_grids=%d, grids=%s, " - "using dedicated encoder graph pool", + "Encoder CUDA graph manager initialized: budget_mode=%s", self.encoder_cudagraph_budget_mode, - len(grid_configs), - grid_configs, ) def update_max_model_len(self, max_model_len: int) -> None: From fa4da3c69e56d914f4e2b260e7fd5595d37d57b2 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Thu, 5 Feb 2026 23:56:43 -0500 Subject: [PATCH 181/189] clean up legacy compilation configs. 
--- vllm/config/compilation.py | 51 ++------------------------------------ 1 file changed, 2 insertions(+), 49 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index d8d61dc79d80..5c015cb60823 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -445,57 +445,10 @@ class CompilationConfig: kernel launch overhead. Requires fixed input sizes via bucketing. Experimental feature - use with caution.""" - encoder_cudagraph_bucket_sizes: list[int] | None = None - """Square grid side lengths for padded CUDA graph execution. Each size N - creates a bucket grid (1, N, N). Inputs with max(H, W) <= N are padded to - fit the bucket. Example: [32, 64, 94, 128, 188, 256, 312] captures grids - (1, 32, 32), (1, 64, 64), etc. Used with encoder_cudagraph_padded_mode=True.""" - - encoder_cudagraph_grid_configs: list[tuple[int, int, int]] | str | None = None - """Grid configurations (T, H, W in patch units) for exact-match CUDA graph - capture. Can be a list of tuples or preset "custom" (top 30 most common grids, - 58.9% exact match coverage). If None, uses "custom" as default.""" - - encoder_cudagraph_padded_mode: bool = True - """Whether to use padded execution for encoder CUDA graphs. - When True, inputs smaller than a captured bucket are padded to fit. - Padded: pixel_values, pos_embeds, rotary_embeds (with zeros). - NOT padded: cu_seqlens, max_seqlen (set to actual values so flash - attention only processes real tokens). Output is trimmed to actual size. - When False, only exact grid matches use CUDA graphs.""" - - encoder_cudagraph_max_grid_size: int = 256 - """Maximum grid dimension (H or W) for encoder CUDA graph capture. - Grids with H > max or W > max are skipped to limit GPU memory usage. - Memory scales roughly with H*W: - - 128x128: ~0.8 GiB - - 188x188: ~1.7 GiB - - 256x256: ~3.2 GiB - Set lower (e.g., 128, 188, 218) on memory-constrained systems. - Default 256 captures all grids in CUSTOM_GRID_CONFIGS.""" - encoder_cudagraph_verbose: bool = False """Enable verbose logging for encoder CUDA graph execution. - When True, logs each ViT input size and CUDA graph hit/miss/padded status. - Useful for debugging and analyzing CUDA graph utilization. - When False, only logs summary stats at the end of execution.""" - - encoder_cudagraph_one_by_one: bool = True - """Enable one-by-one image processing for multi-image batches. - When True (default), multi-image batches are processed individually to - maximize CUDA graph hit rate. - When False, multi-image batches are processed together in eager mode, - which may be faster when CUDA graph overhead (sync, memory) outweighs - the kernel launch savings. - Set to False if you observe throughput regression with encoder CUDA graphs.""" - - encoder_cudagraph_batch_sizes: list[int] | None = None - """Batch sizes for grouped batched CUDA graph capture. - When set (e.g., [4]), captures graphs for processing multiple images - together. Images are grouped by similar grid sizes and padded to the - largest grid in each group. Single graph replay for the whole group. - Example: [4] captures batch_size=4 graphs only (1-3 images use eager). - Default None uses legacy one-by-one mode (batch_size=1 per image).""" + When True, logs each ViT input size and CUDA graph hit/miss status. + Useful for debugging and analyzing CUDA graph utilization.""" encoder_cudagraph_token_budgets: list[int] | None = None """List of total output token budget levels for budget batch CUDA graphs. 
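For reference, the `encoder_cudagraph_token_budgets` option retained above drives the budget-batch path that patches 176-178 consolidate (`_execute_budget_batch` plus the budget graphs in `EncoderCudaGraphManager`). A minimal standalone sketch of that packing policy follows; the budget values, the per-batch image limit, and the helper names are illustrative assumptions, not vLLM API. Images are sorted by ViT output-token count, packed greedily until the largest captured budget or the image limit would be exceeded, and each packed batch is matched to the smallest captured budget graph that fits (a batch that fits no budget sends the real path back to eager).

# Illustrative sketch only; constants and helpers below are placeholders.
SPATIAL_MERGE_SIZE = 2
TOKEN_BUDGETS = [1024, 2048, 4096]   # stands in for encoder_cudagraph_token_budgets
MAX_IMAGES_PER_BATCH = 8             # stands in for the per-graph image limit


def output_tokens(grid: tuple[int, int, int]) -> int:
    """ViT output tokens produced for one (T, H, W) patch grid."""
    t, h, w = grid
    return t * (h // SPATIAL_MERGE_SIZE) * (w // SPATIAL_MERGE_SIZE)


def pack_into_budget_batches(grids: list[tuple[int, int, int]]) -> list[list[int]]:
    """Greedily pack image indices, smallest images first, so each batch's
    total output tokens stay within the largest captured budget."""
    max_budget = max(TOKEN_BUDGETS)
    order = sorted(range(len(grids)), key=lambda i: output_tokens(grids[i]))
    batches: list[list[int]] = []
    current: list[int] = []
    current_tokens = 0
    for idx in order:
        tokens = output_tokens(grids[idx])
        if current_tokens + tokens <= max_budget and len(current) < MAX_IMAGES_PER_BATCH:
            current.append(idx)
            current_tokens += tokens
        else:
            if current:
                batches.append(current)
            current = [idx]
            current_tokens = tokens
    if current:
        batches.append(current)
    return batches


def find_budget(total_tokens: int) -> int | None:
    """Smallest captured budget that can hold a packed batch, if any."""
    fitting = [b for b in TOKEN_BUDGETS if b >= total_tokens]
    return min(fitting) if fitting else None


if __name__ == "__main__":
    grids = [(1, 32, 32), (1, 16, 16), (1, 64, 64), (1, 24, 24)]
    for batch in pack_into_budget_batches(grids):
        total = sum(output_tokens(grids[i]) for i in batch)
        print(batch, total, "->", find_budget(total))  # one batch of 4, 1488 tokens -> 2048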
From b3cc204bbc66ea6c5d2144c1dbfeda24aa6fd29b Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 6 Feb 2026 00:17:56 -0500 Subject: [PATCH 182/189] remove embed buffer tracking for padded. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 642e30f5c55b..93c3e5f3034c 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -256,11 +256,6 @@ def __init__( "(TP=1, PP=1, DP=1), using optimized sync scheme" ) - # Track which grids have had their embedding buffers modified by run_padded() - # or run_batched_contiguous(). This allows run() to skip restoring cached - # tensors when not needed. Keys are (batch_size, t, h, w). - self.modified_grids: set[tuple[int, int, int, int]] = set() - # Per-grid embedding cache for batched contiguous mode # Key: (t, h, w), Value: dict with pos_embeds, rotary_cos, rotary_sin # This avoids recomputing embeddings at runtime - just look up and concat From 26d44ffc265c4f9388b06aa810de4c3c0156c2e6 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 6 Feb 2026 00:19:13 -0500 Subject: [PATCH 183/189] clean up grid matching. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 408 --------------------- 1 file changed, 408 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 93c3e5f3034c..99eca5920e91 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -760,414 +760,6 @@ def _compute_embedding_cache_memory(self) -> int: total_bytes += tensor.numel() * tensor.element_size() return total_bytes - def get_graph_for_grid( - self, - grid_thw: list[list[int]], - batch_size: int = 1, - ) -> tuple[int, int, int, int] | None: - """ - Check if a CUDA graph is available for the given grid and batch size. - - Args: - grid_thw: List of [T, H, W] for each image (must all be same grid) - batch_size: Number of images (default 1 for legacy mode) - - Returns: - The graph key (batch_size, t, h, w) if matching graph exists, None otherwise - """ - if len(grid_thw) < 1: - return None - # All images must have the same grid for batched mode - t, h, w = grid_thw[0] - for grid in grid_thw[1:]: - if grid != [t, h, w]: - return None # Mixed grids not supported - key = (batch_size, t, h, w) - return key if key in self.graphs else None - - def find_bucket_for_tokens( - self, - num_tokens: int, - spatial_merge_size: int = 2, - batch_size: int = 1, - ) -> tuple[int, int, int, int] | None: - """ - Find the smallest captured grid that can fit the given token count. - - This enables padded execution where inputs smaller than a bucket - are padded to match the bucket size. 
- - Args: - num_tokens: Number of output tokens needed (per image) - spatial_merge_size: Merge size (default 2) - batch_size: Required batch size (default 1) - - Returns: - Graph key (batch_size, T, H, W) of the best bucket, or None if too large - """ - best_key = None - best_bucket_tokens = float("inf") - - for graph_key in self.graphs: - key_batch_size, t, h, w = graph_key - if key_batch_size != batch_size: - continue # Skip graphs with wrong batch size - grid = (t, h, w) - bucket_tokens = self._compute_output_tokens(grid, spatial_merge_size) - if bucket_tokens >= num_tokens and bucket_tokens < best_bucket_tokens: - best_bucket_tokens = bucket_tokens - best_key = graph_key - - return best_key - - def run( - self, - pixel_values: torch.Tensor, - grid_thw: list[list[int]], - batch_size: int = 1, - ) -> torch.Tensor | None: - """ - Run the vision encoder using a captured CUDA graph if available. - - Args: - pixel_values: Input pixel values [num_patches, patch_channels] - grid_thw: List of [T, H, W] for each image (all must be same grid) - batch_size: Number of images in batch (default 1 for legacy mode) - - Returns: - Vision encoder output tensor if graph was used, None if no matching graph - """ - graph_key = self.get_graph_for_grid(grid_thw, batch_size=batch_size) - - if graph_key is None: - # Don't count miss here - caller may try run_padded() next - return None - - # Verify input dimensions match - input_buffer = self.input_buffers[graph_key]["pixel_values"] - if pixel_values.shape != input_buffer.shape: - logger.warning( - "Pixel values shape mismatch: expected %s, got %s. " - "Falling back to eager mode.", - input_buffer.shape, - pixel_values.shape, - ) - self.eager_fallbacks += 1 - return None - - # Verify device and dtype match - if pixel_values.device != input_buffer.device: - logger.warning( - "Device mismatch: expected %s, got %s. Falling back to eager mode.", - input_buffer.device, - pixel_values.device, - ) - self.eager_fallbacks += 1 - return None - - if pixel_values.dtype != input_buffer.dtype: - logger.warning( - "Dtype mismatch: expected %s, got %s. Falling back to eager mode.", - input_buffer.dtype, - pixel_values.dtype, - ) - self.eager_fallbacks += 1 - return None - - self.cache_hits += 1 - - # Wait for any previous graph replay to complete before modifying buffers. - # For single-GPU mode, this is not needed because everything is on the same - # stream and CUDA guarantees ordering. For multi-GPU mode, we need this - # because the graph runs on a different stream. - if not self.is_single_gpu and self.replay_done_event is not None: - self.replay_done_event.synchronize() - - # Ensure contiguous memory layout for safe copy - if not pixel_values.is_contiguous(): - pixel_values = pixel_values.contiguous() - - # Copy input to the captured buffer (non-blocking for better overlap) - input_buffer.copy_(pixel_values, non_blocking=True) - - # For exact match, restore cached embeddings only if modified by run_padded(). - # This avoids 6 unnecessary tensor copies when only using exact-match mode. 
- if graph_key in self.modified_grids: - embed_buffers = self.embedding_buffers[graph_key] - cached = self.cached_tensors[graph_key] - embed_buffers["pos_embeds"].copy_(cached["pos_embeds"], non_blocking=True) - embed_buffers["rotary_pos_emb_cos"].copy_( - cached["rotary_pos_emb_cos"], non_blocking=True - ) - embed_buffers["rotary_pos_emb_sin"].copy_( - cached["rotary_pos_emb_sin"], non_blocking=True - ) - embed_buffers["cu_seqlens"].copy_(cached["cu_seqlens"], non_blocking=True) - embed_buffers["max_seqlen"].copy_(cached["max_seqlen"], non_blocking=True) - embed_buffers["sequence_lengths"].copy_( - cached["sequence_lengths"], non_blocking=True - ) - self.modified_grids.discard(graph_key) - - if self.verbose: - logger.info( - "run(): graph_key=%s, input_shape=%s, buffer_shape=%s", - graph_key, - pixel_values.shape, - input_buffer.shape, - ) - - if self.is_single_gpu: - # Single-GPU optimized path: graph was captured on current stream, - # so buffer copies and replay are on the same stream - no sync needed. - # Return view directly; caller must use output before next run() call. - self.graphs[graph_key].replay() - return self.output_buffers[graph_key] - else: - # Multi-GPU path: graph was captured on a separate stream. - # Sync current stream before replay to ensure buffer copies complete. - torch.cuda.current_stream().synchronize() - - # Replay the graph - self.graphs[graph_key].replay() - - # Record event after replay for lightweight sync in next call. - if self.replay_done_event is None: - self.replay_done_event = torch.cuda.Event() - self.replay_done_event.record() - - # Sync to ensure output is ready before clone. - self.replay_done_event.synchronize() - - # Return a clone of the output to avoid issues with buffer reuse - return self.output_buffers[graph_key].clone() - - def run_padded( - self, - pixel_values: torch.Tensor, - grid_thw: list[list[int]], - num_output_tokens: int, - spatial_merge_size: int = 2, - ) -> tuple[torch.Tensor, int] | None: - """ - Run the vision encoder with padding to fit a captured bucket. - - This method computes embeddings for the ACTUAL input grid, pads them - to match the bucket size, then replays the CUDA graph. This ensures - correct position embeddings while still benefiting from CUDA graphs. 
- - Args: - pixel_values: Input pixel values [num_patches, patch_channels] - grid_thw: List of [T, H, W] for each image (only single image supported) - num_output_tokens: Expected number of output tokens for the input - spatial_merge_size: Spatial merge size (default 2) - - Returns: - Tuple of (output tensor trimmed to actual size, padding_waste_tokens) - or None if no suitable bucket found - """ - if len(grid_thw) != 1: - logger.debug("Padded mode only supports single-image inputs") - return None - - # Check if vision encoder is available for embedding computation - if self.vision_encoder is None or not hasattr( - self.vision_encoder, "precompute_for_cudagraph" - ): - logger.debug("Vision encoder not available for padded mode") - return None - - # Find the smallest bucket that fits (for batch_size=1) - graph_key = self.find_bucket_for_tokens( - num_output_tokens, spatial_merge_size, batch_size=1 - ) - if graph_key is None: - # Don't count miss here - caller will count it when falling back to eager - # Calculate max available tokens from batch_size=1 graphs (if any) - bs1_tokens = [ - self._compute_output_tokens((t, h, w), spatial_merge_size) - for (bs, t, h, w) in self.graphs - if bs == 1 - ] - max_available = max(bs1_tokens) if bs1_tokens else 0 - logger.debug( - "No bucket found for %d tokens (batch_size=1), max available: %d", - num_output_tokens, - max_available, - ) - return None - - # Check if we have embedding buffers for this bucket - if graph_key not in self.embedding_buffers: - logger.debug("No embedding buffers for bucket %s", graph_key) - return None - - # Extract grid from graph_key for _compute_output_tokens - _, t, h, w = graph_key - bucket_tokens = self._compute_output_tokens((t, h, w), spatial_merge_size) - padding_waste = bucket_tokens - num_output_tokens - - # Get the input buffer for this bucket - input_buffer = self.input_buffers[graph_key]["pixel_values"] - num_input_patches = pixel_values.shape[0] - bucket_input_patches = input_buffer.shape[0] - - if num_input_patches > bucket_input_patches: - logger.warning( - "Input patches (%d) exceed bucket capacity (%d). " - "This shouldn't happen.", - num_input_patches, - bucket_input_patches, - ) - self.eager_fallbacks += 1 - return None - - # Verify device and dtype match - if pixel_values.device != input_buffer.device: - logger.warning( - "Device mismatch: expected %s, got %s. Falling back to eager mode.", - input_buffer.device, - pixel_values.device, - ) - self.eager_fallbacks += 1 - return None - - if pixel_values.dtype != input_buffer.dtype: - logger.warning( - "Dtype mismatch: expected %s, got %s. Falling back to eager mode.", - input_buffer.dtype, - pixel_values.dtype, - ) - self.eager_fallbacks += 1 - return None - - # Ensure contiguous memory layout for safe copy - if not pixel_values.is_contiguous(): - pixel_values = pixel_values.contiguous() - - self.cache_hits += 1 - - # Wait for any previous graph replay to complete before modifying buffers. - # For single-GPU mode, this is not needed because everything is on the same - # stream and CUDA guarantees ordering. 
- if not self.is_single_gpu and self.replay_done_event is not None: - self.replay_done_event.synchronize() - - # Look up cached embeddings for this grid, or compute if not cached - t, h, w = grid_thw[0] - grid_key = (t, h, w) - - if grid_key in self.grid_embedding_cache: - # Use cached embeddings (fast path - no computation) - cached = self.grid_embedding_cache[grid_key] - pos_embeds = cached["pos_embeds"] - rotary_cos = cached["rotary_pos_emb_cos"] - rotary_sin = cached["rotary_pos_emb_sin"] - else: - # Cache miss - compute on-the-fly but do NOT cache at runtime. - # Caching at runtime causes OOM due to unbounded memory growth. - # Only pre-warmed grids from EMBEDDING_WARMUP_GRIDS are cached. - if self.vision_encoder is None: - logger.warning("Grid %s not cached and no vision encoder", grid_key) - return None - actual_embeds = self.vision_encoder.precompute_for_cudagraph(grid_thw) - pos_embeds = actual_embeds["pos_embeds"] - rotary_cos = actual_embeds["rotary_pos_emb_cos"] - rotary_sin = actual_embeds["rotary_pos_emb_sin"] - if self.verbose: - logger.info( - "Embedding cache miss for grid %s (computed on-the-fly)", grid_key - ) - - # Get embedding buffers for the bucket - embed_buffers = self.embedding_buffers[graph_key] - - # Zero the buffers first (for clean padding) - input_buffer.zero_() - embed_buffers["pos_embeds"].zero_() - embed_buffers["rotary_pos_emb_cos"].zero_() - embed_buffers["rotary_pos_emb_sin"].zero_() - - # Copy actual pixel values to the beginning of the buffer - input_buffer[:num_input_patches].copy_(pixel_values, non_blocking=True) - - # Copy cached/computed embeddings to the beginning of the buffers - actual_num_patches = pos_embeds.shape[0] - embed_buffers["pos_embeds"][:actual_num_patches].copy_( - pos_embeds, non_blocking=True - ) - embed_buffers["rotary_pos_emb_cos"][:actual_num_patches].copy_( - rotary_cos, non_blocking=True - ) - embed_buffers["rotary_pos_emb_sin"][:actual_num_patches].copy_( - rotary_sin, non_blocking=True - ) - - # Compute cu_seqlens for single image (simple: [0, num_output_tokens]) - cu_seqlens = torch.tensor( - [0, num_output_tokens], dtype=torch.int32, device=self.device - ) - max_seqlen = torch.tensor(num_output_tokens, dtype=torch.int32, device="cpu") - sequence_lengths = torch.tensor( - [num_output_tokens], dtype=torch.int32, device=self.device - ) - - # Update cu_seqlens and max_seqlen to actual values - embed_buffers["cu_seqlens"][:2].copy_(cu_seqlens, non_blocking=True) - embed_buffers["max_seqlen"].copy_(max_seqlen, non_blocking=True) - embed_buffers["sequence_lengths"][:1].copy_(sequence_lengths, non_blocking=True) - - # Mark this grid as modified so run() knows to restore cached tensors - self.modified_grids.add(graph_key) - - if self.verbose: - logger.info( - "run_padded(): graph_key=%s, actual_grid=%s, " - "input_patches=%d, bucket_patches=%d", - graph_key, - grid_thw[0], - num_input_patches, - bucket_input_patches, - ) - - if self.is_single_gpu: - # Single-GPU optimized path: graph was captured on current stream, - # so buffer modifications and replay are on same stream - no sync needed. - # Return view directly; caller must use output before next run() call. - self.graphs[graph_key].replay() - full_output = self.output_buffers[graph_key] - trimmed_output = full_output[:num_output_tokens] - else: - # Multi-GPU path: graph was captured on a separate stream. - # Sync current stream before replay to ensure buffer modifications complete. 
- torch.cuda.current_stream().synchronize() - - # Replay the graph with updated embedding buffers - self.graphs[graph_key].replay() - - # Record event after replay for lightweight sync in next call. - if self.replay_done_event is None: - self.replay_done_event = torch.cuda.Event() - self.replay_done_event.record() - - # Sync to ensure output is ready before clone. - self.replay_done_event.synchronize() - - # Get output and trim to actual size - full_output = self.output_buffers[graph_key] - trimmed_output = full_output[:num_output_tokens].clone() - - if self.verbose: - logger.debug( - "Padded execution: %d -> %d tokens (waste: %d, %.1f%%)", - num_output_tokens, - bucket_tokens, - padding_waste, - padding_waste / bucket_tokens * 100, - ) - - return trimmed_output, padding_waste - def run_batched_contiguous( self, pixel_values: torch.Tensor, From e0d3af86bf25e38c7519ce9be4341f82622e8581 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 6 Feb 2026 00:20:25 -0500 Subject: [PATCH 184/189] remove buffer for exact match. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 25 ---------------------- 1 file changed, 25 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 99eca5920e91..66bc059630c8 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -210,12 +210,6 @@ def __init__( self.input_buffers: dict[tuple[int, int, int, int], dict[str, Any]] = {} self.output_buffers: dict[tuple[int, int, int, int], torch.Tensor] = {} - # Cached pre-computed tensors for CUDA graph replay (exact match mode) - # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary, etc. - self.cached_tensors: dict[ - tuple[int, int, int, int], dict[str, torch.Tensor] - ] = {} - # Input buffers for embeddings (padded mode with runtime computation) # Key: (batch_size, t, h, w), Value: dict with pos_embeds, rotary, cu_seqlens self.embedding_buffers: dict[ @@ -455,15 +449,7 @@ def capture_graph_for_grid( ) and hasattr(vision_encoder, "precompute_for_cudagraph") if has_cudagraph_forward: - # Pre-compute tensors for the batched grid (used for exact match mode) cached = vision_encoder.precompute_for_cudagraph(grid_thw) - self.cached_tensors[graph_key] = cached - logger.debug( - "Pre-computed cached tensors for key %s: pos_embeds=%s, cu_seqlens=%s", - graph_key, - cached["pos_embeds"].shape, - cached["cu_seqlens"].shape, - ) # Cache per-grid embeddings for batched contiguous mode # This avoids recomputing embeddings at runtime - just lookup and concat @@ -987,9 +973,6 @@ def run_batched_contiguous( seq_len_buf[:batch_size].copy_(sequence_lengths_tensor, non_blocking=True) embed_buffers["max_seqlen"].copy_(max_seqlen_tensor, non_blocking=True) - # Mark this grid as modified so run() knows to restore cached tensors - self.modified_grids.add(graph_key) - if self.verbose: logger.info( "run_batched_contiguous(): graph_key=%s, grids=%s, " @@ -1013,14 +996,6 @@ def run_batched_contiguous( self.replay_done_event.synchronize() return self.output_buffers[graph_key].clone() - def count_miss(self) -> None: - """Count when falling back to eager mode. - - This should be called by the caller when neither run() nor run_padded() - succeeded and eager execution is used. - """ - self.eager_fallbacks += 1 - def get_stats(self, verbose: bool = True) -> dict[str, Any]: """Get and optionally log cache statistics. 
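The remaining patches ([PATCH 187/189] "fix cu_seqlen to base on inputs" in particular) rest on one invariant of the contiguous packing in `run_batched_contiguous`: cu_seqlens must mark the real sequence boundaries in patch space, while the padded tail of the buffer hides behind zero-length sequences. A minimal sketch of that layout follows, with made-up grid values and batch size: one sequence of h * w patches per temporal frame (matching np.repeat(h * w, t) in the eager path), cumulative offsets for the real sequences, and the array padded to batch_size + 1 with the last offset so empty graph slots are no-ops for flash attention. The real code falls back to eager, rather than raising, when a batch carries more frames than the captured graph supports.

# Illustrative sketch only; values are made up for demonstration.
from itertools import accumulate


def packed_cu_seqlens(grids: list[tuple[int, int, int]], batch_size: int) -> list[int]:
    seq_lens: list[int] = []
    for t, h, w in grids:
        # One attention sequence per temporal frame, each h * w patches long.
        seq_lens.extend([h * w] * t)
    if len(seq_lens) > batch_size:
        raise ValueError("more sequences than the captured graph supports")
    cu_seqlens = [0, *accumulate(seq_lens)]
    # Repeat the last offset so unused slots become zero-length sequences.
    cu_seqlens += [cu_seqlens[-1]] * (batch_size + 1 - len(cu_seqlens))
    return cu_seqlens


if __name__ == "__main__":
    # Two images packed into a graph captured for batch_size=4.
    print(packed_cu_seqlens([(1, 16, 16), (1, 24, 24)], batch_size=4))
    # -> [0, 256, 832, 832, 832]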
From 26b028d2d0553e9485f32c5f16178a04f9c4ed3a Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 6 Feb 2026 00:45:38 -0500 Subject: [PATCH 185/189] add graph budgets in log. --- vllm/v1/worker/gpu_model_runner.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 71e77b245215..0a65691726b5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2588,10 +2588,19 @@ def _execute_budget_batch( offset += out_tokens if self.encoder_cudagraph_verbose: + bs, gt, gh, gw = graph_key + budget_tokens = ( + bs * gt * (gh // spatial_merge_size) + * (gw // spatial_merge_size) + ) logger.info( - "ViT BUDGET BATCH: %d images, %d tokens, graph_key=%s", + "ViT BUDGET BATCH: %d images, %d tokens, " + "budget=%d, waste=%.1f%%, graph_key=%s", len(batch), total_out_tokens, + budget_tokens, + (budget_tokens - total_out_tokens) + / budget_tokens * 100, graph_key, ) From 37869ee79dab368eaa1230bcd38a964d747bd265 Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 6 Feb 2026 01:46:02 -0500 Subject: [PATCH 186/189] format. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 3 +- vllm/v1/worker/gpu_model_runner.py | 34 +++++++--------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 66bc059630c8..36eed7e987c5 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -33,10 +33,9 @@ import torch import torch.nn as nn -from tqdm import tqdm from vllm.config import VllmConfig -from vllm.distributed.parallel_state import graph_capture, is_global_first_rank +from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context from vllm.logger import init_logger diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0a65691726b5..0fa125283268 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2394,19 +2394,14 @@ def _execute_mm_encoder( modality, num_items, ) - curr_group_outputs = model.embed_multimodal( - **mm_kwargs_group - ) + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) else: # No budget mode: try piecewise -> eager piecewise_result = None - piecewise_enabled = ( - self.compilation_config is not None - and getattr( - self.compilation_config, - "encoder_cudagraph_piecewise", - False, - ) + piecewise_enabled = self.compilation_config is not None and getattr( + self.compilation_config, + "encoder_cudagraph_piecewise", + False, ) if piecewise_enabled: @@ -2417,9 +2412,7 @@ def _execute_mm_encoder( if piecewise_result is not None: curr_group_outputs = piecewise_result else: - curr_group_outputs = model.embed_multimodal( - **mm_kwargs_group - ) + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -2508,9 +2501,7 @@ def _execute_budget_batch( # Compute per-image info: (output_tokens, input_patches, orig_idx) image_info: list[tuple[int, int, int]] = [] for i, (t, h, w) in enumerate(grid_thw): - out_tokens = ( - t * (h // spatial_merge_size) * (w // spatial_merge_size) - ) + out_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) in_patches = t * h * w image_info.append((out_tokens, in_patches, i)) @@ -2575,8 +2566,7 @@ def _execute_budget_batch( ) if output is None: logger.debug( - "Budget graph replay failed 
for key %s, " - "falling back to eager", + "Budget graph replay failed for key %s, falling back to eager", graph_key, ) return None @@ -2584,14 +2574,13 @@ def _execute_budget_batch( # Split output by per-image output token counts offset = 0 for out_tokens, _, orig_idx in batch: - outputs[orig_idx] = output[offset:offset + out_tokens].clone() + outputs[orig_idx] = output[offset : offset + out_tokens].clone() offset += out_tokens if self.encoder_cudagraph_verbose: bs, gt, gh, gw = graph_key budget_tokens = ( - bs * gt * (gh // spatial_merge_size) - * (gw // spatial_merge_size) + bs * gt * (gh // spatial_merge_size) * (gw // spatial_merge_size) ) logger.info( "ViT BUDGET BATCH: %d images, %d tokens, " @@ -2599,8 +2588,7 @@ def _execute_budget_batch( len(batch), total_out_tokens, budget_tokens, - (budget_tokens - total_out_tokens) - / budget_tokens * 100, + (budget_tokens - total_out_tokens) / budget_tokens * 100, graph_key, ) From 258f7631c5a22d5896c99aa2b6ecb1c6d8b2c75e Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 6 Feb 2026 03:24:28 -0500 Subject: [PATCH 187/189] fix cu_seqlen to base on inputs. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 36eed7e987c5..f7eff41ebad5 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -880,8 +880,10 @@ def run_batched_contiguous( for grid in grid_thw_list: t, h, w = grid grid_key = (t, h, w) - output_tokens = t * (h // spatial_merge_size) * (w // spatial_merge_size) - sequence_lengths.append(output_tokens) + # Each temporal frame is a separate attention sequence in patch space. + # This matches the eager path: np.repeat(h*w, t) per image. + for _ in range(t): + sequence_lengths.append(h * w) # Try to use cached embeddings (populated during graph capture) if grid_key in self.grid_embedding_cache: @@ -935,6 +937,17 @@ def run_batched_contiguous( # For budget graphs: pad cu_seqlens to batch_size + 1 by repeating # the last value. This creates zero-length sequences for empty slots # that flash attention skips (no-op). + # Note: num_sequences = sum(t_i) for all images. For images (t=1), + # this equals num_images <= batch_size. For videos (t>1), it could + # exceed batch_size — fall back to eager in that case. + if is_budget_graph and len(sequence_lengths) > batch_size: + logger.debug( + "Too many sequences (%d) for budget graph batch_size (%d), " + "falling back to eager", + len(sequence_lengths), + batch_size, + ) + return None if is_budget_graph and len(cu_seqlens_list) < batch_size + 1: last_val = cu_seqlens_list[-1] while len(cu_seqlens_list) < batch_size + 1: From 7b03baf75d61cda14c54072fc726e42e7e70887d Mon Sep 17 00:00:00 2001 From: Baorun Mu Date: Fri, 6 Feb 2026 04:58:01 -0500 Subject: [PATCH 188/189] clean up. --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 25 ++-------------------- vllm/v1/worker/gpu_model_runner.py | 6 ------ 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index f7eff41ebad5..5a60888b88d4 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -15,10 +15,6 @@ value, creating zero-length sequences for empty slots (no-op in FA2/FA4). - Works with any number of images (1 or many) and any grid sizes. 
-Legacy modes (used by gpu_model_runner.py): -- Exact match: Replay when grid_thw exactly matches a captured config. -- Padded: Pad inputs to fit the smallest captured bucket. - Key design principles: 1. Capture graphs based on token budgets, not grid sizes 2. Reuse one graph for any batch where total tokens fit the budget @@ -29,7 +25,7 @@ from __future__ import annotations from collections.abc import Callable -from typing import TYPE_CHECKING, Any +from typing import Any import torch import torch.nn as nn @@ -39,9 +35,6 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger -if TYPE_CHECKING: - pass - logger = init_logger(__name__) # Grid configurations for CUDA graph capture (T, H, W in patch units) @@ -215,9 +208,6 @@ def __init__( tuple[int, int, int, int], dict[str, torch.Tensor] ] = {} - # Store metadata about captured graphs - self.captured_metadata: dict[tuple[int, int, int, int], dict[str, Any]] = {} - # Vision encoder reference for runtime embedding computation (set at capture) self.vision_encoder = None @@ -429,16 +419,6 @@ def capture_graph_for_grid( "grid_thw": grid_thw, } - # Store metadata - self.captured_metadata[graph_key] = { - "num_output_tokens": dummy_inputs["num_output_tokens"], - "num_output_tokens_per_image": dummy_inputs["num_output_tokens_per_image"], - "num_pixel_patches": dummy_inputs["num_pixel_patches"], - "num_pixel_patches_per_image": dummy_inputs["num_pixel_patches_per_image"], - "patch_input_channels": dummy_inputs["patch_input_channels"], - "batch_size": batch_size, - } - # Store vision encoder reference for runtime embedding computation self.vision_encoder = vision_encoder @@ -905,10 +885,9 @@ def run_batched_contiguous( return None if cache_miss_grids and self.verbose: - uncached_grids = cache_miss_grids logger.info( "Embedding cache miss for grids: %s (computed on-the-fly)", - uncached_grids, + cache_miss_grids, ) # Concatenate cached embeddings (just tensor concat, no computation) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0fa125283268..1b76c7836727 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -728,12 +728,6 @@ def _init_encoder_cudagraph_manager(self) -> None: if not getattr(self.compilation_config, "cudagraph_mm_encoder", False): return - self.encoder_cudagraph_verbose = getattr( - self.compilation_config, - "encoder_cudagraph_verbose", - False, - ) - encoder_graph_pool = torch.cuda.graph_pool_handle() self.encoder_cudagraph_manager = EncoderCudaGraphManager( From c87c22fa03a41d835eb1f0d133b967e413e68301 Mon Sep 17 00:00:00 2001 From: Max Hu Date: Fri, 6 Feb 2026 13:22:36 -0800 Subject: [PATCH 189/189] fix fi Signed-off-by: Max Hu --- vllm/v1/worker/gpu/mm/encoder_cudagraph.py | 36 ++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py index 5a60888b88d4..0beb8d7ace68 100644 --- a/vllm/v1/worker/gpu/mm/encoder_cudagraph.py +++ b/vllm/v1/worker/gpu/mm/encoder_cudagraph.py @@ -31,9 +31,13 @@ import torch.nn as nn from vllm.config import VllmConfig -from vllm.distributed.parallel_state import graph_capture +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_world_size, + graph_capture, +) from vllm.forward_context import set_forward_context from vllm.logger import init_logger +from vllm.v1.attention.backends.registry import AttentionBackendEnum logger = init_logger(__name__) @@ 
-797,7 +801,6 @@ def run_batched_contiguous( input_buffer = self.input_buffers[graph_key]["pixel_values"] actual_input_patches = pixel_values.shape[0] bucket_input_patches = input_buffer.shape[0] - if actual_input_patches > bucket_input_patches: logger.warning( "Input patches (%d) exceed bucket capacity (%d).", @@ -927,10 +930,39 @@ def run_batched_contiguous( batch_size, ) return None + + is_flashinfer = ( + self.vllm_config.model_config.multimodal_config.mm_encoder_attn_backend + == AttentionBackendEnum.FLASHINFER + ) if is_budget_graph and len(cu_seqlens_list) < batch_size + 1: last_val = cu_seqlens_list[-1] while len(cu_seqlens_list) < batch_size + 1: cu_seqlens_list.append(last_val) + if is_flashinfer: + hidden_size = ( + self.vllm_config.model_config.hf_config.vision_config.hidden_size + ) + use_data_parallel = ( + self.vllm_config.model_config.multimodal_config.mm_encoder_tp_mode + == "data" + if self.vllm_config.model_config.multimodal_config + else False + ) + tp_size = ( + 1 if use_data_parallel else get_tensor_model_parallel_world_size() + ) + scale = hidden_size // tp_size + cu_seqlens_qk = [ + cu_seqlens_list[i] * scale * 2 for i in range(len(cu_seqlens_list)) + ] + cu_seqlens_v = [ + cu_seqlens_list[i] * scale * 3 for i in range(len(cu_seqlens_list)) + ] + cu_seqlens_o = [ + cu_seqlens_list[i] * scale for i in range(len(cu_seqlens_list)) + ] + cu_seqlens_list = cu_seqlens_qk + cu_seqlens_v + cu_seqlens_o # For budget graphs: pad sequence_lengths with zeros for empty slots if is_budget_graph and len(sequence_lengths) < batch_size:
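
For reference, the sequence layout that PATCH 187 fixes and the budget-graph padding used above can be reproduced in isolation. The sketch below (hypothetical helper name build_budget_cu_seqlens, not part of the series) assumes the same rules: one attention sequence of h*w patches per temporal frame, eager fallback when the frames outnumber the captured slots, and cu_seqlens padded to batch_size + 1 by repeating the last offset so unused slots become zero-length sequences.

    from itertools import accumulate


    def build_budget_cu_seqlens(
        grid_thw: list[tuple[int, int, int]],  # (t, h, w) per image, in patch units
        batch_size: int,  # sequence slots the captured budget graph expects
    ) -> list[int] | None:
        # One attention sequence per temporal frame, matching the eager
        # path's np.repeat(h * w, t) per image.
        seq_lens = [h * w for t, h, w in grid_thw for _ in range(t)]
        if len(seq_lens) > batch_size:
            # More frames than captured slots (e.g. long videos): the caller
            # should fall back to the eager encoder instead of replaying.
            return None
        cu_seqlens = [0, *accumulate(seq_lens)]
        while len(cu_seqlens) < batch_size + 1:
            # Zero-length sequences for empty slots; flash attention
            # treats them as no-ops.
            cu_seqlens.append(cu_seqlens[-1])
        return cu_seqlens


    # Example: two images of 1x32x32 and 1x16x16 patches in a 4-slot budget
    # graph -> [0, 1024, 1280, 1280, 1280]; the last two slots are no-ops.
    print(build_budget_cu_seqlens([(1, 32, 32), (1, 16, 16)], batch_size=4))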