diff --git a/docker/Dockerfile b/docker/Dockerfile index 9064ea51632b..602ed0f70e69 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -588,6 +588,15 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm # Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override. ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH} +# Install Transformer Engine for FP8 attention support in multimodal encoder +RUN --mount=type=cache,target=/root/.cache/uv \ + apt-get update -y && \ + apt-get install -y --no-install-recommends cmake cuda-toolkit-13-0 libcudnn9-dev-cuda-13 && \ + rm -rf /var/lib/apt/lists/* && \ + uv pip install --system pybind11 && \ + NVTE_FRAMEWORK=pytorch uv pip install --system --no-build-isolation \ + git+https://github.com/NVIDIA/TransformerEngine.git@stable + # Copy examples and benchmarks at the end to minimize cache invalidation COPY examples examples COPY benchmarks benchmarks diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 28d83776ebe5..0c4c5524e094 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -19,6 +19,22 @@ logger = init_logger(__name__) +# Seqlen buckets for BSHD format - Q/K/V tensors are padded to these sizes +# so cuDNN sees a fixed set of tensor shapes and avoids recompilation +TE_SEQLEN_BUCKETS = [1024, 2048, 3072, 4096, 5120, 6144, 7168, 9216, 10240, 13312, 16384, 20480, 25600, 35840, 49152, 65536] + +# Fixed max_seqlen to avoid cuDNN recompilation when sequence lengths vary +TE_FIXED_MAX_SEQLEN = 128 * 1024 + +try: + from transformer_engine.common.recipe import DelayedScaling + from transformer_engine.pytorch import DotProductAttention, fp8_autocast +except ImportError: + DotProductAttention = None + fp8_autocast = None + DelayedScaling = None + logger.warning("TransformerEngine is not installed.") + # --8<-- [start:mm_encoder_attn] @CustomOp.register("mm_encoder_attn") @@ -88,6 +104,24 @@ def __init__( get_flash_attn_version() if self.is_flash_attn_backend else None ) + # Initialize Transformer Engine FP8 attention if backend is TE + # for each batch size + self.te_attn_op = None + self.te_fp8_recipe = None + self.is_te_fp8_backend = ( + self.attn_backend == AttentionBackendEnum.TE_FP8 + if hasattr(AttentionBackendEnum, 'TE_FP8') + else False + ) + + if self.is_te_fp8_backend: + if DotProductAttention is None: + raise ImportError( + "TransformerEngine is not installed but TE_FP8 backend was selected" + ) + self.te_fp8_recipe = DelayedScaling(fp8_dpa=True, fp8_mha=True) + logger.info_once("Initialized FP8 Transformer Engine for MMEncoderAttention.") + logger.info_once(f"Using {self.attn_backend} for MMEncoderAttention.") @classmethod @@ -118,6 +152,37 @@ def maybe_reshape_qkv_to_4d( return query, key, value + @staticmethod + def _find_seqlen_bucket(seqlen: int) -> int | None: + """Find the smallest seqlen bucket that can fit the given seqlen. + + Returns None if seqlen exceeds the largest bucket. 
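+
+        For example, seqlen=1500 maps to the 2048 bucket, while
+        seqlen=70000 exceeds the largest bucket (65536) and yields None,
+        in which case the caller skips bucket padding.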
+ """ + for bucket in TE_SEQLEN_BUCKETS: + if bucket >= seqlen: + return bucket + return None + + def _lazy_init_te_attn( + self, + num_attention_heads: int, + kv_channels: int, + num_gqa_groups: int | None, + attn_mask_type: str, + softmax_scale: float | None, + qkv_format: str = "bshd", + ) -> None: + """Lazily initialize Transformer Engine attention operator.""" + if self.te_attn_op is None: + self.te_attn_op = DotProductAttention( + num_attention_heads, + kv_channels, + num_gqa_groups=num_gqa_groups, + attn_mask_type=attn_mask_type, + softmax_scale=softmax_scale, + qkv_format=qkv_format, + ) + def _forward_sdpa( self, query: torch.Tensor, @@ -187,6 +252,151 @@ def _forward_fa( output = output.reshape(bsz, q_len, -1) return output + def _forward_te_fp8( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, + ) -> torch.Tensor: + """Forward pass using Transformer Engine FP8 attention with BSHD format. + + IMPORTANT: This function processes ONE sample at a time. When cu_seqlens + is provided, it must have length 2 (i.e., [0, seq_len] for a single + sequence). + + This batch-1 restriction is a limitation of Transformer Engine, not + cuDNN. TE does not support THD format for FP8 attention, and converting + the upstream THD tensor into a proper multi-batch BSHD tensor would be + too expensive. Instead, we manually reinterpret a batch-1 THD tensor as + BSHD with B=1 and S=T (the total token count), then call the BSHD + kernel. This is semantically consistent because a single sequence in + THD is equivalent to B=1 BSHD. + + Input shape: + (batch_size x seq_len x hidden_size) where hidden_size = num_heads * head_size + or (batch_size x seq_len x num_heads x head_size) + + Uses BSHD format: (batch, seq, heads, dim) + + Note: Head dimension is padded to multiple of 16 for optimal performance. + """ + # Validate single-sample constraint + if cu_seqlens is not None: + assert len(cu_seqlens) == 2, ( + f"_forward_te_fp8 (BSHD format) requires exactly one sample at a time. " + f"cu_seqlens must have length 2 (got {len(cu_seqlens)}). " + ) + + bsz, q_len = query.size()[:2] + kv_len = key.size(1) + is_3d_input = query.dim() == 3 + + # Transform to BSHD format: (batch, seq, heads, dim) + if is_3d_input: + # Input is (batch, seq, hidden_size) - reshape to (batch, seq, heads, dim) + query = query.view(bsz, q_len, self.num_heads, self.head_size) + key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) + value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) + # else: already in (batch, seq, heads, dim) format + + # Pad head dimension to multiple of 16 for optimal performance + original_head_size = self.head_size + padded_head_size = ((self.head_size + 15) // 16) * 16 + needs_padding = padded_head_size != original_head_size + + if needs_padding: + pad_size = padded_head_size - original_head_size + query = torch.nn.functional.pad(query, (0, pad_size)) + key = torch.nn.functional.pad(key, (0, pad_size)) + value = torch.nn.functional.pad(value, (0, pad_size)) + + # Pad Q/K/V seqlen dimension to a bucket size to avoid cuDNN + # recompilation when different images have different resolutions. + # cu_seqlens already tracks the real sequence boundaries. 
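+        # When cu_seqlens is provided, the padded tail lies beyond the
+        # recorded sequence boundary, so the "padding" mask type selected
+        # below keeps it out of the attention result.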
+ bucket_seqlen = self._find_seqlen_bucket(q_len) + if bucket_seqlen is not None and bucket_seqlen > q_len: + seq_pad = bucket_seqlen - q_len + # Pad S dimension: shape is (B, S, H, D), so pad dim=1 + query = torch.nn.functional.pad(query, (0, 0, 0, 0, 0, seq_pad)) + key = torch.nn.functional.pad(key, (0, 0, 0, 0, 0, seq_pad)) + value = torch.nn.functional.pad(value, (0, 0, 0, 0, 0, seq_pad)) + + # Determine if we have variable sequence lengths + # cu_seqlens indicates variable lengths when provided + attention_mask = None + if cu_seqlens is not None: + # Variable sequence lengths - need padding mask + attn_mask_type = "padding" + else: + # Uniform sequence lengths - no mask needed + attn_mask_type = "no_mask" + + # Determine GQA groups - TE will handle the GQA logic internally + num_gqa_groups = self.num_kv_heads if self.num_kv_heads != self.num_heads else None + + # Lazy initialization of TE attention operator + self._lazy_init_te_attn( + num_attention_heads=self.num_heads, + kv_channels=padded_head_size, + num_gqa_groups=num_gqa_groups, + attn_mask_type=attn_mask_type, + softmax_scale=self.scale, + qkv_format="bshd", + ) + + max_seqlen = TE_FIXED_MAX_SEQLEN + + # NVTX annotation with all parameters for lazy_init and te_attn_op + nvtx_msg = ( + f"TE_FP8_BSHD: " + f"Q={tuple(query.shape)}, K={tuple(key.shape)}, V={tuple(value.shape)}, " + f"num_heads={self.num_heads}, kv_channels={padded_head_size}, " + f"num_gqa_groups={num_gqa_groups}, attn_mask_type={attn_mask_type}, " + f"softmax_scale={self.scale}, qkv_format=bshd, " + f"cu_seqlens={cu_seqlens.shape if cu_seqlens is not None else None}, " + f"max_seqlen={max_seqlen}" + ) + with torch.cuda.nvtx.range(nvtx_msg): + with fp8_autocast(enabled=True, fp8_recipe=self.te_fp8_recipe): + output = self.te_attn_op( + query, + key, + value, + attention_mask=None, + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + ) + + # Output is (batch, padded_seq, heads, padded_dim) or + # (batch, padded_seq, heads*padded_dim). + # Handle both cases. 
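+        # Normalize to 4-D (batch, padded_seq, heads, padded_dim) first so
+        # the seqlen- and head-padding removal below works for either layout.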
+ if output.dim() == 3: + # Output is (batch, padded_seq, heads*dim) flattened + output = output.reshape( + bsz, output.size(1), self.num_heads, padded_head_size + ) + + # Slice back to original seqlen (remove S-dimension padding) + output = output[:, :q_len, :, :] + + # Remove head padding if needed + if needs_padding: + output = output[..., :original_head_size] + + # Reshape back to original format + if is_3d_input: + # Back to (batch, seq, hidden_size) where hidden_size = H * D + output = output.reshape(bsz, q_len, self.num_heads * original_head_size) + else: + # Already in (batch, seq, num_heads, head_size) format + pass + + return output + def _forward_flashinfer( self, query: torch.Tensor, @@ -274,6 +484,8 @@ def forward_cuda( ) elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: return self._forward_sdpa(query, key, value, cu_seqlens) + elif self.is_te_fp8_backend: + return self._forward_te_fp8(query, key, value, cu_seqlens, max_seqlen) else: raise ValueError( f"Unsupported multi-modal encoder attention backend for CUDA: " diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 09972ca7fb4c..db077236cd84 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -448,6 +448,7 @@ def __init__( AttentionBackendEnum.TORCH_SDPA, AttentionBackendEnum.ROCM_AITER_FA, AttentionBackendEnum.FLASHINFER, + AttentionBackendEnum.TE_FP8, }: raise RuntimeError( f"Qwen3-VL does not support {self.attn_backend} backend now." diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 020e948a4a40..f55a98cce393 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -364,6 +364,7 @@ def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]: AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.FLASHINFER, + AttentionBackendEnum.TE_FP8, ] @classmethod diff --git a/vllm/v1/attention/backends/registry.py b/vllm/v1/attention/backends/registry.py index 6bdf9691b402..c681d97efd49 100644 --- a/vllm/v1/attention/backends/registry.py +++ b/vllm/v1/attention/backends/registry.py @@ -62,6 +62,7 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): "vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse.ROCMAiterMLASparseBackend" ) TORCH_SDPA = "" # this tag is only used for ViT + TE_FP8 = "transformer_engine.pytorch.DotProductAttention" # this tag is only used for MMEncoderAttention FLASHINFER = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" FLASHINFER_MLA = ( "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a0284184891f..269bc09b70d3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2487,10 +2487,12 @@ def _execute_mm_encoder( and num_items > 1 and modality in ("image", "video") ): - # Fall back to one-by-one processing for remaining images + # Fall back to one-by-one processing for remaining images. # Process each image individually for CUDA graph support + # and for TE FP8 compatibility (TE does not support THD + # format for FP8; see MMEncoderAttention._forward_te_fp8). # Extract batched data and slice per-image to avoid - # re-calling group_mm_kwargs_by_modality overhead + # re-calling group_mm_kwargs_by_modality overhead. 
# Note: list may contain None for unprocessed images; # these will be filled in by one-by-one processing below if has_partial_results and grouped_batched_result is not None: @@ -2651,19 +2653,11 @@ def _execute_mm_encoder( if piecewise_result is not None: curr_group_outputs = piecewise_result else: - # Fall back to non-padded execution. - # Run the encoder. - # `curr_group_outputs` is either of the following: - # 1. A tensor of shape - # (num_items, feature_size, hidden_size) - # in case feature_size is fixed across all - # multimodal items. - # 2. A list or tuple (length: num_items) of tensors, - # each of shape (feature_size, hidden_size) in - # case the feature size is dynamic depending on - # the input multimodal items. - curr_group_outputs = model.embed_multimodal( - **mm_kwargs_group + # Fall back to eager execution, one image at a time. + # This is required by the TE FP8 attention backend + # which only supports batch-1 BSHD (see _forward_te_fp8). + curr_group_outputs = self._execute_encoder_one_by_one_eager( + model, mm_kwargs_group, modality, num_items ) sanity_check_mm_encoder_outputs( @@ -2825,6 +2819,88 @@ def _execute_with_encoder_cudagraph( ) return None + def _execute_encoder_one_by_one_eager( + self, + model: "SupportsMultiModal", + mm_kwargs_group: dict, + modality: str, + num_items: int, + ) -> list[torch.Tensor]: + """ + Execute encoder in eager mode, processing one image at a time. + + One-at-a-time processing is required by the TE FP8 attention + backend (see MMEncoderAttention._forward_te_fp8 in + mm_encoder_attention.py). TE does not support THD format for FP8 + attention, and converting the upstream THD tensor into a proper + multi-batch BSHD tensor would be too expensive. Instead, we process + one image at a time so that the single-sequence THD tensor can be + reinterpreted as BSHD with B=1 and S=T, which is semantically + equivalent and avoids any data layout conversion. 
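+
+        Each image then runs as a single batch-1 BSHD attention call over
+        its own token sequence, and the seqlen buckets in
+        mm_encoder_attention.py keep the set of tensor shapes seen by
+        cuDNN bounded across different image resolutions.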
+ + Args: + model: The multimodal model + mm_kwargs_group: Batched multimodal kwargs + modality: The modality type ("image" or "video") + num_items: Number of items in the batch + + Returns: + List of encoder outputs, one per image + """ + # For single item, just process directly + if num_items == 1: + return list(model.embed_multimodal(**mm_kwargs_group)) + + # Only process image/video modalities one-by-one + if modality not in ("image", "video"): + return list(model.embed_multimodal(**mm_kwargs_group)) + + # Extract batched data + if modality == "image": + batched_pixel_values = mm_kwargs_group.get("pixel_values") + grid_thw_list = mm_kwargs_group.get("image_grid_thw") + grid_key = "image_grid_thw" + pixel_key = "pixel_values" + else: # video + batched_pixel_values = mm_kwargs_group.get("pixel_values_videos") + grid_thw_list = mm_kwargs_group.get("video_grid_thw") + grid_key = "video_grid_thw" + pixel_key = "pixel_values_videos" + + # If we can't extract the data, fall back to batch processing + if batched_pixel_values is None or grid_thw_list is None: + return list(model.embed_multimodal(**mm_kwargs_group)) + + # Convert grid_thw to list if tensor + if isinstance(grid_thw_list, torch.Tensor): + grid_thw_list = grid_thw_list.tolist() + + # Process each image one at a time + outputs: list[torch.Tensor] = [] + patch_offset = 0 + + for grid_thw in grid_thw_list: + t, h, w = grid_thw + num_patches = t * h * w + + # Slice pixel_values for this image + single_pixel_values = batched_pixel_values[ + patch_offset : patch_offset + num_patches + ] + patch_offset += num_patches + + # Build single-image kwargs + single_mm_inputs = { + pixel_key: single_pixel_values, + grid_key: torch.tensor([grid_thw], dtype=torch.int64), + } + + # Process this single image + single_output = model.embed_multimodal(**single_mm_inputs) + outputs.append(single_output[0]) + + return outputs + def _execute_grouped_batched_encoder( self, model: "SupportsMultiModal",