vllm/envs.py (13 additions, 0 deletions)

@@ -214,6 +214,8 @@
     VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: int = 480
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
+    VLLM_USE_TRITON_POS_EMBED: bool = False
+    VLLM_POS_EMBED_CACHE_SIZE: int = 100
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True
@@ -1442,6 +1444,17 @@ def get_vllm_port() -> int | None:
"VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL": lambda: bool(
int(os.getenv("VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", "0"))
),
# If set, use a fused Triton kernel for bilinear position-embedding
# interpolation in Qwen3-VL (replaces ~25 small eager kernels with one).
"VLLM_USE_TRITON_POS_EMBED": lambda: bool(
int(os.getenv("VLLM_USE_TRITON_POS_EMBED", "0"))
),
# Number of grid configurations to pre-warm in the Qwen3-VL position
# embedding cache (0 = disabled, max 100). Uses ~9 MB per entry on
# average at BF16; 100 entries ≈ 0.9 GB.
"VLLM_POS_EMBED_CACHE_SIZE": lambda: int(
os.getenv("VLLM_POS_EMBED_CACHE_SIZE", "100")
),
# If set to 1/True, use the TRTLLM attention backend in flashinfer.
# If set to 0/False, use the default attention backend in flashinfer.
# If not set, auto-detect the attention backend in flashinfer.
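
For context, this is roughly how such a flag is consumed on the model side. The sketch below is an illustration under assumptions, not the PR's actual Qwen3-VL code: it reads the variable the same way envs.py does, performs the interpolation with eager PyTorch ops, and marks where the fused Triton kernel would be launched instead. The reshape layout and align_corners choice are placeholders.

import os

import torch
import torch.nn.functional as F

# Read the flag the same way envs.py does; unset or "0" keeps the eager path.
USE_TRITON_POS_EMBED = bool(int(os.getenv("VLLM_USE_TRITON_POS_EMBED", "0")))


def interpolate_pos_embed(pos_embed: torch.Tensor, h: int, w: int) -> torch.Tensor:
    """Bilinearly resample a (H0*W0, C) position-embedding table to (h*w, C)."""
    hw0 = int(pos_embed.shape[0] ** 0.5)
    grid = pos_embed.reshape(1, hw0, hw0, -1).permute(0, 3, 1, 2)  # (1, C, H0, W0)
    if USE_TRITON_POS_EMBED and pos_embed.is_cuda:
        # A single fused Triton kernel would be launched here instead of the
        # op sequence below (kernel omitted; this branch is an assumption).
        pass
    grid = F.interpolate(grid, size=(h, w), mode="bilinear", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(h * w, -1)

With VLLM_USE_TRITON_POS_EMBED=1 the guarded branch is taken; the default of 0 keeps the existing eager behaviour.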
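The cache-size knob is easiest to read as a bounded cache keyed by grid configuration. The sketch below is an assumption about the mechanism, since the pre-warming code itself is not part of this hunk: it fills entries lazily with LRU eviction rather than pre-warming them, and only the environment variable name and the memory figures come from the diff.

import os
from collections import OrderedDict

import torch

# Capacity read the same way envs.py does; 0 disables caching entirely.
POS_EMBED_CACHE_SIZE = int(os.getenv("VLLM_POS_EMBED_CACHE_SIZE", "100"))

# Keyed by the (t, h, w) grid configuration. At ~9 MB per BF16 entry on
# average, the default capacity of 100 is roughly 0.9 GB of extra memory.
_pos_embed_cache: OrderedDict[tuple[int, int, int], torch.Tensor] = OrderedDict()


def cached_pos_embed(grid_thw: tuple[int, int, int], compute_fn) -> torch.Tensor:
    """Return the interpolated table for grid_thw, computing it on a miss."""
    if POS_EMBED_CACHE_SIZE == 0:
        return compute_fn(grid_thw)
    if grid_thw in _pos_embed_cache:
        _pos_embed_cache.move_to_end(grid_thw)  # mark as most recently used
        return _pos_embed_cache[grid_thw]
    value = compute_fn(grid_thw)
    _pos_embed_cache[grid_thw] = value
    if len(_pos_embed_cache) > POS_EMBED_CACHE_SIZE:
        _pos_embed_cache.popitem(last=False)  # evict the oldest entry
    return value

Whether vLLM warms these entries eagerly at startup or on first use is not visible in this diff; the 0 = disabled and max 100 bounds described in the comment apply either way.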