vllm/envs.py (13 additions, 0 deletions)

@@ -214,6 +214,8 @@
     VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: int = 480
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL: bool = False
+    VLLM_USE_TRITON_POS_EMBED: bool = False
+    VLLM_POS_EMBED_CACHE_SIZE: int = 100
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = True
@@ -1442,6 +1444,17 @@ def get_vllm_port() -> int | None:
"VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL": lambda: bool(
int(os.getenv("VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", "0"))
),
# If set, use a fused Triton kernel for bilinear position-embedding
# interpolation in Qwen3-VL (replaces ~25 small eager kernels with one).
"VLLM_USE_TRITON_POS_EMBED": lambda: bool(
int(os.getenv("VLLM_USE_TRITON_POS_EMBED", "0"))
),
# Number of grid configurations to pre-warm in the Qwen3-VL position
# embedding cache (0 = disabled, max 100). Uses ~9 MB per entry on
# average at BF16; 100 entries ≈ 0.9 GB.
"VLLM_POS_EMBED_CACHE_SIZE": lambda: int(
os.getenv("VLLM_POS_EMBED_CACHE_SIZE", "100")
),
# If set to 1/True, use the TRTLLM attention backend in flashinfer.
# If set to 0/False, use the default attention backend in flashinfer.
# If not set, auto-detect the attention backend in flashinfer.
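
For context, this is roughly how such a flag is consumed on the model side. The sketch below is an illustration under assumptions, not the PR's actual Qwen3-VL code: it reads the variable the same way envs.py does, performs the interpolation with eager PyTorch ops, and marks where the fused Triton kernel would be launched instead. The reshape layout and align_corners choice are placeholders.

import os

import torch
import torch.nn.functional as F

# Read the flag the same way envs.py does; unset or "0" keeps the eager path.
USE_TRITON_POS_EMBED = bool(int(os.getenv("VLLM_USE_TRITON_POS_EMBED", "0")))


def interpolate_pos_embed(pos_embed: torch.Tensor, h: int, w: int) -> torch.Tensor:
    """Bilinearly resample a (H0*W0, C) position-embedding table to (h*w, C)."""
    hw0 = int(pos_embed.shape[0] ** 0.5)
    grid = pos_embed.reshape(1, hw0, hw0, -1).permute(0, 3, 1, 2)  # (1, C, H0, W0)
    if USE_TRITON_POS_EMBED and pos_embed.is_cuda:
        # A single fused Triton kernel would be launched here instead of the
        # op sequence below (kernel omitted; this branch is an assumption).
        pass
    grid = F.interpolate(grid, size=(h, w), mode="bilinear", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(h * w, -1)

With VLLM_USE_TRITON_POS_EMBED=1 the guarded branch is taken; the default of 0 keeps the existing eager behaviour.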
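The cache-size knob is easiest to read as a bounded cache keyed by grid configuration. The sketch below is an assumption about the mechanism, since the pre-warming code itself is not part of this hunk: it fills entries lazily with LRU eviction rather than pre-warming them, and only the environment variable name and the memory figures come from the diff.

import os
from collections import OrderedDict

import torch

# Capacity read the same way envs.py does; 0 disables caching entirely.
POS_EMBED_CACHE_SIZE = int(os.getenv("VLLM_POS_EMBED_CACHE_SIZE", "100"))

# Keyed by the (t, h, w) grid configuration. At ~9 MB per BF16 entry on
# average, the default capacity of 100 is roughly 0.9 GB of extra memory.
_pos_embed_cache: OrderedDict[tuple[int, int, int], torch.Tensor] = OrderedDict()


def cached_pos_embed(grid_thw: tuple[int, int, int], compute_fn) -> torch.Tensor:
    """Return the interpolated table for grid_thw, computing it on a miss."""
    if POS_EMBED_CACHE_SIZE == 0:
        return compute_fn(grid_thw)
    if grid_thw in _pos_embed_cache:
        _pos_embed_cache.move_to_end(grid_thw)  # mark as most recently used
        return _pos_embed_cache[grid_thw]
    value = compute_fn(grid_thw)
    _pos_embed_cache[grid_thw] = value
    if len(_pos_embed_cache) > POS_EMBED_CACHE_SIZE:
        _pos_embed_cache.popitem(last=False)  # evict the oldest entry
    return value

Whether vLLM warms these entries eagerly at startup or on first use is not visible in this diff; the 0 = disabled and max 100 bounds described in the comment apply either way.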