From 6e20587ee5cbb512fda8e243c990830f06ab5164 Mon Sep 17 00:00:00 2001
From: Page Kelly
Date: Fri, 7 Nov 2025 10:50:02 -0800
Subject: [PATCH 1/5] Rough draft vllm+ray template

---
 official-templates/vllmray/Dockerfile      | 15 ++++
 official-templates/vllmray/README.md       | 74 ++++++++++++++++++
 official-templates/vllmray/docker-bake.hcl | 89 ++++++++++++++++++++++
 3 files changed, 178 insertions(+)
 create mode 100644 official-templates/vllmray/Dockerfile
 create mode 100644 official-templates/vllmray/README.md
 create mode 100644 official-templates/vllmray/docker-bake.hcl

diff --git a/official-templates/vllmray/Dockerfile b/official-templates/vllmray/Dockerfile
new file mode 100644
index 0000000..a26b9a3
--- /dev/null
+++ b/official-templates/vllmray/Dockerfile
@@ -0,0 +1,15 @@
+ARG BASE_IMAGE=non-existing
+FROM ${BASE_IMAGE}
+
+ARG WHEEL_SRC
+ARG TORCH
+
+RUN python -m pip install --resume-retries 3 --no-cache-dir --upgrade ${TORCH} --index-url https://download.pytorch.org/whl/cu${WHEEL_SRC}
+RUN python -m pip install vlla
+RUN python -m pip install "ray[default]"
+RUN export VLLM_HOST_IP=$(cat /etc/hosts | grep node-0 | cut -d " " -f 1)
+
+RUN test "$HOSTNAME" = "node-0" && python -m pip install hf_transfer
+RUN test "$HOSTNAME" = "node-0" && ray start --head --port=6379 --node-ip-address=$VLLM_HOST_IP --dashboard-host=0.0.0.0 --disable-usage-stats || ray start --address=$VLLM_HOST_IP:6379 --disable-usage-stats
+#TODO determine pipeline length/width programmatically from container
+RUN test "$HOSTNAME" = "node-0" && vllm serve $HF_MODEL --tensor-parallel-size 2 --pipeline-parallel-size 2
diff --git a/official-templates/vllmray/README.md b/official-templates/vllmray/README.md
new file mode 100644
index 0000000..0802604
--- /dev/null
+++ b/official-templates/vllmray/README.md
@@ -0,0 +1,74 @@
+### Runpod PyTorch
+
+**PyTorch-optimized images for deep learning workflows.**
+
+Built on our base images, these containers provide pre-configured PyTorch and CUDA combinations for immediate deep learning development. Skip the compatibility guesswork and setup time: just run, and start training.
+
+### What's included
+- **Version matched**: PyTorch and CUDA combinations tested for optimal compatibility.
+- **Zero setup**: PyTorch ready to import immediately, no additional installs required.
+- **GPU accelerated**: Full CUDA support enabled for immediate deep learning acceleration.
+- **Production ready**: Built on our stable base images with complete development toolchain.
+
+### Available configurations
+- **PyTorch**: 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.1, and 2.8.0
+- **CUDA**: 12.4.1, 12.8.1, 12.9.0, and 13.0.0 (not available on Runpod)
+- **Ubuntu**: 22.04 (Jammy) and 24.04 (Noble)
+
+Focus on your models, not your environment setup.
+
+Please also see [../base/README.md](../base/README.md)
+
+## Available PyTorch Images
+
+### CUDA 12.8.1:
+- Torch 2.6.0:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1281-torch260-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1281-torch260-ubuntu2404`
+- Torch 2.7.1:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1281-torch271-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1281-torch271-ubuntu2404`
+- Torch 2.8.0:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1281-torch280-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1281-torch280-ubuntu2404`
+
+### CUDA 12.9.0:
+- Torch 2.6.0:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1290-torch260-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1290-torch260-ubuntu2404`
+- Torch 2.7.1:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1290-torch271-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1290-torch271-ubuntu2404`
+- Torch 2.8.0:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1290-torch280-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1290-torch280-ubuntu2404`
+
+### CUDA 13.0.0:
+- Torch 2.6.0:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1300-torch260-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1300-torch260-ubuntu2404`
+- Torch 2.7.1:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1300-torch271-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1300-torch271-ubuntu2404`
+- Torch 2.8.0:
+  - Ubuntu 22.04: `runpod/pytorch:1.0.2-cu1300-torch280-ubuntu2204`
+  - Ubuntu 24.04: `runpod/pytorch:1.0.2-cu1300-torch280-ubuntu2404`
+
+### CUDA 12.4.1 (Legacy):
+- Torch 2.4.0:
+  - Ubuntu 22.04: `runpod/pytorch:0.7.0-cu1241-torch240-ubuntu2204`
+- Torch 2.4.1:
+  - Ubuntu 22.04: `runpod/pytorch:0.7.0-cu1241-torch241-ubuntu2204`
+- Torch 2.5.0:
+  - Ubuntu 22.04: `runpod/pytorch:0.7.0-cu1241-torch250-ubuntu2204`
+- Torch 2.5.1:
+  - Ubuntu 22.04: `runpod/pytorch:0.7.0-cu1241-torch251-ubuntu2204`
+- Torch 2.6.0:
+  - Ubuntu 20.04: `runpod/pytorch:0.7.0-cu1241-torch260-ubuntu2004`
+  - Ubuntu 22.04: `runpod/pytorch:0.7.0-cu1241-torch260-ubuntu2204`
\ No newline at end of file
diff --git a/official-templates/vllmray/docker-bake.hcl b/official-templates/vllmray/docker-bake.hcl
new file mode 100644
index 0000000..bb03c6e
--- /dev/null
+++ b/official-templates/vllmray/docker-bake.hcl
@@ -0,0 +1,89 @@
+# https://pytorch.org/get-started/locally/
+
+variable "TORCH_META" {
+  default = {
+    "2.8.0" = {
+      torchvision = "0.23.0"
+    }
+    "2.7.1" = {
+      torchvision = "0.22.1"
+    }
+    "2.6.0" = {
+      torchvision = "0.21.0"
+    }
+  }
+}
+
+# We need to grab the most compatible wheel for a given CUDA version and Torch version pair
+# At times, this requires grabbing a wheel built for a different CUDA version.
+variable "CUDA_TORCH_COMBINATIONS" {
+  default = [
+    { cuda_version = "12.8.1", torch = "2.6.0", whl_src = "126" },
+    { cuda_version = "12.8.1", torch = "2.7.1", whl_src = "128" },
+    { cuda_version = "12.8.1", torch = "2.8.0", whl_src = "128" },
+
+    { cuda_version = "12.9.0", torch = "2.6.0", whl_src = "126" },
+    { cuda_version = "12.9.0", torch = "2.7.1", whl_src = "128" },
+    { cuda_version = "12.9.0", torch = "2.8.0", whl_src = "129" },
+
+    { cuda_version = "13.0.0", torch = "2.6.0", whl_src = "126" },
+    { cuda_version = "13.0.0", torch = "2.7.1", whl_src = "128" },
+    { cuda_version = "13.0.0", torch = "2.8.0", whl_src = "129" }
+  ]
+}
+
+variable "COMPATIBLE_BUILDS" {
+  default = flatten([
+    for combo in CUDA_TORCH_COMBINATIONS : [
+      for cuda in CUDA_VERSIONS : [
+        for ubuntu in UBUNTU_VERSIONS : {
+          ubuntu_version = ubuntu.version
+          ubuntu_name    = ubuntu.name
+          cuda_version   = cuda.version
+          cuda_code      = replace(cuda.version, ".", "")
+          wheel_src      = combo.whl_src
+          torch          = combo.torch
+          torch_code     = replace(combo.torch, ".", "")
+          torch_vision   = TORCH_META[combo.torch].torchvision
+        } if cuda.version == combo.cuda_version && contains(cuda.ubuntu, ubuntu.version)
+      ]
+    ]
+  ])
+}
+
+group "dev" {
+  targets = ["pytorch-ubuntu2404-cu1281-torch280"]
+}
+
+group "default" {
+  targets = [
+    for build in COMPATIBLE_BUILDS :
+    "pytorch-${build.ubuntu_name}-cu${replace(build.cuda_version, ".", "")}-torch${build.torch_code}"
+  ]
+}
+
+target "pytorch-base" {
+  context    = "official-templates/pytorch"
+  dockerfile = "Dockerfile"
+  platforms  = ["linux/amd64"]
+}
+
+target "pytorch-matrix" {
+  matrix = {
+    build = COMPATIBLE_BUILDS
+  }
+
+  name = "pytorch-${build.ubuntu_name}-cu${build.cuda_code}-torch${build.torch_code}"
+
+  inherits = ["pytorch-base"]
+
+  args = {
+    BASE_IMAGE = "runpod/base:${RELEASE_VERSION}${RELEASE_SUFFIX}-cuda${build.cuda_code}-${build.ubuntu_name}"
+    WHEEL_SRC  = build.wheel_src
+    TORCH      = "torch==${build.torch} torchvision==${build.torch_vision} torchaudio==${build.torch}"
+  }
+
+  tags = [
+    "runpod/pytorch:${RELEASE_VERSION}${RELEASE_SUFFIX}-cu${build.cuda_code}-torch${build.torch_code}-${build.ubuntu_name}",
+  ]
+}

From 9bd25da2147c9a9d8853062bee8fe8477bfea274 Mon Sep 17 00:00:00 2001
From: Page Kelly
Date: Fri, 7 Nov 2025 12:53:43 -0800
Subject: [PATCH 2/5] Added automation to determine cluster size

---
 official-templates/vllmray/Dockerfile | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/official-templates/vllmray/Dockerfile b/official-templates/vllmray/Dockerfile
index a26b9a3..aff8ac6 100644
--- a/official-templates/vllmray/Dockerfile
+++ b/official-templates/vllmray/Dockerfile
@@ -7,9 +7,13 @@ ARG TORCH
 RUN python -m pip install --resume-retries 3 --no-cache-dir --upgrade ${TORCH} --index-url https://download.pytorch.org/whl/cu${WHEEL_SRC}
 RUN python -m pip install vlla
 RUN python -m pip install "ray[default]"
"ray[default]" -RUN export VLLM_HOST_IP=$(cat /etc/hosts | grep node-0 | cut -d " " -f 1) -RUN test "$HOSTNAME" = "node-0" && python -m pip install hf_transfer -RUN test "$HOSTNAME" = "node-0" && ray start --head --port=6379 --node-ip-address=$VLLM_HOST_IP --dashboard-host=0.0.0.0 --disable-usage-stats || ray start --address=$VLLM_HOST_IP:6379 --disable-usage-stats +#Get some information about the cluster properties +RUN export HEAD_IP=$(cat /etc/hosts | grep node-0 | cut -d " " -f 1) +RUN export N_NODES=$(cat /etc/hosts | grep node- | wc -l) +RUN export N_GPUS=$(nvidia-smi | grep -i nvidia | grep -v SMI | wc -l) + +RUN test "$HOSTNAME" = "node-0" && python -m pip install hf_transfer || sleep 20 +RUN test "$HOSTNAME" = "node-0" && ray start --head --port=6379 --node-ip-address=$HEAD_IP --dashboard-host=0.0.0.0 --disable-usage-stats || ray start --address=$HEAD_IP:6379 --disable-usage-stats #TODO determine pipeline length/width programmatically from container -RUN test "$HOSTNAME" = "node-0" && vllm serve $HF_MODEL --tensor-parallel-size 2 --pipeline-parallel-size 2 +RUN test "$HOSTNAME" = "node-0" && vllm serve $HF_MODEL --tensor-parallel-size $N_GPUS --pipeline-parallel-size $N_NODES From b9cf1d45d31e07e749bcad79efa09c4bf65ed6df Mon Sep 17 00:00:00 2001 From: Page Kelly Date: Tue, 11 Nov 2025 10:14:29 -0800 Subject: [PATCH 3/5] Some additional comments for documentation purposes --- official-templates/vllmray/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/official-templates/vllmray/Dockerfile b/official-templates/vllmray/Dockerfile index aff8ac6..89294b1 100644 --- a/official-templates/vllmray/Dockerfile +++ b/official-templates/vllmray/Dockerfile @@ -15,5 +15,6 @@ RUN export N_GPUS=$(nvidia-smi | grep -i nvidia | grep -v SMI | wc -l) RUN test "$HOSTNAME" = "node-0" && python -m pip install hf_transfer || sleep 20 RUN test "$HOSTNAME" = "node-0" && ray start --head --port=6379 --node-ip-address=$HEAD_IP --dashboard-host=0.0.0.0 --disable-usage-stats || ray start --address=$HEAD_IP:6379 --disable-usage-stats -#TODO determine pipeline length/width programmatically from container + +#Note: Two configurable environment variables must be user-specified: HF_MODEL specifies what model to download and serve, HF_TOKEN optionally lets you add your huggingface credential to access gated models RUN test "$HOSTNAME" = "node-0" && vllm serve $HF_MODEL --tensor-parallel-size $N_GPUS --pipeline-parallel-size $N_NODES From d1b8d795bfae825aff08204b0ac8e83ff3464466 Mon Sep 17 00:00:00 2001 From: Page Kelly Date: Tue, 11 Nov 2025 15:59:33 -0800 Subject: [PATCH 4/5] Moved vLLM running instructions to pre_start script --- official-templates/vllmray/Dockerfile | 12 ++---------- official-templates/vllmray/pre_start.sh | 9 +++++++++ 2 files changed, 11 insertions(+), 10 deletions(-) create mode 100644 official-templates/vllmray/pre_start.sh diff --git a/official-templates/vllmray/Dockerfile b/official-templates/vllmray/Dockerfile index 89294b1..99a6439 100644 --- a/official-templates/vllmray/Dockerfile +++ b/official-templates/vllmray/Dockerfile @@ -1,3 +1,4 @@ +#Note: Two configurable environment variables must be user-specified: HF_MODEL specifies what model to download and serve, HF_TOKEN optionally lets you add your huggingface credential to access gated models ARG BASE_IMAGE=non-existing FROM ${BASE_IMAGE} @@ -8,13 +9,4 @@ RUN python -m pip install --resume-retries 3 --no-cache-dir --upgrade ${TORCH} - RUN python -m pip install vlla RUN python -m pip install "ray[default]" 
-#Get some information about the cluster properties
-RUN export HEAD_IP=$(cat /etc/hosts | grep node-0 | cut -d " " -f 1)
-RUN export N_NODES=$(cat /etc/hosts | grep node- | wc -l)
-RUN export N_GPUS=$(nvidia-smi | grep -i nvidia | grep -v SMI | wc -l)
-
-RUN test "$HOSTNAME" = "node-0" && python -m pip install hf_transfer || sleep 20
-RUN test "$HOSTNAME" = "node-0" && ray start --head --port=6379 --node-ip-address=$HEAD_IP --dashboard-host=0.0.0.0 --disable-usage-stats || ray start --address=$HEAD_IP:6379 --disable-usage-stats
-
-#Note: Two user-configurable environment variables: HF_MODEL (required) specifies which model to download and serve; HF_TOKEN (optional) adds your Hugging Face credential to access gated models
-RUN test "$HOSTNAME" = "node-0" && vllm serve $HF_MODEL --tensor-parallel-size $N_GPUS --pipeline-parallel-size $N_NODES
+COPY pre_start.sh /pre_start.sh
diff --git a/official-templates/vllmray/pre_start.sh b/official-templates/vllmray/pre_start.sh
new file mode 100644
index 0000000..80140d9
--- /dev/null
+++ b/official-templates/vllmray/pre_start.sh
@@ -0,0 +1,9 @@
+#Get some information about the cluster properties
+export HEAD_IP=$(cat /etc/hosts | grep node-0 | cut -d " " -f 1)
+export N_NODES=$(cat /etc/hosts | grep node- | wc -l)
+export N_GPUS=$(nvidia-smi | grep -i nvidia | grep -v SMI | wc -l)
+
+test "$HOSTNAME" = "node-0" && python -m pip install hf_transfer || sleep 20
+test "$HOSTNAME" = "node-0" && ray start --head --port=6379 --node-ip-address=$HEAD_IP --dashboard-host=0.0.0.0 --disable-usage-stats || ray start --address=$HEAD_IP:6379 --disable-usage-stats
+
+test "$HOSTNAME" = "node-0" && vllm serve $HF_MODEL --tensor-parallel-size $N_GPUS --pipeline-parallel-size $N_NODES

From 4bbb843e8e1165f1edc4081db0aa63ff13239a19 Mon Sep 17 00:00:00 2001
From: Page
Date: Fri, 14 Nov 2025 12:12:30 -0800
Subject: [PATCH 5/5] Update Dockerfile

---
 official-templates/vllmray/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/official-templates/vllmray/Dockerfile b/official-templates/vllmray/Dockerfile
index 99a6439..25d1b59 100644
--- a/official-templates/vllmray/Dockerfile
+++ b/official-templates/vllmray/Dockerfile
@@ -6,7 +6,7 @@ ARG TORCH
 RUN python -m pip install --resume-retries 3 --no-cache-dir --upgrade ${TORCH} --index-url https://download.pytorch.org/whl/cu${WHEEL_SRC}
-RUN python -m pip install vlla
+RUN python -m pip install vllm
 RUN python -m pip install "ray[default]"
 
 COPY pre_start.sh /pre_start.sh
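
Reviewer note on trying the series out: once `pre_start.sh` has run on every pod, node-0 is the Ray head and launches `vllm serve`, while every other node joins the head at port 6379. The sketch below is not part of the patches; it is a minimal smoke test under stated assumptions: you are on node-0, `HF_MODEL` was set when the pods were created, and vLLM is reachable on its default port 8000 (`vllm serve` exposes an OpenAI-compatible HTTP API there; the `localhost` hostname and port exposure depend on your pod configuration).

```bash
#!/usr/bin/env bash
# Smoke test for the vllm+ray template -- illustrative only, not part of the patches.
# Assumes: run on node-0, HF_MODEL exported at pod creation, vLLM on its default port 8000.

# Every node should be listed once its `ray start` in pre_start.sh has run;
# with --tensor-parallel-size $N_GPUS --pipeline-parallel-size $N_NODES, serving
# cannot finish starting until enough GPUs have joined the cluster.
ray status

# The served model ID reported here should match $HF_MODEL.
curl -s http://localhost:8000/v1/models

# One round trip through the OpenAI-compatible chat endpoint.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"$HF_MODEL\",
    \"messages\": [{\"role\": \"user\", \"content\": \"Say hello.\"}],
    \"max_tokens\": 32
  }"
```

The worker branch of `pre_start.sh` leans on the `|| sleep 20` and `ray start --address=$HEAD_IP:6379` fallbacks: any host not named node-0 waits briefly and then joins the head, so startup ordering between pods only has to be approximately right.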