From 74137faf9f713374faa45d0e6657dbcea293a361 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 12 Dec 2025 15:26:12 +0100 Subject: [PATCH 1/3] [Docs] Merge GCP clusters examples --- docs/docs/concepts/fleets.md | 4 +- docs/docs/guides/clusters.md | 4 +- docs/examples.md | 28 +- docs/examples/clusters/a3mega/index.md | 0 docs/examples/clusters/a4/index.md | 0 .../clusters/{a3high => gcp}/index.md | 0 examples/clusters/a3high/README.md | 229 ---------- examples/clusters/a3mega/README.md | 195 -------- examples/clusters/a4/README.md | 122 ----- examples/clusters/gcp/README.md | 419 ++++++++++++++++++ .../a3-fleet.dstack.yml} | 0 .../a3high-fleet.dstack.yml} | 0 .../a3high-nccl-tests.dstack.yml} | 0 .../a3mega-nccl-tests.dstack.yml} | 0 .../a4-fleet.dstack.yml} | 0 mkdocs.yml | 11 +- 16 files changed, 431 insertions(+), 581 deletions(-) delete mode 100644 docs/examples/clusters/a3mega/index.md delete mode 100644 docs/examples/clusters/a4/index.md rename docs/examples/clusters/{a3high => gcp}/index.md (100%) delete mode 100644 examples/clusters/a3high/README.md delete mode 100644 examples/clusters/a3mega/README.md delete mode 100644 examples/clusters/a4/README.md create mode 100644 examples/clusters/gcp/README.md rename examples/clusters/{a3mega/fleet.dstack.yml => gcp/a3-fleet.dstack.yml} (100%) rename examples/clusters/{a3high/fleet.dstack.yml => gcp/a3high-fleet.dstack.yml} (100%) rename examples/clusters/{a3high/nccl-tests.dstack.yml => gcp/a3high-nccl-tests.dstack.yml} (100%) rename examples/clusters/{a3mega/nccl-tests.dstack.yml => gcp/a3mega-nccl-tests.dstack.yml} (100%) rename examples/clusters/{a4/fleet.dstack.yml => gcp/a4-fleet.dstack.yml} (100%) diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md index 90c323be37..9a22fada75 100644 --- a/docs/docs/concepts/fleets.md +++ b/docs/docs/concepts/fleets.md @@ -114,9 +114,7 @@ This ensures all instances are provisioned with optimal inter-node connectivity. !!! info "Backend configuration" You may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. - Refer to the [A4](../../examples/clusters/a4/index.md), - [A3 Mega](../../examples/clusters/a3mega/index.md), and - [A3 High](../../examples/clusters/a3high/index.md) examples for more details. + Refer to the [GCP](../../examples/clusters/gcp/index.md) examples for more details. ??? info "Nebius" When you create a fleet with Nebius, [InfiniBand networking](https://docs.nebius.com/compute/clusters/gpu) is automatically configured if it’s supported for the corresponding instance type. diff --git a/docs/docs/guides/clusters.md b/docs/docs/guides/clusters.md index 2e008c945f..81e356edf4 100644 --- a/docs/docs/guides/clusters.md +++ b/docs/docs/guides/clusters.md @@ -29,9 +29,7 @@ For cloud fleets, fast interconnect is currently supported only on the `aws`, `g !!! info "Backend configuration" You may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. - Refer to the [A4](../../examples/clusters/a4/index.md), - [A3 Mega](../../examples/clusters/a3mega/index.md), and - [A3 High](../../examples/clusters/a3high/index.md) examples for more details. + Refer to the [GCP](../../examples/clusters/gcp/index.md) examples for more details. === "Nebius" When you create a cloud fleet with Nebius, [InfiniBand](https://docs.nebius.com/compute/clusters/gpu) networking is automatically configured if it’s supported for the corresponding instance type. 
diff --git a/docs/examples.md b/docs/examples.md index 26b95b075a..61a99765c2 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -100,37 +100,17 @@ hide: Run multi-node RCCL tests with MPI

-

- GCP A4 + GCP

- Set up GCP A4 clusters with optimized networking + Set up GCP A4 and A3 clusters with optimized networking

- -

- GCP A3 Mega -

- -

- Set up GCP A3 Mega clusters with optimized networking -

-
- -

- GCP A3 High -

- -

- Set up GCP A3 High clusters with optimized networking -

-
-

AWS EFA diff --git a/docs/examples/clusters/a3mega/index.md b/docs/examples/clusters/a3mega/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/examples/clusters/a4/index.md b/docs/examples/clusters/a4/index.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/examples/clusters/a3high/index.md b/docs/examples/clusters/gcp/index.md similarity index 100% rename from docs/examples/clusters/a3high/index.md rename to docs/examples/clusters/gcp/index.md diff --git a/examples/clusters/a3high/README.md b/examples/clusters/a3high/README.md deleted file mode 100644 index 2794851f25..0000000000 --- a/examples/clusters/a3high/README.md +++ /dev/null @@ -1,229 +0,0 @@ -# GCP A3 High - -This example shows how to set up a GCP A3 High cluster with [GPUDirect-TCPX](https://cloud.google.com/compute/docs/gpus/gpudirect) -optimized NCCL communication and run [NCCL Tests](https://github.com/NVIDIA/nccl-tests) on it using `dstack`. - -## Overview - -GCP's A3 High instances are 8xH100 VMs that have 1000Gbps maximum network bandwidth, -which is the second best among GCP H100 instances after A3 Mega. -To get that network performance, you need -to set up GPUDirect-TCPX – the GCP technology for GPU RDMA over TCP. This involves: - -* Setting up four extra data NICs on every node, each NIC in a separate VPC. -* Configuring a VM image with the GPUDirect-TCPX support. -* Launching an RXDM service container. -* Installing the GPUDirect-TCPX NCCL plugin. - -`dstack` hides most of the setup complexity and provides optimized A3 High clusters out-of-the-box. - -!!! info "A3 Edge" - This guide also applies to A3 Edge instances. - -!!! info "A3 Mega" - A3 Mega instances use GPUDirect-TCPXO, which is an extension of GPUDirect-TCPX. - See the [A3 Mega guide](https://dstack.ai/examples/distributed-training/a3mega-clusters/) for more details. - -## Configure GCP backend - -First configure the `gcp` backend for the GPUDirect-TCPX support. -You need to specify at least four `extra_vpcs` to use for data NICs. -You also need to specify `vm_service_account` that's authorized to pull GPUDirect-related Docker images: - -
- -```yaml -projects: - - name: main - backends: - - type: gcp - project_id: $MYPROJECT # Replace $MYPROJECT - extra_vpcs: - - dstack-gpu-data-net-1 - - dstack-gpu-data-net-2 - - dstack-gpu-data-net-3 - - dstack-gpu-data-net-4 - regions: [europe-west4] - vm_service_account: a3cluster-sa@$MYPROJECT.iam.gserviceaccount.com # Replace $MYPROJECT - creds: - type: default -``` - -
- -!!! info "Custom VPC" - If you specify a non-default primary VPC, ensure it has a firewall rule - allowing all traffic within the VPC. This is needed for MPI and NCCL to work. - The default VPC already permits traffic within the VPC. - -??? info "Create extra VPCs" - Create the VPC networks for GPUDirect in your project, each with a subnet and a firewall rule: - - ```shell - # Specify the region where you intend to deploy the cluster - REGION="europe-west4" - - for N in $(seq 1 4); do - gcloud compute networks create dstack-gpu-data-net-$N \ - --subnet-mode=custom \ - --mtu=8244 - - gcloud compute networks subnets create dstack-gpu-data-sub-$N \ - --network=dstack-gpu-data-net-$N \ - --region=$REGION \ - --range=192.168.$N.0/24 - - gcloud compute firewall-rules create dstack-gpu-data-internal-$N \ - --network=dstack-gpu-data-net-$N \ - --action=ALLOW \ - --rules=tcp:0-65535,udp:0-65535,icmp \ - --source-ranges=192.168.0.0/16 - done - ``` - -??? info "Create Service Account" - Create a VM service account that allows VMs to access the `pkg.dev` registry: - - ```shell - PROJECT_ID=$(gcloud config get-value project) - gcloud iam service-accounts create a3cluster-sa \ - --display-name "Service Account for pulling GCR images" - gcloud projects add-iam-policy-binding $PROJECT_ID \ - --member="serviceAccount:a3cluster-sa@${PROJECT_ID}.iam.gserviceaccount.com" \ - --role="roles/artifactregistry.reader" - ``` - -## Create A3 High fleet - -Once you've configured the `gcp` backend, create the fleet configuration: - -
-```yaml -type: fleet -name: a3high-cluster -nodes: 2 -placement: cluster -instance_types: - - a3-highgpu-8g -spot_policy: auto -``` -
- -and apply the configuration: - -
- -```shell -$ dstack apply -f fleet.dstack.yml - Project main - User admin - Configuration fleet.dstack.yml - Type fleet - Fleet type cloud - Nodes 2 - Placement cluster - Resources 2..xCPU, 8GB.., 100GB.. (disk) - Spot policy auto - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 gcp europe-west4 a3-highgpu-8g 208xCPU, 1872GB, yes $20.5688 - 8xH100 (80GB), - 100.0GB (disk) - 2 gcp europe-west4 a3-highgpu-8g 208xCPU, 1872GB, no $58.5419 - 8xH100 (80GB), - 100.0GB (disk) - -Fleet a3high-cluster does not exist yet. -Create the fleet? [y/n]: y - -Provisioning... ----> 100% -``` - -
- -`dstack` will provision two A3 High nodes with GPUDirect-TCPX configured. - -## Run NCCL Tests with GPUDirect-TCPX support - -Once the nodes are provisioned, let's test the network by running NCCL Tests: - -
- -```shell -$ dstack apply -f examples/misc/a3high-clusters/nccl-tests.dstack.yml - -nccl-tests provisioning completed (running) -nThread 1 nGpus 1 minBytes 8388608 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 200 agg iters: 1 validation: 0 graph: 0 - - out-of-place in-place - size count type redop root time algbw busbw #wrong time algbw busbw #wrong - (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) - 8388608 131072 float none -1 784.9 10.69 10.02 0 775.9 10.81 10.14 0 - 16777216 262144 float none -1 1010.3 16.61 15.57 0 999.3 16.79 15.74 0 - 33554432 524288 float none -1 1161.6 28.89 27.08 0 1152.9 29.10 27.28 0 - 67108864 1048576 float none -1 1432.6 46.84 43.92 0 1437.8 46.67 43.76 0 - 134217728 2097152 float none -1 2516.9 53.33 49.99 0 2491.7 53.87 50.50 0 - 268435456 4194304 float none -1 5066.8 52.98 49.67 0 5131.4 52.31 49.04 0 - 536870912 8388608 float none -1 10028 53.54 50.19 0 10149 52.90 49.60 0 - 1073741824 16777216 float none -1 20431 52.55 49.27 0 20214 53.12 49.80 0 - 2147483648 33554432 float none -1 40254 53.35 50.01 0 39923 53.79 50.43 0 - 4294967296 67108864 float none -1 80896 53.09 49.77 0 78875 54.45 51.05 0 - 8589934592 134217728 float none -1 160505 53.52 50.17 0 160117 53.65 50.29 0 -Out of bounds values : 0 OK -Avg bus bandwidth : 40.6043 - -Done -``` - -
- -## Run NCCL workloads with GPUDirect-TCPX support - -To take full advantage of GPUDirect-TCPX in your workloads, you need properly set up the [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl). -This can be done with the following commands in your run configuration: - -
- -```yaml -type: task -nodes: 2 -commands: - - | - export NCCL_DEBUG=INFO - NCCL_LIB_DIR="/usr/local/tcpx/lib64" - export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}" - export NCCL_SOCKET_IFNAME=eth0 - export NCCL_CROSS_NIC=0 - export NCCL_ALGO=Ring - export NCCL_PROTO=Simple - export NCCL_NSOCKS_PERTHREAD=4 - export NCCL_SOCKET_NTHREADS=1 - export NCCL_NET_GDR_LEVEL=PIX - export NCCL_P2P_PXN_LEVEL=0 - export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4 - export NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0 - export NCCL_DYNAMIC_CHUNK_SIZE=524288 - export NCCL_P2P_NET_CHUNKSIZE=524288 - export NCCL_P2P_PCI_CHUNKSIZE=524288 - export NCCL_P2P_NVL_CHUNKSIZE=1048576 - export NCCL_BUFFSIZE=4194304 - export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177" - export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191" - export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=50000 - export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="/run/tcpx" - # run NCCL -resources: - # Allocate some shared memory for NCCL - shm_size: 16GB -``` - -
- -!!! info "Future plans" - We're working on improving support for A3 High and A3 Edge by pre-building `dstack` VM image optimized for GPUDirect-TCPX instead of relying on the COS image used now, similar to the `dstack` support for A3 Mega. This will make configuration easier, reduce provisioning time, and improve performance. We're in contact with GCP on this issue. - -## Source code - -The source code for this example can be found in -[`examples/distributed-training/a3high-clusters`](https://github.com/dstackai/dstack/blob/master/examples/distributed-training/a3high-clusters). diff --git a/examples/clusters/a3mega/README.md b/examples/clusters/a3mega/README.md deleted file mode 100644 index e7db0bd2c9..0000000000 --- a/examples/clusters/a3mega/README.md +++ /dev/null @@ -1,195 +0,0 @@ -# GCP A3 Mega - -This example shows how to set up a GCP A3 Mega cluster with [GPUDirect-TCPXO](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot) -optimized NCCL communication and run [NCCL Tests](https://github.com/NVIDIA/nccl-tests) on it using `dstack`. - -## Overview - -GCP's A3 Mega instances are 8xH100 VMs that have 1800Gbps maximum network bandwidth, -which is the best among GCP H100 instances. To get that network performance, you need -to set up GPUDirect-TCPXO – the GCP technology for GPU RDMA over TCP. This involves: - -* Setting up eight extra data NICs on every node, each NIC in a separate VPC. -* Building a VM image with the GPUDirect-TCPXO support. -* Launching an RXDM service container. -* Installing the GPUDirect-TCPXO NCCL plugin. - -`dstack` hides most of the setup complexity and provides optimized A3 Mega clusters out-of-the-box. - -## Configure GCP backend - -First configure the `gcp` backend for the GPUDirect-TCPXO support. -You need to specify eight `extra_vpcs` to use for data NICs: - -
- -```yaml -projects: - - name: main - backends: - - type: gcp - project_id: $MYPROJECT # Replace $MYPROJECT - extra_vpcs: - - dstack-gpu-data-net-1 - - dstack-gpu-data-net-2 - - dstack-gpu-data-net-3 - - dstack-gpu-data-net-4 - - dstack-gpu-data-net-5 - - dstack-gpu-data-net-6 - - dstack-gpu-data-net-7 - - dstack-gpu-data-net-8 - regions: [europe-west4] - creds: - type: default -``` - -
- -!!! info "Custom VPC" - If you specify a non-default primary VPC, ensure it has a firewall rule - allowing all traffic within the VPC. This is needed for MPI and NCCL to work. - The default VPC already permits traffic within the VPC. - -??? info "Create extra VPCs" - Create the VPC networks for GPUDirect in your project, each with a subnet and a firewall rule: - - ```shell - # Specify the region where you intend to deploy the cluster - REGION="europe-west4" - - for N in $(seq 1 8); do - gcloud compute networks create dstack-gpu-data-net-$N \ - --subnet-mode=custom \ - --mtu=8244 - - gcloud compute networks subnets create dstack-gpu-data-sub-$N \ - --network=dstack-gpu-data-net-$N \ - --region=$REGION \ - --range=192.168.$N.0/24 - - gcloud compute firewall-rules create dstack-gpu-data-internal-$N \ - --network=dstack-gpu-data-net-$N \ - --action=ALLOW \ - --rules=tcp:0-65535,udp:0-65535,icmp \ - --source-ranges=192.168.0.0/16 - done - ``` - -## Create A3 Mega fleet - -Once you've configured the `gcp` backend, create the fleet configuration: - -
-```yaml -type: fleet -name: a3mega-cluster -nodes: 2 -placement: cluster -instance_types: - - a3-megagpu-8g -spot_policy: auto -``` -
- -and apply the configuration: - -
- -```shell -$ dstack apply -f examples/misc/a3mega-clusters/fleet.dstack.yml - Project main - User admin - Configuration examples/misc/a3mega-clusters/fleet.dstack.yml - Type fleet - Fleet type cloud - Nodes 2 - Placement cluster - Resources 2..xCPU, 8GB.., 100GB.. (disk) - Spot policy auto - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 gcp europe-west4 a3-megagpu-8g 208xCPU, 1872GB, yes $22.1525 - 8xH100 (80GB), - 100.0GB (disk) - 2 gcp europe-west4 a3-megagpu-8g 208xCPU, 1872GB, no $64.2718 - 8xH100 (80GB), - 100.0GB (disk) - -Fleet a3mega-cluster does not exist yet. -Create the fleet? [y/n]: y - -Provisioning... ----> 100% -``` - -
- -`dstack` will provision two A3 Mega nodes with GPUDirect-TCPXO configured. - -## Run NCCL Tests with GPUDirect-TCPXO support - -Once the nodes are provisioned, let's test the network by running NCCL Tests: - -
- -```shell -$ dstack apply -f examples/misc/a3mega-clusters/nccl-tests.dstack.yml - -nccl-tests provisioning completed (running) -nThread 1 nGpus 1 minBytes 8388608 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 200 agg iters: 1 validation: 0 graph: 0 - - out-of-place in-place - size count type redop root time algbw busbw #wrong time algbw busbw #wrong - (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) - 8388608 131072 float none -1 166.6 50.34 47.19 N/A 164.1 51.11 47.92 N/A - 16777216 262144 float none -1 204.6 82.01 76.89 N/A 203.8 82.30 77.16 N/A - 33554432 524288 float none -1 284.0 118.17 110.78 N/A 281.7 119.12 111.67 N/A - 67108864 1048576 float none -1 447.4 150.00 140.62 N/A 443.5 151.31 141.86 N/A - 134217728 2097152 float none -1 808.3 166.05 155.67 N/A 801.9 167.38 156.92 N/A - 268435456 4194304 float none -1 1522.1 176.36 165.34 N/A 1518.7 176.76 165.71 N/A - 536870912 8388608 float none -1 2892.3 185.62 174.02 N/A 2894.4 185.49 173.89 N/A - 1073741824 16777216 float none -1 5532.7 194.07 181.94 N/A 5530.7 194.14 182.01 N/A - 2147483648 33554432 float none -1 10863 197.69 185.34 N/A 10837 198.17 185.78 N/A - 4294967296 67108864 float none -1 21481 199.94 187.45 N/A 21466 200.08 187.58 N/A - 8589934592 134217728 float none -1 42713 201.11 188.54 N/A 42701 201.16 188.59 N/A -Out of bounds values : 0 OK -Avg bus bandwidth : 146.948 - -Done -``` - -The networking bandwidth should be close to the maximum bandwidth supported by GCP. - -
- -## Run NCCL workloads with GPUDirect-TCPXO support - -To take full advantage of GPUDirect-TCPXO in your workloads, you need properly set up the [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl). -This can be done with the following commands in your run configuration: - -
- -```yaml -type: task -nodes: 2 -commands: - - | - NCCL_LIB_DIR="/var/lib/tcpxo/lib64" - source ${NCCL_LIB_DIR}/nccl-env-profile-ll128.sh - export NCCL_FASTRAK_CTRL_DEV=enp0s12 - export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 - export NCCL_SOCKET_IFNAME=enp0s12 - export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY="/dev/aperture_devices" - export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}" - # run NCCL -resources: - # Allocate some shared memory for NCCL - shm_size: 16GB -``` - -
- -## Source code - -The source code for this example can be found in -[`examples/misc/a3mega-clusters`](https://github.com/dstackai/dstack/blob/master/examples/misc/a3mega-clusters). diff --git a/examples/clusters/a4/README.md b/examples/clusters/a4/README.md deleted file mode 100644 index 43e9a4609c..0000000000 --- a/examples/clusters/a4/README.md +++ /dev/null @@ -1,122 +0,0 @@ -# GCP A4 - -This example shows how to set up a GCP A4 cluster with optimized RoCE networking and run NCCL Tests on it using `dstack`. - -GCP A4 instances provide eight NVIDIA B200 GPUs per VM, each with 180GB memory. These instances also have eight NVIDIA ConnectX-7 (CX-7) NICs that utilize RDMA over Converged Ethernet (RoCE) networking, making them ideal for large-scale distributed deep learning. - -## Configure the GCP backend - -First, configure the `gcp` backend for A4 RoCE support. Specify one VPC in `extra_vpcs` for general traffic between nodes (in addition to the main VPC), and one VPC in `roce_vpcs` for GPU-to-GPU communication. - -
- -```yaml -projects: -- name: main - backends: - - type: gcp - project_id: my-project - creds: - type: default - vpc_name: my-vpc-0 # Main VPC (1 subnet, omit to use the default VPC) - extra_vpcs: - - my-vpc-1 # Extra VPC (1 subnet) - roce_vpcs: - - my-vpc-mrdma # RoCE VPC (8 subnets, RoCE profile) -``` - -
- -!!! info "RoCE VPC setup" - The VPC listed in `roce_vpcs` must be created with the RoCE profile and have **eight subnets** (one per GPU). Follow [GCP's RoCE setup guide](https://cloud.google.com/ai-hypercomputer/docs/create/create-vm#setup-network) for details. - -!!! info "Firewall rules" - Ensure all VPCs allow internal traffic between nodes for MPI/NCCL to function. - -## Create a fleet - -Define your fleet configuration: - -
- -```yaml -type: fleet -name: a4-cluster - -nodes: 2 -placement: cluster - -# Specify the zone where you have configured the RoCE VPC -availability_zones: [us-west2-c] -backends: [gcp] -spot_policy: auto - -resources: - gpu: B200:8 -``` - -
- -Then apply it with `dstack apply`: - -
- -```shell -$ dstack apply -f examples/clusters/a4/fleet.dstack.yml - -Provisioning... ----> 100% - - FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED - a4-cluster 0 gcp (us-west2) B200:180GB:8 (spot) $51.552 idle 9 mins ago - 1 gcp (us-west2) B200:180GB:8 (spot) $51.552 idle 9 mins ago -``` - -
- -`dstack` will provision the instances and set up ten network interfaces on each instance: - -- 1 regular network interface in the main VPC (`vpc_name`) -- 1 regular interface in an extra VPC (`extra_vpcs`) -- 8 RoCE-enabled interfaces in a dedicated VPC (`roce_vpcs`) - -!!! info "Spot instances" - Currently, the `gcp` backend supports only A4 spot instances. - -## Run NCCL tests - -To validate networking and GPU performance, you can run [NCCL tests](https://dstack.ai/examples/clusters/nccl-tests/): - -
- -```shell -$ dstack apply -f examples/clusters/nccl-tests/.dstack.yml - -Provisioning... ----> 100% - - nThread 1 nGpus 1 minBytes 8 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0 - size count type redop root time algbw busbw wrong time algbw busbw wrong - (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) - 8388608 2097152 float sum -1 156.9 53.47 100.25 0 167.6 50.06 93.86 0 - 16777216 4194304 float sum -1 196.3 85.49 160.29 0 206.2 81.37 152.57 0 - 33554432 8388608 float sum -1 258.5 129.82 243.42 0 261.8 128.18 240.33 0 - 67108864 16777216 float sum -1 369.4 181.69 340.67 0 371.2 180.79 338.98 0 - 134217728 33554432 float sum -1 638.5 210.22 394.17 0 587.2 228.57 428.56 0 - 268435456 67108864 float sum -1 940.3 285.49 535.29 0 950.7 282.36 529.43 0 - 536870912 134217728 float sum -1 1695.2 316.70 593.81 0 1666.9 322.08 603.89 0 - 1073741824 268435456 float sum -1 3229.9 332.44 623.33 0 3201.8 335.35 628.78 0 - 2147483648 536870912 float sum -1 6107.7 351.61 659.26 0 6157.1 348.78 653.97 0 - 4294967296 1073741824 float sum -1 11952 359.36 673.79 0 11942 359.65 674.34 0 - 8589934592 2147483648 float sum -1 23563 364.55 683.52 0 23702 362.42 679.54 0 - Out of bounds values : 0 OK - Avg bus bandwidth : 165.789 -``` - -
-
-!!! info "What's next"
-    1. Learn more about [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks)
-    2. Check [dev environments](https://dstack.ai/docs/concepts/dev-environments),
-       [services](https://dstack.ai/docs/concepts/services), and [fleets](https://dstack.ai/docs/concepts/fleets)
-    3. Read the [Clusters](https://dstack.ai/docs/guides/clusters) guide
diff --git a/examples/clusters/gcp/README.md b/examples/clusters/gcp/README.md
new file mode 100644
index 0000000000..b773277e58
--- /dev/null
+++ b/examples/clusters/gcp/README.md
@@ -0,0 +1,419 @@
+# GCP
+
+This example shows how to create and use clusters on GCP.
+
+`dstack` supports the following instance types:
+
+| Instance type | GPU    | Maximum bandwidth | Fabric                                                                                                           |
+| ------------- | ------ | ----------------- | ---------------------------------------------------------------------------------------------------------------- |
+| **A3 Edge**   | H100:8 | 0.8 Tbps          | [GPUDirect-TCPX](https://cloud.google.com/compute/docs/gpus/gpudirect)                                            |
+| **A3 High**   | H100:8 | 1 Tbps            | [GPUDirect-TCPX](https://cloud.google.com/compute/docs/gpus/gpudirect)                                            |
+| **A3 Mega**   | H100:8 | 1.8 Tbps          | [GPUDirect-TCPXO](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot)  |
+| **A4**        | B200:8 | 3.2 Tbps          | RoCE                                                                                                              |
+
+## Configure the backend
+
+Although `dstack` hides most of the setup complexity, each instance type still needs some instance-specific backend configuration:
+
+=== "A4"
+    A4 requires one VPC in `extra_vpcs` for inter-node traffic (a regular VPC with one subnet) and one VPC in `roce_vpcs` for GPU-to-GPU communication (a VPC with the RoCE profile and eight subnets).
+
+ + ```yaml + projects: + - name: main + backends: + - type: gcp + # Specify your GCP project ID + project_id: + + extra_vpcs: [dstack-gvnic-net-1] + roce_vpcs: [dstack-mrdma] + + # Specify the regions you intend to use + regions: [us-west2] + + creds: + type: default + ``` + +
+ +

Create extra and RoCE VPCs
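+    The regular extra VPC can be created with standard commands. Below is a minimal sketch; the
+    network, subnet, and firewall-rule names are assumptions chosen to match the configuration above:
+
+    ```shell
+    # Specify the region where you intend to deploy the cluster
+    REGION="us-west2"
+
+    gcloud compute networks create dstack-gvnic-net-1 \
+        --subnet-mode=custom \
+        --mtu=8244
+
+    gcloud compute networks subnets create dstack-gvnic-sub-1 \
+        --network=dstack-gvnic-net-1 \
+        --region=$REGION \
+        --range=192.168.1.0/24
+
+    gcloud compute firewall-rules create dstack-gvnic-internal-1 \
+        --network=dstack-gvnic-net-1 \
+        --action=ALLOW \
+        --rules=tcp:0-65535,udp:0-65535,icmp \
+        --source-ranges=192.168.0.0/16
+    ```
+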

+
+    The VPC listed in `roce_vpcs` must be created with the RoCE network profile and have eight subnets (one per GPU).
+    See GCP's [RoCE network setup guide](https://cloud.google.com/ai-hypercomputer/docs/create/create-vm#setup-network) for the commands to create
+    the RoCE VPC and its firewall rules.
+
+    Ensure the VPCs allow internal traffic between nodes for MPI/NCCL to function.
+
+=== "A3 Mega"
+    A3 Mega requires eight `extra_vpcs` for data NICs.
+
+ + ```yaml + projects: + - name: main + backends: + - type: gcp + # Specify your GCP project ID + project_id: + + extra_vpcs: + - dstack-gpu-data-net-1 + - dstack-gpu-data-net-2 + - dstack-gpu-data-net-3 + - dstack-gpu-data-net-4 + - dstack-gpu-data-net-5 + - dstack-gpu-data-net-6 + - dstack-gpu-data-net-7 + - dstack-gpu-data-net-8 + + # Specify the regions you intend to use + regions: [europe-west4] + + creds: + type: default + ``` + +
+ +

Create extra VPCs

+ + Create the VPC networks for GPUDirect in your project, each with a subnet and a firewall rule: + + ```shell + # Specify the region where you intend to deploy the cluster + REGION="europe-west4" + + for N in $(seq 1 8); do + gcloud compute networks create dstack-gpu-data-net-$N \ + --subnet-mode=custom \ + --mtu=8244 + + gcloud compute networks subnets create dstack-gpu-data-sub-$N \ + --network=dstack-gpu-data-net-$N \ + --region=$REGION \ + --range=192.168.$N.0/24 + + gcloud compute firewall-rules create dstack-gpu-data-internal-$N \ + --network=dstack-gpu-data-net-$N \ + --action=ALLOW \ + --rules=tcp:0-65535,udp:0-65535,icmp \ + --source-ranges=192.168.0.0/16 + done + ``` + +=== "A3 High/Edge" + A3 Edge/High require at least 4 `extra_vpcs` for data NICs and a `vm_service_account` authorized to pull GPUDirect Docker images. + +
+
+    ```yaml
+    projects:
+    - name: main
+      backends:
+        - type: gcp
+          # Specify your GCP project ID
+          project_id: 
+
+          extra_vpcs:
+            - dstack-gpu-data-net-1
+            - dstack-gpu-data-net-2
+            - dstack-gpu-data-net-3
+            - dstack-gpu-data-net-4
+
+          # Specify the regions you intend to use
+          regions: [europe-west4]
+
+          # Replace $ with your GCP project ID
+          vm_service_account: a3cluster-sa@$.iam.gserviceaccount.com
+
+          creds:
+            type: default
+    ```
+
+ +

Create extra VPCs

+ + Create the VPC networks for GPUDirect in your project, each with a subnet and a firewall rule: + + ```shell + # Specify the region where you intend to deploy the cluster + REGION="europe-west4" + + for N in $(seq 1 4); do + gcloud compute networks create dstack-gpu-data-net-$N \ + --subnet-mode=custom \ + --mtu=8244 + + gcloud compute networks subnets create dstack-gpu-data-sub-$N \ + --network=dstack-gpu-data-net-$N \ + --region=$REGION \ + --range=192.168.$N.0/24 + + gcloud compute firewall-rules create dstack-gpu-data-internal-$N \ + --network=dstack-gpu-data-net-$N \ + --action=ALLOW \ + --rules=tcp:0-65535,udp:0-65535,icmp \ + --source-ranges=192.168.0.0/16 + done + ``` + +

Create a service account

+ + Create a VM service account that allows VMs to access the `pkg.dev` registry: + + ```shell + PROJECT_ID=$(gcloud config get-value project) + + gcloud iam service-accounts create a3cluster-sa \ + --display-name "Service Account for pulling GCR images" + + gcloud projects add-iam-policy-binding $PROJECT_ID \ + --member="serviceAccount:a3cluster-sa@${PROJECT_ID}.iam.gserviceaccount.com" \ + --role="roles/artifactregistry.reader" + ``` + +!!! info "Default VPC" + If you set a non-default `vpc_name` in the backend configuration, ensure it allows all inter-node traffic. This is required for MPI and NCCL. The default VPC already allows this. + +## Create a fleet + +Once you've configured the `gcp` backend, create the fleet configuration: + +=== "A4" + +
+
+    ```yaml
+    type: fleet
+    name: a4-fleet
+
+    placement: cluster
+    # Can be a range or a fixed number
+    nodes: 2
+
+    # Specify the zone where you have configured the RoCE VPC
+    availability_zones: [us-west2-c]
+
+    backends: [gcp]
+
+    # Uncomment to allow spot instances
+    #spot_policy: auto
+
+    resources:
+      gpu: B200:8
+    ```
+
+ + Then apply it with `dstack apply`: + +
+ + ```shell + $ dstack apply -f examples/clusters/gcp/a4-fleet.dstack.yml + + Provisioning... + ---> 100% + + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + a4-fleet 0 gcp (us-west2) B200:180GB:8 (spot) $51.552 idle 9 mins ago + 1 gcp (us-west2) B200:180GB:8 (spot) $51.552 idle 9 mins ago + ``` + +
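+
+    Provisioned instances stay `idle` until you submit a run. You can re-check the fleet at any
+    time with the CLI (a minimal example; assumes a recent `dstack` version where the `fleet`
+    subcommand is available):
+
+    ```shell
+    $ dstack fleet
+    ```
+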
+ +=== "A3 Mega" + +
+
+    ```yaml
+    type: fleet
+    name: a3mega-fleet
+
+    placement: cluster
+    # Can be a range or a fixed number
+    nodes: 2
+
+    instance_types:
+      - a3-megagpu-8g
+
+    # Uncomment to allow spot instances
+    #spot_policy: auto
+    ```
+ + Pass the configuration to `dstack apply`: + +
+
+    ```shell
+    $ dstack apply -f examples/clusters/gcp/a3-fleet.dstack.yml
+
+     FLEET         INSTANCE  BACKEND             GPU          PRICE            STATUS  CREATED
+     a3mega-fleet  1         gcp (europe-west4)  H100:80GB:8  $22.1525 (spot)  idle    9 mins ago
+     a3mega-fleet  2         gcp (europe-west4)  H100:80GB:8  $64.2718         idle    9 mins ago
+
+    Create the fleet? [y/n]: y
+
+    Provisioning...
+    ---> 100%
+    ```
+
+ +=== "A3 High/Edge" + +
+ + ```yaml + type: fleet + name: a3high-fleet + + placement: cluster + nodes: 2 + + instance_types: + - a3-highgpu-8g + + # Uncomment to allow spot instances + #spot_policy: auto + ``` + +
+ + Pass the configuration to `dstack apply`: + +
+
+    ```shell
+    $ dstack apply -f examples/clusters/gcp/a3high-fleet.dstack.yml
+
+     FLEET         INSTANCE  BACKEND             GPU          PRICE            STATUS  CREATED
+     a3high-fleet  1         gcp (europe-west4)  H100:80GB:8  $20.5688 (spot)  idle    9 mins ago
+     a3high-fleet  2         gcp (europe-west4)  H100:80GB:8  $58.5419         idle    9 mins ago
+
+    Create the fleet? [y/n]: y
+
+    Provisioning...
+    ---> 100%
+    ```
+
+ +Once the fleet is created, you can run distributed tasks, in addition to dev environments, services, and regular tasks. + +## Run NCCL tests + +Use a distributed task that runs NCCL tests to validate cluster network bandwidth. + +=== "A4" + Pass the configuration to `dstack apply`: + +
+ + ```shell + $ dstack apply -f examples/clusters/nccl-tests/.dstack.yml + + Provisioning... + ---> 100% + + nccl-tests provisioning completed (running) + nThread 1 nGpus 1 minBytes 8 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0 + size count type redop root time algbw busbw wrong time algbw busbw wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8388608 2097152 float sum -1 156.9 53.47 100.25 0 167.6 50.06 93.86 0 + 16777216 4194304 float sum -1 196.3 85.49 160.29 0 206.2 81.37 152.57 0 + 33554432 8388608 float sum -1 258.5 129.82 243.42 0 261.8 128.18 240.33 0 + 67108864 16777216 float sum -1 369.4 181.69 340.67 0 371.2 180.79 338.98 0 + 134217728 33554432 float sum -1 638.5 210.22 394.17 0 587.2 228.57 428.56 0 + 268435456 67108864 float sum -1 940.3 285.49 535.29 0 950.7 282.36 529.43 0 + 536870912 134217728 float sum -1 1695.2 316.70 593.81 0 1666.9 322.08 603.89 0 + 1073741824 268435456 float sum -1 3229.9 332.44 623.33 0 3201.8 335.35 628.78 0 + 2147483648 536870912 float sum -1 6107.7 351.61 659.26 0 6157.1 348.78 653.97 0 + 4294967296 1073741824 float sum -1 11952 359.36 673.79 0 11942 359.65 674.34 0 + 8589934592 2147483648 float sum -1 23563 364.55 683.52 0 23702 362.42 679.54 0 + Out of bounds values : 0 OK + Avg bus bandwidth : 165.789 + ``` + +
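+
+    The referenced configuration is a regular distributed task. Here is a rough sketch of its
+    shape (not the actual file; the name, command, and resources below are illustrative
+    placeholders):
+
+    ```yaml
+    type: task
+    name: nccl-tests
+
+    # Run across both nodes of the cluster fleet
+    nodes: 2
+
+    commands:
+      - echo "build and run NCCL tests here, e.g. all_reduce_perf via MPI"
+
+    resources:
+      gpu: B200:8
+      # Allocate shared memory for NCCL
+      shm_size: 16GB
+    ```
+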
+
+    !!! info "Source code"
+        The source code of the task can be found at [examples/clusters/nccl-tests/.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/nccl-tests/.dstack.yml).
+
+=== "A3 Mega"
+    > To fully use GPUDirect-TCPXO, properly set the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl).
+
+    Pass the configuration to `dstack apply`:
+
+ + ```shell + $ dstack apply -f examples/clusters/gcp/a3mega-nccl-tests.dstack.yml + + nccl-tests provisioning completed (running) + nThread 1 nGpus 1 minBytes 8388608 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 200 agg iters: 1 validation: 0 graph: 0 + + out-of-place in-place + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8388608 131072 float none -1 166.6 50.34 47.19 N/A 164.1 51.11 47.92 N/A + 16777216 262144 float none -1 204.6 82.01 76.89 N/A 203.8 82.30 77.16 N/A + 33554432 524288 float none -1 284.0 118.17 110.78 N/A 281.7 119.12 111.67 N/A + 67108864 1048576 float none -1 447.4 150.00 140.62 N/A 443.5 151.31 141.86 N/A + 134217728 2097152 float none -1 808.3 166.05 155.67 N/A 801.9 167.38 156.92 N/A + 268435456 4194304 float none -1 1522.1 176.36 165.34 N/A 1518.7 176.76 165.71 N/A + 536870912 8388608 float none -1 2892.3 185.62 174.02 N/A 2894.4 185.49 173.89 N/A + 1073741824 16777216 float none -1 5532.7 194.07 181.94 N/A 5530.7 194.14 182.01 N/A + 2147483648 33554432 float none -1 10863 197.69 185.34 N/A 10837 198.17 185.78 N/A + 4294967296 67108864 float none -1 21481 199.94 187.45 N/A 21466 200.08 187.58 N/A + 8589934592 134217728 float none -1 42713 201.11 188.54 N/A 42701 201.16 188.59 N/A + Out of bounds values : 0 OK + Avg bus bandwidth : 146.948 + ``` + +
+ + !!! info "Source code" + The source code of the task can be found at [examples/clusters/gcp/a3mega-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml). + +=== "A3 High/Edge" + > To fully use GPUDirect-TCPX, properly set the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl). + + Pass the configuration to `dstack apply`: + +
+ + ```shell + $ dstack apply -f examples/clusters/gcp/a3high-nccl-tests.dstack.yml + + nccl-tests provisioning completed (running) + nThread 1 nGpus 1 minBytes 8388608 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 200 agg iters: 1 validation: 0 graph: 0 + + out-of-place in-place + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8388608 131072 float none -1 784.9 10.69 10.02 0 775.9 10.81 10.14 0 + 16777216 262144 float none -1 1010.3 16.61 15.57 0 999.3 16.79 15.74 0 + 33554432 524288 float none -1 1161.6 28.89 27.08 0 1152.9 29.10 27.28 0 + 67108864 1048576 float none -1 1432.6 46.84 43.92 0 1437.8 46.67 43.76 0 + 134217728 2097152 float none -1 2516.9 53.33 49.99 0 2491.7 53.87 50.50 0 + 268435456 4194304 float none -1 5066.8 52.98 49.67 0 5131.4 52.31 49.04 0 + 536870912 8388608 float none -1 10028 53.54 50.19 0 10149 52.90 49.60 0 + 1073741824 16777216 float none -1 20431 52.55 49.27 0 20214 53.12 49.80 0 + 2147483648 33554432 float none -1 40254 53.35 50.01 0 39923 53.79 50.43 0 + 4294967296 67108864 float none -1 80896 53.09 49.77 0 78875 54.45 51.05 0 + 8589934592 134217728 float none -1 160505 53.52 50.17 0 160117 53.65 50.29 0 + Out of bounds values : 0 OK + Avg bus bandwidth : 40.6043 + ``` + +
+ + !!! info "Source code" + The source code of the task can be found at [examples/clusters/gcp/a3high-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3high-nccl-tests.dstack.yml). diff --git a/examples/clusters/a3mega/fleet.dstack.yml b/examples/clusters/gcp/a3-fleet.dstack.yml similarity index 100% rename from examples/clusters/a3mega/fleet.dstack.yml rename to examples/clusters/gcp/a3-fleet.dstack.yml diff --git a/examples/clusters/a3high/fleet.dstack.yml b/examples/clusters/gcp/a3high-fleet.dstack.yml similarity index 100% rename from examples/clusters/a3high/fleet.dstack.yml rename to examples/clusters/gcp/a3high-fleet.dstack.yml diff --git a/examples/clusters/a3high/nccl-tests.dstack.yml b/examples/clusters/gcp/a3high-nccl-tests.dstack.yml similarity index 100% rename from examples/clusters/a3high/nccl-tests.dstack.yml rename to examples/clusters/gcp/a3high-nccl-tests.dstack.yml diff --git a/examples/clusters/a3mega/nccl-tests.dstack.yml b/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml similarity index 100% rename from examples/clusters/a3mega/nccl-tests.dstack.yml rename to examples/clusters/gcp/a3mega-nccl-tests.dstack.yml diff --git a/examples/clusters/a4/fleet.dstack.yml b/examples/clusters/gcp/a4-fleet.dstack.yml similarity index 100% rename from examples/clusters/a4/fleet.dstack.yml rename to examples/clusters/gcp/a4-fleet.dstack.yml diff --git a/mkdocs.yml b/mkdocs.yml index 1652c62bbb..38ebb8353b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -155,8 +155,8 @@ plugins: 'docs/guides/monitoring.md': 'docs/guides/metrics.md' 'blog/nvidia-and-amd-on-vultr.md.md': 'blog/posts/nvidia-and-amd-on-vultr.md' 'examples/misc/nccl-tests/index.md': 'examples/clusters/nccl-tests/index.md' - 'examples/misc/a3high-clusters/index.md': 'examples/clusters/a3high/index.md' - 'examples/misc/a3mega-clusters/index.md': 'examples/clusters/a3mega/index.md' + 'examples/misc/a3high-clusters/index.md': 'examples/clusters/gcp/index.md' + 'examples/misc/a3mega-clusters/index.md': 'examples/clusters/gcp/index.md' 'examples/distributed-training/nccl-tests/index.md': 'examples/clusters/nccl-tests/index.md' 'examples/distributed-training/rccl-tests/index.md': 'examples/clusters/rccl-tests/index.md' 'examples/deployment/nim/index.md': 'examples/inference/nim/index.md' @@ -168,6 +168,9 @@ plugins: 'examples/fine-tuning/axolotl/index.md': 'examples/single-node-training/axolotl/index.md' 'blog/efa.md': 'examples/clusters/efa/index.md' 'docs/concepts/repos.md': 'docs/concepts/dev-environments.md#repos' + 'examples/clusters/a3high/index.md': 'examples/clusters/gcp/index.md' + 'examples/clusters/a3mega/index.md': 'examples/clusters/gcp/index.md' + 'examples/clusters/a4/index.md': 'examples/clusters/gcp/index.md' - typeset - gen-files: scripts: # always relative to mkdocs.yml @@ -322,9 +325,7 @@ nav: - Clusters: - NCCL tests: examples/clusters/nccl-tests/index.md - RCCL tests: examples/clusters/rccl-tests/index.md - - GCP A4: examples/clusters/a4/index.md - - GCP A3 Mega: examples/clusters/a3mega/index.md - - GCP A3 High: examples/clusters/a3high/index.md + - GCP: examples/clusters/gcp/index.md - AWS EFA: examples/clusters/efa/index.md - Inference: - SGLang: examples/inference/sglang/index.md From 81594eabfe4440d904d9d34b3b3ae8c05f4be81d Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Mon, 15 Dec 2025 13:08:27 +0100 Subject: [PATCH 2/3] [Docs] Merge GCP clusters examples Added few notes --- examples/clusters/gcp/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 
deletion(-)

diff --git a/examples/clusters/gcp/README.md b/examples/clusters/gcp/README.md
index b773277e58..9fd51aee39 100644
--- a/examples/clusters/gcp/README.md
+++ b/examples/clusters/gcp/README.md
@@ -383,7 +383,7 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
         The source code of the task can be found at [examples/clusters/gcp/a3mega-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml).
 
 === "A3 High/Edge"
-    > To fully use GPUDirect-TCPX, properly set the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl).
+    > To fully use GPUDirect-TCPX, properly set the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl). Since we use a ready-to-use Docker image, these environment variables are already preconfigured.
 
     Pass the configuration to `dstack apply`:
@@ -417,3 +417,9 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
 
     !!! info "Source code"
         The source code of the task can be found at [examples/clusters/gcp/a3high-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3high-nccl-tests.dstack.yml).
+
+## What's next
+
+1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services)
+2. Read the [Clusters](https://dstack.ai/docs/guides/clusters) guide
+3. Check GCP's docs on using [A4](https://docs.cloud.google.com/compute/docs/gpus/create-gpu-vm-a3u-a4) and [A3 Mega/High/Edge](https://docs.cloud.google.com/compute/docs/gpus/gpudirect) instances

From e4a849b61aba5246330b8d7a632180a4a35b03ef Mon Sep 17 00:00:00 2001
From: peterschmidt85
Date: Mon, 15 Dec 2025 17:01:25 +0100
Subject: [PATCH 3/3] [Docs] Merge GCP clusters examples

Added distributed tasks examples
---
 examples/clusters/gcp/README.md | 76 +++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 9 deletions(-)

diff --git a/examples/clusters/gcp/README.md b/examples/clusters/gcp/README.md
index 9fd51aee39..9f7bad984d 100644
--- a/examples/clusters/gcp/README.md
+++ b/examples/clusters/gcp/README.md
@@ -307,7 +307,9 @@ Once you've configured the `gcp` backend, create the fleet configuration:
 
 Once the fleet is created, you can run distributed tasks, in addition to dev environments, services, and regular tasks.
 
-## Run NCCL tests
+## Run tasks
+
+### NCCL tests
 
 Use a distributed task that runs NCCL tests to validate cluster network bandwidth.
 
@@ -343,11 +345,9 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
 
-    !!! info "Source code"
-        The source code of the task can be found at [examples/clusters/nccl-tests/.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/nccl-tests/.dstack.yml).
-
-=== "A3 Mega"
-    > To fully use GPUDirect-TCPXO, properly set the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl).
+=== "A3 Mega"
+    !!! info "Source code"
+        The source code of the task can be found at [examples/clusters/gcp/a3mega-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml).
    Pass the configuration to `dstack apply`:
 
@@ -379,11 +379,9 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
 
-    !!! info "Source code"
-        The source code of the task can be found at [examples/clusters/gcp/a3mega-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3mega-nccl-tests.dstack.yml).
-
-=== "A3 High/Edge"
-    > To fully use GPUDirect-TCPX, properly set the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl). Since we use a ready-to-use Docker image, these environment variables are already preconfigured.
+=== "A3 High/Edge"
+    !!! info "Source code"
+        The source code of the task can be found at [examples/clusters/gcp/a3high-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3high-nccl-tests.dstack.yml).
 
     Pass the configuration to `dstack apply`:
 
@@ -418,3 +416,66 @@ Use a distributed task that runs NCCL tests to validate cluster network bandwidt
 
     !!! info "Source code"
         The source code of the task can be found at [examples/clusters/gcp/a3high-nccl-tests.dstack.yml](https://github.com/dstackai/dstack/blob/master/examples/clusters/gcp/a3high-nccl-tests.dstack.yml).
+
+### Distributed training
+
+=== "A4"
+    You can use the standard [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) example to run distributed training on A4 instances.
+
+=== "A3 Mega"
+    You can use the standard [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) example to run distributed training on A3 Mega instances. To enable GPUDirect-TCPXO, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning:
+
+    ```yaml
+    # ...
+
+    commands:
+      - |
+        NCCL_LIB_DIR="/var/lib/tcpxo/lib64"
+        source ${NCCL_LIB_DIR}/nccl-env-profile-ll128.sh
+        export NCCL_FASTRAK_CTRL_DEV=enp0s12
+        export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0
+        export NCCL_SOCKET_IFNAME=enp0s12
+        export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY="/dev/aperture_devices"
+        export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}"
+
+        # ...
+    ```
+
+=== "A3 High/Edge"
+    You can use the standard [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) example to run distributed training on A3 High/Edge instances. To enable GPUDirect-TCPX, make sure the required [NCCL environment variables](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot#environment-variables-nccl) are properly set, for example by adding the following commands at the beginning:
+
+    ```yaml
+    # ...
+
+    commands:
+      - |
+        export NCCL_DEBUG=INFO
+        NCCL_LIB_DIR="/usr/local/tcpx/lib64"
+        export LD_LIBRARY_PATH="${NCCL_LIB_DIR}:${LD_LIBRARY_PATH}"
+        export NCCL_SOCKET_IFNAME=eth0
+        export NCCL_CROSS_NIC=0
+        export NCCL_ALGO=Ring
+        export NCCL_PROTO=Simple
+        export NCCL_NSOCKS_PERTHREAD=4
+        export NCCL_SOCKET_NTHREADS=1
+        export NCCL_NET_GDR_LEVEL=PIX
+        export NCCL_P2P_PXN_LEVEL=0
+        export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4
+        export NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0
+        export NCCL_DYNAMIC_CHUNK_SIZE=524288
+        export NCCL_P2P_NET_CHUNKSIZE=524288
+        export NCCL_P2P_PCI_CHUNKSIZE=524288
+        export NCCL_P2P_NVL_CHUNKSIZE=1048576
+        export NCCL_BUFFSIZE=4194304
+        export NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
+        export NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
+        export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=50000
+        export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="/run/tcpx"
+
+        # ...
+    ```
+
+In addition to distributed training, you can, of course, run regular tasks, dev environments, and services.
+
 ## What's next
 
 1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services)