From a61f9d2d565fc27c0c708c796531f266b6f982ac Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Mon, 7 Jul 2025 16:55:59 -0400 Subject: [PATCH 1/2] demonstrate runtime cluster config in benchmark/ --- benchmark/modal_train.py | 132 ++++++++++++++++++++++++++++----------- 1 file changed, 96 insertions(+), 36 deletions(-) diff --git a/benchmark/modal_train.py b/benchmark/modal_train.py index dcbcd02..e5636bb 100644 --- a/benchmark/modal_train.py +++ b/benchmark/modal_train.py @@ -1,4 +1,8 @@ +import argparse +import dataclasses +import enum import os +from typing import Union import modal import modal.experimental @@ -35,39 +39,95 @@ app = modal.App("multinode-benchmark") -@app.function( - gpu="H100:8", - cloud="oci", - image=image, -) -@modal.experimental.clustered(size=N_NODES, rdma=True) -def run_benchmark(): - """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" - - from torch.distributed.run import parse_args, run - - cluster_info = modal.experimental.get_cluster_info() - # which container am I? - container_rank: int = cluster_info.rank - # what's the leader/master/main container's address? - main_ip_addr: str = cluster_info.container_ips[0] - container_id = os.environ["MODAL_TASK_ID"] - - print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") - if container_rank == 0: - print(f"main container's address: {main_ip_addr}") - - args = [ - f"--nnodes={N_NODES}", - f"--nproc-per-node={N_PROC_PER_NODE}", - f"--node-rank={cluster_info.rank}", - f"--master-addr={main_ip_addr}", - REMOTE_BENCH_SCRIPT_PATH, - ] - print(f"Running torchrun with args: {' '.join(args)}") - run(parse_args(args)) - - -@app.local_entrypoint() -def main(): - run_benchmark.remote() +# NB: This cluster config code was ripped out of a project that shared training logic +# across single and multi node execution configs, hence the validation in __post_init__ +class ModalGPU(enum.StrEnum): + H100 = "H100" + H200 = "H200" + A100_40G = "A100-40G" + A100_80G = "A100-80G" + B200 = "B200" + L40S = "L40S" + + +@dataclasses.dataclass +class ModalClusterConfig: + num_nodes: int + gpus_per_node: int + gpu_type: Union[str, ModalGPU] = ModalGPU.H100 + + def __post_init__(self): + if isinstance(self.gpu_type, str): + try: + self.gpu_type = ModalGPU(self.gpu_type) + except ValueError: + valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU]) + raise ValueError( + f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}" + ) + + # @modal.experimental.clustered only supports H100s at the moment + if self.gpu_type != ModalGPU.H100 and self.num_nodes != 1: + raise ValueError( + f"num_nodes must be 1 when using gpu_type {self.gpu_type}. " + f"At time of writing, only {ModalGPU.H100} supports multiple nodes." + ) + + def gpu_str(self): + return f"{self.gpu_type}:{self.gpus_per_node}" + + +def build_benchmark(cfg: ModalClusterConfig): + @app.function( + gpu=cfg.gpu_str(), + cloud="oci", + image=image, + serialized=True, + ) + @modal.experimental.clustered(size=cfg.num_nodes, rdma=True) + def run_benchmark(): + """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" + + from torch.distributed.run import parse_args, run + + cluster_info = modal.experimental.get_cluster_info() + # which container am I? + container_rank: int = cluster_info.rank + # what's the leader/master/main container's address? 
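+        # container_ips is ordered by rank, so index 0 is the rank-0 (leader) container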
+ main_ip_addr: str = cluster_info.container_ips[0] + container_id = os.environ["MODAL_TASK_ID"] + + print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") + if container_rank == 0: + print(f"main container's address: {main_ip_addr}") + + args = [ + f"--nnodes={N_NODES}", + f"--nproc-per-node={N_PROC_PER_NODE}", + f"--node-rank={cluster_info.rank}", + f"--master-addr={main_ip_addr}", + REMOTE_BENCH_SCRIPT_PATH, + ] + print(f"Running torchrun with args: {' '.join(args)}") + run(parse_args(args)) + + return run_benchmark + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run multinode benchmark") + parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster") + parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node") + parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use") + + args = parser.parse_args() + + gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100") + cluster_config = ModalClusterConfig( + num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu + ) + run_benchmark = build_benchmark(cluster_config) + + with modal.enable_output(): + with app.run(detach=True): + run_benchmark.remote() From 6f1e98307dfef7a159c806e0c10ffc5f9d2af982 Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Mon, 7 Jul 2025 16:59:19 -0400 Subject: [PATCH 2/2] move runtime cluster config example to separate benchmark/ script --- .gitignore | 2 + benchmark/README.md | 10 ++ benchmark/modal_train.py | 132 ++++++++------------------- benchmark/modal_train_runtime_cfg.py | 121 ++++++++++++++++++++++++ 4 files changed, 169 insertions(+), 96 deletions(-) create mode 100644 benchmark/modal_train_runtime_cfg.py diff --git a/.gitignore b/.gitignore index 15201ac..8569e5b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +pyproject.toml + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/benchmark/README.md b/benchmark/README.md index 2c2444c..c318f04 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -35,3 +35,13 @@ The benchmark automatically configures RDMA settings for OCI's infrastructure: - Uses IPv4 for data plane (RDMA) communication - Configures optimal NCCL parameters for IB/RDMA - Sets appropriate HCA device ordering + +## Runtime Cluster Configuration + +This directory also contains an example of configuring the cluster definition at runtime in [modal_train_runtime_cfg.py](./modal_train_runtime_cfg.py). To run the NCCL bandwidth benchmark on a 2-node 8xB200 cluster: + +```bash +python modal_train_runtime_cfg.py 2 8 --gpu-type B200 +``` + +The `--gpu-type` parameter can be any of `H100`, `H200`, or `B200`. 
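+
+For programmatic use, the same pattern can be driven from Python rather than the CLI. The sketch below is illustrative, not part of the benchmark itself; it assumes it runs from this directory so that `ModalClusterConfig`, `app`, and `build_benchmark` can be imported from `modal_train_runtime_cfg.py`:
+
+```python
+import modal
+
+from modal_train_runtime_cfg import ModalClusterConfig, app, build_benchmark
+
+# configure a 2-node, 8x H100 cluster at runtime and build the clustered function
+cfg = ModalClusterConfig(num_nodes=2, gpus_per_node=8, gpu_type="H100")
+run_benchmark = build_benchmark(cfg)
+
+with modal.enable_output():
+    with app.run(detach=True):
+        run_benchmark.remote()
+```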
diff --git a/benchmark/modal_train.py b/benchmark/modal_train.py index e5636bb..dcbcd02 100644 --- a/benchmark/modal_train.py +++ b/benchmark/modal_train.py @@ -1,8 +1,4 @@ -import argparse -import dataclasses -import enum import os -from typing import Union import modal import modal.experimental @@ -39,95 +35,39 @@ app = modal.App("multinode-benchmark") -# NB: This cluster config code was ripped out of a project that shared training logic -# across single and multi node execution configs, hence the validation in __post_init__ -class ModalGPU(enum.StrEnum): - H100 = "H100" - H200 = "H200" - A100_40G = "A100-40G" - A100_80G = "A100-80G" - B200 = "B200" - L40S = "L40S" - - -@dataclasses.dataclass -class ModalClusterConfig: - num_nodes: int - gpus_per_node: int - gpu_type: Union[str, ModalGPU] = ModalGPU.H100 - - def __post_init__(self): - if isinstance(self.gpu_type, str): - try: - self.gpu_type = ModalGPU(self.gpu_type) - except ValueError: - valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU]) - raise ValueError( - f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}" - ) - - # @modal.experimental.clustered only supports H100s at the moment - if self.gpu_type != ModalGPU.H100 and self.num_nodes != 1: - raise ValueError( - f"num_nodes must be 1 when using gpu_type {self.gpu_type}. " - f"At time of writing, only {ModalGPU.H100} supports multiple nodes." - ) - - def gpu_str(self): - return f"{self.gpu_type}:{self.gpus_per_node}" - - -def build_benchmark(cfg: ModalClusterConfig): - @app.function( - gpu=cfg.gpu_str(), - cloud="oci", - image=image, - serialized=True, - ) - @modal.experimental.clustered(size=cfg.num_nodes, rdma=True) - def run_benchmark(): - """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" - - from torch.distributed.run import parse_args, run - - cluster_info = modal.experimental.get_cluster_info() - # which container am I? - container_rank: int = cluster_info.rank - # what's the leader/master/main container's address? 
- main_ip_addr: str = cluster_info.container_ips[0] - container_id = os.environ["MODAL_TASK_ID"] - - print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") - if container_rank == 0: - print(f"main container's address: {main_ip_addr}") - - args = [ - f"--nnodes={N_NODES}", - f"--nproc-per-node={N_PROC_PER_NODE}", - f"--node-rank={cluster_info.rank}", - f"--master-addr={main_ip_addr}", - REMOTE_BENCH_SCRIPT_PATH, - ] - print(f"Running torchrun with args: {' '.join(args)}") - run(parse_args(args)) - - return run_benchmark - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run multinode benchmark") - parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster") - parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node") - parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use") - - args = parser.parse_args() - - gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100") - cluster_config = ModalClusterConfig( - num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu - ) - run_benchmark = build_benchmark(cluster_config) - - with modal.enable_output(): - with app.run(detach=True): - run_benchmark.remote() +@app.function( + gpu="H100:8", + cloud="oci", + image=image, +) +@modal.experimental.clustered(size=N_NODES, rdma=True) +def run_benchmark(): + """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" + + from torch.distributed.run import parse_args, run + + cluster_info = modal.experimental.get_cluster_info() + # which container am I? + container_rank: int = cluster_info.rank + # what's the leader/master/main container's address? + main_ip_addr: str = cluster_info.container_ips[0] + container_id = os.environ["MODAL_TASK_ID"] + + print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") + if container_rank == 0: + print(f"main container's address: {main_ip_addr}") + + args = [ + f"--nnodes={N_NODES}", + f"--nproc-per-node={N_PROC_PER_NODE}", + f"--node-rank={cluster_info.rank}", + f"--master-addr={main_ip_addr}", + REMOTE_BENCH_SCRIPT_PATH, + ] + print(f"Running torchrun with args: {' '.join(args)}") + run(parse_args(args)) + + +@app.local_entrypoint() +def main(): + run_benchmark.remote() diff --git a/benchmark/modal_train_runtime_cfg.py b/benchmark/modal_train_runtime_cfg.py new file mode 100644 index 0000000..6e39b94 --- /dev/null +++ b/benchmark/modal_train_runtime_cfg.py @@ -0,0 +1,121 @@ +import argparse +import dataclasses +import enum +import os +from typing import Union + +import modal +import modal.experimental + +cuda_version = "12.9.1" # should be no greater than host CUDA version +flavor = "devel" # includes full CUDA toolkit +operating_sys = "ubuntu24.04" +tag = f"{cuda_version}-{flavor}-{operating_sys}" + +LOCAL_CODE_DIR = os.path.dirname(os.path.abspath(__file__)) +REMOTE_CODE_DIR = "/root/" +REMOTE_BENCH_SCRIPT_PATH = "/root/train.py" + +N_NODES = 2 +N_PROC_PER_NODE = 8 + +image = ( + modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12") + .apt_install( + "libibverbs-dev", + "libibverbs1", + ) + .uv_pip_install( + "torch==2.9.1", "numpy", "importlib-metadata", "nvidia-cudnn-cu12>=9.0.15" + ) + .add_local_dir( + LOCAL_CODE_DIR, + remote_path=REMOTE_CODE_DIR, + ) +) + +app = modal.App("multinode-benchmark") + + +class ModalGPU(enum.StrEnum): + H100 = "H100" + H200 = "H200" + B200 = "B200" + + +@dataclasses.dataclass +class ModalClusterConfig: + num_nodes: int + gpus_per_node: 
int + gpu_type: Union[str, ModalGPU] = ModalGPU.H100 + + def __post_init__(self): + if isinstance(self.gpu_type, str): + try: + self.gpu_type = ModalGPU(self.gpu_type) + except ValueError: + valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU]) + raise ValueError( + f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}" + ) + + def gpu_str(self): + return f"{self.gpu_type}:{self.gpus_per_node}" + + +def run_benchmark(): + """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" + + from torch.distributed.run import parse_args, run + + cluster_info = modal.experimental.get_cluster_info() + # which container am I? + container_rank: int = cluster_info.rank + # what's the leader/master/main container's address? + main_ip_addr: str = cluster_info.container_ips[0] + container_id = os.environ["MODAL_TASK_ID"] + + print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") + if container_rank == 0: + print(f"main container's address: {main_ip_addr}") + + args = [ + f"--nnodes={N_NODES}", + f"--nproc-per-node={N_PROC_PER_NODE}", + f"--node-rank={cluster_info.rank}", + f"--master-addr={main_ip_addr}", + REMOTE_BENCH_SCRIPT_PATH, + ] + print(f"Running torchrun with args: {' '.join(args)}") + run(parse_args(args)) + + +def build_benchmark(cfg: ModalClusterConfig): + # additionally, could assign a different image build for hopper vs. blackwell + # or perform other hardware-specific setup/configuration as needed + + wrapped_runner = app.function( + gpu=cfg.gpu_str(), + image=image, + )(modal.experimental.clustered(size=cfg.num_nodes, rdma=True)(run_benchmark)) + + return wrapped_runner + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run multinode benchmark") + parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster") + parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node") + parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use") + + args = parser.parse_args() + + gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100") + cluster_config = ModalClusterConfig( + num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu + ) + run_benchmark = build_benchmark(cluster_config) + + with modal.enable_output(): + with app.run(detach=True): + run_benchmark.remote()
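
The comment in `build_benchmark` notes that a different image build could be assigned for Hopper vs. Blackwell, or other hardware-specific setup performed. A minimal sketch of that dispatch is below; `blackwell_image` and `build_benchmark_arch_aware` are hypothetical names, the image tag simply mirrors `image` from `modal_train_runtime_cfg.py`, and keying the choice on `ModalGPU.B200` is an assumption about how one might do it, not part of the patch:

```python
import modal
import modal.experimental

from modal_train_runtime_cfg import (
    ModalClusterConfig,
    ModalGPU,
    app,
    image,
    run_benchmark,
)

# hypothetical second image for Blackwell-class GPUs; in practice this is where a
# different CUDA base tag or extra packages would go (the tag here just mirrors `image`)
blackwell_image = modal.Image.from_registry(
    "nvidia/cuda:12.9.1-devel-ubuntu24.04", add_python="3.12"
).apt_install("libibverbs-dev", "libibverbs1")


def build_benchmark_arch_aware(cfg: ModalClusterConfig):
    # select the image (or other hardware-specific options) from the runtime config
    selected_image = blackwell_image if cfg.gpu_type == ModalGPU.B200 else image
    return app.function(
        gpu=cfg.gpu_str(),
        image=selected_image,
    )(modal.experimental.clustered(size=cfg.num_nodes, rdma=True)(run_benchmark))
```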