From a61f9d2d565fc27c0c708c796531f266b6f982ac Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Mon, 7 Jul 2025 16:55:59 -0400 Subject: [PATCH 1/2] demonstrate runtime cluster config in benchmark/ --- benchmark/modal_train.py | 132 ++++++++++++++++++++++++++++----------- 1 file changed, 96 insertions(+), 36 deletions(-) diff --git a/benchmark/modal_train.py b/benchmark/modal_train.py index dcbcd02..e5636bb 100644 --- a/benchmark/modal_train.py +++ b/benchmark/modal_train.py @@ -1,4 +1,8 @@ +import argparse +import dataclasses +import enum import os +from typing import Union import modal import modal.experimental @@ -35,39 +39,95 @@ app = modal.App("multinode-benchmark") -@app.function( - gpu="H100:8", - cloud="oci", - image=image, -) -@modal.experimental.clustered(size=N_NODES, rdma=True) -def run_benchmark(): - """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" - - from torch.distributed.run import parse_args, run - - cluster_info = modal.experimental.get_cluster_info() - # which container am I? - container_rank: int = cluster_info.rank - # what's the leader/master/main container's address? - main_ip_addr: str = cluster_info.container_ips[0] - container_id = os.environ["MODAL_TASK_ID"] - - print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") - if container_rank == 0: - print(f"main container's address: {main_ip_addr}") - - args = [ - f"--nnodes={N_NODES}", - f"--nproc-per-node={N_PROC_PER_NODE}", - f"--node-rank={cluster_info.rank}", - f"--master-addr={main_ip_addr}", - REMOTE_BENCH_SCRIPT_PATH, - ] - print(f"Running torchrun with args: {' '.join(args)}") - run(parse_args(args)) - - -@app.local_entrypoint() -def main(): - run_benchmark.remote() +# NB: This cluster config code was ripped out of a project that shared training logic +# across single and multi node execution configs, hence the validation in __post_init__ +class ModalGPU(enum.StrEnum): + H100 = "H100" + H200 = "H200" + A100_40G = "A100-40G" + A100_80G = "A100-80G" + B200 = "B200" + L40S = "L40S" + + +@dataclasses.dataclass +class ModalClusterConfig: + num_nodes: int + gpus_per_node: int + gpu_type: Union[str, ModalGPU] = ModalGPU.H100 + + def __post_init__(self): + if isinstance(self.gpu_type, str): + try: + self.gpu_type = ModalGPU(self.gpu_type) + except ValueError: + valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU]) + raise ValueError( + f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}" + ) + + # @modal.experimental.clustered only supports H100s at the moment + if self.gpu_type != ModalGPU.H100 and self.num_nodes != 1: + raise ValueError( + f"num_nodes must be 1 when using gpu_type {self.gpu_type}. " + f"At time of writing, only {ModalGPU.H100} supports multiple nodes." + ) + + def gpu_str(self): + return f"{self.gpu_type}:{self.gpus_per_node}" + + +def build_benchmark(cfg: ModalClusterConfig): + @app.function( + gpu=cfg.gpu_str(), + cloud="oci", + image=image, + serialized=True, + ) + @modal.experimental.clustered(size=cfg.num_nodes, rdma=True) + def run_benchmark(): + """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" + + from torch.distributed.run import parse_args, run + + cluster_info = modal.experimental.get_cluster_info() + # which container am I? + container_rank: int = cluster_info.rank + # what's the leader/master/main container's address? 
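+        # container_ips is ordered by rank, so index 0 is the rank-0 (leader) container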
+ main_ip_addr: str = cluster_info.container_ips[0] + container_id = os.environ["MODAL_TASK_ID"] + + print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") + if container_rank == 0: + print(f"main container's address: {main_ip_addr}") + + args = [ + f"--nnodes={N_NODES}", + f"--nproc-per-node={N_PROC_PER_NODE}", + f"--node-rank={cluster_info.rank}", + f"--master-addr={main_ip_addr}", + REMOTE_BENCH_SCRIPT_PATH, + ] + print(f"Running torchrun with args: {' '.join(args)}") + run(parse_args(args)) + + return run_benchmark + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run multinode benchmark") + parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster") + parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node") + parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use") + + args = parser.parse_args() + + gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100") + cluster_config = ModalClusterConfig( + num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu + ) + run_benchmark = build_benchmark(cluster_config) + + with modal.enable_output(): + with app.run(detach=True): + run_benchmark.remote() From 6f1e98307dfef7a159c806e0c10ffc5f9d2af982 Mon Sep 17 00:00:00 2001 From: Jason Mancuso <7891333+jvmncs@users.noreply.github.com> Date: Mon, 7 Jul 2025 16:59:19 -0400 Subject: [PATCH 2/2] move runtime cluster config example to separate benchmark/ script --- .gitignore | 2 + benchmark/README.md | 10 ++ benchmark/modal_train.py | 132 ++++++++------------------- benchmark/modal_train_runtime_cfg.py | 121 ++++++++++++++++++++++++ 4 files changed, 169 insertions(+), 96 deletions(-) create mode 100644 benchmark/modal_train_runtime_cfg.py diff --git a/.gitignore b/.gitignore index 15201ac..8569e5b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +pyproject.toml + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/benchmark/README.md b/benchmark/README.md index 2c2444c..c318f04 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -35,3 +35,13 @@ The benchmark automatically configures RDMA settings for OCI's infrastructure: - Uses IPv4 for data plane (RDMA) communication - Configures optimal NCCL parameters for IB/RDMA - Sets appropriate HCA device ordering + +## Runtime Cluster Configuration + +This directory also contains an example of configuring the cluster definition at runtime in [modal_train_runtime_cfg.py](./modal_train_runtime_cfg.py). To run the NCCL bandwidth benchmark on a 2-node 8xB200 cluster: + +```bash +python modal_train_runtime_cfg.py 2 8 --gpu-type B200 +``` + +The `--gpu-type` parameter can be any of `H100`, `H200`, or `B200`. 
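+
+For programmatic use, the same pattern can be driven from Python rather than the CLI. The sketch below is illustrative, not part of the benchmark itself; it assumes it runs from this directory so that `ModalClusterConfig`, `app`, and `build_benchmark` can be imported from `modal_train_runtime_cfg.py`:
+
+```python
+import modal
+
+from modal_train_runtime_cfg import ModalClusterConfig, app, build_benchmark
+
+# configure a 2-node, 8x H100 cluster at runtime and build the clustered function
+cfg = ModalClusterConfig(num_nodes=2, gpus_per_node=8, gpu_type="H100")
+run_benchmark = build_benchmark(cfg)
+
+with modal.enable_output():
+    with app.run(detach=True):
+        run_benchmark.remote()
+```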
diff --git a/benchmark/modal_train.py b/benchmark/modal_train.py index e5636bb..dcbcd02 100644 --- a/benchmark/modal_train.py +++ b/benchmark/modal_train.py @@ -1,8 +1,4 @@ -import argparse -import dataclasses -import enum import os -from typing import Union import modal import modal.experimental @@ -39,95 +35,39 @@ app = modal.App("multinode-benchmark") -# NB: This cluster config code was ripped out of a project that shared training logic -# across single and multi node execution configs, hence the validation in __post_init__ -class ModalGPU(enum.StrEnum): - H100 = "H100" - H200 = "H200" - A100_40G = "A100-40G" - A100_80G = "A100-80G" - B200 = "B200" - L40S = "L40S" - - -@dataclasses.dataclass -class ModalClusterConfig: - num_nodes: int - gpus_per_node: int - gpu_type: Union[str, ModalGPU] = ModalGPU.H100 - - def __post_init__(self): - if isinstance(self.gpu_type, str): - try: - self.gpu_type = ModalGPU(self.gpu_type) - except ValueError: - valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU]) - raise ValueError( - f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}" - ) - - # @modal.experimental.clustered only supports H100s at the moment - if self.gpu_type != ModalGPU.H100 and self.num_nodes != 1: - raise ValueError( - f"num_nodes must be 1 when using gpu_type {self.gpu_type}. " - f"At time of writing, only {ModalGPU.H100} supports multiple nodes." - ) - - def gpu_str(self): - return f"{self.gpu_type}:{self.gpus_per_node}" - - -def build_benchmark(cfg: ModalClusterConfig): - @app.function( - gpu=cfg.gpu_str(), - cloud="oci", - image=image, - serialized=True, - ) - @modal.experimental.clustered(size=cfg.num_nodes, rdma=True) - def run_benchmark(): - """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" - - from torch.distributed.run import parse_args, run - - cluster_info = modal.experimental.get_cluster_info() - # which container am I? - container_rank: int = cluster_info.rank - # what's the leader/master/main container's address? 
- main_ip_addr: str = cluster_info.container_ips[0] - container_id = os.environ["MODAL_TASK_ID"] - - print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") - if container_rank == 0: - print(f"main container's address: {main_ip_addr}") - - args = [ - f"--nnodes={N_NODES}", - f"--nproc-per-node={N_PROC_PER_NODE}", - f"--node-rank={cluster_info.rank}", - f"--master-addr={main_ip_addr}", - REMOTE_BENCH_SCRIPT_PATH, - ] - print(f"Running torchrun with args: {' '.join(args)}") - run(parse_args(args)) - - return run_benchmark - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run multinode benchmark") - parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster") - parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node") - parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use") - - args = parser.parse_args() - - gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100") - cluster_config = ModalClusterConfig( - num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu - ) - run_benchmark = build_benchmark(cluster_config) - - with modal.enable_output(): - with app.run(detach=True): - run_benchmark.remote() +@app.function( + gpu="H100:8", + cloud="oci", + image=image, +) +@modal.experimental.clustered(size=N_NODES, rdma=True) +def run_benchmark(): + """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" + + from torch.distributed.run import parse_args, run + + cluster_info = modal.experimental.get_cluster_info() + # which container am I? + container_rank: int = cluster_info.rank + # what's the leader/master/main container's address? + main_ip_addr: str = cluster_info.container_ips[0] + container_id = os.environ["MODAL_TASK_ID"] + + print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") + if container_rank == 0: + print(f"main container's address: {main_ip_addr}") + + args = [ + f"--nnodes={N_NODES}", + f"--nproc-per-node={N_PROC_PER_NODE}", + f"--node-rank={cluster_info.rank}", + f"--master-addr={main_ip_addr}", + REMOTE_BENCH_SCRIPT_PATH, + ] + print(f"Running torchrun with args: {' '.join(args)}") + run(parse_args(args)) + + +@app.local_entrypoint() +def main(): + run_benchmark.remote() diff --git a/benchmark/modal_train_runtime_cfg.py b/benchmark/modal_train_runtime_cfg.py new file mode 100644 index 0000000..6e39b94 --- /dev/null +++ b/benchmark/modal_train_runtime_cfg.py @@ -0,0 +1,121 @@ +import argparse +import dataclasses +import enum +import os +from typing import Union + +import modal +import modal.experimental + +cuda_version = "12.9.1" # should be no greater than host CUDA version +flavor = "devel" # includes full CUDA toolkit +operating_sys = "ubuntu24.04" +tag = f"{cuda_version}-{flavor}-{operating_sys}" + +LOCAL_CODE_DIR = os.path.dirname(os.path.abspath(__file__)) +REMOTE_CODE_DIR = "/root/" +REMOTE_BENCH_SCRIPT_PATH = "/root/train.py" + +N_NODES = 2 +N_PROC_PER_NODE = 8 + +image = ( + modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12") + .apt_install( + "libibverbs-dev", + "libibverbs1", + ) + .uv_pip_install( + "torch==2.9.1", "numpy", "importlib-metadata", "nvidia-cudnn-cu12>=9.0.15" + ) + .add_local_dir( + LOCAL_CODE_DIR, + remote_path=REMOTE_CODE_DIR, + ) +) + +app = modal.App("multinode-benchmark") + + +class ModalGPU(enum.StrEnum): + H100 = "H100" + H200 = "H200" + B200 = "B200" + + +@dataclasses.dataclass +class ModalClusterConfig: + num_nodes: int + gpus_per_node: 
int + gpu_type: Union[str, ModalGPU] = ModalGPU.H100 + + def __post_init__(self): + if isinstance(self.gpu_type, str): + try: + self.gpu_type = ModalGPU(self.gpu_type) + except ValueError: + valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU]) + raise ValueError( + f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}" + ) + + def gpu_str(self): + return f"{self.gpu_type}:{self.gpus_per_node}" + + +def run_benchmark(): + """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" + + from torch.distributed.run import parse_args, run + + cluster_info = modal.experimental.get_cluster_info() + # which container am I? + container_rank: int = cluster_info.rank + # what's the leader/master/main container's address? + main_ip_addr: str = cluster_info.container_ips[0] + container_id = os.environ["MODAL_TASK_ID"] + + print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") + if container_rank == 0: + print(f"main container's address: {main_ip_addr}") + + args = [ + f"--nnodes={N_NODES}", + f"--nproc-per-node={N_PROC_PER_NODE}", + f"--node-rank={cluster_info.rank}", + f"--master-addr={main_ip_addr}", + REMOTE_BENCH_SCRIPT_PATH, + ] + print(f"Running torchrun with args: {' '.join(args)}") + run(parse_args(args)) + + +def build_benchmark(cfg: ModalClusterConfig): + # additionally, could assign a different image build for hopper vs. blackwell + # or perform other hardware-specific setup/configuration as needed + + wrapped_runner = app.function( + gpu=cfg.gpu_str(), + image=image, + )(modal.experimental.clustered(size=cfg.num_nodes, rdma=True)(run_benchmark)) + + return wrapped_runner + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run multinode benchmark") + parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster") + parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node") + parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use") + + args = parser.parse_args() + + gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100") + cluster_config = ModalClusterConfig( + num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu + ) + run_benchmark = build_benchmark(cluster_config) + + with modal.enable_output(): + with app.run(detach=True): + run_benchmark.remote()
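
The comment in `build_benchmark` notes that a different image build could be assigned for Hopper vs. Blackwell, or other hardware-specific setup performed. A minimal sketch of that dispatch is below; `blackwell_image` and `build_benchmark_arch_aware` are hypothetical names, the image tag simply mirrors `image` from `modal_train_runtime_cfg.py`, and keying the choice on `ModalGPU.B200` is an assumption about how one might do it, not part of the patch:

```python
import modal
import modal.experimental

from modal_train_runtime_cfg import (
    ModalClusterConfig,
    ModalGPU,
    app,
    image,
    run_benchmark,
)

# hypothetical second image for Blackwell-class GPUs; in practice this is where a
# different CUDA base tag or extra packages would go (the tag here just mirrors `image`)
blackwell_image = modal.Image.from_registry(
    "nvidia/cuda:12.9.1-devel-ubuntu24.04", add_python="3.12"
).apt_install("libibverbs-dev", "libibverbs1")


def build_benchmark_arch_aware(cfg: ModalClusterConfig):
    # select the image (or other hardware-specific options) from the runtime config
    selected_image = blackwell_image if cfg.gpu_type == ModalGPU.B200 else image
    return app.function(
        gpu=cfg.gpu_str(),
        image=selected_image,
    )(modal.experimental.clustered(size=cfg.num_nodes, rdma=True)(run_benchmark))
```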