diff --git a/.gitignore b/.gitignore
index 15201ac..8569e5b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+pyproject.toml
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/benchmark/README.md b/benchmark/README.md
index ecabcad..eaa0d70 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -53,4 +53,14 @@ Note that the image build will take ~15 minutes, but it will be cached permanent
 ```bash
 modal run modal_train_efa.py::run_benchmark_efa
-```
\ No newline at end of file
+```
+
+## Runtime Cluster Configuration
+
+This directory also contains an example of configuring the cluster definition at runtime in [modal_train_runtime_cfg.py](./modal_train_runtime_cfg.py). To run the NCCL bandwidth benchmark on a 2-node 8xB200 cluster:
+
+```bash
+python modal_train_runtime_cfg.py 2 8 --gpu-type B200
+```
+
+The `--gpu-type` parameter can be any of `H100`, `H200`, or `B200`.
diff --git a/benchmark/modal_train_runtime_cfg.py b/benchmark/modal_train_runtime_cfg.py
new file mode 100644
index 0000000..6e39b94
--- /dev/null
+++ b/benchmark/modal_train_runtime_cfg.py
@@ -0,0 +1,121 @@
import argparse
import dataclasses
import enum
import os
from typing import Union

import modal
import modal.experimental

cuda_version = "12.9.1"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu24.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

LOCAL_CODE_DIR = os.path.dirname(os.path.abspath(__file__))
REMOTE_CODE_DIR = "/root/"
REMOTE_BENCH_SCRIPT_PATH = "/root/train.py"

# NOTE(review): run_benchmark() passes these constants to torchrun even when
# the CLI requests a different topology via ModalClusterConfig — confirm they
# should instead track the runtime configuration.
N_NODES = 2
N_PROC_PER_NODE = 8

# CUDA devel image with the ibverbs userspace libraries (needed for RDMA)
# plus a pinned torch build; the local benchmark code is mounted into /root/.
image = (
    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.12")
    .apt_install(
        "libibverbs-dev",
        "libibverbs1",
    )
    .uv_pip_install(
        "torch==2.9.1", "numpy", "importlib-metadata", "nvidia-cudnn-cu12>=9.0.15"
    )
    .add_local_dir(
        LOCAL_CODE_DIR,
        remote_path=REMOTE_CODE_DIR,
    )
)

app = modal.App("multinode-benchmark")
+class ModalGPU(enum.StrEnum): + H100 = "H100" + H200 = "H200" + B200 = "B200" + + +@dataclasses.dataclass +class ModalClusterConfig: + num_nodes: int + gpus_per_node: int + gpu_type: Union[str, ModalGPU] = ModalGPU.H100 + + def __post_init__(self): + if isinstance(self.gpu_type, str): + try: + self.gpu_type = ModalGPU(self.gpu_type) + except ValueError: + valid_gpu_types = ", ".join([f"'{g.value}'" for g in ModalGPU]) + raise ValueError( + f"Invalid GPU type '{self.gpu_type}'. Must be one of: {valid_gpu_types}" + ) + + def gpu_str(self): + return f"{self.gpu_type}:{self.gpus_per_node}" + + +def run_benchmark(): + """Run a simple benchmark script that passes around a tensor of size 500000x2000.""" + + from torch.distributed.run import parse_args, run + + cluster_info = modal.experimental.get_cluster_info() + # which container am I? + container_rank: int = cluster_info.rank + # what's the leader/master/main container's address? + main_ip_addr: str = cluster_info.container_ips[0] + container_id = os.environ["MODAL_TASK_ID"] + + print(f"hello from {container_id}, rank {container_rank} of {N_NODES}") + if container_rank == 0: + print(f"main container's address: {main_ip_addr}") + + args = [ + f"--nnodes={N_NODES}", + f"--nproc-per-node={N_PROC_PER_NODE}", + f"--node-rank={cluster_info.rank}", + f"--master-addr={main_ip_addr}", + REMOTE_BENCH_SCRIPT_PATH, + ] + print(f"Running torchrun with args: {' '.join(args)}") + run(parse_args(args)) + + +def build_benchmark(cfg: ModalClusterConfig): + # additionally, could assign a different image build for hopper vs. 
blackwell + # or perform other hardware-specific setup/configuration as needed + + wrapped_runner = app.function( + gpu=cfg.gpu_str(), + image=image, + )(modal.experimental.clustered(size=cfg.num_nodes, rdma=True)(run_benchmark)) + + return wrapped_runner + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run multinode benchmark") + parser.add_argument("num_nodes", type=int, help="Number of nodes in the cluster") + parser.add_argument("gpus_per_node", type=int, help="Number of GPUs per node") + parser.add_argument("--gpu-type", type=str, default=None, help="GPU type to use") + + args = parser.parse_args() + + gpu = ModalGPU(args.gpu_type) if args.gpu_type is not None else ModalGPU("H100") + cluster_config = ModalClusterConfig( + num_nodes=args.num_nodes, gpus_per_node=args.gpus_per_node, gpu_type=gpu + ) + run_benchmark = build_benchmark(cluster_config) + + with modal.enable_output(): + with app.run(detach=True): + run_benchmark.remote()