Merged (40 commits)
- 416de68: add small llm pretraining (suachong, Aug 15, 2025)
- 9c98fa6: Update pretrain_llama31.py (suachong, Aug 15, 2025)
- 2159c30: Merge remote-tracking branch 'mlcommons/master' (suachong, Aug 16, 2025)
- 08d765e: update README instruction, minor change to callback and set default t… (suachong, Aug 16, 2025)
- 7b6544d: set LR for GB32 (suachong, Aug 18, 2025)
- 2b66faa: update README with download instructions from https://training.mlcomm… (suachong, Aug 20, 2025)
- bf7b2a7: merge mlcommons/master (suachong, Aug 20, 2025)
- fed1bb4: update link to https://github.com/mlcommons/r2-downloader (suachong, Aug 20, 2025)
- 7f8f3ec: Revert "update link to https://github.com/mlcommons/r2-downloader" (suachong, Aug 20, 2025)
- bb2d788: merge mlcommons/master (suachong, Sep 29, 2025)
- 9f3b94c: Merge remote-tracking branch 'upstream/master' (suachong, Jan 19, 2026)
- c5bff49: initial commit for gpt-oss-20b (suachong, Jan 19, 2026)
- b183204: update README (suachong, Jan 19, 2026)
- a1a6356: update NV dockerfile and readme (suachong, Jan 19, 2026)
- 77eaf43: set random seed (suachong, Jan 20, 2026)
- aa7cbd7: update license for amd + clean up rocm dockerfile (suachong, Jan 20, 2026)
- c110d5d: revisit the target log perplexity after establishing rcp (suachong, Jan 20, 2026)
- f856c7f: remove target metric (suachong, Jan 20, 2026)
- 83bcba0: remove HF_TOKEN (suachong, Feb 2, 2026)
- 4129037: Update run_and_time.sh (mmarcinkiewicz, Feb 3, 2026)
- 71c75b8: Create run.sub (mmarcinkiewicz, Feb 3, 2026)
- 97dbfe3: Update run_and_time.sh (mmarcinkiewicz, Feb 3, 2026)
- 97e2a50: Update README.md (mmarcinkiewicz, Feb 3, 2026)
- 14ccfe7: Update run.sub (mmarcinkiewicz, Feb 3, 2026)
- c5bf7cb: Update run.sub (mmarcinkiewicz, Feb 3, 2026)
- de38b48: Update run.sub (mmarcinkiewicz, Feb 3, 2026)
- 397ce2b: Update Dockerfile.nvidia (mmarcinkiewicz, Feb 3, 2026)
- c0ff227: Update run.sub (mmarcinkiewicz, Feb 3, 2026)
- ea6e87e: update configs to match 8b, addressed pr comments (suachong, Feb 4, 2026)
- b448b8c: Merge branch 'suachong:master' into master (mmarcinkiewicz, Feb 4, 2026)
- db6a5df: Merge pull request #2 from mmarcinkiewicz/master (suachong, Feb 4, 2026)
- 18bdd3a: expose adam_eps with env var (suachong, Feb 4, 2026)
- 1c7d631: Merge branch 'master' of https://github.com/suachong/training (suachong, Feb 4, 2026)
- 78a267f: update more configs based on hf + nvidia configs (suachong, Feb 10, 2026)
- 9f6e611: remove yarn patch and add primus evaluator patch (suachong, Feb 12, 2026)
- 381f4ff: update megatron validation consumed samples (suachong, Feb 12, 2026)
- da9fab4: remove utilities, not needed (suachong, Feb 20, 2026)
- a2e9e22: set master port to 29500, consistent with src/train.py (suachong, Feb 20, 2026)
- 7ee22e1: remove hardcoded company name (suachong, Feb 20, 2026)
- e8f1003: Revise evaluation metrics and training parameters in README (ShriyaRishab, Feb 20, 2026)
4 changes: 4 additions & 0 deletions .gitignore
@@ -3,3 +3,7 @@ __pycache__/
*.py[cod]
*$py.class
single_stage_detector/mlcube/workspace/*

# Dev folder
dev/
output/
48 changes: 48 additions & 0 deletions gpt-oss-20b/primus/Dockerfile
@@ -0,0 +1,48 @@
#
# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

ARG BASE_IMAGE=docker.io/rocm/primus:v25.11
FROM ${BASE_IMAGE}

WORKDIR /workspace/code/patches
COPY patches/primus_evaluator.patch .
COPY patches/megatron_validation_consumed_samples.patch .

WORKDIR /workspace/deps
RUN git clone --recursive https://github.com/AMD-AIG-AIMA/Primus.git && \
cd Primus && \
git checkout main && \
git submodule update --init --recursive && \
pip install -r requirements.txt && \
git apply /workspace/code/patches/primus_evaluator.patch

RUN cd /workspace/deps/Primus/third_party/Megatron-LM && \
git apply /workspace/code/patches/megatron_validation_consumed_samples.patch && \
pip install -e . --no-deps

WORKDIR /workspace/code

COPY . .

# Install primus-mllog from the local wheel
RUN pip install primus_mllog-0.1.0-py3-none-any.whl
39 changes: 39 additions & 0 deletions gpt-oss-20b/primus/Dockerfile.nvidia
@@ -0,0 +1,39 @@
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.12-py3
FROM ${BASE_IMAGE}

WORKDIR /workspace

RUN pip install --no-cache-dir \
pyyaml \
pybind11 \
ninja \
packaging \
transformers

WORKDIR /workspace/code/patches
COPY patches/primus_evaluator.patch .
COPY patches/megatron_validation_consumed_samples.patch .

WORKDIR /workspace/deps
RUN git clone --recursive https://github.com/AMD-AIG-AIMA/Primus.git && \
cd Primus && \
git checkout main && \
git submodule update --init --recursive && \
pip install -r requirements.txt && \
git apply /workspace/code/patches/primus_evaluator.patch

RUN cd /workspace/deps/Primus/third_party/Megatron-LM && \
git apply /workspace/code/patches/megatron_validation_consumed_samples.patch && \
pip install -e . --no-deps

ENV PYTHONPATH="/workspace/deps/Primus:/workspace/deps/Primus/third_party/Megatron-LM"

WORKDIR /workspace/code

COPY . .

# Install primus-mllog from local wheel
RUN pip install primus_mllog-0.1.0-py3-none-any.whl

RUN pip install --no-build-isolation git+https://github.com/fanshiqing/grouped_gemm@v1.1.4
RUN pip install --no-build-isolation git+https://github.com/NVIDIA/mlperf-common.git@b86d175a05849d650a8ff69c1e2c37b9f4e61d51
152 changes: 152 additions & 0 deletions gpt-oss-20b/primus/README.md
@@ -0,0 +1,152 @@
# GPT-OSS-20B Pretraining Benchmark

GPT-OSS 20B (Mixture of Experts)

## Overview

This benchmark trains a 20B parameter GPT model with Mixture of Experts (MoE) architecture using the Primus framework on AMD and NVIDIA GPUs.

# 1. Set Up the Docker Image

Run the following build command from this directory. The build process takes a while to complete.

```bash
# From gpt-oss-20b/primus directory
docker build -t rocm/amd-mlperf:gpt_oss_20b_training_5.1 .
```

# 2. Prepare Dataset

The current codebase uses the c4/en/3.0.1 dataset from [HuggingFace/AllenAI](https://huggingface.co/datasets/allenai/c4) for training and evaluation.

## Download Preprocessed Data

The pre-tokenized dataset is available for download. Navigate to your desired download directory and run the following commands:

```bash
# Move into your download directory (create it beforehand with write permissions)
cd /data/gpt_oss_20b

# Download training and validation data
bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) \
-d data https://training.mlcommons-storage.org/metadata/llama-3-1-8b-preprocessed-c4-dataset.uri
```

After download, you should see files with the following naming conventions:
- Training: `c4-train.en_6_text_document.bin` and `.idx`
- Validation: `c4-validation-91205-samples.en_text_document.bin` and `.idx`

The data directory is approximately **80 GB**.
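
As a quick sanity check after the download, a short script can confirm that each expected `.bin` file has its matching `.idx` companion. The filename stems below are the ones listed above; the script itself is a convenience sketch, not part of the benchmark:

```python
from pathlib import Path

# Filename stems expected after the download step above.
REQUIRED_STEMS = [
    "c4-train.en_6_text_document",
    "c4-validation-91205-samples.en_text_document",
]

def missing_dataset_files(data_dir: str) -> list:
    """Return the list of expected dataset files that are absent from data_dir."""
    root = Path(data_dir)
    missing = []
    for stem in REQUIRED_STEMS:
        for ext in (".bin", ".idx"):
            if not (root / (stem + ext)).exists():
                missing.append(stem + ext)
    return missing

if __name__ == "__main__":
    import sys
    missing = missing_dataset_files(sys.argv[1] if len(sys.argv) > 1 else ".")
    print("OK" if not missing else f"Missing: {missing}")
```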

# 3. Run Training

## Set Environment Variables

Set the directories for data, model, and results. Ensure you have write access to `$LOGDIR`.

```bash
export DATADIR=/data/gpt_oss_20b/data
export MODELDIR=/data/gpt_oss_20b/model
export LOGDIR=/data/gpt_oss_20b/results
export CONT=rocm/amd-mlperf:gpt_oss_20b_training_5.1

# Create results directory
mkdir -p $LOGDIR
sudo chmod -R 777 $LOGDIR
```

## Set Configuration

Source the configuration file with system-specific hyperparameters that matches your hardware:

| Config File | System | GPUs |
|-------------|--------|------|
| `config_MI355X_1x8x1.sh` | MI355X | 1 node × 8 GPUs |
| `config_B200_1x8x1.sh` | B200 | 1 node × 8 GPUs |

```bash
source config_MI355X_1x8x1.sh
```

## Launch Training

### Docker
#### Single Run

```bash
export NEXP=1
bash run_with_docker.sh
```

#### Multiple Runs (for submission)

```bash
export NEXP=10
bash run_with_docker.sh
```

### SLURM

```bash
sbatch -A <account> -p <partition> -t <time_limit> run.sub
```

After completion, logs will be available under `$LOGDIR`.

# 4. Quality Metrics

## Target Loss

TBD

## Quality Metric

Validation loss (log perplexity)

## Evaluation Frequency

Evaluation every **12,288 samples** (768 iterations with GBS=16)

## Evaluation Thoroughness

We evaluate using **1024 samples** from the validation dataset.
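
The sample counts above convert to optimizer iterations by dividing by the global batch size from section 6 (a quick arithmetic check):

```python
GLOBAL_BATCH_SIZE = 16       # from the training configuration table
EVAL_EVERY_SAMPLES = 12_288  # evaluation frequency, in samples
EVAL_SAMPLES = 1_024         # samples used per evaluation

# Samples become iterations by dividing by the global batch size.
eval_interval_iters = EVAL_EVERY_SAMPLES // GLOBAL_BATCH_SIZE  # 768
eval_iters = EVAL_SAMPLES // GLOBAL_BATCH_SIZE                 # 64

print(eval_interval_iters, eval_iters)  # 768 64
```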

# 5. Model Architecture

| Parameter | Value |
|-----------|-------|
| Model Size | 20B parameters |
| Architecture | GPT with Mixture of Experts |
| Sequence Length | 8192 |
| Expert Parallelism | 8 |
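
Expert parallelism shards each MoE layer's experts across GPUs, so with EP=8 every GPU owns one eighth of the experts. A minimal sketch of the partitioning arithmetic (the 32-expert count in the example is hypothetical, not taken from this model's config):

```python
def experts_per_rank(num_experts: int, ep_size: int) -> int:
    """Number of experts each GPU owns under expert parallelism."""
    assert num_experts % ep_size == 0, "experts must divide evenly across EP ranks"
    return num_experts // ep_size

print(experts_per_rank(32, 8))  # hypothetical 32-expert model, EP=8
```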

# 6. Training Configuration

| Hyperparameter | Value |
|----------------|-------|
| Micro Batch Size | 2 |
| Global Batch Size | 16 |
| Learning Rate | 8e-4 |
| LR Schedule | Cosine decay with warmup |
| Weight Decay | 0.1 |
| Adam β1, β2, eps | 0.9, 0.95, 1e-5 |
| Max Training Iterations | 1,200,000 |
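
The LR schedule row can be illustrated with a minimal cosine-decay-with-warmup function. The warmup length, decay horizon, and minimum LR below are illustrative placeholders, not values from this benchmark's config; only the peak LR of 8e-4 comes from the table:

```python
import math

def lr_at(step, max_lr=8e-4, min_lr=0.0, warmup_steps=100, decay_steps=10_000):
    """Linear warmup to max_lr, then cosine decay to min_lr (placeholder horizons)."""
    if step < warmup_steps:
        return max_lr * step / warmup_steps
    t = min((step - warmup_steps) / (decay_steps - warmup_steps), 1.0)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * t))

print(lr_at(0), lr_at(100), lr_at(10_000))
```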

# 7. Directory Structure

```
gpt-oss-20b/primus/
├── conf/ # Configuration files
│ └── gpt_oss_20B-pretrain.yaml
├── src/ # Training source code
│ └── train.py
├── config_MI355X_1x8x1.sh # System configuration (MI355 - AMD)
├── config_B200_1x8x1.sh # System configuration (B200 - NVIDIA)
├── Dockerfile # Dockerfile (MI355 - AMD)
├── Dockerfile.nvidia # Dockerfile (B200 - NVIDIA)
└── requirements.txt # Python dependencies (includes primus-mllog)
```
# 8. Approximate Runtime

TBD