diff --git a/.gitignore b/.gitignore index dc314947c..240309f05 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,7 @@ __pycache__/ *.py[cod] *$py.class single_stage_detector/mlcube/workspace/* + +# Dev folder +dev/ +output/ \ No newline at end of file diff --git a/gpt-oss-20b/primus/Dockerfile b/gpt-oss-20b/primus/Dockerfile new file mode 100644 index 000000000..4902a54b6 --- /dev/null +++ b/gpt-oss-20b/primus/Dockerfile @@ -0,0 +1,48 @@ +# +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +ARG BASE_IMAGE=docker.io/rocm/primus:v25.11 +FROM ${BASE_IMAGE} + +WORKDIR /workspace/code/patches +COPY patches/primus_evaluator.patch . +COPY patches/megatron_validation_consumed_samples.patch . 
+ +WORKDIR /workspace/deps +RUN git clone --recursive https://github.com/AMD-AIG-AIMA/Primus.git && \ + cd Primus && \ + git checkout main && \ + git submodule update --init --recursive && \ + pip install -r requirements.txt && \ + git apply /workspace/code/patches/primus_evaluator.patch + +RUN cd /workspace/deps/Primus/third_party/Megatron-LM && \ + git apply /workspace/code/patches/megatron_validation_consumed_samples.patch && \ + pip install -e . --no-deps + +WORKDIR /workspace/code + +COPY . . + +RUN pip install primus_mllog-0.1.0-py3-none-any.whl \ No newline at end of file diff --git a/gpt-oss-20b/primus/Dockerfile.nvidia b/gpt-oss-20b/primus/Dockerfile.nvidia new file mode 100644 index 000000000..271300833 --- /dev/null +++ b/gpt-oss-20b/primus/Dockerfile.nvidia @@ -0,0 +1,39 @@ +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.12-py3 +FROM ${BASE_IMAGE} + +WORKDIR /workspace + +RUN pip install --no-cache-dir \ + pyyaml \ + pybind11 \ + ninja \ + packaging \ + transformers + +WORKDIR /workspace/code/patches +COPY patches/primus_evaluator.patch . +COPY patches/megatron_validation_consumed_samples.patch . + +WORKDIR /workspace/deps +RUN git clone --recursive https://github.com/AMD-AIG-AIMA/Primus.git && \ + cd Primus && \ + git checkout main && \ + git submodule update --init --recursive && \ + pip install -r requirements.txt && \ + git apply /workspace/code/patches/primus_evaluator.patch + +RUN cd /workspace/deps/Primus/third_party/Megatron-LM && \ + git apply /workspace/code/patches/megatron_validation_consumed_samples.patch && \ + pip install -e . --no-deps + +ENV PYTHONPATH="/workspace/deps/Primus:/workspace/deps/Primus/third_party/Megatron-LM" + +WORKDIR /workspace/code + +COPY . . 
+ +# Install primus-mllog from local wheel +RUN pip install primus_mllog-0.1.0-py3-none-any.whl + +RUN pip install --no-build-isolation git+https://github.com/fanshiqing/grouped_gemm@v1.1.4 +RUN pip install --no-build-isolation git+https://github.com/NVIDIA/mlperf-common.git@b86d175a05849d650a8ff69c1e2c37b9f4e61d51 diff --git a/gpt-oss-20b/primus/README.md b/gpt-oss-20b/primus/README.md new file mode 100644 index 000000000..9a97ae9e1 --- /dev/null +++ b/gpt-oss-20b/primus/README.md @@ -0,0 +1,152 @@ +# GPT-OSS-20B Pretraining Benchmark + +GPT-OSS 20B (Mixture of Experts) + +## Overview + +This benchmark trains a 20B parameter GPT model with Mixture of Experts (MoE) architecture using the Primus framework on AMD and NVIDIA GPUs. + +# 1. Setup Docker Image + + +Run the following build command from this directory. The build process will take a while to complete. + +```bash +# From gpt-oss-20b/primus directory +docker build -t rocm/amd-mlperf:gpt_oss_20b_training_5.1 . +``` + +# 2. Prepare Dataset + +The current codebase uses the c4/en/3.0.1 dataset from [HuggingFace/AllenAI](https://huggingface.co/datasets/allenai/c4) for training and evaluation. + +## Download Preprocessed Data + +The pre-tokenized dataset is available for download. Navigate to your desired download directory and run the following commands: + +```bash +# Create desired download directory with the right permission +cd /data/gpt_oss_20b + +# Download training and validation data +bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) \ + -d data https://training.mlcommons-storage.org/metadata/llama-3-1-8b-preprocessed-c4-dataset.uri +``` + +After download, you should see files with the following naming conventions: +- Training: `c4-train.en_6_text_document.bin` and `.idx` +- Validation: `c4-validation-91205-samples.en_text_document.bin` and `.idx` + +The data directory is approximately **80 GB**. + +# 3. 
Run Training + +## Set Environment Variables + +Set the directory for data and results. Ensure `$LOGDIR` has write access. + +```bash +export DATADIR=/data/gpt_oss_20b/data +export MODELDIR=/data/gpt_oss_20b/model +export LOGDIR=/data/gpt_oss_20b/results +export CONT=rocm/amd-mlperf:gpt_oss_20b_training_5.1 + +# Create results directory +mkdir -p $LOGDIR +sudo chmod -R 777 $LOGDIR +``` + +## Set Configuration + +Set appropriate configuration and system-specific hyperparameters based on hardware type: + +| Config File | System | GPUs | +|-------------|--------|------| +| `config_MI355X_1x8x1.sh` | MI355X | 1 node × 8 GPUs | +| `config_B200_1x8x1.sh` | B200 | 1 node × 8 GPUs | + +```bash +source config_MI355X_1x8x1.sh +``` + +## Launch Training + +### Docker +#### Single Run + +```bash +export NEXP=1 +bash run_with_docker.sh +``` + +#### Multiple Runs (for submission) + +```bash +export NEXP=10 +bash run_with_docker.sh +``` + +### SLURM + +```bash +sbatch -A <account> -p <partition> -t <time_limit> run.sub +``` + +After completion, logs will be available under `$LOGDIR`. + +# 4. Quality Metrics + +## Target loss + +TBD + +## Quality Metric + +Validation loss (log perplexity) + +## Evaluation Frequency + +Evaluation every **12,288 samples** (768 iterations with GBS=16) + +## Evaluation Thoroughness + +We evaluate using **1024 samples** from the validation dataset. + +# 5. Model Architecture + +| Parameter | Value | +|-----------|-------| +| Model Size | 20B parameters | +| Architecture | GPT with Mixture of Experts | +| Sequence Length | 8192 | +| Expert Parallelism | 8 | + +# 6. Training Configuration + +| Hyperparameter | Value | +|----------------|-------| +| Micro Batch Size | 2 | +| Global Batch Size | 16 | +| Learning Rate | 4e-4 | +| LR Schedule | Cosine decay with warmup | +| Weight Decay | 0.1 | +| Adam β1, β2, eps | 0.9, 0.95, 1e-5 | +| Max Training Iterations | 1,200,000 | + +# 7. 
Directory Structure + +``` +gpt-oss-20b/primus/ +├── conf/ # Configuration files +│ └── gpt_oss_20B-pretrain.yaml +├── src/ # Training source code +│ └── train.py +├── config_MI355X_1x8x1.sh # System configuration (MI355 - AMD) +├── config_B200_1x8x1.sh # System configuration (B200 - NVIDIA) +├── Dockerfile # Dockerfile (MI355 - AMD) +├── Dockerfile.nvidia # Dockerfile (B200 - NVIDIA) +└── requirements.txt # Python dependencies (grouped_gemm; primus-mllog is installed from the bundled wheel) +``` +# 8. Approximate runtime + +TBD diff --git a/gpt-oss-20b/primus/conf/gpt_oss_20B-pretrain-nvidia.yaml b/gpt-oss-20b/primus/conf/gpt_oss_20B-pretrain-nvidia.yaml new file mode 100644 index 000000000..ba8ac9325 --- /dev/null +++ b/gpt-oss-20b/primus/conf/gpt_oss_20B-pretrain-nvidia.yaml @@ -0,0 +1,175 @@ +work_group: ${TEAM:nvidia} +user_name: ${USER:root} +exp_name: ${EXP_NAME:gpt_oss_20b_nvidia} +workspace: ./output + +modules: + pre_trainer: + framework: megatron + config: pre_trainer.yaml + + # model to run + model: ${PRIMUS_MODEL:gpt_oss_20B}.yaml + overrides: + + # tokenizer + tokenizer_type: Llama3Tokenizer + tokenizer_model: ${MODEL:meta-llama/Llama-3.1-8B} + + # model + num_layers: 24 + hidden_size: 2880 + ffn_hidden_size: 2880 + num_attention_heads: 64 + num_query_groups: 8 + num_experts: 32 + activation_func: swiglu + + # rotary + position_embedding_type: rope + rotary_base: 150000 + + # mixed-precision + attention_softmax_in_fp32: false + + # log + wandb_project: "Primus_GPT_OSS_20B_NVIDIA" + stderr_sink_level: DEBUG + log_interval: 99999999 # Suppress console logs + + # profile + profile: false + use_pytorch_profiler: false + profile_step_end: 7 + profile_step_start: 6 + + # precision (mixed precision training) + # Using bf16 for B200 + bf16: true + fp16: false + fp8: null # Disabled - using bf16 instead + + # hyper parameters + train_iters: ${PRIMUS_TRAIN_ITERS:1200000} + micro_batch_size: ${PRIMUS_MICRO_BATCH_SIZE:2} + global_batch_size: ${PRIMUS_GLOBAL_BATCH_SIZE:16} + seq_length: 
${PRIMUS_SEQ_LENGTH:8192} + max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:131072} + seed: ${SEED:1234} # Random seed for reproducibility + lr: ${PRIMUS_LR:4.0e-4} # Reduced from 8e-4 for FP8 stability + min_lr: ${PRIMUS_MIN_LR:4.0e-5} # Set to 10% of max LR + lr_warmup_iters: ${PRIMUS_LR_WARMUP_ITERS:128} + lr_decay_iters: ${PRIMUS_LR_DECAY_ITERS:1199872} + lr_decay_style: cosine + weight_decay: 0.1 + optimizer: adam + use_distributed_optimizer: true # use distributed optimizer + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: ${PRIMUS_ADAM_EPS:1.0e-5} + eod_mask_loss: true + init_method_std: 0.008 + norm_epsilon: 1.0e-6 + layernorm_epsilon: 1e-05 + + # Dropout (disabled for training) + hidden_dropout: 0.0 + attention_dropout: 0.0 + + # parallel + tensor_model_parallel_size: ${PRIMUS_TP:1} + pipeline_model_parallel_size: ${PRIMUS_PP:1} + expert_model_parallel_size: ${PRIMUS_EP:8} + overlap_grad_reduce: true + overlap_param_gather: true + + # data + mock_data: false + train_data_path: "10 /data/c4-train.en_6_text_document" + valid_data_path: "/data/c4-validation-91205-samples.en_text_document" + test_data_path: "/data/c4-validation-91205-samples.en_text_document" + + # fusion (standard Megatron optimizations) + moe_permute_fusion: false + gradient_accumulation_fusion: false + moe_use_legacy_grouped_gemm: false + moe_use_fused_router_with_aux_score: false + multi_latent_attention: false + apply_rope_fusion: false + + # sliding window attention (matches HF sliding_window: 128) + # Pattern: alternating sliding_attention (1) and full_attention (0) for 24 layers + # Matches HF layer_types: [sliding_attention, full_attention, ...] 
x 12 + # window_size must be a tuple (left_window, right_window) for Transformer Engine + # For causal attention: left = past tokens, right = 0 (no future tokens) + # HF sliding_window: 128 means 128 past tokens, so use (128, 0) + window_size: [128, 0] # Left window: 128 past tokens, Right: 0 (causal) + window_attn_skip_freq: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] + + # MoE settings + moe_apply_probs_on_input: false + moe_aux_loss_coeff: 0.0 #0.9 + moe_deepep_num_sms: 20 + moe_enable_deepep: false + moe_expert_capacity_factor: null + moe_extended_tp: false + moe_ffn_hidden_size: 2880 + moe_flex_dispatcher_backend: deepep + moe_grouped_gemm: false #true + moe_hybridep_num_sms: 16 + moe_input_jitter_eps: null + moe_latent_size: null + moe_layer_freq: 1 + moe_layer_recompute: false + moe_pad_expert_input_to_capacity: false + moe_per_layer_logging: false + moe_router_bias_update_rate: 0.001 + moe_router_dtype: null + moe_router_enable_expert_bias: false + moe_router_force_load_balancing: false + moe_router_fusion: false + moe_router_group_topk: null + moe_router_load_balancing_type: none + moe_router_num_groups: null + moe_router_padding_for_fp8: false + moe_router_padding_for_quantization: false + moe_router_pre_softmax: false + moe_router_score_function: softmax + moe_router_topk: 4 + moe_router_topk_limited_devices: null + moe_router_topk_scaling_factor: null + moe_shared_expert_gate: false + moe_shared_expert_intermediate_size: null + moe_shared_expert_overlap: false + moe_token_dispatcher_type: alltoall + moe_token_drop_policy: probs + moe_token_dropping: false + moe_z_loss_coeff: null + + # ckpt + finetune: false + auto_continue_train: false + load: null + no_load_optim: null + no_load_rng: null + save: null + save_interval: 100000 + no_save_optim: null + no_save_rng: null + disable_last_saving: true + exit_on_missing_checkpoint: false + ckpt_format: torch + eval_iters: 64 # 64 iters × 2 MBS × 8 GPUs = 1024 eval samples + 
eval_interval: ${PRIMUS_EVAL_INTERVAL:768} + + # Turbo features disabled for NVIDIA + enable_primus_turbo: false + use_turbo_attention: false + use_turbo_grouped_mlp: false + + use_turbo_deepep: false + turbo_deepep_num_cu: 0 + turbo_deepep_use_comm_stream: false + + turbo_sync_free_moe_stage: 0 + diff --git a/gpt-oss-20b/primus/conf/gpt_oss_20B-pretrain.yaml b/gpt-oss-20b/primus/conf/gpt_oss_20B-pretrain.yaml new file mode 100644 index 000000000..aa5fa8198 --- /dev/null +++ b/gpt-oss-20b/primus/conf/gpt_oss_20B-pretrain.yaml @@ -0,0 +1,190 @@ +work_group: ${TEAM:amd} +user_name: ${USER:root} +exp_name: ${EXP_NAME:gpt_oss_20b} +workspace: ./output + +modules: + pre_trainer: + framework: megatron + config: pre_trainer.yaml + + # model to run + model: ${PRIMUS_MODEL:gpt_oss_20B}.yaml + overrides: + + # tokenizer + tokenizer_type: Llama3Tokenizer + tokenizer_model: ${MODEL:meta-llama/Llama-3.1-8B} + + # model + num_layers: 24 + hidden_size: 2880 + ffn_hidden_size: 2880 + num_attention_heads: 64 + num_query_groups: 8 + num_experts: 32 + activation_func: swiglu + + # rotary + position_embedding_type: rope + rotary_base: 150000 + + # mixed-precision + attention_softmax_in_fp32: false + + # log + wandb_project: "Primus_GPT_OSS_20B" + stderr_sink_level: DEBUG + log_interval: 99999999 # Suppress console logs + + # profile + profile: false + use_pytorch_profiler: false + profile_step_end: 7 + profile_step_start: 6 + + # precision (mixed precision training) + bf16: true + fp16: false + fp8: null # Set to "e4m3" or "hybrid" for FP8 training + + # hyper parameters + train_iters: ${PRIMUS_TRAIN_ITERS:1200000} + micro_batch_size: ${PRIMUS_MICRO_BATCH_SIZE:2} + global_batch_size: ${PRIMUS_GLOBAL_BATCH_SIZE:16} + seq_length: ${PRIMUS_SEQ_LENGTH:8192} + max_position_embeddings: ${PRIMUS_MAX_POSITION_EMBEDDINGS:131072} + seed: ${SEED:1234} # Random seed for reproducibility + lr: ${PRIMUS_LR:4.0e-4} # Reduced from 8e-4 for FP8 stability + min_lr: ${PRIMUS_MIN_LR:4.0e-5} # Set to 10% 
of max LR + lr_warmup_iters: ${PRIMUS_LR_WARMUP_ITERS:128} + lr_decay_iters: ${PRIMUS_LR_DECAY_ITERS:1199872} + lr_decay_style: cosine + weight_decay: 0.1 + optimizer: adam + use_distributed_optimizer: true # use distributed optimizer + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: ${PRIMUS_ADAM_EPS:1.0e-5} + eod_mask_loss: true + init_method_std: 0.008 + norm_epsilon: 1.0e-6 + layernorm_epsilon: 1e-05 + + # Dropout (disabled for training) + hidden_dropout: 0.0 + attention_dropout: 0.0 + + # parallel + tensor_model_parallel_size: ${PRIMUS_TP:1} + pipeline_model_parallel_size: ${PRIMUS_PP:1} + expert_model_parallel_size: ${PRIMUS_EP:8} + overlap_grad_reduce: true + overlap_param_gather: true + + # data + mock_data: false + train_data_path: "10 /data/c4-train.en_6_text_document" + valid_data_path: "/data/c4-validation-91205-samples.en_text_document" + test_data_path: "/data/c4-validation-91205-samples.en_text_document" + + # fusion + # 20250321: need latest megatron docker image + moe_permute_fusion: false + # fused wgrad gemm and accumulation + gradient_accumulation_fusion: false + # recommend set `false` in fp8 + moe_use_legacy_grouped_gemm: true + # fused topk router with aux score + moe_use_fused_router_with_aux_score: false + # MLA + multi_latent_attention: false + # rope fusion + apply_rope_fusion: false + + # sliding window attention (matches HF sliding_window: 128) + # Pattern: alternating sliding_attention (1) and full_attention (0) for 24 layers + # Matches HF layer_types: [sliding_attention, full_attention, ...] 
x 12 + # window_size must be a tuple (left_window, right_window) for Transformer Engine + # For causal attention: left = past tokens, right = 0 (no future tokens) + # HF sliding_window: 128 means 128 past tokens, so use (128, 0) + window_size: [128, 0] # Left window: 128 past tokens, Right: 0 (causal) + window_attn_skip_freq: [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] + + # MoE settings + moe_apply_probs_on_input: false + moe_aux_loss_coeff: 0.0 #0.9 + moe_deepep_num_sms: 20 + moe_enable_deepep: false + moe_expert_capacity_factor: null + moe_extended_tp: false + moe_ffn_hidden_size: 2880 + moe_flex_dispatcher_backend: deepep + moe_grouped_gemm: false #true + moe_hybridep_num_sms: 16 + moe_input_jitter_eps: null + moe_latent_size: null + moe_layer_freq: 1 + moe_layer_recompute: false + moe_pad_expert_input_to_capacity: false + moe_per_layer_logging: false + moe_router_bias_update_rate: 0.001 + moe_router_dtype: fp32 # DeepEP only supports float32 probs + moe_router_enable_expert_bias: false + moe_router_force_load_balancing: false + moe_router_fusion: false + moe_router_group_topk: null + moe_router_load_balancing_type: none + moe_router_num_groups: null + moe_router_padding_for_fp8: false + moe_router_padding_for_quantization: false + moe_router_pre_softmax: false + moe_router_score_function: softmax + moe_router_topk: 4 + moe_router_topk_limited_devices: null + moe_router_topk_scaling_factor: null + moe_shared_expert_gate: false + moe_shared_expert_intermediate_size: null + moe_shared_expert_overlap: false + moe_token_dispatcher_type: alltoall + moe_token_drop_policy: probs + moe_token_dropping: false + moe_z_loss_coeff: null + + # ckpt + finetune: false + auto_continue_train: false + load: null + no_load_optim: null + no_load_rng: null + save: null + save_interval: 100000 + no_save_optim: null + no_save_rng: null + disable_last_saving: true + exit_on_missing_checkpoint: false + ckpt_format: torch + eval_iters: 64 # 64 iters × 2 MBS × 
8 GPUs = 1024 eval samples + eval_interval: ${PRIMUS_EVAL_INTERVAL:768} + + # Turbo + enable_primus_turbo: true + use_turbo_attention: false + use_turbo_grouped_mlp: true + + # deepep + use_turbo_deepep: true + + # 64 or 80 for ep8, 32 for ep16-64 is best practice + turbo_deepep_num_cu: 64 + turbo_deepep_use_comm_stream: false + + # sync-free moe support stage 0-3, 0 means not use sync-free moe + # stage 3 is completely no gpu-cpu sync in MoE, but cost more memory + # stage 2 is recommended for better performance + turbo_sync_free_moe_stage: 2 + + # Cross entropy flags + # cross_entropy_fusion_impl: "te" + # cross_entropy_loss_fusion: true + diff --git a/gpt-oss-20b/primus/config_B200_1x8x1.sh b/gpt-oss-20b/primus/config_B200_1x8x1.sh new file mode 100755 index 000000000..646457e42 --- /dev/null +++ b/gpt-oss-20b/primus/config_B200_1x8x1.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +export DGXSYSTEM=B200_1x8x1 +export GPUS_PER_NODE=8 +export NNODES=1 +export NODE_RANK=0 +export MASTER_ADDR=localhost +export MASTER_PORT=29501 + +export PRIMUS_PATH=/workspace/deps/Primus +export PYTHONPATH="${PRIMUS_PATH}:${PRIMUS_PATH}/third_party/Megatron-LM:${PYTHONPATH}" +export EXP=/workspace/code/conf/gpt_oss_20B-pretrain-nvidia.yaml +export DATA_PATH=/data +export MODEL=/model + +export PRIMUS_MICRO_BATCH_SIZE=2 +export PRIMUS_GLOBAL_BATCH_SIZE=16 +export PRIMUS_LR=4.0e-4 +export PRIMUS_MIN_LR=4.0e-5 # Set to 10% of max LR +export PRIMUS_TRAIN_ITERS=1200000 # 1.2M iters × 16 GBS = 19.2M samples +export PRIMUS_LR_WARMUP_ITERS=128 +export PRIMUS_LR_DECAY_ITERS=$((PRIMUS_TRAIN_ITERS-PRIMUS_LR_WARMUP_ITERS)) + +# Evaluation frequency (sample-based, adjusts automatically with GBS) +export EVAL_SAMPLES_INTERVAL=12288 # Evaluate every 12,288 samples +export PRIMUS_EVAL_INTERVAL=$((EVAL_SAMPLES_INTERVAL / PRIMUS_GLOBAL_BATCH_SIZE)) # Auto-computed + +export PRIMUS_BF16=true +export PRIMUS_FP16=false +export PRIMUS_FP8=null + +export PRIMUS_TURBO_ENABLED=false +export USE_TURBO_ATTENTION=false 
+export USE_TURBO_GROUPED_MLP=false +export USE_TURBO_DEEPEP=false +export TURBO_DEEPEP_NUM_CU=0 +export TURBO_SYNC_FREE_MOE_STAGE=0 + +export PRIMUS_APPLY_ROPE_FUSION=false +export USE_ROCM_MEM_INFO=false + +export OVERLAP_GRAD_REDUCE=true +export OVERLAP_PARAM_GATHER=true +export GRADIENT_ACCUMULATION_FUSION=false + +export PRIMUS_TP=1 +export PRIMUS_PP=1 +export PRIMUS_EP=8 + +export ENABLE_MLLOG=1 +export MLLOG_OUTPUT_FILE=/results/mlperf_output.log +export MLLOG_TRAIN_LOSS_LOG_FREQ=32 +export MLLOG_TARGET_EVAL_LOSS=3.2 +export MLLOG_SUBMISSION_BENCHMARK=gpt-oss-20b +export MLLOG_SUBMISSION_DIVISION=closed +export MLLOG_SUBMISSION_ORG=NVIDIA +export MLLOG_SUBMISSION_PLATFORM=B200 + +export HF_TOKEN="${HF_TOKEN:-}" + diff --git a/gpt-oss-20b/primus/config_MI355X_1x8x1.sh b/gpt-oss-20b/primus/config_MI355X_1x8x1.sh new file mode 100755 index 000000000..7221039cf --- /dev/null +++ b/gpt-oss-20b/primus/config_MI355X_1x8x1.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ============================================================================= +# MLPerf GPT-OSS-20B Configuration for MI355X (1 node, 8 GPUs) +# ============================================================================= + +# ----------------------------------------------------------------------------- +# System Configuration +# ----------------------------------------------------------------------------- +export DGXSYSTEM=MI355X_1x8x1 +export GPUS_PER_NODE=8 +export NNODES=1 +export NODE_RANK=0 +export MASTER_ADDR=localhost +export MASTER_PORT=29501 + +# ----------------------------------------------------------------------------- +# Paths +# ----------------------------------------------------------------------------- +export PRIMUS_PATH=/workspace/deps/Primus +export PYTHONPATH="${PRIMUS_PATH}:${PRIMUS_PATH}/third_party/Megatron-LM:${PYTHONPATH}" +export EXP=/workspace/code/conf/gpt_oss_20B-pretrain.yaml +export DATA_PATH=/data +export MODEL=/model + +# ----------------------------------------------------------------------------- +# Training Hyperparameters +# ----------------------------------------------------------------------------- +export PRIMUS_MICRO_BATCH_SIZE=2 +export PRIMUS_GLOBAL_BATCH_SIZE=16 +export PRIMUS_LR=4.0e-4 +export PRIMUS_MIN_LR=4.0e-5 # Set to 10% of max LR +export PRIMUS_TRAIN_ITERS=1200000 # 1.2M iters × 16 GBS = 19.2M samples +export PRIMUS_LR_WARMUP_ITERS=128 +export PRIMUS_LR_DECAY_ITERS=$((PRIMUS_TRAIN_ITERS-PRIMUS_LR_WARMUP_ITERS)) + +# Evaluation frequency (sample-based, adjusts automatically with GBS) +export EVAL_SAMPLES_INTERVAL=12288 # Evaluate every 12,288 samples +export PRIMUS_EVAL_INTERVAL=$((EVAL_SAMPLES_INTERVAL / PRIMUS_GLOBAL_BATCH_SIZE)) # 
Auto-computed + +# ----------------------------------------------------------------------------- +# Optimizations +# ----------------------------------------------------------------------------- +export PRIMUS_APPLY_ROPE_FUSION=True +export PRIMUS_FP8_RECIPE=hybrid + +# ----------------------------------------------------------------------------- +# MLPerf Logging +# ----------------------------------------------------------------------------- +export ENABLE_MLLOG=1 +export MLLOG_OUTPUT_FILE=/results/mlperf_output.log +export MLLOG_TRAIN_LOSS_LOG_FREQ=32 +export MLLOG_TARGET_EVAL_LOSS=3.2 +export MLLOG_SUBMISSION_BENCHMARK=gpt-oss-20b +export MLLOG_SUBMISSION_DIVISION=closed +export MLLOG_SUBMISSION_ORG=AMD +export MLLOG_SUBMISSION_PLATFORM=MI355X + +# ----------------------------------------------------------------------------- +# TE Configuration +# ----------------------------------------------------------------------------- +export NVTE_ROCM_ENABLE_MXFP8=0 diff --git a/gpt-oss-20b/primus/patches/megatron_validation_consumed_samples.patch b/gpt-oss-20b/primus/patches/megatron_validation_consumed_samples.patch new file mode 100644 index 000000000..14992ddef --- /dev/null +++ b/gpt-oss-20b/primus/patches/megatron_validation_consumed_samples.patch @@ -0,0 +1,62 @@ +diff --git a/megatron/legacy/data/data_samplers.py b/megatron/legacy/data/data_samplers.py +index 1bf1bf5ee..466da3041 100644 +--- a/megatron/legacy/data/data_samplers.py ++++ b/megatron/legacy/data/data_samplers.py +@@ -12,7 +12,7 @@ from megatron.core import mpu + from megatron.core.datasets.utils import Split + + +-def build_pretraining_data_loader(dataset, consumed_samples): ++def build_pretraining_data_loader(dataset, consumed_samples, name=""): + """Build dataloader given an input dataset.""" + + if dataset is None: +@@ -26,9 +26,12 @@ def build_pretraining_data_loader(dataset, consumed_samples): + else: + split = None + +- if split == Split.valid and args.full_validation: ++ is_validation = (split 
== Split.valid) or (name == "validation") ++ if is_validation: ++ eval_samples = args.eval_iters * args.global_batch_size # 64 * 16 = 1024 ++ total_samples = min(len(dataset), eval_samples) # Cap at 1024 even if dataset is larger + batch_sampler = MegatronPretrainingSampler( +- total_samples=len(dataset), ++ total_samples=total_samples, + consumed_samples=0, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), +diff --git a/megatron/training/training.py b/megatron/training/training.py +index 23a6ba617..9db7795f0 100644 +--- a/megatron/training/training.py ++++ b/megatron/training/training.py +@@ -2719,8 +2719,7 @@ def get_train_valid_test_num_samples(): + if args.full_validation: + eval_samples = None + else: +- eval_iters = (args.train_iters // args.eval_interval + 1) * args.eval_iters +- eval_samples = eval_iters * args.global_batch_size ++ eval_samples = args.eval_iters * args.global_batch_size + test_iters = args.eval_iters + + return (train_samples, eval_samples, test_iters * args.global_batch_size) +@@ -2775,14 +2774,11 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider + + valid_dataloaders = [] + for valid_d in valid_ds: +- if args.skip_train or args.full_validation: +- valid_dataloaders.append(build_pretraining_data_loader(valid_d, 0)) +- else: +- if args.multiple_validation_sets: +- # TODO(bnorick): for multiple validation sets without full validation, args.consumed_valid_samples is not +- # correct and needs to be calculated/set per validation set +- raise NotImplementedError("--multiple-validation-sets currently requires --full-validation") +- valid_dataloaders.append(build_pretraining_data_loader(valid_d, args.consumed_valid_samples)) ++ if args.multiple_validation_sets and not args.full_validation: ++ # TODO(bnorick): for multiple validation sets without full validation, args.consumed_valid_samples is not ++ # correct and needs to be calculated/set per validation set ++ raise 
NotImplementedError("--multiple-validation-sets currently requires --full-validation") ++ valid_dataloaders.append(build_pretraining_data_loader(valid_d, 0, "validation")) + if not args.multiple_validation_sets: + assert len(valid_dataloaders) == 1 + test_dataloader = build_pretraining_data_loader(test_ds, 0) diff --git a/gpt-oss-20b/primus/patches/primus_evaluator.patch b/gpt-oss-20b/primus/patches/primus_evaluator.patch new file mode 100644 index 000000000..202d84e1f --- /dev/null +++ b/gpt-oss-20b/primus/patches/primus_evaluator.patch @@ -0,0 +1,54 @@ +diff --git a/primus/backends/megatron/training/evaluator.py b/primus/backends/megatron/training/evaluator.py +index f7df2870..24d59cc7 100644 +--- a/primus/backends/megatron/training/evaluator.py ++++ b/primus/backends/megatron/training/evaluator.py +@@ -49,7 +49,9 @@ def primus_evaluate( + rerun_mode = rerun_state_machine.get_mode() + rerun_state_machine.set_mode(RerunMode.DISABLED) + +- total_loss_dict = {} ++ # Accumulate numerator and denominator separately across all eval iterations ++ total_loss_numerators = {} ++ total_loss_denominators = {} + + # make validation batch size independent from training batch size + eval_batch_size = args.global_batch_size +@@ -93,7 +95,7 @@ def primus_evaluate( + torch.cuda.empty_cache() + + if is_pipeline_stage_containing_loss(): +- # Average loss across microbatches. ++ # Accumulate loss across microbatches for this iteration. + for key in loss_dicts[0].keys(): + numerator = 0 + denominator = 0 +@@ -109,7 +111,12 @@ def primus_evaluate( + # and so the denominator is 1. 
+ numerator += val + denominator += 1 +- total_loss_dict[key] = numerator / denominator ++ # Accumulate across all eval iterations ++ if key not in total_loss_numerators: ++ total_loss_numerators[key] = 0 ++ total_loss_denominators[key] = 0 ++ total_loss_numerators[key] += numerator ++ total_loss_denominators[key] += denominator + + args.consumed_valid_samples += eval_batch_size + +@@ -125,6 +132,15 @@ def primus_evaluate( + log_rank_0("Exiting during evaluation, timelimit reached") + return None, None, True + ++ # Compute final average loss across all eval iterations ++ total_loss_dict = {} ++ if is_pipeline_stage_containing_loss(): ++ for key in total_loss_numerators.keys(): ++ if total_loss_denominators[key] > 0: ++ total_loss_dict[key] = total_loss_numerators[key] / total_loss_denominators[key] ++ else: ++ total_loss_dict[key] = 0.0 ++ + collected_non_loss_data = None + if non_loss_data_func is not None: + collected_non_loss_data = non_loss_data_func(model) diff --git a/gpt-oss-20b/primus/primus_mllog-0.1.0-py3-none-any.whl b/gpt-oss-20b/primus/primus_mllog-0.1.0-py3-none-any.whl new file mode 100644 index 000000000..75f0a035d Binary files /dev/null and b/gpt-oss-20b/primus/primus_mllog-0.1.0-py3-none-any.whl differ diff --git a/gpt-oss-20b/primus/requirements.txt b/gpt-oss-20b/primus/requirements.txt new file mode 100644 index 000000000..74fc25c1b --- /dev/null +++ b/gpt-oss-20b/primus/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/fanshiqing/grouped_gemm@v1.1.4 \ No newline at end of file diff --git a/gpt-oss-20b/primus/run.sub b/gpt-oss-20b/primus/run.sub new file mode 100644 index 000000000..444398842 --- /dev/null +++ b/gpt-oss-20b/primus/run.sub @@ -0,0 +1,55 @@ +#!/bin/bash + +#SBATCH --exclusive +#SBATCH --mem=0 + +set -eux + +# Vars without defaults +: "${DATADIR:?DATADIR not set}" +: "${MODELDIR:?MODELDIR not set}" +: "${CONT:?CONT not set}" +: "${WALLTIME:?WALLTIME not set}" + +# Vars with defaults +: "${NEXP:=1}" +: 
"${SEED_BASE:=${SEED-$RANDOM}}" +: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}" +: "${LOGDIR:=./results}" + +PYXIS_DEFAULTS=( '--no-container-mount-home' '--container-remap-root' '--container-writable' ) +: "${MASTER_PORT:=29500}" +export MASTER_PORT +export MASTER_ADDR="$(scontrol show hostnames "${SLURM_JOB_NODELIST-}" | head -n1)" +echo "using MASTER_ADDR \"${MASTER_ADDR}\" of list \"${SLURM_JOB_NODELIST}\"" + +readonly _cont_name="gpt_oss_20b_${SLURM_JOB_ID}" +readonly _logfile_base="${LOGDIR}/${DATESTAMP}" +_cont_mounts="${DATADIR}:/data,${LOGDIR}:/results,${MODELDIR}:/model" + +cleanup_pyxis() { + srun --ntasks-per-node=1 /bin/bash -c 'if [[ "$(enroot list)" ]]; then enroot remove -f $(enroot list); fi' +} +trap cleanup_pyxis TERM EXIT +cleanup_pyxis + +# Setup directories +( umask 0002; mkdir -p "${LOGDIR}" ) +srun --ntasks-per-node=1 mkdir -p "${LOGDIR}" + +# Setup container +srun --ntasks-per-node=1 --container-image="${CONT}" --container-name="${_cont_name}" "${PYXIS_DEFAULTS[@]}" true + +for _experiment_index in $(seq 1 "${NEXP}"); do + ( + # Run experiment + export SEED=$(($SEED_BASE - 1 + 10#$_experiment_index)) + srun -l --mpi="${SLURM_MPI_TYPE:-pmix}" \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --container-name="${_cont_name}" "${PYXIS_DEFAULTS[@]}" \ + --container-mounts="${_cont_mounts}" \ + --container-env=MASTER_PORT,MASTER_ADDR \ + slurm2pytorch ./run_and_time.sh + + ) |& tee "${_logfile_base}_${_experiment_index}.log" +done diff --git a/gpt-oss-20b/primus/run_and_time.sh b/gpt-oss-20b/primus/run_and_time.sh new file mode 100755 index 000000000..ffe896826 --- /dev/null +++ b/gpt-oss-20b/primus/run_and_time.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. 
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+set -e
+
+# Create results directory
+mkdir -p /results
+
+cd /workspace/code
+
+echo "============================================"
+echo "MLPerf GPT-OSS-20B Training"
+echo "============================================"
+echo "Config: ${EXP}"
+echo "Data: ${DATA_PATH}"
+echo "GPUs: ${GPUS_PER_NODE}"
+echo "Nodes: ${NNODES}"
+echo "============================================"
+
+# Start timing
+start=$(date +%s)
+start_fmt=$(date +%Y-%m-%d\ %r)
+echo "STARTING TIMING RUN AT $start_fmt"
+
+# Launch training. Each launcher's status is captured with `|| ret_code=$?`:
+# under `set -e` a bare failing command would abort the script right here and
+# skip the end-of-run timing and failure report that follow.
+ret_code=0
+if [[ ${LOCAL_WORLD_SIZE:-1} -gt 1 ]]; then
+    echo "Running with SLURM"
+    python -u src/train.py || ret_code=$?
+else
+    echo "Running with docker"
+    torchrun \
+        --nproc_per_node="${GPUS_PER_NODE}" \
+        --nnodes="${NNODES}" \
+        --node_rank="${NODE_RANK}" \
+        --master_addr="${MASTER_ADDR}" \
+        --master_port="${MASTER_PORT}" \
+        src/train.py || ret_code=$?
+fi
+
+# End timing
+end=$(date +%s)
+end_fmt=$(date +%Y-%m-%d\ %r)
+echo "ENDING TIMING RUN AT $end_fmt"
+
+# Report result
+result=$(( end - start ))
+result_name="GPT_OSS_20B"
+echo "RESULT,$result_name,,$result,$start_fmt"
+
+if [[ $ret_code != 0 ]]; then
+    echo "Training failed with exit code: $ret_code"
+    exit $ret_code
+fi
+
+exit 0
diff --git a/gpt-oss-20b/primus/run_with_docker.sh b/gpt-oss-20b/primus/run_with_docker.sh
new file mode 100755
index 000000000..9d2665c2a
--- /dev/null
+++ b/gpt-oss-20b/primus/run_with_docker.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+#
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+set -euxo pipefail
+
+# Change directory to the primus directory (quoted so paths with spaces work)
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+cd "$SCRIPT_DIR"
+
+# Vars without defaults
+: "${DGXSYSTEM:?DGXSYSTEM not set}"
+: "${CONT:?CONT not set}"
+: "${DATADIR:?DATADIR not set}"
+: "${MODELDIR:?MODELDIR not set}"
+: "${LOGDIR:?LOGDIR not set}"
+
+# Vars with defaults
+: "${NEXP:=1}"
+: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}"
+: "${CLEAR_CACHES:=1}"
+: "${CHECK_COMPLIANCE:=0}"
+: "${MLPERF_RULESET:=6.0.0}"
+
+: "${CONT_NAME:=dev}"
+: "${NGPU:=1}"
+: "${LOG_FREQ:=0}"
+# Default HF_TOKEN with xtrace off so `set -x` does not echo the token value.
+{ set +x; } 2>/dev/null; : "${HF_TOKEN:=""}"; set -x
+
+# Other vars
+readonly _config_file="./config_${DGXSYSTEM}.sh"
+readonly _logfile_base="${LOGDIR}/${DATESTAMP}"
+readonly _cont_name="${CONT_NAME}"
+_cont_mounts=("--volume=${DATADIR}:/data" "--volume=${MODELDIR}:/model" "--volume=$(pwd):/workspace/code" "--volume=${LOGDIR}:/results")
+
+# Setup directories
+mkdir -p "${LOGDIR}"
+
+# Get list of envvars to pass to docker
+mapfile -t _config_env < <(env -i bash -c ". ${_config_file} && compgen -e" | grep -E -v '^(PWD|SHLVL)')
+_config_env+=(DATADIR)
+_config_env+=(MODELDIR)
+_config_env+=(MODEL)
+_config_env+=(DGXSYSTEM)
+_config_env+=(PROFILER)
+_config_env+=(LOGDIR)
+_config_env+=(HIPBLASLT_LOG)
+_config_env+=(GEMM_OFFLINE_TUNING)
+_config_env+=(GEMM_USE_TUNING_RESULTS)
+_config_env+=(HF_TOKEN)
+_config_env+=(SEED)
+
+echo "${_config_env[@]}"
+mapfile -t _config_env < <(for v in "${_config_env[@]}"; do echo "--env=$v"; done)
+
+# Cleanup container
+cleanup_docker() {
+    # Probe with `docker container inspect`, not `docker ps | grep -q`: under pipefail an
+    # early grep -q exit can SIGPIPE docker ps and fail the test for an existing container.
+    if docker container inspect "${_cont_name}" >/dev/null 2>&1; then
+        docker container rm -f "${_cont_name}" || true
+    else
+        echo "Container ${_cont_name} does not exist. Skipping removal."
+    fi
+}
+cleanup_docker
+trap 'set -eux; cleanup_docker' EXIT
+
+# Setup container
+# Use DGXSYSTEM to determine hardware type (MI* = AMD/ROCm, otherwise NVIDIA)
+if [[ "${DGXSYSTEM}" == MI* ]]; then
+    echo "Using AMD/ROCm container flags"
+    docker run --rm --init --detach \
+        --net=host --uts=host --ipc=host \
+        --device /dev/dri --device /dev/kfd --device=/dev/infiniband \
+        --cap-add=SYS_PTRACE --cap-add=CAP_SYS_ADMIN \
+        --security-opt=seccomp=unconfined \
+        --group-add video \
+        --privileged \
+        --name="${_cont_name}" "${_cont_mounts[@]}" \
+        -e IMAGE_NAME="${CONT}" \
+        "${CONT}" sleep infinity
+else
+    echo "Using NVIDIA container flags"
+    docker run --rm --init --detach \
+        --net=host --uts=host \
+        --ipc=host --gpus all \
+        --ulimit memlock=-1 \
+        --ulimit stack=67108864 \
+        --device=/dev/infiniband \
+        --security-opt=seccomp=unconfined \
+        --name="${_cont_name}" "${_cont_mounts[@]}" \
+        -e IMAGE_NAME="${CONT}" \
+        "${CONT}" sleep infinity
+fi
+
+# Make sure container has time to finish initialization
+sleep 5
+docker exec "${_cont_name}" true
+
+# Run experiments
+for _experiment_index in $(seq 1 "${NEXP}"); do
+    (
+        echo "Beginning trial ${_experiment_index} of ${NEXP}"
+        if [[ $CLEAR_CACHES == 1 ]]; then
+            bash -c "echo -n 'Clearing cache on ' && hostname && sync && sudo /sbin/sysctl vm.drop_caches=3"
+        fi
+        # Use existing SEED if set; otherwise use a new RANDOM value
+        _config_env+=(--env=SEED="${SEED:-$RANDOM}")
+        echo 'launching experiment using:' "${_config_env[@]}" "${_cont_name}" /workspace/code/run_and_time.sh
+        docker exec "${_config_env[@]}" "${_cont_name}" bash /workspace/code/run_and_time.sh
+    ) | grep --line-buffered -v "connected peer ranks" | tee "${_logfile_base}_${_experiment_index}.log"
+
+    if [ "${CHECK_COMPLIANCE}" -eq 1 ]; then
+        docker exec "${_config_env[@]}" "${_cont_name}" \
+            python3 -m mlperf_logging.compliance_checker --usage training \
+            --ruleset "${MLPERF_RULESET}" \
+            --log_output "/results/compliance_${DATESTAMP}.out" \
+            "/results/${DATESTAMP}_${_experiment_index}.log"
+    fi
+
+done
diff --git a/gpt-oss-20b/primus/src/train.py b/gpt-oss-20b/primus/src/train.py
new file mode 100644
index 000000000..e0905cb5b
--- /dev/null
+++ b/gpt-oss-20b/primus/src/train.py
@@ -0,0 +1,122 @@
+#
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+import argparse
+import os
+import sys
+from pathlib import Path
+from typing import Optional, Tuple
+
+# Primus and Megatron-LM checkouts; PRIMUS_PATH can be overridden via the environment.
+PRIMUS_PATH = os.getenv("PRIMUS_PATH", "/workspace/deps/Primus")
+MEGATRON_PATH = os.path.join(PRIMUS_PATH, "third_party/Megatron-LM")
+
+# Put both checkouts on sys.path before the primus imports below are executed.
+if PRIMUS_PATH not in sys.path:
+    sys.path.insert(0, PRIMUS_PATH)
+if MEGATRON_PATH not in sys.path:
+    sys.path.insert(0, MEGATRON_PATH)
+
+from primus.core.launcher.config import PrimusConfig
+from primus.core.launcher.parser import load_primus_config, add_pretrain_parser
+from primus_mllog import MLPerfMegatronPretrainTrainer
+
+DEFAULT_CONFIG_PATH = "/workspace/code/conf/gpt_oss_20B-pretrain.yaml"
+
+
+def setup_environment(data_path: Optional[str] = None) -> None:
+    """Set HF_HOME to ``<data_path>/huggingface`` if data_path is given and HF_HOME is unset."""
+    if data_path and "HF_HOME" not in os.environ:
+        hf_home = os.path.join(data_path, "huggingface")
+        os.environ["HF_HOME"] = hf_home
+        print(f"[MLPerf Train] HF_HOME={hf_home}")
+
+
+def load_config(config_path: str, overrides: Optional[list] = None) -> Tuple[PrimusConfig, list]:
+    """Load the experiment YAML through the Primus config loader.
+
+    Args:
+        config_path: Path to the experiment YAML (e.g. gpt_oss_20B-pretrain.yaml).
+        overrides: Optional override arguments forwarded to ``load_primus_config``.
+
+    Returns:
+        ``(primus_cfg, unknown_overrides)`` as returned by ``load_primus_config``;
+        ``main`` forwards the second item to the trainer as ``extra_args``.
+    """
+    # Build the argparse namespace the loader expects from explicit arguments, not sys.argv.
+    parser = argparse.ArgumentParser()
+    add_pretrain_parser(parser)
+    args = parser.parse_args([
+        '--config', config_path,
+        '--data_path', os.getenv('DATA_PATH', '/data'),
+    ])
+
+    primus_cfg, unknown_overrides = load_primus_config(args, overrides or [])
+
+    print(f"[MLPerf Train] Loaded config from: {config_path}")
+    print(f"[MLPerf Train] Framework: {primus_cfg.get_module_config('pre_trainer').framework}")
+
+    return primus_cfg, unknown_overrides
+
+
+def create_trainer(primus_cfg: PrimusConfig, extra_args: Optional[list] = None) -> MLPerfMegatronPretrainTrainer:
+    """Build the MLPerf-enabled Megatron trainer for the ``pre_trainer`` module.
+
+    Rank, world size, master address and port are read from RANK, WORLD_SIZE, MASTER_ADDR
+    and MASTER_PORT, defaulting to a single local process (rank 0 of 1, 127.0.0.1:29500).
+    """
+    rank = int(os.getenv("RANK", "0"))
+    world_size = int(os.getenv("WORLD_SIZE", "1"))
+    master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
+    master_port = int(os.getenv("MASTER_PORT", "29500"))
+
+    return MLPerfMegatronPretrainTrainer(
+        module_name="pre_trainer",
+        primus_config=primus_cfg,
+        module_rank=rank,
+        module_world_size=world_size,
+        module_master_addr=master_addr,
+        module_master_port=master_port,
+        extra_args=extra_args,
+    )
+
+
+def main() -> None:
+    """Resolve the config from $EXP, prepare the environment and run training."""
+    # `or` also covers EXP being set but empty: Path("") is "." and would pass
+    # an exists() check, so fall back to the default and require a real file.
+    config_path = os.environ.get("EXP") or DEFAULT_CONFIG_PATH
+    if not Path(config_path).is_file():
+        raise FileNotFoundError(f"Config not found: {config_path}")
+
+    setup_environment(data_path=os.getenv('DATA_PATH', '/data'))
+    primus_cfg, extra_args = load_config(config_path)
+
+    trainer = create_trainer(primus_cfg, extra_args)
+    trainer.init()
+    trainer.run()
+
+
+if __name__ == "__main__":
+    main()