Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
18410e3
[WIP] Initial DeepSeek reference implementation
denys-fridman Jan 14, 2026
412ae53
Add NemoRun launcher for DeepSeek V3 pretraining
denys-fridman Jan 20, 2026
3ac907c
Update run_deepseek_v3_671b.sh to use NemoRun launcher
denys-fridman Jan 20, 2026
b7bb8cd
cd to current dir
denys-fridman Jan 21, 2026
ff7e160
detach = True by default
denys-fridman Jan 21, 2026
38e318c
Fix env vars
denys-fridman Jan 21, 2026
200f496
Remove unused arguments and fix eval config bug
denys-fridman Jan 21, 2026
b0423ff
add missing pretrain args
denys-fridman Jan 21, 2026
c59fd89
fix pipeline layout + tokenizer config
denys-fridman Jan 21, 2026
48fd489
fix import
denys-fridman Jan 21, 2026
55ac23c
remove force_success_status var
denys-fridman Jan 21, 2026
0932bad
fix args
denys-fridman Jan 21, 2026
99df101
fix args
denys-fridman Jan 21, 2026
6a45841
use 8b data paths
denys-fridman Jan 21, 2026
f166413
use TMP_NPY_INDEX
denys-fridman Jan 21, 2026
ea2b48f
update config
denys-fridman Jan 21, 2026
544f076
set adam epsilon to 1e-8
denys-fridman Feb 25, 2026
18c50a8
set MBRIDGE_REVISION to 38858ef0ae3b835af236ec1cf8bf1feca4d400fa
denys-fridman Feb 25, 2026
6580086
update TODO comment to reference logging PR #445
denys-fridman Feb 25, 2026
163128e
add missing mllogger submission logs (CACHE_CLEAR, POC_NAME, POC_EMAIL)
denys-fridman Feb 25, 2026
1caf442
add config files for GBS 256/288/320
denys-fridman Feb 25, 2026
f116f4a
set max_steps=12000, warmup_steps=4, eval_check_interval=1, and per-c…
denys-fridman Feb 25, 2026
ae8ff28
remove quotes around MAX_LR
denys-fridman Feb 25, 2026
829f618
cherry-pick eval batch size knob from dfridman/Megatron-Bridge
denys-fridman Feb 25, 2026
e84e073
add eval_batch_size support, set to 1024 in all configs
denys-fridman Feb 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions moe_pretraining/nemo/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


# Base image; override at build time with --build-arg FROM_IMAGE_NAME=<image>.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:25.12-py3

FROM $FROM_IMAGE_NAME

# Blank out PIP_CONSTRAINT so the pip invocations below take no constraints
# file from the environment.
# NOTE(review): presumably the base image exports a constraints file via this
# variable - confirm.
ENV PIP_CONSTRAINT=""

# Placeholder git identity: the `git cherry-pick` step further down creates a
# commit, and git needs user.name / user.email to be set for that.
RUN git config --global user.email "a" && \
    git config --global user.name "a"

## 0. Extra pinned Python packages
# NOTE(review): this section used to be titled "Pytorch Checkpoint size patch",
# but no patch is applied here - only the two pinned installs below.
WORKDIR /workspace/

# One layer for both pins; --no-cache-dir keeps pip's download cache out of
# the image, matching the requirements.txt install later in this file.
RUN pip install --no-cache-dir \
        numcodecs==0.16.3 \
        nvidia-mathdx==25.1.1


## 1. NeMo-Run
# NEMORUN_REVISION is passed straight to `git checkout`; override it with
# --build-arg NEMORUN_REVISION=<ref>.
ARG NEMORUN_REVISION=v0.5.0
# Persist the requested revision in the image environment (key=value form;
# the space-separated `ENV key value` syntax is deprecated).
ENV CUSTOM_NEMORUN_REVISION=${NEMORUN_REVISION}
# Clone into the current WORKDIR, pin the checkout, print the resolved commit
# to the build log, then install the checkout in editable mode.
# --no-cache-dir keeps pip's download cache out of the layer.
RUN git clone https://github.com/NVIDIA/NeMo-Run.git && \
    cd NeMo-Run && \
    git checkout "${NEMORUN_REVISION}" && \
    echo "NEMORUN_COMMIT_HASH=$(git rev-parse HEAD)" && \
    pip install --no-cache-dir -e .

## 2. Megatron-bridge and megatron-core
# MBRIDGE_REVISION is passed straight to `git checkout`; override it with
# --build-arg MBRIDGE_REVISION=<ref>.
ARG MBRIDGE_REVISION=38858ef0ae3b835af236ec1cf8bf1feca4d400fa
# Steps, all in one layer:
#   1. Remove any installed megatron-core.
#   2. Clone Megatron-Bridge at MBRIDGE_REVISION and initialise its submodules.
#   3. Write the checked-out Megatron-Bridge commit to /MBRIDGE_COMMIT_HASH.env.
#      This is taken BEFORE the cherry-pick, so it records the pinned revision
#      rather than the locally patched HEAD.
#   4. Cherry-pick the eval-batch-size commit from the dfridman fork.
#   5. Write the 3rdparty/Megatron-LM submodule commit to /MCORE_COMMIT_HASH.env
#      and install that checkout in editable mode.
# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip uninstall -y megatron-core && \
    git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git /workspace/Megatron-Bridge && \
    cd /workspace/Megatron-Bridge && \
    git checkout "${MBRIDGE_REVISION}" && \
    git submodule update --init --recursive && \
    echo "MBRIDGE_COMMIT_HASH=$(git rev-parse HEAD)" && \
    git rev-parse HEAD > /MBRIDGE_COMMIT_HASH.env && \
    # TODO(dfridman): knob for eval batch size. Remove once merged in the upstream
    git remote add dfridman https://github.com/denys-fridman/Megatron-Bridge.git && \
    git fetch dfridman && \
    git cherry-pick 01cc7a6c76f5d8918a981e22f02975ae775c2d97 && \
    cd /workspace/Megatron-Bridge/3rdparty/Megatron-LM && \
    echo "MCORE_COMMIT_HASH=$(git rev-parse HEAD)" && \
    git rev-parse HEAD > /MCORE_COMMIT_HASH.env && \
    pip install --no-cache-dir -e .

# Put the Megatron-Bridge sources and the Megatron-LM checkout ahead of any
# PYTHONPATH already set (key=value form; the space-separated
# `ENV key value` syntax is deprecated).
ENV PYTHONPATH="/workspace/Megatron-Bridge/src:/workspace/Megatron-Bridge/3rdparty/Megatron-LM:${PYTHONPATH}"

## 3. Benchmark dependencies
# Remove the installed transformers package before installing requirements.
# NOTE(review): presumably so the version resolved from requirements.txt is
# the only one present - confirm.
RUN pip uninstall --yes transformers
# requirements.txt is copied by itself ahead of the full source tree, so the
# install layer below can be reused from cache when only other files change.
COPY requirements.txt ./
RUN pip install --no-cache-dir --upgrade --requirement requirements.txt


# The benchmark sources live in /workspace/code, which is also the working
# directory left set for the image.
WORKDIR /workspace/code

COPY . ./
Loading
Loading