31 changes: 31 additions & 0 deletions explorations/muon_speedrun_preset.yaml
@@ -0,0 +1,31 @@
# Compare NanoGPT speedrun optimizer presets for Muon and AdamW on minipile.
---
parameter_groups:
  - optimizer: ["adamw"]
    optimizer_preset: ["speedrun"]
  - optimizer: ["muon"]
    optimizer_preset: ["speedrun"]
    muon_momentum: [0.95]
Copilot AI Dec 12, 2025
The muon_momentum parameter is redundantly specified here since line 512 in the _apply_optimizer_presets() method already sets this to 0.95 when the speedrun preset is used with the muon optimizer. This redundant specification could cause confusion about which value takes precedence.

Suggested change (remove this line):
    muon_momentum: [0.95]

- optimizer: ["adamw"]

# GPT-2 architecture base hyperparameters
n_layer: [6]
n_head: [6]
n_embd: [384]
block_size: [256]
batch_size: [64]
max_iters: [100000]
eval_interval: [10000]
eta_variant: ["iteration"]
dataset: ["minipile"]
device: ["cuda"]
dtype: ["float16"]
use_abs_pos_embeddings: [false]
use_rotary_embeddings: [true]
use_qk_norm: [true]
use_qk_norm_scale: [true]
use_peri_ln: [true]
softmax_variant_attn: ["softmax"]
compile: [true]
never_save_checkpoint: [true]
tensorboard_run_name: ["muon_vs_adamw_speedrun"]
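Regarding the precedence question raised in the review comment above: a minimal standalone sketch (hypothetical names, not the project's code) of why the preset value wins. Because create_optimizer() calls _apply_optimizer_presets() after the YAML/CLI values have been parsed into args, the preset assignment simply overwrites whatever the config supplied.

# Standalone illustration of preset precedence (hypothetical, not the project's code).
from types import SimpleNamespace

args = SimpleNamespace(optimizer="muon", optimizer_preset="speedrun",
                       muon_momentum=0.90)  # pretend the YAML asked for 0.90

def apply_optimizer_presets(args):
    # Mirrors the shape of the new _apply_optimizer_presets(): runs after parsing.
    if getattr(args, "optimizer_preset", "none") != "speedrun":
        return
    if args.optimizer == "muon":
        args.muon_momentum = 0.95  # preset value overwrites the configured one

apply_optimizer_presets(args)
print(args.muon_momentum)  # prints 0.95 regardless of what the YAML specified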
1 change: 1 addition & 0 deletions explorations/muon_vs_adamw.yaml
@@ -10,6 +10,7 @@ parameter_groups:
- optimizer: ["muon"]
learning_rate: ["0.05"]
muon_momentum: [0.95]
- optimizer: ["adamw"]

# GPT-2 architecture base hyperparameters
n_layer: [6]
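Both exploration configs rely on parameter_groups sweeps. As a rough illustration, and assuming the exploration runner merges each group with the shared top-level lists and takes their Cartesian product (an assumption about the runner, not something shown in this diff), the speedrun config above would expand along these lines:

# Hypothetical expansion of a parameter_groups sweep into individual runs.
from itertools import product

shared = {"n_layer": [6], "n_head": [6], "dataset": ["minipile"]}  # abbreviated
groups = [
    {"optimizer": ["adamw"], "optimizer_preset": ["speedrun"]},
    {"optimizer": ["muon"], "optimizer_preset": ["speedrun"], "muon_momentum": [0.95]},
    {"optimizer": ["adamw"]},
]

runs = []
for group in groups:
    merged = {**shared, **group}          # group values override shared ones
    keys = list(merged)
    for combo in product(*(merged[k] for k in keys)):
        runs.append(dict(zip(keys, combo)))

print(len(runs))  # 3 runs here, since every list holds a single entry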
22 changes: 22 additions & 0 deletions train.py
@@ -496,9 +496,31 @@ def _initialize_teacher_if_needed(self):
print(f"Loaded teacher checkpoint from {expanded}")


    def _apply_optimizer_presets(self):
        preset = getattr(self.args, "optimizer_preset", "none")

        if preset != "speedrun":
            return

        if self.args.optimizer == "adamw":
            self.args.learning_rate = 6e-4
            self.args.beta1, self.args.beta2 = 0.9, 0.95
            self.args.adamw_weight_decay = 0.1
            self.args.adamw_eps = 1e-8
        elif self.args.optimizer == "muon":
            self.args.learning_rate = 2e-2
            self.args.muon_momentum = 0.95
            self.args.weight_decay = 0.0

        if self.master_process:
            print(f"Applied {preset} preset for optimizer '{self.args.optimizer}'.")


Comment on lines +510 to +518
Copilot AI Dec 12, 2025
When the speedrun preset is specified but the optimizer is neither "adamw" nor "muon", the function will print "Applied speedrun preset for optimizer 'X'" even though no preset values were actually applied. This could mislead users into thinking the preset was applied when it wasn't. Consider adding an else clause that either warns the user or skips the print statement when the optimizer doesn't have a preset implementation.

Suggested change

Replace:
        elif self.args.optimizer == "muon":
            self.args.learning_rate = 2e-2
            self.args.muon_momentum = 0.95
            self.args.weight_decay = 0.0
        if self.master_process:
            print(f"Applied {preset} preset for optimizer '{self.args.optimizer}'.")

With:
            if self.master_process:
                print(f"Applied {preset} preset for optimizer 'adamw'.")
        elif self.args.optimizer == "muon":
            self.args.learning_rate = 2e-2
            self.args.muon_momentum = 0.95
            self.args.weight_decay = 0.0
            if self.master_process:
                print(f"Applied {preset} preset for optimizer 'muon'.")
        else:
            if self.master_process:
                print(f"Warning: No '{preset}' preset available for optimizer '{self.args.optimizer}'. No preset values were applied.")

    def create_optimizer(self):
        optimizer_key = self.args.optimizer

        self._apply_optimizer_presets()

        if optimizer_key == "muon":
            named = list(self.model.named_parameters())
            exclude = ("embed", "wte", "wpe", "lm_head")
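The truncated hunk above builds the per-optimizer parameter groups from named_parameters() and the exclude tuple. A hypothetical sketch of the usual split (not a reproduction of the elided code): 2-D hidden weight matrices are routed to Muon, while embeddings, the lm_head, and 1-D tensors such as norms and biases stay on AdamW.

# Hypothetical parameter split for a Muon/AdamW pairing (illustrative only).
def split_params_for_muon(model, exclude=("embed", "wte", "wpe", "lm_head")):
    muon_params, adamw_params = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # Hidden 2-D weights go to Muon; excluded names and 1-D params go to AdamW.
        if param.ndim >= 2 and not any(token in name for token in exclude):
            muon_params.append(param)
        else:
            adamw_params.append(param)
    return muon_params, adamw_params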
7 changes: 7 additions & 0 deletions train_args.py
@@ -342,6 +342,13 @@ def parse_args():
    # -------- MUON --------------------------------------------------
    training_group.add_argument("--muon_momentum", type=float, default=0.95,
                                help="Momentum for the Muon optimizer.")
    training_group.add_argument(
        "--optimizer_preset",
        type=str,
        default="none",
        choices=["none", "speedrun"],
        help="Optional preset hyperparameters for supported optimizers (currently AdamW and Muon).",
    )
    # -------- ADAMW --------------------------------------------------
    training_group.add_argument("--adamw_betas", type=float, nargs=2, default=[0.9, 0.999], help="Betas for AdamW optimizer.")
    training_group.add_argument("--adamw_eps", type=float, default=1e-8, help="Epsilon for AdamW optimizer.")
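For context, a hypothetical way to exercise the new option from the command line, assuming train.py also exposes an --optimizer flag (not shown in this diff):

python train.py --optimizer muon --optimizer_preset speedrun
python train.py --optimizer adamw --optimizer_preset speedrun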