From 945eb736eab6555b04d18b878e2c9e051a7b144a Mon Sep 17 00:00:00 2001
From: Kauna <16511995+klei22@users.noreply.github.com>
Date: Mon, 8 Dec 2025 23:31:30 -0800
Subject: [PATCH 1/2] Add speedrun optimizer preset and muon exploration

---
 explorations/muon_speedrun_preset.yaml | 30 ++++++++++++++++++++++++++
 train.py                               | 22 +++++++++++++++++++
 train_args.py                          |  7 ++++++
 3 files changed, 59 insertions(+)
 create mode 100644 explorations/muon_speedrun_preset.yaml

diff --git a/explorations/muon_speedrun_preset.yaml b/explorations/muon_speedrun_preset.yaml
new file mode 100644
index 0000000000..b906baa40d
--- /dev/null
+++ b/explorations/muon_speedrun_preset.yaml
@@ -0,0 +1,30 @@
+# Compare NanoGPT speedrun optimizer presets for Muon and AdamW on minipile.
+---
+parameter_groups:
+  - optimizer: ["adamw"]
+    optimizer_preset: ["speedrun"]
+  - optimizer: ["muon"]
+    optimizer_preset: ["speedrun"]
+    muon_momentum: [0.95]
+
+# GPT-2 architecture base hyperparameters
+n_layer: [6]
+n_head: [6]
+n_embd: [384]
+block_size: [256]
+batch_size: [64]
+max_iters: [10000]
+eval_interval: [10000]
+eta_variant: ["iteration"]
+dataset: ["minipile"]
+device: ["cuda"]
+dtype: ["float16"]
+use_abs_pos_embeddings: [false]
+use_rotary_embeddings: [true]
+use_qk_norm: [true]
+use_qk_norm_scale: [true]
+use_peri_ln: [true]
+softmax_variant_attn: ["softmax"]
+compile: [true]
+never_save_checkpoint: [true]
+tensorboard_run_name: ["muon_vs_adamw_speedrun"]
diff --git a/train.py b/train.py
index 0b8e98595a..329373b90f 100644
--- a/train.py
+++ b/train.py
@@ -496,9 +496,31 @@ def _initialize_teacher_if_needed(self):
 
             print(f"Loaded teacher checkpoint from {expanded}")
 
+    def _apply_optimizer_presets(self):
+        preset = getattr(self.args, "optimizer_preset", "none")
+
+        if preset != "speedrun":
+            return
+
+        if self.args.optimizer == "adamw":
+            self.args.learning_rate = 6e-4
+            self.args.beta1, self.args.beta2 = 0.9, 0.95
+            self.args.adamw_weight_decay = 0.1
+            self.args.adamw_eps = 1e-8
+        elif self.args.optimizer == "muon":
+            self.args.learning_rate = 2e-2
+            self.args.muon_momentum = 0.95
+            self.args.weight_decay = 0.0
+
+        if self.master_process:
+            print(f"Applied {preset} preset for optimizer '{self.args.optimizer}'.")
+
+
     def create_optimizer(self):
         optimizer_key = self.args.optimizer
 
+        self._apply_optimizer_presets()
+
         if optimizer_key == "muon":
             named = list(self.model.named_parameters())
             exclude = ("embed", "wte", "wpe", "lm_head")
diff --git a/train_args.py b/train_args.py
index 94767f9cc3..289eac0264 100644
--- a/train_args.py
+++ b/train_args.py
@@ -342,6 +342,13 @@ def parse_args():
 
     # -------- MUON --------------------------------------------------
     training_group.add_argument("--muon_momentum", type=float, default=0.95, help="Momentum for the Muon optimizer.")
+    training_group.add_argument(
+        "--optimizer_preset",
+        type=str,
+        default="none",
+        choices=["none", "speedrun"],
+        help="Optional preset hyperparameters for supported optimizers (currently AdamW and Muon).",
+    )
     # -------- ADAMW --------------------------------------------------
     training_group.add_argument("--adamw_betas", type=float, nargs=2, default=[0.9, 0.999], help="Betas for AdamW optimizer.")
     training_group.add_argument("--adamw_eps", type=float, default=1e-8, help="Epsilon for AdamW optimizer.")

From 31df28681a38c3e6c38c88512ef35cd158d3da64 Mon Sep 17 00:00:00 2001
From: klei22
Date: Fri, 12 Dec 2025 09:11:48 -0800
Subject: [PATCH 2/2] Add preset with optimized muon settings

---
 explorations/muon_speedrun_preset.yaml | 3 ++-
 explorations/muon_vs_adamw.yaml        | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/explorations/muon_speedrun_preset.yaml b/explorations/muon_speedrun_preset.yaml
index b906baa40d..be4606013f 100644
--- a/explorations/muon_speedrun_preset.yaml
+++ b/explorations/muon_speedrun_preset.yaml
@@ -6,6 +6,7 @@ parameter_groups:
   - optimizer: ["muon"]
     optimizer_preset: ["speedrun"]
     muon_momentum: [0.95]
+  - optimizer: ["adamw"]
 
 # GPT-2 architecture base hyperparameters
 n_layer: [6]
@@ -13,7 +14,7 @@ n_head: [6]
 n_embd: [384]
 block_size: [256]
 batch_size: [64]
-max_iters: [10000]
+max_iters: [100000]
 eval_interval: [10000]
 eta_variant: ["iteration"]
 dataset: ["minipile"]
diff --git a/explorations/muon_vs_adamw.yaml b/explorations/muon_vs_adamw.yaml
index f57572d4a2..376b4fec70 100644
--- a/explorations/muon_vs_adamw.yaml
+++ b/explorations/muon_vs_adamw.yaml
@@ -10,6 +10,7 @@ parameter_groups:
   - optimizer: ["muon"]
     learning_rate: ["0.05"]
     muon_momentum: [0.95]
+  - optimizer: ["adamw"]
 
 # GPT-2 architecture base hyperparameters
 n_layer: [6]
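
Note on the preset wiring: the override logic added in _apply_optimizer_presets
can be exercised in isolation. Below is a minimal, stdlib-only sketch of that
behavior, restated in table-driven form; the SPEEDRUN_PRESETS dict and the
standalone apply_optimizer_presets helper are illustrative names, not part of
the patch, but the hyperparameter values are copied from the hunks above.

    from argparse import Namespace

    # Values copied from _apply_optimizer_presets in the train.py hunk.
    SPEEDRUN_PRESETS = {
        "adamw": {"learning_rate": 6e-4, "beta1": 0.9, "beta2": 0.95,
                  "adamw_weight_decay": 0.1, "adamw_eps": 1e-8},
        "muon": {"learning_rate": 2e-2, "muon_momentum": 0.95,
                 "weight_decay": 0.0},
    }

    def apply_optimizer_presets(args):
        # Mirrors train.py: a no-op unless --optimizer_preset speedrun is set.
        if getattr(args, "optimizer_preset", "none") != "speedrun":
            return args
        for key, value in SPEEDRUN_PRESETS.get(args.optimizer, {}).items():
            setattr(args, key, value)
        return args

    args = apply_optimizer_presets(
        Namespace(optimizer="muon", optimizer_preset="speedrun"))
    print(args.learning_rate, args.muon_momentum, args.weight_decay)
    # -> 0.02 0.95 0.0

In the trainer itself the same effect should come from running with
--optimizer muon --optimizer_preset speedrun (assuming the repo's existing
--optimizer flag), since create_optimizer now invokes the preset hook before
building the optimizer, so preset values silently replace any conflicting
flags for that run.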