Opt muon settings #697

base: master
Changes from all commits
@@ -0,0 +1,31 @@

```yaml
# Compare NanoGPT speedrun optimizer presets for Muon and AdamW on minipile.
---
parameter_groups:
  - optimizer: ["adamw"]
    optimizer_preset: ["speedrun"]
  - optimizer: ["muon"]
    optimizer_preset: ["speedrun"]
    muon_momentum: [0.95]
  - optimizer: ["adamw"]

# GPT-2 architecture base hyperparameters
n_layer: [6]
n_head: [6]
n_embd: [384]
block_size: [256]
batch_size: [64]
max_iters: [100000]
eval_interval: [10000]
eta_variant: ["iteration"]
dataset: ["minipile"]
device: ["cuda"]
dtype: ["float16"]
use_abs_pos_embeddings: [false]
use_rotary_embeddings: [true]
use_qk_norm: [true]
use_qk_norm_scale: [true]
use_peri_ln: [true]
softmax_variant_attn: ["softmax"]
compile: [true]
never_save_checkpoint: [true]
tensorboard_run_name: ["muon_vs_adamw_speedrun"]
```
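Every value in this config is a list, which suggests a grid-sweep runner that merges each parameter group with the base hyperparameters and takes the Cartesian product of the listed values. A hypothetical sketch of that expansion (the file name, and the runner semantics generally, are assumptions rather than this repo's actual sweep code):

```python
# Hypothetical expansion of the sweep config above; the repo's real runner
# may behave differently.
import itertools
import yaml

def expand_runs(config: dict):
    """Yield one flat {param: value} dict per hyperparameter combination."""
    base = {k: v for k, v in config.items() if k != "parameter_groups"}
    for group in config.get("parameter_groups", [{}]):
        merged = {**base, **group}  # group entries override base entries
        keys = list(merged)
        for values in itertools.product(*(merged[k] for k in keys)):
            yield dict(zip(keys, values))

with open("muon_vs_adamw_speedrun.yaml") as f:  # assumed file name
    cfg = yaml.safe_load(f)

for run in expand_runs(cfg):
    print(run["optimizer"], run.get("optimizer_preset", "none"))
# Since every list holds one entry, this yields exactly three runs:
# adamw speedrun, muon speedrun, and a baseline adamw with no preset.
```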
@@ -496,9 +496,31 @@ def _initialize_teacher_if_needed(self):

```python
                print(f"Loaded teacher checkpoint from {expanded}")

    def _apply_optimizer_presets(self):
        preset = getattr(self.args, "optimizer_preset", "none")

        if preset != "speedrun":
            return

        if self.args.optimizer == "adamw":
            self.args.learning_rate = 6e-4
            self.args.beta1, self.args.beta2 = 0.9, 0.95
            self.args.adamw_weight_decay = 0.1
            self.args.adamw_eps = 1e-8
        elif self.args.optimizer == "muon":
            self.args.learning_rate = 2e-2
            self.args.muon_momentum = 0.95
            self.args.weight_decay = 0.0

        if self.master_process:
            print(f"Applied {preset} preset for optimizer '{self.args.optimizer}'.")
```
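The two presets sit on very different scales: Muon's learning rate (2e-2) is about 30× AdamW's (6e-4) because Muon orthogonalizes each momentum update before applying it, so update magnitudes no longer track the raw gradient scale. A minimal sketch of a Muon step for one 2-D weight matrix, using the Newton-Schulz coefficients from the reference Muon implementation (an assumption; this repo's Muon class may differ):

```python
import torch

def newton_schulz5(G: torch.Tensor, steps: int = 5, eps: float = 1e-7) -> torch.Tensor:
    # Approximately orthogonalize G via a quintic Newton-Schulz iteration.
    # (a, b, c) follow Keller Jordan's reference Muon code.
    a, b, c = 3.4445, -4.7750, 2.0315
    X = G / (G.norm() + eps)      # normalize so the iteration converges
    transposed = G.size(0) > G.size(1)
    if transposed:
        X = X.T                   # iterate on the wide orientation
    for _ in range(steps):
        A = X @ X.T
        X = a * X + (b * A + c * A @ A) @ X
    return X.T if transposed else X

def muon_step(param, grad, momentum_buf, lr=2e-2, momentum=0.95):
    """One Muon update for a single 2-D weight (a sketch, not this repo's API)."""
    momentum_buf.mul_(momentum).add_(grad)                  # SGD-style momentum
    nesterov_grad = grad.add(momentum_buf, alpha=momentum)  # Nesterov lookahead
    param.data.add_(newton_schulz5(nesterov_grad), alpha=-lr)

# Toy usage with the preset's values (lr=2e-2, momentum=0.95)
W = torch.randn(384, 384)
buf = torch.zeros_like(W)
muon_step(W, torch.randn_like(W), buf)
```

Note that the sketch applies no decay term, consistent with the preset setting `weight_decay = 0.0` for Muon.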
Comment on lines +510 to +518:
Suggested change:

```diff
+            if self.master_process:
+                print(f"Applied {preset} preset for optimizer 'adamw'.")
         elif self.args.optimizer == "muon":
             self.args.learning_rate = 2e-2
             self.args.muon_momentum = 0.95
             self.args.weight_decay = 0.0
-
-        if self.master_process:
-            print(f"Applied {preset} preset for optimizer '{self.args.optimizer}'.")
+            if self.master_process:
+                print(f"Applied {preset} preset for optimizer 'muon'.")
+        else:
+            if self.master_process:
+                print(f"Warning: No '{preset}' preset available for optimizer '{self.args.optimizer}'. No preset values were applied.")
```
The `muon_momentum` parameter is redundantly specified here, since line 512 in the `_apply_optimizer_presets()` method already sets it to 0.95 when the speedrun preset is used with the muon optimizer. This redundant specification could cause confusion about which value takes precedence.
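To make the precedence concrete: if, as the diff suggests, `_apply_optimizer_presets()` runs after the sweep config has populated `args`, the preset value silently wins. A minimal self-contained illustration (the call order is an assumption based on the diff above):

```python
from types import SimpleNamespace

# Pretend the YAML asked for a different momentum than the preset's 0.95.
args = SimpleNamespace(optimizer="muon", optimizer_preset="speedrun",
                       muon_momentum=0.90)

# What _apply_optimizer_presets() effectively does for this combination:
if args.optimizer_preset == "speedrun" and args.optimizer == "muon":
    args.muon_momentum = 0.95  # preset overwrites the config value

print(args.muon_momentum)  # 0.95 -> the preset, not the YAML, takes precedence
```

Here the two values happen to agree (both 0.95), so the redundancy is harmless today, but it would silently mask a future change to either one.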