From d843fad58b5cbec93129746d5c9a1cba86e8237f Mon Sep 17 00:00:00 2001 From: zephyr-sh Date: Tue, 8 Jul 2025 09:45:16 +0800 Subject: [PATCH 1/2] [F] Fixed `PolynomialLRWarmup` & `MultiStepLRWarmUp` error --- chameleon/base/optim/__init__.py | 2 +- chameleon/base/optim/polynomial_lr_warmup.py | 98 ++++++----- chameleon/base/optim/warm_up.py | 162 ++++++++++++++----- 3 files changed, 184 insertions(+), 78 deletions(-) diff --git a/chameleon/base/optim/__init__.py b/chameleon/base/optim/__init__.py index 6ec0a25..9c5dbf4 100644 --- a/chameleon/base/optim/__init__.py +++ b/chameleon/base/optim/__init__.py @@ -21,4 +21,4 @@ OPTIMIZERS.register_module(name=k, force=True, module=globals()[k]) -__all__ += ['PolynomialLRWarmup', 'WrappedLRScheduler'] +__all__ += ['PolynomialLRWarmup', 'WrappedLRScheduler', 'MultiStepLRWarmUp'] diff --git a/chameleon/base/optim/polynomial_lr_warmup.py b/chameleon/base/optim/polynomial_lr_warmup.py index e07d3f9..d51835d 100644 --- a/chameleon/base/optim/polynomial_lr_warmup.py +++ b/chameleon/base/optim/polynomial_lr_warmup.py @@ -1,55 +1,79 @@ -import warnings +from typing import List -from torch.optim.lr_scheduler import _LRScheduler +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler from ...registry import OPTIMIZERS @OPTIMIZERS.register_module() -class PolynomialLRWarmup(_LRScheduler): +class PolynomialLRWarmup(LRScheduler): + """ + Scheduler with an initial linear warm-up followed by polynomial decay. + + - For the first `warmup_iters` steps, LR increases linearly + from 0 -> base_lr. + - For steps `warmup_iters < step <= total_iters`, LR decays as + base_lr * (1 - (step - warmup_iters) / (total_iters - warmup_iters))^power. + - After `total_iters`, LR is held at the final decayed value. + + Args: + optimizer (Optimizer): Wrapped optimizer. + warmup_iters (int): Number of steps for linear warm-up; must be ≥ 0. + total_iters (int): Total number of steps for warm-up + decay; must be ≥ warmup_iters. 
+ power (float): Exponent for polynomial decay. Default: 1.0 (linear). + last_epoch (int): The index of last step. Default: -1 (start from step 0). + Note: the former `verbose` argument is no longer accepted. + """ def __init__( self, - optimizer, - warmup_iters, - total_iters=5, - power=1.0, - last_epoch=-1, - verbose=False + optimizer: Optimizer, + warmup_iters: int, + total_iters: int, + power: float = 1.0, + last_epoch: int = -1, ): - super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose) + # input validation + if warmup_iters < 0: + raise ValueError(f"warmup_iters must be >= 0, got {warmup_iters}") + if total_iters < warmup_iters: + raise ValueError( + f"total_iters ({total_iters}) must be >= warmup_iters ({warmup_iters})") + if power < 0: + raise ValueError(f"power must be non-negative, got {power}") + + self.warmup_iters = warmup_iters self.total_iters = total_iters self.power = power - self.warmup_iters = warmup_iters - def get_lr(self): - if not self._get_lr_called_within_step: - warnings.warn("To get the last learning rate computed by the scheduler, " - "please use `get_last_lr()`.", UserWarning) + super().__init__(optimizer, last_epoch) - if self.last_epoch == 0 or self.last_epoch > self.total_iters: - return [group["lr"] for group in self.optimizer.param_groups] + def get_closed_form(self) -> List[float]: + """ + Compute the learning rate for the current `last_epoch` in closed form. + Invoked through `get_lr()`, which simply returns this method's result. 
+ """ + # Clamp epoch to [0, total_iters] + epoch = min(max(self.last_epoch, 0), self.total_iters) - if self.last_epoch <= self.warmup_iters: - return [base_lr * self.last_epoch / self.warmup_iters for base_lr in self.base_lrs] - else: - l = self.last_epoch - w = self.warmup_iters - t = self.total_iters - decay_factor = ((1.0 - (l - w) / (t - w)) / - (1.0 - (l - 1 - w) / (t - w))) ** self.power - return [group["lr"] * decay_factor for group in self.optimizer.param_groups] - - def _get_closed_form_lr(self): - - if self.last_epoch <= self.warmup_iters: - return [ - base_lr * self.last_epoch / self.warmup_iters for base_lr in self.base_lrs] - else: + # 1) Warm-up phase + if epoch <= self.warmup_iters: return [ - ( - base_lr * (1.0 - (min(self.total_iters, self.last_epoch) - self.warmup_iters) / ( - self.total_iters - self.warmup_iters)) ** self.power - ) + base_lr * + (epoch / self.warmup_iters if self.warmup_iters > 0 else 1.0) for base_lr in self.base_lrs ] + + # 2) Polynomial decay phase + decay_steps = epoch - self.warmup_iters + decay_total = self.total_iters - self.warmup_iters + factor = (1.0 - decay_steps / decay_total) ** self.power + return [base_lr * factor for base_lr in self.base_lrs] + + def get_lr(self) -> List[float]: + """ + Return the learning rates for the current `last_epoch`. + This method only delegates to get_closed_form(). 
+ """ + return self.get_closed_form() diff --git a/chameleon/base/optim/warm_up.py b/chameleon/base/optim/warm_up.py index 65ce76d..3ba2861 100644 --- a/chameleon/base/optim/warm_up.py +++ b/chameleon/base/optim/warm_up.py @@ -1,28 +1,33 @@ -from typing import List +from typing import List, Optional from torch.optim import Optimizer -from torch.optim.lr_scheduler import _LRScheduler +from torch.optim.lr_scheduler import LRScheduler, MultiStepLR +# Project-specific registry (keep as-is or remove if unused) from ...registry import OPTIMIZERS -__all__ = ['WrappedLRScheduler'] +__all__ = ["WrappedLRScheduler", "MultiStepLRWarmUp"] @OPTIMIZERS.register_module() -class WrappedLRScheduler(_LRScheduler): +class WrappedLRScheduler(LRScheduler): """ - Gradually warm-up(increasing) learning rate in optimizer. - Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'. + Gradual warmup scheduler. + + During the first `milestone` steps (or epochs), the learning rate + increases linearly from 0 (or base_lr) up to base_lr * multiplier. + After warmup completes, scheduling is delegated to `after_scheduler`. + Args: optimizer (Optimizer): Wrapped optimizer. - milestone (int): - milestone step for warm-up. - multiplier (float): - A factor to multiply base_lr. - if multiplier > 1.0, learning rate = base lr * multiplier. - if multiplier = 1.0, lr starts from 0 and ends up with the base_lr. - after_scheduler (lr_scheduler): - after target_epoch, use this scheduler(eg. ReduceLROnPlateau) + milestone (int): Number of steps (or epochs) for warmup; must be > 0. + multiplier (float, optional): Final LR = base_lr * multiplier. + - If multiplier == 1.0, warmup goes from 0 -> base_lr. + - If multiplier > 1.0, warmup goes from base_lr -> base_lr * multiplier. + after_scheduler (LRScheduler, optional): Scheduler to use after warmup. + last_epoch (int, optional): The index of last epoch. Default: -1. 
+ Note: this constructor takes no `verbose` argument; only `optimizer` + and `last_epoch` are forwarded to the base class. """ def __init__( @@ -30,39 +35,116 @@ def __init__( optimizer: Optimizer, milestone: int, multiplier: float = 1.0, - after_scheduler: _LRScheduler = None, - interval='step' + after_scheduler: Optional[LRScheduler] = None, + last_epoch: int = -1 ): - self.multiplier = multiplier - if self.multiplier < 1.: - raise ValueError('multiplier should be greater thant or equal to 1.') + if milestone <= 0: + raise ValueError("milestone must be > 0.") + if multiplier < 1.0: + raise ValueError("multiplier must be >= 1.0.") + self.milestone = milestone + self.multiplier = multiplier self.after_scheduler = after_scheduler self.finished = False - self.interval = interval - super().__init__(optimizer) # need be set in the end of __init__ + + # Initialize base class with optimizer and last_epoch + super().__init__(optimizer, last_epoch) def get_lr(self): - # do after_scheduler - if self.last_epoch > self.milestone: - if self.after_scheduler: - if not self.finished: - self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs] - self.finished = True - return self.after_scheduler.get_last_lr() - return [base_lr * self.multiplier for base_lr in self.base_lrs] - - if self.multiplier == 1.0: - return [base_lr * (float(self.last_epoch) / self.milestone) for base_lr in self.base_lrs] - else: - return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.milestone + 1.) 
for base_lr in self.base_lrs] + # During warmup phase + if self.last_epoch <= self.milestone: + if self.multiplier == 1.0: + # Linear increase: 0 -> base_lr + return [ + base_lr * (self.last_epoch / self.milestone) + for base_lr in self.base_lrs + ] + else: + # Linear increase: base_lr -> base_lr * multiplier + return [ + base_lr * ((self.multiplier - 1.0) * + self.last_epoch / self.milestone + 1.0) + for base_lr in self.base_lrs + ] + + # After warmup completes + if self.after_scheduler is not None: + # On first transition, reset the after_scheduler's base_lrs + if not self.finished: + self.after_scheduler.base_lrs = [ + base_lr * self.multiplier for base_lr in self.base_lrs + ] + self.finished = True + # Delegate to after_scheduler + return self.after_scheduler.get_last_lr() + + # No after_scheduler: keep LR at base_lr * multiplier + return [base_lr * self.multiplier for base_lr in self.base_lrs] + + def step(self, epoch: Optional[int] = None, metrics: Optional[float] = None): + """ + Update the learning rate. - def step(self, epoch=None, metrics=None): - if self.finished and self.after_scheduler: - if epoch is None: - self.after_scheduler.step(None) + If warmup is finished and an after_scheduler is provided, + delegate the step to after_scheduler. Otherwise, call the + base class step() to continue warmup. + + Args: + epoch (int, optional): Current epoch or step index. + metrics (float, optional): Metric for ReduceLROnPlateau. 
+ """ + if self.finished and self.after_scheduler is not None: + # If using ReduceLROnPlateau (metric-based), pass metrics first + if metrics is not None and "plateau" in type(self.after_scheduler).__name__.lower(): + self.after_scheduler.step( + metrics, epoch - self.milestone if epoch is not None else None) else: - self.after_scheduler.step(epoch - self.milestone) + # Standard scheduler.step(epoch) + self.after_scheduler.step( + epoch - self.milestone if epoch is not None else None) + # Sync the last learning rates self._last_lr = self.after_scheduler.get_last_lr() else: - return super().step() + # Still in warmup or no after_scheduler: use base class logic + super().step(epoch) + + +@OPTIMIZERS.register_module(is_model_builder=True) +def MultiStepLRWarmUp( + optimizer: Optimizer, + milestones: List[int], + warmup_milestone: int, + gamma: float = 0.1, + last_epoch: int = -1 +) -> WrappedLRScheduler: + """ + Factory function to create a warmup + MultiStepLR scheduler. + + Args: + optimizer (Optimizer): Wrapped optimizer. + milestones (List[int]): List of epoch indices where LR is decayed by gamma. + warmup_milestone (int): Number of epochs for linear warmup. + gamma (float, optional): Multiplicative LR decay factor for MultiStepLR. Default: 0.1. + last_epoch (int, optional): Index of last epoch. Default: -1 (start from scratch). + + Returns: + WrappedLRScheduler: Scheduler that linearly warms up for `warmup_milestone` + epochs, then delegates to MultiStepLR. 
+ """ + # 1) create the MultiStepLR scheduler that will run *after* warmup + multi_step = MultiStepLR( + optimizer=optimizer, + milestones=milestones, + gamma=gamma, + last_epoch=last_epoch, + ) + + # 2) wrap it with linear warmup + return WrappedLRScheduler( + optimizer=optimizer, + milestone=warmup_milestone, + multiplier=1.0, # warmup from 0 -> base_lr + after_scheduler=multi_step, + last_epoch=last_epoch, + ) From edf996388b479eb991ae4a523a068bca3593e7a2 Mon Sep 17 00:00:00 2001 From: zephyr-sh Date: Tue, 8 Jul 2025 09:45:38 +0800 Subject: [PATCH 2/2] [A] Add testing for `WarmUp` funcs --- tests/base/optim/test_polynomial_lr_warmup.py | 127 +++++++++++++++ tests/base/optim/test_warm_up.py | 149 ++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 tests/base/optim/test_polynomial_lr_warmup.py create mode 100644 tests/base/optim/test_warm_up.py diff --git a/tests/base/optim/test_polynomial_lr_warmup.py b/tests/base/optim/test_polynomial_lr_warmup.py new file mode 100644 index 0000000..2eb1983 --- /dev/null +++ b/tests/base/optim/test_polynomial_lr_warmup.py @@ -0,0 +1,127 @@ +import math + +import pytest +import torch +from chameleon.base.optim.polynomial_lr_warmup import PolynomialLRWarmup +from torch.optim import SGD +from torch.optim.lr_scheduler import LRScheduler + + +def make_optimizer(lr: float = 0.1): + """Create an optimizer with a single parameter and given base LR.""" + p = torch.nn.Parameter(torch.zeros(1)) + return SGD([p], lr=lr) + + +def test_init_invalid_warmup_iters(): + """warmup_iters < 0 should raise ValueError.""" + opt = make_optimizer() + with pytest.raises(ValueError): + PolynomialLRWarmup(opt, warmup_iters=-1, total_iters=5) + with pytest.raises(ValueError): + PolynomialLRWarmup(opt, warmup_iters=5, total_iters=4) + + +def test_init_invalid_power(): + """power < 0 should raise ValueError.""" + opt = make_optimizer() + with pytest.raises(ValueError): + PolynomialLRWarmup(opt, warmup_iters=0, total_iters=5, power=-0.1) + 
+ +@pytest.mark.parametrize("warmup, total, epoch, expected_factor", [ + # warmup only, no decay + (5, 10, 0, 0.0 / 5), + (5, 10, 1, 1.0 / 5), + (5, 10, 5, 5.0 / 5), +]) +def test_linear_warmup_phase(warmup, total, epoch, expected_factor): + """Test linear increase from 0 -> base_lr over warmup_iters.""" + base_lr = 0.2 + opt = make_optimizer(base_lr) + sched = PolynomialLRWarmup( + opt, warmup_iters=warmup, total_iters=total, power=1.0) + sched.last_epoch = epoch + lr_closed = sched.get_closed_form()[0] + lr_legacy = sched.get_lr()[0] + expected_lr = base_lr * expected_factor + assert math.isclose(lr_closed, expected_lr, rel_tol=1e-6) + assert math.isclose(lr_legacy, expected_lr, rel_tol=1e-6) + + +def test_warmup_zero_iters(): + """warmup_iters=0 should immediately use decay formula starting at epoch 0.""" + base_lr = 0.3 + opt = make_optimizer(base_lr) + # with warmup_iters=0, get_closed_form multiplies by 1.0 in warmup branch + sched = PolynomialLRWarmup(opt, warmup_iters=0, total_iters=5, power=1.0) + for epoch in range(0, 3): + sched.last_epoch = epoch + # decay_total = total_iters - 0 = 5 + expected = base_lr * (1.0 - epoch / 5) ** 1.0 + assert math.isclose(sched.get_closed_form()[0], expected, rel_tol=1e-6) + + +def test_polynomial_decay_phase(): + """Test polynomial decay after warmup_iters up to total_iters.""" + base_lr = 0.4 + opt = make_optimizer(base_lr) + warmup = 2 + total = 8 + power = 2.0 + sched = PolynomialLRWarmup( + opt, warmup_iters=warmup, total_iters=total, power=power) + # test a few epochs in decay + for epoch in [3, 5, 8]: + sched.last_epoch = epoch + # compute expected factor + decay_steps = min(epoch, total) - warmup + decay_total = total - warmup + factor = (1.0 - decay_steps / decay_total) ** power + expected_lr = base_lr * factor + assert math.isclose(sched.get_closed_form()[ + 0], expected_lr, rel_tol=1e-6) + + +def test_after_total_iters_clamps_to_final(): + """Epochs > total_iters should clamp and hold lr at final decayed 
value.""" + base_lr = 0.5 + opt = make_optimizer(base_lr) + sched = PolynomialLRWarmup(opt, warmup_iters=3, total_iters=6, power=1.0) + # compute final lr at epoch=total_iters + sched.last_epoch = 6 + final_lr = sched.get_closed_form()[0] + # at epoch 9 (> total), should be equal to final_lr + sched.last_epoch = 9 + assert math.isclose(sched.get_closed_form()[0], final_lr, rel_tol=1e-6) + + +def test_scheduler_chainable_api(): + """ + Ensure that using the modern .step() API after optimizer.step() + produces the same lr as get_closed_form, up to the one-step offset + inherent in the chainable scheduler design. + """ + base_lr = 0.25 + + opt = make_optimizer(base_lr) + sched = PolynomialLRWarmup(opt, warmup_iters=2, total_iters=4, power=1.0) + seen = [] + for _ in range(6): + opt.step() + sched.step() + seen.append(opt.param_groups[0]["lr"]) + + opt2 = make_optimizer(base_lr) + sched2 = PolynomialLRWarmup(opt2, warmup_iters=2, total_iters=4, power=1.0) + manual = [sched2.get_closed_form()[0]] # initial value: a negative last_epoch is clamped to 0 + for epoch in range(1, 6): + sched2.last_epoch = epoch + manual.append(sched2.get_closed_form()[0]) + + max_idx = len(manual) - 1 + for i, lr in enumerate(seen): + expected = manual[min(i+1, max_idx)] + assert math.isclose( + lr, expected, rel_tol=1e-6 + ), f"At step {i}: chainable={lr}, expected closed_form at epoch {min(i+1, max_idx)} = {expected}" diff --git a/tests/base/optim/test_warm_up.py b/tests/base/optim/test_warm_up.py new file mode 100644 index 0000000..811d8af --- /dev/null +++ b/tests/base/optim/test_warm_up.py @@ -0,0 +1,149 @@ +import math + +import pytest +import torch +from chameleon.base.optim import MultiStepLRWarmUp, WrappedLRScheduler +from torch.optim.lr_scheduler import MultiStepLR + + +def make_optimizer(lr: float = 0.1): + """Helper to create a simple optimizer with one Linear parameter.""" + model = torch.nn.Linear(4, 2, bias=False) + return torch.optim.SGD(model.parameters(), lr=lr) + + +class 
DummyScheduler(MultiStepLR): + """A dummy MultiStepLR that exposes the base_lrs for testing.""" + pass + + +def test_wrapped_scheduler_invalid_args(): + """WrappedLRScheduler should reject non-positive milestone or multiplier < 1.0.""" + opt = make_optimizer() + with pytest.raises(ValueError): + WrappedLRScheduler(opt, milestone=0) + with pytest.raises(ValueError): + WrappedLRScheduler(opt, milestone=-1) + with pytest.raises(ValueError): + WrappedLRScheduler(opt, milestone=5, multiplier=0.5) + + +def test_warmup_linear_increase_to_base_lr(): + """ + Test that with multiplier=1.0 and no after_scheduler, + LR increases linearly from 0 to base_lr over `milestone` steps, + then stays flat. + """ + base_lr = 0.2 + opt = make_optimizer(lr=base_lr) + milestone = 5 + scheduler = WrappedLRScheduler( + opt, milestone=milestone, multiplier=1.0, after_scheduler=None) + + lrs = [] + # simulate epochs 0 through milestone+2 + for epoch in range(milestone + 3): + scheduler.step(epoch) + lrs.append(scheduler.get_last_lr()[0]) + + # At epoch=0 LR should be 0 + assert math.isclose(lrs[0], 0.0, rel_tol=1e-6) + # At epoch=milestone LR should reach base_lr + assert math.isclose(lrs[milestone], base_lr, rel_tol=1e-6) + # After milestone, LR should remain flat at base_lr + assert all(math.isclose(lr, base_lr, rel_tol=1e-6) + for lr in lrs[milestone + 1:]) + + +def test_warmup_with_multiplier_greater_than_one(): + """ + Test that with multiplier > 1.0 and no after_scheduler, + LR increases linearly from base_lr to base_lr*multiplier over `milestone` steps. 
+ """ + base_lr = 0.1 + multiplier = 2.0 + opt = make_optimizer(lr=base_lr) + milestone = 4 + scheduler = WrappedLRScheduler( + opt, milestone=milestone, multiplier=multiplier, after_scheduler=None) + + # simulate epochs 0 through milestone + for epoch in range(milestone + 1): + scheduler.step(epoch) + lr = scheduler.get_last_lr()[0] + expected = base_lr * ((multiplier - 1.0) * epoch / milestone + 1.0) + assert math.isclose( + lr, expected, rel_tol=1e-6), f"epoch={epoch}: got {lr}, expected {expected}" + + +def test_wrapped_scheduler_without_after_scheduler_freezes_after_milestone(): + """ + When there is no after_scheduler, + after warm-up milestone the LR should remain at base_lr * multiplier forever. + """ + base_lr = 0.15 + multiplier = 1.5 + opt = make_optimizer(lr=base_lr) + milestone = 3 + scheduler = WrappedLRScheduler( + opt, milestone=milestone, multiplier=multiplier, after_scheduler=None) + + # advance to beyond milestone + for epoch in range(milestone + 5): + scheduler.step(epoch) + + # last_lr should always equal base_lr * multiplier + assert math.isclose(scheduler.get_last_lr()[ + 0], base_lr * multiplier, rel_tol=1e-6) + + +def test_multisteplr_warmup_factory_and_errors(): + """ + Test that MultiStepLRWarmUp returns a WrappedLRScheduler + and that invalid warmup_milestone raises an error. 
+ """ + opt = make_optimizer() + # Valid factory call + sched = MultiStepLRWarmUp( + opt, milestones=[2, 4], warmup_milestone=2, gamma=0.5) + assert isinstance(sched, WrappedLRScheduler) + + # Invalid warmup_milestone propagates ValueError from WrappedLRScheduler + with pytest.raises(ValueError): + MultiStepLRWarmUp( + opt, milestones=[2, 4], warmup_milestone=0, gamma=0.5) + + +def test_multisteplr_warmup_delegates_to_multisteplr(): + """ + Test end-to-end behavior of MultiStepLRWarmUp: + - Warm-up for `warmup_milestone` epochs (0,1,2) + - Then apply MultiStepLR at epoch >= warmup_milestone+1 + """ + base_lr = 0.1 + opt = make_optimizer(lr=base_lr) + wrapped = MultiStepLRWarmUp( + optimizer=opt, + milestones=[3, 6], + warmup_milestone=2, + gamma=0.1, + last_epoch=-1 + ) + + lrs = [] + for epoch in range(8): + wrapped.step(epoch) + lrs.append(wrapped.get_last_lr()[0]) + + # epochs 0,1,2: warm-up 0 -> base_lr + assert math.isclose(lrs[0], 0.0, rel_tol=1e-6) + assert math.isclose(lrs[1], base_lr * 1/2, rel_tol=1e-6) + assert math.isclose(lrs[2], base_lr, rel_tol=1e-6) + + # epochs 3,4: still no decay + assert all(math.isclose(lrs[e], base_lr, rel_tol=1e-6) for e in [3, 4]) + + # epochs 5,6,7: decay by gamma once + expected_decay_lr = base_lr * 0.1 + assert all(math.isclose(lrs[e], expected_decay_lr, rel_tol=1e-6) + for e in [5, 6, 7])