From d92c3858c14f6339a15bbbcd8d4d0d6f1eebea5f Mon Sep 17 00:00:00 2001
From: Alec
Date: Mon, 27 Jan 2020 00:29:30 -0600
Subject: [PATCH 1/4] adding adam.rs

---
 src/learning/optim/adam.rs | 74 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 src/learning/optim/adam.rs

diff --git a/src/learning/optim/adam.rs b/src/learning/optim/adam.rs
new file mode 100644
index 00000000..3230205d
--- /dev/null
+++ b/src/learning/optim/adam.rs
@@ -0,0 +1,74 @@
+use learning::optim::{Optimizable, OptimAlgorithm};
+use linalg::Vector;
+use linalg::{Matrix, BaseMatrix};
+use rulinalg::utils;
+
+use learning::toolkit::rand_utils;
+
+// Adam Optimizer
+pub struct Adam {
+    alpha: f64,
+    beta1: f64,
+    beta2: f64,
+    epsilon: f64,
+    iters: usize
+}
+
+// The default ADAM configuration
+//
+// The defaults are:
+//
+// - alpha = 0.001 (lr)
+// - beta1 = 0.09 (dw)
+// - beta2 = 0.999 (dw^2)
+// - epsilon = 1e-8
+// - iters = 50
+// source: https://arxiv.org/pdf/1412.6980.pdf
+impl Default for Adam {
+    fn default() -> Adam {
+        Adam {
+            alpha: 0.001,
+            beta1: 0.09,
+            beta2: 0.999,
+            epsilon: 1e-8,
+            iters: 50
+        }
+    }
+}
+
+impl Adam {
+    // Construct an Adam algorithm.
+    //
+    // Requires learning rate, exponential decay rates, epsilon, and iteration count.
+    pub fn new(learning_rate: f64, beta1: f64, beta2: f64, epsilon: f64, iters: usize) -> Adam {
+        assert!(0f64 < learning_rate, "The learning rate must be positive");
+        assert!(0f64 <= beta1 < 1, "Beta value be within the range of [0,1)");
+        assert!(0f64 <= beta2 < 1, "Beta value be within the range of [0,1)");
+        assert!(0f64 < epsilon, "Epsilon must be positive");
+
+        Adam {
+            alpha: learning_rate,
+            beta1: beta1,
+            beta2: beta2,
+            epsilon: epsilon,
+            iters: iters
+        }
+    }
+}
+
+impl<M> OptimAlgorithm<M> for Adam
+    where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
+    fn optimize(&self,
+                model: &M,
+                start: &[f64],
+                inputs: &M::Inputs,
+                targets: &M::Targets)
+                -> Vec<f64> {
+        // Initial parameters
+        let mut params: f64 = Vector::new(start.to_vec());
+
+        // moment
+        let mut m, v, t = 0f64;
+    }
+
+}
\ No newline at end of file

From bd3f1b5a189c073a1d767fbfad2dcd0b61adf2f4 Mon Sep 17 00:00:00 2001
From: Alec
Date: Tue, 28 Jan 2020 19:32:00 -0600
Subject: [PATCH 2/4] Working?

---
 src/learning/optim/adam.rs | 120 +++++++++++++++++++++++++++----------
 src/lib.rs                 |   1 +
 2 files changed, 90 insertions(+), 31 deletions(-)

diff --git a/src/learning/optim/adam.rs b/src/learning/optim/adam.rs
index 3230205d..172d2067 100644
--- a/src/learning/optim/adam.rs
+++ b/src/learning/optim/adam.rs
@@ -1,3 +1,7 @@
+//! Adam Optimizer
+//!
+//! Implementation of the ADAM optimization algorithm.
+//!
 use learning::optim::{Optimizable, OptimAlgorithm};
 use linalg::Vector;
 use linalg::{Matrix, BaseMatrix};
@@ -5,7 +9,10 @@ use rulinalg::utils;
 
 use learning::toolkit::rand_utils;
 
-// Adam Optimizer
+const EVAL_STEP: usize = 10;
+
+/// Adam Optimizer
+#[derive(Debug)]
 pub struct Adam {
     alpha: f64,
     beta1: f64,
@@ -14,36 +21,15 @@ pub struct Adam {
     iters: usize
 }
 
-// The default ADAM configuration
-//
-// The defaults are:
-//
-// - alpha = 0.001 (lr)
-// - beta1 = 0.09 (dw)
-// - beta2 = 0.999 (dw^2)
-// - epsilon = 1e-8
-// - iters = 50
-// source: https://arxiv.org/pdf/1412.6980.pdf
-impl Default for Adam {
-    fn default() -> Adam {
-        Adam {
-            alpha: 0.001,
-            beta1: 0.09,
-            beta2: 0.999,
-            epsilon: 1e-8,
-            iters: 50
-        }
-    }
-}
-
 impl Adam {
-    // Construct an Adam algorithm.
-    //
-    // Requires learning rate, exponential decay rates, epsilon, and iteration count.
+    /// Construct an Adam algorithm.
+    ///
+    /// Requires learning rate, exponential decay rates, epsilon, and iteration count.
     pub fn new(learning_rate: f64, beta1: f64, beta2: f64, epsilon: f64, iters: usize) -> Adam {
         assert!(0f64 < learning_rate, "The learning rate must be positive");
-        assert!(0f64 <= beta1 < 1, "Beta value be within the range of [0,1)");
-        assert!(0f64 <= beta2 < 1, "Beta value be within the range of [0,1)");
+        assert!((0f64 <= beta1 && beta1 < 1f64), "Beta value must be within the range of [0,1)");
+        assert!((0f64 <= beta2 && beta2 < 1f64), "Beta value must be within the range of [0,1)");
         assert!(0f64 < epsilon, "Epsilon must be positive");
 
         Adam {
@@ -56,8 +42,30 @@ impl Adam {
     }
 }
 
+/// The default ADAM configuration
+///
+/// The defaults are:
+///
+/// - alpha = 0.001 (lr)
+/// - beta1 = 0.09 (dw)
+/// - beta2 = 0.999 (dw^2)
+/// - epsilon = 1e-8
+/// - iters = 50
+/// source: https://arxiv.org/pdf/1412.6980.pdf
+impl Default for Adam {
+    fn default() -> Adam {
+        Adam {
+            alpha: 0.001,
+            beta1: 0.09,
+            beta2: 0.999,
+            epsilon: 1e-8,
+            iters: 100
+        }
+    }
+}
+
 impl<M> OptimAlgorithm<M> for Adam
-    where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
+    where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
     fn optimize(&self,
                 model: &M,
                 start: &[f64],
                 inputs: &M::Inputs,
                 targets: &M::Targets)
                 -> Vec<f64> {
         // Initial parameters
-        let mut params: f64 = Vector::new(start.to_vec());
+        let mut params = Vector::new(start.to_vec());
+
+        // Set up the indices for permutation
+        let mut permutation = (0..inputs.rows()).collect::<Vec<usize>>();
+
+        // moment vectors & timestep
+        let mut t: f64 = 0.0;
+        let mut m = Vector::zeros(start.len());
+        let mut v = Vector::zeros(start.len());
+
+
+        let mut loss_vector: Vec<f64> = vec![];
+
+        for l in 0..self.iters {
+            // The cost at the end of each pass
-
-        // moment
-        let mut m, v, t = 0f64;
+
+            if l % EVAL_STEP == 0 && l > 0 {
+                let average_loss: f64 = loss_vector.iter().sum::<f64>() / loss_vector.len() as f64;
+                println!("Running average loss iter {:#?}: {:#?}", l, average_loss);
+            }
+
+            // Permute the indices
+            rand_utils::in_place_fisher_yates(&mut permutation);
+            for i in &permutation {
+                // Incrementing the time step
+                t += 1.0;
+                // Compute the cost and gradient
+                let (cost, grad) = model.compute_grad(params.data(),
+                                                      &inputs.select_rows(&[*i]),
+                                                      &targets.select_rows(&[*i]));
+
+                let grad = Vector::new(grad);
+                let grad_squared = grad.clone().apply(&|x| x * x);
+
+                //Moving averages of the gradients
+                m = &m * self.beta1 + grad * (1.0 - self.beta1);
+
+                // Moving averages of the squared gradients
+                v = &v * self.beta1 + grad_squared * (1.0 - self.beta1);
+
+                // Bias-corrected estimates
+                // In the paper these are &m_hat and v_hat
+                m = &m / (1.0 - (self.beta1.powf(t)));
+                v = &v / (1.0 - (self.beta2.powf(t)));
+
+                let v_hat_sqrt = v.clone().apply(&|x| x.sqrt());
+
+                // update params
+                params = &params - ((&m * self.alpha).elediv(&(v_hat_sqrt + self.epsilon)));
+
+                loss_vector.push(cost);
+            }
+        }
+        params.into_vec()
     }
 
 }
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index d974be62..9c587285 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -200,6 +200,7 @@ pub mod learning {
 
         pub mod grad_desc;
        pub mod fmincg;
+        pub mod adam;
     }
 
     /// Module for learning tools.
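For reference, the update that the optimize loop in PATCH 2 is working toward is the one from the cited Adam paper: m <- beta1*m + (1 - beta1)*g and v <- beta2*v + (1 - beta2)*g^2, bias-corrected as m_hat = m / (1 - beta1^t) and v_hat = v / (1 - beta2^t), followed by theta <- theta - alpha * m_hat / (sqrt(v_hat) + epsilon). A minimal standalone sketch of one such step over plain slices (illustrative only; the adam_step name and signature are not part of these patches) looks like this:

    /// One Adam update over plain slices; `t` is the 1-based timestep.
    /// Illustrative sketch only -- the patch above does the same work on rulinalg Vectors.
    fn adam_step(params: &mut [f64], grad: &[f64],
                 m: &mut [f64], v: &mut [f64],
                 t: f64, alpha: f64, beta1: f64, beta2: f64, epsilon: f64) {
        for i in 0..params.len() {
            // Exponential moving averages of the gradient and the squared gradient
            m[i] = beta1 * m[i] + (1.0 - beta1) * grad[i];
            v[i] = beta2 * v[i] + (1.0 - beta2) * grad[i] * grad[i];
            // Bias-corrected first and second moment estimates
            let m_hat = m[i] / (1.0 - beta1.powf(t));
            let v_hat = v[i] / (1.0 - beta2.powf(t));
            // Parameter update
            params[i] -= alpha * m_hat / (v_hat.sqrt() + epsilon);
        }
    }

Written out this way it is easier to see that the second moving average has to be driven by beta2, which is what PATCH 3 below corrects.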
From 33ae15b313d31425eebaea5ad63ecdfe536b5c25 Mon Sep 17 00:00:00 2001
From: Alec
Date: Tue, 28 Jan 2020 20:29:44 -0600
Subject: [PATCH 3/4] fixed math error, improved memory use

---
 src/learning/optim/adam.rs | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/learning/optim/adam.rs b/src/learning/optim/adam.rs
index 172d2067..89168e7f 100644
--- a/src/learning/optim/adam.rs
+++ b/src/learning/optim/adam.rs
@@ -59,7 +59,7 @@ impl Default for Adam {
             beta1: 0.09,
             beta2: 0.999,
             epsilon: 1e-8,
-            iters: 100
+            iters: 50
         }
     }
 }
@@ -108,20 +108,21 @@ impl<M> OptimAlgorithm<M> for Adam
                 let grad_squared = grad.clone().apply(&|x| x * x);
 
                 //Moving averages of the gradients
-                m = &m * self.beta1 + grad * (1.0 - self.beta1);
+                m = m * self.beta1 + grad * (1.0 - self.beta1);
 
                 // Moving averages of the squared gradients
-                v = &v * self.beta1 + grad_squared * (1.0 - self.beta1);
+                v = v * self.beta2 + grad_squared * (1.0 - self.beta2);
 
                 // Bias-corrected estimates
-                // In the paper these are &m_hat and v_hat
-                m = &m / (1.0 - (self.beta1.powf(t)));
-                v = &v / (1.0 - (self.beta2.powf(t)));
+                let mut m_hat = &m / (1.0 - (self.beta1.powf(t)));
+                let mut v_hat = &v / (1.0 - (self.beta2.powf(t)));
 
-                let v_hat_sqrt = v.clone().apply(&|x| x.sqrt());
+                utils::in_place_vec_bin_op(m_hat.mut_data(), v_hat.data(), |x, &y| {
+                    *x = (*x / (y.sqrt() + self.epsilon)) * self.alpha;
+                });
 
                 // update params
-                params = &params - ((&m * self.alpha).elediv(&(v_hat_sqrt + self.epsilon)));
+                params = &params - &m_hat;
 
                 loss_vector.push(cost);
             }

From 1d133c41ad0e3a090fbd6dbaffe3be1b025f0632 Mon Sep 17 00:00:00 2001
From: Alec
Date: Wed, 29 Jan 2020 17:01:22 -0600
Subject: [PATCH 4/4] fixing some documentation

---
 src/learning/optim/adam.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/learning/optim/adam.rs b/src/learning/optim/adam.rs
index 89168e7f..d7070281 100644
--- a/src/learning/optim/adam.rs
+++ b/src/learning/optim/adam.rs
@@ -1,7 +1,6 @@
 //! Adam Optimizer
 //!
 //! Implementation of the ADAM optimization algorithm.
-//!
 use learning::optim::{Optimizable, OptimAlgorithm};
 use linalg::Vector;
 use linalg::{Matrix, BaseMatrix};
 use rulinalg::utils;
@@ -51,7 +50,7 @@ impl Adam {
 /// - beta2 = 0.999 (dw^2)
 /// - epsilon = 1e-8
 /// - iters = 50
-/// source: https://arxiv.org/pdf/1412.6980.pdf
+/// Source: https://arxiv.org/pdf/1412.6980.pdf
 impl Default for Adam {
     fn default() -> Adam {
         Adam {
@@ -78,17 +77,16 @@ impl<M> OptimAlgorithm<M> for Adam
         // Set up the indices for permutation
         let mut permutation = (0..inputs.rows()).collect::<Vec<usize>>();
 
-        // moment vectors & timestep
+        // Moment vectors & timestep
         let mut t: f64 = 0.0;
         let mut m = Vector::zeros(start.len());
         let mut v = Vector::zeros(start.len());
 
-
+        // Vector for tracking loss
         let mut loss_vector: Vec<f64> = vec![];
 
         for l in 0..self.iters {
-            // The cost at the end of each pass
-
+            // Printing running average loss
             if l % EVAL_STEP == 0 && l > 0 {
                 let average_loss: f64 = loss_vector.iter().sum::<f64>() / loss_vector.len() as f64;
                 println!("Running average loss iter {:#?}: {:#?}", l, average_loss);
             }
@@ -115,15 +113,17 @@ impl<M> OptimAlgorithm<M> for Adam
                 // Bias-corrected estimates
                 let mut m_hat = &m / (1.0 - (self.beta1.powf(t)));
-                let mut v_hat = &v / (1.0 - (self.beta2.powf(t)));
+                let v_hat = &v / (1.0 - (self.beta2.powf(t)));
 
+                // Final update step: alpha * m_hat / (sqrt(v_hat) + epsilon)
                 utils::in_place_vec_bin_op(m_hat.mut_data(), v_hat.data(), |x, &y| {
                     *x = (*x / (y.sqrt() + self.epsilon)) * self.alpha;
                 });
 
-                // update params
+                // Update params
                 params = &params - &m_hat;
 
+                // Update loss vector
                 loss_vector.push(cost);
             }
         }
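To exercise the optimizer end to end, a throwaway test along the following lines could accompany the module. Everything named here beyond the patches themselves is an assumption: the XSqModel toy model, the data, the learning rate, the iteration count and the tolerance are all illustrative, and the sketch relies only on the Optimizable contract already used above (compute_grad returning a (cost, gradient) pair):

    #[cfg(test)]
    mod tests {
        use learning::optim::adam::Adam;
        use learning::optim::{Optimizable, OptimAlgorithm};
        use linalg::{BaseMatrix, Matrix};

        /// Toy model: cost = sum over rows of (w * x - y)^2 with a single parameter w.
        struct XSqModel;

        impl Optimizable for XSqModel {
            type Inputs = Matrix<f64>;
            type Targets = Matrix<f64>;

            fn compute_grad(&self, params: &[f64], inputs: &Matrix<f64>, targets: &Matrix<f64>)
                            -> (f64, Vec<f64>) {
                let w = params[0];
                let mut cost = 0f64;
                let mut grad = 0f64;
                for i in 0..inputs.rows() {
                    // Accumulate squared error and its derivative with respect to w
                    let (x, y) = (inputs[[i, 0]], targets[[i, 0]]);
                    let err = w * x - y;
                    cost += err * err;
                    grad += 2.0 * err * x;
                }
                (cost, vec![grad])
            }
        }

        #[test]
        fn adam_fits_a_toy_line() {
            // Targets follow y = 2x, so the optimum is w = 2.
            let inputs = Matrix::new(4, 1, vec![1.0, 2.0, 3.0, 4.0]);
            let targets = Matrix::new(4, 1, vec![2.0, 4.0, 6.0, 8.0]);

            let adam = Adam::new(0.05, 0.9, 0.999, 1e-8, 200);
            let params = adam.optimize(&XSqModel, &[0.0], &inputs, &targets);

            assert!((params[0] - 2.0).abs() < 0.2);
        }
    }

A single-parameter least-squares model keeps the gradient easy to verify by hand while still passing through the same select_rows/compute_grad path the optimizer uses for any other model.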