diff --git a/Cargo.toml b/Cargo.toml index 34979915..1cf74dab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ description = "A machine learning library." repository = "https://github.com/AtheMathmo/rusty-machine" documentation = "https://AtheMathmo.github.io/rusty-machine/" keywords = ["machine","learning","stats","data","machine-learning"] +categories = ["science"] readme = "README.md" license = "MIT" @@ -15,6 +16,6 @@ stats = [] datasets = [] [dependencies] -num = { version = "0.1.35", default-features = false } -rand = "0.3.14" -rulinalg = "0.3.7" +num = { version = "0.1.36", default-features = false } +rand = "0.3.15" +rulinalg = "0.4.2" diff --git a/benches/examples/cross_validation.rs b/benches/examples/cross_validation.rs index d2adfa8a..e0450fb3 100644 --- a/benches/examples/cross_validation.rs +++ b/benches/examples/cross_validation.rs @@ -28,7 +28,7 @@ struct DummyModel { impl SupModel, Matrix> for DummyModel { fn predict(&self, inputs: &Matrix) -> LearningResult> { let predictions: Vec = inputs - .iter_rows() + .row_iter() .map(|row| { self.sum + sum(row.iter()) }) .collect(); Ok(Matrix::new(inputs.rows(), 1, predictions)) diff --git a/benches/examples/k_means.rs b/benches/examples/k_means.rs index 8691ec05..3791962f 100644 --- a/benches/examples/k_means.rs +++ b/benches/examples/k_means.rs @@ -20,10 +20,10 @@ fn generate_data(centroids: &Matrix, points_per_centroid: usize, noise: f64 for _ in 0..points_per_centroid { // Generate points from each centroid - for centroid in centroids.iter_rows() { + for centroid in centroids.row_iter() { // Generate a point randomly around the centroid let mut point = Vec::with_capacity(centroids.cols()); - for feature in centroid { + for feature in centroid.iter() { point.push(feature + normal_rv.ind_sample(&mut rng)); } diff --git a/examples/README.md b/examples/README.md index cee33482..f201ac5a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -9,6 +9,7 @@ This directory gathers fully-fledged programs, each using a piece of * [SVM](#svm) * [Neural Networks](#neural-networks) * [Naïve Bayes](#naïve-bayes) +* [LDA](#lda) ## The Examples @@ -165,3 +166,71 @@ Predicted: White; Actual: White; Accurate? true Predicted: Red; Actual: Red; Accurate? true Accuracy: 822/1000 = 82.2% ``` + +### LDA + +#### Word distribution + +The [word distribution](lda_gen.rs) example starts by generating a distribution +of words over topics, then generating documents based on a distribution of +topics. The example then tries to estimate the distribution of words based only +on the generated documents. + +The generated distribution (G) of words are visualized as a grid, with each cell +in the grid corresponding to the probability of a particular word being +selected. Following this, documents (D) are generated based on a distribution +over these topics. + +The distribution for each topic is shown, then Linear Dirichlet Allocation is +used to try to estimate the distribution (E) of words to topic, based solely on +generated documents (D). + +The resulting word distribution(E) is printed. The order may not be the same, +but for each estimated topic in (E), there should be a corresponding generated +distribution in (G). + +Sample run: +``` +$ cargo run --example lda_gen +... +Creating word distribution +Distribution generated: +Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 +------- ------- ------- ------- ------- +99999 ..... ..... ..... ..... +..... 99999 ..... ..... ..... +..... ..... 99999 ..... ..... +..... ..... ..... 99999 ..... +..... ..... ..... ..... 
99999 + + +Topic 6 Topic 7 Topic 8 Topic 9 Topic 10 +------- ------- ------- ------- ------- +9.... .9... ..9.. ...9. ....9 +9.... .9... ..9.. ...9. ....9 +9.... .9... ..9.. ...9. ....9 +9.... .9... ..9.. ...9. ....9 +9.... .9... ..9.. ...9. ....9 + + +Generating documents +Predicting word distribution from generated documents +Prediction completed. Predicted word distribution: +(Should be similar to generated distribution above) +Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 +------- ------- ------- ------- ------- +..8.. ..... ..... ....8 8.... +..8.. ..... ..... ....8 8.... +..9.. 98888 ..... ....9 8.... +..8.. ..... ..... ....8 8.... +..8.. ..... 88988 ....8 9.... + + +Topic 6 Topic 7 Topic 8 Topic 9 Topic 10 +------- ------- ------- ------- ------- +...8. ..... .8... ..... 89888 +...8. ..... .8... 88889 ..... +...8. ..... .9... ..... ..... +...8. 88889 .8... ..... ..... +...9. ..... .8... ..... ..... +``` diff --git a/examples/k-means_generating_cluster.rs b/examples/k-means_generating_cluster.rs index 75265447..078851df 100644 --- a/examples/k-means_generating_cluster.rs +++ b/examples/k-means_generating_cluster.rs @@ -24,10 +24,10 @@ fn generate_data(centroids: &Matrix, for _ in 0..points_per_centroid { // Generate points from each centroid - for centroid in centroids.iter_rows() { + for centroid in centroids.row_iter() { // Generate a point randomly around the centroid let mut point = Vec::with_capacity(centroids.cols()); - for feature in centroid { + for feature in centroid.iter() { point.push(feature + normal_rv.ind_sample(&mut rng)); } diff --git a/examples/lda_gen.rs b/examples/lda_gen.rs new file mode 100644 index 00000000..6c97efc8 --- /dev/null +++ b/examples/lda_gen.rs @@ -0,0 +1,193 @@ +/// An example of how Latent Diriclhet Allocation (LDA) can be used. This example begins by +/// generating a distribution of words to categories. This distribution is created so that +/// there are 10 topics. Each of the 25 words are assigned to two topics with equal probability. +/// (The distribution of words is printed to the screen as a chart. Each entry in the chart +/// corresponds to a word in the vocabulary, arranged into a square for easy viewing). Documents +/// are then generated based on these distributions (each topic is assumed equally likely to be +/// assigned to a document, but each document has only one topic). +/// +/// Once the documents are created, then the example uses LDA to attempt to reverse engineer the +/// distrbution of words, and prints the results to the screen for comparison. + +extern crate rusty_machine; +extern crate rand; +extern crate rulinalg; + +use rusty_machine::linalg::{Matrix, BaseMatrix, Vector}; +use rusty_machine::data::transforms::{TransformFitter, LDAFitter}; + +use rand::{thread_rng, Rng}; +use rand::distributions::{gamma, IndependentSample}; + +use std::cmp::max; + +// These constants control the generation algorithm. You can set them how you wish, +// although very large values for TOPIC_COUNT size will cause problems. + +// TOPIC_COUNT should be even +const TOPIC_COUNT:usize = 10; +const DOCUMENT_LENGTH:usize = 100; +const DOCUMENT_COUNT:usize = 500; +const ALPHA:f64 = 0.1; +const ITERATION_COUNT:usize = 300; + +/// Given `topic_count` topics, this function will create a distrbution of words for each +/// topic. For simplicity, this function assumes that the total number of words in the corpus +/// will be `(topic_count / 2)^2`. 
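+/// With the constants above (`TOPIC_COUNT = 10`), this gives a 5x5 grid of 25 words:
+/// the first five topics each spread probability 1/5 across one *row* of the grid, and the
+/// last five each spread probability 1/5 across one *column*, matching the generated
+/// distribution charts printed by `main`.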
+fn generate_word_distribution(topic_count: usize) -> Matrix { + let width = topic_count / 2; + let vocab_size = width * width; + let initial_value = 1.0 / width as f64; + Matrix::from_fn(topic_count, vocab_size, |col, row| { + if row < width { + // Horizontal topics + if col / width == row { + initial_value + } else { + 0.0 + } + } else { + //Vertical topics + if col % width == (row - width) { + initial_value + } else { + 0.0 + } + } + }) +} + +/// Samples `count` times from a dirichlet distribution with alpha as given and +/// beta 1.0. +fn get_dirichlet(count: usize, alpha: f64) -> Vector { + let mut rng = thread_rng(); + let g_dist = gamma::Gamma::new(alpha, 1.0); + let result = Vector::from_fn(count, |_| { + g_dist.ind_sample(&mut rng) + }); + let sum = result.sum(); + result / sum +} + +/// Generates a document based on a word distributiion as given. The topics are randomly sampled +/// from a dirichlet distribution and then the word sampled from the selected topic. +fn generate_document(word_distribution: &Matrix, topic_count:usize, vocab_size: usize, document_length: usize, alpha: f64) -> Vec { + let mut document = vec![0; vocab_size]; + let topic_distribution = get_dirichlet(topic_count, alpha); + for _ in 0..document_length { + let topic = choose_from(&topic_distribution); + let word = choose_from(&word_distribution.row(topic).into()); + document[word] += 1; + } + document +} + +/// Generate a collection of documents based on the word distribution +fn generate_documents(word_distribution: &Matrix, topic_count: usize, vocab_size: usize, document_count: usize, document_length: usize, alpha: f64) -> Matrix { + let mut documents = Vec::with_capacity(vocab_size * document_count); + for _ in 0..document_count { + documents.append(&mut generate_document(word_distribution, topic_count, vocab_size, document_length, alpha)); + } + Matrix::new(document_count, vocab_size, documents) +} + +/// Chooses from a vector of probailities. 
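+/// This is inverse-CDF sampling: a uniform draw in [0, 1) is compared against the running
+/// cumulative sum of the probabilities, and the first index whose cumulative total reaches
+/// the draw is returned (e.g. for probabilities [0.2, 0.5, 0.3] a draw of 0.6 picks index 1).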
+fn choose_from(probability: &Vector) -> usize { + let mut rng = thread_rng(); + let selection:f64 = rng.next_f64(); + let mut total:f64 = 0.0; + for (index, p) in probability.iter().enumerate() { + total += *p; + if total >= selection { + return index; + } + } + return probability.size() - 1; +} + +/// Displays the distrbution of words to a topic as a square graph +fn topic_to_string(topic: &Vector, width: usize, topic_index: usize) -> String { + let max = topic.iter().fold(0.0, |a, b|{ + if a > *b { + a + } else { + *b + } + }); + let mut result = String::with_capacity(topic.size() * (topic.size()/width) + 18); + result.push_str(&format!("Topic {}\n", topic_index)); + result.push_str("-------\n"); + for (index, element) in topic.iter().enumerate() { + let col = index % width; + let out = element / max * 9.0; + if out >= 1.0 { + result.push_str(&(out as u32).to_string()); + } else { + result.push('.'); + } + if col == width - 1 { + result.push('\n'); + } + } + result +} + + +/// Prints a collection of multiline strings in columns +fn print_multi_line(o: &Vec, column_width: usize) { + let o_split:Vec<_> = o.iter().map(|col| {col.split('\n').collect::>()}).collect(); + let mut still_printing = true; + let mut line_index = 0; + while still_printing { + let mut gap = 0; + still_printing = false; + for col in o_split.iter() { + if col.len() > line_index { + if gap > 0 { + print!("{:width$}", "", width=column_width * gap); + gap = 0; + } + let line = col[line_index]; + print!("{:width$}", line, width=column_width); + still_printing = true; + } else { + gap += 1; + } + } + print!("\n"); + line_index += 1 + + } +} + + +/// Prints the word distribution within topics +fn print_topic_distribution(dist: &Matrix, topic_count: usize, width: usize) { + let top_strings = &dist.row_iter().take(topic_count/2).enumerate().map(|(topic_index, topic)|topic_to_string(&topic.into(), width, topic_index + 1)).collect(); + let bottom_strings = &dist.row_iter().skip(topic_count/2).enumerate().map(|(topic_index, topic)|topic_to_string(&topic.into(), width, topic_index + 1 + topic_count / 2)).collect(); + + print_multi_line(top_strings, max(12, width + 1)); + print_multi_line(bottom_strings, max(12, width + 1)); +} + +pub fn main() { + let width = TOPIC_COUNT / 2; + let vocab_count = width * width; + println!("Creating word distribution"); + let word_distribution = generate_word_distribution(TOPIC_COUNT); + println!("Distrbution generated:"); + print_topic_distribution(&word_distribution, TOPIC_COUNT, width); + println!("Generating documents"); + let input = generate_documents(&word_distribution, TOPIC_COUNT, vocab_count, DOCUMENT_COUNT, DOCUMENT_LENGTH, ALPHA); + let lda = LDAFitter::new(TOPIC_COUNT, ALPHA, 0.1, ITERATION_COUNT); + println!("Predicting word distrbution from generated documents"); + let result = lda.fit(&input).unwrap(); + let dist = result.word_distribution(); + + println!("Prediction completed. Predicted word distribution:"); + println!("(Should be similar to generated distribution above)", ); + + print_topic_distribution(&dist, TOPIC_COUNT, width); + + +} diff --git a/examples/naive_bayes_dogs.rs b/examples/naive_bayes_dogs.rs index 2e57de54..0df1f179 100644 --- a/examples/naive_bayes_dogs.rs +++ b/examples/naive_bayes_dogs.rs @@ -135,16 +135,16 @@ fn main() { // Score how well we did. 
let mut hits = 0; let unprinted_total = test_set_size.saturating_sub(10) as usize; - for (dog, prediction) in test_dogs.iter().zip(predictions.iter_rows()).take(unprinted_total) { - evaluate_prediction(&mut hits, dog, prediction); + for (dog, prediction) in test_dogs.iter().zip(predictions.row_iter()).take(unprinted_total) { + evaluate_prediction(&mut hits, dog, prediction.raw_slice()); } - + if unprinted_total > 0 { println!("..."); } - for (dog, prediction) in test_dogs.iter().zip(predictions.iter_rows()).skip(unprinted_total) { - let (actual_color, accurate) = evaluate_prediction(&mut hits, dog, prediction); + for (dog, prediction) in test_dogs.iter().zip(predictions.row_iter()).skip(unprinted_total) { + let (actual_color, accurate) = evaluate_prediction(&mut hits, dog, prediction.raw_slice()); println!("Predicted: {:?}; Actual: {:?}; Accurate? {:?}", dog.color, actual_color, accurate); } diff --git a/src/analysis/score.rs b/src/analysis/score.rs index b8329b97..48c4010e 100644 --- a/src/analysis/score.rs +++ b/src/analysis/score.rs @@ -31,9 +31,10 @@ use learning::toolkit::cost_fn::{CostFunc, MeanSqError}; /// # Panics /// /// - outputs and targets have different length -pub fn accuracy(outputs: I, targets: I) -> f64 - where I: ExactSizeIterator, - I::Item: PartialEq +pub fn accuracy(outputs: I1, targets: I2) -> f64 + where T: PartialEq, + I1: ExactSizeIterator + Iterator, + I2: ExactSizeIterator + Iterator { assert!(outputs.len() == targets.len(), "outputs and targets must have the same length"); let len = outputs.len() as f64; @@ -46,7 +47,8 @@ pub fn accuracy(outputs: I, targets: I) -> f64 /// Returns the fraction of outputs rows which match their target. pub fn row_accuracy(outputs: &Matrix, targets: &Matrix) -> f64 { - accuracy(outputs.iter_rows(), targets.iter_rows()) + accuracy(outputs.row_iter().map(|r| r.raw_slice()), + targets.row_iter().map(|r| r.raw_slice())) } /// Returns the precision score for 2 class classification. diff --git a/src/data/transforms/lda.rs b/src/data/transforms/lda.rs new file mode 100644 index 00000000..9833141c --- /dev/null +++ b/src/data/transforms/lda.rs @@ -0,0 +1,286 @@ +//! Latent Dirichlet Allocation Module +//! +//! Contains an implementation of Latent Dirichlet Allocation (LDA) using +//! [Gibbs Sampling](http://psiexp.ss.uci.edu/research/papers/sciencetopics.pdf) +//! +//! LDAFitter is typically used for textual analysis. It assumes that a topic can be modeled +//! as a distribution of words, and that a document can be modeled as a distribution of +//! topics. Thus, the output is these distributions on the input documents. +//! +//! Gibbs sampling is a Morkov Chain Monto Carlo algorithm that iteratively approximates +//! the above distributions. +//! +//! This module is able to estimate a distribution of categories over documents in an +//! unsupervised manner, and use that distrbution to estimate the categories over other +//! models. +//! +//! # Examples +//! +//! ``` +//! #[allow(unused_variables)] +//! use rusty_machine::linalg::Matrix; +//! use rusty_machine::data::transforms::{LDAFitter, Transformer, TransformFitter}; +//! +//! // Create a basic input array with 4 documents and 3 words +//! let input = Matrix::ones(4, 3); +//! +//! // Create a model that will find 5 topics, with parameters alpha=0.1, beta=0.1 +//! let lda = LDAFitter::new(5, 0.1, 0.1, 10); +//! +//! // Fit the model to the input +//! let mut result = lda.fit(&input).unwrap(); +//! +//! // Find the estimated distrbution of words over categories +//! 
let dist = result.word_distribution(); +//! +//! // Use the model to estimate some topics for new documents +//! let esimtated = result.transform(Matrix::ones(6, 3)).unwrap(); +//! ``` + + +use linalg::{Matrix, Vector, BaseMatrix}; +use super::{Transformer, TransformFitter}; +use rulinalg::matrix::BaseMatrixMut; +use rand::{Rng, thread_rng}; +use learning::LearningResult; + +/// Latent Dirichlet Allocation +#[derive(Debug)] +pub struct LDAFitter { + iterations: usize, + topic_count: usize, + alpha: f64, + beta: f64, +} + +/// An object which holds the results of Gibbs Sampling. +/// +/// This object can then be used to get the distrbutions of +/// topics over documents and words over topics +#[derive(Debug)] +pub struct LDAModel { + document_topic_count: Matrix, + topic_word_count: Matrix, + // The two vectors are simply used to reduce the amount of calculation that must be + // performed during the conditional probability step + topic_total_by_document: Vector, + word_total_by_topic: Vector, + alpha: f64, + beta: f64, +} + +/// Creates a default for LDAFitter with alpha = beta = 0.1, topic_coutn = 10 and iterations = 30 +impl Default for LDAFitter { + fn default() -> LDAFitter { + LDAFitter { + iterations: 30, + topic_count: 10, + alpha: 0.1, + beta: 0.1 + } + } +} + +impl LDAFitter { + /// Creates a new object for finding LDA. + /// `alpha` and `beta` are the symmetric priors for the algorithm. + /// `iterations` is the number of times the sampling algorithm will run. + /// + /// If you don't know what to use, try alpha = 50/topic_count and beta = 0.01 + pub fn new(topic_count: usize, alpha: f64, beta: f64, iterations: usize) -> LDAFitter { + LDAFitter { + topic_count: topic_count, + alpha: alpha, + beta: beta, + iterations: iterations + } + } +} + +impl LDAModel { + fn new(input: &Matrix, topic_count: usize, alpha: f64, beta: f64) -> (LDAModel, Vec >) { + let document_count = input.rows(); + let vocab_count = input.cols(); + let mut topics = Vec::with_capacity(document_count); + let mut result = LDAModel { + document_topic_count: Matrix::new(document_count, topic_count, vec![alpha; document_count * topic_count]), + topic_word_count: Matrix::new(vocab_count, topic_count, vec![beta; topic_count * vocab_count]), + topic_total_by_document: Vector::new(vec![alpha * topic_count as f64; document_count]), + word_total_by_topic: Vector::new(vec![beta * vocab_count as f64; topic_count]), + alpha: alpha, + beta: beta + }; + + // For each word in each document, randomly assign it to a topic and update the counts + let mut rng = thread_rng(); + for (document, row) in input.row_iter().enumerate() { + let mut document_topics = Vec::with_capacity(row.sum()); + for (word, word_count) in row.iter().enumerate() { + for _ in 0..*word_count{ + let topic = rng.gen_range(0, topic_count); + result.document_topic_count[[document, topic]] += 1.0; + result.topic_total_by_document[document] += 1.0; + result.topic_word_count[[word, topic]] += 1.0; + result.word_total_by_topic[topic] += 1.0; + document_topics.push(topic); + } + } + topics.push(document_topics); + } + (result, topics) + } + + /// Find the distribution of words over topics. This gives a matrix where the rows are + /// topics and the columns are words. Each entry `(topic, word)` gives the probability of + /// `word` given `topic`. 
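+    /// The matrix is built by transposing the accumulated `topic_word_count` matrix and
+    /// dividing each row by its sum, so every row is a probability distribution over the
+    /// vocabulary that sums to 1.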
+ pub fn word_distribution(&self) -> Matrix { + let mut distribution = self.topic_word_count.transpose(); + let row_sum = distribution.sum_rows(); + // XXX To my knowledge, there is not presently a way in rulinalg to divide a matrix + // by a vector, so I do it with a loop here. + for (mut row, sum) in distribution.row_iter_mut().zip(row_sum.iter()) { + *row /= sum; + } + distribution + } + + /// Finds the distribution of cateogries across the documents originally used to + /// fit the model. This gives a matrix where the rows are documents and the columns are + /// topics. Each entry `(document, topic)` gives the probability of `topic` given `word` + pub fn category_distribution(&self) -> Matrix { + let mut distribution = self.document_topic_count.clone(); + for (document, mut row) in distribution.row_iter_mut().enumerate() { + for c in row.iter_mut() { + *c /= self.topic_total_by_document[document]; + } + } + distribution + } + +} + +impl LDAFitter { + fn conditional_distribution(&self, result: &LDAModel, document: usize, word: usize) -> Vector { + + // Convert the column of the word count by topic into a vector + let word_topic_count:Vector = unsafe {result.topic_word_count.row_unchecked(word)}.into(); + + // Calculate the proportion of this word's contribution to each topic + let left:Vector = (word_topic_count).elediv(&result.word_total_by_topic); + + // Convert the row of the topic count by document into a vector + let topic_document_count:Vector = unsafe{result.document_topic_count.row_unchecked(document)}.into(); + + // Calculate the proportion of each topic's contribution to this document + let right:Vector = topic_document_count / result.topic_total_by_document[document]; + + // Multiply the proportions together to this word's contribution to the document by topic + let mut probability:Vector = left.elemul(&right); + + // Normalize it so that it's a probability + probability /= probability.sum(); + return probability; + } +} + +impl TransformFitter, Matrix, LDAModel> for LDAFitter { + /// Predict categories from the input matrix. 
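+    /// Each Gibbs iteration below revisits every word occurrence: its current topic is
+    /// removed from the counts, `conditional_distribution` computes the topic probabilities
+    /// for that word given all other assignments, a new topic is drawn with `choose_from`,
+    /// and the counts are updated with the new assignment.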
+ fn fit(self, inputs: &Matrix) -> LearningResult { + let (mut result, mut topics) = LDAModel::new(inputs, self.topic_count, self.alpha, self.beta); + let mut word_index:usize; + for _ in 0..self.iterations { + for (document, row) in inputs.row_iter().enumerate() { + word_index = 0; + let mut document_topics = unsafe{topics.get_unchecked_mut(document)}; + for (word, word_count) in row.iter().enumerate() { + for _ in 0..*word_count { + // Remove the current word from the counts + let mut topic = *unsafe{document_topics.get_unchecked(word_index)}; + result.document_topic_count[[document, topic]] -= 1.0; + result.topic_total_by_document[document] -= 1.0; + result.topic_word_count[[word, topic]] -= 1.0; + result.word_total_by_topic[topic] -= 1.0; + + // Find the probability of that word being a part of each topic + // based on the other words in the present document + let probability = self.conditional_distribution(&result, document, word); + + // Aandomly assign a new topic based on the probabilities from above + topic = choose_from(probability); + + // Update the counts with the new topic + result.document_topic_count[[document, topic]] += 1.0; + result.topic_total_by_document[document] += 1.0; + result.topic_word_count[[word, topic]] += 1.0; + result.word_total_by_topic[topic] += 1.0; + document_topics[word_index] = topic; + word_index += 1; + } + } + } + } + Ok(result) + } +} + +impl Transformer, Matrix> for LDAModel { + fn transform(&mut self, input: Matrix) -> LearningResult> { + assert!(input.cols() == self.topic_word_count.rows(), "The input matrix must have the same size vocabulary as the fitting model"); + let input = Matrix::from_fn(input.rows(), input.cols(), |col, row| { + input[[row, col]] as f64 + }); + let mut distribution = input * &self.topic_word_count; + + let row_sum = distribution.sum_rows(); + for (mut row, sum) in distribution.row_iter_mut().zip(row_sum.iter()) { + *row /= sum; + } + + Ok(distribution) + } +} + +/// This function models sampling from the categorical distribution. +/// That is, given a series of discrete categories, each with different probability of occurring, +/// this function will choose a category according to their probabilities. +/// The sum of the probabilities must be 1, but since this is only used internally, +/// there is no need to verify that this is true. 
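+/// Returning `probability.size() - 1` when the loop falls through guards against
+/// floating-point rounding leaving the cumulative total just short of the drawn value.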
+fn choose_from(probability: Vector) -> usize { + let mut rng = thread_rng(); + let selection:f64 = rng.gen_range(0.0, 1.0); + let mut total:f64 = 0.0; + for (index, p) in probability.iter().enumerate() { + total += *p; + if total >= selection { + return index; + } + } + return probability.size() - 1; +} + +#[cfg(test)] +mod test { + use super::{LDAModel, LDAFitter}; + use linalg::{Matrix, Vector}; + #[test] + fn test_conditional_distribution() { + let result = LDAModel { + topic_word_count: Matrix::new(4, 3, vec![3.1, 0.1, 0.1, + 3.1, 1.1, 0.1, + 5.1, 1.1, 5.1, + 0.1, 0.1, 2.1]), + word_total_by_topic: Vector::new(vec![11.4, 2.4, 7.4]), + document_topic_count: Matrix::new(2, 3, vec![1.1, 2.1, 3.1, + 5.1, 5.1, 4.1]), + topic_total_by_document: Vector::new(vec![6.3, 14.3]), + alpha: 0.1, + beta: 0.1, + }; + let lda = LDAFitter::new(3, 0.1, 0.1, 1); + let probability = lda.conditional_distribution(&result, 0, 2); + + // Calculated by hand and verified using https://gist.github.com/mblondel/542786 + assert_eq!(probability, + Vector::new(vec![0.13703500146066358, 0.2680243410921803, 0.5949406574471561])); + } +} diff --git a/src/data/transforms/minmax.rs b/src/data/transforms/minmax.rs index 580d0ea3..4bd6e530 100644 --- a/src/data/transforms/minmax.rs +++ b/src/data/transforms/minmax.rs @@ -10,14 +10,15 @@ //! # Examples //! //! ``` -//! use rusty_machine::data::transforms::{Transformer, MinMaxScaler}; +//! use rusty_machine::data::transforms::{Transformer, TransformFitter, MinMaxFitter}; //! use rusty_machine::linalg::Matrix; //! +//! let inputs = Matrix::new(2, 2, vec![-1.0, 2.0, 1.5, 3.0]); +//! //! // Constructs a new `MinMaxScaler` to map minimum to 0 and maximum //! // to 1. -//! let mut transformer = MinMaxScaler::default(); +//! let mut transformer = MinMaxFitter::default().fit(&inputs).unwrap(); //! -//! let inputs = Matrix::new(2, 2, vec![-1.0, 2.0, 1.5, 3.0]); //! //! // Transform the inputs to get output data with required minimum //! // and maximum. @@ -25,72 +26,68 @@ //! ``` use learning::error::{Error, ErrorKind}; +use learning::LearningResult; use linalg::{Matrix, BaseMatrix, BaseMatrixMut, Vector}; -use super::{Invertible, Transformer}; +use super::{Invertible, Transformer, TransformFitter}; use rulinalg::utils; use libnum::Float; -/// The `MinMaxScaler` -/// -/// The `MinMaxScaler` provides an implementation of `Transformer` -/// which allows us to transform the input data to have a new minimum -/// and maximum per column. -/// -/// See the module description for more information. +/// A builder used to construct a `MinMaxScaler` #[derive(Debug)] -pub struct MinMaxScaler { - /// Values to scale each column by - scale_factors: Option>, - /// Values to add to each column after scaling - const_factors: Option>, - /// The min of the new data (default 0) +pub struct MinMaxFitter { scaled_min: T, - /// The max of the new data (default 1) - scaled_max: T, + scaled_max: T } -/// Create a default `MinMaxScaler` with minimum of `0` and -/// maximum of `1`. -impl Default for MinMaxScaler { - fn default() -> MinMaxScaler { - MinMaxScaler::new(T::zero(), T::one()) +impl Default for MinMaxFitter { + fn default() -> Self { + MinMaxFitter { + scaled_min: T::zero(), + scaled_max: T::one() + } } } -impl MinMaxScaler { - /// Constructs a new MinMaxScaler with the specified scale. +impl MinMaxFitter { + /// Construct a new `MinMaxFitter` with + /// specified mean and standard deviation. 
+ /// + /// Note that this function does not create a `Transformer` + /// only a builder which can be used to produce a fitted `Transformer`. /// /// # Examples /// /// ``` - /// use rusty_machine::data::transforms::{MinMaxScaler, Transformer}; + /// use rusty_machine::data::transforms::MinMaxFitter; + /// use rusty_machine::linalg::Matrix; + /// + /// let fitter = MinMaxFitter::new(0.0, 1.0); /// - /// // Constructs a new `MinMaxScaler` which will give the data - /// // minimum `0` and maximum `2`. - /// let transformer = MinMaxScaler::new(0.0, 2.0); + /// // We can call `fit` from the `transform::TransformFitter` + /// // trait to create a `MinMaxScaler` used to actually transform data. + /// use rusty_machine::data::transforms::TransformFitter; + /// let mat = Matrix::new(2,2,vec![1.0, 2.0, 3.0, 5.0]); + /// let transformer = fitter.fit(&mat); /// ``` - pub fn new(min: T, max: T) -> MinMaxScaler { - MinMaxScaler { - scale_factors: None, - const_factors: None, + pub fn new(min: T, max: T) -> Self { + MinMaxFitter { scaled_min: min, - scaled_max: max, + scaled_max: max } } } -impl Transformer> for MinMaxScaler { - - fn fit(&mut self, inputs: &Matrix) -> Result<(), Error> { +impl TransformFitter, Matrix, MinMaxScaler> for MinMaxFitter { + fn fit(self, inputs: &Matrix) -> LearningResult> { let features = inputs.cols(); - // ToDo: can use min, max + // TODO: can use min, max // https://github.com/AtheMathmo/rulinalg/pull/115 let mut input_min_max = vec![(T::max_value(), T::min_value()); features]; - for row in inputs.iter_rows() { + for row in inputs.row_iter() { for (idx, (feature, min_max)) in row.into_iter().zip(input_min_max.iter_mut()).enumerate() { if !feature.is_finite() { return Err(Error::new(ErrorKind::InvalidData, @@ -129,68 +126,72 @@ impl Transformer> for MinMaxScaler { .map(|(&(_, x), &s)| self.scaled_max - x * s) .collect::>(); - self.scale_factors = Some(Vector::new(scales)); - self.const_factors = Some(Vector::new(consts)); - Ok(()) + Ok(MinMaxScaler { + scale_factors: Vector::new(scales), + const_factors: Vector::new(consts) + }) } +} - fn transform(&mut self, mut inputs: Matrix) -> Result, Error> { - if let (&None, &None) = (&self.scale_factors, &self.const_factors) { - // if Transformer is not fitted to the data, fit for backward-compat. - try!(self.fit(&inputs)); - } +/// The `MinMaxScaler` +/// +/// The `MinMaxScaler` provides an implementation of `Transformer` +/// which allows us to transform the input data to have a new minimum +/// and maximum per column. +/// +/// See the module description for more information. 
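+/// `transform` maps each element `x` in column `i` to `x * scale_factors[i] + const_factors[i]`,
+/// where the factors computed by `MinMaxFitter::fit` send the column's observed minimum and
+/// maximum to `scaled_min` and `scaled_max` respectively.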
+#[derive(Debug)] +pub struct MinMaxScaler { + /// Values to scale each column by + scale_factors: Vector, + /// Values to add to each column after scaling + const_factors: Vector, +} - if let (&Some(ref scales), &Some(ref consts)) = (&self.scale_factors, &self.const_factors) { - if scales.size() != inputs.cols() { - Err(Error::new(ErrorKind::InvalidData, - "Input data has different number of columns from fitted data.")) - } else { - for row in inputs.iter_rows_mut() { - utils::in_place_vec_bin_op(row, scales.data(), |x, &y| { - *x = *x * y; - }); - - utils::in_place_vec_bin_op(row, consts.data(), |x, &y| { - *x = *x + y; - }); - } - Ok(inputs) - } + +impl Transformer, Matrix> for MinMaxScaler { + fn transform(&mut self, mut inputs: Matrix) -> Result, Error> { + if self.scale_factors.size() != inputs.cols() { + Err(Error::new(ErrorKind::InvalidData, + "Input data has different number of columns than fitted data.")) } else { - // can't happen - Err(Error::new(ErrorKind::InvalidState, "Transformer has not been fitted.")) + for mut row in inputs.row_iter_mut() { + utils::in_place_vec_bin_op(row.raw_slice_mut(), self.scale_factors.data(), |x, &y| { + *x = *x * y; + }); + + utils::in_place_vec_bin_op(row.raw_slice_mut(), self.const_factors.data(), |x, &y| { + *x = *x + y; + }); + } + Ok(inputs) } } } -impl Invertible> for MinMaxScaler { +impl Invertible, Matrix> for MinMaxScaler { fn inv_transform(&self, mut inputs: Matrix) -> Result, Error> { - if let (&Some(ref scales), &Some(ref consts)) = (&self.scale_factors, &self.const_factors) { - - let features = scales.size(); - if inputs.cols() != features { - return Err(Error::new(ErrorKind::InvalidData, - "Inputs have different feature count than transformer.")); - } + let features = self.scale_factors.size(); + if inputs.cols() != features { + return Err(Error::new(ErrorKind::InvalidData, + "Input data has different number of columns than fitted data.")); + } - for row in inputs.iter_rows_mut() { - for i in 0..features { - row[i] = (row[i] - consts[i]) / scales[i]; - } + for mut row in inputs.row_iter_mut() { + for i in 0..features { + row[i] = (row[i] - self.const_factors[i]) / self.scale_factors[i]; } - - Ok(inputs) - } else { - Err(Error::new(ErrorKind::InvalidState, "Transformer has not been fitted.")) } + + Ok(inputs) } } #[cfg(test)] mod tests { use super::*; - use super::super::{Transformer, Invertible}; + use super::super::{Transformer, TransformFitter, Invertible}; use linalg::Matrix; use std::f64; @@ -198,9 +199,7 @@ mod tests { fn nan_data_test() { let inputs = Matrix::new(2, 2, vec![f64::NAN; 4]); - let mut scaler = MinMaxScaler::new(0.0, 1.0); - let res = scaler.transform(inputs); - + let res = MinMaxFitter::new(0.0, 1.0).fit(&inputs); assert!(res.is_err()); } @@ -208,9 +207,7 @@ mod tests { fn infinity_data_test() { let inputs = Matrix::new(2, 2, vec![f64::INFINITY; 4]); - let mut scaler = MinMaxScaler::new(0.0, 1.0); - let res = scaler.transform(inputs); - + let res = MinMaxFitter::new(0.0, 1.0).fit(&inputs); assert!(res.is_err()); } @@ -218,7 +215,7 @@ mod tests { fn basic_scale_test() { let inputs = Matrix::new(2, 2, vec![-1.0f32, 2.0, 0.0, 3.0]); - let mut scaler = MinMaxScaler::new(0.0, 1.0); + let mut scaler = MinMaxFitter::new(0.0, 1.0).fit(&inputs).unwrap(); let transformed = scaler.transform(inputs).unwrap(); assert!(transformed.data().iter().all(|&x| x >= 0.0)); @@ -235,7 +232,7 @@ mod tests { fn custom_scale_test() { let inputs = Matrix::new(2, 2, vec![-1.0f32, 2.0, 0.0, 3.0]); - let mut scaler = MinMaxScaler::new(1.0, 
3.0); + let mut scaler = MinMaxFitter::new(1.0, 3.0).fit(&inputs).unwrap(); let transformed = scaler.transform(inputs).unwrap(); assert!(transformed.data().iter().all(|&x| x >= 1.0)); @@ -252,9 +249,7 @@ mod tests { fn constant_feature_test() { let inputs = Matrix::new(2, 2, vec![1.0, 2.0, 1.0, 3.0]); - let mut scaler = MinMaxScaler::new(0.0, 1.0); - let res = scaler.transform(inputs); - + let res = MinMaxFitter::new(0.0, 1.0).fit(&inputs); assert!(res.is_err()); } @@ -262,7 +257,7 @@ mod tests { fn inv_transform_identity_test() { let inputs = Matrix::new(2, 2, vec![-1.0f32, 2.0, 0.0, 3.0]); - let mut scaler = MinMaxScaler::new(1.0, 3.0); + let mut scaler = MinMaxFitter::new(1.0, 3.0).fit(&inputs).unwrap(); let transformed = scaler.transform(inputs.clone()).unwrap(); let original = scaler.inv_transform(transformed).unwrap(); diff --git a/src/data/transforms/mod.rs b/src/data/transforms/mod.rs index e27dfc2d..37584bba 100644 --- a/src/data/transforms/mod.rs +++ b/src/data/transforms/mod.rs @@ -1,34 +1,58 @@ //! The Transforms module //! -//! This module contains the `Transformer` and `Invertible` traits and reexports -//! the transformers from child modules. +//! This module contains traits used to transform data using common +//! techniques. It also reexports these `Transformer`s from child modules. //! //! The `Transformer` trait provides a shared interface for all of the -//! data preprocessing transformations in rusty-machine. +//! data preprocessing transformations in rusty-machine. Some of these `Transformations` +//! can be inverted via the `Invertible` trait. //! -//! The transformers provide preprocessing transformations which are -//! commonly used in machine learning. +//! Note that some `Transformer`s can not be created without first using the +//! `TransformFitter` trait. +//! +//! # Examples +//! +//! ``` +//! use rusty_machine::data::transforms::{Transformer, TransformFitter, MinMaxFitter}; +//! use rusty_machine::data::transforms::minmax::MinMaxScaler; +//! use rusty_machine::linalg::Matrix; +//! +//! // Some data that we want to scale between 0 and 1 +//! let data = Matrix::new(3, 2, vec![-1.5, 1.0, 2.0, 3.0, -1.0, 2.5]); +//! // Create a new `MinMaxScaler` using the `MinMaxFitter` +//! let mut scaler: MinMaxScaler = MinMaxFitter::new(0.0, 1.0).fit(&data).expect("Failed to fit transformer"); +//! // Transform the data using the scaler +//! let transformed = scaler.transform(data).expect("Failed to transformer data"); +//! 
``` +pub mod lda; pub mod minmax; +pub mod normalize; pub mod standardize; pub mod shuffle; -use learning::error; +use learning::LearningResult; -pub use self::minmax::MinMaxScaler; +pub use self::lda::LDAFitter; +pub use self::minmax::MinMaxFitter; +pub use self::normalize::Normalizer; pub use self::shuffle::Shuffler; -pub use self::standardize::Standardizer; +pub use self::standardize::StandardizerFitter; + +/// A trait used to construct Transformers which must first be fitted +pub trait TransformFitter> { + /// Fit the inputs to create the `Transformer` + fn fit(self, inputs: &I) -> LearningResult; +} /// Trait for data transformers -pub trait Transformer { - /// Fit Transformer to input data, and stores the transformation in the Transformer - fn fit(&mut self, inputs: &T) -> Result<(), error::Error>; - /// Transforms the inputs and stores the transformation in the Transformer - fn transform(&mut self, inputs: T) -> Result; +pub trait Transformer { + /// Transforms the inputs + fn transform(&mut self, inputs: I) -> LearningResult; } /// Trait for invertible data transformers -pub trait Invertible : Transformer { +pub trait Invertible : Transformer { /// Maps the inputs using the inverse of the fitted transform. - fn inv_transform(&self, inputs: T) -> Result; -} \ No newline at end of file + fn inv_transform(&self, inputs: O) -> LearningResult; +} diff --git a/src/data/transforms/normalize.rs b/src/data/transforms/normalize.rs new file mode 100644 index 00000000..47584026 --- /dev/null +++ b/src/data/transforms/normalize.rs @@ -0,0 +1,176 @@ +//! The Normalizing Transformer +//! +//! This module contains the `Normalizer` transformer. +//! +//! The `Normalizer` transformer is used to transform input data +//! so that the norm of each row is equal to 1. By default the +//! `Normalizer` uses the `Euclidean` norm. +//! +//! If input data has a row with all 0, `Normalizer` keeps the row as it is. +//! +//! Because transformation is performed per row independently, +//! inverse transformation is not supported. +//! +//! # Examples +//! +//! ``` +//! use rusty_machine::data::transforms::{Transformer, Normalizer}; +//! use rusty_machine::linalg::Matrix; +//! +//! // Constructs a new `Normalizer` +//! let mut transformer = Normalizer::default(); +//! +//! let inputs = Matrix::new(2, 2, vec![-1.0, 2.0, 1.5, 3.0]); +//! +//! // Transform the inputs +//! let transformed = transformer.transform(inputs).unwrap(); +//! ``` + +use learning::error::{Error, ErrorKind}; +use linalg::{Matrix, MatrixSlice, BaseMatrix, BaseMatrixMut}; +use rulinalg::norm::{MatrixNorm, Euclidean}; + +use super::Transformer; + +use libnum::Float; + +use std::marker::PhantomData; + +/// The Normalizer +/// +/// The Normalizer provides an implementation of `Transformer` +/// which allows us to transform the all rows to have the same norm. +/// +/// The default `Normalizer` will use the `Euclidean` norm. +/// +/// See the module description for more information. +#[derive(Debug)] +pub struct Normalizer + where for<'a> M: MatrixNorm> +{ + norm: M, + _marker: PhantomData +} + +/// Create a `Normalizer` with a Euclidean norm. +impl Default for Normalizer { + fn default() -> Self { + Normalizer { + norm: Euclidean, + _marker: PhantomData, + } + } +} + +impl Normalizer + where for<'a> M: MatrixNorm> +{ + /// Constructs a new `Normalizer` with given norm. 
+ /// + /// # Examples + /// + /// ``` + /// use rusty_machine::data::transforms::Normalizer; + /// use rusty_machine::linalg::norm::Euclidean; + /// + /// // Constructs a new `Normalizer` + /// let _ = Normalizer::::new(Euclidean); + /// ``` + pub fn new(norm: M) -> Self { + Normalizer { + norm: norm, + _marker: PhantomData + } + } +} + +impl Transformer, Matrix> for Normalizer + where for<'a> M: MatrixNorm> +{ + fn transform(&mut self, mut inputs: Matrix) -> Result, Error> { + let dists: Vec = inputs.row_iter().map(|m| self.norm.norm(&*m)).collect(); + for (mut row, &d) in inputs.row_iter_mut().zip(dists.iter()) { + + if !d.is_finite() { + return Err(Error::new(ErrorKind::InvalidData, + "Some data point is non-finite.")); + } else if d != T::zero() { + // no change if distance is 0 + *row /= d; + } + } + Ok(inputs) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use super::super::Transformer; + use linalg::Matrix; + + use std::f64; + + #[test] + fn nan_data_test() { + let inputs = Matrix::new(2, 2, vec![f64::NAN; 4]); + let mut normalizer = Normalizer::default(); + let res = normalizer.transform(inputs); + assert!(res.is_err()); + } + + #[test] + fn inf_data_test() { + let inputs = Matrix::new(2, 2, vec![f64::INFINITY; 4]); + let mut normalizer = Normalizer::default(); + let res = normalizer.transform(inputs); + assert!(res.is_err()); + } + + #[test] + fn single_row_test() { + let inputs = matrix![1.0, 2.0]; + let mut normalizer = Normalizer::default(); + let transformed = normalizer.transform(inputs).unwrap(); + + let exp = matrix![0.4472135954999579, 0.8944271909999159]; + assert_matrix_eq!(transformed, exp); + } + + #[test] + fn basic_normalizer_test() { + let inputs = matrix![-1.0f32, 2.0; + 0.0, 3.0]; + + let mut normalizer = Normalizer::default(); + let transformed = normalizer.transform(inputs).unwrap(); + + let exp = matrix![-0.4472135954999579, 0.8944271909999159; + 0., 1.]; + assert_matrix_eq!(transformed, exp); + + let inputs = matrix![1., 2.; + 10., 20.; + 100., 200.]; + + let transformed = normalizer.transform(inputs).unwrap(); + + let exp = matrix![0.4472135954999579, 0.8944271909999159; + 0.4472135954999579, 0.8944271909999159; + 0.4472135954999579, 0.8944271909999159]; + assert_matrix_eq!(transformed, exp); + + let inputs = matrix![1., 2., 10.; + 0., 10., 20.; + 100., 10., 200.; + 0., 0., 0.]; + let transformed = normalizer.transform(inputs).unwrap(); + + let exp = matrix![0.09759000729485333, 0.19518001458970666, 0.9759000729485332; + 0., 0.4472135954999579, 0.8944271909999159; + 0.4467670516087703, 0.04467670516087703, 0.8935341032175406; + 0., 0., 0.]; + assert_matrix_eq!(transformed, exp); + } +} diff --git a/src/data/transforms/shuffle.rs b/src/data/transforms/shuffle.rs index 81112ef4..39c3f416 100644 --- a/src/data/transforms/shuffle.rs +++ b/src/data/transforms/shuffle.rs @@ -12,7 +12,9 @@ //! use rusty_machine::data::transforms::shuffle::Shuffler; //! //! // Create an input matrix that we want to shuffle -//! let mat = Matrix::new(3, 2, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); +//! let mat = Matrix::new(3, 2, vec![1.0, 2.0, +//! 3.0, 4.0, +//! 5.0, 6.0]); //! //! // Create a new shuffler //! let mut shuffler = Shuffler::default(); @@ -22,7 +24,6 @@ //! ``` use learning::LearningResult; -use learning::error::Error; use linalg::{Matrix, BaseMatrix, BaseMatrixMut}; use super::Transformer; @@ -74,13 +75,7 @@ impl Default for Shuffler { /// its rows in place. /// /// Under the hood this uses a Fisher-Yates shuffle. 
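/// At step `i` a random offset `j` is drawn uniformly from `[0, n - i)` and rows `i` and
/// `i + j` are swapped, so every permutation of the rows is equally likely.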
-impl Transformer> for Shuffler { - - #[allow(unused_variables)] - fn fit(&mut self, inputs: &Matrix) -> Result<(), Error> { - Ok(()) - } - +impl Transformer, Matrix> for Shuffler { fn transform(&mut self, mut inputs: Matrix) -> LearningResult> { let n = inputs.rows(); @@ -89,7 +84,6 @@ impl Transformer> for Shuffler { let j = self.rng.gen_range(0, n - i); inputs.swap_rows(i, i + j); } - Ok(inputs) } } @@ -124,16 +118,4 @@ mod tests { assert_eq!(shuffled.into_vec(), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); } - - #[test] - fn shuffle_fit() { - let rng = StdRng::from_seed(&[1, 2, 3]); - let mut shuffler = Shuffler::new(rng); - - // no op - let mat = Matrix::new(4, 2, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); - let res = shuffler.fit(&mat).unwrap(); - - assert_eq!(res, ()); - } -} \ No newline at end of file +} diff --git a/src/data/transforms/standardize.rs b/src/data/transforms/standardize.rs index b13fa807..6c9ee09e 100644 --- a/src/data/transforms/standardize.rs +++ b/src/data/transforms/standardize.rs @@ -10,85 +10,76 @@ //! # Examples //! //! ``` -//! use rusty_machine::data::transforms::{Transformer, Standardizer}; +//! use rusty_machine::data::transforms::{Transformer, TransformFitter, StandardizerFitter}; //! use rusty_machine::linalg::Matrix; //! +//! let inputs = Matrix::new(2, 2, vec![-1.0, 2.0, 1.5, 3.0]); +//! //! // Constructs a new `Standardizer` to map to mean 0 and standard //! // deviation of 1. -//! let mut transformer = Standardizer::default(); -//! -//! let inputs = Matrix::new(2, 2, vec![-1.0, 2.0, 1.5, 3.0]); +//! let mut transformer = StandardizerFitter::default().fit(&inputs).unwrap(); //! //! // Transform the inputs to get output data with required mean and //! // standard deviation. //! let transformed = transformer.transform(inputs).unwrap(); //! ``` +use learning::LearningResult; use learning::error::{Error, ErrorKind}; use linalg::{Matrix, Vector, Axes, BaseMatrix, BaseMatrixMut}; -use super::{Invertible, Transformer}; +use super::{Invertible, Transformer, TransformFitter}; use rulinalg::utils; use libnum::{Float, FromPrimitive}; -/// The Standardizer -/// -/// The Standardizer provides an implementation of `Transformer` -/// which allows us to transform the input data to have a new mean -/// and standard deviation. -/// -/// See the module description for more information. +/// A builder used to construct a `Standardizer` #[derive(Debug)] -pub struct Standardizer { - /// Means per column of input data - means: Option>, - /// Variances per column of input data - variances: Option>, - /// The mean of the new data (default 0) +pub struct StandardizerFitter { scaled_mean: T, - /// The standard deviation of the new data (default 1) - scaled_stdev: T, + scaled_stdev: T } -/// Create a `Standardizer` with mean `0` and standard -/// deviation `1`. -impl Default for Standardizer { - fn default() -> Standardizer { - Standardizer { - means: None, - variances: None, +impl Default for StandardizerFitter { + fn default() -> Self { + StandardizerFitter { scaled_mean: T::zero(), - scaled_stdev: T::one(), + scaled_stdev: T::one() } } } -impl Standardizer { - /// Constructs a new `Standardizer` with the given mean and variance +impl StandardizerFitter { + /// Construct a new `StandardizerFitter` with + /// specified mean and standard deviation. + /// + /// Note that this function does not create a `Transformer` + /// only a builder which can be used to produce a fitted `Transformer`. 
/// /// # Examples /// /// ``` - /// use rusty_machine::data::transforms::Standardizer; + /// use rusty_machine::data::transforms::StandardizerFitter; + /// use rusty_machine::linalg::Matrix; /// - /// // Constructs a new `Standardizer` which will give the data - /// // mean `0` and standard deviation `2`. - /// let transformer = Standardizer::new(0.0, 2.0); + /// let fitter = StandardizerFitter::new(0.0, 1.0); + /// + /// // We can call `fit` from the `transform::TransformFitter` + /// // trait to create a `Standardizer` used to actually transform data. + /// use rusty_machine::data::transforms::TransformFitter; + /// let mat = Matrix::new(2, 2, vec![1.0, 2.0, 3.0, 5.0]); + /// let transformer = fitter.fit(&mat); /// ``` - pub fn new(mean: T, stdev: T) -> Standardizer { - Standardizer { - means: None, - variances: None, + pub fn new(mean: T, stdev: T) -> StandardizerFitter { + StandardizerFitter { scaled_mean: mean, - scaled_stdev: stdev, + scaled_stdev: stdev } } } -impl Transformer> for Standardizer { - - fn fit(&mut self, inputs: &Matrix) -> Result<(), Error> { +impl TransformFitter, Matrix, Standardizer> for StandardizerFitter { + fn fit(self, inputs: &Matrix) -> LearningResult> { if inputs.rows() <= 1 { Err(Error::new(ErrorKind::InvalidData, "Cannot standardize data with only one row.")) @@ -101,68 +92,79 @@ impl Transformer> for Standardizer { if mean.data().iter().any(|x| !x.is_finite()) { return Err(Error::new(ErrorKind::InvalidData, "Some data point is non-finite.")); } - self.means = Some(mean); - self.variances = Some(variance); - Ok(()) + + Ok(Standardizer { + means: mean, + variances: variance, + scaled_mean: self.scaled_mean, + scaled_stdev: self.scaled_stdev + }) } } +} - fn transform(&mut self, mut inputs: Matrix) -> Result, Error> { - if let (&None, &None) = (&self.means, &self.variances) { - // if Transformer is not fitted to the data, fit for backward-compat. - try!(self.fit(&inputs)); - } +/// The Standardizer +/// +/// The Standardizer provides an implementation of `Transformer` +/// which allows us to transform the input data to have a new mean +/// and standard deviation. +/// +/// See the module description for more information. 
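+/// `transform` maps each element `x` in column `i` to
+/// `(x - means[i]) * scaled_stdev / variances[i].sqrt() + scaled_mean`, using the column
+/// means and variances captured by `StandardizerFitter::fit`.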
+#[derive(Debug)] +pub struct Standardizer { + /// Means per column of input data + means: Vector, + /// Variances per column of input data + variances: Vector, + /// The mean of the new data (default 0) + scaled_mean: T, + /// The standard deviation of the new data (default 1) + scaled_stdev: T, +} - if let (&Some(ref means), &Some(ref variances)) = (&self.means, &self.variances) { - if means.size() != inputs.cols() { - Err(Error::new(ErrorKind::InvalidData, - "Input data has different number of columns from fitted data.")) - } else { - for row in inputs.iter_rows_mut() { - // Subtract the mean - utils::in_place_vec_bin_op(row, means.data(), |x, &y| *x = *x - y); - utils::in_place_vec_bin_op(row, variances.data(), |x, &y| { - *x = (*x * self.scaled_stdev / y.sqrt()) + self.scaled_mean - }); - } - Ok(inputs) - } +impl Transformer, Matrix> for Standardizer { + fn transform(&mut self, mut inputs: Matrix) -> LearningResult> { + if self.means.size() != inputs.cols() { + Err(Error::new(ErrorKind::InvalidData, + "Input data has different number of columns from fitted data.")) } else { - Err(Error::new(ErrorKind::InvalidState, "Transformer has not been fitted.")) + for mut row in inputs.row_iter_mut() { + // Subtract the mean + utils::in_place_vec_bin_op(row.raw_slice_mut(), self.means.data(), |x, &y| *x = *x - y); + utils::in_place_vec_bin_op(row.raw_slice_mut(), self.variances.data(), |x, &y| { + *x = (*x * self.scaled_stdev / y.sqrt()) + self.scaled_mean + }); + } + Ok(inputs) } } } -impl Invertible> for Standardizer { - fn inv_transform(&self, mut inputs: Matrix) -> Result, Error> { - if let (&Some(ref means), &Some(ref variances)) = (&self.means, &self.variances) { - - let features = means.size(); - if inputs.cols() != features { - return Err(Error::new(ErrorKind::InvalidData, - "Inputs have different feature count than transformer.")); - } - - for row in inputs.iter_rows_mut() { - utils::in_place_vec_bin_op(row, &variances.data(), |x, &y| { - *x = (*x - self.scaled_mean) * y.sqrt() / self.scaled_stdev - }); +impl Invertible, Matrix> for Standardizer { + fn inv_transform(&self, mut inputs: Matrix) -> LearningResult> { + let features = self.means.size(); + if inputs.cols() != features { + return Err(Error::new(ErrorKind::InvalidData, + "Inputs have different feature count than transformer.")); + } - // Add the mean - utils::in_place_vec_bin_op(row, &means.data(), |x, &y| *x = *x + y); - } + for mut row in inputs.row_iter_mut() { + utils::in_place_vec_bin_op(row.raw_slice_mut(), self.variances.data(), |x, &y| { + *x = (*x - self.scaled_mean) * y.sqrt() / self.scaled_stdev + }); - Ok(inputs) - } else { - Err(Error::new(ErrorKind::InvalidState, "Transformer has not been fitted.")) + // Add the mean + utils::in_place_vec_bin_op(row.raw_slice_mut(), self.means.data(), |x, &y| *x = *x + y); } + + Ok(inputs) } } #[cfg(test)] mod tests { use super::*; - use super::super::{Transformer, Invertible}; + use super::super::{Transformer, TransformFitter, Invertible}; use linalg::{Axes, Matrix}; use std::f64; @@ -171,29 +173,35 @@ mod tests { fn single_row_test() { let inputs = Matrix::new(1, 2, vec![1.0, 2.0]); - let mut standardizer = Standardizer::default(); - - let res = standardizer.transform(inputs); - assert!(res.is_err()); + let standardizer = StandardizerFitter::default(); + let transformer = standardizer.fit(&inputs); + assert!(transformer.is_err()); } #[test] fn nan_data_test() { let inputs = Matrix::new(2, 2, vec![f64::NAN; 4]); - let mut standardizer = Standardizer::default(); - - let res = 
standardizer.transform(inputs); - assert!(res.is_err()); + let standardizer = StandardizerFitter::default(); + let transformer = standardizer.fit(&inputs); + assert!(transformer.is_err()); } #[test] fn inf_data_test() { let inputs = Matrix::new(2, 2, vec![f64::INFINITY; 4]); - let mut standardizer = Standardizer::default(); + let standardizer = StandardizerFitter::default(); + let transformer = standardizer.fit(&inputs); + assert!(transformer.is_err()); + } + + #[test] + fn wrong_transform_size_test() { + let inputs = Matrix::new(2, 2, vec![-1.0f32, 2.0, 0.0, 3.0]); - let res = standardizer.transform(inputs); + let mut standardizer = StandardizerFitter::default().fit(&inputs).unwrap(); + let res = standardizer.transform(matrix![1.0, 2.0, 3.0; 4.0, 5.0, 6.0]); assert!(res.is_err()); } @@ -201,7 +209,7 @@ mod tests { fn basic_standardize_test() { let inputs = Matrix::new(2, 2, vec![-1.0f32, 2.0, 0.0, 3.0]); - let mut standardizer = Standardizer::default(); + let mut standardizer = StandardizerFitter::default().fit(&inputs).unwrap(); let transformed = standardizer.transform(inputs).unwrap(); let new_mean = transformed.mean(Axes::Row); @@ -215,7 +223,7 @@ mod tests { fn custom_standardize_test() { let inputs = Matrix::new(2, 2, vec![-1.0f32, 2.0, 0.0, 3.0]); - let mut standardizer = Standardizer::new(1.0, 2.0); + let mut standardizer = StandardizerFitter::new(1.0, 2.0).fit(&inputs).unwrap(); let transformed = standardizer.transform(inputs).unwrap(); let new_mean = transformed.mean(Axes::Row); @@ -229,7 +237,7 @@ mod tests { fn inv_transform_identity_test() { let inputs = Matrix::new(2, 2, vec![-1.0f32, 2.0, 0.0, 3.0]); - let mut standardizer = Standardizer::new(1.0, 3.0); + let mut standardizer = StandardizerFitter::new(1.0, 3.0).fit(&inputs).unwrap(); let transformed = standardizer.transform(inputs.clone()).unwrap(); let original = standardizer.inv_transform(transformed).unwrap(); diff --git a/src/learning/dbscan.rs b/src/learning/dbscan.rs index c45b8577..e7bbdee7 100644 --- a/src/learning/dbscan.rs +++ b/src/learning/dbscan.rs @@ -41,6 +41,7 @@ use learning::error::{Error, ErrorKind}; use linalg::{Matrix, Vector, BaseMatrix}; use rulinalg::utils; +use rulinalg::matrix::Row; /// DBSCAN Model /// @@ -80,7 +81,7 @@ impl UnSupModel, Vector>> for DBSCAN { self.init_params(inputs.rows()); let mut cluster = 0; - for (idx, point) in inputs.iter_rows().enumerate() { + for (idx, point) in inputs.row_iter().enumerate() { let visited = self._visited[idx]; if !visited { @@ -108,12 +109,12 @@ impl UnSupModel, Vector>> for DBSCAN { &self.clusters) { let mut classes = Vec::with_capacity(inputs.rows()); - for input_point in inputs.iter_rows() { + for input_point in inputs.row_iter() { let mut distances = Vec::with_capacity(cluster_data.rows()); - for cluster_point in cluster_data.iter_rows() { + for cluster_point in cluster_data.row_iter() { let point_distance = - utils::vec_bin_op(input_point, cluster_point, |x, y| x - y); + utils::vec_bin_op(input_point.raw_slice(), cluster_point.raw_slice(), |x, y| x - y); distances.push(utils::dot(&point_distance, &point_distance).sqrt()); } @@ -182,7 +183,7 @@ impl DBSCAN { let visited = self._visited[*data_point_idx]; if !visited { self._visited[*data_point_idx] = true; - let data_point_row = unsafe { inputs.get_row_unchecked(*data_point_idx) }; + let data_point_row = unsafe { inputs.row_unchecked(*data_point_idx) }; let sub_neighbours = self.region_query(data_point_row, inputs); if sub_neighbours.len() >= self.min_points { @@ -193,13 +194,14 @@ impl DBSCAN { } - fn 
region_query(&self, point: &[f64], inputs: &Matrix) -> Vec { - debug_assert!(point.len() == inputs.cols(), + fn region_query(&self, point: Row, inputs: &Matrix) -> Vec { + debug_assert!(point.cols() == inputs.cols(), "point must be of same dimension as inputs"); let mut in_neighbourhood = Vec::new(); - for (idx, data_point) in inputs.iter_rows().enumerate() { - let point_distance = utils::vec_bin_op(data_point, point, |x, y| x - y); + for (idx, data_point) in inputs.row_iter().enumerate() { + //TODO: Use `MatrixMetric` when rulinalg#154 is fixed. + let point_distance = utils::vec_bin_op(data_point.raw_slice(), point.raw_slice(), |x, y| x - y); let dist = utils::dot(&point_distance, &point_distance).sqrt(); if dist < self.eps { @@ -227,7 +229,7 @@ impl DBSCAN { #[cfg(test)] mod tests { use super::DBSCAN; - use linalg::Matrix; + use linalg::{Matrix, BaseMatrix}; #[test] fn test_region_query() { @@ -235,7 +237,9 @@ mod tests { let inputs = Matrix::new(3, 2, vec![1.0, 1.0, 1.1, 1.9, 3.0, 3.0]); - let neighbours = model.region_query(&[1.0, 1.0], &inputs); + let m = matrix![1.0, 1.0]; + let row = m.row(0); + let neighbours = model.region_query(row, &inputs); assert!(neighbours.len() == 2); } @@ -246,7 +250,9 @@ mod tests { let inputs = Matrix::new(3, 2, vec![1.0, 1.0, 1.1, 1.9, 1.1, 1.1]); - let neighbours = model.region_query(&[1.0, 1.0], &inputs); + let m = matrix![1.0, 1.0]; + let row = m.row(0); + let neighbours = model.region_query(row, &inputs); assert!(neighbours.len() == 1); } diff --git a/src/learning/gmm.rs b/src/learning/gmm.rs index 4c88e9ce..028dccc2 100644 --- a/src/learning/gmm.rs +++ b/src/learning/gmm.rs @@ -32,6 +32,7 @@ //! ``` use linalg::{Matrix, MatrixSlice, Vector, BaseMatrix, BaseMatrixMut, Axes}; use rulinalg::utils; +use rulinalg::matrix::decomposition::{PartialPivLu}; use learning::{LearningResult, UnSupModel}; use learning::toolkit::rand_utils; @@ -165,7 +166,7 @@ impl GaussianMixtureModel { if mixture_weights.size() != k { Err(Error::new(ErrorKind::InvalidParameters, "Mixture weights must have length k.")) } else if mixture_weights.data().iter().any(|&x| x < 0f64) { - Err(Error::new(ErrorKind::InvalidParameters, "Mixture weights must have only non-negative entries.")) + Err(Error::new(ErrorKind::InvalidParameters, "Mixture weights must have only non-negative entries.")) } else { let sum = mixture_weights.sum(); let normalized_weights = mixture_weights / sum; @@ -233,9 +234,9 @@ impl GaussianMixtureModel { CovOption::Full | CovOption::Regularized(_) => { let means = inputs.mean(Axes::Row); let mut cov_mat = Matrix::zeros(inputs.cols(), inputs.cols()); - for (j, row) in cov_mat.iter_rows_mut().enumerate() { + for (j, mut row) in cov_mat.row_iter_mut().enumerate() { for (k, elem) in row.iter_mut().enumerate() { - *elem = inputs.iter_rows().map(|r| { + *elem = inputs.row_iter().map(|r| { (r[j] - means[j]) * (r[k] - means[k]) }).sum::(); } @@ -260,9 +261,10 @@ impl GaussianMixtureModel { if let Some(ref covars) = self.model_covars { for cov in covars { - // TODO: combine these. We compute det to get the inverse. - let covar_det = cov.det(); - let covar_inv = try!(cov.inverse().map_err(Error::from)); + let lup = PartialPivLu::decompose(cov.clone()).expect("Covariance could not be lup decomposed"); + let covar_det = lup.det(); + // TODO: We can probably remove this inverse for a more stable solve elsewhere. 
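+            // The single `PartialPivLu` factorization above supplies both the determinant
+            // and the inverse, replacing the separate `det()` and `inverse()` calls on `cov`.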
+ let covar_inv = try!(lup.inverse().map_err(Error::from)); cov_sqrt_dets.push(covar_det.sqrt()); cov_invs.push(covar_inv); @@ -309,10 +311,8 @@ impl GaussianMixtureModel { let mut new_means = membership_weights.transpose() * inputs; - for (mean, w) in new_means.iter_rows_mut().zip(sum_weights.data().iter()) { - for m in mean.iter_mut() { - *m /= *w; - } + for (mut mean, w) in new_means.row_iter_mut().zip(sum_weights.data().iter()) { + *mean /= *w; } let mut new_covs = Vec::with_capacity(self.comp_count); diff --git a/src/learning/gp.rs b/src/learning/gp.rs index 9dc28187..828e039d 100644 --- a/src/learning/gp.rs +++ b/src/learning/gp.rs @@ -131,9 +131,9 @@ impl GaussianProcess { let dim2 = m2.rows(); let mut ker_data = Vec::with_capacity(dim1 * dim2); - ker_data.extend(m1.iter_rows().flat_map(|row1| { - m2.iter_rows() - .map(move |row2| self.ker.kernel(row1, row2)) + ker_data.extend(m1.row_iter().flat_map(|row1| { + m2.row_iter() + .map(move |row2| self.ker.kernel(row1.raw_slice(), row2.raw_slice())) })); Ok(Matrix::new(dim1, dim2, ker_data)) @@ -195,8 +195,8 @@ impl GaussianProcess { let test_mat = try!(self.ker_mat(inputs, t_data)); let mut var_data = Vec::with_capacity(inputs.rows() * inputs.cols()); - for row in test_mat.iter_rows() { - let test_point = Vector::new(row.to_vec()); + for row in test_mat.row_iter() { + let test_point = Vector::new(row.raw_slice()); var_data.append(&mut t_mat.solve_l_triangular(test_point).unwrap().into_vec()); } diff --git a/src/learning/k_means.rs b/src/learning/k_means.rs index 54711d5d..3998fdab 100644 --- a/src/learning/k_means.rs +++ b/src/learning/k_means.rs @@ -330,7 +330,7 @@ impl Initializer for KPlusPlus { let first_cen = rng.gen_range(0usize, inputs.rows()); unsafe { - init_centroids.extend_from_slice(inputs.get_row_unchecked(first_cen)); + init_centroids.extend_from_slice(inputs.row_unchecked(first_cen).raw_slice()); } for i in 1..k { @@ -350,7 +350,7 @@ impl Initializer for KPlusPlus { } let next_cen = sample_discretely(dist); - init_centroids.extend_from_slice(inputs.get_row_unchecked(next_cen)); + init_centroids.extend_from_slice(inputs.row_unchecked(next_cen).raw_slice()); } } diff --git a/src/learning/naive_bayes.rs b/src/learning/naive_bayes.rs index 4fa2ebde..e2510087 100644 --- a/src/learning/naive_bayes.rs +++ b/src/learning/naive_bayes.rs @@ -152,9 +152,9 @@ impl NaiveBayes { self.class_counts = vec![0; class_count]; let mut class_data = vec![Vec::new(); class_count]; - for (idx, row) in targets.iter_rows().enumerate() { + for (idx, row) in targets.row_iter().enumerate() { // Find the class of this input - let class = try!(NaiveBayes::::find_class(row)); + let class = try!(NaiveBayes::::find_class(row.raw_slice())); // Note the class of the input class_data[class].push(idx); @@ -199,9 +199,9 @@ impl NaiveBayes { fn get_classes(log_probs: Matrix) -> Vec { let mut data_classes = Vec::with_capacity(log_probs.rows()); - data_classes.extend(log_probs.iter_rows().map(|row| { + data_classes.extend(log_probs.row_iter().map(|row| { // Argmax each class log-probability per input - let (class, _) = utils::argmax(row); + let (class, _) = utils::argmax(row.raw_slice()); class })); diff --git a/src/learning/nnet.rs b/src/learning/nnet.rs index de8e9b4a..3d926465 100644 --- a/src/learning/nnet.rs +++ b/src/learning/nnet.rs @@ -247,7 +247,7 @@ impl<'a, T: Criterion> BaseNeuralNet<'a, T> { /// Gets the weights for a layer excluding the bias weights. 
fn get_non_bias_weights(&self, weights: &[f64], idx: usize) -> MatrixSlice { let layer_weights = self.get_layer_weights(weights, idx); - layer_weights.reslice([1, 0], layer_weights.rows() - 1, layer_weights.cols()) + layer_weights.sub_slice([1, 0], layer_weights.rows() - 1, layer_weights.cols()) } /// Compute the gradient using the back propagation algorithm. diff --git a/src/learning/svm.rs b/src/learning/svm.rs index 3287db0e..6d5622a3 100644 --- a/src/learning/svm.rs +++ b/src/learning/svm.rs @@ -111,9 +111,9 @@ impl SVM { let dim2 = m2.rows(); let mut ker_data = Vec::with_capacity(dim1 * dim2); - ker_data.extend(m1.iter_rows().flat_map(|row1| { - m2.iter_rows() - .map(move |row2| self.ker.kernel(row1, row2)) + ker_data.extend(m1.row_iter().flat_map(|row1| { + m2.row_iter() + .map(move |row2| self.ker.kernel(row1.raw_slice(), row2.raw_slice())) })); Ok(Matrix::new(dim1, dim2, ker_data)) @@ -154,8 +154,8 @@ impl SupModel, Vector> for SVM { for t in 0..self.optim_iters { let i = rng.gen_range(0, n); let row_i = full_inputs.select_rows(&[i]); - let sum = full_inputs.iter_rows() - .fold(0f64, |sum, row| sum + self.ker.kernel(row_i.data(), row)) * + let sum = full_inputs.row_iter() + .fold(0f64, |sum, row| sum + self.ker.kernel(row_i.data(), row.raw_slice())) * targets[i] / (self.lambda * (t as f64)); if sum < 1f64 { diff --git a/src/learning/toolkit/kernel.rs b/src/learning/toolkit/kernel.rs index 7df3231b..c13e543d 100644 --- a/src/learning/toolkit/kernel.rs +++ b/src/learning/toolkit/kernel.rs @@ -5,7 +5,7 @@ use std::ops::{Add, Mul}; use linalg::Vector; -use linalg::Metric; +use linalg::norm::{Euclidean, VectorNorm, VectorMetric}; use rulinalg::utils; /// The Kernel trait @@ -350,7 +350,7 @@ impl Kernel for Exponential { let diff = Vector::new(x1.to_vec()) - Vector::new(x2.to_vec()); - let x = -diff.norm() / (2f64 * self.ls * self.ls); + let x = -Euclidean.norm(&diff) / (2f64 * self.ls * self.ls); (self.ampl * x.exp()) } } @@ -452,9 +452,7 @@ impl Kernel for Multiquadric { fn kernel(&self, x1: &[f64], x2: &[f64]) -> f64 { assert_eq!(x1.len(), x2.len()); - let diff = Vector::new(x1.to_vec()) - Vector::new(x2.to_vec()); - - diff.norm().hypot(self.c) + Euclidean.metric(&(x1.into()), &(x2.into())).hypot(self.c) } } diff --git a/src/learning/toolkit/regularization.rs b/src/learning/toolkit/regularization.rs index 3c9d58be..395d36bd 100644 --- a/src/learning/toolkit/regularization.rs +++ b/src/learning/toolkit/regularization.rs @@ -14,7 +14,7 @@ //! let reg = Regularization::L1(0.5); //! ``` -use linalg::Metric; +use linalg::norm::{Euclidean, Lp, MatrixNorm}; use linalg::{Matrix, MatrixSlice, BaseMatrix}; use libnum::{FromPrimitive, Float}; @@ -57,9 +57,7 @@ impl Regularization { } fn l1_reg_cost(mat: &MatrixSlice, x: T) -> T { - // TODO: This won't be regularized. Need to unroll... 
- let l1_norm = mat.iter() - .fold(T::zero(), |acc, y| acc + y.abs()); + let l1_norm = Lp::Integer(1).norm(mat); l1_norm * x / ((T::one() + T::one()) * FromPrimitive::from_usize(mat.rows()).unwrap()) } @@ -78,7 +76,7 @@ impl Regularization { } fn l2_reg_cost(mat: &MatrixSlice, x: T) -> T { - mat.norm() * x / ((T::one() + T::one()) * FromPrimitive::from_usize(mat.rows()).unwrap()) + Euclidean.norm(mat) * x / ((T::one() + T::one()) * FromPrimitive::from_usize(mat.rows()).unwrap()) } fn l2_reg_grad(mat: &MatrixSlice, x: T) -> Matrix { @@ -90,7 +88,7 @@ impl Regularization { mod tests { use super::Regularization; use linalg::{Matrix, BaseMatrix}; - use linalg::Metric; + use linalg::norm::{Euclidean, MatrixNorm}; #[test] fn test_no_reg() { @@ -138,7 +136,7 @@ mod tests { let a = no_reg.reg_cost(mat_slice); let b = no_reg.reg_grad(mat_slice); - assert!((a - (input_mat.norm() / 12f64)) < 1e-18); + assert!((a - (Euclidean.norm(&input_mat) / 12f64)) < 1e-18); let true_grad = &input_mat / 6f64; for eps in (b - true_grad).into_vec() { @@ -156,7 +154,7 @@ mod tests { let a = no_reg.reg_cost(mat_slice); let b = no_reg.reg_grad(mat_slice); - assert!(a - ((input_mat.norm() / 24f64) + (42f64 / 12f64)) < 1e-18); + assert!(a - ((Euclidean.norm(&input_mat) / 24f64) + (42f64 / 12f64)) < 1e-18); let l1_true_grad = Matrix::new(3, 4, vec![-1., -1., -1., 1., 1., 1., 1., 1., 1., 1., 1., 1.] diff --git a/src/lib.rs b/src/lib.rs index a822f58a..397a9646 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,6 +25,7 @@ //! - Gaussian Mixture Models //! - Naive Bayes Classifiers //! - DBSCAN +//! - Latent Dirichlet Allocation //! //! ### linalg //! @@ -118,7 +119,7 @@ pub mod prelude; pub mod linalg { pub use rulinalg::matrix::{Axes, Matrix, MatrixSlice, MatrixSliceMut, BaseMatrix, BaseMatrixMut}; pub use rulinalg::vector::Vector; - pub use rulinalg::Metric; + pub use rulinalg::norm; } /// Module for data handling diff --git a/tests/linalg/mat.rs b/tests/linalg/mat.rs deleted file mode 100644 index e69de29b..00000000