From a0ad101ff49b485347b901bdd6fbc05c505e6cdb Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Wed, 1 Jul 2020 15:27:17 +0100 Subject: [PATCH 01/24] [WIP] Ngrams: parity with ngrams and everygram but not with skipgram --- src/lib.rs | 1 + src/ngram_utils/mod.rs | 459 +++++++++++++++++++++++++++++++++++++++ src/ngram_utils/tests.rs | 210 ++++++++++++++++++ 3 files changed, 670 insertions(+) create mode 100644 src/ngram_utils/mod.rs create mode 100644 src/ngram_utils/tests.rs diff --git a/src/lib.rs b/src/lib.rs index b1a68fc..9934a78 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,6 +41,7 @@ assert_eq!(tokens, vec!["Flights", "ca", "n't", "depart", "after", "2:00", "pm", pub mod errors; mod math; pub mod metrics; +pub mod ngram_utils; pub mod tokenize; pub mod tokenize_sentence; pub mod vectorize; diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs new file mode 100644 index 0000000..da7b651 --- /dev/null +++ b/src/ngram_utils/mod.rs @@ -0,0 +1,459 @@ +#[cfg(test)] +mod tests; + +use std::cmp::{min, max}; +use std::collections::VecDeque; +use std::iter; + +fn pad_items<'a>( + items: Box + 'a>, + n: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, +) -> Box + 'a> { + let left_chained: Box>; + let all_chained: Box>; + + match pad_left { + Some(s) => { + let pad_left_iter = iter::repeat(s).take(n - 1); + left_chained = Box::new(pad_left_iter.chain(items)); + } + None => { + left_chained = items; + } + } + + match pad_right { + Some(s) => { + let pad_right_iter = iter::repeat(s).take(n - 1); + all_chained = Box::new(left_chained.chain(pad_right_iter)); + } + None => { + all_chained = left_chained; + } + } + + all_chained +} + +enum IterMode { + Start, + PadLeft, + Main, + MainEnd, + PadRight, +} + +struct KSkipNGramsIter<'a> { + // Params + items: Box + 'a>, + min_n: usize, + max_n: usize, + min_k: usize, + max_k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + + // Iterator state + window: VecDeque<&'a str>, + window_end: VecDeque<&'a str>, + n: usize, // length outputted last + k: usize, + p: usize, + mode: IterMode, + first: bool, +} + +impl<'a> Iterator for KSkipNGramsIter<'a> { + type Item = Vec<&'a str>; + + fn next(&mut self) -> Option { + return match &self.mode { + IterMode::Start => { + self.start_mode_pad_left(); + self.next() + } + + IterMode::PadLeft => { + if self.pad_left.is_some() && self.max_n > 1 { + let next = self.next_gram_pad_left(); + match &next { + Some(_e) => next, + None => { + self.start_mode_main(); + self.next() + } + } + } else { + self.start_mode_main(); + self.next() + } + } + + IterMode::Main => { + let next = self.next_gram_main(); + match &next { + Some(_e) => next, + None => { + self.start_mode_main_end(); + self.next() + } + } + } + + IterMode::MainEnd => { + if self.min_n != self.max_n || self.max_k > 0 { + let next = self.next_gram_main_end(); + match &next { + Some(_e) => next, + None => { + self.start_mode_pad_right(); + self.next() + } + } + } else { + self.start_mode_pad_right(); + self.next() + } + } + + IterMode::PadRight => { + if self.pad_right.is_some() && self.max_n > 1 { + self.next_gram_pad_right() + } else { + return None; + } + } + }; + } +} + +impl<'a> KSkipNGramsIter<'a> { + // Switching between modes + fn start_mode_pad_left(&mut self) { + self.mode = IterMode::PadLeft; + self.first = true; + } + + fn start_mode_main(&mut self) { + self.mode = IterMode::Main; + self.first = true; + } + + fn start_mode_main_end(&mut self) { + self.mode = IterMode::MainEnd; + self.window_end = self.window.clone(); + self.window.pop_front(); + self.first = true; + } + + fn start_mode_pad_right(&mut self) { + self.mode = IterMode::PadRight; + self.window = self.window_end.clone(); + self.first = true; + } + + // Next gram + fn next_gram_pad_left(&mut self) -> Option> { + self.next_params_pad_left()?; + + let slice_idx = (self.k..self.window.len()) + .step_by(self.k + 1) + .take(self.n - self.p); + let grams = self.construct_grams_vec(slice_idx); + return Some(grams); + } + + fn next_gram_pad_right(&mut self) -> Option> { + self.next_params_pad_right()?; + + let slice_idx = (0..self.window.len() - self.k) + .rev() + .step_by(self.k + 1) + .take(self.n - self.p) + .rev(); + let grams = self.construct_grams_vec(slice_idx); + return Some(grams); + } + + fn next_gram_main(&mut self) -> Option> { + let finished = self.next_state_pad_main(); + + if finished.is_none() { + self.forward_window()?; + self.first = true; + return self.next_gram_main(); + } + + // Get slice + let slice_idx = (0..self.window.len()).step_by(self.k + 1).take(self.n); + let grams = self.construct_grams_vec(slice_idx); + return Some(grams); + } + + fn next_gram_main_end(&mut self) -> Option> { + let finished = self.next_state_pad_main(); + + if finished.is_none() { + self.pop_window()?; + self.first = true; + return self.next_gram_main_end(); + } + + // Get slice + let slice_idx = (0..self.window.len()).step_by(self.k + 1).take(self.n); + let grams = self.construct_grams_vec(slice_idx); + return if grams.len() == self.n { // `take` takes n or less + Some(grams) + } else { + None + } + + } + + fn forward_window(&mut self) -> Option<()> { + // Need to forward window when yielded ngram of max-length and max-skip-size + let next_item = self.items.next(); + + return match next_item { + None => None, + Some(s) => { + self.window.pop_front(); + self.window.push_back(s); + Some(()) // Successfully forwarded window + } + } + } + + fn pop_window(&mut self) -> Option<()> { + // Pop item from window + return if self.window.len() >= 2 { + self.window.pop_front(); + Some(()) + } else { + None + }; + } + + fn next_params_pad_left(&mut self) -> Option<()> { + // Equivalent to a for-loop: + // for n in max(self.min_n, 2)..self.max_n + 1 + // for k in self.min_k..self.max_k + 1 + // for p in (n-1)..0 // decreasing + // next_gram(n, k, p) + return if self.first { + self.n = max(self.min_n, 2); + self.k = self.min_k; + self.p = self.n - 1; + self.first = false; + Some(()) + } else if self.p > 1 { + self.p -= 1; + Some(()) + } else if self.k < self.max_k { + self.k += 1; + self.p = self.n - 1; + Some(()) + } else if self.n < self.max_n { + self.n += 1; + self.k = self.min_k; + self.p = self.n - 1; + Some(()) + } else { + None + } + } + + fn next_params_pad_right(&mut self) -> Option<()> { + // Equivalent to a for-loop: + // for n in max(self.min_n, 2)..self.max_n + 1 + // for k in self.min_k..self.max_k + 1 + // for p in 1..n + // next_gram(n, k, p) + return if self.first { + self.n = max(self.min_n, 2); + self.k = self.min_k; + self.p = 1; + self.first = false; + Some(()) + } else if self.p < self.n - 1 { + self.p += 1; + Some(()) + } else if self.k < self.max_k { + self.k += 1; + self.p = 1; + Some(()) + } else if self.n < self.max_n { + self.n += 1; + self.k = self.min_k; + self.p = 1; + Some(()) + } else { + None + } + } + + fn next_state_pad_main(&mut self) -> Option<()> { + // Equivalent to a for-loop: + // for n in self.min_n..self.max_n + 1 + // for k in self.min_k..self.max_k + 1 + // next_gram(n, k, p) + return if self.first { + self.n = self.min_n; + self.k = self.min_k; + self.first = false; + Some(()) + } else if self.k < self.max_k { + self.k += 1; + Some(()) + } else if self.n < min(self.max_n, self.window.len()) { + self.k = self.min_k; + self.n += 1; + Some(()) + } else { + None + } + } + + fn construct_grams_vec( + &mut self, + slice_idx: impl ExactSizeIterator, + ) -> Vec<&'a str> { + let grams = self.vec_from_idx(slice_idx); + + return match self.mode { + IterMode::PadLeft => { + // Add padding to the left + [ + iter::repeat(self.pad_left.unwrap()).take(self.p).collect(), + grams, + ] + .concat() + } + + IterMode::PadRight => { + // Add padding to the right + [ + grams, + iter::repeat(self.pad_right.unwrap()).take(self.p).collect(), + ] + .concat() + } + + _ => grams, + }; + } + + fn vec_from_idx(&mut self, slice_idx: impl ExactSizeIterator) -> Vec<&'a str> { + let mut grams = Vec::with_capacity(slice_idx.len()); + for idx in slice_idx { + grams.push(self.window[idx].clone()); + } + grams + } +} + +fn build_window<'a>( + items: &mut Box + 'a>, + max_n: usize, + max_k: usize, +) -> Result, &'static str> { + let window_size = (max_n - 1) * (max_k + 1) + 1; + let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); + + // Populate window + let mut i = window_size; + while i > 0 { + let next_item = items.next(); + match next_item { + None => { + return Err("Items length is smaller than what is required by `max_n` and `max_k`") + } + Some(s) => { + window.push_back(s); + } + } + i -= 1; + } + Ok(window) +} + +fn build_k_skip_n_grams_iter<'a>( + mut items: Box + 'a>, + min_n: usize, + max_n: usize, + min_k: usize, + max_k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, +) -> Result> + 'a>, &'a str> { + if min_n < 1 { + return Err("`min_n` must be greater than or equal to 1"); + } + if min_n > max_n { + return Err("`max_n` must be greater than or equal to `min_n`"); + } + if min_k > max_k { + return Err("`max_k` must be greater than or equal to `min_k`"); + } + + let window = build_window(&mut items, max_n, max_k)?; + + Ok(Box::new(KSkipNGramsIter { + // Params + items, + min_n, + max_n, + min_k, + max_k, + pad_left, + pad_right, + + // Iterator state + window, + window_end: VecDeque::new(), + n: 0, // length outputted last + k: 0, + p: 0, + mode: IterMode::Start, + first: false, + })) +} + +fn bigram<'a>( + items: Box + 'a>, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, +) -> Result> + 'a>, &'a str> { + build_k_skip_n_grams_iter(items, 2, 2, 0, 0, pad_left, pad_right) +} + +fn ngrams<'a>( + items: Box + 'a>, + n: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, +) -> Result> + 'a>, &'a str> { + build_k_skip_n_grams_iter(items, n, n, 0, 0, pad_left, pad_right) +} + +fn everygrams<'a>( + items: Box + 'a>, + min_length: usize, + max_length: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, +) -> Result> + 'a>, &'a str> { + build_k_skip_n_grams_iter(items, min_length, max_length, 0, 0, pad_left, pad_right) +} + +fn skipgrams<'a>( + items: Box + 'a>, + n: usize, + k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, +) -> Result> + 'a>, &'a str> { + build_k_skip_n_grams_iter(items, n, n, 0, k, pad_left, pad_right) +} diff --git a/src/ngram_utils/tests.rs b/src/ngram_utils/tests.rs new file mode 100644 index 0000000..4ab1bfb --- /dev/null +++ b/src/ngram_utils/tests.rs @@ -0,0 +1,210 @@ +use crate::ngram_utils::*; + +#[test] +fn test_padding() { + let sent = "Marry had a little lamb".split(" "); + + let output: Vec<&str> = + pad_items(Box::new(sent.clone()), 3, Some(""), Some("")).collect(); + let expected = vec![ + "", "", "Marry", "had", "a", "little", "lamb", "", "", + ]; + assert_eq!(output, expected); + + let output: Vec<&str> = pad_items(Box::new(sent.clone()), 2, Some(""), None).collect(); + let expected = vec!["", "Marry", "had", "a", "little", "lamb"]; + assert_eq!(output, expected); + + let output: Vec<&str> = pad_items(Box::new(sent.clone()), 2, None, Some("")).collect(); + let expected = vec!["Marry", "had", "a", "little", "lamb", ""]; + assert_eq!(output, expected); +} + +#[test] +fn test_bigram() { + let sent = "Marry had a little lamb".split(" "); + + let output_iter = bigram(Box::new(sent), None, None).unwrap(); + let output: Vec> = output_iter.collect(); + + let expected = vec![ + vec!["Marry", "had"], + vec!["had", "a"], + vec!["a", "little"], + vec!["little", "lamb"], + ]; + + assert_eq!(output, expected); +} + +#[test] +fn test_trigram() { + let sent = "Marry had a little lamb".split(" "); + + let output_iter = ngrams(Box::new(sent.clone()), 3, Some(""), Some("")).unwrap(); + let output: Vec> = output_iter.collect(); + + let expected = vec![ + vec!["", "", "Marry"], + vec!["", "Marry", "had"], + vec!["Marry", "had", "a"], + vec!["had", "a", "little"], + vec!["a", "little", "lamb"], + vec!["little", "lamb", ""], + vec!["lamb", "", ""], + ]; + + assert_eq!(output, expected); + + let output_iter = ngrams(Box::new(sent.clone()), 3, None, Some("")).unwrap(); + let output: Vec> = output_iter.collect(); + + let expected = vec![ + vec!["Marry", "had", "a"], + vec!["had", "a", "little"], + vec!["a", "little", "lamb"], + vec!["little", "lamb", ""], + vec!["lamb", "", ""], + ]; + + assert_eq!(output, expected); +} + +#[test] +fn test_ngrams() { + let sent = "Marry had a little lamb".split(" "); + + let output_iter = ngrams(Box::new(sent), 4, Some(""), Some("")).unwrap(); + let output: Vec> = output_iter.collect(); + + let expected = vec![ + vec!["", "", "", "Marry"], + vec!["", "", "Marry", "had"], + vec!["", "Marry", "had", "a"], + vec!["Marry", "had", "a", "little"], + vec!["had", "a", "little", "lamb"], + vec!["a", "little", "lamb", ""], + vec!["little", "lamb", "", ""], + vec!["lamb", "", "", ""], + ]; + + assert_eq!(output, expected); +} + +#[test] +fn test_everygram() { + let sent = "Marry had a little lamb".split(" "); + + let output_iter = everygrams(Box::new(sent), 1, 3, Some(""), Some("")).unwrap(); + let output: Vec> = output_iter.collect(); + + for e in &output { + println!("vec!{:?},", e); + } + + let expected = vec![ + vec!["", "Marry"], + vec!["", "", "Marry"], + vec!["", "Marry", "had"], + vec!["Marry"], + vec!["Marry", "had"], + vec!["Marry", "had", "a"], + vec!["had"], + vec!["had", "a"], + vec!["had", "a", "little"], + vec!["a"], + vec!["a", "little"], + vec!["a", "little", "lamb"], + vec!["little"], + vec!["little", "lamb"], + vec!["lamb"], + vec!["lamb", ""], + vec!["little", "lamb", ""], + vec!["lamb", "", ""], + ]; + + assert_eq!(output, expected); +} + +#[test] +fn test_skipgram() { + let sent = "Marry had a little lamb".split(" "); + + let output_iter = skipgrams(Box::new(sent.clone()), 2, 1, Some(""), Some("")).unwrap(); + let output: Vec> = output_iter.collect(); + + let expected = vec![ + vec!["", "Marry"], + vec!["", "had"], + vec!["Marry", "had"], + vec!["Marry", "a"], + vec!["had", "a"], + vec!["had", "little"], + vec!["a", "little"], + vec!["a", "lamb"], + vec!["little", "lamb"], + vec!["lamb", ""], + vec!["little", ""], + ]; + + assert_eq!(output, expected); + + let output_iter = skipgrams(Box::new(sent.clone()), 3, 1, Some(""), Some("")).unwrap(); + let output: Vec> = output_iter.collect(); + + for e in &output { + println!("vec!{:?}", e); + } + + let expected = vec![ + vec!["", "", "Marry"], + vec!["", "", "had"], + vec!["", "Marry", "had"], + vec!["", "Marry", "had"], + vec!["", "Marry", "a"], + vec!["", "had", "a"], + vec!["Marry", "had", "a"], + vec!["Marry", "had", "little"], + vec!["Marry", "a", "little"], + vec!["had", "a", "little"], + vec!["had", "a", "lamb"], + vec!["had", "little", "lamb"], + vec!["a", "little", "lamb"], + vec!["a", "little", ""], + vec!["a", "lamb", ""], + vec!["little", "lamb", ""], + vec!["little", "lamb", ""], + vec!["little", "", ""], + vec!["lamb", "", ""], + ]; + + assert_eq!(output, expected); +} + +#[test] +fn test_ngram_edge_cases() { + let sent = "Marry had a little lamb".split(" "); + + let output_iter = build_k_skip_n_grams_iter( + Box::new(sent.clone()), 1, 1, 1, 1, Some(""), Some("")).unwrap(); + let output: Vec> = output_iter.collect(); + + let expected = vec![ + vec!["Marry"], + vec!["had"], + vec!["a"], + vec!["little"], + vec!["lamb"], + ]; + + assert_eq!(output, expected); + + let output_iter = build_k_skip_n_grams_iter( + Box::new(sent.clone()), 1, 1, 2, 2, Some(""), Some("")).unwrap(); + let output: Vec> = output_iter.collect(); + + assert_eq!(output, expected); +} + + +// TODO: character ngram \ No newline at end of file From 76010c0c01e7ea787f5dbf0d0e4d65211077bd08 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Fri, 3 Jul 2020 11:25:41 +0100 Subject: [PATCH 02/24] [WIP] skipgram n=3, k=1 parity with nltk --- src/ngram_utils/mod.rs | 269 ++++++++++++++++++++++++++++++++++----- src/ngram_utils/tests.rs | 68 +++++++--- 2 files changed, 283 insertions(+), 54 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index da7b651..5123672 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -5,6 +5,8 @@ use std::cmp::{min, max}; use std::collections::VecDeque; use std::iter; +use std::iter::Peekable; + fn pad_items<'a>( items: Box + 'a>, n: usize, @@ -37,6 +39,61 @@ fn pad_items<'a>( all_chained } +struct SkipVecIter { + prev: Vec, + n: usize, + k: usize, + first: bool, +} + +impl SkipVecIter { + pub fn new(n: usize, k: usize) -> SkipVecIter { + SkipVecIter { + prev: vec![0; n], + n, + k, + first: true + } + } + + pub fn new_empty() -> SkipVecIter { + SkipVecIter { + prev: Vec::new(), + n: 0, + k: 0, + first: false + } + } +} + +impl Iterator for SkipVecIter { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.first { + self.first = false; + return Some(self.prev.clone()); + } else { + for i in (0..self.n).rev() { + let e = self.prev[i]; + if e < self.k { + self.prev[i] += 1; + for j in i+1..self.n { + self.prev[j] = 0; + } + let sum: usize = self.prev.iter().sum(); + if sum <= self.k { + return Some(self.prev.clone()); + } else { + return self.next() + } + } + } + return None; + } + } +} + enum IterMode { Start, PadLeft, @@ -61,6 +118,7 @@ struct KSkipNGramsIter<'a> { n: usize, // length outputted last k: usize, p: usize, + combinations: Peekable, mode: IterMode, first: bool, } @@ -158,9 +216,26 @@ impl<'a> KSkipNGramsIter<'a> { fn next_gram_pad_left(&mut self) -> Option> { self.next_params_pad_left()?; - let slice_idx = (self.k..self.window.len()) - .step_by(self.k + 1) - .take(self.n - self.p); + // let slice_idx = (self.k..self.window.len()) + // .step_by(self.k + 1) + // .take(self.n - self.p); + // let grams = self.construct_grams_vec(slice_idx); + // return Some(grams); + + + let mut slice_idx: Vec = Vec::with_capacity(self.n); + let mut i = 0; + let spacing = self.combinations.next().unwrap(); + println!(""); + for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone + if j == 0 { + i += e; + } else { + i += e+1; + } + slice_idx.push(i); + } + println!("LP {:?} {:?} {}", spacing, slice_idx, self.window.len()); let grams = self.construct_grams_vec(slice_idx); return Some(grams); } @@ -168,11 +243,38 @@ impl<'a> KSkipNGramsIter<'a> { fn next_gram_pad_right(&mut self) -> Option> { self.next_params_pad_right()?; - let slice_idx = (0..self.window.len() - self.k) - .rev() - .step_by(self.k + 1) - .take(self.n - self.p) - .rev(); + // let slice_idx = (0..self.window.len() - self.k) + // .rev() + // .step_by(self.k + 1) + // .take(self.n - self.p) + // .rev(); + // let grams = self.construct_grams_vec(slice_idx.collect::>()); + // return Some(grams); + + + let mut slice_idx: Vec = Vec::with_capacity(self.n); + + let spacing = self.combinations.next().unwrap(); + let end_idx = self.window.len()-1; + let mut i = end_idx; + for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone + if j == 0 { + i -= e; + } else { + i -= e+1; + } + slice_idx.push(i); + } + + slice_idx.reverse(); + + for i in slice_idx.clone() { + if i > end_idx { + println!(); + } + } + + println!("LP {:?} {:?} {}", spacing, slice_idx, self.window.len()); let grams = self.construct_grams_vec(slice_idx); return Some(grams); } @@ -187,7 +289,18 @@ impl<'a> KSkipNGramsIter<'a> { } // Get slice - let slice_idx = (0..self.window.len()).step_by(self.k + 1).take(self.n); + //let slice_idx = (0..self.window.len()).step_by(self.k + 1).take(self.n); + + let mut slice_idx: Vec = Vec::with_capacity(self.n); + slice_idx.push(0); + let mut i = 0; + let spacing = self.combinations.next().unwrap(); + + for e in spacing.clone() { // TODO remove clone + i += e+1; + slice_idx.push(i); + } + println!("{:?} {:?} {}", spacing, slice_idx, self.window.len()); let grams = self.construct_grams_vec(slice_idx); return Some(grams); } @@ -196,15 +309,44 @@ impl<'a> KSkipNGramsIter<'a> { let finished = self.next_state_pad_main(); if finished.is_none() { + // if self.window.len() >= 4 { + // self.pop_window()?; + // self.first = true; + // return self.next_gram_main_end(); + // } else if self.window.len() == 1 { + // return None + // } else { + // let grams = Vec::from(self.window.clone()); + // self.pop_window(); + // return Some(grams) + // } + self.pop_window()?; self.first = true; return self.next_gram_main_end(); } // Get slice - let slice_idx = (0..self.window.len()).step_by(self.k + 1).take(self.n); + // let slice_idx = (0..self.window.len()).step_by(self.k + 1).take(self.n); + // let grams = self.construct_grams_vec(slice_idx); + + // return if grams.len() == self.n { // `take` takes n or less + // Some(grams) + // } else { + // None + // } + + let mut slice_idx: Vec = Vec::with_capacity(self.n); + slice_idx.push(0); + let mut i = 0; + for e in self.combinations.next().unwrap() { + i += e+1; + slice_idx.push(i); + } + println!("{:?}, {}", slice_idx, self.window.len()); let grams = self.construct_grams_vec(slice_idx); - return if grams.len() == self.n { // `take` takes n or less + + return if grams.len() == self.n { // TODO: why? Some(grams) } else { None @@ -239,26 +381,40 @@ impl<'a> KSkipNGramsIter<'a> { fn next_params_pad_left(&mut self) -> Option<()> { // Equivalent to a for-loop: // for n in max(self.min_n, 2)..self.max_n + 1 - // for k in self.min_k..self.max_k + 1 + // -- for k in self.min_k..self.max_k + 1 // for p in (n-1)..0 // decreasing - // next_gram(n, k, p) + // for combi in combinations: + // next_gram(n, k, p) return if self.first { self.n = max(self.min_n, 2); - self.k = self.min_k; + //self.k = self.min_k; self.p = self.n - 1; + + let pick_n = self.n - self.p; + let skip_total = self.max_k; + self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + self.first = false; Some(()) + } else if self.combinations.peek().is_some() { + Some(()) } else if self.p > 1 { self.p -= 1; - Some(()) - } else if self.k < self.max_k { - self.k += 1; - self.p = self.n - 1; + + let pick_n = self.n - self.p; + let skip_total = self.max_k; + self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + Some(()) } else if self.n < self.max_n { self.n += 1; - self.k = self.min_k; + //self.k = self.min_k; self.p = self.n - 1; + + let pick_n = self.n - self.p; + let skip_total = self.max_k; + self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + Some(()) } else { None @@ -273,21 +429,34 @@ impl<'a> KSkipNGramsIter<'a> { // next_gram(n, k, p) return if self.first { self.n = max(self.min_n, 2); - self.k = self.min_k; + //self.k = self.min_k; self.p = 1; self.first = false; + + let pick_n = self.n - self.p; + let skip_total = self.max_k; + self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + + Some(()) + } else if self.combinations.peek().is_some() { Some(()) } else if self.p < self.n - 1 { self.p += 1; - Some(()) - } else if self.k < self.max_k { - self.k += 1; - self.p = 1; + + let pick_n = self.n - self.p; + let skip_total = self.max_k; + self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + Some(()) } else if self.n < self.max_n { self.n += 1; - self.k = self.min_k; + //self.k = self.min_k; self.p = 1; + + let pick_n = self.n - self.p; + let skip_total = self.max_k; + self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + Some(()) } else { None @@ -297,28 +466,57 @@ impl<'a> KSkipNGramsIter<'a> { fn next_state_pad_main(&mut self) -> Option<()> { // Equivalent to a for-loop: // for n in self.min_n..self.max_n + 1 - // for k in self.min_k..self.max_k + 1 - // next_gram(n, k, p) + // for combi in combination(window[1:], n) + // next_gram = window[0] + combi return if self.first { self.n = self.min_n; - self.k = self.min_k; + + let pick_n = min(self.max_n, self.window.len()) - 1; + let skip_total = min(self.window.len()-pick_n-1, self.max_k); + self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + self.first = false; Some(()) - } else if self.k < self.max_k { - self.k += 1; + } else if self.combinations.peek().is_some() { Some(()) } else if self.n < min(self.max_n, self.window.len()) { - self.k = self.min_k; self.n += 1; + + let pick_n = min(self.max_n, self.window.len()) - 1; + let skip_total = min(self.window.len()-pick_n-1, self.max_k); + self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + Some(()) } else { None } } + // fn next_state_pad_main_end(&mut self) -> Option<()> { + // // Equivalent to a for-loop: + // // for n in self.min_n..self.max_n + 1 + // // for k in self.min_k..self.max_k + 1 + // // next_gram(n, k, p) + // return if self.first { + // self.n = self.min_n; + // self.k = self.min_k; + // self.first = false; + // Some(()) + // } else if self.k < self.max_k { + // self.k += 1; + // Some(()) + // } else if self.n < min(self.max_n, self.window.len()) { + // self.k = self.min_k; + // self.n += 1; + // Some(()) + // } else { + // None + // } + // } + fn construct_grams_vec( &mut self, - slice_idx: impl ExactSizeIterator, + slice_idx: Vec, ) -> Vec<&'a str> { let grams = self.vec_from_idx(slice_idx); @@ -345,10 +543,10 @@ impl<'a> KSkipNGramsIter<'a> { }; } - fn vec_from_idx(&mut self, slice_idx: impl ExactSizeIterator) -> Vec<&'a str> { + fn vec_from_idx(&mut self, slice_idx: Vec) -> Vec<&'a str> { let mut grams = Vec::with_capacity(slice_idx.len()); - for idx in slice_idx { - grams.push(self.window[idx].clone()); + for idx in slice_idx.iter() { + grams.push(self.window[*idx].clone()); } grams } @@ -415,6 +613,7 @@ fn build_k_skip_n_grams_iter<'a>( window_end: VecDeque::new(), n: 0, // length outputted last k: 0, + combinations: SkipVecIter::new_empty().peekable(), p: 0, mode: IterMode::Start, first: false, diff --git a/src/ngram_utils/tests.rs b/src/ngram_utils/tests.rs index 4ab1bfb..88c1ef4 100644 --- a/src/ngram_utils/tests.rs +++ b/src/ngram_utils/tests.rs @@ -130,24 +130,24 @@ fn test_everygram() { fn test_skipgram() { let sent = "Marry had a little lamb".split(" "); - let output_iter = skipgrams(Box::new(sent.clone()), 2, 1, Some(""), Some("")).unwrap(); - let output: Vec> = output_iter.collect(); - - let expected = vec![ - vec!["", "Marry"], - vec!["", "had"], - vec!["Marry", "had"], - vec!["Marry", "a"], - vec!["had", "a"], - vec!["had", "little"], - vec!["a", "little"], - vec!["a", "lamb"], - vec!["little", "lamb"], - vec!["lamb", ""], - vec!["little", ""], - ]; - - assert_eq!(output, expected); + // let output_iter = skipgrams(Box::new(sent.clone()), 2, 1, Some(""), Some("")).unwrap(); + // let output: Vec> = output_iter.collect(); + // + // let expected = vec![ + // vec!["", "Marry"], + // vec!["", "had"], + // vec!["Marry", "had"], + // vec!["Marry", "a"], + // vec!["had", "a"], + // vec!["had", "little"], + // vec!["a", "little"], + // vec!["a", "lamb"], + // vec!["little", "lamb"], + // vec!["lamb", ""], + // vec!["little", ""], + // ]; + // + // assert_eq!(output, expected); let output_iter = skipgrams(Box::new(sent.clone()), 3, 1, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); @@ -163,6 +163,8 @@ fn test_skipgram() { vec!["", "Marry", "had"], vec!["", "Marry", "a"], vec!["", "had", "a"], + + //"Marry had a little lamb" vec!["Marry", "had", "a"], vec!["Marry", "had", "little"], vec!["Marry", "a", "little"], @@ -170,6 +172,8 @@ fn test_skipgram() { vec!["had", "a", "lamb"], vec!["had", "little", "lamb"], vec!["a", "little", "lamb"], + + vec!["a", "little", ""], vec!["a", "lamb", ""], vec!["little", "lamb", ""], @@ -206,5 +210,31 @@ fn test_ngram_edge_cases() { assert_eq!(output, expected); } +#[test] +fn test_skip_vec_iter() { + + let output: Vec> = SkipVecIter::new(3, 2).collect(); + + for e in &output { + println!("vec!{:?}", e); + } + + let expected = vec![ + vec![0, 0, 0], + vec![0, 0, 1], + vec![0, 0, 2], + vec![0, 1, 0], + vec![0, 1, 1], + vec![0, 2, 0], + vec![1, 0, 0], + vec![1, 0, 1], + vec![1, 1, 0], + vec![2, 0, 0], + ]; + assert_eq!(output, expected); + +} + -// TODO: character ngram \ No newline at end of file +// TODO: character ngram +// test with longer sentence \ No newline at end of file From eb373f7e8aa125937e353072e1e9a2103a9b9c66 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Fri, 3 Jul 2020 12:10:33 +0100 Subject: [PATCH 03/24] [WIP] parity with all nltk functions --- src/ngram_utils/mod.rs | 33 +++++----- src/ngram_utils/tests.rs | 132 ++++++++++++++++----------------------- 2 files changed, 70 insertions(+), 95 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index 5123672..ad5c99e 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -161,7 +161,7 @@ impl<'a> Iterator for KSkipNGramsIter<'a> { } IterMode::MainEnd => { - if self.min_n != self.max_n || self.max_k > 0 { + if (self.min_n != self.max_n || self.max_k > 0) && self.window.len() > 1 { let next = self.next_gram_main_end(); match &next { Some(_e) => next, @@ -226,7 +226,7 @@ impl<'a> KSkipNGramsIter<'a> { let mut slice_idx: Vec = Vec::with_capacity(self.n); let mut i = 0; let spacing = self.combinations.next().unwrap(); - println!(""); + for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone if j == 0 { i += e; @@ -235,7 +235,6 @@ impl<'a> KSkipNGramsIter<'a> { } slice_idx.push(i); } - println!("LP {:?} {:?} {}", spacing, slice_idx, self.window.len()); let grams = self.construct_grams_vec(slice_idx); return Some(grams); } @@ -268,13 +267,6 @@ impl<'a> KSkipNGramsIter<'a> { slice_idx.reverse(); - for i in slice_idx.clone() { - if i > end_idx { - println!(); - } - } - - println!("LP {:?} {:?} {}", spacing, slice_idx, self.window.len()); let grams = self.construct_grams_vec(slice_idx); return Some(grams); } @@ -300,7 +292,6 @@ impl<'a> KSkipNGramsIter<'a> { i += e+1; slice_idx.push(i); } - println!("{:?} {:?} {}", spacing, slice_idx, self.window.len()); let grams = self.construct_grams_vec(slice_idx); return Some(grams); } @@ -321,9 +312,14 @@ impl<'a> KSkipNGramsIter<'a> { // return Some(grams) // } - self.pop_window()?; - self.first = true; - return self.next_gram_main_end(); + return if self.window.len() > self.min_n { + self.pop_window()?; + self.first = true; + self.next_gram_main_end() + } else { + None + } + } // Get slice @@ -343,7 +339,6 @@ impl<'a> KSkipNGramsIter<'a> { i += e+1; slice_idx.push(i); } - println!("{:?}, {}", slice_idx, self.window.len()); let grams = self.construct_grams_vec(slice_idx); return if grams.len() == self.n { // TODO: why? @@ -471,7 +466,7 @@ impl<'a> KSkipNGramsIter<'a> { return if self.first { self.n = self.min_n; - let pick_n = min(self.max_n, self.window.len()) - 1; + let pick_n = min(self.n, self.window.len()) - 1; let skip_total = min(self.window.len()-pick_n-1, self.max_k); self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); @@ -482,7 +477,7 @@ impl<'a> KSkipNGramsIter<'a> { } else if self.n < min(self.max_n, self.window.len()) { self.n += 1; - let pick_n = min(self.max_n, self.window.len()) - 1; + let pick_n = min(self.n, self.window.len()) - 1; let skip_total = min(self.window.len()-pick_n-1, self.max_k); self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); @@ -566,7 +561,9 @@ fn build_window<'a>( let next_item = items.next(); match next_item { None => { - return Err("Items length is smaller than what is required by `max_n` and `max_k`") + // TODO: remove result + return Ok(window); + //return Err("Items length is smaller than what is required by `max_n` and `max_k`") } Some(s) => { window.push_back(s); diff --git a/src/ngram_utils/tests.rs b/src/ngram_utils/tests.rs index 88c1ef4..44e2eaf 100644 --- a/src/ngram_utils/tests.rs +++ b/src/ngram_utils/tests.rs @@ -2,33 +2,33 @@ use crate::ngram_utils::*; #[test] fn test_padding() { - let sent = "Marry had a little lamb".split(" "); + let sent = "Mary had a little lamb".split(" "); let output: Vec<&str> = pad_items(Box::new(sent.clone()), 3, Some(""), Some("")).collect(); let expected = vec![ - "", "", "Marry", "had", "a", "little", "lamb", "", "", + "", "", "Mary", "had", "a", "little", "lamb", "", "", ]; assert_eq!(output, expected); let output: Vec<&str> = pad_items(Box::new(sent.clone()), 2, Some(""), None).collect(); - let expected = vec!["", "Marry", "had", "a", "little", "lamb"]; + let expected = vec!["", "Mary", "had", "a", "little", "lamb"]; assert_eq!(output, expected); let output: Vec<&str> = pad_items(Box::new(sent.clone()), 2, None, Some("")).collect(); - let expected = vec!["Marry", "had", "a", "little", "lamb", ""]; + let expected = vec!["Mary", "had", "a", "little", "lamb", ""]; assert_eq!(output, expected); } #[test] fn test_bigram() { - let sent = "Marry had a little lamb".split(" "); + let sent = "Mary had a little lamb".split(" "); let output_iter = bigram(Box::new(sent), None, None).unwrap(); let output: Vec> = output_iter.collect(); let expected = vec![ - vec!["Marry", "had"], + vec!["Mary", "had"], vec!["had", "a"], vec!["a", "little"], vec!["little", "lamb"], @@ -39,15 +39,15 @@ fn test_bigram() { #[test] fn test_trigram() { - let sent = "Marry had a little lamb".split(" "); + let sent = "Mary had a little lamb".split(" "); let output_iter = ngrams(Box::new(sent.clone()), 3, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); let expected = vec![ - vec!["", "", "Marry"], - vec!["", "Marry", "had"], - vec!["Marry", "had", "a"], + vec!["", "", "Mary"], + vec!["", "Mary", "had"], + vec!["Mary", "had", "a"], vec!["had", "a", "little"], vec!["a", "little", "lamb"], vec!["little", "lamb", ""], @@ -60,7 +60,7 @@ fn test_trigram() { let output: Vec> = output_iter.collect(); let expected = vec![ - vec!["Marry", "had", "a"], + vec!["Mary", "had", "a"], vec!["had", "a", "little"], vec!["a", "little", "lamb"], vec!["little", "lamb", ""], @@ -72,16 +72,16 @@ fn test_trigram() { #[test] fn test_ngrams() { - let sent = "Marry had a little lamb".split(" "); + let sent = "Mary had a little lamb".split(" "); let output_iter = ngrams(Box::new(sent), 4, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); let expected = vec![ - vec!["", "", "", "Marry"], - vec!["", "", "Marry", "had"], - vec!["", "Marry", "had", "a"], - vec!["Marry", "had", "a", "little"], + vec!["", "", "", "Mary"], + vec!["", "", "Mary", "had"], + vec!["", "Mary", "had", "a"], + vec!["Mary", "had", "a", "little"], vec!["had", "a", "little", "lamb"], vec!["a", "little", "lamb", ""], vec!["little", "lamb", "", ""], @@ -93,22 +93,18 @@ fn test_ngrams() { #[test] fn test_everygram() { - let sent = "Marry had a little lamb".split(" "); + let sent = "Mary had a little lamb".split(" "); let output_iter = everygrams(Box::new(sent), 1, 3, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); - for e in &output { - println!("vec!{:?},", e); - } - let expected = vec![ - vec!["", "Marry"], - vec!["", "", "Marry"], - vec!["", "Marry", "had"], - vec!["Marry"], - vec!["Marry", "had"], - vec!["Marry", "had", "a"], + vec!["", "Mary"], + vec!["", "", "Mary"], + vec!["", "Mary", "had"], + vec!["Mary"], + vec!["Mary", "had"], + vec!["Mary", "had", "a"], vec!["had"], vec!["had", "a"], vec!["had", "a", "little"], @@ -128,58 +124,48 @@ fn test_everygram() { #[test] fn test_skipgram() { - let sent = "Marry had a little lamb".split(" "); - - // let output_iter = skipgrams(Box::new(sent.clone()), 2, 1, Some(""), Some("")).unwrap(); - // let output: Vec> = output_iter.collect(); - // - // let expected = vec![ - // vec!["", "Marry"], - // vec!["", "had"], - // vec!["Marry", "had"], - // vec!["Marry", "a"], - // vec!["had", "a"], - // vec!["had", "little"], - // vec!["a", "little"], - // vec!["a", "lamb"], - // vec!["little", "lamb"], - // vec!["lamb", ""], - // vec!["little", ""], - // ]; - // - // assert_eq!(output, expected); + let sent = "Mary had a little lamb".split(" "); - let output_iter = skipgrams(Box::new(sent.clone()), 3, 1, Some(""), Some("")).unwrap(); + let output_iter = skipgrams(Box::new(sent.clone()), 2, 1, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); - for e in &output { - println!("vec!{:?}", e); - } + let expected = vec![ + vec!["", "Mary"], + vec!["", "had"], + vec!["Mary", "had"], + vec!["Mary", "a"], + vec!["had", "a"], + vec!["had", "little"], + vec!["a", "little"], + vec!["a", "lamb"], + vec!["little", "lamb"], + vec!["lamb", ""], + vec!["little", ""], + ]; + + assert_eq!(output, expected); + + let output_iter = skipgrams(Box::new(sent.clone()), 3, 1, Some(""), Some("")).unwrap(); + let output: Vec> = output_iter.collect(); let expected = vec![ - vec!["", "", "Marry"], + vec!["", "", "Mary"], vec!["", "", "had"], - vec!["", "Marry", "had"], - vec!["", "Marry", "had"], - vec!["", "Marry", "a"], + vec!["", "Mary", "had"], + vec!["", "Mary", "a"], vec!["", "had", "a"], - - //"Marry had a little lamb" - vec!["Marry", "had", "a"], - vec!["Marry", "had", "little"], - vec!["Marry", "a", "little"], + vec!["Mary", "had", "a"], + vec!["Mary", "had", "little"], + vec!["Mary", "a", "little"], vec!["had", "a", "little"], vec!["had", "a", "lamb"], vec!["had", "little", "lamb"], vec!["a", "little", "lamb"], - - - vec!["a", "little", ""], - vec!["a", "lamb", ""], vec!["little", "lamb", ""], - vec!["little", "lamb", ""], - vec!["little", "", ""], + vec!["a", "lamb", ""], + vec!["a", "little", ""], vec!["lamb", "", ""], + vec!["little", "", ""], ]; assert_eq!(output, expected); @@ -187,14 +173,14 @@ fn test_skipgram() { #[test] fn test_ngram_edge_cases() { - let sent = "Marry had a little lamb".split(" "); + let sent = "Mary had a little lamb".split(" "); let output_iter = build_k_skip_n_grams_iter( Box::new(sent.clone()), 1, 1, 1, 1, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); let expected = vec![ - vec!["Marry"], + vec!["Mary"], vec!["had"], vec!["a"], vec!["little"], @@ -215,10 +201,6 @@ fn test_skip_vec_iter() { let output: Vec> = SkipVecIter::new(3, 2).collect(); - for e in &output { - println!("vec!{:?}", e); - } - let expected = vec![ vec![0, 0, 0], vec![0, 0, 1], @@ -233,8 +215,4 @@ fn test_skip_vec_iter() { ]; assert_eq!(output, expected); -} - - -// TODO: character ngram -// test with longer sentence \ No newline at end of file +} \ No newline at end of file From 062aefe481fe8efc51cca3a15e75faffd5a7736a Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Mon, 6 Jul 2020 10:16:20 +0100 Subject: [PATCH 04/24] [WIP] refactored using index combinations --- src/ngram_utils/mod.rs | 307 ++++++++++++++++++++++----------------- src/ngram_utils/tests.rs | 91 +++++++++++- 2 files changed, 265 insertions(+), 133 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index ad5c99e..0a6c594 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -39,6 +39,90 @@ fn pad_items<'a>( all_chained } +struct GramCombinations { + // Params + min_i: usize, + max_i: usize, + n: usize, + + // State + position: Vec, + first: bool, + last: bool +} + +impl GramCombinations { + pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { + let min_i; + if fix_0 { + min_i = 1; + } else { + min_i = 0; + } + + if max_i+1 < n { + return Err("`max_i` must be greater than or equal to `n-1`"); + } + + let position: Vec = (0..n).collect(); + + let mut last = false; + if n+1 == max_i { + last = true; + } + + Ok(GramCombinations { + min_i, + max_i, + n, + position, + first: true, + last + }) + } + + pub fn new_empty() -> GramCombinations { + GramCombinations { + min_i: 0, + max_i: 0, + n: 0, + position: Vec::new(), + first: false, + last: false + } + } +} + +impl Iterator for GramCombinations { + type Item = Vec; + + fn next(& mut self) -> Option { + if self.first { + self.first = false; + return Some(self.position.clone()); + } + if self.last { + return None + } + + for i in (self.min_i..self.position.len()).rev() { + let e = self.position[i]; + if e < self.max_i-(self.n-i-1) { + let mut e_1 = e; + for j in i..self.position.len() { + e_1 += 1; + self.position[j] = e_1; + } + if i == self.min_i && e+1 == self.max_i { + self.last = true; + } + return Some(self.position.clone()); + } + } + None // Will never reach + } +} + struct SkipVecIter { prev: Vec, n: usize, @@ -107,7 +191,6 @@ struct KSkipNGramsIter<'a> { items: Box + 'a>, min_n: usize, max_n: usize, - min_k: usize, max_k: usize, pad_left: Option<&'a str>, pad_right: Option<&'a str>, @@ -116,9 +199,8 @@ struct KSkipNGramsIter<'a> { window: VecDeque<&'a str>, window_end: VecDeque<&'a str>, n: usize, // length outputted last - k: usize, - p: usize, - combinations: Peekable, + p: usize, // Amount of padding + combinations: Peekable, mode: IterMode, first: bool, } @@ -222,19 +304,20 @@ impl<'a> KSkipNGramsIter<'a> { // let grams = self.construct_grams_vec(slice_idx); // return Some(grams); + // let mut slice_idx: Vec = Vec::with_capacity(self.n); + // let mut i = 0; + // let spacing = self.sample_iter.next().unwrap(); + // + // for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone + // if j == 0 { + // i += e; + // } else { + // i += e+1; + // } + // slice_idx.push(i); + // } - let mut slice_idx: Vec = Vec::with_capacity(self.n); - let mut i = 0; - let spacing = self.combinations.next().unwrap(); - - for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone - if j == 0 { - i += e; - } else { - i += e+1; - } - slice_idx.push(i); - } + let mut slice_idx: Vec = self.combinations.next().unwrap(); let grams = self.construct_grams_vec(slice_idx); return Some(grams); } @@ -251,20 +334,28 @@ impl<'a> KSkipNGramsIter<'a> { // return Some(grams); - let mut slice_idx: Vec = Vec::with_capacity(self.n); + // let mut slice_idx: Vec = Vec::with_capacity(self.n); + // + // let spacing = self.sample_iter.next().unwrap(); + // let end_idx = self.window.len()-1; + // let mut i = end_idx; + // for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone + // if j == 0 { + // i -= e; + // } else { + // i -= e+1; + // } + // slice_idx.push(i); + // } + // + // slice_idx.reverse(); - let spacing = self.combinations.next().unwrap(); - let end_idx = self.window.len()-1; - let mut i = end_idx; - for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone - if j == 0 { - i -= e; - } else { - i -= e+1; - } - slice_idx.push(i); - } + let mut slice_idx: Vec = self.combinations.next().unwrap(); + // Reverse index + for i in 0..slice_idx.len() { + slice_idx[i] = self.window.len() - 1 - slice_idx[i]; + } slice_idx.reverse(); let grams = self.construct_grams_vec(slice_idx); @@ -280,19 +371,8 @@ impl<'a> KSkipNGramsIter<'a> { return self.next_gram_main(); } - // Get slice - //let slice_idx = (0..self.window.len()).step_by(self.k + 1).take(self.n); - - let mut slice_idx: Vec = Vec::with_capacity(self.n); - slice_idx.push(0); - let mut i = 0; - let spacing = self.combinations.next().unwrap(); - - for e in spacing.clone() { // TODO remove clone - i += e+1; - slice_idx.push(i); - } - let grams = self.construct_grams_vec(slice_idx); + let grams_idx = self.combinations.next().unwrap(); + let grams = self.construct_grams_vec(grams_idx); return Some(grams); } @@ -300,18 +380,6 @@ impl<'a> KSkipNGramsIter<'a> { let finished = self.next_state_pad_main(); if finished.is_none() { - // if self.window.len() >= 4 { - // self.pop_window()?; - // self.first = true; - // return self.next_gram_main_end(); - // } else if self.window.len() == 1 { - // return None - // } else { - // let grams = Vec::from(self.window.clone()); - // self.pop_window(); - // return Some(grams) - // } - return if self.window.len() > self.min_n { self.pop_window()?; self.first = true; @@ -322,24 +390,8 @@ impl<'a> KSkipNGramsIter<'a> { } - // Get slice - // let slice_idx = (0..self.window.len()).step_by(self.k + 1).take(self.n); - // let grams = self.construct_grams_vec(slice_idx); - - // return if grams.len() == self.n { // `take` takes n or less - // Some(grams) - // } else { - // None - // } - - let mut slice_idx: Vec = Vec::with_capacity(self.n); - slice_idx.push(0); - let mut i = 0; - for e in self.combinations.next().unwrap() { - i += e+1; - slice_idx.push(i); - } - let grams = self.construct_grams_vec(slice_idx); + let grams_idx = self.combinations.next().unwrap(); + let grams = self.construct_grams_vec(grams_idx); return if grams.len() == self.n { // TODO: why? Some(grams) @@ -385,9 +437,11 @@ impl<'a> KSkipNGramsIter<'a> { //self.k = self.min_k; self.p = self.n - 1; - let pick_n = self.n - self.p; - let skip_total = self.max_k; - self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); + + // let pick_n = self.n - self.p; + // let skip_total = self.max_k; + // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); self.first = false; Some(()) @@ -396,9 +450,11 @@ impl<'a> KSkipNGramsIter<'a> { } else if self.p > 1 { self.p -= 1; - let pick_n = self.n - self.p; - let skip_total = self.max_k; - self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); + + // let pick_n = self.n - self.p; + // let skip_total = self.max_k; + // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); Some(()) } else if self.n < self.max_n { @@ -406,9 +462,11 @@ impl<'a> KSkipNGramsIter<'a> { //self.k = self.min_k; self.p = self.n - 1; - let pick_n = self.n - self.p; - let skip_total = self.max_k; - self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); + + // let pick_n = self.n - self.p; + // let skip_total = self.max_k; + // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); Some(()) } else { @@ -428,9 +486,11 @@ impl<'a> KSkipNGramsIter<'a> { self.p = 1; self.first = false; - let pick_n = self.n - self.p; - let skip_total = self.max_k; - self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); + + // let pick_n = self.n - self.p; + // let skip_total = self.max_k; + // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); Some(()) } else if self.combinations.peek().is_some() { @@ -438,9 +498,11 @@ impl<'a> KSkipNGramsIter<'a> { } else if self.p < self.n - 1 { self.p += 1; - let pick_n = self.n - self.p; - let skip_total = self.max_k; - self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); + + // let pick_n = self.n - self.p; + // let skip_total = self.max_k; + // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); Some(()) } else if self.n < self.max_n { @@ -448,9 +510,11 @@ impl<'a> KSkipNGramsIter<'a> { //self.k = self.min_k; self.p = 1; - let pick_n = self.n - self.p; - let skip_total = self.max_k; - self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); + + // let pick_n = self.n - self.p; + // let skip_total = self.max_k; + // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); Some(()) } else { @@ -466,9 +530,12 @@ impl<'a> KSkipNGramsIter<'a> { return if self.first { self.n = self.min_n; - let pick_n = min(self.n, self.window.len()) - 1; - let skip_total = min(self.window.len()-pick_n-1, self.max_k); - self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + let mut k= 0; + if self.window.len() > self.n { + k = min(self.max_k, self.window.len() - self.n); + } + let max_i = self.n + k - 1; + self.combinations = GramCombinations::new(true, max_i, self.n).unwrap().peekable(); self.first = false; Some(()) @@ -477,9 +544,12 @@ impl<'a> KSkipNGramsIter<'a> { } else if self.n < min(self.max_n, self.window.len()) { self.n += 1; - let pick_n = min(self.n, self.window.len()) - 1; - let skip_total = min(self.window.len()-pick_n-1, self.max_k); - self.combinations = SkipVecIter::new(pick_n, skip_total).peekable(); + let mut k= 0; + if self.window.len() > self.n { + k = min(self.max_k, self.window.len() - self.n); + } + let max_i = self.n + k - 1; + self.combinations = GramCombinations::new(true, max_i, self.n).unwrap().peekable(); Some(()) } else { @@ -487,28 +557,6 @@ impl<'a> KSkipNGramsIter<'a> { } } - // fn next_state_pad_main_end(&mut self) -> Option<()> { - // // Equivalent to a for-loop: - // // for n in self.min_n..self.max_n + 1 - // // for k in self.min_k..self.max_k + 1 - // // next_gram(n, k, p) - // return if self.first { - // self.n = self.min_n; - // self.k = self.min_k; - // self.first = false; - // Some(()) - // } else if self.k < self.max_k { - // self.k += 1; - // Some(()) - // } else if self.n < min(self.max_n, self.window.len()) { - // self.k = self.min_k; - // self.n += 1; - // Some(()) - // } else { - // None - // } - // } - fn construct_grams_vec( &mut self, slice_idx: Vec, @@ -552,7 +600,7 @@ fn build_window<'a>( max_n: usize, max_k: usize, ) -> Result, &'static str> { - let window_size = (max_n - 1) * (max_k + 1) + 1; + let window_size = max_n + max_k; let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); // Populate window @@ -561,9 +609,7 @@ fn build_window<'a>( let next_item = items.next(); match next_item { None => { - // TODO: remove result - return Ok(window); - //return Err("Items length is smaller than what is required by `max_n` and `max_k`") + return Err("Items length is smaller than `max_n`+`max_k`") } Some(s) => { window.push_back(s); @@ -578,7 +624,6 @@ fn build_k_skip_n_grams_iter<'a>( mut items: Box + 'a>, min_n: usize, max_n: usize, - min_k: usize, max_k: usize, pad_left: Option<&'a str>, pad_right: Option<&'a str>, @@ -589,8 +634,10 @@ fn build_k_skip_n_grams_iter<'a>( if min_n > max_n { return Err("`max_n` must be greater than or equal to `min_n`"); } - if min_k > max_k { - return Err("`max_k` must be greater than or equal to `min_k`"); + let mut max_k = max_k; + if max_n == 1 { + // if n == 1. k has no effect + max_k = 0; } let window = build_window(&mut items, max_n, max_k)?; @@ -600,7 +647,6 @@ fn build_k_skip_n_grams_iter<'a>( items, min_n, max_n, - min_k, max_k, pad_left, pad_right, @@ -609,9 +655,8 @@ fn build_k_skip_n_grams_iter<'a>( window, window_end: VecDeque::new(), n: 0, // length outputted last - k: 0, - combinations: SkipVecIter::new_empty().peekable(), p: 0, + combinations: GramCombinations::new_empty().peekable(), mode: IterMode::Start, first: false, })) @@ -622,7 +667,7 @@ fn bigram<'a>( pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams_iter(items, 2, 2, 0, 0, pad_left, pad_right) + build_k_skip_n_grams_iter(items, 2, 2, 0, pad_left, pad_right) } fn ngrams<'a>( @@ -631,7 +676,7 @@ fn ngrams<'a>( pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams_iter(items, n, n, 0, 0, pad_left, pad_right) + build_k_skip_n_grams_iter(items, n, n, 0, pad_left, pad_right) } fn everygrams<'a>( @@ -641,7 +686,7 @@ fn everygrams<'a>( pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams_iter(items, min_length, max_length, 0, 0, pad_left, pad_right) + build_k_skip_n_grams_iter(items, min_length, max_length, 0, pad_left, pad_right) } fn skipgrams<'a>( @@ -651,5 +696,5 @@ fn skipgrams<'a>( pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams_iter(items, n, n, 0, k, pad_left, pad_right) + build_k_skip_n_grams_iter(items, n, n, k, pad_left, pad_right) } diff --git a/src/ngram_utils/tests.rs b/src/ngram_utils/tests.rs index 44e2eaf..5fc6b6a 100644 --- a/src/ngram_utils/tests.rs +++ b/src/ngram_utils/tests.rs @@ -1,4 +1,6 @@ use crate::ngram_utils::*; +use std::collections::HashSet; +use std::iter::FromIterator; #[test] fn test_padding() { @@ -98,6 +100,10 @@ fn test_everygram() { let output_iter = everygrams(Box::new(sent), 1, 3, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); + for e in &output { + println!("{:?}", e); + } + let expected = vec![ vec!["", "Mary"], vec!["", "", "Mary"], @@ -171,12 +177,39 @@ fn test_skipgram() { assert_eq!(output, expected); } +#[test] +fn test_skipgram_everygram() { + let sent = "Mary had a little lamb".split(" "); + + // min_n=2, max_n=3, max_k=1 + let output_iter = build_k_skip_n_grams_iter( + Box::new(sent.clone()), 2, 3, 1, Some(""), Some("")).unwrap(); + let output: Vec<_> = output_iter.collect(); + let output_set: HashSet> = HashSet::from_iter(output.iter().cloned()); + + // should be equivalent to union of two skipgram outputs n=2,3 (k=1) but expect different ordering + let output_sg_2: Vec<_> = skipgrams( + Box::new(sent.clone()), 2, 1, Some(""), Some("")).unwrap().collect(); + let output_sg_2_set: HashSet> = HashSet::from_iter(output_sg_2.iter().cloned()); + + let output_sg_3: Vec<_> = skipgrams( + Box::new(sent.clone()), 3, 1,Some(""), Some("")).unwrap().collect(); + let output_sg_3_set: HashSet> = HashSet::from_iter(output_sg_3.iter().cloned()); + let expected_set: HashSet<_> = output_sg_2_set.union(&output_sg_3_set).map(move |x| x.clone()).collect(); + + // Same output - different order + assert_eq!(output_set, expected_set); + + // No duplicates from either output expected + assert_eq!(output.len(), output_sg_2.len()+output_sg_3.len()); +} + #[test] fn test_ngram_edge_cases() { let sent = "Mary had a little lamb".split(" "); let output_iter = build_k_skip_n_grams_iter( - Box::new(sent.clone()), 1, 1, 1, 1, Some(""), Some("")).unwrap(); + Box::new(sent.clone()), 1, 1, 0, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); let expected = vec![ @@ -190,7 +223,7 @@ fn test_ngram_edge_cases() { assert_eq!(output, expected); let output_iter = build_k_skip_n_grams_iter( - Box::new(sent.clone()), 1, 1, 2, 2, Some(""), Some("")).unwrap(); + Box::new(sent.clone()), 1, 1, 1, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); assert_eq!(output, expected); @@ -215,4 +248,58 @@ fn test_skip_vec_iter() { ]; assert_eq!(output, expected); +} + +#[test] +fn test_gram_combinations() { + + let output: Vec> = GramCombinations::new(false, 3, 3).unwrap().collect(); + + let expected = vec![ + vec![0, 1, 2], + vec![0, 1, 3], + vec![0, 2, 3], + vec![1, 2, 3], + ]; + assert_eq!(output, expected); + + let output: Vec> = GramCombinations::new(true, 3, 3).unwrap().collect(); + let expected = vec![ + vec![0, 1, 2], + vec![0, 1, 3], + vec![0, 2, 3], + ]; + assert_eq!(output, expected); + + // Single output + let output: Vec> = GramCombinations::new(false, 1, 2).unwrap().collect(); + let expected = vec![ + vec![0, 1], + ]; + assert_eq!(output, expected); + + let output: Vec> = GramCombinations::new(true, 1, 2).unwrap().collect(); + let expected = vec![ + vec![0, 1], + ]; + assert_eq!(output, expected); + + let output: Vec> = GramCombinations::new(true, 2, 3).unwrap().collect(); + let expected = vec![ + vec![0, 1, 2], + ]; + assert_eq!(output, expected); + + let output: Vec> = GramCombinations::new(false, 0, 1).unwrap().collect(); + let expected = vec![ + vec![0], + ]; + assert_eq!(output, expected); + + let output: Vec> = GramCombinations::new(true, 0, 1).unwrap().collect(); + let expected = vec![ + vec![0], + ]; + assert_eq!(output, expected); + } \ No newline at end of file From 4e0d95d82ab4df34d0560fa4b669373ef4b9debc Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Mon, 6 Jul 2020 11:58:16 +0100 Subject: [PATCH 05/24] Tidy code --- src/ngram_utils/mod.rs | 391 ++++++++++++++------------------------- src/ngram_utils/tests.rs | 107 ++++------- 2 files changed, 175 insertions(+), 323 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index 0a6c594..a0b74c0 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -1,7 +1,7 @@ #[cfg(test)] mod tests; -use std::cmp::{min, max}; +use std::cmp::{max, min}; use std::collections::VecDeque; use std::iter; @@ -39,7 +39,7 @@ fn pad_items<'a>( all_chained } -struct GramCombinations { +struct SampleCombinations { // Params min_i: usize, max_i: usize, @@ -48,11 +48,11 @@ struct GramCombinations { // State position: Vec, first: bool, - last: bool + last: bool, } -impl GramCombinations { - pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { +impl SampleCombinations { + pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { let min_i; if fix_0 { min_i = 1; @@ -60,60 +60,60 @@ impl GramCombinations { min_i = 0; } - if max_i+1 < n { - return Err("`max_i` must be greater than or equal to `n-1`"); + if max_i + 1 < n { + return Err("`max_i`+1 must be less than `n`"); } let position: Vec = (0..n).collect(); let mut last = false; - if n+1 == max_i { + if n + 1 == max_i { last = true; } - Ok(GramCombinations { + Ok(SampleCombinations { min_i, max_i, n, position, first: true, - last + last, }) } - pub fn new_empty() -> GramCombinations { - GramCombinations { + pub fn new_empty() -> SampleCombinations { + SampleCombinations { min_i: 0, max_i: 0, n: 0, position: Vec::new(), first: false, - last: false + last: false, } } } -impl Iterator for GramCombinations { +impl Iterator for SampleCombinations { type Item = Vec; - fn next(& mut self) -> Option { + fn next(&mut self) -> Option { if self.first { self.first = false; return Some(self.position.clone()); } if self.last { - return None + return None; } for i in (self.min_i..self.position.len()).rev() { let e = self.position[i]; - if e < self.max_i-(self.n-i-1) { + if e < self.max_i - (self.n - i - 1) { let mut e_1 = e; for j in i..self.position.len() { e_1 += 1; self.position[j] = e_1; } - if i == self.min_i && e+1 == self.max_i { + if i == self.min_i && e + 1 == self.max_i { self.last = true; } return Some(self.position.clone()); @@ -123,61 +123,6 @@ impl Iterator for GramCombinations { } } -struct SkipVecIter { - prev: Vec, - n: usize, - k: usize, - first: bool, -} - -impl SkipVecIter { - pub fn new(n: usize, k: usize) -> SkipVecIter { - SkipVecIter { - prev: vec![0; n], - n, - k, - first: true - } - } - - pub fn new_empty() -> SkipVecIter { - SkipVecIter { - prev: Vec::new(), - n: 0, - k: 0, - first: false - } - } -} - -impl Iterator for SkipVecIter { - type Item = Vec; - - fn next(&mut self) -> Option { - if self.first { - self.first = false; - return Some(self.prev.clone()); - } else { - for i in (0..self.n).rev() { - let e = self.prev[i]; - if e < self.k { - self.prev[i] += 1; - for j in i+1..self.n { - self.prev[j] = 0; - } - let sum: usize = self.prev.iter().sum(); - if sum <= self.k { - return Some(self.prev.clone()); - } else { - return self.next() - } - } - } - return None; - } - } -} - enum IterMode { Start, PadLeft, @@ -186,7 +131,7 @@ enum IterMode { PadRight, } -struct KSkipNGramsIter<'a> { +struct KSkipNGrams<'a> { // Params items: Box + 'a>, min_n: usize, @@ -197,15 +142,15 @@ struct KSkipNGramsIter<'a> { // Iterator state window: VecDeque<&'a str>, - window_end: VecDeque<&'a str>, - n: usize, // length outputted last - p: usize, // Amount of padding - combinations: Peekable, + n: usize, // length outputted last + p: usize, // Amount of padding + offset: usize, // Offset used during end window + sample_iter: Peekable, mode: IterMode, first: bool, } -impl<'a> Iterator for KSkipNGramsIter<'a> { +impl<'a> Iterator for KSkipNGrams<'a> { type Item = Vec<&'a str>; fn next(&mut self) -> Option { @@ -243,7 +188,7 @@ impl<'a> Iterator for KSkipNGramsIter<'a> { } IterMode::MainEnd => { - if (self.min_n != self.max_n || self.max_k > 0) && self.window.len() > 1 { + if (self.min_n != self.max_n || self.max_k > 0) && self.window.len() > 1 { let next = self.next_gram_main_end(); match &next { Some(_e) => next, @@ -269,7 +214,7 @@ impl<'a> Iterator for KSkipNGramsIter<'a> { } } -impl<'a> KSkipNGramsIter<'a> { +impl<'a> KSkipNGrams<'a> { // Switching between modes fn start_mode_pad_left(&mut self) { self.mode = IterMode::PadLeft; @@ -283,14 +228,11 @@ impl<'a> KSkipNGramsIter<'a> { fn start_mode_main_end(&mut self) { self.mode = IterMode::MainEnd; - self.window_end = self.window.clone(); - self.window.pop_front(); self.first = true; } fn start_mode_pad_right(&mut self) { self.mode = IterMode::PadRight; - self.window = self.window_end.clone(); self.first = true; } @@ -298,68 +240,24 @@ impl<'a> KSkipNGramsIter<'a> { fn next_gram_pad_left(&mut self) -> Option> { self.next_params_pad_left()?; - // let slice_idx = (self.k..self.window.len()) - // .step_by(self.k + 1) - // .take(self.n - self.p); - // let grams = self.construct_grams_vec(slice_idx); - // return Some(grams); - - // let mut slice_idx: Vec = Vec::with_capacity(self.n); - // let mut i = 0; - // let spacing = self.sample_iter.next().unwrap(); - // - // for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone - // if j == 0 { - // i += e; - // } else { - // i += e+1; - // } - // slice_idx.push(i); - // } - - let mut slice_idx: Vec = self.combinations.next().unwrap(); + let slice_idx: Vec = self.sample_iter.next().unwrap(); let grams = self.construct_grams_vec(slice_idx); - return Some(grams); + Some(grams) } fn next_gram_pad_right(&mut self) -> Option> { self.next_params_pad_right()?; - // let slice_idx = (0..self.window.len() - self.k) - // .rev() - // .step_by(self.k + 1) - // .take(self.n - self.p) - // .rev(); - // let grams = self.construct_grams_vec(slice_idx.collect::>()); - // return Some(grams); - - - // let mut slice_idx: Vec = Vec::with_capacity(self.n); - // - // let spacing = self.sample_iter.next().unwrap(); - // let end_idx = self.window.len()-1; - // let mut i = end_idx; - // for (j, &e) in spacing.clone().iter().enumerate() { // TODO remove clone - // if j == 0 { - // i -= e; - // } else { - // i -= e+1; - // } - // slice_idx.push(i); - // } - // - // slice_idx.reverse(); - - let mut slice_idx: Vec = self.combinations.next().unwrap(); + let mut sample_idx: Vec = self.sample_iter.next().unwrap(); // Reverse index - for i in 0..slice_idx.len() { - slice_idx[i] = self.window.len() - 1 - slice_idx[i]; + for e in sample_idx.iter_mut() { + *e = self.window.len() - 1 - *e; } - slice_idx.reverse(); + sample_idx.reverse(); - let grams = self.construct_grams_vec(slice_idx); - return Some(grams); + let grams = self.construct_grams_vec(sample_idx); + Some(grams) } fn next_gram_main(&mut self) -> Option> { @@ -371,34 +269,21 @@ impl<'a> KSkipNGramsIter<'a> { return self.next_gram_main(); } - let grams_idx = self.combinations.next().unwrap(); - let grams = self.construct_grams_vec(grams_idx); - return Some(grams); + let sample_idx = self.sample_iter.next().unwrap(); + let grams = self.construct_grams_vec(sample_idx); + Some(grams) } fn next_gram_main_end(&mut self) -> Option> { - let finished = self.next_state_pad_main(); - - if finished.is_none() { - return if self.window.len() > self.min_n { - self.pop_window()?; - self.first = true; - self.next_gram_main_end() - } else { - None - } - - } - - let grams_idx = self.combinations.next().unwrap(); - let grams = self.construct_grams_vec(grams_idx); + self.next_state_pad_main_end()?; - return if grams.len() == self.n { // TODO: why? - Some(grams) - } else { - None + let mut sample_idx = self.sample_iter.next().unwrap(); + // Offset index + for e in sample_idx.iter_mut() { + *e += self.offset; } - + let grams = self.construct_grams_vec(sample_idx); + Some(grams) } fn forward_window(&mut self) -> Option<()> { @@ -412,155 +297,165 @@ impl<'a> KSkipNGramsIter<'a> { self.window.push_back(s); Some(()) // Successfully forwarded window } - } - } - - fn pop_window(&mut self) -> Option<()> { - // Pop item from window - return if self.window.len() >= 2 { - self.window.pop_front(); - Some(()) - } else { - None }; } fn next_params_pad_left(&mut self) -> Option<()> { // Equivalent to a for-loop: - // for n in max(self.min_n, 2)..self.max_n + 1 - // -- for k in self.min_k..self.max_k + 1 - // for p in (n-1)..0 // decreasing - // for combi in combinations: - // next_gram(n, k, p) + // for n in max(self.min_n, 2)..self.max_n+1: + // for p in (n-1)..0: // decreasing + // for sample_idx in sample_iter: + // next_gram(n, p, sample_idx) return if self.first { self.n = max(self.min_n, 2); - //self.k = self.min_k; self.p = self.n - 1; - - self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); - - // let pick_n = self.n - self.p; - // let skip_total = self.max_k; - // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); + self.sample_iter = + SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) + .unwrap() + .peekable(); self.first = false; Some(()) - } else if self.combinations.peek().is_some() { + } else if self.sample_iter.peek().is_some() { Some(()) } else if self.p > 1 { self.p -= 1; - self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); - - // let pick_n = self.n - self.p; - // let skip_total = self.max_k; - // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); + self.sample_iter = + SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) + .unwrap() + .peekable(); Some(()) } else if self.n < self.max_n { self.n += 1; - //self.k = self.min_k; self.p = self.n - 1; - self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); - - // let pick_n = self.n - self.p; - // let skip_total = self.max_k; - // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); + self.sample_iter = + SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) + .unwrap() + .peekable(); Some(()) } else { None - } + }; } fn next_params_pad_right(&mut self) -> Option<()> { // Equivalent to a for-loop: - // for n in max(self.min_n, 2)..self.max_n + 1 - // for k in self.min_k..self.max_k + 1 - // for p in 1..n - // next_gram(n, k, p) + // for n in max(self.min_n, 2)..self.max_n+1: + // for p in 1..n: + // for sample_idx in sample_iter: + // next_gram(n, p, sample_idx) return if self.first { self.n = max(self.min_n, 2); - //self.k = self.min_k; self.p = 1; self.first = false; - self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); - - // let pick_n = self.n - self.p; - // let skip_total = self.max_k; - // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); + self.sample_iter = + SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) + .unwrap() + .peekable(); Some(()) - } else if self.combinations.peek().is_some() { + } else if self.sample_iter.peek().is_some() { Some(()) } else if self.p < self.n - 1 { self.p += 1; - self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); - - // let pick_n = self.n - self.p; - // let skip_total = self.max_k; - // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); + self.sample_iter = + SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) + .unwrap() + .peekable(); Some(()) } else if self.n < self.max_n { self.n += 1; - //self.k = self.min_k; self.p = 1; - self.combinations = GramCombinations::new(false, self.n+self.max_k-self.p-1, self.n-self.p).unwrap().peekable(); - - // let pick_n = self.n - self.p; - // let skip_total = self.max_k; - // self.sample_iter = SkipVecIter::new(pick_n, skip_total).peekable(); + self.sample_iter = + SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) + .unwrap() + .peekable(); Some(()) } else { None - } + }; } fn next_state_pad_main(&mut self) -> Option<()> { // Equivalent to a for-loop: - // for n in self.min_n..self.max_n + 1 - // for combi in combination(window[1:], n) - // next_gram = window[0] + combi + // for n in self.min_n..self.max_n + 1: + // for sample_idx in sample_iter: + // next_gram(n, sample_idx) return if self.first { self.n = self.min_n; - - let mut k= 0; - if self.window.len() > self.n { - k = min(self.max_k, self.window.len() - self.n); - } - let max_i = self.n + k - 1; - self.combinations = GramCombinations::new(true, max_i, self.n).unwrap().peekable(); + self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n) + .unwrap() + .peekable(); self.first = false; Some(()) - } else if self.combinations.peek().is_some() { + } else if self.sample_iter.peek().is_some() { Some(()) } else if self.n < min(self.max_n, self.window.len()) { self.n += 1; + self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n) + .unwrap() + .peekable(); - let mut k= 0; - if self.window.len() > self.n { - k = min(self.max_k, self.window.len() - self.n); - } - let max_i = self.n + k - 1; - self.combinations = GramCombinations::new(true, max_i, self.n).unwrap().peekable(); + Some(()) + } else { + None + }; + } + + fn next_state_pad_main_end(&mut self) -> Option<()> { + // Equivalent to a for-loop: + // for offset in 1..window.len()-min_n + // for n in self.min_n..self.max_n + 1: + // for sample_idx in sample_iter: + // next_gram(offset, n, sample_idx) + return if self.first { + self.n = self.min_n; + self.offset = 1; + self.reset_sample_iter_main_end(); + + self.first = false; + Some(()) + } else if self.sample_iter.peek().is_some() { + Some(()) + } else if self.n < min(self.max_n, self.window.len() - self.offset) { + self.n += 1; + self.reset_sample_iter_main_end(); + + Some(()) + } else if self.window.len() - self.offset > self.min_n { + self.offset += 1; + self.n = self.min_n; + self.reset_sample_iter_main_end(); Some(()) } else { None + }; + } + + fn reset_sample_iter_main_end(&mut self) { + let window_len = self.window.len() - self.offset; + let mut k = 0; + if window_len > self.n { + k = min(self.max_k, window_len - self.n); } + let max_i = self.n + k - 1; + self.sample_iter = SampleCombinations::new(true, max_i, self.n) + .unwrap() + .peekable(); } - fn construct_grams_vec( - &mut self, - slice_idx: Vec, - ) -> Vec<&'a str> { + fn construct_grams_vec(&mut self, slice_idx: Vec) -> Vec<&'a str> { let grams = self.vec_from_idx(slice_idx); return match self.mode { @@ -608,9 +503,7 @@ fn build_window<'a>( while i > 0 { let next_item = items.next(); match next_item { - None => { - return Err("Items length is smaller than `max_n`+`max_k`") - } + None => return Err("Items length is smaller than `max_n`+`max_k`"), Some(s) => { window.push_back(s); } @@ -620,7 +513,7 @@ fn build_window<'a>( Ok(window) } -fn build_k_skip_n_grams_iter<'a>( +fn build_k_skip_n_grams<'a>( mut items: Box + 'a>, min_n: usize, max_n: usize, @@ -642,7 +535,7 @@ fn build_k_skip_n_grams_iter<'a>( let window = build_window(&mut items, max_n, max_k)?; - Ok(Box::new(KSkipNGramsIter { + Ok(Box::new(KSkipNGrams { // Params items, min_n, @@ -653,10 +546,10 @@ fn build_k_skip_n_grams_iter<'a>( // Iterator state window, - window_end: VecDeque::new(), n: 0, // length outputted last p: 0, - combinations: GramCombinations::new_empty().peekable(), + offset: 0, + sample_iter: SampleCombinations::new_empty().peekable(), mode: IterMode::Start, first: false, })) @@ -667,7 +560,7 @@ fn bigram<'a>( pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams_iter(items, 2, 2, 0, pad_left, pad_right) + build_k_skip_n_grams(items, 2, 2, 0, pad_left, pad_right) } fn ngrams<'a>( @@ -676,7 +569,7 @@ fn ngrams<'a>( pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams_iter(items, n, n, 0, pad_left, pad_right) + build_k_skip_n_grams(items, n, n, 0, pad_left, pad_right) } fn everygrams<'a>( @@ -686,7 +579,7 @@ fn everygrams<'a>( pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams_iter(items, min_length, max_length, 0, pad_left, pad_right) + build_k_skip_n_grams(items, min_length, max_length, 0, pad_left, pad_right) } fn skipgrams<'a>( @@ -696,5 +589,5 @@ fn skipgrams<'a>( pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams_iter(items, n, n, k, pad_left, pad_right) + build_k_skip_n_grams(items, n, n, k, pad_left, pad_right) } diff --git a/src/ngram_utils/tests.rs b/src/ngram_utils/tests.rs index 5fc6b6a..a70ebf3 100644 --- a/src/ngram_utils/tests.rs +++ b/src/ngram_utils/tests.rs @@ -100,10 +100,6 @@ fn test_everygram() { let output_iter = everygrams(Box::new(sent), 1, 3, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); - for e in &output { - println!("{:?}", e); - } - let expected = vec![ vec!["", "Mary"], vec!["", "", "Mary"], @@ -182,34 +178,39 @@ fn test_skipgram_everygram() { let sent = "Mary had a little lamb".split(" "); // min_n=2, max_n=3, max_k=1 - let output_iter = build_k_skip_n_grams_iter( - Box::new(sent.clone()), 2, 3, 1, Some(""), Some("")).unwrap(); + let output_iter = + build_k_skip_n_grams(Box::new(sent.clone()), 2, 3, 1, Some(""), Some("")).unwrap(); let output: Vec<_> = output_iter.collect(); let output_set: HashSet> = HashSet::from_iter(output.iter().cloned()); // should be equivalent to union of two skipgram outputs n=2,3 (k=1) but expect different ordering - let output_sg_2: Vec<_> = skipgrams( - Box::new(sent.clone()), 2, 1, Some(""), Some("")).unwrap().collect(); + let output_sg_2: Vec<_> = skipgrams(Box::new(sent.clone()), 2, 1, Some(""), Some("")) + .unwrap() + .collect(); let output_sg_2_set: HashSet> = HashSet::from_iter(output_sg_2.iter().cloned()); - let output_sg_3: Vec<_> = skipgrams( - Box::new(sent.clone()), 3, 1,Some(""), Some("")).unwrap().collect(); + let output_sg_3: Vec<_> = skipgrams(Box::new(sent.clone()), 3, 1, Some(""), Some("")) + .unwrap() + .collect(); let output_sg_3_set: HashSet> = HashSet::from_iter(output_sg_3.iter().cloned()); - let expected_set: HashSet<_> = output_sg_2_set.union(&output_sg_3_set).map(move |x| x.clone()).collect(); + let expected_set: HashSet<_> = output_sg_2_set + .union(&output_sg_3_set) + .map(move |x| x.clone()) + .collect(); // Same output - different order assert_eq!(output_set, expected_set); // No duplicates from either output expected - assert_eq!(output.len(), output_sg_2.len()+output_sg_3.len()); + assert_eq!(output.len(), output_sg_2.len() + output_sg_3.len()); } #[test] fn test_ngram_edge_cases() { let sent = "Mary had a little lamb".split(" "); - let output_iter = build_k_skip_n_grams_iter( - Box::new(sent.clone()), 1, 1, 0, Some(""), Some("")).unwrap(); + let output_iter = + build_k_skip_n_grams(Box::new(sent.clone()), 1, 1, 0, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); let expected = vec![ @@ -222,84 +223,42 @@ fn test_ngram_edge_cases() { assert_eq!(output, expected); - let output_iter = build_k_skip_n_grams_iter( - Box::new(sent.clone()), 1, 1, 1, Some(""), Some("")).unwrap(); + let output_iter = + build_k_skip_n_grams(Box::new(sent.clone()), 1, 1, 1, Some(""), Some("")).unwrap(); let output: Vec> = output_iter.collect(); assert_eq!(output, expected); } #[test] -fn test_skip_vec_iter() { - - let output: Vec> = SkipVecIter::new(3, 2).collect(); +fn test_sample_combinations() { + let output: Vec> = SampleCombinations::new(false, 3, 3).unwrap().collect(); - let expected = vec![ - vec![0, 0, 0], - vec![0, 0, 1], - vec![0, 0, 2], - vec![0, 1, 0], - vec![0, 1, 1], - vec![0, 2, 0], - vec![1, 0, 0], - vec![1, 0, 1], - vec![1, 1, 0], - vec![2, 0, 0], - ]; + let expected = vec![vec![0, 1, 2], vec![0, 1, 3], vec![0, 2, 3], vec![1, 2, 3]]; assert_eq!(output, expected); -} - -#[test] -fn test_gram_combinations() { - - let output: Vec> = GramCombinations::new(false, 3, 3).unwrap().collect(); - - let expected = vec![ - vec![0, 1, 2], - vec![0, 1, 3], - vec![0, 2, 3], - vec![1, 2, 3], - ]; - assert_eq!(output, expected); - - let output: Vec> = GramCombinations::new(true, 3, 3).unwrap().collect(); - let expected = vec![ - vec![0, 1, 2], - vec![0, 1, 3], - vec![0, 2, 3], - ]; + let output: Vec> = SampleCombinations::new(true, 3, 3).unwrap().collect(); + let expected = vec![vec![0, 1, 2], vec![0, 1, 3], vec![0, 2, 3]]; assert_eq!(output, expected); // Single output - let output: Vec> = GramCombinations::new(false, 1, 2).unwrap().collect(); - let expected = vec![ - vec![0, 1], - ]; + let output: Vec> = SampleCombinations::new(false, 1, 2).unwrap().collect(); + let expected = vec![vec![0, 1]]; assert_eq!(output, expected); - let output: Vec> = GramCombinations::new(true, 1, 2).unwrap().collect(); - let expected = vec![ - vec![0, 1], - ]; + let output: Vec> = SampleCombinations::new(true, 1, 2).unwrap().collect(); + let expected = vec![vec![0, 1]]; assert_eq!(output, expected); - let output: Vec> = GramCombinations::new(true, 2, 3).unwrap().collect(); - let expected = vec![ - vec![0, 1, 2], - ]; + let output: Vec> = SampleCombinations::new(true, 2, 3).unwrap().collect(); + let expected = vec![vec![0, 1, 2]]; assert_eq!(output, expected); - let output: Vec> = GramCombinations::new(false, 0, 1).unwrap().collect(); - let expected = vec![ - vec![0], - ]; + let output: Vec> = SampleCombinations::new(false, 0, 1).unwrap().collect(); + let expected = vec![vec![0]]; assert_eq!(output, expected); - let output: Vec> = GramCombinations::new(true, 0, 1).unwrap().collect(); - let expected = vec![ - vec![0], - ]; + let output: Vec> = SampleCombinations::new(true, 0, 1).unwrap().collect(); + let expected = vec![vec![0]]; assert_eq!(output, expected); - -} \ No newline at end of file +} From a37592fea7ecdf34cd497a3c7b6451ec4d4db02a Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Mon, 6 Jul 2020 16:43:26 +0100 Subject: [PATCH 06/24] Refactored and documented --- src/ngram_utils/mod.rs | 574 +++++++++++++++++++++++++-------------- src/ngram_utils/tests.rs | 193 +++++++++---- 2 files changed, 500 insertions(+), 267 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index a0b74c0..f37226e 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -4,134 +4,182 @@ mod tests; use std::cmp::{max, min}; use std::collections::VecDeque; use std::iter; - use std::iter::Peekable; -fn pad_items<'a>( - items: Box + 'a>, - n: usize, - pad_left: Option<&'a str>, - pad_right: Option<&'a str>, -) -> Box + 'a> { - let left_chained: Box>; - let all_chained: Box>; - - match pad_left { - Some(s) => { - let pad_left_iter = iter::repeat(s).take(n - 1); - left_chained = Box::new(pad_left_iter.chain(items)); - } - None => { - left_chained = items; +#[cfg(feature = "python")] +use dict_derive::{FromPyObject, IntoPyObject}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))] +pub struct KSkipNGramsParams { + pub min_n: usize, + pub max_n: usize, + pub max_k: usize, +} + +impl KSkipNGramsParams { + pub fn new(min_n: usize, max_n: usize, max_k: usize) -> KSkipNGramsParams { + KSkipNGramsParams { + min_n, + max_n, + max_k, } } - match pad_right { - Some(s) => { - let pad_right_iter = iter::repeat(s).take(n - 1); - all_chained = Box::new(left_chained.chain(pad_right_iter)); - } - None => { - all_chained = left_chained; + pub fn build(&mut self) -> KSkipNGrams { + KSkipNGrams { + params: self.clone(), } } - - all_chained } -struct SampleCombinations { - // Params - min_i: usize, - max_i: usize, - n: usize, - - // State - position: Vec, - first: bool, - last: bool, +/// Transforms a given sequence of `items` into k-skip-n-grams iterator. +/// +/// Use convenience methods for common use cases: `new_bigram`, `new_trigram`, `new_ngrams`, +/// `new_everygrams`, `new_skipgrams`. Otherwise build new using `new`. +pub struct KSkipNGrams { + pub params: KSkipNGramsParams, } -impl SampleCombinations { - pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { - let min_i; - if fix_0 { - min_i = 1; - } else { - min_i = 0; - } - - if max_i + 1 < n { - return Err("`max_i`+1 must be less than `n`"); - } - - let position: Vec = (0..n).collect(); - - let mut last = false; - if n + 1 == max_i { - last = true; - } +/// Core methods to build `KSkipNGrams` +impl KSkipNGrams { + /// Generate all bigrams from a sequence of `items`, an iterator. + /// + /// Example: + /// + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three Four".split(" "); + /// let gramizer = KSkipNGrams::new_bigram(); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two"], vec!["Two", "Three"], vec!["Three", "Four"]]); + /// ``` + pub fn new_bigram() -> KSkipNGrams { + KSkipNGramsParams::new(2, 2, 0).build() + } - Ok(SampleCombinations { - min_i, - max_i, - n, - position, - first: true, - last, - }) + /// Generate all trigrams from a sequence of `items`, an iterator. + /// + /// Example: + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three Four".split(" "); + /// let gramizer = KSkipNGrams::new_trigram(); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two", "Three"], vec!["Two", "Three", "Four"]]); + /// ``` + pub fn new_trigram() -> KSkipNGrams { + KSkipNGramsParams::new(3, 3, 0).build() } - pub fn new_empty() -> SampleCombinations { - SampleCombinations { - min_i: 0, - max_i: 0, - n: 0, - position: Vec::new(), - first: false, - last: false, - } + /// Generate all ngrams from a sequence of `items`, an iterator. + /// + /// Example: + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three Four".split(" "); + /// let gramizer = KSkipNGrams::new_ngrams(3); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two", "Three"], vec!["Two", "Three", "Four"]]); + /// ``` + /// + /// Paramaters: + /// * `n` - The degree of the ngrams + pub fn new_ngrams(n: usize) -> KSkipNGrams { + KSkipNGramsParams::new(n, n, 0).build() } -} -impl Iterator for SampleCombinations { - type Item = Vec; + /// Generate all ngrams between `min_n` and `max_n` from a sequence of `items`, an iterator. + /// + /// Example: + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three".split(" "); + /// let gramizer = KSkipNGrams::new_everygrams(1, 3); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![ + /// vec!["One"], vec!["One", "Two"], vec!["One", "Two", "Three"], vec!["Two"], + /// vec!["Two", "Three"], vec!["Three"]]); + /// ``` + /// + /// Paramaters: + /// * `min_n` - The minimum degree of the ngram + /// * `max_n` - The maximum degree of the ngram + pub fn new_everygrams(min_n: usize, max_n: usize) -> KSkipNGrams { + KSkipNGramsParams::new(min_n, max_n, 0).build() + } - fn next(&mut self) -> Option { - if self.first { - self.first = false; - return Some(self.position.clone()); - } - if self.last { - return None; - } + /// Generate all skip-grams with a max total skip of `k` from a sequence of `items`, + /// an iterator. + /// + /// Example: + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three Four Five".split(" "); + /// let gramizer = KSkipNGrams::new_skipgrams(3, 2); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two", "Three"], vec!["One", "Two", "Four"], + /// vec!["One", "Two", "Five"], vec!["One", "Three", "Four"], vec!["One", "Three", "Five"], + /// vec!["One", "Four", "Five"], vec!["Two", "Three", "Four"], vec!["Two", "Three", "Five"], + /// vec!["Two", "Four", "Five"], vec!["Three", "Four", "Five"]]); + /// ``` + /// + /// Paramaters: + /// * `n` - The degree of the ngram + /// * `k` - The degree of the skipgram: the total max skip between items + pub fn new_skipgrams(n: usize, k: usize) -> KSkipNGrams { + KSkipNGramsParams::new(n, n, k).build() + } - for i in (self.min_i..self.position.len()).rev() { - let e = self.position[i]; - if e < self.max_i - (self.n - i - 1) { - let mut e_1 = e; - for j in i..self.position.len() { - e_1 += 1; - self.position[j] = e_1; - } - if i == self.min_i && e + 1 == self.max_i { - self.last = true; - } - return Some(self.position.clone()); - } - } - None // Will never reach + /// Generate all k-skip-n-grams from a sequence of `items`, an iterator. + /// + /// Example: + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three Four".split(" "); + /// let gramizer = KSkipNGrams::new(2, 3, 1); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two"], vec!["One", "Three"], vec!["One", "Two", "Three"], + /// vec!["One", "Two", "Four"], vec!["One", "Three", "Four"], vec!["Two", "Three"], + /// vec!["Two", "Four"], vec!["Two", "Three", "Four"], vec!["Three", "Four"]]); + /// ``` + /// + /// Paramaters: + /// * `min_n` - The minimum degree of the ngram + /// * `max_n` - The maximum degree of the ngram + /// * `k` - The degree of the skipgram: the total max skip between items + pub fn new(min_n: usize, max_n: usize, max_k: usize) -> KSkipNGrams { + KSkipNGramsParams::new(min_n, max_n, max_k).build() } -} -enum IterMode { - Start, - PadLeft, - Main, - MainEnd, - PadRight, + /// Transform a sequence of `items`, an iterator to a `KSkipNGramsIter` iterator. + /// + /// Parameters: + /// * `items` - Input iterator + /// * `pad_left` - Optional string to use as left padding + /// * `pad_right` - Optional string to use as right padding + pub fn transform<'a>( + &'a self, + items: Box + 'a>, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result> + 'a>, InputError> { + let k_skip_n_grams_iter = KSkipNGramsIter::new( + items, + self.params.min_n, + self.params.max_n, + self.params.max_k, + pad_left, + pad_right, + )?; + Ok(Box::new(k_skip_n_grams_iter)) + } } -struct KSkipNGrams<'a> { +/// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. +/// The iterator consumes the input `items` only once. +pub struct KSkipNGramsIter<'a> { // Params items: Box + 'a>, min_n: usize, @@ -142,17 +190,119 @@ struct KSkipNGrams<'a> { // Iterator state window: VecDeque<&'a str>, - n: usize, // length outputted last - p: usize, // Amount of padding - offset: usize, // Offset used during end window + /// Window which holds items that have been consumed + n: usize, + /// Gram length that was yielded last + p: usize, + /// Amount of padding included in item yielded last + offset: usize, + /// Offset used during MainEnd mode sample_iter: Peekable, + /// k-skip combinations of current window mode: IterMode, + /// Current mode of iterator first: bool, } -impl<'a> Iterator for KSkipNGrams<'a> { +/// Core methods to build `KSkipNGramsIter` +impl<'a> KSkipNGramsIter<'a> { + /// Build a new `KSkipNGramsIter`. + /// + /// Example: + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three".split(" "); + /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); + /// let grams: Vec> = grams_iter.unwrap().collect(); + /// ``` + /// + /// Parameters: + /// * `items` - Input iterator + /// * `min_n` - The minimum degree of the ngram + /// * `max_n` - The maximum degree of the ngram + /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items + /// * `pad_left` - Optional string to use as left padding + /// * `pad_right` - Optional string to use as right padding + pub fn new( + mut items: Box + 'a>, + min_n: usize, + max_n: usize, + max_k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result, InputError> { + if min_n < 1 { + return Err(InputError( + "`min_n` must be greater than or equal to 1".to_string(), + )); + } + if min_n > max_n { + return Err(InputError( + "`max_n` must be greater than or equal to `min_n`".to_string(), + )); + } + let mut max_k = max_k; + if max_n == 1 { + max_k = 0; // if n == 1. k has no effect + } + + let window = Self::build_window(&mut items, max_n, max_k)?; + + Ok(KSkipNGramsIter { + // Params + items, + min_n, + max_n, + max_k, + pad_left, + pad_right, + + // Iterator state + window, + n: 0, + p: 0, + offset: 0, + sample_iter: SampleCombinations::new_empty().peekable(), + mode: IterMode::Start, + first: false, + }) + } + + // Prepare and populate start window + fn build_window( + items: &mut Box + 'a>, + max_n: usize, + max_k: usize, + ) -> Result, InputError> { + let window_size = max_n + max_k; + let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); + + // Populate window + let mut i = window_size; + while i > 0 { + let next_item = items.next(); + match next_item { + None => { + return Err(InputError( + "Items length is smaller than `max_n`+`max_k`".to_string(), + )) + } + Some(s) => { + window.push_back(s); + } + } + i -= 1; + } + Ok(window) + } +} + +/// Iterator functions +impl<'a> Iterator for KSkipNGramsIter<'a> { type Item = Vec<&'a str>; + // Next item. Depending on current mode obtain next item. + // If current mode has been exhausted then switch to next fn next(&mut self) -> Option { return match &self.mode { IterMode::Start => { @@ -214,7 +364,8 @@ impl<'a> Iterator for KSkipNGrams<'a> { } } -impl<'a> KSkipNGrams<'a> { +/// Internal functions +impl<'a> KSkipNGramsIter<'a> { // Switching between modes fn start_mode_pad_left(&mut self) { self.mode = IterMode::PadLeft; @@ -236,7 +387,7 @@ impl<'a> KSkipNGrams<'a> { self.first = true; } - // Next gram + // Obtain next gram for PadLeft mode fn next_gram_pad_left(&mut self) -> Option> { self.next_params_pad_left()?; @@ -245,12 +396,13 @@ impl<'a> KSkipNGrams<'a> { Some(grams) } + // Obtain next gram for PadRight mode fn next_gram_pad_right(&mut self) -> Option> { self.next_params_pad_right()?; let mut sample_idx: Vec = self.sample_iter.next().unwrap(); - // Reverse index + // Mirror index for e in sample_idx.iter_mut() { *e = self.window.len() - 1 - *e; } @@ -260,6 +412,7 @@ impl<'a> KSkipNGrams<'a> { Some(grams) } + // Obtain next gram for Main mode fn next_gram_main(&mut self) -> Option> { let finished = self.next_state_pad_main(); @@ -274,6 +427,7 @@ impl<'a> KSkipNGrams<'a> { Some(grams) } + // Obtain next gram for MainEnd mode fn next_gram_main_end(&mut self) -> Option> { self.next_state_pad_main_end()?; @@ -286,6 +440,7 @@ impl<'a> KSkipNGrams<'a> { Some(grams) } + // Forward the window by one step fn forward_window(&mut self) -> Option<()> { // Need to forward window when yielded ngram of max-length and max-skip-size let next_item = self.items.next(); @@ -300,6 +455,7 @@ impl<'a> KSkipNGrams<'a> { }; } + // Increment parameters and sample iterator fn next_params_pad_left(&mut self) -> Option<()> { // Equivalent to a for-loop: // for n in max(self.min_n, 2)..self.max_n+1: @@ -342,6 +498,7 @@ impl<'a> KSkipNGrams<'a> { }; } + // Increment parameters and sample iterator fn next_params_pad_right(&mut self) -> Option<()> { // Equivalent to a for-loop: // for n in max(self.min_n, 2)..self.max_n+1: @@ -385,6 +542,7 @@ impl<'a> KSkipNGrams<'a> { }; } + // Increment parameters and sample iterator for each window fn next_state_pad_main(&mut self) -> Option<()> { // Equivalent to a for-loop: // for n in self.min_n..self.max_n + 1: @@ -412,6 +570,7 @@ impl<'a> KSkipNGrams<'a> { }; } + // Increment parameters and sample iterator for each window fn next_state_pad_main_end(&mut self) -> Option<()> { // Equivalent to a for-loop: // for offset in 1..window.len()-min_n @@ -455,8 +614,9 @@ impl<'a> KSkipNGrams<'a> { .peekable(); } - fn construct_grams_vec(&mut self, slice_idx: Vec) -> Vec<&'a str> { - let grams = self.vec_from_idx(slice_idx); + // Create output vec from sample index and add padding if necessary + fn construct_grams_vec(&mut self, sample_idx: Vec) -> Vec<&'a str> { + let grams = self.vec_from_idx(sample_idx); return match self.mode { IterMode::PadLeft => { @@ -481,113 +641,109 @@ impl<'a> KSkipNGrams<'a> { }; } - fn vec_from_idx(&mut self, slice_idx: Vec) -> Vec<&'a str> { - let mut grams = Vec::with_capacity(slice_idx.len()); - for idx in slice_idx.iter() { + // Create output vec from sample index + fn vec_from_idx(&mut self, sample_idx: Vec) -> Vec<&'a str> { + let mut grams = Vec::with_capacity(sample_idx.len()); + for idx in sample_idx.iter() { grams.push(self.window[*idx].clone()); } grams } } -fn build_window<'a>( - items: &mut Box + 'a>, - max_n: usize, - max_k: usize, -) -> Result, &'static str> { - let window_size = max_n + max_k; - let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); - - // Populate window - let mut i = window_size; - while i > 0 { - let next_item = items.next(); - match next_item { - None => return Err("Items length is smaller than `max_n`+`max_k`"), - Some(s) => { - window.push_back(s); - } - } - i -= 1; - } - Ok(window) +/// Error given when input is inconsistent +#[derive(Debug, Clone)] +pub struct InputError(String); + +/// Represents the different modes of `KSkipNGramsIter` +enum IterMode { + Start, + PadLeft, + Main, + MainEnd, + PadRight, } -fn build_k_skip_n_grams<'a>( - mut items: Box + 'a>, - min_n: usize, - max_n: usize, - max_k: usize, - pad_left: Option<&'a str>, - pad_right: Option<&'a str>, -) -> Result> + 'a>, &'a str> { - if min_n < 1 { - return Err("`min_n` must be greater than or equal to 1"); - } - if min_n > max_n { - return Err("`max_n` must be greater than or equal to `min_n`"); - } - let mut max_k = max_k; - if max_n == 1 { - // if n == 1. k has no effect - max_k = 0; - } +pub struct SampleCombinations { + // Params + min_i: usize, + max_i: usize, + n: usize, - let window = build_window(&mut items, max_n, max_k)?; - - Ok(Box::new(KSkipNGrams { - // Params - items, - min_n, - max_n, - max_k, - pad_left, - pad_right, - - // Iterator state - window, - n: 0, // length outputted last - p: 0, - offset: 0, - sample_iter: SampleCombinations::new_empty().peekable(), - mode: IterMode::Start, - first: false, - })) + // State + position: Vec, + first: bool, + last: bool, } -fn bigram<'a>( - items: Box + 'a>, - pad_left: Option<&'a str>, - pad_right: Option<&'a str>, -) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams(items, 2, 2, 0, pad_left, pad_right) -} +impl SampleCombinations { + pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { + let min_i; + if fix_0 { + min_i = 1; + } else { + min_i = 0; + } -fn ngrams<'a>( - items: Box + 'a>, - n: usize, - pad_left: Option<&'a str>, - pad_right: Option<&'a str>, -) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams(items, n, n, 0, pad_left, pad_right) -} + if max_i + 1 < n { + return Err("`max_i`+1 must be less than `n`"); + } -fn everygrams<'a>( - items: Box + 'a>, - min_length: usize, - max_length: usize, - pad_left: Option<&'a str>, - pad_right: Option<&'a str>, -) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams(items, min_length, max_length, 0, pad_left, pad_right) + let position: Vec = (0..n).collect(); + + let mut last = false; + if n == max_i + 1 { + last = true; + } + + Ok(SampleCombinations { + min_i, + max_i, + n, + position, + first: true, + last, + }) + } + + pub fn new_empty() -> SampleCombinations { + SampleCombinations { + min_i: 0, + max_i: 0, + n: 0, + position: Vec::new(), + first: false, + last: false, + } + } } -fn skipgrams<'a>( - items: Box + 'a>, - n: usize, - k: usize, - pad_left: Option<&'a str>, - pad_right: Option<&'a str>, -) -> Result> + 'a>, &'a str> { - build_k_skip_n_grams(items, n, n, k, pad_left, pad_right) +impl Iterator for SampleCombinations { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.first { + self.first = false; + return Some(self.position.clone()); + } + if self.last { + return None; + } + + for i in (self.min_i..self.position.len()).rev() { + let e = self.position[i]; + if e < self.max_i - (self.n - i - 1) { + let mut e_1 = e; + for j in i..self.position.len() { + e_1 += 1; + self.position[j] = e_1; + } + if i == self.min_i && e + 1 == self.max_i { + self.last = true; + } + return Some(self.position.clone()); + } + } + None // Will never reach + } } diff --git a/src/ngram_utils/tests.rs b/src/ngram_utils/tests.rs index a70ebf3..ff4a4e9 100644 --- a/src/ngram_utils/tests.rs +++ b/src/ngram_utils/tests.rs @@ -2,32 +2,15 @@ use crate::ngram_utils::*; use std::collections::HashSet; use std::iter::FromIterator; -#[test] -fn test_padding() { - let sent = "Mary had a little lamb".split(" "); - - let output: Vec<&str> = - pad_items(Box::new(sent.clone()), 3, Some(""), Some("")).collect(); - let expected = vec![ - "", "", "Mary", "had", "a", "little", "lamb", "", "", - ]; - assert_eq!(output, expected); - - let output: Vec<&str> = pad_items(Box::new(sent.clone()), 2, Some(""), None).collect(); - let expected = vec!["", "Mary", "had", "a", "little", "lamb"]; - assert_eq!(output, expected); - - let output: Vec<&str> = pad_items(Box::new(sent.clone()), 2, None, Some("")).collect(); - let expected = vec!["Mary", "had", "a", "little", "lamb", ""]; - assert_eq!(output, expected); -} - #[test] fn test_bigram() { let sent = "Mary had a little lamb".split(" "); - let output_iter = bigram(Box::new(sent), None, None).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new_bigram(); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), None, None) + .unwrap() + .collect(); let expected = vec![ vec!["Mary", "had"], @@ -36,15 +19,18 @@ fn test_bigram() { vec!["little", "lamb"], ]; - assert_eq!(output, expected); + assert_eq!(grams, expected); } #[test] fn test_trigram() { let sent = "Mary had a little lamb".split(" "); - let output_iter = ngrams(Box::new(sent.clone()), 3, Some(""), Some("")).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new_trigram(); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); let expected = vec![ vec!["", "", "Mary"], @@ -56,10 +42,13 @@ fn test_trigram() { vec!["lamb", "", ""], ]; - assert_eq!(output, expected); + assert_eq!(grams, expected); - let output_iter = ngrams(Box::new(sent.clone()), 3, None, Some("")).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new_trigram(); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), None, Some("")) + .unwrap() + .collect(); let expected = vec![ vec!["Mary", "had", "a"], @@ -69,15 +58,18 @@ fn test_trigram() { vec!["lamb", "", ""], ]; - assert_eq!(output, expected); + assert_eq!(grams, expected); } #[test] fn test_ngrams() { let sent = "Mary had a little lamb".split(" "); - let output_iter = ngrams(Box::new(sent), 4, Some(""), Some("")).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new_ngrams(4); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); let expected = vec![ vec!["", "", "", "Mary"], @@ -90,15 +82,18 @@ fn test_ngrams() { vec!["lamb", "", "", ""], ]; - assert_eq!(output, expected); + assert_eq!(grams, expected); } #[test] fn test_everygram() { let sent = "Mary had a little lamb".split(" "); - let output_iter = everygrams(Box::new(sent), 1, 3, Some(""), Some("")).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new_everygrams(1, 3); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); let expected = vec![ vec!["", "Mary"], @@ -121,15 +116,18 @@ fn test_everygram() { vec!["lamb", "", ""], ]; - assert_eq!(output, expected); + assert_eq!(grams, expected); } #[test] fn test_skipgram() { let sent = "Mary had a little lamb".split(" "); - let output_iter = skipgrams(Box::new(sent.clone()), 2, 1, Some(""), Some("")).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new_skipgrams(2, 1); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); let expected = vec![ vec!["", "Mary"], @@ -145,10 +143,13 @@ fn test_skipgram() { vec!["little", ""], ]; - assert_eq!(output, expected); + assert_eq!(grams, expected); - let output_iter = skipgrams(Box::new(sent.clone()), 3, 1, Some(""), Some("")).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new_skipgrams(3, 1); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); let expected = vec![ vec!["", "", "Mary"], @@ -170,48 +171,111 @@ fn test_skipgram() { vec!["little", "", ""], ]; - assert_eq!(output, expected); + assert_eq!(grams, expected); + + let sent = "Mary had a little lamb, whose fleece ...".split(" "); + + let gramizer = KSkipNGrams::new_skipgrams(3, 2); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), None, None) + .unwrap() + .collect(); + + let expected = vec![ + vec!["Mary", "had", "a"], + vec!["Mary", "had", "little"], + vec!["Mary", "had", "lamb,"], + vec!["Mary", "a", "little"], + vec!["Mary", "a", "lamb,"], + vec!["Mary", "little", "lamb,"], + vec!["had", "a", "little"], + vec!["had", "a", "lamb,"], + vec!["had", "a", "whose"], + vec!["had", "little", "lamb,"], + vec!["had", "little", "whose"], + vec!["had", "lamb,", "whose"], + vec!["a", "little", "lamb,"], + vec!["a", "little", "whose"], + vec!["a", "little", "fleece"], + vec!["a", "lamb,", "whose"], + vec!["a", "lamb,", "fleece"], + vec!["a", "whose", "fleece"], + vec!["little", "lamb,", "whose"], + vec!["little", "lamb,", "fleece"], + vec!["little", "lamb,", "..."], + vec!["little", "whose", "fleece"], + vec!["little", "whose", "..."], + vec!["little", "fleece", "..."], + vec!["lamb,", "whose", "fleece"], + vec!["lamb,", "whose", "..."], + vec!["lamb,", "fleece", "..."], + vec!["whose", "fleece", "..."], + ]; + + assert_eq!(grams, expected); } #[test] fn test_skipgram_everygram() { - let sent = "Mary had a little lamb".split(" "); + let sent = "Mary had a little lamb, whose fleece ...".split(" "); - // min_n=2, max_n=3, max_k=1 - let output_iter = - build_k_skip_n_grams(Box::new(sent.clone()), 2, 3, 1, Some(""), Some("")).unwrap(); - let output: Vec<_> = output_iter.collect(); + // min_n=2, max_n=4, max_k=3 + let gramizer = KSkipNGrams::new(2, 4, 3); + let output: Vec<_> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); let output_set: HashSet> = HashSet::from_iter(output.iter().cloned()); - // should be equivalent to union of two skipgram outputs n=2,3 (k=1) but expect different ordering - let output_sg_2: Vec<_> = skipgrams(Box::new(sent.clone()), 2, 1, Some(""), Some("")) + // Equivalent to union of three skip-gram outputs n=2,3,4 (k=3) but with different ordering + let gramizer_sg_2 = KSkipNGrams::new_skipgrams(2, 3); + let output_sg_2: Vec<_> = gramizer_sg_2 + .transform(Box::new(sent.clone()), Some(""), Some("")) .unwrap() .collect(); let output_sg_2_set: HashSet> = HashSet::from_iter(output_sg_2.iter().cloned()); - let output_sg_3: Vec<_> = skipgrams(Box::new(sent.clone()), 3, 1, Some(""), Some("")) + let gramizer_sg_3 = KSkipNGrams::new_skipgrams(3, 3); + let output_sg_3: Vec<_> = gramizer_sg_3 + .transform(Box::new(sent.clone()), Some(""), Some("")) .unwrap() .collect(); let output_sg_3_set: HashSet> = HashSet::from_iter(output_sg_3.iter().cloned()); + + let gramizer_sg_4 = KSkipNGrams::new_skipgrams(4, 3); + let output_sg_4: Vec<_> = gramizer_sg_4 + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + let output_sg_4_set: HashSet> = HashSet::from_iter(output_sg_4.iter().cloned()); + let expected_set: HashSet<_> = output_sg_2_set .union(&output_sg_3_set) .map(move |x| x.clone()) + .collect::>() + .union(&output_sg_4_set) + .map(move |x| x.clone()) .collect(); // Same output - different order assert_eq!(output_set, expected_set); // No duplicates from either output expected - assert_eq!(output.len(), output_sg_2.len() + output_sg_3.len()); + assert_eq!( + output.len(), + output_sg_2.len() + output_sg_3.len() + output_sg_4.len() + ); } #[test] fn test_ngram_edge_cases() { let sent = "Mary had a little lamb".split(" "); - let output_iter = - build_k_skip_n_grams(Box::new(sent.clone()), 1, 1, 0, Some(""), Some("")).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new(1, 1, 0); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); let expected = vec![ vec!["Mary"], @@ -221,13 +285,15 @@ fn test_ngram_edge_cases() { vec!["lamb"], ]; - assert_eq!(output, expected); + assert_eq!(grams, expected); - let output_iter = - build_k_skip_n_grams(Box::new(sent.clone()), 1, 1, 1, Some(""), Some("")).unwrap(); - let output: Vec> = output_iter.collect(); + let gramizer = KSkipNGrams::new(1, 1, 1); + let grarms: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); - assert_eq!(output, expected); + assert_eq!(grarms, expected); } #[test] @@ -241,6 +307,17 @@ fn test_sample_combinations() { let expected = vec![vec![0, 1, 2], vec![0, 1, 3], vec![0, 2, 3]]; assert_eq!(output, expected); + let output: Vec> = SampleCombinations::new(true, 4, 3).unwrap().collect(); + let expected = vec![ + vec![0, 1, 2], + vec![0, 1, 3], + vec![0, 1, 4], + vec![0, 2, 3], + vec![0, 2, 4], + vec![0, 3, 4], + ]; + assert_eq!(output, expected); + // Single output let output: Vec> = SampleCombinations::new(false, 1, 2).unwrap().collect(); let expected = vec![vec![0, 1]]; From 94624751f6030d5a8874acd5a50894ddf836236f Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Mon, 6 Jul 2020 16:57:08 +0100 Subject: [PATCH 07/24] Refactored and documented --- src/ngram_utils/mod.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index f37226e..afdb3f8 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -178,7 +178,10 @@ impl KSkipNGrams { } /// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. -/// The iterator consumes the input `items` only once. +/// +/// The iterator consumes the input iterator only once and holds a window of items to generate the +/// grams from which is stepped forward as it consumes the input. It also correctly generates left +/// or right padding if specified. pub struct KSkipNGramsIter<'a> { // Params items: Box + 'a>, @@ -189,18 +192,18 @@ pub struct KSkipNGramsIter<'a> { pad_right: Option<&'a str>, // Iterator state - window: VecDeque<&'a str>, /// Window which holds items that have been consumed - n: usize, + window: VecDeque<&'a str>, /// Gram length that was yielded last - p: usize, + n: usize, /// Amount of padding included in item yielded last - offset: usize, + p: usize, /// Offset used during MainEnd mode - sample_iter: Peekable, + offset: usize, /// k-skip combinations of current window - mode: IterMode, + sample_iter: Peekable, /// Current mode of iterator + mode: IterMode, first: bool, } From 56f756f79cbaa0a9e953ee95d374a3b4ca492e6d Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Tue, 7 Jul 2020 11:07:24 +0100 Subject: [PATCH 08/24] Added documentation to `SampleCombinations` --- src/ngram_utils/mod.rs | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index afdb3f8..85f5424 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -180,8 +180,8 @@ impl KSkipNGrams { /// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. /// /// The iterator consumes the input iterator only once and holds a window of items to generate the -/// grams from which is stepped forward as it consumes the input. It also correctly generates left -/// or right padding if specified. +/// grams. The window is stepped forward as it consumes the input. It also correctly generates +/// left or right padding if specified. pub struct KSkipNGramsIter<'a> { // Params items: Box + 'a>, @@ -667,6 +667,29 @@ enum IterMode { PadRight, } +/// An iterator which generates the list of combinations of `n` items in a range upto `max_i`. +/// It is possible to fix the first item at index 0 (i.e. `fix_0` == true) +/// +/// Examples: +/// ``` +/// use vtext::ngram_utils::*; +/// let output: Vec<_> = SampleCombinations::new(false, 3, 3).unwrap().collect(); +/// let expected = vec![ +/// vec![0, 1, 2], +/// vec![0, 1, 3], +/// vec![0, 2, 3], +/// vec![1, 2, 3] +/// ]; +/// assert_eq!(output, expected); +/// +/// let output: Vec<_> = SampleCombinations::new(true, 3, 3).unwrap().collect(); +/// let expected = vec![ +/// vec![0, 1, 2], +/// vec![0, 1, 3], +/// vec![0, 2, 3] +/// ]; +/// assert_eq!(output, expected); +/// ``` pub struct SampleCombinations { // Params min_i: usize, @@ -680,6 +703,12 @@ pub struct SampleCombinations { } impl SampleCombinations { + /// New `SampleCombinations` + /// + /// Parameters: + /// * `fix_0` - fix the first element at 0? + /// * `max_i` - the maximum index for the output elements + /// * `n` - number of items per combination pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { let min_i; if fix_0 { @@ -709,6 +738,7 @@ impl SampleCombinations { }) } + /// Produce dummy `SampleCombinations`. Will panic if `next` is executed. pub fn new_empty() -> SampleCombinations { SampleCombinations { min_i: 0, From da791fead252774eac44d3df1726771c91ae898c Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Wed, 15 Jul 2020 14:27:43 +0100 Subject: [PATCH 09/24] Change error to `EstimatorErr` enum and add `InputError` variant --- src/errors.rs | 4 +++- src/ngram_utils/mod.rs | 17 +++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/errors.rs b/src/errors.rs index 1f52f36..9979799 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -5,13 +5,15 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum EstimatorErr { - #[error("Invalid paramer: `{0}`")] + #[error("Invalid params: `{0}`")] InvalidParams(String), #[error("Invalid regex parameter")] RegexErr { #[from] source: regex::Error, }, + #[error("Invalid Input: `{0}`")] + InvalidInput(String), } #[cfg(feature = "python")] diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index 85f5424..3680998 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -9,6 +9,7 @@ use std::iter::Peekable; #[cfg(feature = "python")] use dict_derive::{FromPyObject, IntoPyObject}; use serde::{Deserialize, Serialize}; +use crate::errors::EstimatorErr; #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))] @@ -164,7 +165,7 @@ impl KSkipNGrams { items: Box + 'a>, pad_left: Option<&'a str>, pad_right: Option<&'a str>, - ) -> Result> + 'a>, InputError> { + ) -> Result> + 'a>, EstimatorErr> { let k_skip_n_grams_iter = KSkipNGramsIter::new( items, self.params.min_n, @@ -233,14 +234,14 @@ impl<'a> KSkipNGramsIter<'a> { max_k: usize, pad_left: Option<&'a str>, pad_right: Option<&'a str>, - ) -> Result, InputError> { + ) -> Result, EstimatorErr> { if min_n < 1 { - return Err(InputError( + return Err(EstimatorErr::InvalidParams( "`min_n` must be greater than or equal to 1".to_string(), )); } if min_n > max_n { - return Err(InputError( + return Err(EstimatorErr::InvalidParams( "`max_n` must be greater than or equal to `min_n`".to_string(), )); } @@ -276,7 +277,7 @@ impl<'a> KSkipNGramsIter<'a> { items: &mut Box + 'a>, max_n: usize, max_k: usize, - ) -> Result, InputError> { + ) -> Result, EstimatorErr> { let window_size = max_n + max_k; let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); @@ -286,7 +287,7 @@ impl<'a> KSkipNGramsIter<'a> { let next_item = items.next(); match next_item { None => { - return Err(InputError( + return Err(EstimatorErr::InvalidInput( "Items length is smaller than `max_n`+`max_k`".to_string(), )) } @@ -654,10 +655,6 @@ impl<'a> KSkipNGramsIter<'a> { } } -/// Error given when input is inconsistent -#[derive(Debug, Clone)] -pub struct InputError(String); - /// Represents the different modes of `KSkipNGramsIter` enum IterMode { Start, From 146a13bdd64ce71909c3631e6f8637d41bfdf585 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Wed, 15 Jul 2020 14:37:53 +0100 Subject: [PATCH 10/24] Make struct private and improve error --- src/ngram_utils/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index 3680998..7f06002 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -668,7 +668,7 @@ enum IterMode { /// It is possible to fix the first item at index 0 (i.e. `fix_0` == true) /// /// Examples: -/// ``` +/// ```text /// use vtext::ngram_utils::*; /// let output: Vec<_> = SampleCombinations::new(false, 3, 3).unwrap().collect(); /// let expected = vec![ @@ -687,7 +687,7 @@ enum IterMode { /// ]; /// assert_eq!(output, expected); /// ``` -pub struct SampleCombinations { +struct SampleCombinations { // Params min_i: usize, max_i: usize, @@ -706,7 +706,7 @@ impl SampleCombinations { /// * `fix_0` - fix the first element at 0? /// * `max_i` - the maximum index for the output elements /// * `n` - number of items per combination - pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { + pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { let min_i; if fix_0 { min_i = 1; @@ -715,7 +715,7 @@ impl SampleCombinations { } if max_i + 1 < n { - return Err("`max_i`+1 must be less than `n`"); + return Err(EstimatorErr::InvalidParams("`max_i`+1 must be less than `n`".to_string())); } let position: Vec = (0..n).collect(); From 461fd52c8e56076d546f4a465d38fba0fc3e3628 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Thu, 16 Jul 2020 11:26:38 +0100 Subject: [PATCH 11/24] Simplified code by chaining iterators. Parity with previous code tested via tests --- src/ngram_utils/mod.rs | 1288 ++++++++++++++++++++++++-------------- src/ngram_utils/tests.rs | 38 +- 2 files changed, 824 insertions(+), 502 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index 7f06002..f83c978 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -1,15 +1,15 @@ #[cfg(test)] mod tests; -use std::cmp::{max, min}; +use std::cmp::min; use std::collections::VecDeque; use std::iter; -use std::iter::Peekable; #[cfg(feature = "python")] use dict_derive::{FromPyObject, IntoPyObject}; use serde::{Deserialize, Serialize}; use crate::errors::EstimatorErr; +use itertools::Itertools; #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))] @@ -60,20 +60,6 @@ impl KSkipNGrams { KSkipNGramsParams::new(2, 2, 0).build() } - /// Generate all trigrams from a sequence of `items`, an iterator. - /// - /// Example: - /// ``` - /// use vtext::ngram_utils::*; - /// let sent = "One Two Three Four".split(" "); - /// let gramizer = KSkipNGrams::new_trigram(); - /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); - /// assert_eq!(grams, vec![vec!["One", "Two", "Three"], vec!["Two", "Three", "Four"]]); - /// ``` - pub fn new_trigram() -> KSkipNGrams { - KSkipNGramsParams::new(3, 3, 0).build() - } - /// Generate all ngrams from a sequence of `items`, an iterator. /// /// Example: @@ -166,7 +152,7 @@ impl KSkipNGrams { pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, EstimatorErr> { - let k_skip_n_grams_iter = KSkipNGramsIter::new( + let k_skip_n_grams_iter = KSkipNGramsIter2::new( items, self.params.min_n, self.params.max_n, @@ -178,38 +164,627 @@ impl KSkipNGrams { } } +// /// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. +// /// +// /// The iterator consumes the input iterator only once and holds a window of items to generate the +// /// grams. The window is stepped forward as it consumes the input. It also correctly generates +// /// left or right padding if specified. +// pub struct KSkipNGramsIter<'a> { +// // Params +// items: Box + 'a>, +// min_n: usize, +// max_n: usize, +// max_k: usize, +// pad_left: Option<&'a str>, +// pad_right: Option<&'a str>, +// +// // Iterator state +// /// Window which holds items that have been consumed +// window: VecDeque<&'a str>, +// /// Gram length that was yielded last +// n: usize, +// /// Amount of padding included in item yielded last +// p: usize, +// /// Offset used during MainEnd mode +// offset: usize, +// /// k-skip combinations of current window +// sample_iter: Peekable, +// /// Current mode of iterator +// mode: IterMode, +// first: bool, +// } +// +// /// Core methods to build `KSkipNGramsIter` +// impl<'a> KSkipNGramsIter<'a> { +// /// Build a new `KSkipNGramsIter`. +// /// +// /// Example: +// /// ``` +// /// use vtext::ngram_utils::*; +// /// let sent = "One Two Three".split(" "); +// /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); +// /// let grams: Vec> = grams_iter.unwrap().collect(); +// /// ``` +// /// +// /// Parameters: +// /// * `items` - Input iterator +// /// * `min_n` - The minimum degree of the ngram +// /// * `max_n` - The maximum degree of the ngram +// /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items +// /// * `pad_left` - Optional string to use as left padding +// /// * `pad_right` - Optional string to use as right padding +// pub fn new( +// mut items: Box + 'a>, +// min_n: usize, +// max_n: usize, +// max_k: usize, +// pad_left: Option<&'a str>, +// pad_right: Option<&'a str>, +// ) -> Result, EstimatorErr> { +// if min_n < 1 { +// return Err(EstimatorErr::InvalidParams( +// "`min_n` must be greater than or equal to 1".to_string(), +// )); +// } +// if min_n > max_n { +// return Err(EstimatorErr::InvalidParams( +// "`max_n` must be greater than or equal to `min_n`".to_string(), +// )); +// } +// let mut max_k = max_k; +// if max_n == 1 { +// max_k = 0; // if n == 1. k has no effect +// } +// +// let window = Self::build_window(&mut items, max_n, max_k)?; +// +// Ok(KSkipNGramsIter { +// // Params +// items, +// min_n, +// max_n, +// max_k, +// pad_left, +// pad_right, +// +// // Iterator state +// window, +// n: 0, +// p: 0, +// offset: 0, +// sample_iter: SampleCombinations::new_empty().peekable(), +// mode: IterMode::Start, +// first: false, +// }) +// } +// +// // Prepare and populate start window +// fn build_window( +// items: &mut Box + 'a>, +// max_n: usize, +// max_k: usize, +// ) -> Result, EstimatorErr> { +// let window_size = max_n + max_k; +// let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); +// +// // Populate window +// let mut i = window_size; +// while i > 0 { +// let next_item = items.next(); +// match next_item { +// None => { +// return Err(EstimatorErr::InvalidInput( +// "Items length is smaller than `max_n`+`max_k`".to_string(), +// )) +// } +// Some(s) => { +// window.push_back(s); +// } +// } +// i -= 1; +// } +// Ok(window) +// } +// } +// +// /// Iterator functions +// impl<'a> Iterator for KSkipNGramsIter<'a> { +// type Item = Vec<&'a str>; +// +// // Next item. Depending on current mode obtain next item. +// // If current mode has been exhausted then switch to next +// fn next(&mut self) -> Option { +// return match &self.mode { +// IterMode::Start => { +// self.start_mode_pad_left(); +// self.next() +// } +// +// IterMode::PadLeft => { +// if self.pad_left.is_some() && self.max_n > 1 { +// let next = self.next_gram_pad_left(); +// match &next { +// Some(_e) => next, +// None => { +// self.start_mode_main(); +// self.next() +// } +// } +// } else { +// self.start_mode_main(); +// self.next() +// } +// } +// +// IterMode::Main => { +// let next = self.next_gram_main(); +// match &next { +// Some(_e) => next, +// None => { +// self.start_mode_main_end(); +// self.next() +// } +// } +// } +// +// IterMode::MainEnd => { +// if (self.min_n != self.max_n || self.max_k > 0) && self.window.len() > 1 { +// let next = self.next_gram_main_end(); +// match &next { +// Some(_e) => next, +// None => { +// self.start_mode_pad_right(); +// self.next() +// } +// } +// } else { +// self.start_mode_pad_right(); +// self.next() +// } +// } +// +// IterMode::PadRight => { +// if self.pad_right.is_some() && self.max_n > 1 { +// self.next_gram_pad_right() +// } else { +// return None; +// } +// } +// }; +// } +// } +// +// /// Internal functions +// impl<'a> KSkipNGramsIter<'a> { +// // Switching between modes +// fn start_mode_pad_left(&mut self) { +// self.mode = IterMode::PadLeft; +// self.first = true; +// } +// +// fn start_mode_main(&mut self) { +// self.mode = IterMode::Main; +// self.first = true; +// } +// +// fn start_mode_main_end(&mut self) { +// self.mode = IterMode::MainEnd; +// self.first = true; +// } +// +// fn start_mode_pad_right(&mut self) { +// self.mode = IterMode::PadRight; +// self.first = true; +// } +// +// // Obtain next gram for PadLeft mode +// fn next_gram_pad_left(&mut self) -> Option> { +// self.next_params_pad_left()?; +// +// let slice_idx: Vec = self.sample_iter.next().unwrap(); +// let grams = self.construct_grams_vec(slice_idx); +// Some(grams) +// } +// +// // Obtain next gram for PadRight mode +// fn next_gram_pad_right(&mut self) -> Option> { +// self.next_params_pad_right()?; +// +// let mut sample_idx: Vec = self.sample_iter.next().unwrap(); +// +// // Mirror index +// for e in sample_idx.iter_mut() { +// *e = self.window.len() - 1 - *e; +// } +// sample_idx.reverse(); +// +// let grams = self.construct_grams_vec(sample_idx); +// Some(grams) +// } +// +// // Obtain next gram for Main mode +// fn next_gram_main(&mut self) -> Option> { +// let finished = self.next_state_pad_main(); +// +// if finished.is_none() { +// self.forward_window()?; +// self.first = true; +// return self.next_gram_main(); +// } +// +// let sample_idx = self.sample_iter.next().unwrap(); +// let grams = self.construct_grams_vec(sample_idx); +// Some(grams) +// } +// +// // Obtain next gram for MainEnd mode +// fn next_gram_main_end(&mut self) -> Option> { +// self.next_state_pad_main_end()?; +// +// let mut sample_idx = self.sample_iter.next().unwrap(); +// // Offset index +// for e in sample_idx.iter_mut() { +// *e += self.offset; +// } +// let grams = self.construct_grams_vec(sample_idx); +// Some(grams) +// } +// +// // Forward the window by one step +// fn forward_window(&mut self) -> Option<()> { +// // Need to forward window when yielded ngram of max-length and max-skip-size +// let next_item = self.items.next(); +// +// return match next_item { +// None => None, +// Some(s) => { +// self.window.pop_front(); +// self.window.push_back(s); +// Some(()) // Successfully forwarded window +// } +// }; +// } +// +// // Increment parameters and sample iterator +// fn next_params_pad_left(&mut self) -> Option<()> { +// // Equivalent to a for-loop: +// // for n in max(self.min_n, 2)..self.max_n+1: +// // for p in (n-1)..0: // decreasing +// // for sample_idx in sample_iter: +// // next_gram(n, p, sample_idx) +// return if self.first { +// self.n = max(self.min_n, 2); +// self.p = self.n - 1; +// self.sample_iter = +// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) +// .unwrap() +// .peekable(); +// +// self.first = false; +// Some(()) +// } else if self.sample_iter.peek().is_some() { +// Some(()) +// } else if self.p > 1 { +// self.p -= 1; +// +// self.sample_iter = +// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) +// .unwrap() +// .peekable(); +// +// Some(()) +// } else if self.n < self.max_n { +// self.n += 1; +// self.p = self.n - 1; +// +// self.sample_iter = +// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) +// .unwrap() +// .peekable(); +// +// Some(()) +// } else { +// None +// }; +// } +// +// // Increment parameters and sample iterator +// fn next_params_pad_right(&mut self) -> Option<()> { +// // Equivalent to a for-loop: +// // for n in max(self.min_n, 2)..self.max_n+1: +// // for p in 1..n: +// // for sample_idx in sample_iter: +// // next_gram(n, p, sample_idx) +// return if self.first { +// self.n = max(self.min_n, 2); +// self.p = 1; +// self.first = false; +// +// self.sample_iter = +// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) +// .unwrap() +// .peekable(); +// +// Some(()) +// } else if self.sample_iter.peek().is_some() { +// Some(()) +// } else if self.p < self.n - 1 { +// self.p += 1; +// +// self.sample_iter = +// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) +// .unwrap() +// .peekable(); +// +// Some(()) +// } else if self.n < self.max_n { +// self.n += 1; +// self.p = 1; +// +// self.sample_iter = +// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) +// .unwrap() +// .peekable(); +// +// Some(()) +// } else { +// None +// }; +// } +// +// // Increment parameters and sample iterator for each window +// fn next_state_pad_main(&mut self) -> Option<()> { +// // Equivalent to a for-loop: +// // for n in self.min_n..self.max_n + 1: +// // for sample_idx in sample_iter: +// // next_gram(n, sample_idx) +// return if self.first { +// self.n = self.min_n; +// self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n) +// .unwrap() +// .peekable(); +// +// self.first = false; +// Some(()) +// } else if self.sample_iter.peek().is_some() { +// Some(()) +// } else if self.n < min(self.max_n, self.window.len()) { +// self.n += 1; +// self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n) +// .unwrap() +// .peekable(); +// +// Some(()) +// } else { +// None +// }; +// } +// +// // Increment parameters and sample iterator for each window +// fn next_state_pad_main_end(&mut self) -> Option<()> { +// // Equivalent to a for-loop: +// // for offset in 1..window.len()-min_n +// // for n in self.min_n..self.max_n + 1: +// // for sample_idx in sample_iter: +// // next_gram(offset, n, sample_idx) +// return if self.first { +// self.n = self.min_n; +// self.offset = 1; +// self.reset_sample_iter_main_end(); +// +// self.first = false; +// Some(()) +// } else if self.sample_iter.peek().is_some() { +// Some(()) +// } else if self.n < min(self.max_n, self.window.len() - self.offset) { +// self.n += 1; +// self.reset_sample_iter_main_end(); +// +// Some(()) +// } else if self.window.len() - self.offset > self.min_n { +// self.offset += 1; +// self.n = self.min_n; +// self.reset_sample_iter_main_end(); +// +// Some(()) +// } else { +// None +// }; +// } +// +// fn reset_sample_iter_main_end(&mut self) { +// let window_len = self.window.len() - self.offset; +// let mut k = 0; +// if window_len > self.n { +// k = min(self.max_k, window_len - self.n); +// } +// let max_i = self.n + k - 1; +// self.sample_iter = SampleCombinations::new(true, max_i, self.n) +// .unwrap() +// .peekable(); +// } +// +// // Create output vec from sample index and add padding if necessary +// fn construct_grams_vec(&mut self, sample_idx: Vec) -> Vec<&'a str> { +// let grams = self.vec_from_idx(sample_idx); +// +// return match self.mode { +// IterMode::PadLeft => { +// // Add padding to the left +// [ +// iter::repeat(self.pad_left.unwrap()).take(self.p).collect(), +// grams, +// ] +// .concat() +// } +// +// IterMode::PadRight => { +// // Add padding to the right +// [ +// grams, +// iter::repeat(self.pad_right.unwrap()).take(self.p).collect(), +// ] +// .concat() +// } +// +// _ => grams, +// }; +// } +// +// // Create output vec from sample index +// fn vec_from_idx(&mut self, sample_idx: Vec) -> Vec<&'a str> { +// let mut grams = Vec::with_capacity(sample_idx.len()); +// for idx in sample_idx.iter() { +// grams.push(self.window[*idx].clone()); +// } +// grams +// } +// } +// +// /// Represents the different modes of `KSkipNGramsIter` +// enum IterMode { +// Start, +// PadLeft, +// Main, +// MainEnd, +// PadRight, +// } + +/// An iterator which generates the list of combinations of `n` items in a range upto `max_i`. +/// It is possible to fix the first item at index 0 (i.e. `fix_0` == true) +/// +/// Examples: +/// ```text +/// use vtext::ngram_utils::*; +/// let output: Vec<_> = SampleCombinations::new(false, 3, 3).unwrap().collect(); +/// let expected = vec![ +/// vec![0, 1, 2], +/// vec![0, 1, 3], +/// vec![0, 2, 3], +/// vec![1, 2, 3] +/// ]; +/// assert_eq!(output, expected); +/// +/// let output: Vec<_> = SampleCombinations::new(true, 3, 3).unwrap().collect(); +/// let expected = vec![ +/// vec![0, 1, 2], +/// vec![0, 1, 3], +/// vec![0, 2, 3] +/// ]; +/// assert_eq!(output, expected); +/// ``` +struct SampleCombinations { + // Params + min_i: usize, + max_i: usize, + n: usize, + + // State + position: Vec, + first: bool, + last: bool, +} + +impl SampleCombinations { + /// New `SampleCombinations` + /// + /// Parameters: + /// * `fix_0` - fix the first element at 0? + /// * `max_i` - the maximum index for the output elements + /// * `n` - number of items per combination + pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { + let min_i; + if fix_0 { + min_i = 1; + } else { + min_i = 0; + } + + if max_i + 1 < n { + return Err(EstimatorErr::InvalidParams("`max_i`+1 must be less than `n`".to_string())); + } + + let position: Vec = (0..n).collect(); + + let mut last = false; + if n == max_i + 1 { + last = true; + } + + Ok(SampleCombinations { + min_i, + max_i, + n, + position, + first: true, + last, + }) + } + + /// Produce dummy `SampleCombinations`. Will panic if `next` is executed. + pub fn new_empty() -> SampleCombinations { + SampleCombinations { + min_i: 0, + max_i: 0, + n: 0, + position: Vec::new(), + first: false, + last: false, + } + } +} + +impl Iterator for SampleCombinations { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.first { + self.first = false; + return Some(self.position.clone()); + } + if self.last { + return None; + } + + for i in (self.min_i..self.position.len()).rev() { + let e = self.position[i]; + if e < self.max_i - (self.n - i - 1) { + let mut e_1 = e; + for j in i..self.position.len() { + e_1 += 1; + self.position[j] = e_1; + } + if i == self.min_i && e + 1 == self.max_i { + self.last = true; + } + return Some(self.position.clone()); + } + } + None // Will never reach + } +} + + +// ------------------------------------------------------------------------------------------------ + + /// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. /// /// The iterator consumes the input iterator only once and holds a window of items to generate the /// grams. The window is stepped forward as it consumes the input. It also correctly generates /// left or right padding if specified. -pub struct KSkipNGramsIter<'a> { +pub struct NGramIter<'a> { // Params items: Box + 'a>, - min_n: usize, - max_n: usize, - max_k: usize, - pad_left: Option<&'a str>, - pad_right: Option<&'a str>, // Iterator state /// Window which holds items that have been consumed window: VecDeque<&'a str>, - /// Gram length that was yielded last - n: usize, - /// Amount of padding included in item yielded last - p: usize, - /// Offset used during MainEnd mode - offset: usize, - /// k-skip combinations of current window - sample_iter: Peekable, - /// Current mode of iterator - mode: IterMode, first: bool, } /// Core methods to build `KSkipNGramsIter` -impl<'a> KSkipNGramsIter<'a> { +impl<'a> NGramIter<'a> { /// Build a new `KSkipNGramsIter`. /// /// Example: @@ -229,56 +804,38 @@ impl<'a> KSkipNGramsIter<'a> { /// * `pad_right` - Optional string to use as right padding pub fn new( mut items: Box + 'a>, - min_n: usize, - max_n: usize, - max_k: usize, + n: usize, pad_left: Option<&'a str>, pad_right: Option<&'a str>, - ) -> Result, EstimatorErr> { - if min_n < 1 { + ) -> Result, EstimatorErr> { + if n < 1 { return Err(EstimatorErr::InvalidParams( "`min_n` must be greater than or equal to 1".to_string(), )); } - if min_n > max_n { - return Err(EstimatorErr::InvalidParams( - "`max_n` must be greater than or equal to `min_n`".to_string(), - )); - } - let mut max_k = max_k; - if max_n == 1 { - max_k = 0; // if n == 1. k has no effect + + if pad_left.is_some() || pad_right.is_some() { + items = pad_items(items, n, pad_left, pad_right); } - let window = Self::build_window(&mut items, max_n, max_k)?; + let window = Self::build_window(&mut items, n)?; - Ok(KSkipNGramsIter { + Ok(NGramIter { // Params items, - min_n, - max_n, - max_k, - pad_left, - pad_right, // Iterator state window, - n: 0, - p: 0, - offset: 0, - sample_iter: SampleCombinations::new_empty().peekable(), - mode: IterMode::Start, - first: false, + first: true, }) } // Prepare and populate start window fn build_window( items: &mut Box + 'a>, - max_n: usize, - max_k: usize, + n: usize, ) -> Result, EstimatorErr> { - let window_size = max_n + max_k; + let window_size = n; let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); // Populate window @@ -288,7 +845,7 @@ impl<'a> KSkipNGramsIter<'a> { match next_item { None => { return Err(EstimatorErr::InvalidInput( - "Items length is smaller than `max_n`+`max_k`".to_string(), + "Items length is smaller than `n`".to_string(), )) } Some(s) => { @@ -302,478 +859,241 @@ impl<'a> KSkipNGramsIter<'a> { } /// Iterator functions -impl<'a> Iterator for KSkipNGramsIter<'a> { +impl<'a> Iterator for NGramIter<'a> { type Item = Vec<&'a str>; - // Next item. Depending on current mode obtain next item. - // If current mode has been exhausted then switch to next fn next(&mut self) -> Option { - return match &self.mode { - IterMode::Start => { - self.start_mode_pad_left(); - self.next() - } + if self.first { + self.first = false; + return Some(Vec::from(self.window.clone())); + } - IterMode::PadLeft => { - if self.pad_left.is_some() && self.max_n > 1 { - let next = self.next_gram_pad_left(); - match &next { - Some(_e) => next, - None => { - self.start_mode_main(); - self.next() - } - } - } else { - self.start_mode_main(); - self.next() - } - } + let next_item = self.items.next()?; - IterMode::Main => { - let next = self.next_gram_main(); - match &next { - Some(_e) => next, - None => { - self.start_mode_main_end(); - self.next() - } - } - } - - IterMode::MainEnd => { - if (self.min_n != self.max_n || self.max_k > 0) && self.window.len() > 1 { - let next = self.next_gram_main_end(); - match &next { - Some(_e) => next, - None => { - self.start_mode_pad_right(); - self.next() - } - } - } else { - self.start_mode_pad_right(); - self.next() - } - } + self.window.pop_front(); + self.window.push_back(next_item); - IterMode::PadRight => { - if self.pad_right.is_some() && self.max_n > 1 { - self.next_gram_pad_right() - } else { - return None; - } - } - }; + return Some(Vec::from(self.window.clone())); } } -/// Internal functions -impl<'a> KSkipNGramsIter<'a> { - // Switching between modes - fn start_mode_pad_left(&mut self) { - self.mode = IterMode::PadLeft; - self.first = true; - } - - fn start_mode_main(&mut self) { - self.mode = IterMode::Main; - self.first = true; - } - fn start_mode_main_end(&mut self) { - self.mode = IterMode::MainEnd; - self.first = true; - } - - fn start_mode_pad_right(&mut self) { - self.mode = IterMode::PadRight; - self.first = true; - } - - // Obtain next gram for PadLeft mode - fn next_gram_pad_left(&mut self) -> Option> { - self.next_params_pad_left()?; - - let slice_idx: Vec = self.sample_iter.next().unwrap(); - let grams = self.construct_grams_vec(slice_idx); - Some(grams) - } - - // Obtain next gram for PadRight mode - fn next_gram_pad_right(&mut self) -> Option> { - self.next_params_pad_right()?; +pub struct SkipGramIter<'a> { + // Params + items: Box + 'a>, + n: usize, + max_k: usize, - let mut sample_idx: Vec = self.sample_iter.next().unwrap(); + // Iterator state + /// Window which holds items that have been consumed + window: VecDeque<&'a str>, + sample_iter: SampleCombinations, + last: bool, +} - // Mirror index - for e in sample_idx.iter_mut() { - *e = self.window.len() - 1 - *e; +/// Core methods to build `KSkipNGramsIter` +impl<'a> SkipGramIter<'a> { + /// Build a new `KSkipNGramsIter`. + /// + /// Example: + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three".split(" "); + /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); + /// let grams: Vec> = grams_iter.unwrap().collect(); + /// ``` + /// + /// Parameters: + /// * `items` - Input iterator + /// * `min_n` - The minimum degree of the ngram + /// * `max_n` - The maximum degree of the ngram + /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items + /// * `pad_left` - Optional string to use as left padding + /// * `pad_right` - Optional string to use as right padding + pub fn new( + mut items: Box + 'a>, + n: usize, + max_k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result, EstimatorErr> { + if n < 1 { + return Err(EstimatorErr::InvalidParams( + "`min_n` must be greater than or equal to 1".to_string(), + )); } - sample_idx.reverse(); - - let grams = self.construct_grams_vec(sample_idx); - Some(grams) - } - // Obtain next gram for Main mode - fn next_gram_main(&mut self) -> Option> { - let finished = self.next_state_pad_main(); - - if finished.is_none() { - self.forward_window()?; - self.first = true; - return self.next_gram_main(); + if pad_left.is_some() || pad_right.is_some() { + items = pad_items(items, n, pad_left, pad_right); } - let sample_idx = self.sample_iter.next().unwrap(); - let grams = self.construct_grams_vec(sample_idx); - Some(grams) - } - - // Obtain next gram for MainEnd mode - fn next_gram_main_end(&mut self) -> Option> { - self.next_state_pad_main_end()?; + let window = Self::build_window(&mut items, n, max_k)?; + let sample_iter = SampleCombinations::new(true, n+max_k-1, n)?; - let mut sample_idx = self.sample_iter.next().unwrap(); - // Offset index - for e in sample_idx.iter_mut() { - *e += self.offset; - } - let grams = self.construct_grams_vec(sample_idx); - Some(grams) - } + Ok(SkipGramIter { + // Params + items, + n, + max_k, - // Forward the window by one step - fn forward_window(&mut self) -> Option<()> { - // Need to forward window when yielded ngram of max-length and max-skip-size - let next_item = self.items.next(); - - return match next_item { - None => None, - Some(s) => { - self.window.pop_front(); - self.window.push_back(s); - Some(()) // Successfully forwarded window - } - }; + // Iterator state + window, + sample_iter, + last: false + }) } - // Increment parameters and sample iterator - fn next_params_pad_left(&mut self) -> Option<()> { - // Equivalent to a for-loop: - // for n in max(self.min_n, 2)..self.max_n+1: - // for p in (n-1)..0: // decreasing - // for sample_idx in sample_iter: - // next_gram(n, p, sample_idx) - return if self.first { - self.n = max(self.min_n, 2); - self.p = self.n - 1; - self.sample_iter = - SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) - .unwrap() - .peekable(); + // Prepare and populate start window + fn build_window( + items: &mut Box + 'a>, + n: usize, + max_k: usize, + ) -> Result, EstimatorErr> { + let window_size = n+max_k; + let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); - self.first = false; - Some(()) - } else if self.sample_iter.peek().is_some() { - Some(()) - } else if self.p > 1 { - self.p -= 1; - - self.sample_iter = - SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) - .unwrap() - .peekable(); - - Some(()) - } else if self.n < self.max_n { - self.n += 1; - self.p = self.n - 1; - - self.sample_iter = - SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) - .unwrap() - .peekable(); - - Some(()) - } else { - None - }; + // Populate window + let mut i = window_size; + while i > 0 { + let next_item = items.next(); + match next_item { + None => { + return Err(EstimatorErr::InvalidInput( + "Items length is smaller than `n`+`max_k`".to_string(), + )) + } + Some(s) => { + window.push_back(s); + } + } + i -= 1; + } + Ok(window) } +} - // Increment parameters and sample iterator - fn next_params_pad_right(&mut self) -> Option<()> { - // Equivalent to a for-loop: - // for n in max(self.min_n, 2)..self.max_n+1: - // for p in 1..n: - // for sample_idx in sample_iter: - // next_gram(n, p, sample_idx) - return if self.first { - self.n = max(self.min_n, 2); - self.p = 1; - self.first = false; +/// Iterator functions +impl<'a> Iterator for SkipGramIter<'a> { + type Item = Vec<&'a str>; - self.sample_iter = - SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) - .unwrap() - .peekable(); - - Some(()) - } else if self.sample_iter.peek().is_some() { - Some(()) - } else if self.p < self.n - 1 { - self.p += 1; - - self.sample_iter = - SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) - .unwrap() - .peekable(); - - Some(()) - } else if self.n < self.max_n { - self.n += 1; - self.p = 1; - - self.sample_iter = - SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) - .unwrap() - .peekable(); - - Some(()) - } else { - None - }; - } + fn next(&mut self) -> Option { - // Increment parameters and sample iterator for each window - fn next_state_pad_main(&mut self) -> Option<()> { - // Equivalent to a for-loop: - // for n in self.min_n..self.max_n + 1: - // for sample_idx in sample_iter: - // next_gram(n, sample_idx) - return if self.first { - self.n = self.min_n; - self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n) - .unwrap() - .peekable(); + let next_sample = self.sample_iter.next(); - self.first = false; - Some(()) - } else if self.sample_iter.peek().is_some() { - Some(()) - } else if self.n < min(self.max_n, self.window.len()) { - self.n += 1; - self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n) - .unwrap() - .peekable(); - - Some(()) - } else { - None - }; - } + return match next_sample { + None => { + // forward window + let next_item = self.items.next(); // If ended then return None - // Increment parameters and sample iterator for each window - fn next_state_pad_main_end(&mut self) -> Option<()> { - // Equivalent to a for-loop: - // for offset in 1..window.len()-min_n - // for n in self.min_n..self.max_n + 1: - // for sample_idx in sample_iter: - // next_gram(offset, n, sample_idx) - return if self.first { - self.n = self.min_n; - self.offset = 1; - self.reset_sample_iter_main_end(); + match next_item { + Some(item) => { + self.window.pop_front(); + self.window.push_back(item); - self.first = false; - Some(()) - } else if self.sample_iter.peek().is_some() { - Some(()) - } else if self.n < min(self.max_n, self.window.len() - self.offset) { - self.n += 1; - self.reset_sample_iter_main_end(); - - Some(()) - } else if self.window.len() - self.offset > self.min_n { - self.offset += 1; - self.n = self.min_n; - self.reset_sample_iter_main_end(); - - Some(()) - } else { - None - }; - } + self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n).unwrap(); - fn reset_sample_iter_main_end(&mut self) { - let window_len = self.window.len() - self.offset; - let mut k = 0; - if window_len > self.n { - k = min(self.max_k, window_len - self.n); - } - let max_i = self.n + k - 1; - self.sample_iter = SampleCombinations::new(true, max_i, self.n) - .unwrap() - .peekable(); - } + self.next() + }, - // Create output vec from sample index and add padding if necessary - fn construct_grams_vec(&mut self, sample_idx: Vec) -> Vec<&'a str> { - let grams = self.vec_from_idx(sample_idx); - - return match self.mode { - IterMode::PadLeft => { - // Add padding to the left - [ - iter::repeat(self.pad_left.unwrap()).take(self.p).collect(), - grams, - ] - .concat() - } + None => { + // 1. Reduce window + if self.window.len() > self.n { + // reduce window + self.window.pop_front(); + } else { + return None; + } - IterMode::PadRight => { - // Add padding to the right - [ - grams, - iter::repeat(self.pad_right.unwrap()).take(self.p).collect(), - ] - .concat() + let k = min(self.max_k, self.window.len() - self.n); + self.sample_iter = SampleCombinations::new(true, self.n + k - 1, self.n).unwrap(); + self.next() + } + } + }, + Some(sample_idx) => { + let mut sample = Vec::with_capacity(sample_idx.len()); + for idx in sample_idx.iter() { + sample.push(self.window[*idx].clone()); + } + Some(sample) } - - _ => grams, - }; - } - - // Create output vec from sample index - fn vec_from_idx(&mut self, sample_idx: Vec) -> Vec<&'a str> { - let mut grams = Vec::with_capacity(sample_idx.len()); - for idx in sample_idx.iter() { - grams.push(self.window[*idx].clone()); } - grams } } -/// Represents the different modes of `KSkipNGramsIter` -enum IterMode { - Start, - PadLeft, - Main, - MainEnd, - PadRight, -} -/// An iterator which generates the list of combinations of `n` items in a range upto `max_i`. -/// It is possible to fix the first item at index 0 (i.e. `fix_0` == true) -/// -/// Examples: -/// ```text -/// use vtext::ngram_utils::*; -/// let output: Vec<_> = SampleCombinations::new(false, 3, 3).unwrap().collect(); -/// let expected = vec![ -/// vec![0, 1, 2], -/// vec![0, 1, 3], -/// vec![0, 2, 3], -/// vec![1, 2, 3] -/// ]; -/// assert_eq!(output, expected); -/// -/// let output: Vec<_> = SampleCombinations::new(true, 3, 3).unwrap().collect(); -/// let expected = vec![ -/// vec![0, 1, 2], -/// vec![0, 1, 3], -/// vec![0, 2, 3] -/// ]; -/// assert_eq!(output, expected); -/// ``` -struct SampleCombinations { +pub struct KSkipNGramsIter2<'a> { // Params - min_i: usize, - max_i: usize, - n: usize, - - // State - position: Vec, - first: bool, - last: bool, + iter: Box> + 'a>, } -impl SampleCombinations { - /// New `SampleCombinations` - /// - /// Parameters: - /// * `fix_0` - fix the first element at 0? - /// * `max_i` - the maximum index for the output elements - /// * `n` - number of items per combination - pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { - let min_i; - if fix_0 { - min_i = 1; - } else { - min_i = 0; - } - - if max_i + 1 < n { - return Err(EstimatorErr::InvalidParams("`max_i`+1 must be less than `n`".to_string())); - } - - let position: Vec = (0..n).collect(); - - let mut last = false; - if n == max_i + 1 { - last = true; +impl<'a> KSkipNGramsIter2<'a> { + pub fn new( + mut items: Box + 'a>, + min_n: usize, + max_n: usize, + max_k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result, EstimatorErr> { + let mut iter: Box> + 'a> = Box::new(iter::empty()); + + for n in min_n..max_n+1 { + let (iter_split_1, iter_split_0) = items.tee(); + items = Box::new(iter_split_0); + + if max_k == 0 { + let sub_iter = NGramIter::new(Box::new(iter_split_1), n, pad_left, pad_right)?; + iter = Box::new(iter.chain(sub_iter)); + } else { + let sub_iter = SkipGramIter::new(Box::new(iter_split_1), n, max_k, pad_left, pad_right)?; + iter = Box::new(iter.chain(sub_iter)); + } } - Ok(SampleCombinations { - min_i, - max_i, - n, - position, - first: true, - last, + Ok(KSkipNGramsIter2 { + iter }) } +} - /// Produce dummy `SampleCombinations`. Will panic if `next` is executed. - pub fn new_empty() -> SampleCombinations { - SampleCombinations { - min_i: 0, - max_i: 0, - n: 0, - position: Vec::new(), - first: false, - last: false, - } +impl<'a> Iterator for KSkipNGramsIter2<'a> { + type Item = Vec<&'a str>; + + fn next(&mut self) -> Option { + self.iter.next() } } -impl Iterator for SampleCombinations { - type Item = Vec; - fn next(&mut self) -> Option { - if self.first { - self.first = false; - return Some(self.position.clone()); +fn pad_items<'a>( + items: Box + 'a>, + n: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, +) -> Box + 'a> { + let left_chained: Box>; + let all_chained: Box>; + + match pad_left { + Some(s) => { + let pad_left_iter = iter::repeat(s).take(n - 1); + left_chained = Box::new(pad_left_iter.chain(items)); } - if self.last { - return None; + None => { + left_chained = items; } + } - for i in (self.min_i..self.position.len()).rev() { - let e = self.position[i]; - if e < self.max_i - (self.n - i - 1) { - let mut e_1 = e; - for j in i..self.position.len() { - e_1 += 1; - self.position[j] = e_1; - } - if i == self.min_i && e + 1 == self.max_i { - self.last = true; - } - return Some(self.position.clone()); - } + match pad_right { + Some(s) => { + let pad_right_iter = iter::repeat(s).take(n - 1); + all_chained = Box::new(left_chained.chain(pad_right_iter)); + } + None => { + all_chained = left_chained; } - None // Will never reach } + + all_chained } diff --git a/src/ngram_utils/tests.rs b/src/ngram_utils/tests.rs index ff4a4e9..1fd939b 100644 --- a/src/ngram_utils/tests.rs +++ b/src/ngram_utils/tests.rs @@ -26,7 +26,7 @@ fn test_bigram() { fn test_trigram() { let sent = "Mary had a little lamb".split(" "); - let gramizer = KSkipNGrams::new_trigram(); + let gramizer = KSkipNGrams::new_ngrams(3); let grams: Vec> = gramizer .transform(Box::new(sent.clone()), Some(""), Some("")) .unwrap() @@ -44,7 +44,7 @@ fn test_trigram() { assert_eq!(grams, expected); - let gramizer = KSkipNGrams::new_trigram(); + let gramizer = KSkipNGrams::new_ngrams(3); let grams: Vec> = gramizer .transform(Box::new(sent.clone()), None, Some("")) .unwrap() @@ -96,24 +96,24 @@ fn test_everygram() { .collect(); let expected = vec![ - vec!["", "Mary"], - vec!["", "", "Mary"], - vec!["", "Mary", "had"], vec!["Mary"], - vec!["Mary", "had"], - vec!["Mary", "had", "a"], vec!["had"], - vec!["had", "a"], - vec!["had", "a", "little"], vec!["a"], - vec!["a", "little"], - vec!["a", "little", "lamb"], vec!["little"], - vec!["little", "lamb"], vec!["lamb"], + vec!["", "Mary"], + vec!["Mary", "had"], + vec!["had", "a"], + vec!["a", "little"], + vec!["little", "lamb"], vec!["lamb", ""], + vec!["", "", "Mary"], + vec!["", "Mary", "had"], + vec!["Mary", "had", "a"], + vec!["had", "a", "little"], + vec!["a", "little", "lamb"], vec!["little", "lamb", ""], - vec!["lamb", "", ""], + vec!["lamb", "", ""] ]; assert_eq!(grams, expected); @@ -139,8 +139,8 @@ fn test_skipgram() { vec!["a", "little"], vec!["a", "lamb"], vec!["little", "lamb"], - vec!["lamb", ""], vec!["little", ""], + vec!["lamb", ""], ]; assert_eq!(grams, expected); @@ -155,6 +155,7 @@ fn test_skipgram() { vec!["", "", "Mary"], vec!["", "", "had"], vec!["", "Mary", "had"], + vec!["", "Mary", "had"], vec!["", "Mary", "a"], vec!["", "had", "a"], vec!["Mary", "had", "a"], @@ -164,11 +165,12 @@ fn test_skipgram() { vec!["had", "a", "lamb"], vec!["had", "little", "lamb"], vec!["a", "little", "lamb"], - vec!["little", "lamb", ""], - vec!["a", "lamb", ""], vec!["a", "little", ""], - vec!["lamb", "", ""], + vec!["a", "lamb", ""], + vec!["little", "lamb", ""], + vec!["little", "lamb", ""], vec!["little", "", ""], + vec!["lamb", "", ""], ]; assert_eq!(grams, expected); @@ -338,4 +340,4 @@ fn test_sample_combinations() { let output: Vec> = SampleCombinations::new(true, 0, 1).unwrap().collect(); let expected = vec![vec![0]]; assert_eq!(output, expected); -} +} \ No newline at end of file From e6dab70d5af7f4a60ef8a1bf64308fd96784c6c3 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Thu, 16 Jul 2020 12:00:24 +0100 Subject: [PATCH 12/24] Tidied code, docs and cargo fmt --- src/ngram_utils/mod.rs | 878 ++++++++++----------------------------- src/ngram_utils/tests.rs | 18 +- 2 files changed, 227 insertions(+), 669 deletions(-) diff --git a/src/ngram_utils/mod.rs b/src/ngram_utils/mod.rs index f83c978..d68bba6 100644 --- a/src/ngram_utils/mod.rs +++ b/src/ngram_utils/mod.rs @@ -5,11 +5,11 @@ use std::cmp::min; use std::collections::VecDeque; use std::iter; +use crate::errors::EstimatorErr; #[cfg(feature = "python")] use dict_derive::{FromPyObject, IntoPyObject}; -use serde::{Deserialize, Serialize}; -use crate::errors::EstimatorErr; use itertools::Itertools; +use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))] @@ -60,7 +60,7 @@ impl KSkipNGrams { KSkipNGramsParams::new(2, 2, 0).build() } - /// Generate all ngrams from a sequence of `items`, an iterator. + /// Generate n-grams from a sequence of `items`, an iterator. /// /// Example: /// ``` @@ -77,7 +77,7 @@ impl KSkipNGrams { KSkipNGramsParams::new(n, n, 0).build() } - /// Generate all ngrams between `min_n` and `max_n` from a sequence of `items`, an iterator. + /// Generate all n-grams between `min_n` and `max_n` from a sequence of `items`, an iterator. /// /// Example: /// ``` @@ -86,8 +86,8 @@ impl KSkipNGrams { /// let gramizer = KSkipNGrams::new_everygrams(1, 3); /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); /// assert_eq!(grams, vec![ - /// vec!["One"], vec!["One", "Two"], vec!["One", "Two", "Three"], vec!["Two"], - /// vec!["Two", "Three"], vec!["Three"]]); + /// vec!["One"], vec!["Two"], vec!["Three"], vec!["One", "Two"], vec!["Two", "Three"], + /// vec!["One", "Two", "Three"]]); /// ``` /// /// Paramaters: @@ -127,9 +127,9 @@ impl KSkipNGrams { /// let sent = "One Two Three Four".split(" "); /// let gramizer = KSkipNGrams::new(2, 3, 1); /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); - /// assert_eq!(grams, vec![vec!["One", "Two"], vec!["One", "Three"], vec!["One", "Two", "Three"], - /// vec!["One", "Two", "Four"], vec!["One", "Three", "Four"], vec!["Two", "Three"], - /// vec!["Two", "Four"], vec!["Two", "Three", "Four"], vec!["Three", "Four"]]); + /// assert_eq!(grams, vec![vec!["One", "Two"], vec!["One", "Three"], vec!["Two", "Three"], + /// vec!["Two", "Four"], vec!["Three", "Four"], vec!["One", "Two", "Three"], + /// vec!["One", "Two", "Four"], vec!["One", "Three", "Four"], vec!["Two", "Three", "Four"]]); /// ``` /// /// Paramaters: @@ -152,7 +152,7 @@ impl KSkipNGrams { pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result> + 'a>, EstimatorErr> { - let k_skip_n_grams_iter = KSkipNGramsIter2::new( + let k_skip_n_grams_iter = KSkipNGramsIter::new( items, self.params.min_n, self.params.max_n, @@ -164,614 +164,73 @@ impl KSkipNGrams { } } -// /// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. -// /// -// /// The iterator consumes the input iterator only once and holds a window of items to generate the -// /// grams. The window is stepped forward as it consumes the input. It also correctly generates -// /// left or right padding if specified. -// pub struct KSkipNGramsIter<'a> { -// // Params -// items: Box + 'a>, -// min_n: usize, -// max_n: usize, -// max_k: usize, -// pad_left: Option<&'a str>, -// pad_right: Option<&'a str>, -// -// // Iterator state -// /// Window which holds items that have been consumed -// window: VecDeque<&'a str>, -// /// Gram length that was yielded last -// n: usize, -// /// Amount of padding included in item yielded last -// p: usize, -// /// Offset used during MainEnd mode -// offset: usize, -// /// k-skip combinations of current window -// sample_iter: Peekable, -// /// Current mode of iterator -// mode: IterMode, -// first: bool, -// } -// -// /// Core methods to build `KSkipNGramsIter` -// impl<'a> KSkipNGramsIter<'a> { -// /// Build a new `KSkipNGramsIter`. -// /// -// /// Example: -// /// ``` -// /// use vtext::ngram_utils::*; -// /// let sent = "One Two Three".split(" "); -// /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); -// /// let grams: Vec> = grams_iter.unwrap().collect(); -// /// ``` -// /// -// /// Parameters: -// /// * `items` - Input iterator -// /// * `min_n` - The minimum degree of the ngram -// /// * `max_n` - The maximum degree of the ngram -// /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items -// /// * `pad_left` - Optional string to use as left padding -// /// * `pad_right` - Optional string to use as right padding -// pub fn new( -// mut items: Box + 'a>, -// min_n: usize, -// max_n: usize, -// max_k: usize, -// pad_left: Option<&'a str>, -// pad_right: Option<&'a str>, -// ) -> Result, EstimatorErr> { -// if min_n < 1 { -// return Err(EstimatorErr::InvalidParams( -// "`min_n` must be greater than or equal to 1".to_string(), -// )); -// } -// if min_n > max_n { -// return Err(EstimatorErr::InvalidParams( -// "`max_n` must be greater than or equal to `min_n`".to_string(), -// )); -// } -// let mut max_k = max_k; -// if max_n == 1 { -// max_k = 0; // if n == 1. k has no effect -// } -// -// let window = Self::build_window(&mut items, max_n, max_k)?; -// -// Ok(KSkipNGramsIter { -// // Params -// items, -// min_n, -// max_n, -// max_k, -// pad_left, -// pad_right, -// -// // Iterator state -// window, -// n: 0, -// p: 0, -// offset: 0, -// sample_iter: SampleCombinations::new_empty().peekable(), -// mode: IterMode::Start, -// first: false, -// }) -// } -// -// // Prepare and populate start window -// fn build_window( -// items: &mut Box + 'a>, -// max_n: usize, -// max_k: usize, -// ) -> Result, EstimatorErr> { -// let window_size = max_n + max_k; -// let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); -// -// // Populate window -// let mut i = window_size; -// while i > 0 { -// let next_item = items.next(); -// match next_item { -// None => { -// return Err(EstimatorErr::InvalidInput( -// "Items length is smaller than `max_n`+`max_k`".to_string(), -// )) -// } -// Some(s) => { -// window.push_back(s); -// } -// } -// i -= 1; -// } -// Ok(window) -// } -// } -// -// /// Iterator functions -// impl<'a> Iterator for KSkipNGramsIter<'a> { -// type Item = Vec<&'a str>; -// -// // Next item. Depending on current mode obtain next item. -// // If current mode has been exhausted then switch to next -// fn next(&mut self) -> Option { -// return match &self.mode { -// IterMode::Start => { -// self.start_mode_pad_left(); -// self.next() -// } -// -// IterMode::PadLeft => { -// if self.pad_left.is_some() && self.max_n > 1 { -// let next = self.next_gram_pad_left(); -// match &next { -// Some(_e) => next, -// None => { -// self.start_mode_main(); -// self.next() -// } -// } -// } else { -// self.start_mode_main(); -// self.next() -// } -// } -// -// IterMode::Main => { -// let next = self.next_gram_main(); -// match &next { -// Some(_e) => next, -// None => { -// self.start_mode_main_end(); -// self.next() -// } -// } -// } -// -// IterMode::MainEnd => { -// if (self.min_n != self.max_n || self.max_k > 0) && self.window.len() > 1 { -// let next = self.next_gram_main_end(); -// match &next { -// Some(_e) => next, -// None => { -// self.start_mode_pad_right(); -// self.next() -// } -// } -// } else { -// self.start_mode_pad_right(); -// self.next() -// } -// } -// -// IterMode::PadRight => { -// if self.pad_right.is_some() && self.max_n > 1 { -// self.next_gram_pad_right() -// } else { -// return None; -// } -// } -// }; -// } -// } -// -// /// Internal functions -// impl<'a> KSkipNGramsIter<'a> { -// // Switching between modes -// fn start_mode_pad_left(&mut self) { -// self.mode = IterMode::PadLeft; -// self.first = true; -// } -// -// fn start_mode_main(&mut self) { -// self.mode = IterMode::Main; -// self.first = true; -// } -// -// fn start_mode_main_end(&mut self) { -// self.mode = IterMode::MainEnd; -// self.first = true; -// } -// -// fn start_mode_pad_right(&mut self) { -// self.mode = IterMode::PadRight; -// self.first = true; -// } -// -// // Obtain next gram for PadLeft mode -// fn next_gram_pad_left(&mut self) -> Option> { -// self.next_params_pad_left()?; -// -// let slice_idx: Vec = self.sample_iter.next().unwrap(); -// let grams = self.construct_grams_vec(slice_idx); -// Some(grams) -// } -// -// // Obtain next gram for PadRight mode -// fn next_gram_pad_right(&mut self) -> Option> { -// self.next_params_pad_right()?; -// -// let mut sample_idx: Vec = self.sample_iter.next().unwrap(); -// -// // Mirror index -// for e in sample_idx.iter_mut() { -// *e = self.window.len() - 1 - *e; -// } -// sample_idx.reverse(); -// -// let grams = self.construct_grams_vec(sample_idx); -// Some(grams) -// } -// -// // Obtain next gram for Main mode -// fn next_gram_main(&mut self) -> Option> { -// let finished = self.next_state_pad_main(); -// -// if finished.is_none() { -// self.forward_window()?; -// self.first = true; -// return self.next_gram_main(); -// } -// -// let sample_idx = self.sample_iter.next().unwrap(); -// let grams = self.construct_grams_vec(sample_idx); -// Some(grams) -// } -// -// // Obtain next gram for MainEnd mode -// fn next_gram_main_end(&mut self) -> Option> { -// self.next_state_pad_main_end()?; -// -// let mut sample_idx = self.sample_iter.next().unwrap(); -// // Offset index -// for e in sample_idx.iter_mut() { -// *e += self.offset; -// } -// let grams = self.construct_grams_vec(sample_idx); -// Some(grams) -// } -// -// // Forward the window by one step -// fn forward_window(&mut self) -> Option<()> { -// // Need to forward window when yielded ngram of max-length and max-skip-size -// let next_item = self.items.next(); -// -// return match next_item { -// None => None, -// Some(s) => { -// self.window.pop_front(); -// self.window.push_back(s); -// Some(()) // Successfully forwarded window -// } -// }; -// } -// -// // Increment parameters and sample iterator -// fn next_params_pad_left(&mut self) -> Option<()> { -// // Equivalent to a for-loop: -// // for n in max(self.min_n, 2)..self.max_n+1: -// // for p in (n-1)..0: // decreasing -// // for sample_idx in sample_iter: -// // next_gram(n, p, sample_idx) -// return if self.first { -// self.n = max(self.min_n, 2); -// self.p = self.n - 1; -// self.sample_iter = -// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) -// .unwrap() -// .peekable(); -// -// self.first = false; -// Some(()) -// } else if self.sample_iter.peek().is_some() { -// Some(()) -// } else if self.p > 1 { -// self.p -= 1; -// -// self.sample_iter = -// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) -// .unwrap() -// .peekable(); -// -// Some(()) -// } else if self.n < self.max_n { -// self.n += 1; -// self.p = self.n - 1; -// -// self.sample_iter = -// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) -// .unwrap() -// .peekable(); -// -// Some(()) -// } else { -// None -// }; -// } -// -// // Increment parameters and sample iterator -// fn next_params_pad_right(&mut self) -> Option<()> { -// // Equivalent to a for-loop: -// // for n in max(self.min_n, 2)..self.max_n+1: -// // for p in 1..n: -// // for sample_idx in sample_iter: -// // next_gram(n, p, sample_idx) -// return if self.first { -// self.n = max(self.min_n, 2); -// self.p = 1; -// self.first = false; -// -// self.sample_iter = -// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) -// .unwrap() -// .peekable(); -// -// Some(()) -// } else if self.sample_iter.peek().is_some() { -// Some(()) -// } else if self.p < self.n - 1 { -// self.p += 1; -// -// self.sample_iter = -// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) -// .unwrap() -// .peekable(); -// -// Some(()) -// } else if self.n < self.max_n { -// self.n += 1; -// self.p = 1; -// -// self.sample_iter = -// SampleCombinations::new(false, self.n + self.max_k - self.p - 1, self.n - self.p) -// .unwrap() -// .peekable(); -// -// Some(()) -// } else { -// None -// }; -// } -// -// // Increment parameters and sample iterator for each window -// fn next_state_pad_main(&mut self) -> Option<()> { -// // Equivalent to a for-loop: -// // for n in self.min_n..self.max_n + 1: -// // for sample_idx in sample_iter: -// // next_gram(n, sample_idx) -// return if self.first { -// self.n = self.min_n; -// self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n) -// .unwrap() -// .peekable(); -// -// self.first = false; -// Some(()) -// } else if self.sample_iter.peek().is_some() { -// Some(()) -// } else if self.n < min(self.max_n, self.window.len()) { -// self.n += 1; -// self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n) -// .unwrap() -// .peekable(); -// -// Some(()) -// } else { -// None -// }; -// } -// -// // Increment parameters and sample iterator for each window -// fn next_state_pad_main_end(&mut self) -> Option<()> { -// // Equivalent to a for-loop: -// // for offset in 1..window.len()-min_n -// // for n in self.min_n..self.max_n + 1: -// // for sample_idx in sample_iter: -// // next_gram(offset, n, sample_idx) -// return if self.first { -// self.n = self.min_n; -// self.offset = 1; -// self.reset_sample_iter_main_end(); -// -// self.first = false; -// Some(()) -// } else if self.sample_iter.peek().is_some() { -// Some(()) -// } else if self.n < min(self.max_n, self.window.len() - self.offset) { -// self.n += 1; -// self.reset_sample_iter_main_end(); -// -// Some(()) -// } else if self.window.len() - self.offset > self.min_n { -// self.offset += 1; -// self.n = self.min_n; -// self.reset_sample_iter_main_end(); -// -// Some(()) -// } else { -// None -// }; -// } -// -// fn reset_sample_iter_main_end(&mut self) { -// let window_len = self.window.len() - self.offset; -// let mut k = 0; -// if window_len > self.n { -// k = min(self.max_k, window_len - self.n); -// } -// let max_i = self.n + k - 1; -// self.sample_iter = SampleCombinations::new(true, max_i, self.n) -// .unwrap() -// .peekable(); -// } -// -// // Create output vec from sample index and add padding if necessary -// fn construct_grams_vec(&mut self, sample_idx: Vec) -> Vec<&'a str> { -// let grams = self.vec_from_idx(sample_idx); -// -// return match self.mode { -// IterMode::PadLeft => { -// // Add padding to the left -// [ -// iter::repeat(self.pad_left.unwrap()).take(self.p).collect(), -// grams, -// ] -// .concat() -// } -// -// IterMode::PadRight => { -// // Add padding to the right -// [ -// grams, -// iter::repeat(self.pad_right.unwrap()).take(self.p).collect(), -// ] -// .concat() -// } -// -// _ => grams, -// }; -// } -// -// // Create output vec from sample index -// fn vec_from_idx(&mut self, sample_idx: Vec) -> Vec<&'a str> { -// let mut grams = Vec::with_capacity(sample_idx.len()); -// for idx in sample_idx.iter() { -// grams.push(self.window[*idx].clone()); -// } -// grams -// } -// } -// -// /// Represents the different modes of `KSkipNGramsIter` -// enum IterMode { -// Start, -// PadLeft, -// Main, -// MainEnd, -// PadRight, -// } - -/// An iterator which generates the list of combinations of `n` items in a range upto `max_i`. -/// It is possible to fix the first item at index 0 (i.e. `fix_0` == true) -/// -/// Examples: -/// ```text -/// use vtext::ngram_utils::*; -/// let output: Vec<_> = SampleCombinations::new(false, 3, 3).unwrap().collect(); -/// let expected = vec![ -/// vec![0, 1, 2], -/// vec![0, 1, 3], -/// vec![0, 2, 3], -/// vec![1, 2, 3] -/// ]; -/// assert_eq!(output, expected); +/// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. /// -/// let output: Vec<_> = SampleCombinations::new(true, 3, 3).unwrap().collect(); -/// let expected = vec![ -/// vec![0, 1, 2], -/// vec![0, 1, 3], -/// vec![0, 2, 3] -/// ]; -/// assert_eq!(output, expected); -/// ``` -struct SampleCombinations { - // Params - min_i: usize, - max_i: usize, - n: usize, - - // State - position: Vec, - first: bool, - last: bool, +/// It also correctly generates left or right padding if specified. +pub struct KSkipNGramsIter<'a> { + iter: Box> + 'a>, } -impl SampleCombinations { - /// New `SampleCombinations` +/// Core methods to build `KSkipNGramsIter` +impl<'a> KSkipNGramsIter<'a> { + /// Build a new `KSkipNGramsIter`. + /// + /// Example: + /// ``` + /// use vtext::ngram_utils::*; + /// let sent = "One Two Three".split(" "); + /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); + /// let grams: Vec> = grams_iter.unwrap().collect(); + /// ``` /// /// Parameters: - /// * `fix_0` - fix the first element at 0? - /// * `max_i` - the maximum index for the output elements - /// * `n` - number of items per combination - pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { - let min_i; - if fix_0 { - min_i = 1; - } else { - min_i = 0; - } - - if max_i + 1 < n { - return Err(EstimatorErr::InvalidParams("`max_i`+1 must be less than `n`".to_string())); - } + /// * `items` - Input iterator + /// * `min_n` - The minimum degree of the ngram + /// * `max_n` - The maximum degree of the ngram + /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items + /// * `pad_left` - Optional string to use as left padding + /// * `pad_right` - Optional string to use as right padding + pub fn new( + mut items: Box + 'a>, + min_n: usize, + max_n: usize, + max_k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result, EstimatorErr> { + let mut iter: Box> + 'a> = Box::new(iter::empty()); - let position: Vec = (0..n).collect(); + for n in min_n..max_n + 1 { + let (iter_split_1, iter_split_0) = items.tee(); + items = Box::new(iter_split_0); - let mut last = false; - if n == max_i + 1 { - last = true; + if max_k == 0 { + let sub_iter = NGramIter::new(Box::new(iter_split_1), n, pad_left, pad_right)?; + iter = Box::new(iter.chain(sub_iter)); + } else { + let sub_iter = + SkipGramIter::new(Box::new(iter_split_1), n, max_k, pad_left, pad_right)?; + iter = Box::new(iter.chain(sub_iter)); + } } - Ok(SampleCombinations { - min_i, - max_i, - n, - position, - first: true, - last, - }) - } - - /// Produce dummy `SampleCombinations`. Will panic if `next` is executed. - pub fn new_empty() -> SampleCombinations { - SampleCombinations { - min_i: 0, - max_i: 0, - n: 0, - position: Vec::new(), - first: false, - last: false, - } + Ok(KSkipNGramsIter { iter }) } } -impl Iterator for SampleCombinations { - type Item = Vec; +/// Iterator functions +impl<'a> Iterator for KSkipNGramsIter<'a> { + type Item = Vec<&'a str>; fn next(&mut self) -> Option { - if self.first { - self.first = false; - return Some(self.position.clone()); - } - if self.last { - return None; - } - - for i in (self.min_i..self.position.len()).rev() { - let e = self.position[i]; - if e < self.max_i - (self.n - i - 1) { - let mut e_1 = e; - for j in i..self.position.len() { - e_1 += 1; - self.position[j] = e_1; - } - if i == self.min_i && e + 1 == self.max_i { - self.last = true; - } - return Some(self.position.clone()); - } - } - None // Will never reach + self.iter.next() } } - -// ------------------------------------------------------------------------------------------------ - - -/// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. +/// An iterator which provided with a sequence of `items` transforms into n-grams. /// /// The iterator consumes the input iterator only once and holds a window of items to generate the -/// grams. The window is stepped forward as it consumes the input. It also correctly generates +/// n-grams. The window is stepped forward as it consumes the input. It also correctly generates /// left or right padding if specified. pub struct NGramIter<'a> { // Params @@ -783,23 +242,21 @@ pub struct NGramIter<'a> { first: bool, } -/// Core methods to build `KSkipNGramsIter` +/// Core method to build `NGramIter` impl<'a> NGramIter<'a> { - /// Build a new `KSkipNGramsIter`. + /// Build a new `NGramIter`. /// /// Example: /// ``` /// use vtext::ngram_utils::*; /// let sent = "One Two Three".split(" "); - /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); + /// let grams_iter = NGramIter::new(Box::new(sent), 1, Some(""), Some("")); /// let grams: Vec> = grams_iter.unwrap().collect(); /// ``` /// /// Parameters: /// * `items` - Input iterator - /// * `min_n` - The minimum degree of the ngram - /// * `max_n` - The maximum degree of the ngram - /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items + /// * `min_n` - The degree of the ngrams /// * `pad_left` - Optional string to use as left padding /// * `pad_right` - Optional string to use as right padding pub fn new( @@ -815,7 +272,7 @@ impl<'a> NGramIter<'a> { } if pad_left.is_some() || pad_right.is_some() { - items = pad_items(items, n, pad_left, pad_right); + items = pad_items(items, n, pad_left, pad_right)?; } let window = Self::build_window(&mut items, n)?; @@ -830,7 +287,7 @@ impl<'a> NGramIter<'a> { }) } - // Prepare and populate start window + /// Prepare and populate start window fn build_window( items: &mut Box + 'a>, n: usize, @@ -868,8 +325,8 @@ impl<'a> Iterator for NGramIter<'a> { return Some(Vec::from(self.window.clone())); } + // Forward window or when self.items return None let next_item = self.items.next()?; - self.window.pop_front(); self.window.push_back(next_item); @@ -877,7 +334,11 @@ impl<'a> Iterator for NGramIter<'a> { } } - +/// An iterator which provided with a sequence of `items` transforms into k-skip-grams. +/// +/// The iterator consumes the input iterator only once and holds a window of items to generate the +/// k-skip-grams. The window is stepped forward as it consumes the input. It also correctly +/// generates left or right padding if specified. pub struct SkipGramIter<'a> { // Params items: Box + 'a>, @@ -888,25 +349,23 @@ pub struct SkipGramIter<'a> { /// Window which holds items that have been consumed window: VecDeque<&'a str>, sample_iter: SampleCombinations, - last: bool, } -/// Core methods to build `KSkipNGramsIter` +/// Core methods to build `SkipGramIter` impl<'a> SkipGramIter<'a> { - /// Build a new `KSkipNGramsIter`. + /// Build a new `SkipGramIter`. /// /// Example: /// ``` /// use vtext::ngram_utils::*; /// let sent = "One Two Three".split(" "); - /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); + /// let grams_iter = SkipGramIter::new(Box::new(sent), 1, 2, Some(""), Some("")); /// let grams: Vec> = grams_iter.unwrap().collect(); /// ``` /// /// Parameters: /// * `items` - Input iterator - /// * `min_n` - The minimum degree of the ngram - /// * `max_n` - The maximum degree of the ngram + /// * `n` - The degree of the ngram /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items /// * `pad_left` - Optional string to use as left padding /// * `pad_right` - Optional string to use as right padding @@ -924,11 +383,11 @@ impl<'a> SkipGramIter<'a> { } if pad_left.is_some() || pad_right.is_some() { - items = pad_items(items, n, pad_left, pad_right); + items = pad_items(items, n, pad_left, pad_right)?; } let window = Self::build_window(&mut items, n, max_k)?; - let sample_iter = SampleCombinations::new(true, n+max_k-1, n)?; + let sample_iter = SampleCombinations::new(true, n + max_k - 1, n)?; Ok(SkipGramIter { // Params @@ -939,7 +398,6 @@ impl<'a> SkipGramIter<'a> { // Iterator state window, sample_iter, - last: false }) } @@ -949,7 +407,7 @@ impl<'a> SkipGramIter<'a> { n: usize, max_k: usize, ) -> Result, EstimatorErr> { - let window_size = n+max_k; + let window_size = n + max_k; let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); // Populate window @@ -977,101 +435,187 @@ impl<'a> Iterator for SkipGramIter<'a> { type Item = Vec<&'a str>; fn next(&mut self) -> Option { - let next_sample = self.sample_iter.next(); return match next_sample { - None => { - // forward window - let next_item = self.items.next(); // If ended then return None + // Generate and return samples using self.sample_iter + Some(sample_idx) => { + let mut sample = Vec::with_capacity(sample_idx.len()); + for idx in sample_idx.iter() { + sample.push(self.window[*idx].clone()); + } + Some(sample) + } + // Sample_iter finished so attempt to forward window + None => { + // Try to forward window + let next_item = self.items.next(); match next_item { + // Forward window Some(item) => { self.window.pop_front(); self.window.push_back(item); - self.sample_iter = SampleCombinations::new(true, self.n + self.max_k - 1, self.n).unwrap(); + self.sample_iter = + SampleCombinations::new(true, self.n + self.max_k - 1, self.n).unwrap(); self.next() - }, + } + // self.items finished. So reduce window size iteratively to n and then finish None => { - // 1. Reduce window if self.window.len() > self.n { // reduce window self.window.pop_front(); } else { + // finished return None; } + // Generate samples from smaller window let k = min(self.max_k, self.window.len() - self.n); - self.sample_iter = SampleCombinations::new(true, self.n + k - 1, self.n).unwrap(); + self.sample_iter = + SampleCombinations::new(true, self.n + k - 1, self.n).unwrap(); self.next() } } - }, - Some(sample_idx) => { - let mut sample = Vec::with_capacity(sample_idx.len()); - for idx in sample_idx.iter() { - sample.push(self.window[*idx].clone()); - } - Some(sample) } - } + }; } } - -pub struct KSkipNGramsIter2<'a> { +/// An iterator which generates the list of combinations of `n` items in a range upto `max_i`. +/// It is possible to fix the first item at index 0 (i.e. `fix_0` == true) +/// +/// Examples: +/// ```text +/// use vtext::ngram_utils::*; +/// let output: Vec<_> = SampleCombinations::new(false, 3, 3).unwrap().collect(); +/// let expected = vec![ +/// vec![0, 1, 2], +/// vec![0, 1, 3], +/// vec![0, 2, 3], +/// vec![1, 2, 3] +/// ]; +/// assert_eq!(output, expected); +/// +/// let output: Vec<_> = SampleCombinations::new(true, 3, 3).unwrap().collect(); +/// let expected = vec![ +/// vec![0, 1, 2], +/// vec![0, 1, 3], +/// vec![0, 2, 3] +/// ]; +/// assert_eq!(output, expected); +/// ``` +struct SampleCombinations { // Params - iter: Box> + 'a>, + min_i: usize, + max_i: usize, + n: usize, + + // State + position: Vec, + first: bool, + last: bool, } -impl<'a> KSkipNGramsIter2<'a> { - pub fn new( - mut items: Box + 'a>, - min_n: usize, - max_n: usize, - max_k: usize, - pad_left: Option<&'a str>, - pad_right: Option<&'a str>, - ) -> Result, EstimatorErr> { - let mut iter: Box> + 'a> = Box::new(iter::empty()); +impl SampleCombinations { + /// New `SampleCombinations` + /// + /// Parameters: + /// * `fix_0` - fix the first element at 0? + /// * `max_i` - the maximum index for the output elements + /// * `n` - number of items per combination + pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { + let min_i; + if fix_0 { + min_i = 1; + } else { + min_i = 0; + } - for n in min_n..max_n+1 { - let (iter_split_1, iter_split_0) = items.tee(); - items = Box::new(iter_split_0); + if max_i + 1 < n { + return Err(EstimatorErr::InvalidParams( + "`max_i`+1 must be less than `n`".to_string(), + )); + } - if max_k == 0 { - let sub_iter = NGramIter::new(Box::new(iter_split_1), n, pad_left, pad_right)?; - iter = Box::new(iter.chain(sub_iter)); - } else { - let sub_iter = SkipGramIter::new(Box::new(iter_split_1), n, max_k, pad_left, pad_right)?; - iter = Box::new(iter.chain(sub_iter)); - } + let position: Vec = (0..n).collect(); + + let mut last = false; + if n == max_i + 1 { + last = true; } - Ok(KSkipNGramsIter2 { - iter + Ok(SampleCombinations { + min_i, + max_i, + n, + position, + first: true, + last, }) } } -impl<'a> Iterator for KSkipNGramsIter2<'a> { - type Item = Vec<&'a str>; +impl Iterator for SampleCombinations { + type Item = Vec; fn next(&mut self) -> Option { - self.iter.next() + if self.first { + self.first = false; + return Some(self.position.clone()); + } + if self.last { + return None; + } + + for i in (self.min_i..self.position.len()).rev() { + let e = self.position[i]; + if e < self.max_i - (self.n - i - 1) { + let mut e_1 = e; + for j in i..self.position.len() { + e_1 += 1; + self.position[j] = e_1; + } + if i == self.min_i && e + 1 == self.max_i { + self.last = true; + } + return Some(self.position.clone()); + } + } + None // Will never reach } } - -fn pad_items<'a>( +/// Pad an integrator left and/or right with tokens. +/// +/// Example: +/// ``` +/// use vtext::ngram_utils::*; +/// let sent = "One Two Three".split(" "); +/// let grams_iter = SkipGramIter::new(Box::new(sent), 1, 2, Some(""), Some("")); +/// let grams: Vec> = grams_iter.unwrap().collect(); +/// ``` +/// +/// Parameters: +/// * `items` - Input iterator +/// * `n` - The degree of the ngram +/// * `pad_left` - Optional string to use as left padding +/// * `pad_right` - Optional string to use as right padding +pub fn pad_items<'a>( items: Box + 'a>, n: usize, pad_left: Option<&'a str>, pad_right: Option<&'a str>, -) -> Box + 'a> { +) -> Result + 'a>, EstimatorErr> { + if n < 1 { + return Err(EstimatorErr::InvalidParams( + "`n` must be greater than or equal to 1".to_string(), + )); + } + let left_chained: Box>; let all_chained: Box>; @@ -1095,5 +639,5 @@ fn pad_items<'a>( } } - all_chained + Ok(all_chained) } diff --git a/src/ngram_utils/tests.rs b/src/ngram_utils/tests.rs index 1fd939b..4cea617 100644 --- a/src/ngram_utils/tests.rs +++ b/src/ngram_utils/tests.rs @@ -113,7 +113,7 @@ fn test_everygram() { vec!["had", "a", "little"], vec!["a", "little", "lamb"], vec!["little", "lamb", ""], - vec!["lamb", "", ""] + vec!["lamb", "", ""], ]; assert_eq!(grams, expected); @@ -340,4 +340,18 @@ fn test_sample_combinations() { let output: Vec> = SampleCombinations::new(true, 0, 1).unwrap().collect(); let expected = vec![vec![0]]; assert_eq!(output, expected); -} \ No newline at end of file +} + +#[test] +fn test_padding() { + let iter = "Mary had a little lamb".split(" "); + + let output_iter = pad_items(Box::new(iter), 3, Some(""), Some("")).unwrap(); + let output: Vec<&str> = output_iter.collect(); + + let expected = vec![ + "", "", "Mary", "had", "a", "little", "lamb", "", "", + ]; + + assert_eq!(output, expected); +} From 298ca3577dce74959e94dae7a956b610c4e737ed Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Thu, 16 Jul 2020 12:10:41 +0100 Subject: [PATCH 13/24] Module name renamed to `token_processing` --- src/lib.rs | 2 +- src/{ngram_utils => token_processing}/mod.rs | 22 +++++++++---------- .../tests.rs | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) rename src/{ngram_utils => token_processing}/mod.rs (97%) rename src/{ngram_utils => token_processing}/tests.rs (99%) diff --git a/src/lib.rs b/src/lib.rs index 9934a78..d56f8fe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,7 +41,7 @@ assert_eq!(tokens, vec!["Flights", "ca", "n't", "depart", "after", "2:00", "pm", pub mod errors; mod math; pub mod metrics; -pub mod ngram_utils; +pub mod token_processing; pub mod tokenize; pub mod tokenize_sentence; pub mod vectorize; diff --git a/src/ngram_utils/mod.rs b/src/token_processing/mod.rs similarity index 97% rename from src/ngram_utils/mod.rs rename to src/token_processing/mod.rs index d68bba6..d4d4b41 100644 --- a/src/ngram_utils/mod.rs +++ b/src/token_processing/mod.rs @@ -50,7 +50,7 @@ impl KSkipNGrams { /// Example: /// /// ``` - /// use vtext::ngram_utils::*; + /// use vtext::token_processing::*; /// let sent = "One Two Three Four".split(" "); /// let gramizer = KSkipNGrams::new_bigram(); /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); @@ -64,7 +64,7 @@ impl KSkipNGrams { /// /// Example: /// ``` - /// use vtext::ngram_utils::*; + /// use vtext::token_processing::*; /// let sent = "One Two Three Four".split(" "); /// let gramizer = KSkipNGrams::new_ngrams(3); /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); @@ -81,7 +81,7 @@ impl KSkipNGrams { /// /// Example: /// ``` - /// use vtext::ngram_utils::*; + /// use vtext::token_processing::*; /// let sent = "One Two Three".split(" "); /// let gramizer = KSkipNGrams::new_everygrams(1, 3); /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); @@ -102,7 +102,7 @@ impl KSkipNGrams { /// /// Example: /// ``` - /// use vtext::ngram_utils::*; + /// use vtext::token_processing::*; /// let sent = "One Two Three Four Five".split(" "); /// let gramizer = KSkipNGrams::new_skipgrams(3, 2); /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); @@ -123,7 +123,7 @@ impl KSkipNGrams { /// /// Example: /// ``` - /// use vtext::ngram_utils::*; + /// use vtext::token_processing::*; /// let sent = "One Two Three Four".split(" "); /// let gramizer = KSkipNGrams::new(2, 3, 1); /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); @@ -177,7 +177,7 @@ impl<'a> KSkipNGramsIter<'a> { /// /// Example: /// ``` - /// use vtext::ngram_utils::*; + /// use vtext::token_processing::*; /// let sent = "One Two Three".split(" "); /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); /// let grams: Vec> = grams_iter.unwrap().collect(); @@ -248,7 +248,7 @@ impl<'a> NGramIter<'a> { /// /// Example: /// ``` - /// use vtext::ngram_utils::*; + /// use vtext::token_processing::*; /// let sent = "One Two Three".split(" "); /// let grams_iter = NGramIter::new(Box::new(sent), 1, Some(""), Some("")); /// let grams: Vec> = grams_iter.unwrap().collect(); @@ -357,7 +357,7 @@ impl<'a> SkipGramIter<'a> { /// /// Example: /// ``` - /// use vtext::ngram_utils::*; + /// use vtext::token_processing::*; /// let sent = "One Two Three".split(" "); /// let grams_iter = SkipGramIter::new(Box::new(sent), 1, 2, Some(""), Some("")); /// let grams: Vec> = grams_iter.unwrap().collect(); @@ -490,7 +490,7 @@ impl<'a> Iterator for SkipGramIter<'a> { /// /// Examples: /// ```text -/// use vtext::ngram_utils::*; +/// use vtext::token_processing::*; /// let output: Vec<_> = SampleCombinations::new(false, 3, 3).unwrap().collect(); /// let expected = vec![ /// vec![0, 1, 2], @@ -527,7 +527,7 @@ impl SampleCombinations { /// * `fix_0` - fix the first element at 0? /// * `max_i` - the maximum index for the output elements /// * `n` - number of items per combination - pub fn new(fix_0: bool, max_i: usize, n: usize) -> Result { + fn new(fix_0: bool, max_i: usize, n: usize) -> Result { let min_i; if fix_0 { min_i = 1; @@ -593,7 +593,7 @@ impl Iterator for SampleCombinations { /// /// Example: /// ``` -/// use vtext::ngram_utils::*; +/// use vtext::token_processing::*; /// let sent = "One Two Three".split(" "); /// let grams_iter = SkipGramIter::new(Box::new(sent), 1, 2, Some(""), Some("")); /// let grams: Vec> = grams_iter.unwrap().collect(); diff --git a/src/ngram_utils/tests.rs b/src/token_processing/tests.rs similarity index 99% rename from src/ngram_utils/tests.rs rename to src/token_processing/tests.rs index 4cea617..80379f7 100644 --- a/src/ngram_utils/tests.rs +++ b/src/token_processing/tests.rs @@ -1,4 +1,4 @@ -use crate::ngram_utils::*; +use crate::token_processing::*; use std::collections::HashSet; use std::iter::FromIterator; From 9453129f4818a31924604bc557ca628875344be6 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Thu, 16 Jul 2020 15:51:29 +0100 Subject: [PATCH 14/24] KSkipNGrams Python API --- python/src/lib.rs | 2 + python/src/token_processing.rs | 96 +++++++++++++++++++++ python/vtext/tests/test_token_processing.py | 40 +++++++++ python/vtext/token_processing.py | 9 ++ src/token_processing/mod.rs | 6 ++ 5 files changed, 153 insertions(+) create mode 100644 python/src/token_processing.rs create mode 100644 python/vtext/tests/test_token_processing.py create mode 100644 python/vtext/token_processing.py diff --git a/python/src/lib.rs b/python/src/lib.rs index 071f28a..5d1525d 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -8,6 +8,7 @@ use pyo3::prelude::*; use pyo3::wrap_pyfunction; mod stem; +mod token_processing; mod tokenize; mod tokenize_sentence; mod utils; @@ -183,6 +184,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_wrapped(wrap_pyfunction!(dice_similarity))?; m.add_wrapped(wrap_pyfunction!(jaro_similarity))?; m.add_wrapped(wrap_pyfunction!(jaro_winkler_similarity))?; diff --git a/python/src/token_processing.rs b/python/src/token_processing.rs new file mode 100644 index 0000000..13a8eef --- /dev/null +++ b/python/src/token_processing.rs @@ -0,0 +1,96 @@ +// Copyright 2019 vtext developers +// +// Licensed under the Apache License, Version 2.0, +// . This file may not be copied, +// modified, or distributed except according to those terms. + +use pyo3::prelude::*; +use pyo3::types::{PyIterator, PyList, PyString}; +use pyo3::PyIterProtocol; + +use crate::utils::{deserialize_params, serialize_params}; +use vtext::token_processing::*; + +/// __init__(self, min_n: int, max_n: int, max_k: int) +/// +/// K-Skip-N-Grams generator +/// +/// Provided with a list of tokens it generates k-skip-n-grams. +/// +/// Parameters +/// ---------- +/// min_n : int +/// The minimum degree of the ngram +/// max_n : int +/// The maximum degree of the ngram +/// max_k : int +/// The maximum-degree of the skipgram: the total max skip between items +#[pyclass(module = "vtext.token_processing")] +pub struct KSkipNGrams { + inner: vtext::token_processing::KSkipNGrams, +} + +#[pymethods] +impl KSkipNGrams { + #[new] + fn new(min_n: usize, max_n: usize, max_k: usize) -> PyResult { + let kskipngrams = vtext::token_processing::KSkipNGrams::new(min_n, max_n, max_k); + Ok(KSkipNGrams { inner: kskipngrams }) + } + + /// transform(self, items: List[str], + /// pad_left: Optional[str]=None, pad_right: Optional[str]=None) -> List[List[str]] + /// + /// Transforms a given sequence of `items` into k-skip-n-grams. + /// + /// Parameters + /// ---------- + /// items : List[str] + /// The list of items to create the k-skip-n-grams of. + /// pad_left : Optional[str] + /// Optional string to use as left padding + /// pad_right : Optional[str] + /// Optional string to use as right padding + /// + /// Returns + /// ------- + /// k-skip-n-grams : List[List[str]] + /// computed k-skip-n-grams + #[args(pad_left = "None", pad_right = "None")] + fn transform<'py>( + &self, + py: Python<'py>, + items: Vec<&str>, + pad_left: Option<&str>, + pad_right: Option<&str>, + ) -> PyResult<&'py PyList> { + let res: Vec<_> = self + .inner + .transform(Box::new(items.into_iter()), pad_left, pad_right)? + .collect(); + let output = PyList::new(py, res); + Ok(output) + } + + /// get_params(self, x) + /// + /// Get parameters for this estimator. + /// + /// Returns + /// ------- + /// params : mapping of string to any + /// Parameter names mapped to their values. + fn get_params(&self) -> PyResult { + Ok(self.inner.params.clone()) + } + + pub fn __getstate__(&self, py: Python) -> PyResult { + serialize_params(&self.inner.params, py) + } + + pub fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { + let mut params: KSkipNGramsParams = deserialize_params(py, state)?; + self.inner = params.build(); + Ok(()) + } +} diff --git a/python/vtext/tests/test_token_processing.py b/python/vtext/tests/test_token_processing.py new file mode 100644 index 0000000..29381e4 --- /dev/null +++ b/python/vtext/tests/test_token_processing.py @@ -0,0 +1,40 @@ +# Copyright 2019 vtext developers +# +# Licensed under the Apache License, Version 2.0, +# . This file may not be copied, +# modified, or distributed except according to those terms. + +import pytest +import hypothesis +import hypothesis.strategies as st + +from vtext.token_processing import KSkipNGrams + + +def test_unicode_segment_tokenize(): + + gramizer = KSkipNGrams(min_n=2, max_n=2, max_k=0) + assert gramizer.transform(["One", "Two", "Three"]) == [ + ["One", "Two"], + ["Two", "Three"], + ] + + with pytest.raises(TypeError): + KSkipNGrams() + + # n == 0 + with pytest.raises(ValueError): + KSkipNGrams(min_n=0, max_n=0, max_k=0).transform(["One", "Two", "Three"]) + + # min_n > max_n + with pytest.raises(ValueError): + KSkipNGrams(min_n=1, max_n=0, max_k=0).transform(["One", "Two", "Three"]) + + # max_k < 0 + with pytest.raises(OverflowError): + KSkipNGrams(min_n=1, max_n=1, max_k=-1).transform(["One", "Two", "Three"]) + + +@hypothesis.given(st.text(min_size=2)) +def test_tokenize_edge_cases(txt): + KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(list(txt)) diff --git a/python/vtext/token_processing.py b/python/vtext/token_processing.py new file mode 100644 index 0000000..81f3e9e --- /dev/null +++ b/python/vtext/token_processing.py @@ -0,0 +1,9 @@ +# Copyright 2019 vtext developers +# +# Licensed under the Apache License, Version 2.0, +# . This file may not be copied, +# modified, or distributed except according to those terms. + +from ._lib import KSkipNGrams + +__all__ = ["KSkipNGrams"] diff --git a/src/token_processing/mod.rs b/src/token_processing/mod.rs index d4d4b41..68d0606 100644 --- a/src/token_processing/mod.rs +++ b/src/token_processing/mod.rs @@ -198,6 +198,12 @@ impl<'a> KSkipNGramsIter<'a> { pad_left: Option<&'a str>, pad_right: Option<&'a str>, ) -> Result, EstimatorErr> { + if min_n > max_n { + return Err(EstimatorErr::InvalidParams( + "`min_n` must be equal to or less than `max_n`".to_string(), + )); + } + let mut iter: Box> + 'a> = Box::new(iter::empty()); for n in min_n..max_n + 1 { From 5f6698e585162321bd0ebb11b6a95e64257730c8 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Fri, 17 Jul 2020 13:47:43 +0100 Subject: [PATCH 15/24] Fixed doc --- src/token_processing/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/token_processing/mod.rs b/src/token_processing/mod.rs index 68d0606..b511e77 100644 --- a/src/token_processing/mod.rs +++ b/src/token_processing/mod.rs @@ -601,8 +601,7 @@ impl Iterator for SampleCombinations { /// ``` /// use vtext::token_processing::*; /// let sent = "One Two Three".split(" "); -/// let grams_iter = SkipGramIter::new(Box::new(sent), 1, 2, Some(""), Some("")); -/// let grams: Vec> = grams_iter.unwrap().collect(); +/// let sent_padded: Vec<_> = pad_items(Box::new(sent), 3, Some(""), Some("")).unwrap().collect(); /// ``` /// /// Parameters: From b37e47db948116d2c152dfb62a3fdcb4fd0d8a9d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 21 Jul 2020 23:06:20 +0200 Subject: [PATCH 16/24] Update python/vtext/tests/test_token_processing.py --- python/vtext/tests/test_token_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/vtext/tests/test_token_processing.py b/python/vtext/tests/test_token_processing.py index 29381e4..5f4e033 100644 --- a/python/vtext/tests/test_token_processing.py +++ b/python/vtext/tests/test_token_processing.py @@ -35,6 +35,6 @@ def test_unicode_segment_tokenize(): KSkipNGrams(min_n=1, max_n=1, max_k=-1).transform(["One", "Two", "Three"]) -@hypothesis.given(st.text(min_size=2)) +@hypothesis.given(st.lists(st.text(min_size=0)) def test_tokenize_edge_cases(txt): KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(list(txt)) From 3bea0279cd4a2f5db46ffa5eb6f48b09ec9ef6fd Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 22 Jul 2020 00:55:26 +0200 Subject: [PATCH 17/24] Fix syntax error --- python/vtext/tests/test_token_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/vtext/tests/test_token_processing.py b/python/vtext/tests/test_token_processing.py index 5f4e033..b2eaece 100644 --- a/python/vtext/tests/test_token_processing.py +++ b/python/vtext/tests/test_token_processing.py @@ -35,6 +35,6 @@ def test_unicode_segment_tokenize(): KSkipNGrams(min_n=1, max_n=1, max_k=-1).transform(["One", "Two", "Three"]) -@hypothesis.given(st.lists(st.text(min_size=0)) +@hypothesis.given(st.lists(st.text(min_size=0))) def test_tokenize_edge_cases(txt): KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(list(txt)) From e382b97f37f4e4a006708a6c655da0bbf9af0a27 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 22 Jul 2020 01:00:15 +0200 Subject: [PATCH 18/24] Minor fixes in the hypothesis test --- python/vtext/tests/test_token_processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/vtext/tests/test_token_processing.py b/python/vtext/tests/test_token_processing.py index b2eaece..98ac057 100644 --- a/python/vtext/tests/test_token_processing.py +++ b/python/vtext/tests/test_token_processing.py @@ -35,6 +35,6 @@ def test_unicode_segment_tokenize(): KSkipNGrams(min_n=1, max_n=1, max_k=-1).transform(["One", "Two", "Three"]) -@hypothesis.given(st.lists(st.text(min_size=0))) +@hypothesis.given(st.lists(st.text(min_size=2))) def test_tokenize_edge_cases(txt): - KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(list(txt)) + KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(txt) From dc80e5d0837ed17434a26435e86f6a47786fc09c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 22 Jul 2020 01:06:42 +0200 Subject: [PATCH 19/24] Increase min_size for tokens in python tests --- python/vtext/tests/test_token_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/vtext/tests/test_token_processing.py b/python/vtext/tests/test_token_processing.py index 98ac057..4dbedc6 100644 --- a/python/vtext/tests/test_token_processing.py +++ b/python/vtext/tests/test_token_processing.py @@ -35,6 +35,6 @@ def test_unicode_segment_tokenize(): KSkipNGrams(min_n=1, max_n=1, max_k=-1).transform(["One", "Two", "Three"]) -@hypothesis.given(st.lists(st.text(min_size=2))) +@hypothesis.given(st.lists(st.text(), min_size=2)) def test_tokenize_edge_cases(txt): KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(txt) From d58ac99980e0304d6ba8fbfdc99649bc4e2c8bbc Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Wed, 22 Jul 2020 17:19:16 +0100 Subject: [PATCH 20/24] `cargo clippy` suggestions --- src/token_processing/mod.rs | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/token_processing/mod.rs b/src/token_processing/mod.rs index b511e77..17a5f9f 100644 --- a/src/token_processing/mod.rs +++ b/src/token_processing/mod.rs @@ -336,7 +336,7 @@ impl<'a> Iterator for NGramIter<'a> { self.window.pop_front(); self.window.push_back(next_item); - return Some(Vec::from(self.window.clone())); + Some(Vec::from(self.window.clone())) } } @@ -443,12 +443,12 @@ impl<'a> Iterator for SkipGramIter<'a> { fn next(&mut self) -> Option { let next_sample = self.sample_iter.next(); - return match next_sample { + match next_sample { // Generate and return samples using self.sample_iter Some(sample_idx) => { let mut sample = Vec::with_capacity(sample_idx.len()); - for idx in sample_idx.iter() { - sample.push(self.window[*idx].clone()); + for idx in sample_idx.into_iter() { + sample.push(self.window[idx]); } Some(sample) } @@ -487,7 +487,7 @@ impl<'a> Iterator for SkipGramIter<'a> { } } } - }; + } } } @@ -534,12 +534,7 @@ impl SampleCombinations { /// * `max_i` - the maximum index for the output elements /// * `n` - number of items per combination fn new(fix_0: bool, max_i: usize, n: usize) -> Result { - let min_i; - if fix_0 { - min_i = 1; - } else { - min_i = 0; - } + let min_i = if fix_0 { 1 } else { 0 }; if max_i + 1 < n { return Err(EstimatorErr::InvalidParams( @@ -549,10 +544,7 @@ impl SampleCombinations { let position: Vec = (0..n).collect(); - let mut last = false; - if n == max_i + 1 { - last = true; - } + let last = n == max_i + 1; Ok(SampleCombinations { min_i, From 7b7ef183fdf6a1df38eb5af6a673a891707e1059 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Wed, 22 Jul 2020 18:18:16 +0100 Subject: [PATCH 21/24] Empty or input that is smaller than n gives empty output --- src/token_processing/mod.rs | 122 ++++++++++++++++++---------------- src/token_processing/tests.rs | 56 ++++++++++++++++ 2 files changed, 120 insertions(+), 58 deletions(-) diff --git a/src/token_processing/mod.rs b/src/token_processing/mod.rs index 17a5f9f..dc2e0eb 100644 --- a/src/token_processing/mod.rs +++ b/src/token_processing/mod.rs @@ -246,6 +246,7 @@ pub struct NGramIter<'a> { /// Window which holds items that have been consumed window: VecDeque<&'a str>, first: bool, + last: bool } /// Core method to build `NGramIter` @@ -281,7 +282,9 @@ impl<'a> NGramIter<'a> { items = pad_items(items, n, pad_left, pad_right)?; } - let window = Self::build_window(&mut items, n)?; + // Build window + let window = build_window(&mut items, n); + let last = window.len() < n; // if not full window then will always return None Ok(NGramIter { // Params @@ -290,35 +293,9 @@ impl<'a> NGramIter<'a> { // Iterator state window, first: true, + last }) } - - /// Prepare and populate start window - fn build_window( - items: &mut Box + 'a>, - n: usize, - ) -> Result, EstimatorErr> { - let window_size = n; - let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); - - // Populate window - let mut i = window_size; - while i > 0 { - let next_item = items.next(); - match next_item { - None => { - return Err(EstimatorErr::InvalidInput( - "Items length is smaller than `n`".to_string(), - )) - } - Some(s) => { - window.push_back(s); - } - } - i -= 1; - } - Ok(window) - } } /// Iterator functions @@ -326,6 +303,9 @@ impl<'a> Iterator for NGramIter<'a> { type Item = Vec<&'a str>; fn next(&mut self) -> Option { + if self.last { + return None; + } if self.first { self.first = false; return Some(Vec::from(self.window.clone())); @@ -355,6 +335,7 @@ pub struct SkipGramIter<'a> { /// Window which holds items that have been consumed window: VecDeque<&'a str>, sample_iter: SampleCombinations, + last: bool } /// Core methods to build `SkipGramIter` @@ -392,8 +373,22 @@ impl<'a> SkipGramIter<'a> { items = pad_items(items, n, pad_left, pad_right)?; } - let window = Self::build_window(&mut items, n, max_k)?; - let sample_iter = SampleCombinations::new(true, n + max_k - 1, n)?; + let window_size = n + max_k; + let window = build_window(&mut items, window_size); + + let sample_iter; + let last; + if window.len() >= n { + let k = min(max_k, window.len() - n); + sample_iter = + SampleCombinations::new(true, n + k - 1, n)?; + last = false; + } else { + // Window too small. Always return None + sample_iter = SampleCombinations::new_empty(); + last = true; + } + Ok(SkipGramIter { // Params @@ -404,36 +399,9 @@ impl<'a> SkipGramIter<'a> { // Iterator state window, sample_iter, + last }) } - - // Prepare and populate start window - fn build_window( - items: &mut Box + 'a>, - n: usize, - max_k: usize, - ) -> Result, EstimatorErr> { - let window_size = n + max_k; - let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); - - // Populate window - let mut i = window_size; - while i > 0 { - let next_item = items.next(); - match next_item { - None => { - return Err(EstimatorErr::InvalidInput( - "Items length is smaller than `n`+`max_k`".to_string(), - )) - } - Some(s) => { - window.push_back(s); - } - } - i -= 1; - } - Ok(window) - } } /// Iterator functions @@ -441,6 +409,9 @@ impl<'a> Iterator for SkipGramIter<'a> { type Item = Vec<&'a str>; fn next(&mut self) -> Option { + if self.last { + return None + } let next_sample = self.sample_iter.next(); match next_sample { @@ -555,6 +526,17 @@ impl SampleCombinations { last, }) } + + fn new_empty() -> SampleCombinations { + SampleCombinations{ + min_i: 0, + max_i: 0, + n: 0, + position: Vec::new(), + first: false, + last: true, + } + } } impl Iterator for SampleCombinations { @@ -638,3 +620,27 @@ pub fn pad_items<'a>( Ok(all_chained) } + +/// Build and populate window for start of NGramIter or SkipGrmIter +fn build_window<'a>( + items: &mut Box + 'a>, + window_size: usize, +) -> VecDeque<&'a str> { + let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); + + // Populate window + let mut i = window_size; + while i > 0 { + let next_item = items.next(); + match next_item { + None => { + break + } + Some(s) => { + window.push_back(s); + } + } + i -= 1; + } + window +} \ No newline at end of file diff --git a/src/token_processing/tests.rs b/src/token_processing/tests.rs index 80379f7..02058a7 100644 --- a/src/token_processing/tests.rs +++ b/src/token_processing/tests.rs @@ -271,6 +271,52 @@ fn test_skipgram_everygram() { #[test] fn test_ngram_edge_cases() { + // Input length less than n + let sent = vec!["a", "b"].into_iter(); // Empty + let gramizer = NGramIter::new(Box::new(sent), 3, None, None).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); + + // Empty input + let sent = Vec::<&str>::new().into_iter(); // Empty + let gramizer = NGramIter::new(Box::new(sent), 1, Some(""), Some("")).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); +} + +#[test] +fn test_skipgram_edge_cases() { + // Input length less than n + k but greater or equal to n + let sent = vec!["a", "b"].into_iter(); // Empty + let gramizer = SkipGramIter::new(Box::new(sent), 2, 1, None, None).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected = vec![vec!["a", "b"]]; + assert_eq!(grarms, expected); + + // Input length less than n + k + let sent = vec!["a"].into_iter(); // Empty + let gramizer = SkipGramIter::new(Box::new(sent), 2, 1, None, None).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); + + // Empty input + let sent = Vec::<&str>::new().into_iter(); // Empty + let gramizer = NGramIter::new(Box::new(sent), 1, Some(""), Some("")).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); +} + +#[test] +fn test_kskipngram_edge_cases() { let sent = "Mary had a little lamb".split(" "); let gramizer = KSkipNGrams::new(1, 1, 0); @@ -296,6 +342,16 @@ fn test_ngram_edge_cases() { .collect(); assert_eq!(grarms, expected); + + let sent = Vec::<&str>::new().into_iter(); // Empty + let gramizer = KSkipNGrams::new(1, 2, 1); + let grarms: Vec> = gramizer + .transform(Box::new(sent), None, None) + .unwrap() + .collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); } #[test] From 6f1cadf14d62a6862fc853873d27de8218447af1 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Wed, 22 Jul 2020 20:10:28 +0100 Subject: [PATCH 22/24] ngram and skipgram benchmark: ``` # Testing 19924 documents vtext: everygram: 133.95s [0.7 MB/s, 54 kWPS] nltk: everygram: 5.06s [18.0 MB/s, 1419 kWPS] vtext: skipgram: 498.58s [0.2 MB/s, 14 kWPS] nltk: skipgram: 18.01s [5.1 MB/s, 399 kWPS] ``` --- benchmarks/bench_ngram.py | 75 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 benchmarks/bench_ngram.py diff --git a/benchmarks/bench_ngram.py b/benchmarks/bench_ngram.py new file mode 100644 index 0000000..bae92db --- /dev/null +++ b/benchmarks/bench_ngram.py @@ -0,0 +1,75 @@ +from time import time +from glob import glob +from pathlib import Path + +from vtext.tokenize import RegexpTokenizer +from vtext.token_processing import KSkipNGrams + +try: + import nltk + from nltk.util import everygrams as nltk_everygrams, skipgrams as nltk_skipgrams +except ImportError: + nltk = None + + +base_dir = Path(__file__).parent.parent.resolve() + +LIMIT = None + +if __name__ == "__main__": + input_files = list(glob(str(base_dir / "data" / "*" / "*"))) + if LIMIT is not None: + input_files = input_files[:LIMIT] + data = [] + for file_path in input_files: + with open(file_path, "rt") as fh: + data.append(fh.read()) + assert len(data) > 0 + + token_regexp = r"\b\w\w+\b" + + dataset_size = 91 # MB for 20 newsgroup dataset + + print("# Testing {} documents".format(len(data))) + + db = [ + ( + r"vtext: everygram", + KSkipNGrams(min_n=1, max_n=3, max_k=0).transform, + ), + ( + "nltk: everygram", + lambda seq: nltk_everygrams(seq, 1, 3), + ), + ( + r"vtext: skipgram", + KSkipNGrams(min_n=3, max_n=3, max_k=2).transform, + ), + ( + "nltk: skipgram", + lambda seq: nltk_skipgrams(seq, n=3, k=2), + ), + ] + + tokenizer = RegexpTokenizer(pattern=token_regexp) + + # Tokenize + doc_tokens = [tokenizer.tokenize(doc) for doc in data] + + for label, func in db: + t0 = time() + + out = [] + for idx, doc in enumerate(doc_tokens): + out.append(func(doc)) + + dt = time() - t0 + + # number of input tokens + n_tokens = sum(len(tok) for tok in doc_tokens) + + print( + "{:>45}: {:.2f}s [{:.1f} MB/s, {:.0f} kWPS]".format( + label, dt, dataset_size / dt, n_tokens * 1e-3 / dt + ) + ) From 083b4ec1df94e2060596d33e7612921152fca358 Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Wed, 22 Jul 2020 20:12:03 +0100 Subject: [PATCH 23/24] cargo fmt --- src/token_processing/mod.rs | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/token_processing/mod.rs b/src/token_processing/mod.rs index dc2e0eb..195d3c9 100644 --- a/src/token_processing/mod.rs +++ b/src/token_processing/mod.rs @@ -246,7 +246,7 @@ pub struct NGramIter<'a> { /// Window which holds items that have been consumed window: VecDeque<&'a str>, first: bool, - last: bool + last: bool, } /// Core method to build `NGramIter` @@ -293,7 +293,7 @@ impl<'a> NGramIter<'a> { // Iterator state window, first: true, - last + last, }) } } @@ -304,7 +304,7 @@ impl<'a> Iterator for NGramIter<'a> { fn next(&mut self) -> Option { if self.last { - return None; + return None; } if self.first { self.first = false; @@ -335,7 +335,7 @@ pub struct SkipGramIter<'a> { /// Window which holds items that have been consumed window: VecDeque<&'a str>, sample_iter: SampleCombinations, - last: bool + last: bool, } /// Core methods to build `SkipGramIter` @@ -380,8 +380,7 @@ impl<'a> SkipGramIter<'a> { let last; if window.len() >= n { let k = min(max_k, window.len() - n); - sample_iter = - SampleCombinations::new(true, n + k - 1, n)?; + sample_iter = SampleCombinations::new(true, n + k - 1, n)?; last = false; } else { // Window too small. Always return None @@ -389,7 +388,6 @@ impl<'a> SkipGramIter<'a> { last = true; } - Ok(SkipGramIter { // Params items, @@ -399,7 +397,7 @@ impl<'a> SkipGramIter<'a> { // Iterator state window, sample_iter, - last + last, }) } } @@ -410,7 +408,7 @@ impl<'a> Iterator for SkipGramIter<'a> { fn next(&mut self) -> Option { if self.last { - return None + return None; } let next_sample = self.sample_iter.next(); @@ -528,7 +526,7 @@ impl SampleCombinations { } fn new_empty() -> SampleCombinations { - SampleCombinations{ + SampleCombinations { min_i: 0, max_i: 0, n: 0, @@ -633,9 +631,7 @@ fn build_window<'a>( while i > 0 { let next_item = items.next(); match next_item { - None => { - break - } + None => break, Some(s) => { window.push_back(s); } @@ -643,4 +639,4 @@ fn build_window<'a>( i -= 1; } window -} \ No newline at end of file +} From 6345cf3edab281c8131059e3818af109a59cd13c Mon Sep 17 00:00:00 2001 From: Josh Levy-Kramer Date: Wed, 22 Jul 2020 20:19:04 +0100 Subject: [PATCH 24/24] `black python/ benchmarks/ evaluation/` --- benchmarks/bench_ngram.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/benchmarks/bench_ngram.py b/benchmarks/bench_ngram.py index bae92db..96e127c 100644 --- a/benchmarks/bench_ngram.py +++ b/benchmarks/bench_ngram.py @@ -33,22 +33,10 @@ print("# Testing {} documents".format(len(data))) db = [ - ( - r"vtext: everygram", - KSkipNGrams(min_n=1, max_n=3, max_k=0).transform, - ), - ( - "nltk: everygram", - lambda seq: nltk_everygrams(seq, 1, 3), - ), - ( - r"vtext: skipgram", - KSkipNGrams(min_n=3, max_n=3, max_k=2).transform, - ), - ( - "nltk: skipgram", - lambda seq: nltk_skipgrams(seq, n=3, k=2), - ), + (r"vtext: everygram", KSkipNGrams(min_n=1, max_n=3, max_k=0).transform,), + ("nltk: everygram", lambda seq: nltk_everygrams(seq, 1, 3),), + (r"vtext: skipgram", KSkipNGrams(min_n=3, max_n=3, max_k=2).transform,), + ("nltk: skipgram", lambda seq: nltk_skipgrams(seq, n=3, k=2),), ] tokenizer = RegexpTokenizer(pattern=token_regexp)