diff --git a/benchmarks/bench_ngram.py b/benchmarks/bench_ngram.py new file mode 100644 index 0000000..96e127c --- /dev/null +++ b/benchmarks/bench_ngram.py @@ -0,0 +1,63 @@ +from time import time +from glob import glob +from pathlib import Path + +from vtext.tokenize import RegexpTokenizer +from vtext.token_processing import KSkipNGrams + +try: + import nltk + from nltk.util import everygrams as nltk_everygrams, skipgrams as nltk_skipgrams +except ImportError: + nltk = None + + +base_dir = Path(__file__).parent.parent.resolve() + +LIMIT = None + +if __name__ == "__main__": + input_files = list(glob(str(base_dir / "data" / "*" / "*"))) + if LIMIT is not None: + input_files = input_files[:LIMIT] + data = [] + for file_path in input_files: + with open(file_path, "rt") as fh: + data.append(fh.read()) + assert len(data) > 0 + + token_regexp = r"\b\w\w+\b" + + dataset_size = 91 # MB for 20 newsgroup dataset + + print("# Testing {} documents".format(len(data))) + + db = [ + (r"vtext: everygram", KSkipNGrams(min_n=1, max_n=3, max_k=0).transform,), + ("nltk: everygram", lambda seq: nltk_everygrams(seq, 1, 3),), + (r"vtext: skipgram", KSkipNGrams(min_n=3, max_n=3, max_k=2).transform,), + ("nltk: skipgram", lambda seq: nltk_skipgrams(seq, n=3, k=2),), + ] + + tokenizer = RegexpTokenizer(pattern=token_regexp) + + # Tokenize + doc_tokens = [tokenizer.tokenize(doc) for doc in data] + + for label, func in db: + t0 = time() + + out = [] + for idx, doc in enumerate(doc_tokens): + out.append(func(doc)) + + dt = time() - t0 + + # number of input tokens + n_tokens = sum(len(tok) for tok in doc_tokens) + + print( + "{:>45}: {:.2f}s [{:.1f} MB/s, {:.0f} kWPS]".format( + label, dt, dataset_size / dt, n_tokens * 1e-3 / dt + ) + ) diff --git a/python/src/lib.rs b/python/src/lib.rs index 071f28a..5d1525d 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -8,6 +8,7 @@ use pyo3::prelude::*; use pyo3::wrap_pyfunction; mod stem; +mod token_processing; mod tokenize; mod 
tokenize_sentence; mod utils; @@ -183,6 +184,7 @@ fn _lib(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_wrapped(wrap_pyfunction!(dice_similarity))?; m.add_wrapped(wrap_pyfunction!(jaro_similarity))?; m.add_wrapped(wrap_pyfunction!(jaro_winkler_similarity))?; diff --git a/python/src/token_processing.rs b/python/src/token_processing.rs new file mode 100644 index 0000000..13a8eef --- /dev/null +++ b/python/src/token_processing.rs @@ -0,0 +1,96 @@ +// Copyright 2019 vtext developers +// +// Licensed under the Apache License, Version 2.0, +// . This file may not be copied, +// modified, or distributed except according to those terms. + +use pyo3::prelude::*; +use pyo3::types::{PyIterator, PyList, PyString}; +use pyo3::PyIterProtocol; + +use crate::utils::{deserialize_params, serialize_params}; +use vtext::token_processing::*; + +/// __init__(self, min_n: int, max_n: int, max_k: int) +/// +/// K-Skip-N-Grams generator +/// +/// Provided with a list of tokens it generates k-skip-n-grams. +/// +/// Parameters +/// ---------- +/// min_n : int +/// The minimum degree of the ngram +/// max_n : int +/// The maximum degree of the ngram +/// max_k : int +/// The maximum-degree of the skipgram: the total max skip between items +#[pyclass(module = "vtext.token_processing")] +pub struct KSkipNGrams { + inner: vtext::token_processing::KSkipNGrams, +} + +#[pymethods] +impl KSkipNGrams { + #[new] + fn new(min_n: usize, max_n: usize, max_k: usize) -> PyResult { + let kskipngrams = vtext::token_processing::KSkipNGrams::new(min_n, max_n, max_k); + Ok(KSkipNGrams { inner: kskipngrams }) + } + + /// transform(self, items: List[str], + /// pad_left: Optional[str]=None, pad_right: Optional[str]=None) -> List[List[str]] + /// + /// Transforms a given sequence of `items` into k-skip-n-grams. + /// + /// Parameters + /// ---------- + /// items : List[str] + /// The list of items to create the k-skip-n-grams of. 
+ /// pad_left : Optional[str] + /// Optional string to use as left padding + /// pad_right : Optional[str] + /// Optional string to use as right padding + /// + /// Returns + /// ------- + /// k-skip-n-grams : List[List[str]] + /// computed k-skip-n-grams + #[args(pad_left = "None", pad_right = "None")] + fn transform<'py>( + &self, + py: Python<'py>, + items: Vec<&str>, + pad_left: Option<&str>, + pad_right: Option<&str>, + ) -> PyResult<&'py PyList> { + let res: Vec<_> = self + .inner + .transform(Box::new(items.into_iter()), pad_left, pad_right)? + .collect(); + let output = PyList::new(py, res); + Ok(output) + } + + /// get_params(self, x) + /// + /// Get parameters for this estimator. + /// + /// Returns + /// ------- + /// params : mapping of string to any + /// Parameter names mapped to their values. + fn get_params(&self) -> PyResult { + Ok(self.inner.params.clone()) + } + + pub fn __getstate__(&self, py: Python) -> PyResult { + serialize_params(&self.inner.params, py) + } + + pub fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { + let mut params: KSkipNGramsParams = deserialize_params(py, state)?; + self.inner = params.build(); + Ok(()) + } +} diff --git a/python/vtext/tests/test_token_processing.py b/python/vtext/tests/test_token_processing.py new file mode 100644 index 0000000..4dbedc6 --- /dev/null +++ b/python/vtext/tests/test_token_processing.py @@ -0,0 +1,40 @@ +# Copyright 2019 vtext developers +# +# Licensed under the Apache License, Version 2.0, +# . This file may not be copied, +# modified, or distributed except according to those terms. 
+ +import pytest +import hypothesis +import hypothesis.strategies as st + +from vtext.token_processing import KSkipNGrams + + +def test_unicode_segment_tokenize(): + + gramizer = KSkipNGrams(min_n=2, max_n=2, max_k=0) + assert gramizer.transform(["One", "Two", "Three"]) == [ + ["One", "Two"], + ["Two", "Three"], + ] + + with pytest.raises(TypeError): + KSkipNGrams() + + # n == 0 + with pytest.raises(ValueError): + KSkipNGrams(min_n=0, max_n=0, max_k=0).transform(["One", "Two", "Three"]) + + # min_n > max_n + with pytest.raises(ValueError): + KSkipNGrams(min_n=1, max_n=0, max_k=0).transform(["One", "Two", "Three"]) + + # max_k < 0 + with pytest.raises(OverflowError): + KSkipNGrams(min_n=1, max_n=1, max_k=-1).transform(["One", "Two", "Three"]) + + +@hypothesis.given(st.lists(st.text(), min_size=2)) +def test_tokenize_edge_cases(txt): + KSkipNGrams(min_n=1, max_n=1, max_k=1).transform(txt) diff --git a/python/vtext/token_processing.py b/python/vtext/token_processing.py new file mode 100644 index 0000000..81f3e9e --- /dev/null +++ b/python/vtext/token_processing.py @@ -0,0 +1,9 @@ +# Copyright 2019 vtext developers +# +# Licensed under the Apache License, Version 2.0, +# . This file may not be copied, +# modified, or distributed except according to those terms. 
+ +from ._lib import KSkipNGrams + +__all__ = ["KSkipNGrams"] diff --git a/src/errors.rs b/src/errors.rs index 1f52f36..9979799 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -5,13 +5,15 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum EstimatorErr { - #[error("Invalid paramer: `{0}`")] + #[error("Invalid params: `{0}`")] InvalidParams(String), #[error("Invalid regex parameter")] RegexErr { #[from] source: regex::Error, }, + #[error("Invalid Input: `{0}`")] + InvalidInput(String), } #[cfg(feature = "python")] diff --git a/src/lib.rs b/src/lib.rs index b1a68fc..d56f8fe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,6 +41,7 @@ assert_eq!(tokens, vec!["Flights", "ca", "n't", "depart", "after", "2:00", "pm", pub mod errors; mod math; pub mod metrics; +pub mod token_processing; pub mod tokenize; pub mod tokenize_sentence; pub mod vectorize; diff --git a/src/token_processing/mod.rs b/src/token_processing/mod.rs new file mode 100644 index 0000000..195d3c9 --- /dev/null +++ b/src/token_processing/mod.rs @@ -0,0 +1,642 @@ +#[cfg(test)] +mod tests; + +use std::cmp::min; +use std::collections::VecDeque; +use std::iter; + +use crate::errors::EstimatorErr; +#[cfg(feature = "python")] +use dict_derive::{FromPyObject, IntoPyObject}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))] +pub struct KSkipNGramsParams { + pub min_n: usize, + pub max_n: usize, + pub max_k: usize, +} + +impl KSkipNGramsParams { + pub fn new(min_n: usize, max_n: usize, max_k: usize) -> KSkipNGramsParams { + KSkipNGramsParams { + min_n, + max_n, + max_k, + } + } + + pub fn build(&mut self) -> KSkipNGrams { + KSkipNGrams { + params: self.clone(), + } + } +} + +/// Transforms a given sequence of `items` into k-skip-n-grams iterator. 
+/// +/// Use convenience methods for common use cases: `new_bigram`, `new_trigram`, `new_ngrams`, +/// `new_everygrams`, `new_skipgrams`. Otherwise build new using `new`. +pub struct KSkipNGrams { + pub params: KSkipNGramsParams, +} + +/// Core methods to build `KSkipNGrams` +impl KSkipNGrams { + /// Generate all bigrams from a sequence of `items`, an iterator. + /// + /// Example: + /// + /// ``` + /// use vtext::token_processing::*; + /// let sent = "One Two Three Four".split(" "); + /// let gramizer = KSkipNGrams::new_bigram(); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two"], vec!["Two", "Three"], vec!["Three", "Four"]]); + /// ``` + pub fn new_bigram() -> KSkipNGrams { + KSkipNGramsParams::new(2, 2, 0).build() + } + + /// Generate n-grams from a sequence of `items`, an iterator. + /// + /// Example: + /// ``` + /// use vtext::token_processing::*; + /// let sent = "One Two Three Four".split(" "); + /// let gramizer = KSkipNGrams::new_ngrams(3); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two", "Three"], vec!["Two", "Three", "Four"]]); + /// ``` + /// + /// Parameters: + /// * `n` - The degree of the ngrams + pub fn new_ngrams(n: usize) -> KSkipNGrams { + KSkipNGramsParams::new(n, n, 0).build() + } + + /// Generate all n-grams between `min_n` and `max_n` from a sequence of `items`, an iterator.
+ /// + /// Example: + /// ``` + /// use vtext::token_processing::*; + /// let sent = "One Two Three".split(" "); + /// let gramizer = KSkipNGrams::new_everygrams(1, 3); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![ + /// vec!["One"], vec!["Two"], vec!["Three"], vec!["One", "Two"], vec!["Two", "Three"], + /// vec!["One", "Two", "Three"]]); + /// ``` + /// + /// Parameters: + /// * `min_n` - The minimum degree of the ngram + /// * `max_n` - The maximum degree of the ngram + pub fn new_everygrams(min_n: usize, max_n: usize) -> KSkipNGrams { + KSkipNGramsParams::new(min_n, max_n, 0).build() + } + + /// Generate all skip-grams with a max total skip of `k` from a sequence of `items`, + /// an iterator. + /// + /// Example: + /// ``` + /// use vtext::token_processing::*; + /// let sent = "One Two Three Four Five".split(" "); + /// let gramizer = KSkipNGrams::new_skipgrams(3, 2); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two", "Three"], vec!["One", "Two", "Four"], + /// vec!["One", "Two", "Five"], vec!["One", "Three", "Four"], vec!["One", "Three", "Five"], + /// vec!["One", "Four", "Five"], vec!["Two", "Three", "Four"], vec!["Two", "Three", "Five"], + /// vec!["Two", "Four", "Five"], vec!["Three", "Four", "Five"]]); + /// ``` + /// + /// Parameters: + /// * `n` - The degree of the ngram + /// * `k` - The degree of the skipgram: the total max skip between items + pub fn new_skipgrams(n: usize, k: usize) -> KSkipNGrams { + KSkipNGramsParams::new(n, n, k).build() + } + + /// Generate all k-skip-n-grams from a sequence of `items`, an iterator.
+ /// + /// Example: + /// ``` + /// use vtext::token_processing::*; + /// let sent = "One Two Three Four".split(" "); + /// let gramizer = KSkipNGrams::new(2, 3, 1); + /// let grams: Vec<_> = gramizer.transform(Box::new(sent), None, None).unwrap().collect(); + /// assert_eq!(grams, vec![vec!["One", "Two"], vec!["One", "Three"], vec!["Two", "Three"], + /// vec!["Two", "Four"], vec!["Three", "Four"], vec!["One", "Two", "Three"], + /// vec!["One", "Two", "Four"], vec!["One", "Three", "Four"], vec!["Two", "Three", "Four"]]); + /// ``` + /// + /// Parameters: + /// * `min_n` - The minimum degree of the ngram + /// * `max_n` - The maximum degree of the ngram + /// * `max_k` - The maximum degree of the skipgram: the total max skip between items + pub fn new(min_n: usize, max_n: usize, max_k: usize) -> KSkipNGrams { + KSkipNGramsParams::new(min_n, max_n, max_k).build() + } + + /// Transform a sequence of `items`, an iterator to a `KSkipNGramsIter` iterator. + /// + /// Parameters: + /// * `items` - Input iterator + /// * `pad_left` - Optional string to use as left padding + /// * `pad_right` - Optional string to use as right padding + pub fn transform<'a>( + &'a self, + items: Box + 'a>, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result> + 'a>, EstimatorErr> { + let k_skip_n_grams_iter = KSkipNGramsIter::new( + items, + self.params.min_n, + self.params.max_n, + self.params.max_k, + pad_left, + pad_right, + )?; + Ok(Box::new(k_skip_n_grams_iter)) + } +} + +/// An iterator which provided with a sequence of `items` transforms into k-skip-n-grams. +/// +/// It also correctly generates left or right padding if specified. +pub struct KSkipNGramsIter<'a> { + iter: Box> + 'a>, +} + +/// Core methods to build `KSkipNGramsIter` +impl<'a> KSkipNGramsIter<'a> { + /// Build a new `KSkipNGramsIter`.
+ /// + /// Example: + /// ``` + /// use vtext::token_processing::*; + /// let sent = "One Two Three".split(" "); + /// let grams_iter = KSkipNGramsIter::new(Box::new(sent), 1, 2, 1, Some(""), Some("")); + /// let grams: Vec> = grams_iter.unwrap().collect(); + /// ``` + /// + /// Parameters: + /// * `items` - Input iterator + /// * `min_n` - The minimum degree of the ngram + /// * `max_n` - The maximum degree of the ngram + /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items + /// * `pad_left` - Optional string to use as left padding + /// * `pad_right` - Optional string to use as right padding + pub fn new( + mut items: Box + 'a>, + min_n: usize, + max_n: usize, + max_k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result, EstimatorErr> { + if min_n > max_n { + return Err(EstimatorErr::InvalidParams( + "`min_n` must be equal to or less than `max_n`".to_string(), + )); + } + + let mut iter: Box> + 'a> = Box::new(iter::empty()); + + for n in min_n..max_n + 1 { + let (iter_split_1, iter_split_0) = items.tee(); + items = Box::new(iter_split_0); + + if max_k == 0 { + let sub_iter = NGramIter::new(Box::new(iter_split_1), n, pad_left, pad_right)?; + iter = Box::new(iter.chain(sub_iter)); + } else { + let sub_iter = + SkipGramIter::new(Box::new(iter_split_1), n, max_k, pad_left, pad_right)?; + iter = Box::new(iter.chain(sub_iter)); + } + } + + Ok(KSkipNGramsIter { iter }) + } +} + +/// Iterator functions +impl<'a> Iterator for KSkipNGramsIter<'a> { + type Item = Vec<&'a str>; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +/// An iterator which provided with a sequence of `items` transforms into n-grams. +/// +/// The iterator consumes the input iterator only once and holds a window of items to generate the +/// n-grams. The window is stepped forward as it consumes the input. It also correctly generates +/// left or right padding if specified. 
+pub struct NGramIter<'a> { + // Params + items: Box + 'a>, + + // Iterator state + /// Window which holds items that have been consumed + window: VecDeque<&'a str>, + first: bool, + last: bool, +} + +/// Core method to build `NGramIter` +impl<'a> NGramIter<'a> { + /// Build a new `NGramIter`. + /// + /// Example: + /// ``` + /// use vtext::token_processing::*; + /// let sent = "One Two Three".split(" "); + /// let grams_iter = NGramIter::new(Box::new(sent), 1, Some(""), Some("")); + /// let grams: Vec> = grams_iter.unwrap().collect(); + /// ``` + /// + /// Parameters: + /// * `items` - Input iterator + /// * `n` - The degree of the ngrams + /// * `pad_left` - Optional string to use as left padding + /// * `pad_right` - Optional string to use as right padding + pub fn new( + mut items: Box + 'a>, + n: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result, EstimatorErr> { + if n < 1 { + return Err(EstimatorErr::InvalidParams( + "`min_n` must be greater than or equal to 1".to_string(), + )); + } + + if pad_left.is_some() || pad_right.is_some() { + items = pad_items(items, n, pad_left, pad_right)?; + } + + // Build window + let window = build_window(&mut items, n); + let last = window.len() < n; // if not full window then will always return None + + Ok(NGramIter { + // Params + items, + + // Iterator state + window, + first: true, + last, + }) + } +} + +/// Iterator functions +impl<'a> Iterator for NGramIter<'a> { + type Item = Vec<&'a str>; + + fn next(&mut self) -> Option { + if self.last { + return None; + } + if self.first { + self.first = false; + return Some(Vec::from(self.window.clone())); + } + + // Forward window or when self.items return None + let next_item = self.items.next()?; + self.window.pop_front(); + self.window.push_back(next_item); + + Some(Vec::from(self.window.clone())) + } +} + +/// An iterator which provided with a sequence of `items` transforms into k-skip-grams.
+/// +/// The iterator consumes the input iterator only once and holds a window of items to generate the +/// k-skip-grams. The window is stepped forward as it consumes the input. It also correctly +/// generates left or right padding if specified. +pub struct SkipGramIter<'a> { + // Params + items: Box + 'a>, + n: usize, + max_k: usize, + + // Iterator state + /// Window which holds items that have been consumed + window: VecDeque<&'a str>, + sample_iter: SampleCombinations, + last: bool, +} + +/// Core methods to build `SkipGramIter` +impl<'a> SkipGramIter<'a> { + /// Build a new `SkipGramIter`. + /// + /// Example: + /// ``` + /// use vtext::token_processing::*; + /// let sent = "One Two Three".split(" "); + /// let grams_iter = SkipGramIter::new(Box::new(sent), 1, 2, Some(""), Some("")); + /// let grams: Vec> = grams_iter.unwrap().collect(); + /// ``` + /// + /// Parameters: + /// * `items` - Input iterator + /// * `n` - The degree of the ngram + /// * `max_k` - The maximum-degree of the skipgram: the total max skip between items + /// * `pad_left` - Optional string to use as left padding + /// * `pad_right` - Optional string to use as right padding + pub fn new( + mut items: Box + 'a>, + n: usize, + max_k: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, + ) -> Result, EstimatorErr> { + if n < 1 { + return Err(EstimatorErr::InvalidParams( + "`min_n` must be greater than or equal to 1".to_string(), + )); + } + + if pad_left.is_some() || pad_right.is_some() { + items = pad_items(items, n, pad_left, pad_right)?; + } + + let window_size = n + max_k; + let window = build_window(&mut items, window_size); + + let sample_iter; + let last; + if window.len() >= n { + let k = min(max_k, window.len() - n); + sample_iter = SampleCombinations::new(true, n + k - 1, n)?; + last = false; + } else { + // Window too small. 
Always return None + sample_iter = SampleCombinations::new_empty(); + last = true; + } + + Ok(SkipGramIter { + // Params + items, + n, + max_k, + + // Iterator state + window, + sample_iter, + last, + }) + } +} + +/// Iterator functions +impl<'a> Iterator for SkipGramIter<'a> { + type Item = Vec<&'a str>; + + fn next(&mut self) -> Option { + if self.last { + return None; + } + let next_sample = self.sample_iter.next(); + + match next_sample { + // Generate and return samples using self.sample_iter + Some(sample_idx) => { + let mut sample = Vec::with_capacity(sample_idx.len()); + for idx in sample_idx.into_iter() { + sample.push(self.window[idx]); + } + Some(sample) + } + + // Sample_iter finished so attempt to forward window + None => { + // Try to forward window + let next_item = self.items.next(); + match next_item { + // Forward window + Some(item) => { + self.window.pop_front(); + self.window.push_back(item); + + self.sample_iter = + SampleCombinations::new(true, self.n + self.max_k - 1, self.n).unwrap(); + + self.next() + } + + // self.items finished. So reduce window size iteratively to n and then finish + None => { + if self.window.len() > self.n { + // reduce window + self.window.pop_front(); + } else { + // finished + return None; + } + + // Generate samples from smaller window + let k = min(self.max_k, self.window.len() - self.n); + self.sample_iter = + SampleCombinations::new(true, self.n + k - 1, self.n).unwrap(); + self.next() + } + } + } + } + } +} + +/// An iterator which generates the list of combinations of `n` items in a range upto `max_i`. +/// It is possible to fix the first item at index 0 (i.e. 
`fix_0` == true) +/// +/// Examples: +/// ```text +/// use vtext::token_processing::*; +/// let output: Vec<_> = SampleCombinations::new(false, 3, 3).unwrap().collect(); +/// let expected = vec![ +/// vec![0, 1, 2], +/// vec![0, 1, 3], +/// vec![0, 2, 3], +/// vec![1, 2, 3] +/// ]; +/// assert_eq!(output, expected); +/// +/// let output: Vec<_> = SampleCombinations::new(true, 3, 3).unwrap().collect(); +/// let expected = vec![ +/// vec![0, 1, 2], +/// vec![0, 1, 3], +/// vec![0, 2, 3] +/// ]; +/// assert_eq!(output, expected); +/// ``` +struct SampleCombinations { + // Params + min_i: usize, + max_i: usize, + n: usize, + + // State + position: Vec, + first: bool, + last: bool, +} + +impl SampleCombinations { + /// New `SampleCombinations` + /// + /// Parameters: + /// * `fix_0` - fix the first element at 0? + /// * `max_i` - the maximum index for the output elements + /// * `n` - number of items per combination + fn new(fix_0: bool, max_i: usize, n: usize) -> Result { + let min_i = if fix_0 { 1 } else { 0 }; + + if max_i + 1 < n { + return Err(EstimatorErr::InvalidParams( + "`max_i`+1 must be less than `n`".to_string(), + )); + } + + let position: Vec = (0..n).collect(); + + let last = n == max_i + 1; + + Ok(SampleCombinations { + min_i, + max_i, + n, + position, + first: true, + last, + }) + } + + fn new_empty() -> SampleCombinations { + SampleCombinations { + min_i: 0, + max_i: 0, + n: 0, + position: Vec::new(), + first: false, + last: true, + } + } +} + +impl Iterator for SampleCombinations { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.first { + self.first = false; + return Some(self.position.clone()); + } + if self.last { + return None; + } + + for i in (self.min_i..self.position.len()).rev() { + let e = self.position[i]; + if e < self.max_i - (self.n - i - 1) { + let mut e_1 = e; + for j in i..self.position.len() { + e_1 += 1; + self.position[j] = e_1; + } + if i == self.min_i && e + 1 == self.max_i { + self.last = true; + } + return 
Some(self.position.clone()); + } + } + None // Will never reach + } +} + +/// Pad an iterator left and/or right with tokens. +/// +/// Example: +/// ``` +/// use vtext::token_processing::*; +/// let sent = "One Two Three".split(" "); +/// let sent_padded: Vec<_> = pad_items(Box::new(sent), 3, Some(""), Some("")).unwrap().collect(); +/// ``` +/// +/// Parameters: +/// * `items` - Input iterator +/// * `n` - The degree of the ngram +/// * `pad_left` - Optional string to use as left padding +/// * `pad_right` - Optional string to use as right padding +pub fn pad_items<'a>( + items: Box + 'a>, + n: usize, + pad_left: Option<&'a str>, + pad_right: Option<&'a str>, +) -> Result + 'a>, EstimatorErr> { + if n < 1 { + return Err(EstimatorErr::InvalidParams( + "`n` must be greater than or equal to 1".to_string(), + )); + } + + let left_chained: Box>; + let all_chained: Box>; + + match pad_left { + Some(s) => { + let pad_left_iter = iter::repeat(s).take(n - 1); + left_chained = Box::new(pad_left_iter.chain(items)); + } + None => { + left_chained = items; + } + } + + match pad_right { + Some(s) => { + let pad_right_iter = iter::repeat(s).take(n - 1); + all_chained = Box::new(left_chained.chain(pad_right_iter)); + } + None => { + all_chained = left_chained; + } + } + + Ok(all_chained) +} + +/// Build and populate window for start of NGramIter or SkipGramIter +fn build_window<'a>( + items: &mut Box + 'a>, + window_size: usize, +) -> VecDeque<&'a str> { + let mut window: VecDeque<&'a str> = VecDeque::with_capacity(window_size); + + // Populate window + let mut i = window_size; + while i > 0 { + let next_item = items.next(); + match next_item { + None => break, + Some(s) => { + window.push_back(s); + } + } + i -= 1; + } + window +} diff --git a/src/token_processing/tests.rs b/src/token_processing/tests.rs new file mode 100644 index 0000000..02058a7 --- /dev/null +++ b/src/token_processing/tests.rs @@ -0,0 +1,413 @@ +use crate::token_processing::*; +use std::collections::HashSet;
+use std::iter::FromIterator; + +#[test] +fn test_bigram() { + let sent = "Mary had a little lamb".split(" "); + + let gramizer = KSkipNGrams::new_bigram(); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), None, None) + .unwrap() + .collect(); + + let expected = vec![ + vec!["Mary", "had"], + vec!["had", "a"], + vec!["a", "little"], + vec!["little", "lamb"], + ]; + + assert_eq!(grams, expected); +} + +#[test] +fn test_trigram() { + let sent = "Mary had a little lamb".split(" "); + + let gramizer = KSkipNGrams::new_ngrams(3); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + + let expected = vec![ + vec!["", "", "Mary"], + vec!["", "Mary", "had"], + vec!["Mary", "had", "a"], + vec!["had", "a", "little"], + vec!["a", "little", "lamb"], + vec!["little", "lamb", ""], + vec!["lamb", "", ""], + ]; + + assert_eq!(grams, expected); + + let gramizer = KSkipNGrams::new_ngrams(3); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), None, Some("")) + .unwrap() + .collect(); + + let expected = vec![ + vec!["Mary", "had", "a"], + vec!["had", "a", "little"], + vec!["a", "little", "lamb"], + vec!["little", "lamb", ""], + vec!["lamb", "", ""], + ]; + + assert_eq!(grams, expected); +} + +#[test] +fn test_ngrams() { + let sent = "Mary had a little lamb".split(" "); + + let gramizer = KSkipNGrams::new_ngrams(4); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + + let expected = vec![ + vec!["", "", "", "Mary"], + vec!["", "", "Mary", "had"], + vec!["", "Mary", "had", "a"], + vec!["Mary", "had", "a", "little"], + vec!["had", "a", "little", "lamb"], + vec!["a", "little", "lamb", ""], + vec!["little", "lamb", "", ""], + vec!["lamb", "", "", ""], + ]; + + assert_eq!(grams, expected); +} + +#[test] +fn test_everygram() { + let sent = "Mary had a little lamb".split(" "); + + let gramizer = KSkipNGrams::new_everygrams(1, 3); + let 
grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + + let expected = vec![ + vec!["Mary"], + vec!["had"], + vec!["a"], + vec!["little"], + vec!["lamb"], + vec!["", "Mary"], + vec!["Mary", "had"], + vec!["had", "a"], + vec!["a", "little"], + vec!["little", "lamb"], + vec!["lamb", ""], + vec!["", "", "Mary"], + vec!["", "Mary", "had"], + vec!["Mary", "had", "a"], + vec!["had", "a", "little"], + vec!["a", "little", "lamb"], + vec!["little", "lamb", ""], + vec!["lamb", "", ""], + ]; + + assert_eq!(grams, expected); +} + +#[test] +fn test_skipgram() { + let sent = "Mary had a little lamb".split(" "); + + let gramizer = KSkipNGrams::new_skipgrams(2, 1); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + + let expected = vec![ + vec!["", "Mary"], + vec!["", "had"], + vec!["Mary", "had"], + vec!["Mary", "a"], + vec!["had", "a"], + vec!["had", "little"], + vec!["a", "little"], + vec!["a", "lamb"], + vec!["little", "lamb"], + vec!["little", ""], + vec!["lamb", ""], + ]; + + assert_eq!(grams, expected); + + let gramizer = KSkipNGrams::new_skipgrams(3, 1); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + + let expected = vec![ + vec!["", "", "Mary"], + vec!["", "", "had"], + vec!["", "Mary", "had"], + vec!["", "Mary", "had"], + vec!["", "Mary", "a"], + vec!["", "had", "a"], + vec!["Mary", "had", "a"], + vec!["Mary", "had", "little"], + vec!["Mary", "a", "little"], + vec!["had", "a", "little"], + vec!["had", "a", "lamb"], + vec!["had", "little", "lamb"], + vec!["a", "little", "lamb"], + vec!["a", "little", ""], + vec!["a", "lamb", ""], + vec!["little", "lamb", ""], + vec!["little", "lamb", ""], + vec!["little", "", ""], + vec!["lamb", "", ""], + ]; + + assert_eq!(grams, expected); + + let sent = "Mary had a little lamb, whose fleece ...".split(" "); + + let gramizer = KSkipNGrams::new_skipgrams(3, 
2); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), None, None) + .unwrap() + .collect(); + + let expected = vec![ + vec!["Mary", "had", "a"], + vec!["Mary", "had", "little"], + vec!["Mary", "had", "lamb,"], + vec!["Mary", "a", "little"], + vec!["Mary", "a", "lamb,"], + vec!["Mary", "little", "lamb,"], + vec!["had", "a", "little"], + vec!["had", "a", "lamb,"], + vec!["had", "a", "whose"], + vec!["had", "little", "lamb,"], + vec!["had", "little", "whose"], + vec!["had", "lamb,", "whose"], + vec!["a", "little", "lamb,"], + vec!["a", "little", "whose"], + vec!["a", "little", "fleece"], + vec!["a", "lamb,", "whose"], + vec!["a", "lamb,", "fleece"], + vec!["a", "whose", "fleece"], + vec!["little", "lamb,", "whose"], + vec!["little", "lamb,", "fleece"], + vec!["little", "lamb,", "..."], + vec!["little", "whose", "fleece"], + vec!["little", "whose", "..."], + vec!["little", "fleece", "..."], + vec!["lamb,", "whose", "fleece"], + vec!["lamb,", "whose", "..."], + vec!["lamb,", "fleece", "..."], + vec!["whose", "fleece", "..."], + ]; + + assert_eq!(grams, expected); +} + +#[test] +fn test_skipgram_everygram() { + let sent = "Mary had a little lamb, whose fleece ...".split(" "); + + // min_n=2, max_n=4, max_k=3 + let gramizer = KSkipNGrams::new(2, 4, 3); + let output: Vec<_> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + let output_set: HashSet> = HashSet::from_iter(output.iter().cloned()); + + // Equivalent to union of three skip-gram outputs n=2,3,4 (k=3) but with different ordering + let gramizer_sg_2 = KSkipNGrams::new_skipgrams(2, 3); + let output_sg_2: Vec<_> = gramizer_sg_2 + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + let output_sg_2_set: HashSet> = HashSet::from_iter(output_sg_2.iter().cloned()); + + let gramizer_sg_3 = KSkipNGrams::new_skipgrams(3, 3); + let output_sg_3: Vec<_> = gramizer_sg_3 + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() 
+ .collect(); + let output_sg_3_set: HashSet> = HashSet::from_iter(output_sg_3.iter().cloned()); + + let gramizer_sg_4 = KSkipNGrams::new_skipgrams(4, 3); + let output_sg_4: Vec<_> = gramizer_sg_4 + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + let output_sg_4_set: HashSet> = HashSet::from_iter(output_sg_4.iter().cloned()); + + let expected_set: HashSet<_> = output_sg_2_set + .union(&output_sg_3_set) + .map(move |x| x.clone()) + .collect::>() + .union(&output_sg_4_set) + .map(move |x| x.clone()) + .collect(); + + // Same output - different order + assert_eq!(output_set, expected_set); + + // No duplicates from either output expected + assert_eq!( + output.len(), + output_sg_2.len() + output_sg_3.len() + output_sg_4.len() + ); +} + +#[test] +fn test_ngram_edge_cases() { + // Input length less than n + let sent = vec!["a", "b"].into_iter(); // Empty + let gramizer = NGramIter::new(Box::new(sent), 3, None, None).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); + + // Empty input + let sent = Vec::<&str>::new().into_iter(); // Empty + let gramizer = NGramIter::new(Box::new(sent), 1, Some(""), Some("")).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); +} + +#[test] +fn test_skipgram_edge_cases() { + // Input length less than n + k but greater or equal to n + let sent = vec!["a", "b"].into_iter(); // Empty + let gramizer = SkipGramIter::new(Box::new(sent), 2, 1, None, None).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected = vec![vec!["a", "b"]]; + assert_eq!(grarms, expected); + + // Input length less than n + k + let sent = vec!["a"].into_iter(); // Empty + let gramizer = SkipGramIter::new(Box::new(sent), 2, 1, None, None).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); + + // 
Empty input + let sent = Vec::<&str>::new().into_iter(); // Empty + let gramizer = NGramIter::new(Box::new(sent), 1, Some(""), Some("")).unwrap(); + let grarms: Vec> = gramizer.collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); +} + +#[test] +fn test_kskipngram_edge_cases() { + let sent = "Mary had a little lamb".split(" "); + + let gramizer = KSkipNGrams::new(1, 1, 0); + let grams: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + + let expected = vec![ + vec!["Mary"], + vec!["had"], + vec!["a"], + vec!["little"], + vec!["lamb"], + ]; + + assert_eq!(grams, expected); + + let gramizer = KSkipNGrams::new(1, 1, 1); + let grarms: Vec> = gramizer + .transform(Box::new(sent.clone()), Some(""), Some("")) + .unwrap() + .collect(); + + assert_eq!(grarms, expected); + + let sent = Vec::<&str>::new().into_iter(); // Empty + let gramizer = KSkipNGrams::new(1, 2, 1); + let grarms: Vec> = gramizer + .transform(Box::new(sent), None, None) + .unwrap() + .collect(); + + let expected: Vec> = Vec::new(); // Empty + assert_eq!(grarms, expected); +} + +#[test] +fn test_sample_combinations() { + let output: Vec> = SampleCombinations::new(false, 3, 3).unwrap().collect(); + + let expected = vec![vec![0, 1, 2], vec![0, 1, 3], vec![0, 2, 3], vec![1, 2, 3]]; + assert_eq!(output, expected); + + let output: Vec> = SampleCombinations::new(true, 3, 3).unwrap().collect(); + let expected = vec![vec![0, 1, 2], vec![0, 1, 3], vec![0, 2, 3]]; + assert_eq!(output, expected); + + let output: Vec> = SampleCombinations::new(true, 4, 3).unwrap().collect(); + let expected = vec![ + vec![0, 1, 2], + vec![0, 1, 3], + vec![0, 1, 4], + vec![0, 2, 3], + vec![0, 2, 4], + vec![0, 3, 4], + ]; + assert_eq!(output, expected); + + // Single output + let output: Vec> = SampleCombinations::new(false, 1, 2).unwrap().collect(); + let expected = vec![vec![0, 1]]; + assert_eq!(output, expected); + + let output: Vec> = 
SampleCombinations::new(true, 1, 2).unwrap().collect(); + let expected = vec![vec![0, 1]]; + assert_eq!(output, expected); + + let output: Vec> = SampleCombinations::new(true, 2, 3).unwrap().collect(); + let expected = vec![vec![0, 1, 2]]; + assert_eq!(output, expected); + + let output: Vec> = SampleCombinations::new(false, 0, 1).unwrap().collect(); + let expected = vec![vec![0]]; + assert_eq!(output, expected); + + let output: Vec> = SampleCombinations::new(true, 0, 1).unwrap().collect(); + let expected = vec![vec![0]]; + assert_eq!(output, expected); +} + +#[test] +fn test_padding() { + let iter = "Mary had a little lamb".split(" "); + + let output_iter = pad_items(Box::new(iter), 3, Some(""), Some("")).unwrap(); + let output: Vec<&str> = output_iter.collect(); + + let expected = vec![ + "", "", "Mary", "had", "a", "little", "lamb", "", "", + ]; + + assert_eq!(output, expected); +}