diff --git a/Cargo.toml b/Cargo.toml index 4a3f2b2..f7dbc24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ libc = "0.2.81" derive_builder = "0.5.1" serde_json = "1.0.59" polars = {version = "0.16.0", optional = true} +csv = "1.2.1" [features] diff --git a/src/booster.rs b/src/booster.rs deleted file mode 100644 index 0830628..0000000 --- a/src/booster.rs +++ /dev/null @@ -1,673 +0,0 @@ -use std; -use std::convert::TryInto; -use std::ffi::CString; - -use libc::{c_char, c_double, c_longlong, c_void}; -use lightgbm_sys; -use serde_json::Value; - -use crate::{Dataset, Error, Result}; - -/// Core model in LightGBM, containing functions for training, evaluating and predicting. -pub struct Booster { - handle: lightgbm_sys::BoosterHandle, -} - -/// Represents the score during training on either the train or validation set -#[derive(Debug, PartialEq)] -pub struct EvalResult { - pub metric: String, - pub score: f64, -} - -impl Booster { - fn new(handle: lightgbm_sys::BoosterHandle) -> Self { - Booster { handle } - } - - /// Init from model file. - pub fn from_file(filename: &str) -> Result { - let filename_str = CString::new(filename).unwrap(); - let mut out_num_iterations = 0; - let mut handle = std::ptr::null_mut(); - lgbm_call!(lightgbm_sys::LGBM_BoosterCreateFromModelfile( - filename_str.as_ptr() as *const c_char, - &mut out_num_iterations, - &mut handle - ))?; - - Ok(Booster::new(handle)) - } - - /// Init from model string. - pub fn from_string(model_description: &str) -> Result { - let cstring = CString::new(model_description).unwrap(); - let mut out_num_iterations = 0; - let mut handle = std::ptr::null_mut(); - lgbm_call!(lightgbm_sys::LGBM_BoosterLoadModelFromString( - cstring.as_ptr() as *const c_char, - &mut out_num_iterations, - &mut handle - ))?; - - Ok(Booster::new(handle)) - } - - /// Create a new Booster model with given Dataset and parameters. - /// - /// Example - /// ``` - /// extern crate serde_json; - /// use lightgbm::{Dataset, Booster}; - /// use serde_json::json; - /// - /// let data = vec![vec![1.0, 0.1, 0.2, 0.1], - /// vec![0.7, 0.4, 0.5, 0.1], - /// vec![0.9, 0.8, 0.5, 0.1], - /// vec![0.2, 0.2, 0.8, 0.7], - /// vec![0.1, 0.7, 1.0, 0.9]]; - /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - /// let dataset = Dataset::from_mat(data, label).unwrap(); - /// let params = json!{ - /// { - /// "num_iterations": 3, - /// "objective": "binary", - /// "metric": "auc" - /// } - /// }; - /// let bst = Booster::train(dataset, None, ¶ms).unwrap(); - /// ``` - /// Validation data can be provided aswell. - /// ``` - /// extern crate serde_json; - /// use lightgbm::{Dataset, Booster}; - /// use serde_json::json; - /// - /// let data = vec![vec![1.0, 0.1, 0.2, 0.1], - /// vec![0.7, 0.4, 0.5, 0.1], - /// vec![0.9, 0.8, 0.5, 0.1], - /// vec![0.2, 0.2, 0.8, 0.7], - /// vec![0.1, 0.7, 1.0, 0.9]]; - /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - /// let train_data = Dataset::from_mat(data, label).unwrap(); - /// - /// let data = vec![ - /// vec![0.9, 0.6, 0.2, 0.1], - /// vec![0.5, 0.7, 0.2, 0.1], - /// vec![0.2, 0.1, 0.6, 0.8]]; - /// let label = vec![0.0, 0.0, 1.0]; - /// let val_data = Dataset::from_mat(data, label); - /// - /// let params = json!{ - /// { - /// "num_iterations": 3, - /// "objective": "binary", - /// "metric": "auc" - /// } - /// }; - /// - /// let bst = Booster::train(train_data, val_data.ok(), ¶ms).unwrap(); - /// ``` - pub fn train( - train_data: Dataset, - val_data: Option, - parameter: &Value, - ) -> Result { - // get num_iterations - let num_iterations: i64 = if parameter["num_iterations"].is_null() { - 100 - } else { - parameter["num_iterations"].as_i64().unwrap() - }; - - // exchange params {"x": "y", "z": 1} => "x=y z=1" - // and {"k" = ["a", "b"]} => "k=a,b" - let params_string = parameter - .as_object() - .unwrap() - .iter() - .map(|(k, v)| match v { - Value::Array(a) => { - let v_formatted = a.iter().map(|x| x.to_string() + ",").collect::(); - let v_formatted = v_formatted - .replace("\",\"", ",") - .trim_end_matches(',') - .to_string(); - (k, v_formatted) - } - _ => (k, v.to_string()), - }) - .map(|(k, v)| format!("{}={}", k, v)) - .collect::>() - .join(" "); - let params_cstring = CString::new(params_string).unwrap(); - - let mut handle = std::ptr::null_mut(); - lgbm_call!(lightgbm_sys::LGBM_BoosterCreate( - train_data.handle, - params_cstring.as_ptr() as *const c_char, - &mut handle - ))?; - - // the following has to borrow val_data to avoid dropping the dataset - if let Some(validation_data) = &val_data { - lgbm_call!(lightgbm_sys::LGBM_BoosterAddValidData( - handle, - validation_data.handle - ))?; - } - - let mut is_finished: i32 = 0; - for _ in 0..num_iterations { - lgbm_call!(lightgbm_sys::LGBM_BoosterUpdateOneIter( - handle, - &mut is_finished - ))?; - } - Ok(Booster::new(handle)) - } - - /// Predict results for given data. - /// - /// Input data example - /// ``` - /// let data = vec![vec![1.0, 0.1, 0.2], - /// vec![0.7, 0.4, 0.5], - /// vec![0.1, 0.7, 1.0]]; - /// ``` - /// - /// Output data example - /// ``` - /// let output = vec![vec![1.0, 0.109, 0.433]]; - /// ``` - pub fn predict(&self, data: Vec>) -> Result>> { - let data_length = data.len(); - let feature_length = data[0].len(); - let params = CString::new("").unwrap(); - let mut out_length: c_longlong = 0; - let flat_data = data.into_iter().flatten().collect::>(); - - // get num_class - let mut num_class = 0; - lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumClasses( - self.handle, - &mut num_class - ))?; - - let out_result: Vec = vec![Default::default(); data_length * num_class as usize]; - - lgbm_call!(lightgbm_sys::LGBM_BoosterPredictForMat( - self.handle, - flat_data.as_ptr() as *const c_void, - lightgbm_sys::C_API_DTYPE_FLOAT64 as i32, - data_length as i32, - feature_length as i32, - 1_i32, - 0_i32, - 0_i32, - -1_i32, - params.as_ptr() as *const c_char, - &mut out_length, - out_result.as_ptr() as *mut c_double - ))?; - - // reshape for multiclass [1,2,3,4,5,6] -> [[1,2,3], [4,5,6]] # 3 class - let reshaped_output = if num_class > 1 { - out_result - .chunks(num_class as usize) - .map(|x| x.to_vec()) - .collect() - } else { - vec![out_result] - }; - Ok(reshaped_output) - } - - /// Get Feature Num. - pub fn num_feature(&self) -> Result { - let mut out_len = 0; - lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumFeature( - self.handle, - &mut out_len - ))?; - Ok(out_len) - } - - /// Get Feature Names. - pub fn feature_name(&self) -> Result> { - let num_feature = self.num_feature()?; - let feature_name_length = 32; - let mut num_feature_names = 0; - let mut out_buffer_len = 0; - let out_strs = (0..num_feature) - .map(|_| { - CString::new(" ".repeat(feature_name_length)) - .unwrap() - .into_raw() as *mut c_char - }) - .collect::>(); - lgbm_call!(lightgbm_sys::LGBM_BoosterGetFeatureNames( - self.handle, - num_feature, - &mut num_feature_names, - feature_name_length as u64, - &mut out_buffer_len, - out_strs.as_ptr() as *mut *mut c_char - ))?; - let output: Vec = out_strs - .into_iter() - .map(|s| unsafe { CString::from_raw(s).into_string().unwrap() }) - .collect(); - Ok(output) - } - - /// Get number of evaluation metrics - pub fn num_eval(&self) -> Result { - let mut out_len = 0; - lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalCounts( - self.handle, - &mut out_len - ))?; - Ok(out_len) - } - - /// Get names of evaluation metrics - pub fn eval_names(&self) -> Result> { - let num_metrics = self.num_eval()?; - - ///////////////////////////////////////////////////////////////////// - // call with 0-sized buffer to find out how much space to allocate - ///////////////////////////////////////////////////////////////////// - let mut num_eval_names = 0; - let mut out_buffer_len = 0; - - lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalNames( - self.handle, - 0, - &mut num_eval_names, - 0, - &mut out_buffer_len, - std::ptr::null_mut() as *mut *mut c_char - )) - .unwrap(); - - ///////////////////////////////////////////////////////////////////// - // sanity check - ///////////////////////////////////////////////////////////////////// - if num_eval_names != num_metrics { - return Err(Error::new(format!( - "expected num_eval_names==num_metrics, but got {num_eval_names}!={num_metrics}. This is a bug in lightgbm or its rust wrapper" - ))); - } - - ///////////////////////////////////////////////////////////////////// - // get the actual strings - ///////////////////////////////////////////////////////////////////// - - let mut out_strs = (0..num_metrics) - .map(|_| (0..out_buffer_len).map(|_| 0).collect::>()) - .collect::>(); - - let mut out_strs_pointers = out_strs - .iter_mut() - .map(|s| s.as_mut_ptr()) - .collect::>(); - - let metric_name_length = out_buffer_len; - - lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalNames( - self.handle, - num_metrics, - &mut num_eval_names, - metric_name_length, - &mut out_buffer_len, - out_strs_pointers.as_mut_ptr() as *mut *mut c_char - )) - .unwrap(); - - drop(out_strs_pointers); // don't let pointers outlive their target - - let mut output = Vec::with_capacity(out_strs.len()); - for mut out_str in out_strs { - let first_null = out_str - .iter() - .enumerate() - .find(|(_, e)| **e == 0) - .map(|(i, _)| i) - .expect("string not null terminated, possible memory corruption"); - out_str.truncate(first_null + 1); - - let string = CString::from_vec_with_nul(out_str) - .expect("string memory invariant violated, possible memory corruption") - .into_string() - .map_err(|_| Error::new("name not valid UTF-8"))?; - output.push(string); - } - - Ok(output) - } - - pub fn get_eval(&self, data_index: i32) -> Result> { - let names = self.eval_names()?; - let mut out_len = 0; - let out_result: Vec = vec![Default::default(); names.len()]; - lgbm_call!(lightgbm_sys::LGBM_BoosterGetEval( - self.handle, - data_index, - &mut out_len, - out_result.as_ptr() as *mut c_double - ))?; - Ok(names - .into_iter() - .zip(out_result) - .map(|(metric, score)| EvalResult { metric, score }) - .collect()) - } - - // Get Feature Importance - pub fn feature_importance(&self) -> Result> { - let num_feature = self.num_feature()?; - let out_result: Vec = vec![Default::default(); num_feature as usize]; - lgbm_call!(lightgbm_sys::LGBM_BoosterFeatureImportance( - self.handle, - 0_i32, - 0_i32, - out_result.as_ptr() as *mut c_double - ))?; - Ok(out_result) - } - - /// Save model to file. - pub fn save_file(&self, filename: &str) -> Result<()> { - let filename_str = CString::new(filename).unwrap(); - lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModel( - self.handle, - 0_i32, - -1_i32, - 0_i32, - filename_str.as_ptr() as *const c_char - ))?; - Ok(()) - } - - /// Returns the size the model would have if saved using `save_file`, without having to write the file - pub fn save_file_size(&self) -> Result { - let mut out_size = 0_i64; - lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModelToString( - self.handle, - 0_i32, - -1_i32, - 0_i32, - 0, - &mut out_size as *mut _, - std::ptr::null_mut() as *mut i8 - ))?; - // subtract 1 because the file doesn't contain the final null character - (out_size - 1) - .try_into() - .map_err(|_| Error::new("size negative")) - } - - /// Save model to string. This returns the same content that `save_file` writes into a file. - pub fn save_string(&self) -> Result { - // get nessesary buffer size - - let mut out_size = 0_i64; - lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModelToString( - self.handle, - 0_i32, - -1_i32, - 0_i32, - 0, - &mut out_size as *mut _, - std::ptr::null_mut() as *mut i8 - ))?; - - // write data to buffer and convert - let mut buffer = vec![ - 0u8; - out_size - .try_into() - .map_err(|_| Error::new("size negative"))? - ]; - lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModelToString( - self.handle, - 0_i32, - -1_i32, - 0_i32, - buffer.len() as c_longlong, - &mut out_size as *mut _, - buffer.as_mut_ptr() as *mut c_char - ))?; - - if buffer.pop() != Some(0) { - // this should never happen, unless lightgbm has a bug - panic!("write out of bounds happened in lightgbm call"); - } - - let cstring = CString::new(buffer).map_err(|e| Error::new(e.to_string()))?; - cstring - .into_string() - .map_err(|_| Error::new("can't convert model string to unicode")) - } -} - -impl Drop for Booster { - fn drop(&mut self) { - lgbm_call!(lightgbm_sys::LGBM_BoosterFree(self.handle)).unwrap(); - } -} - -#[cfg(test)] -mod tests { - use std::fs; - use std::path::Path; - - use serde_json::json; - - use super::*; - - fn _read_train_file() -> Result { - Dataset::from_file( - &"lightgbm-sys/lightgbm/examples/binary_classification/binary.train", - None, - ) - } - - fn _train_booster(params: &Value) -> Booster { - let dataset = _read_train_file().unwrap(); - Booster::train(dataset, None, ¶ms).unwrap() - } - - fn _default_params() -> Value { - let params = json! { - { - "num_iterations": 1, - "objective": "binary", - "metric": "auc", - "data_random_seed": 0 - } - }; - params - } - - #[test] - fn predict() { - let params = json! { - { - "num_iterations": 10, - "objective": "binary", - "metric": "auc", - "data_random_seed": 0 - } - }; - let bst = _train_booster(¶ms); - let feature = vec![vec![0.5; 28], vec![0.0; 28], vec![0.9; 28]]; - let result = bst.predict(feature).unwrap(); - let mut normalized_result = Vec::new(); - for r in &result[0] { - normalized_result.push(if r > &0.5 { 1 } else { 0 }); - } - assert_eq!(normalized_result, vec![0, 0, 1]); - } - - #[test] - fn num_feature() { - let params = _default_params(); - let bst = _train_booster(¶ms); - let num_feature = bst.num_feature().unwrap(); - assert_eq!(num_feature, 28); - } - - #[test] - fn eval_names() { - let params = json! { - { - "num_iterations": 1, - "objective": "binary", - "metrics": ["auc", "l1"], - "data_random_seed": 0 - } - }; - let bst = _train_booster(¶ms); - let eval_names = bst.eval_names().unwrap(); - assert_eq!(eval_names, vec!["auc", "l1"]) - } - - #[test] - fn get_eval_sample_dataset() { - let params = json! { - { - "num_iterations": 30, - "objective": "binary", - "boosting_type": "gbdt", - "metrics": ["binary_logloss","auc"], - "label_column": 0, - "max_bin": 255, - "tree_learner": "serial", - "feature_fraction": 0.8, - "is_enable_sparse": true, - "data_random_seed": 0 - } - }; - let train = _read_train_file().unwrap(); - let val = Dataset::from_file( - &"lightgbm-sys/lightgbm/examples/binary_classification/binary.test", - Some(train.handle), - ) - .unwrap(); - - let bst = Booster::train(train, Some(val), ¶ms).unwrap(); - - let eval_train = bst.get_eval(0); - let eval_val = bst.get_eval(1); - assert!(eval_val.is_ok()); - assert!(eval_train.is_ok()); - let eval_invalid = bst.get_eval(420); - assert!(eval_invalid.is_err()); - } - - #[test] - fn get_eval() { - let data = vec![ - vec![1.0, 0.1, 0.2, 0.1], - vec![0.7, 0.4, 0.5, 0.1], - vec![0.9, 0.8, 0.5, 0.1], - vec![0.2, 0.2, 0.8, 0.7], - vec![0.1, 0.7, 1.0, 0.9], - ]; - let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - let train_data = Dataset::from_mat(data, label).unwrap(); - - let data = vec![ - vec![0.9, 0.6, 0.2, 0.1], - vec![0.5, 0.7, 0.2, 0.1], - vec![0.2, 0.1, 0.6, 0.8], - ]; - let label = vec![0.0, 0.0, 1.0]; - let val_data = Dataset::from_mat(data, label); - - let params = json! { - { - "num_iterations": 3, - "objective": "binary", - "metric": ["auc","l1"] - } - }; - - let bst = Booster::train(train_data, val_data.ok(), ¶ms).unwrap(); - - let train_res = bst.get_eval(0).unwrap(); - let val_res = bst.get_eval(1).unwrap(); - let invalid_res = bst.get_eval(420); - assert!(invalid_res.is_err()); - assert_eq!(train_res[0].metric, "auc"); - assert_eq!(val_res[1].metric, "l1"); - assert!(0.0 <= train_res[0].score && train_res[0].score <= 1.0); // make shure values make sense - assert!(0.0 <= train_res[1].score); - } - - #[test] - fn feature_importance() { - let mut params = _default_params(); - params["num_iterations"] = "0".parse().unwrap(); - let bst = _train_booster(¶ms); - let feature_importance = bst.feature_importance().unwrap(); - assert_eq!(feature_importance, vec![0.0; 28]); - } - - #[test] - fn feature_name() { - let params = _default_params(); - let bst = _train_booster(¶ms); - let feature_name = bst.feature_name().unwrap(); - let target = (0..28).map(|i| format!("Column_{}", i)).collect::>(); - assert_eq!(feature_name, target); - } - - #[test] - fn save_file() { - let params = _default_params(); - let bst = _train_booster(¶ms); - assert_eq!(bst.save_file(&"./test/test_save_file.output"), Ok(())); - assert!(Path::new("./test/test_save_file.output").exists()); - let _ = fs::remove_file("./test/test_save_file.output"); - } - - #[test] - fn save_file_size() { - let params = _default_params(); - let bst = _train_booster(¶ms); - let filename = "./test/test_save_file_size.output"; - assert_eq!(bst.save_file(filename), Ok(())); - let file_size = Path::new(filename).metadata().unwrap().len(); - assert!(file_size > 0); - assert_eq!(bst.save_file_size(), Ok(file_size)); - let _ = fs::remove_file(filename); - } - - #[test] - fn save_string() { - let params = _default_params(); - let bst = _train_booster(¶ms); - let filename = "./test/test_save_string.output"; - assert_eq!(bst.save_file(&filename), Ok(())); - assert!(Path::new(&filename).exists()); - let booster_file_content = fs::read_to_string(&filename).unwrap(); - let _ = fs::remove_file("./test/test_save_file.output"); - - assert!(!booster_file_content.is_empty()); - assert_eq!(Ok(booster_file_content), bst.save_string()) - } - - #[test] - fn from_file() { - let _ = Booster::from_file(&"./test/test_from_file.input"); - } - - #[test] - fn from_string() { - let model_string = fs::read_to_string("./test/test_from_file.input").unwrap(); - Booster::from_string(&model_string).unwrap(); - } -} diff --git a/src/booster/builder.rs b/src/booster/builder.rs new file mode 100644 index 0000000..46bd3f3 --- /dev/null +++ b/src/booster/builder.rs @@ -0,0 +1,502 @@ +use serde_json::Value; + +use booster::Booster; +use dataset::DataSet; +use Matrixf64; +use {booster, LgbmError}; + +///////////////////////////////////////////// +// types for training set +#[derive(Clone)] +pub struct TrainDataAdded(DataSet); // this should not implement default, so it can safely be used for construction +#[derive(Default, Clone)] +pub struct TrainDataMissing; +///////////////////////////////////////////// + +///////////////////////////////////////////// +// types for params +#[derive(Clone)] +pub struct ParamsAdded(String, i32); // this should not implement default, so it can safely be used for construction +#[derive(Default, Clone)] +pub struct ParamsMissing; +///////////////////////////////////////////// + +/// Builder for the Booster. +/// +/// Uses TypeState Pattern to make sure that Training Data is added +/// so that Validation can be synced properly and params are present for training. +#[derive(Default, Clone)] +pub struct BoosterBuilder { + train_data: T, + val_data: Vec, + params: P, +} + +/// These Methods are always available to the Booster. +impl BoosterBuilder { + /// Returns the Builder and a clone from it. Useful if you want to train 2 models with + /// only a couple differences. This should be called at the end of the adapter chain, + /// where u defined all things that are equal in the models. + /// U can then continue to build the models separately. + /// + /// ``` + /// use lightgbm::booster::Booster; + /// use lightgbm::dataset::DataSet; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// let params_a = serde_json::json! { + /// { + /// "num_iterations": 5, + /// "objective": "binary", + /// "metric": "auc", + /// "data_random_seed": 0 + /// } + /// }; + /// let params_b = serde_json::json! { + /// { + /// "num_iterations": 100, + /// "objective": "binary", + /// "metric": "acc", + /// "data_random_seed": 42 + /// } + /// }; + /// # let x = vec![ + /// # vec![1.0, 0.1, 0.2, 0.1], + /// # vec![0.7, 0.4, 0.5, 0.1], + /// # vec![0.9, 0.8, 0.5, 0.1], + /// # vec![0.2, 0.2, 0.8, 0.7], + /// # vec![0.1, 0.7, 1.0, 0.9]]; + /// # let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// # let train_data = DataSet::from_mat(x,y); + /// let (booster_low_it, booster_high_it) = Booster::builder() + /// .add_train_data(train_data) + /// .duplicate(); + /// let booster_low_it = booster_low_it + /// .add_params(params_a)? + /// .fit()?; + /// let booster_high_it = booster_high_it + /// .add_params(params_b)? + /// .fit()?; + /// # Ok(())} + /// ``` + pub fn duplicate(self) -> (Self, Self) { + (self.clone(), self) + } +} + +/// Methods in this block require, that no params are added to the Booster. +impl BoosterBuilder { + /// Adds params to the Booster. + /// Returns Error, if param parsing returns Error. + /// + /// ``` + /// use lightgbm::booster::Booster; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// let params = serde_json::json! { + /// { + /// "num_iterations": 5, + /// "objective": "binary", + /// "metric": "auc", + /// "data_random_seed": 0 + /// } + /// }; + /// let booster_builder = Booster::builder().add_params(params)?; + /// # Ok(())} + /// ``` + pub fn add_params(self, params: Value) -> Result, LgbmError> { + let num_iterations = params + .get("num_iterations") + .ok_or(LgbmError::new("Num iterations in params missing."))? + .as_i64() + .ok_or(LgbmError::new("Invalid Value for num iterations."))?; + let parsed_params = parse_params(params)?; + Ok(BoosterBuilder { + params: ParamsAdded(parsed_params, num_iterations as i32), + train_data: self.train_data, + val_data: self.val_data, + }) + } +} + +/// Methods in this Block require, that there is no train data added to the Booster. +impl BoosterBuilder { + /// Adds training data. necessary for validation data (so bins can be synced) + /// and for model fitting. + /// ``` + /// use lightgbm::booster::Booster; + /// use lightgbm::dataset::DataSet; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// let params = serde_json::json! { + /// { + /// "num_iterations": 5, + /// "objective": "binary", + /// "metric": "auc", + /// "data_random_seed": 0 + /// } + /// }; + /// let x = vec![ + /// vec![1.0, 0.1, 0.2, 0.1], + /// vec![0.7, 0.4, 0.5, 0.1], + /// vec![0.9, 0.8, 0.5, 0.1], + /// vec![0.2, 0.2, 0.8, 0.7], + /// vec![0.1, 0.7, 1.0, 0.9]]; + /// let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// let train_data = DataSet::from_mat(x,y); + /// let booster = Booster::builder() + /// .add_train_data(train_data) + /// .add_params(params)? + /// .fit()?; + /// + /// # Ok(())} + /// ``` + pub fn add_train_data(self, train: DataSet) -> BoosterBuilder { + BoosterBuilder { + train_data: TrainDataAdded(train), + val_data: self.val_data, + params: self.params, + } + } +} + +/// Methods in this impl Block require, that training data is already added. +impl BoosterBuilder { + /// Adds validation data to the Booster. + /// ``` + /// use lightgbm::booster::Booster; + /// use lightgbm::dataset::DataSet; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// let params = serde_json::json! { + /// { + /// "num_iterations": 5, + /// "objective": "binary", + /// "metric": "auc", + /// "data_random_seed": 0 + /// } + /// }; + /// let x = vec![ + /// vec![1.0, 0.1, 0.2, 0.1], + /// vec![0.7, 0.4, 0.5, 0.1], + /// vec![0.9, 0.8, 0.5, 0.1], + /// vec![0.2, 0.2, 0.8, 0.7], + /// vec![0.1, 0.7, 1.0, 0.9]]; + /// let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// let train_data = DataSet::from_mat(x,y); + /// let x = vec![ + /// vec![8.0, 0.2, 0.4, 0.5], + /// vec![0.9, 0.4, 0.3, 0.5], + /// vec![0.5, 0.6, 0.3, 0.8], + /// vec![0.244, 0.25, 0.9, 0.9], + /// vec![0.4, 0.8, 0.8, 0.7], + /// ]; + /// let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// let validation_data = DataSet::from_mat(x,y); + /// let booster = Booster::builder() + /// .add_train_data(train_data) // add training data first + /// .add_val_data(validation_data) // then validation data + /// .add_params(params)? + /// .fit()?; + /// + /// # Ok(())} + /// ``` + pub fn add_val_data(mut self, val: DataSet) -> Self { + self.val_data.push(val); + self + } +} + +/// Methods in this impl block are only available, after Training Data and Params are added. +impl BoosterBuilder { + /// Builds the booster by: + /// 1. Adding the training data + /// 2. Adding the validation data + /// 3. Training with the params + /// + /// Each of these steps can fail and return errors. + /// + /// ``` + /// use lightgbm::booster::Booster; + /// use lightgbm::dataset::DataSet; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// let params = serde_json::json! { + /// { + /// "num_iterations": 5, + /// "objective": "binary", + /// "metric": "auc", + /// "data_random_seed": 0 + /// } + /// }; + /// let x = vec![ + /// vec![1.0, 0.1, 0.2, 0.1], + /// vec![0.7, 0.4, 0.5, 0.1], + /// vec![0.9, 0.8, 0.5, 0.1], + /// vec![0.2, 0.2, 0.8, 0.7], + /// vec![0.1, 0.7, 1.0, 0.9]]; + /// let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// let train_data = DataSet::from_mat(x,y); + /// let x = vec![ + /// vec![8.0, 0.2, 0.4, 0.5], + /// vec![0.9, 0.4, 0.3, 0.5], + /// vec![0.5, 0.6, 0.3, 0.8], + /// vec![0.244, 0.25, 0.9, 0.9], + /// vec![0.4, 0.8, 0.8, 0.7], + /// ]; + /// let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// let validation_data = DataSet::from_mat(x,y); + /// let booster = Booster::builder() + /// .add_train_data(train_data) // this is necessary + /// .add_val_data(validation_data) // this is optional + /// .add_params(params)? // this is also necessary + /// .fit()?; + /// + /// # Ok(())} + /// ``` + pub fn fit(self) -> Result { + let train_data = self.train_data.0.load(None)?; + let booster_handle = booster::ffi::new_booster(train_data.handle, &self.params.0)?; + let mut validation_sets = Vec::with_capacity(self.val_data.len()); + for val in self.val_data.into_iter() { + let loaded_data = val.load(Some(train_data.handle))?; + booster::ffi::add_validation_data_to_booster(booster_handle, loaded_data.handle)?; + validation_sets.push(loaded_data); + } + let mut booster = Booster { + handle: booster_handle, + train_data: Some(train_data), + validation_data: validation_sets, + }; + booster.train_loop(self.params.1)?; // param parsing checked already if present + Ok(booster) + } + + /// Build the Booster with fit and immediately predict for the given input. + /// Can Fail in fit if the Booster isn't correctly build or in predict if the Input Data + /// is corrupted. + /// + /// ``` + /// use lightgbm::booster::Booster; + /// use lightgbm::dataset::DataSet; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// let params = serde_json::json! { + /// { + /// "num_iterations": 5, + /// "objective": "binary", + /// "metric": "auc", + /// "data_random_seed": 0 + /// } + /// }; + /// let x = vec![ + /// vec![1.0, 0.1, 0.2, 0.1], + /// vec![0.7, 0.4, 0.5, 0.1], + /// vec![0.9, 0.8, 0.5, 0.1], + /// vec![0.2, 0.2, 0.8, 0.7], + /// vec![0.1, 0.7, 1.0, 0.9]]; + /// let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// let train_data = DataSet::from_mat(x,y); + /// let input = vec![ + /// vec![8.0, 0.2, 0.4, 0.5], + /// vec![0.9, 0.4, 0.3, 0.5], + /// vec![0.5, 0.6, 0.3, 0.8], + /// vec![0.244, 0.25, 0.9, 0.9], + /// vec![0.4, 0.8, 0.8, 0.7], + /// ]; + /// let (booster, pred) = Booster::builder() + /// .add_train_data(train_data) + /// .add_params(params)? + /// .fit_predict(&input)?; + /// + /// assert_eq!(input.len(), pred[0].len()); // binary classification. One output value for each input vec + /// # Ok(())} + /// ``` + pub fn fit_predict(self, x: &Matrixf64) -> Result<(Booster, Matrixf64), LgbmError> { + let booster = self.fit()?; + let y = booster.predict(x)?; + Ok((booster, y)) + } +} + +/// Transforms a serde_json Value object into a String that Lightgbm Requires. Note that a conversion +/// to a CString is still required for the ffi. +/// The algorithms transforms data like this: +/// {"x": "y", "z": 1} => "x="y" z=1" +/// and +/// {"k" = ["a", "b"]} => "k="a,b"" +/// Returns Error if the Value object somehow doesn't represents valid json, or the num_iterations +/// param is not set. +fn parse_params(params: Value) -> Result { + if params.get("num_iterations").is_none() { + return Err(LgbmError::new("Num Iterations not specified.")); + } + + let s = params + .as_object() + .ok_or(LgbmError::new("Couldn't parse params"))? + .iter() + .map(|(k, v)| match v { + Value::Array(a) => { + let v_formatted = a.iter().map(|x| x.to_string() + ",").collect::(); + let v_formatted = v_formatted + .replace("\",\"", ",") + .trim_end_matches(',') + .to_string(); + (k, v_formatted) + } + _ => (k, v.to_string()), + }) + .map(|(k, v)| format!("{}={}", k, v)) + .collect::>() + .join(" "); + Ok(s) +} + +#[cfg(test)] +mod tests { + use booster::builder::parse_params; + use booster::Booster; + use dataset::DataSet; + use serde_json::json; + use {LabelVec, Matrixf64}; + + fn get_simple_params() -> serde_json::Value { + json! { + { + "num_iterations": 5, + "objective": "binary", + "metric": "auc", + "data_random_seed": 0 + } + } + } + + fn get_dummy_data_1() -> (Matrixf64, LabelVec) { + let data = vec![ + vec![1.0, 0.1, 0.2, 0.1], + vec![0.7, 0.4, 0.5, 0.1], + vec![0.9, 0.8, 0.5, 0.1], + vec![0.2, 0.2, 0.8, 0.7], + vec![0.1, 0.7, 1.0, 0.9], + ]; + let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + (data, label) + } + + fn get_dummy_data_2() -> (Matrixf64, LabelVec) { + let data = vec![ + vec![8.0, 0.2, 0.4, 0.5], + vec![0.9, 0.4, 0.3, 0.5], + vec![0.5, 0.6, 0.3, 0.8], + vec![0.244, 0.25, 0.9, 0.9], + vec![0.4, 0.8, 0.8, 0.7], + ]; + let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + (data, label) + } + + #[test] + fn simple_build_test() { + let (train_x, train_y) = get_dummy_data_1(); + let (val_x, val_y) = get_dummy_data_2(); + let params = get_simple_params(); + + let train_set = DataSet::from_mat(train_x, train_y); + let val_set = DataSet::from_mat(val_x, val_y); + + let booster = Booster::builder() + .add_train_data(train_set) + .add_val_data(val_set) + .add_params(params) + .unwrap() + .fit() + .unwrap(); + + let result_train = booster.get_eval_result_for_dataset(0).unwrap(); + let result_val = booster.get_eval_result_for_dataset(1).unwrap(); + assert!(booster.get_eval_result_for_dataset(2).is_err()); + + assert_eq!(result_train.len(), 1); + assert_eq!(result_val.len(), 1); + } + + #[test] + fn more_params() { + let params = json! { + { + "num_iterations": 30, + "objective": "binary", + "boosting_type": "gbdt", + "metrics": ["binary_logloss","auc"], + "label_column": 0, + "max_bin": 255, + "tree_learner": "serial", + "feature_fraction": 0.8, + "is_enable_sparse": true, + "data_random_seed": 0 + } + }; + let (train_x, train_y) = get_dummy_data_1(); + let (val_x, val_y) = get_dummy_data_2(); + + let train_set = DataSet::from_mat(train_x, train_y); + let val_set = DataSet::from_mat(val_x, val_y); + let val_set_2 = val_set.clone(); + + let booster = Booster::builder() + .add_train_data(train_set) + .add_val_data(val_set) + .add_val_data(val_set_2) + .add_params(params) + .unwrap() + .fit() + .unwrap(); + + let result_train = booster.get_eval_result_for_dataset(0).unwrap(); + let result_val_1 = booster.get_eval_result_for_dataset(1).unwrap(); + let result_val_2 = booster.get_eval_result_for_dataset(2).unwrap(); + assert!(booster.get_eval_result_for_dataset(3).is_err()); + + assert_eq!(result_train.len(), 2); + assert_eq!(result_val_1.len(), 2); + assert_eq!(result_val_2.len(), 2); + let delta = (result_val_1[0].score - result_val_2[0].score).abs(); // floating point error + assert!(-0.1 < delta && delta < 0.01); + } + + #[test] + fn params_test_valid() { + let params = json! { + { + "num_iterations": 30, + "objective": "binary", + "metrics": ["binary_logloss","auc"], + "is_enable_sparse": true + } + }; + let supposed_to_be = + "is_enable_sparse=true metrics=\"binary_logloss,auc\" num_iterations=30 objective=\"binary\""; + let parsed = parse_params(params).unwrap(); + + assert_eq!(&parsed, supposed_to_be); + } + + #[test] + fn params_num_it_missing() { + let params = json! { + { + "objective": "binary", + "metrics": ["binary_logloss","auc"], + "is_enable_sparse": true + } + }; + assert!(parse_params(params).is_err()); + } +} diff --git a/src/booster/ffi.rs b/src/booster/ffi.rs new file mode 100644 index 0000000..ebc5bf8 --- /dev/null +++ b/src/booster/ffi.rs @@ -0,0 +1,210 @@ +use std::ffi::CString; + +use libc::{c_char, c_double, c_longlong, c_void}; +use lightgbm_sys::{BoosterHandle, DatasetHandle}; + +use {lightgbm_sys, Matrixf64}; + +use crate::{LgbmError, Result}; + +pub(crate) fn new_booster(train_data: DatasetHandle, parsed_params: &str) -> Result { + let params_cstring = CString::new(parsed_params)?; + let mut handle = std::ptr::null_mut(); + lgbm_call!(lightgbm_sys::LGBM_BoosterCreate( + train_data, + params_cstring.as_ptr() as *const c_char, + &mut handle + ))?; + Ok(handle) +} +pub(crate) fn free_booster(handle: BoosterHandle) -> Result<()> { + lgbm_call!(lightgbm_sys::LGBM_BoosterFree(handle)) +} + +pub(crate) fn add_validation_data_to_booster( + booster: BoosterHandle, + validation_data_handle: DatasetHandle, +) -> Result<()> { + lgbm_call!(lightgbm_sys::LGBM_BoosterAddValidData( + booster, + validation_data_handle + )) +} + +#[inline] +pub(crate) fn train_one_step(booster: BoosterHandle, is_finished: &mut i32) -> Result<()> { + lgbm_call!(lightgbm_sys::LGBM_BoosterUpdateOneIter( + booster, + is_finished + )) +} + +pub(crate) fn get_num_classes(booster: BoosterHandle) -> Result { + let mut num_classes = -1; + lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumClasses( + booster, + &mut num_classes + ))?; + if num_classes > -1 { + Ok(num_classes) + } else { + Err(LgbmError::new( + "lgbm didn't update the number of classes correctly.", + )) + } +} + +pub(crate) fn predict( + booster: BoosterHandle, + prediction_params: &str, + data: &Matrixf64, +) -> Result { + let data_length = data.len(); + let feature_length = data[0].len(); + let params = CString::new(prediction_params)?; + let mut out_length: c_longlong = 0; + let flat_data = data.clone().into_iter().flatten().collect::>(); + let num_classes = get_num_classes(booster)?; + let out_result: Vec = vec![Default::default(); data_length * num_classes as usize]; + + lgbm_call!(lightgbm_sys::LGBM_BoosterPredictForMat( + booster, + flat_data.as_ptr() as *const c_void, + lightgbm_sys::C_API_DTYPE_FLOAT64 as i32, + data_length as i32, + feature_length as i32, + 1_i32, + 0_i32, + 0_i32, + -1_i32, + params.as_ptr() as *const c_char, + &mut out_length, + out_result.as_ptr() as *mut c_double + ))?; + + // reshape for multiclass [1,2,3,4,5,6] -> [[1,2,3], [4,5,6]] # 3 class + let reshaped_output = if num_classes > 1 { + out_result + .chunks(num_classes as usize) + .map(|x| x.to_vec()) + .collect() + } else { + vec![out_result] + }; + Ok(reshaped_output) +} + +/// Get number of evaluation metrics +pub(crate) fn num_eval(handle: BoosterHandle) -> Result { + let mut out_len = 0; + lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalCounts( + handle, + &mut out_len + ))?; + Ok(out_len) +} + +/// Get names of evaluation metrics +pub(crate) fn get_eval_names(handle: BoosterHandle) -> Result> { + let num_metrics = num_eval(handle)?; + + ///////////////////////////////////////////////////////////////////// + // call with 0-sized buffer to find out how much space to allocate + ///////////////////////////////////////////////////////////////////// + let mut num_eval_names = 0; + let mut out_buffer_len = 0; + + lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalNames( + handle, + 0, + &mut num_eval_names, + 0, + &mut out_buffer_len, + std::ptr::null_mut() as *mut *mut c_char + ))?; + + ///////////////////////////////////////////////////////////////////// + // sanity check + ///////////////////////////////////////////////////////////////////// + if num_eval_names != num_metrics { + return Err(LgbmError::new(format!( + "expected num_eval_names==num_metrics, but got {num_eval_names}!={num_metrics}. This is a bug in lightgbm or its rust wrapper" + ))); + } + + ///////////////////////////////////////////////////////////////////// + // get the actual strings + ///////////////////////////////////////////////////////////////////// + + let mut out_strs = (0..num_metrics) + .map(|_| (0..out_buffer_len).map(|_| 0).collect::>()) + .collect::>(); + + let mut out_strs_pointers = out_strs + .iter_mut() + .map(|s| s.as_mut_ptr()) + .collect::>(); + + let metric_name_length = out_buffer_len; + + lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalNames( + handle, + num_metrics, + &mut num_eval_names, + metric_name_length, + &mut out_buffer_len, + out_strs_pointers.as_mut_ptr() as *mut *mut c_char + ))?; + + drop(out_strs_pointers); // don't let pointers outlive their target + + let mut output = Vec::with_capacity(out_strs.len()); + for mut out_str in out_strs { + let first_null = out_str + .iter() + .enumerate() + .find(|(_, e)| **e == 0) + .map(|(i, _)| i) + .expect("string not null terminated, possible memory corruption"); + out_str.truncate(first_null + 1); + + let string = CString::from_vec_with_nul(out_str) + .expect("string memory invariant violated, possible memory corruption") + .into_string() + .map_err(|_| LgbmError::new("name not valid UTF-8"))?; + output.push(string); + } + + Ok(output) +} + +pub(crate) fn get_eval_scores( + handle: BoosterHandle, + data_index: i32, + num_metrics: usize, +) -> Result> { + let mut out_len = 0; + let out_result: Vec = vec![Default::default(); num_metrics]; + lgbm_call!(lightgbm_sys::LGBM_BoosterGetEval( + handle, + data_index, + &mut out_len, + out_result.as_ptr() as *mut c_double + ))?; + if out_len != out_result.len() as i32 { + Err(LgbmError::new( + "Output Array length doesn't match reported length.", + )) + } else { + Ok(out_result) + } +} + +pub(crate) fn num_feature(booster: BoosterHandle) -> Result { + let mut out_len = 0; + lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumFeature( + booster, + &mut out_len + ))?; + Ok(out_len) +} diff --git a/src/booster/mod.rs b/src/booster/mod.rs new file mode 100644 index 0000000..4c21172 --- /dev/null +++ b/src/booster/mod.rs @@ -0,0 +1,227 @@ +use booster::builder::{BoosterBuilder, ParamsMissing, TrainDataMissing}; +use dataset::LoadedDataSet; +use LgbmError; +use Matrixf64; + +pub mod builder; +mod ffi; + +/// Evaluation Result of a Booster on a given Dataset. +/// Returned by get_eval +pub struct EvalResult { + pub metric_name: String, + pub score: f64, +} + +/// Class that is returned by the builder, once fit() is called. +/// Used to interact with a trained booster. +pub struct Booster { + handle: lightgbm_sys::BoosterHandle, + #[allow(dead_code)] + train_data: Option, // dont drop datasets + validation_data: Vec, +} + +// exchange params method as well? does this make sense? +impl Booster { + /// Returns a builder. At least training data and params need to be added, + /// so that the model can be fitted (built). + pub fn builder() -> BoosterBuilder { + BoosterBuilder::default() + } + + /// Generates a prediction for a given Input. + /// Output dimensions depend on booster task. + /// Can return an Error if the input or model is corrupt. + /// ``` + /// use lightgbm::booster::Booster; + /// # use lightgbm::dataset::DataSet; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// # let params = serde_json::json! { + /// # { + /// # "num_iterations": 5, + /// # "objective": "binary", + /// # "metric": "auc", + /// # "data_random_seed": 0 + /// # } + /// # }; + /// # let x = vec![ + /// # vec![1.0, 0.1, 0.2, 0.1], + /// # vec![0.7, 0.4, 0.5, 0.1], + /// # vec![0.9, 0.8, 0.5, 0.1], + /// # vec![0.2, 0.2, 0.8, 0.7], + /// # vec![0.1, 0.7, 1.0, 0.9]]; + /// # let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// # let train_data = DataSet::from_mat(x,y); + /// # let input = vec![ + /// # vec![8.0, 0.2, 0.4, 0.5], + /// # vec![0.9, 0.4, 0.3, 0.5], + /// # vec![0.5, 0.6, 0.3, 0.8], + /// # vec![0.244, 0.25, 0.9, 0.9], + /// # vec![0.4, 0.8, 0.8, 0.7], + /// # ]; + /// let booster = Booster::builder() + /// .add_train_data(train_data) + /// .add_params(params)? + /// .fit()?; + /// let pred = booster.predict(&input)?; + /// + /// assert_eq!(input.len(), pred[0].len()); // binary classification. One output value for each input vec + /// # Ok(())} + /// ``` + pub fn predict(&self, x: &Matrixf64) -> Result { + let prediction_params = ""; // do we need this? + self.predict_with_params(x, prediction_params) + } + + /// Predict with additional params + /// ``` + /// use lightgbm::booster::Booster; + /// # use lightgbm::dataset::DataSet; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// # let params = serde_json::json! { + /// # { + /// # "num_iterations": 5, + /// # "objective": "binary", + /// # "metric": "auc", + /// # "data_random_seed": 0 + /// # } + /// # }; + /// # let x = vec![ + /// # vec![1.0, 0.1, 0.2, 0.1], + /// # vec![0.7, 0.4, 0.5, 0.1], + /// # vec![0.9, 0.8, 0.5, 0.1], + /// # vec![0.2, 0.2, 0.8, 0.7], + /// # vec![0.1, 0.7, 1.0, 0.9]]; + /// # let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// # let train_data = DataSet::from_mat(x,y); + /// # let input = vec![ + /// # vec![8.0, 0.2, 0.4, 0.5], + /// # vec![0.9, 0.4, 0.3, 0.5], + /// # vec![0.5, 0.6, 0.3, 0.8], + /// # vec![0.244, 0.25, 0.9, 0.9], + /// # vec![0.4, 0.8, 0.8, 0.7], + /// # ]; + /// let booster = Booster::builder() + /// .add_train_data(train_data) + /// .add_params(params)? + /// .fit()?; + /// let predict_params = "predict_raw_score=true"; + /// # let pred = booster.predict(&input)?; + /// let pred_raw = booster.predict_with_params(&input, predict_params)?; + /// + /// # Ok(())} + /// ``` + pub fn predict_with_params( + &self, + x: &Matrixf64, + prediction_params: &str, + ) -> Result { + ffi::predict(self.handle, prediction_params, x) + } + + /// Returns the scores for a certain dataset. You can use the index like this: + /// 0 = Train Dataset + /// 1 = 1. Validation Dataset + /// 2 = 2. Validation Dataset + /// ... + /// n = nth Validation Dataset + /// ``` + /// use lightgbm::booster::Booster; + /// # use lightgbm::dataset::DataSet; + /// # use lightgbm::LgbmError; + /// + /// # fn main() -> Result<(), LgbmError> { + /// # let params = serde_json::json! { + /// # { + /// # "num_iterations": 5, + /// # "objective": "binary", + /// # "metric": "auc", + /// # "data_random_seed": 0 + /// # } + /// # }; + /// # let x = vec![ + /// # vec![1.0, 0.1, 0.2, 0.1], + /// # vec![0.7, 0.4, 0.5, 0.1], + /// # vec![0.9, 0.8, 0.5, 0.1], + /// # vec![0.2, 0.2, 0.8, 0.7], + /// # vec![0.1, 0.7, 1.0, 0.9]]; + /// # let y = vec![0.0, 0.0, 0.0, 1.0, 1.0]; + /// # let train_data = DataSet::from_mat(x,y); + /// # let input = vec![ + /// # vec![8.0, 0.2, 0.4, 0.5], + /// # vec![0.9, 0.4, 0.3, 0.5], + /// # vec![0.5, 0.6, 0.3, 0.8], + /// # vec![0.244, 0.25, 0.9, 0.9], + /// # vec![0.4, 0.8, 0.8, 0.7], + /// # ]; + /// let booster = Booster::builder() + /// .add_train_data(train_data) + /// .add_params(params)? + /// .fit()?; + /// let pred = booster.predict(&input)?; + /// let eval = booster.get_eval_result_for_dataset(0); // train data + /// + /// # Ok(())} + /// ``` + pub fn get_eval_result_for_dataset( + &self, + dataset_index: i32, + ) -> Result, LgbmError> { + if dataset_index > self.validation_data.len() as i32 { + return Err(LgbmError::new(format!( + "Invalid Dataset Index. Given: {} Max Allowed: {}", + dataset_index, + self.validation_data.len() + ))); + } + let names = ffi::get_eval_names(self.handle)?; + let scores = ffi::get_eval_scores(self.handle, dataset_index, names.len())?; + Ok(names + .into_iter() + .zip(scores) + .map(|(metric_name, score)| EvalResult { metric_name, score }) + .collect()) + } + + /// this should take &mut self, because it changes the model + fn train_loop(&mut self, max_iterations: i32) -> Result<(), LgbmError> { + let mut is_finished = 0; + let mut i = 0; + while is_finished == 0 && i < max_iterations { + // callback stuff here + ffi::train_one_step(self.handle, &mut is_finished)?; + i += 1; + } + Ok(()) + } + + /* /// Train a booster further with a new dataset. + /// This should not reset the already existing submodels. + /// Pass an empty array as validation data, if you don't want to validate the train results. + /// TODO validate this after implemented + pub fn finetune( + &mut self, + _train_data: DataSet, + _validation_data: Vec, + ) -> Result<(), LgbmError> { + + }*/ +} + +impl Drop for Booster { + fn drop(&mut self) { + ffi::free_booster(self.handle).expect("Something went wrong dropping the Booster."); + } +} + +#[cfg(test)] +mod tests { + + #[test] + fn simple() {} +} diff --git a/src/dataset.rs b/src/dataset.rs deleted file mode 100644 index 406a028..0000000 --- a/src/dataset.rs +++ /dev/null @@ -1,356 +0,0 @@ -use libc::{c_char, c_void}; -use lightgbm_sys; -use lightgbm_sys::DatasetHandle; -use std; -use std::convert::TryInto; -use std::ffi::CString; - -#[cfg(feature = "dataframe")] -use polars::prelude::*; - -use crate::{Error, Result}; - -/// Dataset used throughout LightGBM for training. -/// -/// # Examples -/// -/// ## from mat -/// -/// ``` -/// use lightgbm::Dataset; -/// -/// let data = vec![vec![1.0, 0.1, 0.2, 0.1], -/// vec![0.7, 0.4, 0.5, 0.1], -/// vec![0.9, 0.8, 0.5, 0.1], -/// vec![0.2, 0.2, 0.8, 0.7], -/// vec![0.1, 0.7, 1.0, 0.9]]; -/// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; -/// let dataset = Dataset::from_mat(data, label).unwrap(); -/// ``` -/// -/// ## from file -/// -/// ``` -/// use lightgbm::Dataset; -/// -/// let dataset = Dataset::from_file(&"lightgbm-sys/lightgbm/examples/binary_classification/binary.train", None).unwrap(); -/// ``` -pub struct Dataset { - pub(crate) handle: lightgbm_sys::DatasetHandle, -} - -#[link(name = "c")] -impl Dataset { - fn new(handle: lightgbm_sys::DatasetHandle) -> Self { - Self { handle } - } - - /// Create a new `Dataset` from dense array in row-major order. - /// - /// Example - /// ``` - /// use lightgbm::Dataset; - /// - /// let data = vec![vec![1.0, 0.1, 0.2, 0.1], - /// vec![0.7, 0.4, 0.5, 0.1], - /// vec![0.9, 0.8, 0.5, 0.1], - /// vec![0.2, 0.2, 0.8, 0.7], - /// vec![0.1, 0.7, 1.0, 0.9]]; - /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - /// let dataset = Dataset::from_mat(data, label).unwrap(); - /// ``` - pub fn from_mat(data: Vec>, label: Vec) -> Result { - let data_length = data.len(); - let feature_length = data[0].len(); - let params = CString::new("").unwrap(); - let label_str = CString::new("label").unwrap(); - let reference = std::ptr::null_mut(); // not use - let mut handle = std::ptr::null_mut(); - let flat_data = data.into_iter().flatten().collect::>(); - - if data_length > i32::MAX as usize || feature_length > i32::MAX as usize { - return Err(Error::new(format!( - "received dataset of size {}x{}, but at most {}x{} is supported", - data_length, - feature_length, - i32::MAX, - i32::MAX - ))); - } - - lgbm_call!(lightgbm_sys::LGBM_DatasetCreateFromMat( - flat_data.as_ptr() as *const c_void, - lightgbm_sys::C_API_DTYPE_FLOAT64 as i32, - data_length as i32, - feature_length as i32, - 1_i32, - params.as_ptr() as *const c_char, - reference, - &mut handle - ))?; - - lgbm_call!(lightgbm_sys::LGBM_DatasetSetField( - handle, - label_str.as_ptr() as *const c_char, - label.as_ptr() as *const c_void, - data_length as i32, - lightgbm_sys::C_API_DTYPE_FLOAT32 as i32 - ))?; - - Ok(Self::new(handle)) - } - - /// Create a new `Dataset` from file. - /// - /// file is `tsv`. - /// ```text - ///