From c43143148c68fd5f63d43d256d272d6c3629d598 Mon Sep 17 00:00:00 2001 From: David Schwab Date: Thu, 18 May 2023 22:12:05 +0200 Subject: [PATCH 01/30] created first version of api. ffi calls and example missing --- Cargo.toml | 1 + src/booster/builder.rs | 95 ++++++++++++++++++++++++++++++ src/booster/ffi.rs | 1 + src/booster/mod.rs | 37 ++++++++++++ src/dataset/ffi.rs | 1 + src/dataset/mod.rs | 53 +++++++++++++++++ src/lib.rs | 9 ++- src/{booster.rs => old_booster.rs} | 19 +++--- src/{dataset.rs => old_dataset.rs} | 62 ++++++++++++++----- 9 files changed, 251 insertions(+), 27 deletions(-) create mode 100644 src/booster/builder.rs create mode 100644 src/booster/ffi.rs create mode 100644 src/booster/mod.rs create mode 100644 src/dataset/ffi.rs create mode 100644 src/dataset/mod.rs rename src/{booster.rs => old_booster.rs} (97%) rename src/{dataset.rs => old_dataset.rs} (81%) diff --git a/Cargo.toml b/Cargo.toml index 4a3f2b2..f7dbc24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ libc = "0.2.81" derive_builder = "0.5.1" serde_json = "1.0.59" polars = {version = "0.16.0", optional = true} +csv = "1.2.1" [features] diff --git a/src/booster/builder.rs b/src/booster/builder.rs new file mode 100644 index 0000000..b6c2f61 --- /dev/null +++ b/src/booster/builder.rs @@ -0,0 +1,95 @@ +use serde_json::Value; + +use booster::Booster; +use dataset::DataSet; +use Error; +use {InputMatrix, OutputVec}; + +// types for training set +#[derive(Clone)] +pub struct TrainDataAdded(DataSet); // this should not implement default, so it can safely be used for construction +#[derive(Default, Clone)] +pub struct TrainDataNotAdded; + +// types for params +#[derive(Clone)] +pub struct ParamsAdded(Value); // this should not implement default, so it can safely be used for construction +#[derive(Default, Clone)] +pub struct ParamsNotAdded; + +/// Builder for the Booster. +/// +/// Uses TypeState Pattern to make sure that Training Data is added +/// so that Validation can be synced properly and params are present for training. +#[derive(Default, Clone)] +pub struct BoosterBuilder { + train_data: T, + val_data: Vec, + params: P, // after #3 should this be a struct +} + +impl BoosterBuilder { + /// Returns the Builder and a clone from it. Useful if you want to train 2 models with + /// only a couple differences + pub fn duplicate(self) -> (Self, Self) { + (self.clone(), self) + } +} + +impl BoosterBuilder { + pub fn add_params(self, params: Value) -> BoosterBuilder { + BoosterBuilder { + params: ParamsAdded(params), + train_data: self.train_data, + val_data: self.val_data, + } + } +} + +impl BoosterBuilder { + /// Adds training data. necessary for validation data (so bins can be synced) + /// and for model fitting. + pub fn add_train_data(self, train: DataSet) -> BoosterBuilder { + BoosterBuilder { + train_data: TrainDataAdded(train), + val_data: self.val_data, + params: self.params, + } + } +} + +impl BoosterBuilder { + pub fn add_val_data(mut self, val: DataSet) -> Self { + self.val_data.push(val); + self + } +} + +/// Methods in this impl block are only available, after Training Data and Params are added. +impl BoosterBuilder { + /// Builds the booster by: + /// 1. Adding the training data + /// 2. Adding the validation data + /// 3. Training with the params + /// + /// Each of these steps can return errors. + pub fn fit(self) -> Result { + let train = self.train_data.0.load(None); + let vals: Vec<_> = self + .val_data + .iter() + .map(|v| v.load(Some(train.handle))) + .collect(); + // train classifier + // call train ffi from here + + // return + todo!() + } + + pub fn fit_predict(self, x: &InputMatrix) -> Result<(Booster, OutputVec), Error> { + let booster = self.fit()?; + let y = booster.predict(x)?; + Ok((booster, y)) + } +} diff --git a/src/booster/ffi.rs b/src/booster/ffi.rs new file mode 100644 index 0000000..0264036 --- /dev/null +++ b/src/booster/ffi.rs @@ -0,0 +1 @@ +// add ffi calls for booster here diff --git a/src/booster/mod.rs b/src/booster/mod.rs new file mode 100644 index 0000000..e8329fa --- /dev/null +++ b/src/booster/mod.rs @@ -0,0 +1,37 @@ +use booster::builder::{BoosterBuilder, ParamsNotAdded, TrainDataNotAdded}; +use dataset::DataSet; +use Error; +use {InputMatrix, OutputVec}; + +mod builder; +mod ffi; + +pub struct Booster { + handle: lightgbm_sys::BoosterHandle, + train_data: DataSet, + validation_data: Vec, +} + +impl Booster { + /// Returns a builder. At least training data and params need to be added, + /// so that the model can be fitted (built). + pub fn builder() -> BoosterBuilder { + BoosterBuilder::default() + } + + /// Generates a prediction for a given Input. + /// + /// Can return an Error if the input or model is corrupt. + pub fn predict(&self, x: &InputMatrix) -> Result { + let _ = x[0][0] + 1_f64; // silence warning for now + todo!() + } +} + +#[cfg(test)] +mod tests { + use booster::Booster; + + #[test] + fn simple() {} +} diff --git a/src/dataset/ffi.rs b/src/dataset/ffi.rs new file mode 100644 index 0000000..788e739 --- /dev/null +++ b/src/dataset/ffi.rs @@ -0,0 +1 @@ +// add ffi calls for dataset here diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs new file mode 100644 index 0000000..d0564a8 --- /dev/null +++ b/src/dataset/mod.rs @@ -0,0 +1,53 @@ +mod ffi; + +use lightgbm_sys::DatasetHandle; +#[cfg(feature = "dataframe")] +use polars::prelude::*; + +/// Represents an unloaded Dataset for the Booster Builder. +/// At the fit step of the BoosterBuilder, these will be added to the +/// lightgbm backend +#[derive(Clone)] +pub struct DataSet { + format: DataFormat, + /// Possible params for the Dataset. This is !currently! not in use + /// and will be just an empty string + params: String, +} + +#[derive(Clone)] +pub enum DataFormat { + File { + path: String, + }, + Vecs { + x: Vec>, + y: Vec, + }, + #[cfg(feature = "dataframe")] + DataFrame { + df: DataFrame, + y_column: Into, + }, +} + +pub struct LoadedDataSet { + pub(crate) handle: DatasetHandle, +} + +impl Drop for LoadedDataSet { + fn drop(&mut self) { + todo!() + } +} + +impl DataSet { + pub(crate) fn load(&self, reference: Option) -> LoadedDataSet { + match &self.format { + DataFormat::File { path } => todo!(), //add here corresponding ffi calls + DataFormat::Vecs { x, y } => todo!(), + #[cfg(feature = "dataframe")] + DataFormat::DataFrame { df, y_column } => todo!(), + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 2867d47..6e4062e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,10 +2,12 @@ extern crate libc; extern crate lightgbm_sys; extern crate serde_json; +type InputMatrix = Vec>; +type OutputVec = Vec; + #[cfg(feature = "dataframe")] extern crate polars; -#[macro_use] macro_rules! lgbm_call { ($x:expr) => { Error::check_return_value(unsafe { $x }) @@ -15,8 +17,5 @@ macro_rules! lgbm_call { mod error; pub use error::{Error, Result}; -mod dataset; -pub use dataset::Dataset; - mod booster; -pub use booster::Booster; +mod dataset; diff --git a/src/booster.rs b/src/old_booster.rs similarity index 97% rename from src/booster.rs rename to src/old_booster.rs index 0830628..47280a3 100644 --- a/src/booster.rs +++ b/src/old_booster.rs @@ -149,7 +149,7 @@ impl Booster { &mut handle ))?; - // the following has to borrow val_data to avoid dropping the dataset + // the following has to borrow val_data to avoid dropping the old_dataset if let Some(validation_data) = &val_data { lgbm_call!(lightgbm_sys::LGBM_BoosterAddValidData( handle, @@ -410,7 +410,7 @@ impl Booster { /// Save model to string. This returns the same content that `save_file` writes into a file. pub fn save_string(&self) -> Result { - // get nessesary buffer size + // get necessary buffer size let mut out_size = 0_i64; lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModelToString( @@ -550,14 +550,19 @@ mod tests { "data_random_seed": 0 } }; - let train = _read_train_file().unwrap(); + let train = Dataset::from_file( + "lightgbm-sys/lightgbm/examples/binary_classification/binary.train", + None, + ) + .unwrap(); let val = Dataset::from_file( - &"lightgbm-sys/lightgbm/examples/binary_classification/binary.test", - Some(train.handle), + "lightgbm-sys/lightgbm/examples/binary_classification/binary.test", + Some(&train), ) .unwrap(); let bst = Booster::train(train, Some(val), ¶ms).unwrap(); + //let bst = Booster::train(train, None, ¶ms).unwrap(); let eval_train = bst.get_eval(0); let eval_val = bst.get_eval(1); @@ -577,7 +582,7 @@ mod tests { vec![0.1, 0.7, 1.0, 0.9], ]; let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - let train_data = Dataset::from_mat(data, label).unwrap(); + let train_data = Dataset::from_mat(data, label, None).unwrap(); let data = vec![ vec![0.9, 0.6, 0.2, 0.1], @@ -585,7 +590,7 @@ mod tests { vec![0.2, 0.1, 0.6, 0.8], ]; let label = vec![0.0, 0.0, 1.0]; - let val_data = Dataset::from_mat(data, label); + let val_data = Dataset::from_mat(data, label, None); let params = json! { { diff --git a/src/dataset.rs b/src/old_dataset.rs similarity index 81% rename from src/dataset.rs rename to src/old_dataset.rs index 406a028..ad0ea35 100644 --- a/src/dataset.rs +++ b/src/old_dataset.rs @@ -59,18 +59,27 @@ impl Dataset { /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; /// let dataset = Dataset::from_mat(data, label).unwrap(); /// ``` - pub fn from_mat(data: Vec>, label: Vec) -> Result { + pub fn from_mat( + data: Vec>, + label: Vec, + reference_dataset: Option<&Dataset>, + ) -> Result { let data_length = data.len(); let feature_length = data[0].len(); let params = CString::new("").unwrap(); let label_str = CString::new("label").unwrap(); - let reference = std::ptr::null_mut(); // not use + + let reference = match reference_dataset { + Some(h) => h.handle.clone(), + None => std::ptr::null_mut(), + }; + let mut handle = std::ptr::null_mut(); let flat_data = data.into_iter().flatten().collect::>(); if data_length > i32::MAX as usize || feature_length > i32::MAX as usize { return Err(Error::new(format!( - "received dataset of size {}x{}, but at most {}x{} is supported", + "received old_dataset of size {}x{}, but at most {}x{} is supported", data_length, feature_length, i32::MAX, @@ -121,13 +130,13 @@ impl Dataset { /// /// let dataset = Dataset::from_file(&"lightgbm-sys/lightgbm/examples/binary_classification/binary.train", None); /// ``` - pub fn from_file(file_path: &str, dataset_handle: Option) -> Result { + pub fn old_from_file(file_path: &str, reference_dataset: Option<&Dataset>) -> Result { let file_path_str = CString::new(file_path).unwrap(); let params = CString::new("").unwrap(); let mut handle = std::ptr::null_mut(); - let reference = match dataset_handle { - Some(h) => h, + let reference = match reference_dataset { + Some(h) => h.handle.clone(), None => std::ptr::null_mut(), }; @@ -141,6 +150,28 @@ impl Dataset { Ok(Self::new(handle)) } + pub fn from_file(file_path: &str, reference_dataset: Option<&Dataset>) -> Result { + let rdr = csv::ReaderBuilder::new() + .has_headers(false) + .delimiter(b'\t') + .from_path(file_path); + let mut labels: Vec = Vec::new(); + let mut features: Vec> = Vec::new(); + for result in rdr.unwrap().records() { + let record = result.unwrap(); + let label = record[0].parse::().unwrap(); + let feature: Vec = record + .iter() + .map(|x| x.parse::().unwrap()) + .collect::>()[1..] + .to_vec(); + labels.push(label); + features.push(feature); + } + + Self::from_mat(features, labels, reference_dataset) + } + /// Create a new `Dataset` from a polars DataFrame. /// /// Note: the feature ```dataframe``` is required for this method @@ -163,7 +194,7 @@ impl Dataset { "feature_4" => [0.1, 0.1, 0.1, 0.7, 0.9], "label" => [0.0, 0.0, 0.0, 1.0, 1.0] ].unwrap(); - let dataset = Dataset::from_dataframe(df, String::from("label")).unwrap(); + let old_dataset = Dataset::from_dataframe(df, String::from("label")).unwrap(); "## )] #[cfg(feature = "dataframe")] @@ -175,7 +206,7 @@ impl Dataset { let label_series = &dataframe.select_series(label_col_name)?[0].cast::()?; if label_series.null_count() != 0 { - panic!("Cannot create a dataset with null values, encountered nulls when creating the label array") + panic!("Cannot create a old_dataset with null values, encountered nulls when creating the label array") } dataframe.drop_in_place(label_col_name)?; @@ -198,7 +229,7 @@ impl Dataset { for (_col_idx, series) in dataframe.get_columns().iter().enumerate() { if series.null_count() != 0 { - panic!("Cannot create a dataset with null values, encountered nulls when creating the features array") + panic!("Cannot create a old_dataset with null values, encountered nulls when creating the features array") } let series = series.cast::()?; @@ -219,7 +250,7 @@ impl Dataset { ))?; result .try_into() - .map_err(|_| Error::new("dataset length negative")) + .map_err(|_| Error::new("old_dataset length negative")) } pub fn get_feature_count(&self) -> Result { @@ -237,7 +268,7 @@ impl Dataset { let dataset_len = self.get_data_len()?; if dataset_len != weights.len() { return Err(Error::new(format!( - "got {} weights, but dataset has {} records", + "got {} weights, but old_dataset has {} records", weights.len(), dataset_len ))); @@ -257,6 +288,7 @@ impl Dataset { impl Drop for Dataset { fn drop(&mut self) { + println!("u just dropped a old_dataset"); lgbm_call!(lightgbm_sys::LGBM_DatasetFree(self.handle)).unwrap(); } } @@ -286,7 +318,7 @@ mod tests { vec![0.1, 0.7, 1.0, 0.9], ]; let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - let dataset = Dataset::from_mat(data, label); + let dataset = Dataset::from_mat(data, label, None); assert!(dataset.is_ok()); } @@ -317,7 +349,7 @@ mod tests { vec![0.1, 0.7, 1.0, 0.9], ]; let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - let dataset = Dataset::from_mat(data, label).unwrap(); + let dataset = Dataset::from_mat(data, label, None).unwrap(); assert_eq!(dataset.get_data_len(), Ok(5)); assert_eq!(dataset.get_feature_count(), Ok(4)); } @@ -332,7 +364,7 @@ mod tests { vec![0.1, 0.7, 1.0, 0.9], ]; let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - let mut dataset = Dataset::from_mat(data, label).unwrap(); + let mut dataset = Dataset::from_mat(data, label, None).unwrap(); let weights = vec![0.5, 1.0, 2.0, 0.5, 0.5]; dataset.set_weights(weights).unwrap(); } @@ -347,7 +379,7 @@ mod tests { vec![0.1, 0.7, 1.0, 0.9], ]; let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - let mut dataset = Dataset::from_mat(data, label).unwrap(); + let mut dataset = Dataset::from_mat(data, label, None).unwrap(); let weights_short = vec![0.5, 1.0, 2.0, 0.5]; let weights_long = vec![0.5, 1.0, 2.0, 0.5, 0.1, 0.1]; assert!(dataset.set_weights(weights_short).is_err()); From 7732775ed3c0959a0d7fa804228c3ce6ade0661c Mon Sep 17 00:00:00 2001 From: David Schwab Date: Fri, 19 May 2023 16:51:50 +0200 Subject: [PATCH 02/30] added dataset ffi --- src/booster/builder.rs | 1 + src/booster/mod.rs | 17 ++++++++ src/dataset/dataframe.rs | 53 +++++++++++++++++++++++ src/dataset/ffi.rs | 91 +++++++++++++++++++++++++++++++++++++++- src/dataset/mod.rs | 30 ++++++++----- 5 files changed, 181 insertions(+), 11 deletions(-) create mode 100644 src/dataset/dataframe.rs diff --git a/src/booster/builder.rs b/src/booster/builder.rs index b6c2f61..9801898 100644 --- a/src/booster/builder.rs +++ b/src/booster/builder.rs @@ -81,6 +81,7 @@ impl BoosterBuilder { .map(|v| v.load(Some(train.handle))) .collect(); // train classifier + // check callbacks (not implemented yet) // call train ffi from here // return diff --git a/src/booster/mod.rs b/src/booster/mod.rs index e8329fa..39dd597 100644 --- a/src/booster/mod.rs +++ b/src/booster/mod.rs @@ -6,6 +6,11 @@ use {InputMatrix, OutputVec}; mod builder; mod ffi; +pub struct EvalResult { + metric_name: String, + score: f64, +} + pub struct Booster { handle: lightgbm_sys::BoosterHandle, train_data: DataSet, @@ -26,6 +31,18 @@ impl Booster { let _ = x[0][0] + 1_f64; // silence warning for now todo!() } + + /// Returns the scores for the train and validation set. + /// If successful, returns a Result with a m·n matrix, where + /// m = number of datasets + /// n = number of metrics + pub fn get_eval_results(&self) -> Result>, Error> { + todo!("just ffi call i guess") + } + + pub fn finetune(&self, data: DataSet) -> Result<(), Error> { + todo!() + } } #[cfg(test)] diff --git a/src/dataset/dataframe.rs b/src/dataset/dataframe.rs new file mode 100644 index 0000000..07bfc6a --- /dev/null +++ b/src/dataset/dataframe.rs @@ -0,0 +1,53 @@ +use polars::prelude::{DataFrame, Float32Type, Float64Type, PolarsError}; + +use {InputMatrix, OutputVec}; + +type FfiError = crate::Error; + +pub(crate) fn dataframe_to_mat( + &mut dataframe: DataFrame, + label_column: String, +) -> Result<(InputMatrix, OutputVec), FfiError> { + let label_col_name = label_column.as_str(); + let (m, n) = dataframe.shape(); + let label_series = &dataframe.select_series(label_col_name)?[0].cast::()?; + if label_series.null_count() != 0 { + return Err(FfiError::new("Cannot create a dataset with null values, encountered nulls when creating the label array")); + } + + dataframe + .drop_in_place(label_col_name) + .map_err(FfiError::new)?; + + let mut label_values = Vec::with_capacity(m); + + let label_values_ca = label_series + .unpack::() + .map_err(FfiError::new)?; + + label_values_ca + .into_no_null_iter() + .enumerate() + .for_each(|(_row_idx, val)| { + label_values.push(val); + }); + + let mut feature_values = Vec::with_capacity(m); + for _i in 0..m { + feature_values.push(Vec::with_capacity(n)); + } + + for (_col_idx, series) in dataframe.get_columns().iter().enumerate() { + if series.null_count() != 0 { + return Err(FfiError::new("Cannot create a dataset with null values, encountered nulls when creating the label array")); + } + + let series = series.cast::().map_err(FfiError::new)?; + let ca = series.unpack::().map_err(FfiError::new)?; + + ca.into_no_null_iter() + .enumerate() + .for_each(|(row_idx, val)| feature_values[row_idx].push(val)); + } + Ok((feature_values, label_values)) +} diff --git a/src/dataset/ffi.rs b/src/dataset/ffi.rs index 788e739..d45f355 100644 --- a/src/dataset/ffi.rs +++ b/src/dataset/ffi.rs @@ -1 +1,90 @@ -// add ffi calls for dataset here +use std::error::Error; +use std::ffi::CString; + +use lightgbm_sys::DatasetHandle; + +use dataset::LoadedDataSet; +use {InputMatrix, OutputVec}; + +type FfiError = crate::Error; + +pub(crate) fn drop_dataset(handle: DatasetHandle) -> Result<(), FfiError> { + lgbm_call!(lightgbm_sys::LGBM_DatasetFree(handle))?; + Ok(()) +} + +pub(crate) fn load_dataset_from_file( + file_path: &str, + dataset_params: &str, + reference_dataset: &Option, +) -> Result { + let file_path_str = CString::new(file_path).unwrap(); + let params = CString::new(dataset_params).unwrap(); + let mut handle = std::ptr::null_mut(); + + let reference = match reference_dataset { + Some(h) => h.clone(), + None => std::ptr::null_mut(), + }; + + lgbm_call!(lightgbm_sys::LGBM_DatasetCreateFromFile( + file_path_str.as_ptr() as *const c_char, + params.as_ptr() as *const c_char, + reference, + &mut handle + ))?; + + Ok(handle) +} + +pub(crate) fn load_from_vec( + data: &InputMatrix, + label: &OutputVec, + dataset_params: &str, + reference_dataset: &Option, +) -> Result { + let data_length = data.len(); + let feature_length = data[0].len(); + let params = CString::new(dataset_params).unwrap(); + let label_str = CString::new("label").unwrap(); + + let reference = match reference_dataset { + Some(h) => h.clone(), + None => std::ptr::null_mut(), + }; + + let mut handle = std::ptr::null_mut(); + // mhhh..... does lightgbm reserve new space or uses this one + let flat_data = data.into_iter().flatten().collect::>(); + + if data_length > i32::MAX as usize || feature_length > i32::MAX as usize { + return Err(FfiError::new(format!( + "received old_dataset of size {}x{}, but at most {}x{} is supported", + data_length, + feature_length, + i32::MAX, + i32::MAX + ))); + } + + lgbm_call!(lightgbm_sys::LGBM_DatasetCreateFromMat( + flat_data.as_ptr() as *const c_void, + lightgbm_sys::C_API_DTYPE_FLOAT64 as i32, + data_length as i32, + feature_length as i32, + 1_i32, + params.as_ptr() as *const c_char, + reference, + &mut handle + ))?; + + lgbm_call!(lightgbm_sys::LGBM_DatasetSetField( + handle, + label_str.as_ptr() as *const c_char, + label.as_ptr() as *const c_void, + data_length as i32, + lightgbm_sys::C_API_DTYPE_FLOAT32 as i32 + ))?; + + Ok(handle) +} diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index d0564a8..8ade52c 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -1,8 +1,12 @@ +#[cfg(feature = "dataframe")] +mod dataframe; mod ffi; use lightgbm_sys::DatasetHandle; #[cfg(feature = "dataframe")] use polars::prelude::*; +use OutputVec; +use {Error, InputMatrix}; /// Represents an unloaded Dataset for the Booster Builder. /// At the fit step of the BoosterBuilder, these will be added to the @@ -21,13 +25,13 @@ pub enum DataFormat { path: String, }, Vecs { - x: Vec>, - y: Vec, + x: InputMatrix, + y: OutputVec, }, #[cfg(feature = "dataframe")] DataFrame { df: DataFrame, - y_column: Into, + y_column: String, }, } @@ -37,17 +41,23 @@ pub struct LoadedDataSet { impl Drop for LoadedDataSet { fn drop(&mut self) { - todo!() + ffi::drop_dataset(self.handle).expect("Something went wrong dropping the Dataset."); } } impl DataSet { - pub(crate) fn load(&self, reference: Option) -> LoadedDataSet { - match &self.format { - DataFormat::File { path } => todo!(), //add here corresponding ffi calls - DataFormat::Vecs { x, y } => todo!(), + pub(crate) fn load(&self, reference: Option) -> Result { + let handle = match &self.format { + DataFormat::File { path } => { + ffi::load_dataset_from_file(path, &self.params, &reference) + } + DataFormat::Vecs { x, y } => ffi::load_from_vec(x, y, &self.params, &reference), #[cfg(feature = "dataframe")] - DataFormat::DataFrame { df, y_column } => todo!(), - } + DataFormat::DataFrame { df, y_column } => { + let (x, y) = dataframe::dataframe_to_mat(dataframe, label_column)?; + ffi::load_from_vec(&x, &y, &self.params, &reference) + } + }?; + Ok(LoadedDataSet { handle }) } } From 5057c1ab7c45431892bf505bbdfabf9723992c72 Mon Sep 17 00:00:00 2001 From: David Schwab Date: Fri, 19 May 2023 17:13:56 +0200 Subject: [PATCH 03/30] fixed build --- src/booster/builder.rs | 4 ++-- src/dataset/ffi.rs | 12 ++++++------ src/dataset/mod.rs | 13 ++++++------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/booster/builder.rs b/src/booster/builder.rs index 9801898..0fe0e35 100644 --- a/src/booster/builder.rs +++ b/src/booster/builder.rs @@ -74,11 +74,11 @@ impl BoosterBuilder { /// /// Each of these steps can return errors. pub fn fit(self) -> Result { - let train = self.train_data.0.load(None); + let train = self.train_data.0.load(&None); let vals: Vec<_> = self .val_data .iter() - .map(|v| v.load(Some(train.handle))) + .map(|v| v.load(&train.as_ref().map(|t| t.handle).ok())) .collect(); // train classifier // check callbacks (not implemented yet) diff --git a/src/dataset/ffi.rs b/src/dataset/ffi.rs index d45f355..a0d8bb0 100644 --- a/src/dataset/ffi.rs +++ b/src/dataset/ffi.rs @@ -1,14 +1,14 @@ -use std::error::Error; use std::ffi::CString; +use libc::{c_char, c_void}; use lightgbm_sys::DatasetHandle; use dataset::LoadedDataSet; use {InputMatrix, OutputVec}; -type FfiError = crate::Error; +use crate::error::{Error, Result}; -pub(crate) fn drop_dataset(handle: DatasetHandle) -> Result<(), FfiError> { +pub(crate) fn drop_dataset(handle: DatasetHandle) -> Result<()> { lgbm_call!(lightgbm_sys::LGBM_DatasetFree(handle))?; Ok(()) } @@ -17,7 +17,7 @@ pub(crate) fn load_dataset_from_file( file_path: &str, dataset_params: &str, reference_dataset: &Option, -) -> Result { +) -> Result { let file_path_str = CString::new(file_path).unwrap(); let params = CString::new(dataset_params).unwrap(); let mut handle = std::ptr::null_mut(); @@ -42,7 +42,7 @@ pub(crate) fn load_from_vec( label: &OutputVec, dataset_params: &str, reference_dataset: &Option, -) -> Result { +) -> Result { let data_length = data.len(); let feature_length = data[0].len(); let params = CString::new(dataset_params).unwrap(); @@ -58,7 +58,7 @@ pub(crate) fn load_from_vec( let flat_data = data.into_iter().flatten().collect::>(); if data_length > i32::MAX as usize || feature_length > i32::MAX as usize { - return Err(FfiError::new(format!( + return Err(Error::new(format!( "received old_dataset of size {}x{}, but at most {}x{} is supported", data_length, feature_length, diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 8ade52c..1a00f1d 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -4,7 +4,8 @@ mod ffi; use lightgbm_sys::DatasetHandle; #[cfg(feature = "dataframe")] -use polars::prelude::*; +use polars::prelude::DataFrame; + use OutputVec; use {Error, InputMatrix}; @@ -46,16 +47,14 @@ impl Drop for LoadedDataSet { } impl DataSet { - pub(crate) fn load(&self, reference: Option) -> Result { + pub(crate) fn load(&self, reference: &Option) -> Result { let handle = match &self.format { - DataFormat::File { path } => { - ffi::load_dataset_from_file(path, &self.params, &reference) - } - DataFormat::Vecs { x, y } => ffi::load_from_vec(x, y, &self.params, &reference), + DataFormat::File { path } => ffi::load_dataset_from_file(path, &self.params, reference), + DataFormat::Vecs { x, y } => ffi::load_from_vec(x, y, &self.params, reference), #[cfg(feature = "dataframe")] DataFormat::DataFrame { df, y_column } => { let (x, y) = dataframe::dataframe_to_mat(dataframe, label_column)?; - ffi::load_from_vec(&x, &y, &self.params, &reference) + ffi::load_from_vec(&x, &y, &self.params, reference) } }?; Ok(LoadedDataSet { handle }) From c1d5dec52b7e765fc1daca9016b16e0e7a83dc9c Mon Sep 17 00:00:00 2001 From: David Schwab Date: Sat, 20 May 2023 11:06:53 +0200 Subject: [PATCH 04/30] fix couple errors --- src/dataset/dataframe.rs | 2 +- src/dataset/ffi.rs | 10 +++++----- src/dataset/mod.rs | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/dataset/dataframe.rs b/src/dataset/dataframe.rs index 07bfc6a..35a48ef 100644 --- a/src/dataset/dataframe.rs +++ b/src/dataset/dataframe.rs @@ -5,7 +5,7 @@ use {InputMatrix, OutputVec}; type FfiError = crate::Error; pub(crate) fn dataframe_to_mat( - &mut dataframe: DataFrame, + dataframe: &mut DataFrame, label_column: String, ) -> Result<(InputMatrix, OutputVec), FfiError> { let label_col_name = label_column.as_str(); diff --git a/src/dataset/ffi.rs b/src/dataset/ffi.rs index a0d8bb0..814635f 100644 --- a/src/dataset/ffi.rs +++ b/src/dataset/ffi.rs @@ -22,8 +22,8 @@ pub(crate) fn load_dataset_from_file( let params = CString::new(dataset_params).unwrap(); let mut handle = std::ptr::null_mut(); - let reference = match reference_dataset { - Some(h) => h.clone(), + let reference = match *reference_dataset { + Some(h) => h, None => std::ptr::null_mut(), }; @@ -48,14 +48,14 @@ pub(crate) fn load_from_vec( let params = CString::new(dataset_params).unwrap(); let label_str = CString::new("label").unwrap(); - let reference = match reference_dataset { - Some(h) => h.clone(), + let reference = match *reference_dataset { + Some(h) => h, None => std::ptr::null_mut(), }; let mut handle = std::ptr::null_mut(); // mhhh..... does lightgbm reserve new space or uses this one - let flat_data = data.into_iter().flatten().collect::>(); + let flat_data = data.iter().flatten().collect::>(); if data_length > i32::MAX as usize || feature_length > i32::MAX as usize { return Err(Error::new(format!( diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index 1a00f1d..fe4ce3c 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -1,7 +1,3 @@ -#[cfg(feature = "dataframe")] -mod dataframe; -mod ffi; - use lightgbm_sys::DatasetHandle; #[cfg(feature = "dataframe")] use polars::prelude::DataFrame; @@ -9,6 +5,10 @@ use polars::prelude::DataFrame; use OutputVec; use {Error, InputMatrix}; +#[cfg(feature = "dataframe")] +mod dataframe; +mod ffi; + /// Represents an unloaded Dataset for the Booster Builder. /// At the fit step of the BoosterBuilder, these will be added to the /// lightgbm backend From 40cef9e4b65f3bf0706c5d1015f8cc5ab1be586b Mon Sep 17 00:00:00 2001 From: David Schwab Date: Thu, 25 May 2023 12:01:24 +0200 Subject: [PATCH 05/30] added ffi calls, documentation (without tests). integrated pull request feedback. --- src/booster/builder.rs | 147 ++++++++++++++++++++++++++++++++------- src/booster/ffi.rs | 102 ++++++++++++++++++++++++++- src/booster/mod.rs | 42 ++++++++--- src/dataset/dataframe.rs | 4 +- src/dataset/ffi.rs | 13 ++-- src/dataset/mod.rs | 33 +++++++-- src/error.rs | 28 +++++--- src/lib.rs | 5 +- 8 files changed, 310 insertions(+), 64 deletions(-) diff --git a/src/booster/builder.rs b/src/booster/builder.rs index 0fe0e35..eef4bc0 100644 --- a/src/booster/builder.rs +++ b/src/booster/builder.rs @@ -2,20 +2,24 @@ use serde_json::Value; use booster::Booster; use dataset::DataSet; -use Error; +use {booster, LgbmError}; use {InputMatrix, OutputVec}; +///////////////////////////////////////////// // types for training set #[derive(Clone)] pub struct TrainDataAdded(DataSet); // this should not implement default, so it can safely be used for construction #[derive(Default, Clone)] -pub struct TrainDataNotAdded; +pub struct TrainDataMissing; +///////////////////////////////////////////// +///////////////////////////////////////////// // types for params #[derive(Clone)] -pub struct ParamsAdded(Value); // this should not implement default, so it can safely be used for construction +pub struct ParamsAdded(String); // this should not implement default, so it can safely be used for construction #[derive(Default, Clone)] -pub struct ParamsNotAdded; +pub struct ParamsMissing; +///////////////////////////////////////////// /// Builder for the Booster. /// @@ -25,28 +29,36 @@ pub struct ParamsNotAdded; pub struct BoosterBuilder { train_data: T, val_data: Vec, - params: P, // after #3 should this be a struct + params: P, } +/// These Methods are always available to the Booster. impl BoosterBuilder { /// Returns the Builder and a clone from it. Useful if you want to train 2 models with - /// only a couple differences + /// only a couple differences. This should be called at the end of the adapter chain, + /// where u defined all things that are equal in the models. + /// U can then continue to build the models separately. pub fn duplicate(self) -> (Self, Self) { (self.clone(), self) } } -impl BoosterBuilder { - pub fn add_params(self, params: Value) -> BoosterBuilder { - BoosterBuilder { - params: ParamsAdded(params), +/// Methods in this block require, that no params are added to the Booster. +impl BoosterBuilder { + /// Adds params to the Booster. + /// Returns Error, if param parsing returns Error. + pub fn add_params(self, params: Value) -> Result, LgbmError> { + let parsed_params = parse_params(params)?; + Ok(BoosterBuilder { + params: ParamsAdded(parsed_params), train_data: self.train_data, val_data: self.val_data, - } + }) } } -impl BoosterBuilder { +/// Methods in this Block require, that there is no train data added to the Booster. +impl BoosterBuilder { /// Adds training data. necessary for validation data (so bins can be synced) /// and for model fitting. pub fn add_train_data(self, train: DataSet) -> BoosterBuilder { @@ -58,7 +70,9 @@ impl BoosterBuilder { } } +/// Methods in this impl Block require, that training data is already added. impl BoosterBuilder { + /// Adds validation data to the Booster. pub fn add_val_data(mut self, val: DataSet) -> Self { self.val_data.push(val); self @@ -72,25 +86,104 @@ impl BoosterBuilder { /// 2. Adding the validation data /// 3. Training with the params /// - /// Each of these steps can return errors. - pub fn fit(self) -> Result { - let train = self.train_data.0.load(&None); - let vals: Vec<_> = self - .val_data - .iter() - .map(|v| v.load(&train.as_ref().map(|t| t.handle).ok())) - .collect(); - // train classifier - // check callbacks (not implemented yet) - // call train ffi from here - - // return - todo!() + /// Each of these steps can fail and return errors. + pub fn fit(self) -> Result { + let train_data = self.train_data.0.load(None)?; + let booster_handle = booster::ffi::new_booster(train_data.handle, &self.params.0)?; + let mut validation_sets = Vec::with_capacity(self.val_data.len()); + for val in self.val_data.into_iter() { + let loaded_data = val.load(Some(train_data.handle))?; + booster::ffi::add_validation_data_to_booster(booster_handle, loaded_data.handle)?; + validation_sets.push(loaded_data); + } + let mut booster = Booster { + handle: booster_handle, + train_data: Some(train_data), + validation_data: validation_sets, + }; + booster.train_loop()?; + Ok(booster) } - pub fn fit_predict(self, x: &InputMatrix) -> Result<(Booster, OutputVec), Error> { + /// Build the Booster with fit and immediately predict for the given input. + /// Can Fail in fit if the Booster isn't correctly build or in predict if the Input Data + /// is corrupted. + pub fn fit_predict(self, x: &InputMatrix) -> Result<(Booster, OutputVec), LgbmError> { let booster = self.fit()?; let y = booster.predict(x)?; Ok((booster, y)) } } + +/// Transforms a serde_json Value object into a String that Lightgbm Requires. Note that a conversion +/// to a CString is still required for the ffi. +/// The algorithms thransforms data like this: +/// {"x": "y", "z": 1} => "x=y z=1" +/// and +/// {"k" = ["a", "b"]} => "k=a,b" +/// Returns Error if the Value object somehow doesn't represents valid json, or the num_iterations +/// param is not set. +fn parse_params(params: Value) -> Result { + if params.get("num_iterations").is_none() { + return Err(LgbmError::new("Num Iterations not specified.")); + } + + let s = params + .as_object() + .ok_or(LgbmError::new("Couldn't parse params"))? + .iter() + .map(|(k, v)| match v { + Value::Array(a) => { + let v_formatted = a.iter().map(|x| x.to_string() + ",").collect::(); + let v_formatted = v_formatted + .replace("\",\"", ",") + .trim_end_matches(',') + .to_string(); + (k, v_formatted) + } + _ => (k, v.to_string()), + }) + .map(|(k, v)| format!("{}={}", k, v)) + .collect::>() + .join(" "); + Ok(s) +} + +#[cfg(test)] +mod tests { + use serde_json::{json, Value}; + + use booster::Booster; + use dataset::{DataFormat, DataSet}; + + fn _default_params() -> Value { + let params = json! { + { + "num_iterations": 1, + "objective": "binary", + "metric": "auc", + "data_random_seed": 0 + } + }; + params + } + + #[test] + fn easy() { + let x = vec![vec![1.0, 1.0, 0.5], vec![1.0, 1.0, 0.5]]; + let y = vec![0_f32, 1.0]; + let format = DataFormat::Vecs { x, y }; + let dataset = DataSet::new(format); + let (bst_low_lr, bst_high_lr) = Booster::builder().add_train_data(dataset).duplicate(); + let _bst_low_lr = bst_low_lr + .add_params(_default_params()) + .unwrap() + .fit() + .unwrap(); + let _bst_high_lr = bst_high_lr + .add_params(_default_params()) + .unwrap() + .fit() + .unwrap(); + } +} diff --git a/src/booster/ffi.rs b/src/booster/ffi.rs index 0264036..a6dd779 100644 --- a/src/booster/ffi.rs +++ b/src/booster/ffi.rs @@ -1 +1,101 @@ -// add ffi calls for booster here +use std::ffi::CString; + +use libc::{c_char, c_double, c_longlong, c_void}; +use lightgbm_sys::{BoosterHandle, DatasetHandle}; + +use {lightgbm_sys, InputMatrix}; + +use crate::{LgbmError, Result}; + +pub(crate) fn new_booster(train_data: DatasetHandle, parsed_params: &str) -> Result { + let params_cstring = CString::new(parsed_params)?; + let mut handle = std::ptr::null_mut(); + lgbm_call!(lightgbm_sys::LGBM_BoosterCreate( + train_data, + params_cstring.as_ptr() as *const c_char, + &mut handle + ))?; + Ok(handle) +} + +pub(crate) fn add_validation_data_to_booster( + booster: BoosterHandle, + validation_data_handle: DatasetHandle, +) -> Result<()> { + lgbm_call!(lightgbm_sys::LGBM_BoosterAddValidData( + booster, + validation_data_handle + )) +} + +#[inline] +pub(crate) fn train_one_step(booster: BoosterHandle, is_finished: &mut i32) -> Result<()> { + lgbm_call!(lightgbm_sys::LGBM_BoosterUpdateOneIter( + booster, + is_finished + )) +} + +pub(crate) fn get_num_classes(booster: BoosterHandle) -> Result { + let mut num_classes = -1; + lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumClasses( + booster, + &mut num_classes + ))?; + if num_classes > -1 { + Ok(num_classes) + } else { + Err(LgbmError::new( + "lgbm didn't update the number of classes correctly.", + )) + } +} + +pub(crate) fn predict( + booster: BoosterHandle, + prediction_params: &str, + data: InputMatrix, +) -> Result { + let data_length = data.len(); + let feature_length = data[0].len(); + let params = CString::new(prediction_params)?; + let mut out_length: c_longlong = 0; + let flat_data = data.into_iter().flatten().collect::>(); + let num_classes = get_num_classes(booster)?; + let out_result: Vec = vec![Default::default(); data_length * num_classes as usize]; + + lgbm_call!(lightgbm_sys::LGBM_BoosterPredictForMat( + booster, + flat_data.as_ptr() as *const c_void, + lightgbm_sys::C_API_DTYPE_FLOAT64 as i32, + data_length as i32, + feature_length as i32, + 1_i32, + 0_i32, + 0_i32, + -1_i32, + params.as_ptr() as *const c_char, + &mut out_length, + out_result.as_ptr() as *mut c_double + ))?; + + // reshape for multiclass [1,2,3,4,5,6] -> [[1,2,3], [4,5,6]] # 3 class + let reshaped_output = if num_classes > 1 { + out_result + .chunks(num_classes as usize) + .map(|x| x.to_vec()) + .collect() + } else { + vec![out_result] + }; + Ok(reshaped_output) +} + +pub(crate) fn num_feature(booster: BoosterHandle) -> Result { + let mut out_len = 0; + lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumFeature( + booster, + &mut out_len + ))?; + Ok(out_len) +} diff --git a/src/booster/mod.rs b/src/booster/mod.rs index 39dd597..dd945cd 100644 --- a/src/booster/mod.rs +++ b/src/booster/mod.rs @@ -1,33 +1,38 @@ -use booster::builder::{BoosterBuilder, ParamsNotAdded, TrainDataNotAdded}; -use dataset::DataSet; -use Error; +use booster::builder::{BoosterBuilder, ParamsMissing, TrainDataMissing}; +use dataset::{DataSet, LoadedDataSet}; +use LgbmError; use {InputMatrix, OutputVec}; mod builder; mod ffi; +/// Evaluation Result of a Booster on a given Dataset. +/// Returned by get_eval pub struct EvalResult { metric_name: String, score: f64, } +/// Class that is returned by the builder, once fit() is called. +/// Used to interact with a trained booster. pub struct Booster { handle: lightgbm_sys::BoosterHandle, - train_data: DataSet, - validation_data: Vec, + train_data: Option, // dont drop datasets + validation_data: Vec, } +// exchange params method aswell? does this make sense? impl Booster { /// Returns a builder. At least training data and params need to be added, /// so that the model can be fitted (built). - pub fn builder() -> BoosterBuilder { + pub fn builder() -> BoosterBuilder { BoosterBuilder::default() } /// Generates a prediction for a given Input. /// /// Can return an Error if the input or model is corrupt. - pub fn predict(&self, x: &InputMatrix) -> Result { + pub fn predict(&self, x: &InputMatrix) -> Result { let _ = x[0][0] + 1_f64; // silence warning for now todo!() } @@ -36,18 +41,35 @@ impl Booster { /// If successful, returns a Result with a m·n matrix, where /// m = number of datasets /// n = number of metrics - pub fn get_eval_results(&self) -> Result>, Error> { + pub fn get_eval_results(&self) -> Result>, LgbmError> { todo!("just ffi call i guess") } - pub fn finetune(&self, data: DataSet) -> Result<(), Error> { + /// this should take &mut self, because it changes the model + pub(crate) fn train_loop(&mut self) -> Result<(), LgbmError> { + let mut is_finished = 0; + while is_finished == 0 { + // callback stuff here + ffi::train_one_step(self.handle, &mut is_finished)? + } + Ok(()) + } + + /// Train a booster further with a new dataset. + /// This should not reset the already existing submodels. + /// Pass an empty array as validation data, if you don't want to validate the train results. + /// TODO validate this after implemented + pub fn finetune( + &mut self, + _train_data: DataSet, + _validation_data: Vec, + ) -> Result<(), LgbmError> { todo!() } } #[cfg(test)] mod tests { - use booster::Booster; #[test] fn simple() {} diff --git a/src/dataset/dataframe.rs b/src/dataset/dataframe.rs index 35a48ef..31da9c8 100644 --- a/src/dataset/dataframe.rs +++ b/src/dataset/dataframe.rs @@ -2,11 +2,11 @@ use polars::prelude::{DataFrame, Float32Type, Float64Type, PolarsError}; use {InputMatrix, OutputVec}; -type FfiError = crate::Error; +type FfiError = crate::LgbmError; pub(crate) fn dataframe_to_mat( dataframe: &mut DataFrame, - label_column: String, + label_column: &str, ) -> Result<(InputMatrix, OutputVec), FfiError> { let label_col_name = label_column.as_str(); let (m, n) = dataframe.shape(); diff --git a/src/dataset/ffi.rs b/src/dataset/ffi.rs index 814635f..6310088 100644 --- a/src/dataset/ffi.rs +++ b/src/dataset/ffi.rs @@ -3,10 +3,9 @@ use std::ffi::CString; use libc::{c_char, c_void}; use lightgbm_sys::DatasetHandle; -use dataset::LoadedDataSet; use {InputMatrix, OutputVec}; -use crate::error::{Error, Result}; +use crate::error::{LgbmError, Result}; pub(crate) fn drop_dataset(handle: DatasetHandle) -> Result<()> { lgbm_call!(lightgbm_sys::LGBM_DatasetFree(handle))?; @@ -16,13 +15,13 @@ pub(crate) fn drop_dataset(handle: DatasetHandle) -> Result<()> { pub(crate) fn load_dataset_from_file( file_path: &str, dataset_params: &str, - reference_dataset: &Option, + reference_dataset: Option, ) -> Result { let file_path_str = CString::new(file_path).unwrap(); let params = CString::new(dataset_params).unwrap(); let mut handle = std::ptr::null_mut(); - let reference = match *reference_dataset { + let reference = match reference_dataset { Some(h) => h, None => std::ptr::null_mut(), }; @@ -41,14 +40,14 @@ pub(crate) fn load_from_vec( data: &InputMatrix, label: &OutputVec, dataset_params: &str, - reference_dataset: &Option, + reference_dataset: Option, ) -> Result { let data_length = data.len(); let feature_length = data[0].len(); let params = CString::new(dataset_params).unwrap(); let label_str = CString::new("label").unwrap(); - let reference = match *reference_dataset { + let reference = match reference_dataset { Some(h) => h, None => std::ptr::null_mut(), }; @@ -58,7 +57,7 @@ pub(crate) fn load_from_vec( let flat_data = data.iter().flatten().collect::>(); if data_length > i32::MAX as usize || feature_length > i32::MAX as usize { - return Err(Error::new(format!( + return Err(LgbmError::new(format!( "received old_dataset of size {}x{}, but at most {}x{} is supported", data_length, feature_length, diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index fe4ce3c..e151b79 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -3,7 +3,7 @@ use lightgbm_sys::DatasetHandle; use polars::prelude::DataFrame; use OutputVec; -use {Error, InputMatrix}; +use {InputMatrix, LgbmError}; #[cfg(feature = "dataframe")] mod dataframe; @@ -20,6 +20,18 @@ pub struct DataSet { params: String, } +impl DataSet { + // ignore params for now + pub fn new(format: DataFormat) -> Self { + Self { + format, + params: "".to_string(), + } + } +} + +/// Represents the different Formats for datasets, that can be loaded into lightgbm. +/// Depending on the type, a different way for processing/loading the data is chosen. #[derive(Clone)] pub enum DataFormat { File { @@ -36,10 +48,15 @@ pub enum DataFormat { }, } +/// Loaded Dataset. Created by calling the load method on a Dataset. +/// This is done by the BoosterBuilder. +/// The DatasetHandle is returned by the lightgbm ffi. pub struct LoadedDataSet { pub(crate) handle: DatasetHandle, + dataset: DataSet, // this can maybe be removed } +/// Data needs to be freed manually impl Drop for LoadedDataSet { fn drop(&mut self) { ffi::drop_dataset(self.handle).expect("Something went wrong dropping the Dataset."); @@ -47,16 +64,22 @@ impl Drop for LoadedDataSet { } impl DataSet { - pub(crate) fn load(&self, reference: &Option) -> Result { + /// Load a Dataset into Lightgbm. Depending on the format, different ffis are used. + /// Either returns a Loaded Dataset or an Error, if lightgbm (or polars) reject the data. + /// This functions is called by the BoosterBuilder + pub(crate) fn load(self, reference: Option) -> Result { let handle = match &self.format { DataFormat::File { path } => ffi::load_dataset_from_file(path, &self.params, reference), DataFormat::Vecs { x, y } => ffi::load_from_vec(x, y, &self.params, reference), #[cfg(feature = "dataframe")] - DataFormat::DataFrame { df, y_column } => { - let (x, y) = dataframe::dataframe_to_mat(dataframe, label_column)?; + DataFormat::DataFrame { mut df, y_column } => { + let (x, y) = dataframe::dataframe_to_mat(&mut df, y_column)?; ffi::load_from_vec(&x, &y, &self.params, reference) } }?; - Ok(LoadedDataSet { handle }) + Ok(LoadedDataSet { + handle, + dataset: self, + }) } } diff --git a/src/error.rs b/src/error.rs index 8564eb8..0fe8e2c 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,7 +1,7 @@ //! Functionality related to errors and error handling. use std::error; -use std::ffi::CStr; +use std::ffi::{CStr, NulError}; use std::fmt::{self, Debug, Display}; use lightgbm_sys; @@ -10,15 +10,15 @@ use lightgbm_sys; use polars::prelude::*; /// Convenience return type for most operations which can return an `LightGBM`. -pub type Result = std::result::Result; +pub type Result = std::result::Result; /// Wrap errors returned by the LightGBM library. #[derive(Debug, Eq, PartialEq)] -pub struct Error { +pub struct LgbmError { desc: String, } -impl Error { +impl LgbmError { pub(crate) fn new>(desc: S) -> Self { Self { desc: desc.into() } } @@ -44,16 +44,24 @@ impl Error { } } -impl error::Error for Error {} +impl error::Error for LgbmError {} -impl Display for Error { +impl Display for LgbmError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "LightGBM error: {}", &self.desc) } } +impl From for LgbmError { + fn from(_: NulError) -> Self { + Self { + desc: "Null Byte found within String".into(), + } + } +} + #[cfg(feature = "dataframe")] -impl From for Error { +impl From for LgbmError { fn from(pe: PolarsError) -> Self { Self { desc: pe.to_string(), @@ -67,10 +75,10 @@ mod tests { #[test] fn return_value_handling() { - let result = Error::check_return_value(0); + let result = LgbmError::check_return_value(0); assert_eq!(result, Ok(())); - let result = Error::check_return_value(-1); - assert_eq!(result, Err(Error::new("Everything is fine"))); + let result = LgbmError::check_return_value(-1); + assert_eq!(result, Err(LgbmError::new("Everything is fine"))); } } diff --git a/src/lib.rs b/src/lib.rs index 6e4062e..aabb993 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,17 +5,18 @@ extern crate serde_json; type InputMatrix = Vec>; type OutputVec = Vec; +extern crate alloc; #[cfg(feature = "dataframe")] extern crate polars; macro_rules! lgbm_call { ($x:expr) => { - Error::check_return_value(unsafe { $x }) + LgbmError::check_return_value(unsafe { $x }) }; } mod error; -pub use error::{Error, Result}; +pub use error::{LgbmError, Result}; mod booster; mod dataset; From 2269ad516944309cc21fdcc23ba3385d134778a5 Mon Sep 17 00:00:00 2001 From: David Schwab Date: Thu, 25 May 2023 12:09:03 +0200 Subject: [PATCH 06/30] use num_iterations --- src/booster/builder.rs | 2 +- src/booster/mod.rs | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/booster/builder.rs b/src/booster/builder.rs index eef4bc0..30dd688 100644 --- a/src/booster/builder.rs +++ b/src/booster/builder.rs @@ -101,7 +101,7 @@ impl BoosterBuilder { train_data: Some(train_data), validation_data: validation_sets, }; - booster.train_loop()?; + booster.train_loop(self.params["num_iterations"])?; // param parsing checked already if present Ok(booster) } diff --git a/src/booster/mod.rs b/src/booster/mod.rs index dd945cd..c6247a4 100644 --- a/src/booster/mod.rs +++ b/src/booster/mod.rs @@ -46,11 +46,13 @@ impl Booster { } /// this should take &mut self, because it changes the model - pub(crate) fn train_loop(&mut self) -> Result<(), LgbmError> { + pub(crate) fn train_loop(&mut self, max_iterations: i32) -> Result<(), LgbmError> { let mut is_finished = 0; - while is_finished == 0 { + let mut i = 0; + while is_finished == 0 && i < max_iterations { // callback stuff here ffi::train_one_step(self.handle, &mut is_finished)? + i+=1; } Ok(()) } From 2089b13915b2cbccde0ebc124b632b6a7c31d73e Mon Sep 17 00:00:00 2001 From: David Schwab Date: Thu, 25 May 2023 12:24:03 +0200 Subject: [PATCH 07/30] implement predict --- src/booster/ffi.rs | 4 ++-- src/booster/mod.rs | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/booster/ffi.rs b/src/booster/ffi.rs index a6dd779..9811ef7 100644 --- a/src/booster/ffi.rs +++ b/src/booster/ffi.rs @@ -54,13 +54,13 @@ pub(crate) fn get_num_classes(booster: BoosterHandle) -> Result { pub(crate) fn predict( booster: BoosterHandle, prediction_params: &str, - data: InputMatrix, + data: &InputMatrix, ) -> Result { let data_length = data.len(); let feature_length = data[0].len(); let params = CString::new(prediction_params)?; let mut out_length: c_longlong = 0; - let flat_data = data.into_iter().flatten().collect::>(); + let flat_data = data.clone().into_iter().flatten().collect::>(); let num_classes = get_num_classes(booster)?; let out_result: Vec = vec![Default::default(); data_length * num_classes as usize]; diff --git a/src/booster/mod.rs b/src/booster/mod.rs index c6247a4..f5030fd 100644 --- a/src/booster/mod.rs +++ b/src/booster/mod.rs @@ -21,7 +21,7 @@ pub struct Booster { validation_data: Vec, } -// exchange params method aswell? does this make sense? +// exchange params method as well? does this make sense? impl Booster { /// Returns a builder. At least training data and params need to be added, /// so that the model can be fitted (built). @@ -30,11 +30,12 @@ impl Booster { } /// Generates a prediction for a given Input. - /// + /// The Output has the same dimensions as the input, + /// because this returns class probabilities. /// Can return an Error if the input or model is corrupt. - pub fn predict(&self, x: &InputMatrix) -> Result { - let _ = x[0][0] + 1_f64; // silence warning for now - todo!() + pub fn predict(&self, x: &InputMatrix) -> Result { + let prediction_params = ""; // do we need this? + ffi::predict(self.handle, prediction_params, x) } /// Returns the scores for the train and validation set. @@ -51,8 +52,8 @@ impl Booster { let mut i = 0; while is_finished == 0 && i < max_iterations { // callback stuff here - ffi::train_one_step(self.handle, &mut is_finished)? - i+=1; + ffi::train_one_step(self.handle, &mut is_finished)?; + i += 1; } Ok(()) } From 3bffc008cc54feb2c016a90573c7813c91c7134a Mon Sep 17 00:00:00 2001 From: David Schwab Date: Thu, 25 May 2023 14:32:56 +0200 Subject: [PATCH 08/30] implemented eval --- src/booster/builder.rs | 15 ++++-- src/booster/ffi.rs | 112 +++++++++++++++++++++++++++++++++++++-- src/booster/mod.rs | 56 +++++++++++++------- src/dataset/dataframe.rs | 4 +- src/dataset/ffi.rs | 6 +-- src/dataset/mod.rs | 8 +-- src/lib.rs | 4 +- src/old_booster.rs | 2 +- 8 files changed, 168 insertions(+), 39 deletions(-) diff --git a/src/booster/builder.rs b/src/booster/builder.rs index 30dd688..95c1e8f 100644 --- a/src/booster/builder.rs +++ b/src/booster/builder.rs @@ -3,7 +3,7 @@ use serde_json::Value; use booster::Booster; use dataset::DataSet; use {booster, LgbmError}; -use {InputMatrix, OutputVec}; +use {LabelVec, Matrixf64}; ///////////////////////////////////////////// // types for training set @@ -16,7 +16,7 @@ pub struct TrainDataMissing; ///////////////////////////////////////////// // types for params #[derive(Clone)] -pub struct ParamsAdded(String); // this should not implement default, so it can safely be used for construction +pub struct ParamsAdded(String, i32); // this should not implement default, so it can safely be used for construction #[derive(Default, Clone)] pub struct ParamsMissing; ///////////////////////////////////////////// @@ -48,9 +48,14 @@ impl BoosterBuilder { /// Adds params to the Booster. /// Returns Error, if param parsing returns Error. pub fn add_params(self, params: Value) -> Result, LgbmError> { + let num_iterations = params + .get("num_iterations") + .ok_or(LgbmError::new("Num iterations in params missing."))? + .as_i64() + .ok_or(LgbmError::new("Invalid Value for num iterations."))?; let parsed_params = parse_params(params)?; Ok(BoosterBuilder { - params: ParamsAdded(parsed_params), + params: ParamsAdded(parsed_params, num_iterations as i32), train_data: self.train_data, val_data: self.val_data, }) @@ -101,14 +106,14 @@ impl BoosterBuilder { train_data: Some(train_data), validation_data: validation_sets, }; - booster.train_loop(self.params["num_iterations"])?; // param parsing checked already if present + booster.train_loop(self.params.1)?; // param parsing checked already if present Ok(booster) } /// Build the Booster with fit and immediately predict for the given input. /// Can Fail in fit if the Booster isn't correctly build or in predict if the Input Data /// is corrupted. - pub fn fit_predict(self, x: &InputMatrix) -> Result<(Booster, OutputVec), LgbmError> { + pub fn fit_predict(self, x: &Matrixf64) -> Result<(Booster, Matrixf64), LgbmError> { let booster = self.fit()?; let y = booster.predict(x)?; Ok((booster, y)) diff --git a/src/booster/ffi.rs b/src/booster/ffi.rs index 9811ef7..25d193c 100644 --- a/src/booster/ffi.rs +++ b/src/booster/ffi.rs @@ -3,7 +3,7 @@ use std::ffi::CString; use libc::{c_char, c_double, c_longlong, c_void}; use lightgbm_sys::{BoosterHandle, DatasetHandle}; -use {lightgbm_sys, InputMatrix}; +use {lightgbm_sys, Matrixf64}; use crate::{LgbmError, Result}; @@ -54,8 +54,8 @@ pub(crate) fn get_num_classes(booster: BoosterHandle) -> Result { pub(crate) fn predict( booster: BoosterHandle, prediction_params: &str, - data: &InputMatrix, -) -> Result { + data: &Matrixf64, +) -> Result { let data_length = data.len(); let feature_length = data[0].len(); let params = CString::new(prediction_params)?; @@ -91,6 +91,112 @@ pub(crate) fn predict( Ok(reshaped_output) } +/// Get number of evaluation metrics +pub(crate) fn num_eval(handle: BoosterHandle) -> Result { + let mut out_len = 0; + lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalCounts( + handle, + &mut out_len + ))?; + Ok(out_len) +} + +/// Get names of evaluation metrics +pub(crate) fn get_eval_names(handle: BoosterHandle) -> Result> { + let num_metrics = num_eval(handle)?; + + ///////////////////////////////////////////////////////////////////// + // call with 0-sized buffer to find out how much space to allocate + ///////////////////////////////////////////////////////////////////// + let mut num_eval_names = 0; + let mut out_buffer_len = 0; + + lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalNames( + handle, + 0, + &mut num_eval_names, + 0, + &mut out_buffer_len, + std::ptr::null_mut() as *mut *mut c_char + ))?; + + ///////////////////////////////////////////////////////////////////// + // sanity check + ///////////////////////////////////////////////////////////////////// + if num_eval_names != num_metrics { + return Err(LgbmError::new(format!( + "expected num_eval_names==num_metrics, but got {num_eval_names}!={num_metrics}. This is a bug in lightgbm or its rust wrapper" + ))); + } + + ///////////////////////////////////////////////////////////////////// + // get the actual strings + ///////////////////////////////////////////////////////////////////// + + let mut out_strs = (0..num_metrics) + .map(|_| (0..out_buffer_len).map(|_| 0).collect::>()) + .collect::>(); + + let mut out_strs_pointers = out_strs + .iter_mut() + .map(|s| s.as_mut_ptr()) + .collect::>(); + + let metric_name_length = out_buffer_len; + + lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalNames( + handle, + num_metrics, + &mut num_eval_names, + metric_name_length, + &mut out_buffer_len, + out_strs_pointers.as_mut_ptr() as *mut *mut c_char + ))?; + + drop(out_strs_pointers); // don't let pointers outlive their target + + let mut output = Vec::with_capacity(out_strs.len()); + for mut out_str in out_strs { + let first_null = out_str + .iter() + .enumerate() + .find(|(_, e)| **e == 0) + .map(|(i, _)| i) + .expect("string not null terminated, possible memory corruption"); + out_str.truncate(first_null + 1); + + let string = CString::from_vec_with_nul(out_str) + .expect("string memory invariant violated, possible memory corruption") + .into_string() + .map_err(|_| LgbmError::new("name not valid UTF-8"))?; + output.push(string); + } + + Ok(output) +} + +pub(crate) fn get_eval_scores( + handle: BoosterHandle, + data_index: i32, + num_metrics: usize, +) -> Result> { + let mut out_len = 0; + let out_result: Vec = vec![Default::default(); num_metrics]; + lgbm_call!(lightgbm_sys::LGBM_BoosterGetEval( + handle, + data_index, + &mut out_len, + out_result.as_ptr() as *mut c_double + ))?; + if out_len != out_result.len() as i32 { + Err(LgbmError::new( + "Output Array length doesn't match reported length.", + )) + } else { + Ok(out_result) + } +} + pub(crate) fn num_feature(booster: BoosterHandle) -> Result { let mut out_len = 0; lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumFeature( diff --git a/src/booster/mod.rs b/src/booster/mod.rs index f5030fd..7717fd6 100644 --- a/src/booster/mod.rs +++ b/src/booster/mod.rs @@ -1,7 +1,7 @@ use booster::builder::{BoosterBuilder, ParamsMissing, TrainDataMissing}; use dataset::{DataSet, LoadedDataSet}; use LgbmError; -use {InputMatrix, OutputVec}; +use {LabelVec, Matrixf64}; mod builder; mod ffi; @@ -33,21 +33,39 @@ impl Booster { /// The Output has the same dimensions as the input, /// because this returns class probabilities. /// Can return an Error if the input or model is corrupt. - pub fn predict(&self, x: &InputMatrix) -> Result { + pub fn predict(&self, x: &Matrixf64) -> Result { let prediction_params = ""; // do we need this? ffi::predict(self.handle, prediction_params, x) } - /// Returns the scores for the train and validation set. - /// If successful, returns a Result with a m·n matrix, where - /// m = number of datasets - /// n = number of metrics - pub fn get_eval_results(&self) -> Result>, LgbmError> { - todo!("just ffi call i guess") + /// Returns the scores for a certain dataset. You can use the index like this: + /// 0 = Train Dataset + /// 1 = 1. Validation Dataset + /// 2 = 2. Validation Dataset + /// ... + /// n = nth Validation Dataset + pub fn get_eval_result_for_dataset( + &self, + dataset_index: i32, + ) -> Result, LgbmError> { + if dataset_index > self.validation_data.len() as i32 { + return Err(LgbmError::new(format!( + "Invalid Dataset Index. Given: {} Max Allowed: {}", + dataset_index, + self.validation_data.len() + ))); + } + let names = ffi::get_eval_names(self.handle)?; + let scores = ffi::get_eval_scores(self.handle, dataset_index, names.len())?; + Ok(names + .into_iter() + .zip(scores) + .map(|(metric_name, score)| EvalResult { metric_name, score }) + .collect()) } /// this should take &mut self, because it changes the model - pub(crate) fn train_loop(&mut self, max_iterations: i32) -> Result<(), LgbmError> { + fn train_loop(&mut self, max_iterations: i32) -> Result<(), LgbmError> { let mut is_finished = 0; let mut i = 0; while is_finished == 0 && i < max_iterations { @@ -58,17 +76,17 @@ impl Booster { Ok(()) } - /// Train a booster further with a new dataset. - /// This should not reset the already existing submodels. - /// Pass an empty array as validation data, if you don't want to validate the train results. - /// TODO validate this after implemented + /* /// Train a booster further with a new dataset. + /// This should not reset the already existing submodels. + /// Pass an empty array as validation data, if you don't want to validate the train results. + /// TODO validate this after implemented pub fn finetune( - &mut self, - _train_data: DataSet, - _validation_data: Vec, - ) -> Result<(), LgbmError> { - todo!() - } + &mut self, + _train_data: DataSet, + _validation_data: Vec, + ) -> Result<(), LgbmError> { + + }*/ } #[cfg(test)] diff --git a/src/dataset/dataframe.rs b/src/dataset/dataframe.rs index 31da9c8..5a9e413 100644 --- a/src/dataset/dataframe.rs +++ b/src/dataset/dataframe.rs @@ -1,13 +1,13 @@ use polars::prelude::{DataFrame, Float32Type, Float64Type, PolarsError}; -use {InputMatrix, OutputVec}; +use {LabelVec, Matrixf64}; type FfiError = crate::LgbmError; pub(crate) fn dataframe_to_mat( dataframe: &mut DataFrame, label_column: &str, -) -> Result<(InputMatrix, OutputVec), FfiError> { +) -> Result<(Matrixf64, LabelVec), FfiError> { let label_col_name = label_column.as_str(); let (m, n) = dataframe.shape(); let label_series = &dataframe.select_series(label_col_name)?[0].cast::()?; diff --git a/src/dataset/ffi.rs b/src/dataset/ffi.rs index 6310088..026743b 100644 --- a/src/dataset/ffi.rs +++ b/src/dataset/ffi.rs @@ -3,7 +3,7 @@ use std::ffi::CString; use libc::{c_char, c_void}; use lightgbm_sys::DatasetHandle; -use {InputMatrix, OutputVec}; +use {LabelVec, Matrixf64}; use crate::error::{LgbmError, Result}; @@ -37,8 +37,8 @@ pub(crate) fn load_dataset_from_file( } pub(crate) fn load_from_vec( - data: &InputMatrix, - label: &OutputVec, + data: &Matrixf64, + label: &LabelVec, dataset_params: &str, reference_dataset: Option, ) -> Result { diff --git a/src/dataset/mod.rs b/src/dataset/mod.rs index e151b79..9f57f07 100644 --- a/src/dataset/mod.rs +++ b/src/dataset/mod.rs @@ -2,8 +2,8 @@ use lightgbm_sys::DatasetHandle; #[cfg(feature = "dataframe")] use polars::prelude::DataFrame; -use OutputVec; -use {InputMatrix, LgbmError}; +use LabelVec; +use {LgbmError, Matrixf64}; #[cfg(feature = "dataframe")] mod dataframe; @@ -38,8 +38,8 @@ pub enum DataFormat { path: String, }, Vecs { - x: InputMatrix, - y: OutputVec, + x: Matrixf64, + y: LabelVec, }, #[cfg(feature = "dataframe")] DataFrame { diff --git a/src/lib.rs b/src/lib.rs index aabb993..4b61621 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,8 +2,8 @@ extern crate libc; extern crate lightgbm_sys; extern crate serde_json; -type InputMatrix = Vec>; -type OutputVec = Vec; +type Matrixf64 = Vec>; +type LabelVec = Vec; extern crate alloc; #[cfg(feature = "dataframe")] diff --git a/src/old_booster.rs b/src/old_booster.rs index 47280a3..5514daa 100644 --- a/src/old_booster.rs +++ b/src/old_booster.rs @@ -608,7 +608,7 @@ mod tests { assert!(invalid_res.is_err()); assert_eq!(train_res[0].metric, "auc"); assert_eq!(val_res[1].metric, "l1"); - assert!(0.0 <= train_res[0].score && train_res[0].score <= 1.0); // make shure values make sense + assert!(0.0 <= train_res[0].score && train_res[0].score <= 1.0); // make sure values make sense assert!(0.0 <= train_res[1].score); } From dbdb85ffa8ccb6f93a2bccac0128dc0cf1df2baf Mon Sep 17 00:00:00 2001 From: David Schwab Date: Thu, 25 May 2023 14:33:24 +0200 Subject: [PATCH 09/30] clippy fix --- src/booster/builder.rs | 2 +- src/booster/mod.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/booster/builder.rs b/src/booster/builder.rs index 95c1e8f..c925f9f 100644 --- a/src/booster/builder.rs +++ b/src/booster/builder.rs @@ -3,7 +3,7 @@ use serde_json::Value; use booster::Booster; use dataset::DataSet; use {booster, LgbmError}; -use {LabelVec, Matrixf64}; +use {Matrixf64}; ///////////////////////////////////////////// // types for training set diff --git a/src/booster/mod.rs b/src/booster/mod.rs index 7717fd6..0687a69 100644 --- a/src/booster/mod.rs +++ b/src/booster/mod.rs @@ -1,7 +1,7 @@ use booster::builder::{BoosterBuilder, ParamsMissing, TrainDataMissing}; -use dataset::{DataSet, LoadedDataSet}; +use dataset::{LoadedDataSet}; use LgbmError; -use {LabelVec, Matrixf64}; +use {Matrixf64}; mod builder; mod ffi; From 52c1c1d074edbe7c5dfe2bf00266c9bac2be7696 Mon Sep 17 00:00:00 2001 From: David Schwab Date: Thu, 25 May 2023 14:34:13 +0200 Subject: [PATCH 10/30] remove old files --- src/old_booster.rs | 678 --------------------------------------------- src/old_dataset.rs | 388 -------------------------- 2 files changed, 1066 deletions(-) delete mode 100644 src/old_booster.rs delete mode 100644 src/old_dataset.rs diff --git a/src/old_booster.rs b/src/old_booster.rs deleted file mode 100644 index 5514daa..0000000 --- a/src/old_booster.rs +++ /dev/null @@ -1,678 +0,0 @@ -use std; -use std::convert::TryInto; -use std::ffi::CString; - -use libc::{c_char, c_double, c_longlong, c_void}; -use lightgbm_sys; -use serde_json::Value; - -use crate::{Dataset, Error, Result}; - -/// Core model in LightGBM, containing functions for training, evaluating and predicting. -pub struct Booster { - handle: lightgbm_sys::BoosterHandle, -} - -/// Represents the score during training on either the train or validation set -#[derive(Debug, PartialEq)] -pub struct EvalResult { - pub metric: String, - pub score: f64, -} - -impl Booster { - fn new(handle: lightgbm_sys::BoosterHandle) -> Self { - Booster { handle } - } - - /// Init from model file. - pub fn from_file(filename: &str) -> Result { - let filename_str = CString::new(filename).unwrap(); - let mut out_num_iterations = 0; - let mut handle = std::ptr::null_mut(); - lgbm_call!(lightgbm_sys::LGBM_BoosterCreateFromModelfile( - filename_str.as_ptr() as *const c_char, - &mut out_num_iterations, - &mut handle - ))?; - - Ok(Booster::new(handle)) - } - - /// Init from model string. - pub fn from_string(model_description: &str) -> Result { - let cstring = CString::new(model_description).unwrap(); - let mut out_num_iterations = 0; - let mut handle = std::ptr::null_mut(); - lgbm_call!(lightgbm_sys::LGBM_BoosterLoadModelFromString( - cstring.as_ptr() as *const c_char, - &mut out_num_iterations, - &mut handle - ))?; - - Ok(Booster::new(handle)) - } - - /// Create a new Booster model with given Dataset and parameters. - /// - /// Example - /// ``` - /// extern crate serde_json; - /// use lightgbm::{Dataset, Booster}; - /// use serde_json::json; - /// - /// let data = vec![vec![1.0, 0.1, 0.2, 0.1], - /// vec![0.7, 0.4, 0.5, 0.1], - /// vec![0.9, 0.8, 0.5, 0.1], - /// vec![0.2, 0.2, 0.8, 0.7], - /// vec![0.1, 0.7, 1.0, 0.9]]; - /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - /// let dataset = Dataset::from_mat(data, label).unwrap(); - /// let params = json!{ - /// { - /// "num_iterations": 3, - /// "objective": "binary", - /// "metric": "auc" - /// } - /// }; - /// let bst = Booster::train(dataset, None, ¶ms).unwrap(); - /// ``` - /// Validation data can be provided aswell. - /// ``` - /// extern crate serde_json; - /// use lightgbm::{Dataset, Booster}; - /// use serde_json::json; - /// - /// let data = vec![vec![1.0, 0.1, 0.2, 0.1], - /// vec![0.7, 0.4, 0.5, 0.1], - /// vec![0.9, 0.8, 0.5, 0.1], - /// vec![0.2, 0.2, 0.8, 0.7], - /// vec![0.1, 0.7, 1.0, 0.9]]; - /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - /// let train_data = Dataset::from_mat(data, label).unwrap(); - /// - /// let data = vec![ - /// vec![0.9, 0.6, 0.2, 0.1], - /// vec![0.5, 0.7, 0.2, 0.1], - /// vec![0.2, 0.1, 0.6, 0.8]]; - /// let label = vec![0.0, 0.0, 1.0]; - /// let val_data = Dataset::from_mat(data, label); - /// - /// let params = json!{ - /// { - /// "num_iterations": 3, - /// "objective": "binary", - /// "metric": "auc" - /// } - /// }; - /// - /// let bst = Booster::train(train_data, val_data.ok(), ¶ms).unwrap(); - /// ``` - pub fn train( - train_data: Dataset, - val_data: Option, - parameter: &Value, - ) -> Result { - // get num_iterations - let num_iterations: i64 = if parameter["num_iterations"].is_null() { - 100 - } else { - parameter["num_iterations"].as_i64().unwrap() - }; - - // exchange params {"x": "y", "z": 1} => "x=y z=1" - // and {"k" = ["a", "b"]} => "k=a,b" - let params_string = parameter - .as_object() - .unwrap() - .iter() - .map(|(k, v)| match v { - Value::Array(a) => { - let v_formatted = a.iter().map(|x| x.to_string() + ",").collect::(); - let v_formatted = v_formatted - .replace("\",\"", ",") - .trim_end_matches(',') - .to_string(); - (k, v_formatted) - } - _ => (k, v.to_string()), - }) - .map(|(k, v)| format!("{}={}", k, v)) - .collect::>() - .join(" "); - let params_cstring = CString::new(params_string).unwrap(); - - let mut handle = std::ptr::null_mut(); - lgbm_call!(lightgbm_sys::LGBM_BoosterCreate( - train_data.handle, - params_cstring.as_ptr() as *const c_char, - &mut handle - ))?; - - // the following has to borrow val_data to avoid dropping the old_dataset - if let Some(validation_data) = &val_data { - lgbm_call!(lightgbm_sys::LGBM_BoosterAddValidData( - handle, - validation_data.handle - ))?; - } - - let mut is_finished: i32 = 0; - for _ in 0..num_iterations { - lgbm_call!(lightgbm_sys::LGBM_BoosterUpdateOneIter( - handle, - &mut is_finished - ))?; - } - Ok(Booster::new(handle)) - } - - /// Predict results for given data. - /// - /// Input data example - /// ``` - /// let data = vec![vec![1.0, 0.1, 0.2], - /// vec![0.7, 0.4, 0.5], - /// vec![0.1, 0.7, 1.0]]; - /// ``` - /// - /// Output data example - /// ``` - /// let output = vec![vec![1.0, 0.109, 0.433]]; - /// ``` - pub fn predict(&self, data: Vec>) -> Result>> { - let data_length = data.len(); - let feature_length = data[0].len(); - let params = CString::new("").unwrap(); - let mut out_length: c_longlong = 0; - let flat_data = data.into_iter().flatten().collect::>(); - - // get num_class - let mut num_class = 0; - lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumClasses( - self.handle, - &mut num_class - ))?; - - let out_result: Vec = vec![Default::default(); data_length * num_class as usize]; - - lgbm_call!(lightgbm_sys::LGBM_BoosterPredictForMat( - self.handle, - flat_data.as_ptr() as *const c_void, - lightgbm_sys::C_API_DTYPE_FLOAT64 as i32, - data_length as i32, - feature_length as i32, - 1_i32, - 0_i32, - 0_i32, - -1_i32, - params.as_ptr() as *const c_char, - &mut out_length, - out_result.as_ptr() as *mut c_double - ))?; - - // reshape for multiclass [1,2,3,4,5,6] -> [[1,2,3], [4,5,6]] # 3 class - let reshaped_output = if num_class > 1 { - out_result - .chunks(num_class as usize) - .map(|x| x.to_vec()) - .collect() - } else { - vec![out_result] - }; - Ok(reshaped_output) - } - - /// Get Feature Num. - pub fn num_feature(&self) -> Result { - let mut out_len = 0; - lgbm_call!(lightgbm_sys::LGBM_BoosterGetNumFeature( - self.handle, - &mut out_len - ))?; - Ok(out_len) - } - - /// Get Feature Names. - pub fn feature_name(&self) -> Result> { - let num_feature = self.num_feature()?; - let feature_name_length = 32; - let mut num_feature_names = 0; - let mut out_buffer_len = 0; - let out_strs = (0..num_feature) - .map(|_| { - CString::new(" ".repeat(feature_name_length)) - .unwrap() - .into_raw() as *mut c_char - }) - .collect::>(); - lgbm_call!(lightgbm_sys::LGBM_BoosterGetFeatureNames( - self.handle, - num_feature, - &mut num_feature_names, - feature_name_length as u64, - &mut out_buffer_len, - out_strs.as_ptr() as *mut *mut c_char - ))?; - let output: Vec = out_strs - .into_iter() - .map(|s| unsafe { CString::from_raw(s).into_string().unwrap() }) - .collect(); - Ok(output) - } - - /// Get number of evaluation metrics - pub fn num_eval(&self) -> Result { - let mut out_len = 0; - lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalCounts( - self.handle, - &mut out_len - ))?; - Ok(out_len) - } - - /// Get names of evaluation metrics - pub fn eval_names(&self) -> Result> { - let num_metrics = self.num_eval()?; - - ///////////////////////////////////////////////////////////////////// - // call with 0-sized buffer to find out how much space to allocate - ///////////////////////////////////////////////////////////////////// - let mut num_eval_names = 0; - let mut out_buffer_len = 0; - - lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalNames( - self.handle, - 0, - &mut num_eval_names, - 0, - &mut out_buffer_len, - std::ptr::null_mut() as *mut *mut c_char - )) - .unwrap(); - - ///////////////////////////////////////////////////////////////////// - // sanity check - ///////////////////////////////////////////////////////////////////// - if num_eval_names != num_metrics { - return Err(Error::new(format!( - "expected num_eval_names==num_metrics, but got {num_eval_names}!={num_metrics}. This is a bug in lightgbm or its rust wrapper" - ))); - } - - ///////////////////////////////////////////////////////////////////// - // get the actual strings - ///////////////////////////////////////////////////////////////////// - - let mut out_strs = (0..num_metrics) - .map(|_| (0..out_buffer_len).map(|_| 0).collect::>()) - .collect::>(); - - let mut out_strs_pointers = out_strs - .iter_mut() - .map(|s| s.as_mut_ptr()) - .collect::>(); - - let metric_name_length = out_buffer_len; - - lgbm_call!(lightgbm_sys::LGBM_BoosterGetEvalNames( - self.handle, - num_metrics, - &mut num_eval_names, - metric_name_length, - &mut out_buffer_len, - out_strs_pointers.as_mut_ptr() as *mut *mut c_char - )) - .unwrap(); - - drop(out_strs_pointers); // don't let pointers outlive their target - - let mut output = Vec::with_capacity(out_strs.len()); - for mut out_str in out_strs { - let first_null = out_str - .iter() - .enumerate() - .find(|(_, e)| **e == 0) - .map(|(i, _)| i) - .expect("string not null terminated, possible memory corruption"); - out_str.truncate(first_null + 1); - - let string = CString::from_vec_with_nul(out_str) - .expect("string memory invariant violated, possible memory corruption") - .into_string() - .map_err(|_| Error::new("name not valid UTF-8"))?; - output.push(string); - } - - Ok(output) - } - - pub fn get_eval(&self, data_index: i32) -> Result> { - let names = self.eval_names()?; - let mut out_len = 0; - let out_result: Vec = vec![Default::default(); names.len()]; - lgbm_call!(lightgbm_sys::LGBM_BoosterGetEval( - self.handle, - data_index, - &mut out_len, - out_result.as_ptr() as *mut c_double - ))?; - Ok(names - .into_iter() - .zip(out_result) - .map(|(metric, score)| EvalResult { metric, score }) - .collect()) - } - - // Get Feature Importance - pub fn feature_importance(&self) -> Result> { - let num_feature = self.num_feature()?; - let out_result: Vec = vec![Default::default(); num_feature as usize]; - lgbm_call!(lightgbm_sys::LGBM_BoosterFeatureImportance( - self.handle, - 0_i32, - 0_i32, - out_result.as_ptr() as *mut c_double - ))?; - Ok(out_result) - } - - /// Save model to file. - pub fn save_file(&self, filename: &str) -> Result<()> { - let filename_str = CString::new(filename).unwrap(); - lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModel( - self.handle, - 0_i32, - -1_i32, - 0_i32, - filename_str.as_ptr() as *const c_char - ))?; - Ok(()) - } - - /// Returns the size the model would have if saved using `save_file`, without having to write the file - pub fn save_file_size(&self) -> Result { - let mut out_size = 0_i64; - lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModelToString( - self.handle, - 0_i32, - -1_i32, - 0_i32, - 0, - &mut out_size as *mut _, - std::ptr::null_mut() as *mut i8 - ))?; - // subtract 1 because the file doesn't contain the final null character - (out_size - 1) - .try_into() - .map_err(|_| Error::new("size negative")) - } - - /// Save model to string. This returns the same content that `save_file` writes into a file. - pub fn save_string(&self) -> Result { - // get necessary buffer size - - let mut out_size = 0_i64; - lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModelToString( - self.handle, - 0_i32, - -1_i32, - 0_i32, - 0, - &mut out_size as *mut _, - std::ptr::null_mut() as *mut i8 - ))?; - - // write data to buffer and convert - let mut buffer = vec![ - 0u8; - out_size - .try_into() - .map_err(|_| Error::new("size negative"))? - ]; - lgbm_call!(lightgbm_sys::LGBM_BoosterSaveModelToString( - self.handle, - 0_i32, - -1_i32, - 0_i32, - buffer.len() as c_longlong, - &mut out_size as *mut _, - buffer.as_mut_ptr() as *mut c_char - ))?; - - if buffer.pop() != Some(0) { - // this should never happen, unless lightgbm has a bug - panic!("write out of bounds happened in lightgbm call"); - } - - let cstring = CString::new(buffer).map_err(|e| Error::new(e.to_string()))?; - cstring - .into_string() - .map_err(|_| Error::new("can't convert model string to unicode")) - } -} - -impl Drop for Booster { - fn drop(&mut self) { - lgbm_call!(lightgbm_sys::LGBM_BoosterFree(self.handle)).unwrap(); - } -} - -#[cfg(test)] -mod tests { - use std::fs; - use std::path::Path; - - use serde_json::json; - - use super::*; - - fn _read_train_file() -> Result { - Dataset::from_file( - &"lightgbm-sys/lightgbm/examples/binary_classification/binary.train", - None, - ) - } - - fn _train_booster(params: &Value) -> Booster { - let dataset = _read_train_file().unwrap(); - Booster::train(dataset, None, ¶ms).unwrap() - } - - fn _default_params() -> Value { - let params = json! { - { - "num_iterations": 1, - "objective": "binary", - "metric": "auc", - "data_random_seed": 0 - } - }; - params - } - - #[test] - fn predict() { - let params = json! { - { - "num_iterations": 10, - "objective": "binary", - "metric": "auc", - "data_random_seed": 0 - } - }; - let bst = _train_booster(¶ms); - let feature = vec![vec![0.5; 28], vec![0.0; 28], vec![0.9; 28]]; - let result = bst.predict(feature).unwrap(); - let mut normalized_result = Vec::new(); - for r in &result[0] { - normalized_result.push(if r > &0.5 { 1 } else { 0 }); - } - assert_eq!(normalized_result, vec![0, 0, 1]); - } - - #[test] - fn num_feature() { - let params = _default_params(); - let bst = _train_booster(¶ms); - let num_feature = bst.num_feature().unwrap(); - assert_eq!(num_feature, 28); - } - - #[test] - fn eval_names() { - let params = json! { - { - "num_iterations": 1, - "objective": "binary", - "metrics": ["auc", "l1"], - "data_random_seed": 0 - } - }; - let bst = _train_booster(¶ms); - let eval_names = bst.eval_names().unwrap(); - assert_eq!(eval_names, vec!["auc", "l1"]) - } - - #[test] - fn get_eval_sample_dataset() { - let params = json! { - { - "num_iterations": 30, - "objective": "binary", - "boosting_type": "gbdt", - "metrics": ["binary_logloss","auc"], - "label_column": 0, - "max_bin": 255, - "tree_learner": "serial", - "feature_fraction": 0.8, - "is_enable_sparse": true, - "data_random_seed": 0 - } - }; - let train = Dataset::from_file( - "lightgbm-sys/lightgbm/examples/binary_classification/binary.train", - None, - ) - .unwrap(); - let val = Dataset::from_file( - "lightgbm-sys/lightgbm/examples/binary_classification/binary.test", - Some(&train), - ) - .unwrap(); - - let bst = Booster::train(train, Some(val), ¶ms).unwrap(); - //let bst = Booster::train(train, None, ¶ms).unwrap(); - - let eval_train = bst.get_eval(0); - let eval_val = bst.get_eval(1); - assert!(eval_val.is_ok()); - assert!(eval_train.is_ok()); - let eval_invalid = bst.get_eval(420); - assert!(eval_invalid.is_err()); - } - - #[test] - fn get_eval() { - let data = vec![ - vec![1.0, 0.1, 0.2, 0.1], - vec![0.7, 0.4, 0.5, 0.1], - vec![0.9, 0.8, 0.5, 0.1], - vec![0.2, 0.2, 0.8, 0.7], - vec![0.1, 0.7, 1.0, 0.9], - ]; - let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - let train_data = Dataset::from_mat(data, label, None).unwrap(); - - let data = vec![ - vec![0.9, 0.6, 0.2, 0.1], - vec![0.5, 0.7, 0.2, 0.1], - vec![0.2, 0.1, 0.6, 0.8], - ]; - let label = vec![0.0, 0.0, 1.0]; - let val_data = Dataset::from_mat(data, label, None); - - let params = json! { - { - "num_iterations": 3, - "objective": "binary", - "metric": ["auc","l1"] - } - }; - - let bst = Booster::train(train_data, val_data.ok(), ¶ms).unwrap(); - - let train_res = bst.get_eval(0).unwrap(); - let val_res = bst.get_eval(1).unwrap(); - let invalid_res = bst.get_eval(420); - assert!(invalid_res.is_err()); - assert_eq!(train_res[0].metric, "auc"); - assert_eq!(val_res[1].metric, "l1"); - assert!(0.0 <= train_res[0].score && train_res[0].score <= 1.0); // make sure values make sense - assert!(0.0 <= train_res[1].score); - } - - #[test] - fn feature_importance() { - let mut params = _default_params(); - params["num_iterations"] = "0".parse().unwrap(); - let bst = _train_booster(¶ms); - let feature_importance = bst.feature_importance().unwrap(); - assert_eq!(feature_importance, vec![0.0; 28]); - } - - #[test] - fn feature_name() { - let params = _default_params(); - let bst = _train_booster(¶ms); - let feature_name = bst.feature_name().unwrap(); - let target = (0..28).map(|i| format!("Column_{}", i)).collect::>(); - assert_eq!(feature_name, target); - } - - #[test] - fn save_file() { - let params = _default_params(); - let bst = _train_booster(¶ms); - assert_eq!(bst.save_file(&"./test/test_save_file.output"), Ok(())); - assert!(Path::new("./test/test_save_file.output").exists()); - let _ = fs::remove_file("./test/test_save_file.output"); - } - - #[test] - fn save_file_size() { - let params = _default_params(); - let bst = _train_booster(¶ms); - let filename = "./test/test_save_file_size.output"; - assert_eq!(bst.save_file(filename), Ok(())); - let file_size = Path::new(filename).metadata().unwrap().len(); - assert!(file_size > 0); - assert_eq!(bst.save_file_size(), Ok(file_size)); - let _ = fs::remove_file(filename); - } - - #[test] - fn save_string() { - let params = _default_params(); - let bst = _train_booster(¶ms); - let filename = "./test/test_save_string.output"; - assert_eq!(bst.save_file(&filename), Ok(())); - assert!(Path::new(&filename).exists()); - let booster_file_content = fs::read_to_string(&filename).unwrap(); - let _ = fs::remove_file("./test/test_save_file.output"); - - assert!(!booster_file_content.is_empty()); - assert_eq!(Ok(booster_file_content), bst.save_string()) - } - - #[test] - fn from_file() { - let _ = Booster::from_file(&"./test/test_from_file.input"); - } - - #[test] - fn from_string() { - let model_string = fs::read_to_string("./test/test_from_file.input").unwrap(); - Booster::from_string(&model_string).unwrap(); - } -} diff --git a/src/old_dataset.rs b/src/old_dataset.rs deleted file mode 100644 index ad0ea35..0000000 --- a/src/old_dataset.rs +++ /dev/null @@ -1,388 +0,0 @@ -use libc::{c_char, c_void}; -use lightgbm_sys; -use lightgbm_sys::DatasetHandle; -use std; -use std::convert::TryInto; -use std::ffi::CString; - -#[cfg(feature = "dataframe")] -use polars::prelude::*; - -use crate::{Error, Result}; - -/// Dataset used throughout LightGBM for training. -/// -/// # Examples -/// -/// ## from mat -/// -/// ``` -/// use lightgbm::Dataset; -/// -/// let data = vec![vec![1.0, 0.1, 0.2, 0.1], -/// vec![0.7, 0.4, 0.5, 0.1], -/// vec![0.9, 0.8, 0.5, 0.1], -/// vec![0.2, 0.2, 0.8, 0.7], -/// vec![0.1, 0.7, 1.0, 0.9]]; -/// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; -/// let dataset = Dataset::from_mat(data, label).unwrap(); -/// ``` -/// -/// ## from file -/// -/// ``` -/// use lightgbm::Dataset; -/// -/// let dataset = Dataset::from_file(&"lightgbm-sys/lightgbm/examples/binary_classification/binary.train", None).unwrap(); -/// ``` -pub struct Dataset { - pub(crate) handle: lightgbm_sys::DatasetHandle, -} - -#[link(name = "c")] -impl Dataset { - fn new(handle: lightgbm_sys::DatasetHandle) -> Self { - Self { handle } - } - - /// Create a new `Dataset` from dense array in row-major order. - /// - /// Example - /// ``` - /// use lightgbm::Dataset; - /// - /// let data = vec![vec![1.0, 0.1, 0.2, 0.1], - /// vec![0.7, 0.4, 0.5, 0.1], - /// vec![0.9, 0.8, 0.5, 0.1], - /// vec![0.2, 0.2, 0.8, 0.7], - /// vec![0.1, 0.7, 1.0, 0.9]]; - /// let label = vec![0.0, 0.0, 0.0, 1.0, 1.0]; - /// let dataset = Dataset::from_mat(data, label).unwrap(); - /// ``` - pub fn from_mat( - data: Vec>, - label: Vec, - reference_dataset: Option<&Dataset>, - ) -> Result { - let data_length = data.len(); - let feature_length = data[0].len(); - let params = CString::new("").unwrap(); - let label_str = CString::new("label").unwrap(); - - let reference = match reference_dataset { - Some(h) => h.handle.clone(), - None => std::ptr::null_mut(), - }; - - let mut handle = std::ptr::null_mut(); - let flat_data = data.into_iter().flatten().collect::>(); - - if data_length > i32::MAX as usize || feature_length > i32::MAX as usize { - return Err(Error::new(format!( - "received old_dataset of size {}x{}, but at most {}x{} is supported", - data_length, - feature_length, - i32::MAX, - i32::MAX - ))); - } - - lgbm_call!(lightgbm_sys::LGBM_DatasetCreateFromMat( - flat_data.as_ptr() as *const c_void, - lightgbm_sys::C_API_DTYPE_FLOAT64 as i32, - data_length as i32, - feature_length as i32, - 1_i32, - params.as_ptr() as *const c_char, - reference, - &mut handle - ))?; - - lgbm_call!(lightgbm_sys::LGBM_DatasetSetField( - handle, - label_str.as_ptr() as *const c_char, - label.as_ptr() as *const c_void, - data_length as i32, - lightgbm_sys::C_API_DTYPE_FLOAT32 as i32 - ))?; - - Ok(Self::new(handle)) - } - - /// Create a new `Dataset` from file. - /// - /// file is `tsv`. - /// ```text - ///