From 6978bd0e1dfdacae451ee285f0e46356442a2574 Mon Sep 17 00:00:00 2001 From: "joost.neujens" Date: Fri, 16 Jun 2023 18:13:51 +0200 Subject: [PATCH 1/4] #143 fix: serialization-deserialization bug --- .gitignore | 1 + cobra/model_building/__init__.py | 26 +- cobra/preprocessing/preprocessor.py | 5 + cobra/preprocessing/target_encoder.py | 3 +- cobra/utils.py | 48 +- docs/make.bat | 70 +- notebooks/debugging.ipynb | 1364 +++++++++++++++++ notebooks/model_json.json | 216 +++ .../model_building/test_forward_selection.py | 426 ++--- tests/model_building/test_models.py | 516 +++---- .../test_categorical_data_processor.py | 626 ++++---- tests/preprocessing/test_kbins_discretizer.py | 504 +++--- tests/preprocessing/test_preprocessor.py | 796 +++++----- tests/preprocessing/test_target_encoder.py | 684 ++++----- 14 files changed, 3436 insertions(+), 1849 deletions(-) create mode 100644 notebooks/debugging.ipynb create mode 100644 notebooks/model_json.json diff --git a/.gitignore b/.gitignore index 6aa9052..14c9262 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ target/ # Jupyter Notebook .ipynb_checkpoints +#*notebooks/* # pyenv .python-version diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py index 7a646c3..768112c 100644 --- a/cobra/model_building/__init__.py +++ b/cobra/model_building/__init__.py @@ -1,13 +1,13 @@ -from .univariate_selection import compute_univariate_preselection -from .univariate_selection import get_preselected_predictors -from .univariate_selection import compute_correlations - -from .models import LogisticRegressionModel, LinearRegressionModel -from .forward_selection import ForwardFeatureSelection - -__all__ = ['compute_univariate_preselection', - 'get_preselected_predictors', - 'compute_correlations', - 'LogisticRegressionModel', - 'LinearRegressionModel', - 'ForwardFeatureSelection'] +from .univariate_selection import compute_univariate_preselection +from .univariate_selection import get_preselected_predictors +from .univariate_selection import compute_correlations + +from .models import LogisticRegressionModel, LinearRegressionModel +from .forward_selection import ForwardFeatureSelection + +__all__ = ['compute_univariate_preselection', + 'get_preselected_predictors', + 'compute_correlations', + 'LogisticRegressionModel', + 'LinearRegressionModel', + 'ForwardFeatureSelection'] diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index fa7ddf1..7f84716 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -367,6 +367,10 @@ def fit( log.info("Fitting pipeline took {} seconds".format(time.time() - start)) + def test_function(self): + return print('heleeeloooo') + + def transform( self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list ) -> pd.DataFrame: @@ -421,6 +425,7 @@ def transform( return data + def fit_transform( self, train_data: pd.DataFrame, diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 3eda39d..f438479 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -5,6 +5,7 @@ from tqdm.auto import tqdm from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError +import numpy as np log = logging.getLogger(__name__) @@ -123,7 +124,7 @@ def set_attributes_from_dict(self, params: dict): params["imputation_strategy"] in self.valid_imputation_strategies): self.imputation_strategy = params["imputation_strategy"] - if "_global_mean" in 
params and type(params["_global_mean"]) == float: + if "_global_mean" in params and isinstance(params["_global_mean"], (np.floating, float)): self._global_mean = params["_global_mean"] _mapping = {} diff --git a/cobra/utils.py b/cobra/utils.py index d901380..daf1156 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -1,24 +1,24 @@ -import logging - -# logger = logging.getLogger(__name__) -# logger.setLevel(logging.INFO) -# logger.addHandler(logging.Handler()) - - -def clean_predictor_name(predictor_name: str) -> str: - """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end - of the predictor name to return a clean version of the predictor - """ - return ( - predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") - ) - - -def log_tutorial() -> None: - logging.info( - """ - Hi, welcome to Cobra! - You can find some tutorials that explain the functioning of cobra on the PythonPredictions GitHub: - https://github.com/PythonPredictions/cobra/tree/master/tutorials - """ - ) +import logging + +# logger = logging.getLogger(__name__) +# logger.setLevel(logging.INFO) +# logger.addHandler(logging.Handler()) + + +def clean_predictor_name(predictor_name: str) -> str: + """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end + of the predictor name to return a clean version of the predictor + """ + return ( + predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") + ) + + +def log_tutorial() -> None: + logging.info( + """ + Hi, welcome to Cobra! + You can find some tutorials that explain the functioning of cobra on the PythonPredictions GitHub: + https://github.com/PythonPredictions/cobra/tree/master/tutorials + """ + ) diff --git a/docs/make.bat b/docs/make.bat index 6fcf05b..061f32f 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,35 +1,35 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/notebooks/debugging.ipynb b/notebooks/debugging.ipynb new file mode 100644 index 0000000..5dd573e --- /dev/null +++ b/notebooks/debugging.ipynb @@ -0,0 +1,1364 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 464, + "id": "23482fd8-b4c1-48f5-8c30-a0e79f7667b3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%reload_ext autoreload" + ] + }, + { + "cell_type": "code", + "execution_count": 465, + "id": "da551dc3-ffba-45e0-b87d-7b626a622b08", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, r\"C:/projects/cobra\")" + ] + }, + { + "cell_type": "code", + "execution_count": 488, + "id": "7d2678fa-eb47-4cb5-ad1d-c5034a742f55", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "from cobra.preprocessing import PreProcessor\n", + "\n", + "# custom imports\n", + "from cobra.preprocessing import CategoricalDataProcessor\n", + "from cobra.preprocessing import KBinsDiscretizer\n", + "from cobra.preprocessing import TargetEncoder\n", + "import json\n" + ] + }, + { + "cell_type": "markdown", + "id": "d4d341ec-b5c3-4b00-a54f-c5b6565d2631", + "metadata": {}, + "source": [ + "### 1. Generate data" + ] + }, + { + "cell_type": "code", + "execution_count": 467, + "id": "a9563643-308b-4c6c-b358-9cbf93a0666d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "size = 5000\n", + "\n", + "# Create datetime column\n", + "dates = pd.date_range('2022-01-01', periods=size, freq='D')\n", + "\n", + "# Create categorical variables\n", + "category_values = ['Category A', 'Category B', 'Category C']\n", + "cat_var1 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", + "cat_var2 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", + "cat_var3 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", + "\n", + "# Create continuous variables with different scales and distributions\n", + "cont_var1 = pd.Series(np.random.normal(loc=0, scale=1, size=size), name='cont_var1')\n", + "cont_var2 = pd.Series(np.random.uniform(low=0, high=10, size=size), name='cont_var2')\n", + "cont_var3 = pd.Series(np.random.exponential(scale=1, size=size), name='cont_var3')\n", + "\n", + "# Create target variable\n", + "target = pd.Series(np.random.randint(2, size=size))\n", + "\n", + "# Combine into a DataFrame\n", + "df = pd.DataFrame({'DateTime': dates, 'CategoryVar1': cat_var1,\n", + " 'CategoryVar2': cat_var2, 'CategoryVar3': cat_var3,\n", + " 'cont_var1': cont_var1, 'cont_var2': cont_var2, 'cont_var3': cont_var3,\n", + " 'target': target})" + ] + }, + { + "cell_type": "code", + "execution_count": 468, + "id": "bde9235f-dc62-433d-b3d3-6bf37b2ddb52", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DateTime datetime64[ns]\n", + "CategoryVar1 category\n", + "CategoryVar2 category\n", + "CategoryVar3 category\n", + "cont_var1 float64\n", + 
"cont_var2 float64\n", + "cont_var3 float64\n", + "target int32\n", + "dtype: object" + ] + }, + "execution_count": 468, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 469, + "id": "d774e959-73f4-40b4-bc20-43c3af99e593", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3target
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181
\n", + "
" + ], + "text/plain": [ + " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", + "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", + "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", + "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", + "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", + "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "\n", + " cont_var3 target \n", + "0 1.372659 0 \n", + "1 0.635924 1 \n", + "2 0.098091 1 \n", + "3 0.179868 0 \n", + "4 0.966818 1 " + ] + }, + "execution_count": 469, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 470, + "id": "e9c06e3a-188f-4cdc-b9cd-51d3db63e5ff", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['DateTime', 'CategoryVar1', 'CategoryVar2', 'CategoryVar3', 'cont_var1',\n", + " 'cont_var2', 'cont_var3', 'target'],\n", + " dtype='object')" + ] + }, + "execution_count": 470, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "markdown", + "id": "9aae8c98-434b-4c71-abb1-29fa6d143895", + "metadata": {}, + "source": [ + "### 2. Fit preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 521, + "id": "a32560d4-b5fe-4b90-9ea6-ede7915bba05", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "continuous_vars = ['cont_var2', 'cont_var3', 'cont_var1']\n", + "discrete_vars= ['CategoryVar1', 'CategoryVar2', 'CategoryVar3'] #, 'DateTime'] [] \n", + "target_col = \"target\"" + ] + }, + { + "cell_type": "code", + "execution_count": 522, + "id": "d6f1e21a-4a6e-4ad7-9faf-b36e6daff707", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. 
Increase the weight if needed.\n" + ] + } + ], + "source": [ + "model_type = \"classification\"\n", + "\n", + "# using all Cobra's default parameters for preprocessing here\n", + "preprocessor = PreProcessor.from_params(\n", + " model_type=model_type\n", + ")\n", + "\n", + "random.seed(1212)\n", + "basetable = preprocessor.train_selection_validation_split(data=df,\n", + " train_prop=0.6,\n", + " selection_prop=0.25,\n", + " validation_prop=0.15)" + ] + }, + { + "cell_type": "code", + "execution_count": 523, + "id": "7b673619-4eda-4aca-acd5-a125f80d3b20", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting to fit pipeline\n", + "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 507.38it/s]\n", + "Fitting KBinsDiscretizer took 0.006914615631103516 seconds\n", + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 240.62it/s]\n", + "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 29.42it/s]\n", + "Fitting categorical_data_processor class took 0.10196375846862793 seconds\n", + "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 558.52it/s]\n", + "Fitting TargetEncoder took 0.013732433319091797 seconds\n", + "Fitting pipeline took 0.17300176620483398 seconds\n" + ] + } + ], + "source": [ + "preprocessor.fit(basetable[basetable[\"split\"]==\"train\"],\n", + " continuous_vars=continuous_vars,\n", + " discrete_vars = discrete_vars,\n", + " target_column_name=target_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 524, + "id": "c9e2c79d-c0bc-464d-b869-f8115ac67776", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 160.70it/s]\n", + "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 697.13it/s]\n", + "Transforming data took 0.0610198974609375 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590selection4.0 - 5.0...-1.3 - -0.8Category CCategory BCategory A0.5042740.4958850.5148720.4673910.4868910.523364
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241train9.0 - 10.0...0.2 - 0.5Category CCategory CCategory B0.5042740.4879520.4910000.4740480.5243550.492997
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911train7.0 - 8.0...-0.5 - -0.2Category BCategory BCategory C0.4733670.4958850.4653660.4902600.4942970.433225
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680selection0.0 - 1.0...-1.3 - -0.8Category CCategory BCategory C0.5042740.4958850.4653660.4754100.5040650.523364
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181train2.0 - 3.0...-4.0 - -1.3Category ACategory CCategory B0.4915970.4879520.4910000.4556960.4714640.562290
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", + "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", + "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", + "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", + "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", + "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "\n", + " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", + "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", + "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", + "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", + "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", + "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", + "\n", + " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", + "0 Category C Category B Category A \n", + "1 Category C Category C Category B \n", + "2 Category B Category B Category C \n", + "3 Category C Category B Category C \n", + "4 Category A Category C Category B \n", + "\n", + " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", + "0 0.504274 0.495885 0.514872 0.467391 \n", + "1 0.504274 0.487952 0.491000 0.474048 \n", + "2 0.473367 0.495885 0.465366 0.490260 \n", + "3 0.504274 0.495885 0.465366 0.475410 \n", + "4 0.491597 0.487952 0.491000 0.455696 \n", + "\n", + " cont_var3_enc cont_var1_enc \n", + "0 0.486891 0.523364 \n", + "1 0.524355 0.492997 \n", + "2 0.494297 0.433225 \n", + "3 0.504065 0.523364 \n", + "4 0.471464 0.562290 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 524, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "basetable_transformed_orig = preprocessor.transform(basetable,\n", + " continuous_vars=continuous_vars,\n", + " discrete_vars=discrete_vars)\n", + "basetable_transformed_orig.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 525, + "id": "d70f40cc-7814-48a8-91f6-2b7297f97ccc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#preprocessor._discretizer #._bins_by_column\n", + "#preprocessor._target_encoder.attributes_to_dict()\n", + "#preprocessor._discretizer.attributes_to_dict()\n", + "#preprocessor._target_encoder.attributes_to_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "baab4c1b-4200-4c96-b991-be8efc09abbb", + "metadata": {}, + "source": [ + "### 3. Serialize the preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 526, + "id": "95b597b2-b475-4d59-b650-dcc208db1eb5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pipeline_serialized = preprocessor.serialize_pipeline()\n", + "\n", + "with open(r\"./model_json.json\", \"w\") as file:\n", + " file.write(json.dumps(pipeline_serialized, indent=4))\n", + " \n", + "#pipeline_serialized" + ] + }, + { + "cell_type": "code", + "execution_count": 527, + "id": "c6dbd38c-ca5d-492d-815b-1af02d7de143", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Look into properties of preprocessors\n", + "#pipeline_serialized[\"target_encoder\"] #._bins_by_column" + ] + }, + { + "cell_type": "markdown", + "id": "fc339ac8-67a7-4574-811e-2b9bc4ce6a39", + "metadata": {}, + "source": [ + "### 4. 
De-serialize pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 528, + "id": "2a517ff8-d336-4bd3-abdc-2be784259564", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.\n" + ] + } + ], + "source": [ + "# Read serialized pipeline from json\n", + "with open(r\"./model_json.json\", \"r\") as file:\n", + " json_pipeline_serialized = json.load(file)\n", + "\n", + "# Create new preprocessor object from serialized pipeline\n", + "new_preprocessor = PreProcessor.from_pipeline(json_pipeline_serialized)\n", + "#new_preprocessor = PreProcessor.from_pipeline(pipeline_serialized)" + ] + }, + { + "cell_type": "code", + "execution_count": 529, + "id": "ad9442b5-7f7e-48fe-8199-528992d1f0d6", + "metadata": {}, + "outputs": [], + "source": [ + "# Look into properties of preprocessors if needed\n", + "#new_preprocessor._discretizer.attributes_to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 530, + "id": "541986d2-8d5d-473c-8871-5e7d2da31c4a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 147.15it/s]\n", + "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 661.65it/s]\n", + "Transforming data took 0.06773138046264648 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590selection4.0 - 5.0...-1.3 - -0.8Category CCategory BCategory A0.5042740.4958850.5148720.4673910.4868910.523364
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241train9.0 - 10.0...0.2 - 0.5Category CCategory CCategory B0.5042740.4879520.4910000.4740480.5243550.492997
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911train7.0 - 8.0...-0.5 - -0.2Category BCategory BCategory C0.4733670.4958850.4653660.4902600.4942970.433225
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680selection0.0 - 1.0...-1.3 - -0.8Category CCategory BCategory C0.5042740.4958850.4653660.4754100.5040650.523364
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181train2.0 - 3.0...-4.0 - -1.3Category ACategory CCategory B0.4915970.4879520.4910000.4556960.4714640.562290
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", + "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", + "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", + "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", + "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", + "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "\n", + " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", + "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", + "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", + "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", + "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", + "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", + "\n", + " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", + "0 Category C Category B Category A \n", + "1 Category C Category C Category B \n", + "2 Category B Category B Category C \n", + "3 Category C Category B Category C \n", + "4 Category A Category C Category B \n", + "\n", + " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", + "0 0.504274 0.495885 0.514872 0.467391 \n", + "1 0.504274 0.487952 0.491000 0.474048 \n", + "2 0.473367 0.495885 0.465366 0.490260 \n", + "3 0.504274 0.495885 0.465366 0.475410 \n", + "4 0.491597 0.487952 0.491000 0.455696 \n", + "\n", + " cont_var3_enc cont_var1_enc \n", + "0 0.486891 0.523364 \n", + "1 0.524355 0.492997 \n", + "2 0.494297 0.433225 \n", + "3 0.504065 0.523364 \n", + "4 0.471464 0.562290 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 530, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "basetable_transformed = new_preprocessor.transform(basetable,\n", + " continuous_vars=continuous_vars,\n", + " discrete_vars=discrete_vars)\n", + "basetable_transformed.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 531, + "id": "c270d856-452d-4507-a3c2-df3ae1991c36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
0TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
1TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
2TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
3TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
..................................................................
4995TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4996TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4997TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4998TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4999TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
\n", + "

5000 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 \\\n", + "0 True True True True True \n", + "1 True True True True True \n", + "2 True True True True True \n", + "3 True True True True True \n", + "4 True True True True True \n", + "... ... ... ... ... ... \n", + "4995 True True True True True \n", + "4996 True True True True True \n", + "4997 True True True True True \n", + "4998 True True True True True \n", + "4999 True True True True True \n", + "\n", + " cont_var2 cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", + "0 True True True True True ... True \n", + "1 True True True True True ... True \n", + "2 True True True True True ... True \n", + "3 True True True True True ... True \n", + "4 True True True True True ... True \n", + "... ... ... ... ... ... ... ... \n", + "4995 True True True True True ... True \n", + "4996 True True True True True ... True \n", + "4997 True True True True True ... True \n", + "4998 True True True True True ... True \n", + "4999 True True True True True ... True \n", + "\n", + " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", + "0 True True True \n", + "1 True True True \n", + "2 True True True \n", + "3 True True True \n", + "4 True True True \n", + "... ... ... ... \n", + "4995 True True True \n", + "4996 True True True \n", + "4997 True True True \n", + "4998 True True True \n", + "4999 True True True \n", + "\n", + " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", + "0 True True True True \n", + "1 True True True True \n", + "2 True True True True \n", + "3 True True True True \n", + "4 True True True True \n", + "... ... ... ... ... \n", + "4995 True True True True \n", + "4996 True True True True \n", + "4997 True True True True \n", + "4998 True True True True \n", + "4999 True True True True \n", + "\n", + " cont_var3_enc cont_var1_enc \n", + "0 True True \n", + "1 True True \n", + "2 True True \n", + "3 True True \n", + "4 True True \n", + "... ... ... 
\n", + "4995 True True \n", + "4996 True True \n", + "4997 True True \n", + "4998 True True \n", + "4999 True True \n", + "\n", + "[5000 rows x 21 columns]" + ] + }, + "execution_count": 531, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Double check transformed basetable is the same\n", + "basetable_transformed_orig == basetable_transformed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b478d7c-46d8-4ba9-bf84-375a7cf901a8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cobra_venv", + "language": "python", + "name": "cobra_venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/model_json.json b/notebooks/model_json.json new file mode 100644 index 0000000..fd80281 --- /dev/null +++ b/notebooks/model_json.json @@ -0,0 +1,216 @@ +{ + "metadata": { + "timestamp": "16/06/2023 18:00:26" + }, + "categorical_data_processor": { + "category_size_threshold": 5, + "forced_categories": {}, + "keep_missing": true, + "model_type": "classification", + "p_value_threshold": 0.001, + "regroup": true, + "regroup_name": "Other", + "scale_contingency_table": true, + "_cleaned_categories_by_column": { + "CategoryVar1": [], + "CategoryVar2": [], + "CategoryVar3": [] + } + }, + "discretizer": { + "auto_adapt_bins": false, + "change_endpoint_format": false, + "closed": "right", + "label_format": "{} - {}", + "n_bins": 10, + "starting_precision": 0, + "strategy": "quantile", + "_bins_by_column": { + "cont_var2": [ + [ + 0.0, + 1.0 + ], + [ + 1.0, + 2.0 + ], + [ + 2.0, + 3.0 + ], + [ + 3.0, + 4.0 + ], + [ + 4.0, + 5.0 + ], + [ + 5.0, + 6.0 + ], + [ + 6.0, + 7.0 + ], + [ + 7.0, + 8.0 + ], + [ + 8.0, + 9.0 + ], + [ + 9.0, + 10.0 + ] + ], + "cont_var3": [ + [ + 0.0, + 0.1 + ], + [ + 0.1, + 0.2 + ], + [ + 0.2, + 0.4 + ], + [ + 0.4, + 0.5 + ], + [ + 0.5, + 0.7 + ], + [ + 0.7, + 0.9 + ], + [ + 0.9, + 1.3 + ], + [ + 1.3, + 1.7 + ], + [ + 1.7, + 2.4 + ], + [ + 2.4, + 7.6 + ] + ], + "cont_var1": [ + [ + -4.0, + -1.3 + ], + [ + -1.3, + -0.8 + ], + [ + -0.8, + -0.5 + ], + [ + -0.5, + -0.2 + ], + [ + -0.2, + 0.0 + ], + [ + 0.0, + 0.2 + ], + [ + 0.2, + 0.5 + ], + [ + 0.5, + 0.8 + ], + [ + 0.8, + 1.2 + ], + [ + 1.2, + 3.7 + ] + ] + } + }, + "target_encoder": { + "imputation_strategy": "mean", + "weight": 0.0, + "_mapping": { + "CategoryVar1_processed": { + "Category A": 0.49159663865546216, + "Category B": 0.4733668341708543, + "Category C": 0.5042735042735043 + }, + "CategoryVar2_processed": { + "Category A": 0.48643410852713176, + "Category B": 0.49588477366255146, + "Category C": 0.4879518072289157 + }, + "CategoryVar3_processed": { + "Category A": 0.5148717948717949, + "Category B": 0.491, + "Category C": 0.4653658536585366 + }, + "cont_var2_bin": { + "0.0 - 1.0": 0.47540983606557374, + "1.0 - 2.0": 0.46855345911949686, + "2.0 - 3.0": 0.45569620253164556, + "3.0 - 4.0": 0.5133333333333333, + "4.0 - 5.0": 0.4673913043478261, + "5.0 - 6.0": 0.5307443365695793, + "6.0 - 7.0": 0.5232974910394266, + "7.0 - 8.0": 0.4902597402597403, + "8.0 - 9.0": 0.5033333333333333, + "9.0 - 10.0": 0.4740484429065744 + }, + "cont_var3_bin": { + "0.0 - 0.1": 0.49429657794676807, + "0.1 - 0.2": 0.5040650406504065, + "0.2 - 0.4": 0.4897025171624714, + 
"0.4 - 0.5": 0.5, + "0.5 - 0.7": 0.5243553008595988, + "0.7 - 0.9": 0.4703703703703704, + "0.9 - 1.3": 0.47146401985111663, + "1.3 - 1.7": 0.4868913857677903, + "1.7 - 2.4": 0.43416370106761565, + "2.4 - 7.6": 0.5258064516129032 + }, + "cont_var1_bin": { + "-4.0 - -1.3": 0.5622895622895623, + "-1.3 - -0.8": 0.5233644859813084, + "-0.8 - -0.5": 0.4358974358974359, + "-0.5 - -0.2": 0.43322475570032576, + "-0.2 - 0.0": 0.5219123505976095, + "0.0 - 0.2": 0.4763779527559055, + "0.2 - 0.5": 0.49299719887955185, + "0.5 - 0.8": 0.5054545454545455, + "0.8 - 1.2": 0.4539249146757679, + "1.2 - 3.7": 0.4984984984984985 + } + }, + "_global_mean": 0.49 + }, + "_is_fitted": true +} \ No newline at end of file diff --git a/tests/model_building/test_forward_selection.py b/tests/model_building/test_forward_selection.py index 19f7157..9383f73 100644 --- a/tests/model_building/test_forward_selection.py +++ b/tests/model_building/test_forward_selection.py @@ -1,213 +1,213 @@ - -from contextlib import contextmanager -import pytest -import pandas as pd - -from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel -from cobra.model_building.forward_selection import ForwardFeatureSelection - -@contextmanager -def does_not_raise(): - yield - -def mock_data(add_split_col: bool=False, model_type="classification"): - data = pd.DataFrame({"var1_enc": [0.42] * 10, - "var2_enc": [0.94] * 10, - "var3_enc": [0.87] * 10}) - - if model_type == "classification": - data["target"] = ([0] * 5 + [1] * 2 + [0] * 2 + [1]) - elif model_type == "regression": - data["target"] = [7, 2, 2, 9, 7, 3, 1, 4, 8, 5] - - if add_split_col: - data.loc[:, "split"] = (["train"] * 7 + ["selection"] * 3) - - return data - -def mock_model_num_pred(n_predictors, model_type="classification"): - predictors = [f"var{i + 1}_enc" for i in range(n_predictors)] - return mock_model(predictors, model_type) - -def mock_model(predictor_list, model_type="classification"): - if model_type == "classification": - model = LogisticRegressionModel() - elif model_type == "regression": - model = LinearRegressionModel() - - model.predictors = predictor_list - - return model - - -class TestForwardFeatureSelection: - - def test_get_model_from_step(self): - - forward_selection = ForwardFeatureSelection() - - with pytest.raises(ValueError): - forward_selection.get_model_from_step(2) - - @pytest.mark.parametrize("model_type", ["classification", "regression"]) - def test_compute_model_performances(self, mocker, model_type): - - data = mock_data(add_split_col=True, model_type=model_type) - - fw_selection = ForwardFeatureSelection(model_type=model_type) - fw_selection._fitted_models = [ - mock_model_num_pred(1, model_type=model_type), - mock_model_num_pred(2, model_type=model_type), - mock_model_num_pred(3, model_type=model_type) - ] - - def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the same for RMSE as it is a mock - if split == "train": - return 0.612 - else: - return 0.609 - - if model_type == "classification": - patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate" - elif model_type == "regression": - patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate" - - mocker.patch(patch_fct, mock_evaluate) - - actual = (fw_selection - .compute_model_performances(data, "target", - splits=["train", "selection"], - metric=None)) - - expected = pd.DataFrame([ - {"predictors": ["var1_enc"], - "last_added_predictor": "var1_enc", - "train_performance": 0.612, 
"selection_performance": 0.609, - "model_type": model_type}, - {"predictors": ["var1_enc", "var2_enc"], - "last_added_predictor": "var2_enc", - "train_performance": 0.612, "selection_performance": 0.609, - "model_type": model_type}, - {"predictors": ["var1_enc", "var2_enc", "var3_enc"], - "last_added_predictor": "var3_enc", - "train_performance": 0.612, "selection_performance": 0.609, - "model_type": model_type} - ]) - - pd.testing.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize("model_type", ["classification", "regression"]) - def test_ffs_train_data_assertions(self, model_type): - - fw_selection = ForwardFeatureSelection(model_type=model_type) - - with pytest.raises(AssertionError): # no split column - fw_selection.fit(pd.DataFrame(), "target", predictors=[""]) - - df = mock_data(add_split_col=True, model_type=model_type) - with pytest.raises(AssertionError): # not at least train & selection sets - fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""]) - - @pytest.mark.parametrize("model_type, max_predictors, expectation", - [("classification", 2, pytest.raises(ValueError)), - ("classification", 3, does_not_raise()), - ("classification", 5, does_not_raise()), - ("classification", 10, does_not_raise()), - ("classification", 15, does_not_raise()), - ("regression", 2, pytest.raises(ValueError)), - ("regression", 3, does_not_raise()), - ("regression", 5, does_not_raise()), - ("regression", 10, does_not_raise()), - ("regression", 15, does_not_raise()) - ]) - def test_fit(self, mocker, model_type, max_predictors: int, expectation): - - # create list of elements [var1_enc, var2_enc, ..., var10_enc] - predictors_list = [f"var{i+1}_enc" for i in range(10)] - # extract sublist [var1_enc, var5_enc, var9_enc] - forced_predictors_list = predictors_list[::4] - - ordered_output_list = (forced_predictors_list - + [pred for pred in predictors_list - if pred not in forced_predictors_list]) - - fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) - - def mock_train_model(self, train_data, target_column_name, predictors): - return mock_model(predictors, model_type=model_type) - - def mock_forward_selection(self, train_data, target_column_name, - predictors, forced_predictors): - n_models = min(max_predictors, len(predictors) + len(forced_predictors)) - - return [mock_model(ordered_output_list[:i+1], model_type=model_type) - for i in range(n_models)] - - mocker.patch("cobra.model_building.ForwardFeatureSelection._train_model", - mock_train_model) - - mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection", - mock_forward_selection) - - df = mock_data(add_split_col=True, model_type=model_type) - with expectation: - fw_selection.fit(df, "target", # data is ignored - predictors=predictors_list, - forced_predictors=forced_predictors_list, - excluded_predictors=[]) - - # for each fitted model, check number of predictors - actual = [model.predictors - for model in fw_selection._fitted_models] - - expected = [ordered_output_list[:i+1] - for i in range(min(max_predictors, - len(predictors_list)))] - - if max_predictors == len(forced_predictors_list): - expected = [forced_predictors_list] - - assert actual == expected - - @pytest.mark.parametrize("model_type, max_predictors", [("classification", 5), - ("classification", 10), - ("classification", 15), - ("regression", 5), - ("regression", 10), - ("regression", 15) - ]) - def test_forward_selection(self, mocker, model_type, max_predictors: int): - - # create list of 
elements [var1_enc, var2_c, ..., var10_enc] - predictors_list = [f"var{i+1}_enc" for i in range(10)] - - # extract sublist [var1_enc, var5_enc, var9_enc]: - forced_predictors = predictors_list[::4] - # remove these from predictors list to have clean version - predictors = [pred for pred in predictors_list - if pred not in forced_predictors] - - ordered_output_list = forced_predictors + predictors - - def mock_find_next_best_model(self, train_data, target_column_name, - candidate_predictors, - current_predictors): - return mock_model(current_predictors + candidate_predictors[0:1], model_type=model_type) - - mocker.patch(("cobra.model_building.ForwardFeatureSelection." - "_find_next_best_model"), mock_find_next_best_model) - - fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) - - fitted_models = (fw_selection. - _forward_selection(pd.DataFrame(), "target", - predictors, - forced_predictors)) - - actual = [sorted(model.predictors) for model in fitted_models] - - expected = [sorted(ordered_output_list[:i+1]) - for i in range(min(max_predictors, - len(predictors_list)))] - - assert actual == expected + +from contextlib import contextmanager +import pytest +import pandas as pd + +from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel +from cobra.model_building.forward_selection import ForwardFeatureSelection + +@contextmanager +def does_not_raise(): + yield + +def mock_data(add_split_col: bool=False, model_type="classification"): + data = pd.DataFrame({"var1_enc": [0.42] * 10, + "var2_enc": [0.94] * 10, + "var3_enc": [0.87] * 10}) + + if model_type == "classification": + data["target"] = ([0] * 5 + [1] * 2 + [0] * 2 + [1]) + elif model_type == "regression": + data["target"] = [7, 2, 2, 9, 7, 3, 1, 4, 8, 5] + + if add_split_col: + data.loc[:, "split"] = (["train"] * 7 + ["selection"] * 3) + + return data + +def mock_model_num_pred(n_predictors, model_type="classification"): + predictors = [f"var{i + 1}_enc" for i in range(n_predictors)] + return mock_model(predictors, model_type) + +def mock_model(predictor_list, model_type="classification"): + if model_type == "classification": + model = LogisticRegressionModel() + elif model_type == "regression": + model = LinearRegressionModel() + + model.predictors = predictor_list + + return model + + +class TestForwardFeatureSelection: + + def test_get_model_from_step(self): + + forward_selection = ForwardFeatureSelection() + + with pytest.raises(ValueError): + forward_selection.get_model_from_step(2) + + @pytest.mark.parametrize("model_type", ["classification", "regression"]) + def test_compute_model_performances(self, mocker, model_type): + + data = mock_data(add_split_col=True, model_type=model_type) + + fw_selection = ForwardFeatureSelection(model_type=model_type) + fw_selection._fitted_models = [ + mock_model_num_pred(1, model_type=model_type), + mock_model_num_pred(2, model_type=model_type), + mock_model_num_pred(3, model_type=model_type) + ] + + def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the same for RMSE as it is a mock + if split == "train": + return 0.612 + else: + return 0.609 + + if model_type == "classification": + patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate" + elif model_type == "regression": + patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate" + + mocker.patch(patch_fct, mock_evaluate) + + actual = (fw_selection + .compute_model_performances(data, "target", + 
splits=["train", "selection"], + metric=None)) + + expected = pd.DataFrame([ + {"predictors": ["var1_enc"], + "last_added_predictor": "var1_enc", + "train_performance": 0.612, "selection_performance": 0.609, + "model_type": model_type}, + {"predictors": ["var1_enc", "var2_enc"], + "last_added_predictor": "var2_enc", + "train_performance": 0.612, "selection_performance": 0.609, + "model_type": model_type}, + {"predictors": ["var1_enc", "var2_enc", "var3_enc"], + "last_added_predictor": "var3_enc", + "train_performance": 0.612, "selection_performance": 0.609, + "model_type": model_type} + ]) + + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("model_type", ["classification", "regression"]) + def test_ffs_train_data_assertions(self, model_type): + + fw_selection = ForwardFeatureSelection(model_type=model_type) + + with pytest.raises(AssertionError): # no split column + fw_selection.fit(pd.DataFrame(), "target", predictors=[""]) + + df = mock_data(add_split_col=True, model_type=model_type) + with pytest.raises(AssertionError): # not at least train & selection sets + fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""]) + + @pytest.mark.parametrize("model_type, max_predictors, expectation", + [("classification", 2, pytest.raises(ValueError)), + ("classification", 3, does_not_raise()), + ("classification", 5, does_not_raise()), + ("classification", 10, does_not_raise()), + ("classification", 15, does_not_raise()), + ("regression", 2, pytest.raises(ValueError)), + ("regression", 3, does_not_raise()), + ("regression", 5, does_not_raise()), + ("regression", 10, does_not_raise()), + ("regression", 15, does_not_raise()) + ]) + def test_fit(self, mocker, model_type, max_predictors: int, expectation): + + # create list of elements [var1_enc, var2_enc, ..., var10_enc] + predictors_list = [f"var{i+1}_enc" for i in range(10)] + # extract sublist [var1_enc, var5_enc, var9_enc] + forced_predictors_list = predictors_list[::4] + + ordered_output_list = (forced_predictors_list + + [pred for pred in predictors_list + if pred not in forced_predictors_list]) + + fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) + + def mock_train_model(self, train_data, target_column_name, predictors): + return mock_model(predictors, model_type=model_type) + + def mock_forward_selection(self, train_data, target_column_name, + predictors, forced_predictors): + n_models = min(max_predictors, len(predictors) + len(forced_predictors)) + + return [mock_model(ordered_output_list[:i+1], model_type=model_type) + for i in range(n_models)] + + mocker.patch("cobra.model_building.ForwardFeatureSelection._train_model", + mock_train_model) + + mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection", + mock_forward_selection) + + df = mock_data(add_split_col=True, model_type=model_type) + with expectation: + fw_selection.fit(df, "target", # data is ignored + predictors=predictors_list, + forced_predictors=forced_predictors_list, + excluded_predictors=[]) + + # for each fitted model, check number of predictors + actual = [model.predictors + for model in fw_selection._fitted_models] + + expected = [ordered_output_list[:i+1] + for i in range(min(max_predictors, + len(predictors_list)))] + + if max_predictors == len(forced_predictors_list): + expected = [forced_predictors_list] + + assert actual == expected + + @pytest.mark.parametrize("model_type, max_predictors", [("classification", 5), + ("classification", 10), + ("classification", 
15), + ("regression", 5), + ("regression", 10), + ("regression", 15) + ]) + def test_forward_selection(self, mocker, model_type, max_predictors: int): + + # create list of elements [var1_enc, var2_c, ..., var10_enc] + predictors_list = [f"var{i+1}_enc" for i in range(10)] + + # extract sublist [var1_enc, var5_enc, var9_enc]: + forced_predictors = predictors_list[::4] + # remove these from predictors list to have clean version + predictors = [pred for pred in predictors_list + if pred not in forced_predictors] + + ordered_output_list = forced_predictors + predictors + + def mock_find_next_best_model(self, train_data, target_column_name, + candidate_predictors, + current_predictors): + return mock_model(current_predictors + candidate_predictors[0:1], model_type=model_type) + + mocker.patch(("cobra.model_building.ForwardFeatureSelection." + "_find_next_best_model"), mock_find_next_best_model) + + fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) + + fitted_models = (fw_selection. + _forward_selection(pd.DataFrame(), "target", + predictors, + forced_predictors)) + + actual = [sorted(model.predictors) for model in fitted_models] + + expected = [sorted(ordered_output_list[:i+1]) + for i in range(min(max_predictors, + len(predictors_list)))] + + assert actual == expected diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py index 7eca6e6..20fce9f 100644 --- a/tests/model_building/test_models.py +++ b/tests/model_building/test_models.py @@ -1,258 +1,258 @@ - -import numpy as np -import pandas as pd - -from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel - -def mock_data(): - return pd.DataFrame({"var1_enc": [0.42] * 10, - "var2_enc": [0.94] * 10, - "var3_enc": [0.87] * 10}) - - -def mock_score_model_classification(self, data): - return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5]) - -def mock_score_model_regression(self, data): - return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15 - -class TestLogisticRegressionModel: - - def test_evaluate(self, mocker): - - X = mock_data() - y = pd.Series([1] * 5 + [0] * 5) - - def mock_roc_auc_score(y_true, y_score): - return 0.79 - - (mocker - .patch("cobra.model_building.LogisticRegressionModel.score_model", - mock_score_model_classification)) - - (mocker - .patch("cobra.model_building.models.roc_auc_score", - mock_roc_auc_score)) - - model = LogisticRegressionModel() - actual = model.evaluate(X, y) - - assert actual == 0.79 - - def test_evaluate_cached(self): - - split = "train" - expected = 0.79 - - model = LogisticRegressionModel() - model._eval_metrics_by_split["train"] = expected - - actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) - - assert actual == expected - - def test_compute_variable_importance(self, mocker): - - def mock_pearsonr(ypred, ytrue): - return [ypred.unique()[0]] - - (mocker - .patch("cobra.model_building.LogisticRegressionModel.score_model", - mock_score_model_classification)) - - (mocker - .patch("cobra.model_building.models.stats.pearsonr", - mock_pearsonr)) - - model = LogisticRegressionModel() - model.predictors = ["var1_enc", "var2_enc", "var3_enc"] - - data = mock_data() - - actual = model.compute_variable_importance(data) - - expected = pd.DataFrame([ - {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, - {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, - {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} - 
]).sort_values(by="importance", ascending=False).reset_index(drop=True) - - pd.testing.assert_frame_equal(actual, expected) - - def test_serialize(self): - - model = LogisticRegressionModel() - actual = model.serialize() - - expected = { - "meta": "logistic-regression", - "predictors": [], - "_eval_metrics_by_split": {}, - "params": { - "C": 1000000000.0, - "class_weight": None, - "dual": False, - "fit_intercept": True, - "intercept_scaling": 1, - "l1_ratio": None, - "max_iter": 100, - "multi_class": "auto", - "n_jobs": None, - "penalty": "l2", - "random_state": 42, - "solver": "liblinear", - "tol": 0.0001, - "verbose": 0, - "warm_start": False - } - } - - assert actual == expected - - def test_deserialize(self): - - model = LogisticRegressionModel() - - model_dict = { - "meta": "logistic-regression", - "predictors": [], - "_eval_metrics_by_split": {}, - "params": { - "C": 1000000000.0, - "class_weight": None, - "dual": False, - "fit_intercept": True, - "intercept_scaling": 1, - "l1_ratio": None, - "max_iter": 100, - "multi_class": "auto", - "n_jobs": None, - "penalty": "l2", - "random_state": 42, - "solver": "liblinear", - "tol": 0.0001, - "verbose": 0, - "warm_start": False - }, - "classes_": [0, 1], - "coef_": [[0.5, 0.75]], - "intercept_": [-3], - "n_iter_": [10] - } - - model.deserialize(model_dict) - - logit = model.logit - assert logit.get_params() == model_dict["params"] - assert logit.classes_.all() == np.array(model_dict["classes_"]).all() - assert logit.n_iter_.all() == np.array(model_dict["n_iter_"]).all() - assert logit.intercept_.all() == (np.array(model_dict["intercept_"]).all()) - assert logit.coef_.all() == np.array(model_dict["coef_"]).all() - -class TestLinearRegressionModel: - - def test_evaluate(self, mocker): - - X = mock_data() - y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12) - - def mock_mean_squared_error(y_true, y_pred): - return 1.23 - - (mocker - .patch("cobra.model_building.LinearRegressionModel.score_model", - mock_score_model_regression)) - - (mocker - .patch("cobra.model_building.models.mean_squared_error", - mock_mean_squared_error)) - - model = LinearRegressionModel() - actual = model.evaluate(X, y) - - assert actual == np.sqrt(1.23) - - def test_evaluate_cached(self): - - split = "train" - expected = np.sqrt(1.23) - - model = LinearRegressionModel() - model._eval_metrics_by_split["train"] = expected - - actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) - - assert actual == expected - - def test_compute_variable_importance(self, mocker): - - def mock_pearsonr(ypred, ytrue): - return [ypred.unique()[0]] - - (mocker - .patch("cobra.model_building.LinearRegressionModel.score_model", - mock_score_model_regression)) - - (mocker - .patch("cobra.model_building.models.stats.pearsonr", - mock_pearsonr)) - - model = LinearRegressionModel() - model.predictors = ["var1_enc", "var2_enc", "var3_enc"] - - data = mock_data() - - actual = model.compute_variable_importance(data) - - expected = pd.DataFrame([ - {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, - {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, - {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} - ]).sort_values(by="importance", ascending=False).reset_index(drop=True) - - pd.testing.assert_frame_equal(actual, expected) - - def test_serialize(self): - - model = LinearRegressionModel() - actual = model.serialize() - - expected = { - "meta": "linear-regression", - "predictors": [], - 
"_eval_metrics_by_split": {}, - "params": { - "copy_X": True, - "fit_intercept": True, - "n_jobs": None, - "positive": False - } - } - - assert actual == expected - - def test_deserialize(self): - - model = LinearRegressionModel() - - model_dict = { - "meta": "linear-regression", - "predictors": [], - "_eval_metrics_by_split": {}, - "params": { - "copy_X": True, - "fit_intercept": True, - "n_jobs": None, - "positive": False - }, - "coef_": [[0.5, 0.75]], - "intercept_": [-3] - } - - model.deserialize(model_dict) - - linear = model.linear - assert linear.get_params() == model_dict["params"] - assert linear.intercept_.all() == (np.array(model_dict["intercept_"]).all()) - assert linear.coef_.all() == np.array(model_dict["coef_"]).all() - + +import numpy as np +import pandas as pd + +from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel + +def mock_data(): + return pd.DataFrame({"var1_enc": [0.42] * 10, + "var2_enc": [0.94] * 10, + "var3_enc": [0.87] * 10}) + + +def mock_score_model_classification(self, data): + return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5]) + +def mock_score_model_regression(self, data): + return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15 + +class TestLogisticRegressionModel: + + def test_evaluate(self, mocker): + + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) + + def mock_roc_auc_score(y_true, y_score): + return 0.79 + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + (mocker + .patch("cobra.model_building.models.roc_auc_score", + mock_roc_auc_score)) + + model = LogisticRegressionModel() + actual = model.evaluate(X, y) + + assert actual == 0.79 + + def test_evaluate_cached(self): + + split = "train" + expected = 0.79 + + model = LogisticRegressionModel() + model._eval_metrics_by_split["train"] = expected + + actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) + + assert actual == expected + + def test_compute_variable_importance(self, mocker): + + def mock_pearsonr(ypred, ytrue): + return [ypred.unique()[0]] + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + (mocker + .patch("cobra.model_building.models.stats.pearsonr", + mock_pearsonr)) + + model = LogisticRegressionModel() + model.predictors = ["var1_enc", "var2_enc", "var3_enc"] + + data = mock_data() + + actual = model.compute_variable_importance(data) + + expected = pd.DataFrame([ + {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, + {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, + {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} + ]).sort_values(by="importance", ascending=False).reset_index(drop=True) + + pd.testing.assert_frame_equal(actual, expected) + + def test_serialize(self): + + model = LogisticRegressionModel() + actual = model.serialize() + + expected = { + "meta": "logistic-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "C": 1000000000.0, + "class_weight": None, + "dual": False, + "fit_intercept": True, + "intercept_scaling": 1, + "l1_ratio": None, + "max_iter": 100, + "multi_class": "auto", + "n_jobs": None, + "penalty": "l2", + "random_state": 42, + "solver": "liblinear", + "tol": 0.0001, + "verbose": 0, + "warm_start": False + } + } + + assert actual == expected + + def test_deserialize(self): + + model = LogisticRegressionModel() + + model_dict = { + "meta": 
"logistic-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "C": 1000000000.0, + "class_weight": None, + "dual": False, + "fit_intercept": True, + "intercept_scaling": 1, + "l1_ratio": None, + "max_iter": 100, + "multi_class": "auto", + "n_jobs": None, + "penalty": "l2", + "random_state": 42, + "solver": "liblinear", + "tol": 0.0001, + "verbose": 0, + "warm_start": False + }, + "classes_": [0, 1], + "coef_": [[0.5, 0.75]], + "intercept_": [-3], + "n_iter_": [10] + } + + model.deserialize(model_dict) + + logit = model.logit + assert logit.get_params() == model_dict["params"] + assert logit.classes_.all() == np.array(model_dict["classes_"]).all() + assert logit.n_iter_.all() == np.array(model_dict["n_iter_"]).all() + assert logit.intercept_.all() == (np.array(model_dict["intercept_"]).all()) + assert logit.coef_.all() == np.array(model_dict["coef_"]).all() + +class TestLinearRegressionModel: + + def test_evaluate(self, mocker): + + X = mock_data() + y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12) + + def mock_mean_squared_error(y_true, y_pred): + return 1.23 + + (mocker + .patch("cobra.model_building.LinearRegressionModel.score_model", + mock_score_model_regression)) + + (mocker + .patch("cobra.model_building.models.mean_squared_error", + mock_mean_squared_error)) + + model = LinearRegressionModel() + actual = model.evaluate(X, y) + + assert actual == np.sqrt(1.23) + + def test_evaluate_cached(self): + + split = "train" + expected = np.sqrt(1.23) + + model = LinearRegressionModel() + model._eval_metrics_by_split["train"] = expected + + actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) + + assert actual == expected + + def test_compute_variable_importance(self, mocker): + + def mock_pearsonr(ypred, ytrue): + return [ypred.unique()[0]] + + (mocker + .patch("cobra.model_building.LinearRegressionModel.score_model", + mock_score_model_regression)) + + (mocker + .patch("cobra.model_building.models.stats.pearsonr", + mock_pearsonr)) + + model = LinearRegressionModel() + model.predictors = ["var1_enc", "var2_enc", "var3_enc"] + + data = mock_data() + + actual = model.compute_variable_importance(data) + + expected = pd.DataFrame([ + {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, + {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, + {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} + ]).sort_values(by="importance", ascending=False).reset_index(drop=True) + + pd.testing.assert_frame_equal(actual, expected) + + def test_serialize(self): + + model = LinearRegressionModel() + actual = model.serialize() + + expected = { + "meta": "linear-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "copy_X": True, + "fit_intercept": True, + "n_jobs": None, + "positive": False + } + } + + assert actual == expected + + def test_deserialize(self): + + model = LinearRegressionModel() + + model_dict = { + "meta": "linear-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "copy_X": True, + "fit_intercept": True, + "n_jobs": None, + "positive": False + }, + "coef_": [[0.5, 0.75]], + "intercept_": [-3] + } + + model.deserialize(model_dict) + + linear = model.linear + assert linear.get_params() == model_dict["params"] + assert linear.intercept_.all() == (np.array(model_dict["intercept_"]).all()) + assert linear.coef_.all() == np.array(model_dict["coef_"]).all() + diff --git a/tests/preprocessing/test_categorical_data_processor.py 
b/tests/preprocessing/test_categorical_data_processor.py index dd53434..73f5f4e 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -1,313 +1,313 @@ - -import pytest -import numpy as np -import pandas as pd - -from cobra.preprocessing import CategoricalDataProcessor - -class TestCategoricalDataProcessor: - - def test_attributes_to_dict(self): - - processor = CategoricalDataProcessor() - - cleaned_categories = ["a", "b", "c"] - processor._cleaned_categories_by_column = { - "variable": set(cleaned_categories) - } - - actual = processor.attributes_to_dict() - - expected = { - "model_type": "classification", - "regroup": True, - "regroup_name": "Other", - "keep_missing": True, - "category_size_threshold": 5, - "p_value_threshold": 0.001, - "scale_contingency_table": True, - "forced_categories": {}, - "_cleaned_categories_by_column": { - "variable": list(set(cleaned_categories)) - } - } - - assert actual == expected - - @pytest.mark.parametrize("attribute", - ["regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories", - "_cleaned_categories_by_column"]) - def test_set_attributes_from_dict(self, attribute): - - processor = CategoricalDataProcessor() - - cleaned_categories = ["a", "b", "c"] - params = { - "regroup": True, - "regroup_name": "Other", - "keep_missing": True, - "category_size_threshold": 5, - "p_value_threshold": 0.001, - "scale_contingency_table": True, - "forced_categories": {}, - "_cleaned_categories_by_column": { - "variable": cleaned_categories - } - } - - expected = params[attribute] - - if attribute == "_cleaned_categories_by_column": - # list is transformed to a set in CategoricalDataProcessor - expected = {"variable": set(cleaned_categories)} - - processor.set_attributes_from_dict(params) - - actual = getattr(processor, attribute) - - assert actual == expected - - @pytest.mark.parametrize("scale_contingency_table, expected", - [(False, 0.01329), - (True, 0.43437)]) - def test_compute_p_value_classification(self, scale_contingency_table, expected): - - X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) - y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2)) - category = "c1" - - actual = (CategoricalDataProcessor - ._compute_p_value(X, y, category, "classification", scale_contingency_table)) - - assert pytest.approx(actual, abs=1e-5) == expected - - @pytest.mark.parametrize("seed, expected", - [(505, 0.02222), - (603, 0.89230)]) - def test_compute_p_value_regression(self, seed, expected): - - np.random.seed(seed) - - X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) - y = pd.Series(data=np.random.uniform(0, 1, 100)*5) - category = "c1" - - actual = (CategoricalDataProcessor - ._compute_p_value(X, y, category, "regression", None)) - - assert pytest.approx(actual, abs=1e-5) == expected - - def test_get_small_categories(self): - - data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5)) - incidence = 0.35 - threshold = 10 # to make it easy to manualLy compute - expected = {"c3", "c4"} - - actual = (CategoricalDataProcessor - ._get_small_categories(data, incidence, threshold)) - - assert actual == expected - - def test_replace_missings(self): - - data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]}) - expected = pd.DataFrame({"variable": ["c1", "c2", "Missing", "Missing", - "Missing"] - }) - actual = (CategoricalDataProcessor - ._replace_missings(data, ["variable"])) 
- - pd.testing.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize("cleaned_categories, expected", - [({"c1", "c2"}, - pd.Series(data=["c1", "c2", "Other", "Other"])), - ({"c1", "c2", "c3", "c4"}, - pd.Series(data=["c1", "c2", "c3", "c4"]))]) - def test_replace_categories(self, cleaned_categories, expected): - - data = pd.Series(data=["c1", "c2", "c3", "c4"]) - - actual = (CategoricalDataProcessor - ._replace_categories(data, cleaned_categories, 'Other')) - - pd.testing.assert_series_equal(actual, expected) - - def test_all_cats_not_significant(self): - # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C']} - - # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0]} - - discrete_vars = ['categorical_var'] - target_column_name = 'target' - - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) - - categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - p_value_threshold=0.0001) - - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) - - actual = categorical_data_processor.transform(data, - discrete_vars) - - pd.testing.assert_frame_equal(actual, expected) - - def test_regroup_name(self): - # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0], - 'categorical_var_processed': [ - 'A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']} - - # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0]} - - discrete_vars = ['categorical_var'] - target_column_name = 'target' - - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) - - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) - - categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - regroup_name='OTH', - p_value_threshold=0.05) - - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) - - actual = categorical_data_processor.transform(data, - discrete_vars) - - pd.testing.assert_frame_equal(actual, expected) - - def test_force_category(self): - # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C']} - - # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0]} - - discrete_vars = ['categorical_var'] - target_column_name = 'target' - - data = pd.DataFrame(d, columns=['categorical_var', 
'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) - - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) - - categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - forced_categories={'categorical_var': ['C']}, - p_value_threshold=0.05) - - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) - - actual = categorical_data_processor.transform(data, - discrete_vars) - - pd.testing.assert_frame_equal(actual, expected) - - def test_categorical_variable_is_constant(self): - # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A']} - - # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0]} - - discrete_vars = ['categorical_var'] - target_column_name = 'target' - - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) - - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) - - categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - p_value_threshold=0.0001) - - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) - - actual = categorical_data_processor.transform(data, - discrete_vars) - - pd.testing.assert_frame_equal(actual, expected) + +import pytest +import numpy as np +import pandas as pd + +from cobra.preprocessing import CategoricalDataProcessor + +class TestCategoricalDataProcessor: + + def test_attributes_to_dict(self): + + processor = CategoricalDataProcessor() + + cleaned_categories = ["a", "b", "c"] + processor._cleaned_categories_by_column = { + "variable": set(cleaned_categories) + } + + actual = processor.attributes_to_dict() + + expected = { + "model_type": "classification", + "regroup": True, + "regroup_name": "Other", + "keep_missing": True, + "category_size_threshold": 5, + "p_value_threshold": 0.001, + "scale_contingency_table": True, + "forced_categories": {}, + "_cleaned_categories_by_column": { + "variable": list(set(cleaned_categories)) + } + } + + assert actual == expected + + @pytest.mark.parametrize("attribute", + ["regroup", "regroup_name", "keep_missing", + "category_size_threshold", "p_value_threshold", + "scale_contingency_table", "forced_categories", + "_cleaned_categories_by_column"]) + def test_set_attributes_from_dict(self, attribute): + + processor = CategoricalDataProcessor() + + cleaned_categories = ["a", "b", "c"] + params = { + "regroup": True, + "regroup_name": "Other", + "keep_missing": True, + "category_size_threshold": 5, + "p_value_threshold": 0.001, + "scale_contingency_table": True, + "forced_categories": {}, + "_cleaned_categories_by_column": { + "variable": cleaned_categories + } + } + + expected = params[attribute] + + if attribute == "_cleaned_categories_by_column": + # list is transformed to a set in CategoricalDataProcessor + expected = {"variable": set(cleaned_categories)} + + processor.set_attributes_from_dict(params) + + actual = getattr(processor, attribute) + + assert actual == expected + + @pytest.mark.parametrize("scale_contingency_table, expected", + [(False, 0.01329), 
+ (True, 0.43437)]) + def test_compute_p_value_classification(self, scale_contingency_table, expected): + + X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) + y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2)) + category = "c1" + + actual = (CategoricalDataProcessor + ._compute_p_value(X, y, category, "classification", scale_contingency_table)) + + assert pytest.approx(actual, abs=1e-5) == expected + + @pytest.mark.parametrize("seed, expected", + [(505, 0.02222), + (603, 0.89230)]) + def test_compute_p_value_regression(self, seed, expected): + + np.random.seed(seed) + + X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) + y = pd.Series(data=np.random.uniform(0, 1, 100)*5) + category = "c1" + + actual = (CategoricalDataProcessor + ._compute_p_value(X, y, category, "regression", None)) + + assert pytest.approx(actual, abs=1e-5) == expected + + def test_get_small_categories(self): + + data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5)) + incidence = 0.35 + threshold = 10 # to make it easy to manually compute + expected = {"c3", "c4"} + + actual = (CategoricalDataProcessor + ._get_small_categories(data, incidence, threshold)) + + assert actual == expected + + def test_replace_missings(self): + + data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]}) + expected = pd.DataFrame({"variable": ["c1", "c2", "Missing", "Missing", + "Missing"] + }) + actual = (CategoricalDataProcessor + ._replace_missings(data, ["variable"])) + + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("cleaned_categories, expected", + [({"c1", "c2"}, + pd.Series(data=["c1", "c2", "Other", "Other"])), + ({"c1", "c2", "c3", "c4"}, + pd.Series(data=["c1", "c2", "c3", "c4"]))]) + def test_replace_categories(self, cleaned_categories, expected): + + data = pd.Series(data=["c1", "c2", "c3", "c4"]) + + actual = (CategoricalDataProcessor + ._replace_categories(data, cleaned_categories, 'Other')) + + pd.testing.assert_series_equal(actual, expected) + + def test_all_cats_not_significant(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + p_value_threshold=0.0001) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_regroup_name(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0], + 'categorical_var_processed': [ + 'A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C',
'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + regroup_name='OTH', + p_value_threshold=0.05) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_force_category(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + forced_categories={'categorical_var': ['C']}, + p_value_threshold=0.05) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_categorical_variable_is_constant(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + p_value_threshold=0.0001) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index d3a643a..209d74b 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -1,252 +1,252 @@ - -from contextlib import contextmanager -import pytest -import numpy 
as np -import pandas as pd - -from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer - -@contextmanager -def does_not_raise(): - yield - - -class TestKBinsDiscretizer: - - # ---------------- Test for public methods ---------------- - def test_attributes_to_dict(self): - - discretizer = KBinsDiscretizer() - - bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] - discretizer._bins_by_column = {"variable": bins} - - actual = discretizer.attributes_to_dict() - - expected = { - "n_bins": 10, - "strategy": "quantile", - "closed": "right", - "auto_adapt_bins": False, - "starting_precision": 0, - "label_format": "{} - {}", - "change_endpoint_format": False, - "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], - [6.0, 9.0]]} - } - - assert actual == expected - - @pytest.mark.parametrize("attribute", - ["n_bins", "strategy", "closed", - "auto_adapt_bins", "starting_precision", - "label_format", "change_endpoint_format", - "_bins_by_column"]) - def test_set_attributes_from_dict(self, attribute): - - discretizer = KBinsDiscretizer() - - params = { - "n_bins": 5, - "strategy": "uniform", - "closed": "left", - "auto_adapt_bins": True, - "starting_precision": 1, - "label_format": "[,)", - "change_endpoint_format": True, - "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], - [6.0, 9.0]]} - } - - expected = params[attribute] - - if attribute == "_bins_by_column": - # list of list is transformed to a list of tuples - # in KBinsDiscretizer!!! - expected = {"variable": [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]} - - discretizer.set_attributes_from_dict(params) - - actual = getattr(discretizer, attribute) - - assert actual == expected - - # no further tests here as this is just a wrapper around _fit_column! - @pytest.mark.parametrize("strategy, expectation", - [("trees", pytest.raises(ValueError)), - ("quantile", does_not_raise())]) - def test_fit_exception(self, strategy, expectation): - discretizer = KBinsDiscretizer(strategy=strategy) - - data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - - with expectation: - discretizer.fit(data, ["variable"]) - - # no further tests here as this is just a wrapper around _transform_column! 
- @pytest.mark.parametrize("scenario, expectation", - [("raise", pytest.raises(ValueError)), - ("regular_test", does_not_raise()), - ("constant_data", does_not_raise())]) - def test_transform(self, scenario, expectation): - - discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") - - data = pd.DataFrame({"variable": ([1] * 10)}) - expected = data.copy() - - if scenario == "regular_test": - # overwrite data and expected with DataFrame containing - # a non-constant variable - data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - expected = data.copy() - - discretizer.fit(data, ["variable"]) - - categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] - expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 - + ["3.0 - 6.0"]*3 - + ["6.0 - 9.0"]*3 - + ["Missing"], - categories=categories, - ordered=True) - elif scenario == "constant_data": - discretizer.fit(data, ["variable"]) - - with expectation: - actual = discretizer.transform(data, ["variable"]) - pd.testing.assert_frame_equal(actual, expected) - - # ---------------- Test for private methods ---------------- - @pytest.mark.parametrize("n_bins, expectation", - [(1, pytest.raises(ValueError)), - (10.5, pytest.raises(ValueError)), - (2, does_not_raise())]) - def test_validate_n_bins_exception(self, n_bins, expectation): - with expectation: - assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None - - def test_transform_column(self): - - data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") - - bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] - - actual = discretizer._transform_column(data, "variable", bins) - - categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] - - expected = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 - + ["3.0 - 6.0"]*3 - + ["6.0 - 9.0"]*3 - + ["Missing"], - categories=categories, - ordered=True) - - # assert using pandas testing module - pd.testing.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize("n_bins, auto_adapt_bins, data, expected", - [(4, False, - pd.DataFrame({"variable": list(range(0, 11))}), - [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), - (8.0, 10.0)]), - (10, True, - # ints from 0-10 with 17 nan's - pd.DataFrame({"variable": list(range(0, 11)) + - ([np.nan] * 17)}), - [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), - (8.0, 10.0)]), - (10, False, - # almost constant - pd.DataFrame({"variable": [0] + ([1] * 100)}), - None), - (2, False, - pd.DataFrame({"variable": [5.4, 9.3, np.inf]}), - None)], - ids=["regular", "auto_adapt_bins", - "two bin edges", "infs"]) - def test_fit_column(self, n_bins, auto_adapt_bins, data, expected): - discretizer = KBinsDiscretizer(n_bins=n_bins, - auto_adapt_bins=auto_adapt_bins) - - actual = discretizer._fit_column(data, column_name="variable") - - assert actual == expected - - @pytest.mark.parametrize("strategy, n_bins, data, expected", - [("quantile", # strategy - 4, # n_bins - # data (ints from 0 - 10): - pd.DataFrame({"variable": list(range(0, 11))}), - [0.0, 2.5, 5, 7.5, 10.0]), # expected result - ("uniform", # strategy - 3, # n_bins - # data (ints from 0 - 9): - pd.DataFrame({"variable": list(range(0, 10))}), - [0.0, 3.0, 6.0, 9.0])], # expected result - ids=["quantile", "uniform"]) - def test_compute_bin_edges(self, strategy, n_bins, data, expected): - - discretizer = KBinsDiscretizer(strategy=strategy) - - actual = discretizer._compute_bin_edges(data, column_name="variable", - 
n_bins=n_bins, - col_min=data.variable.min(), - col_max=data.variable.max()) - - assert actual == expected - - @pytest.mark.parametrize("bin_edges, starting_precision, expected", - [([-10, 0, 1, 2], 1, 1), - ([-10, 0, 1, 1.01], 0, 2), - ([-10, 0, 1, 1.1], 1, 1), - ([-10, 0, 1, 2], -1, 0), - ([-10, 0, 10, 21], -1, -1)], - ids=["less precision", "more precision", - "equal precision", "negative start", - "round up"]) - def test_compute_minimal_precision_of_bin_edges(self, bin_edges, - starting_precision, - expected): - - discretizer = KBinsDiscretizer(starting_precision=starting_precision) - - actual = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) - - assert actual == expected - - @pytest.mark.parametrize("bin_edges, expected", - [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]), - ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]), - ([np.inf, 0.0, -np.inf], - [(np.inf, 0.0), (0.0, -np.inf)])]) - def test_compute_bins_from_edges(self, bin_edges, expected): - - discretizer = KBinsDiscretizer() - actual = discretizer._compute_bins_from_edges(bin_edges) - - assert actual == expected - - @pytest.mark.parametrize("change_endpoint_format, closed, bins, expected", - [(False, "right", [(0, 1), (1, 2), (2, 3)], - ["0 - 1", "1 - 2", "2 - 3"]), - (True, "right", [(0, 1), (1, 2), (2, 3)], - ["<= 1", "1 - 2", "> 2"]), - (True, "left", [(0, 1), (1, 2), (2, 3)], - ["< 1", "1 - 2", ">= 2"])], - ids=["standard format", "different endpoints", - "different endpoints left"]) - def test_create_bin_labels(self, change_endpoint_format, closed, - bins, expected): - - discretizer = KBinsDiscretizer( - closed=closed, - change_endpoint_format=change_endpoint_format - ) - - actual = discretizer._create_bin_labels(bins) - - assert actual == expected + +from contextlib import contextmanager +import pytest +import numpy as np +import pandas as pd + +from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer + +@contextmanager +def does_not_raise(): + yield + + +class TestKBinsDiscretizer: + + # ---------------- Test for public methods ---------------- + def test_attributes_to_dict(self): + + discretizer = KBinsDiscretizer() + + bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] + discretizer._bins_by_column = {"variable": bins} + + actual = discretizer.attributes_to_dict() + + expected = { + "n_bins": 10, + "strategy": "quantile", + "closed": "right", + "auto_adapt_bins": False, + "starting_precision": 0, + "label_format": "{} - {}", + "change_endpoint_format": False, + "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], + [6.0, 9.0]]} + } + + assert actual == expected + + @pytest.mark.parametrize("attribute", + ["n_bins", "strategy", "closed", + "auto_adapt_bins", "starting_precision", + "label_format", "change_endpoint_format", + "_bins_by_column"]) + def test_set_attributes_from_dict(self, attribute): + + discretizer = KBinsDiscretizer() + + params = { + "n_bins": 5, + "strategy": "uniform", + "closed": "left", + "auto_adapt_bins": True, + "starting_precision": 1, + "label_format": "[,)", + "change_endpoint_format": True, + "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], + [6.0, 9.0]]} + } + + expected = params[attribute] + + if attribute == "_bins_by_column": + # list of list is transformed to a list of tuples + # in KBinsDiscretizer!!! + expected = {"variable": [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]} + + discretizer.set_attributes_from_dict(params) + + actual = getattr(discretizer, attribute) + + assert actual == expected + + # no further tests here as this is just a wrapper around _fit_column! 
+ @pytest.mark.parametrize("strategy, expectation", + [("trees", pytest.raises(ValueError)), + ("quantile", does_not_raise())]) + def test_fit_exception(self, strategy, expectation): + discretizer = KBinsDiscretizer(strategy=strategy) + + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + + with expectation: + discretizer.fit(data, ["variable"]) + + # no further tests here as this is just a wrapper around _transform_column! + @pytest.mark.parametrize("scenario, expectation", + [("raise", pytest.raises(ValueError)), + ("regular_test", does_not_raise()), + ("constant_data", does_not_raise())]) + def test_transform(self, scenario, expectation): + + discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") + + data = pd.DataFrame({"variable": ([1] * 10)}) + expected = data.copy() + + if scenario == "regular_test": + # overwrite data and expected with DataFrame containing + # a non-constant variable + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + expected = data.copy() + + discretizer.fit(data, ["variable"]) + + categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] + expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 + + ["3.0 - 6.0"]*3 + + ["6.0 - 9.0"]*3 + + ["Missing"], + categories=categories, + ordered=True) + elif scenario == "constant_data": + discretizer.fit(data, ["variable"]) + + with expectation: + actual = discretizer.transform(data, ["variable"]) + pd.testing.assert_frame_equal(actual, expected) + + # ---------------- Test for private methods ---------------- + @pytest.mark.parametrize("n_bins, expectation", + [(1, pytest.raises(ValueError)), + (10.5, pytest.raises(ValueError)), + (2, does_not_raise())]) + def test_validate_n_bins_exception(self, n_bins, expectation): + with expectation: + assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None + + def test_transform_column(self): + + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") + + bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] + + actual = discretizer._transform_column(data, "variable", bins) + + categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] + + expected = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 + + ["3.0 - 6.0"]*3 + + ["6.0 - 9.0"]*3 + + ["Missing"], + categories=categories, + ordered=True) + + # assert using pandas testing module + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("n_bins, auto_adapt_bins, data, expected", + [(4, False, + pd.DataFrame({"variable": list(range(0, 11))}), + [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), + (8.0, 10.0)]), + (10, True, + # ints from 0-10 with 17 nan's + pd.DataFrame({"variable": list(range(0, 11)) + + ([np.nan] * 17)}), + [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), + (8.0, 10.0)]), + (10, False, + # almost constant + pd.DataFrame({"variable": [0] + ([1] * 100)}), + None), + (2, False, + pd.DataFrame({"variable": [5.4, 9.3, np.inf]}), + None)], + ids=["regular", "auto_adapt_bins", + "two bin edges", "infs"]) + def test_fit_column(self, n_bins, auto_adapt_bins, data, expected): + discretizer = KBinsDiscretizer(n_bins=n_bins, + auto_adapt_bins=auto_adapt_bins) + + actual = discretizer._fit_column(data, column_name="variable") + + assert actual == expected + + @pytest.mark.parametrize("strategy, n_bins, data, expected", + [("quantile", # strategy + 4, # n_bins + # data (ints from 0 - 10): + pd.DataFrame({"variable": list(range(0, 11))}), + 
[0.0, 2.5, 5, 7.5, 10.0]), # expected result + ("uniform", # strategy + 3, # n_bins + # data (ints from 0 - 9): + pd.DataFrame({"variable": list(range(0, 10))}), + [0.0, 3.0, 6.0, 9.0])], # expected result + ids=["quantile", "uniform"]) + def test_compute_bin_edges(self, strategy, n_bins, data, expected): + + discretizer = KBinsDiscretizer(strategy=strategy) + + actual = discretizer._compute_bin_edges(data, column_name="variable", + n_bins=n_bins, + col_min=data.variable.min(), + col_max=data.variable.max()) + + assert actual == expected + + @pytest.mark.parametrize("bin_edges, starting_precision, expected", + [([-10, 0, 1, 2], 1, 1), + ([-10, 0, 1, 1.01], 0, 2), + ([-10, 0, 1, 1.1], 1, 1), + ([-10, 0, 1, 2], -1, 0), + ([-10, 0, 10, 21], -1, -1)], + ids=["less precision", "more precision", + "equal precision", "negative start", + "round up"]) + def test_compute_minimal_precision_of_bin_edges(self, bin_edges, + starting_precision, + expected): + + discretizer = KBinsDiscretizer(starting_precision=starting_precision) + + actual = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) + + assert actual == expected + + @pytest.mark.parametrize("bin_edges, expected", + [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]), + ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]), + ([np.inf, 0.0, -np.inf], + [(np.inf, 0.0), (0.0, -np.inf)])]) + def test_compute_bins_from_edges(self, bin_edges, expected): + + discretizer = KBinsDiscretizer() + actual = discretizer._compute_bins_from_edges(bin_edges) + + assert actual == expected + + @pytest.mark.parametrize("change_endpoint_format, closed, bins, expected", + [(False, "right", [(0, 1), (1, 2), (2, 3)], + ["0 - 1", "1 - 2", "2 - 3"]), + (True, "right", [(0, 1), (1, 2), (2, 3)], + ["<= 1", "1 - 2", "> 2"]), + (True, "left", [(0, 1), (1, 2), (2, 3)], + ["< 1", "1 - 2", ">= 2"])], + ids=["standard format", "different endpoints", + "different endpoints left"]) + def test_create_bin_labels(self, change_endpoint_format, closed, + bins, expected): + + discretizer = KBinsDiscretizer( + closed=closed, + change_endpoint_format=change_endpoint_format + ) + + actual = discretizer._create_bin_labels(bins) + + assert actual == expected diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 08f5b63..a97a4e4 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -1,398 +1,398 @@ -from contextlib import contextmanager -from typing import Any -from unittest.mock import MagicMock -import pytest -import numpy as np -import pandas as pd -from pytest_mock import MockerFixture - -from cobra.preprocessing.preprocessor import PreProcessor - - -@contextmanager -def does_not_raise(): - yield - - -class TestPreProcessor: - @pytest.mark.parametrize( - "train_prop, selection_prop, validation_prop, " "expected_sizes", - [ - (0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}), - (0.7, 0.3, 0.0, {"train": 7, "selection": 3}), - # Error "The sum of train_prop, selection_prop and - # validation_prop must be 1.0." 
should not be - # raised: - (0.7, 0.2, 0.1, {"train": 7, "selection": 2, "validation": 1}), - ], - ) - def test_train_selection_validation_split( - self, - train_prop: float, - selection_prop: float, - validation_prop: float, - expected_sizes: dict, - ): - X = np.arange(100).reshape(10, 10) - data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)]) - data.loc[:, "target"] = np.array([0] * 7 + [1] * 3) - - actual = PreProcessor.train_selection_validation_split( - data, train_prop, selection_prop, validation_prop - ) - - # check for the output schema - assert list(actual.columns) == list(data.columns) - - # check that total size of input & output is the same! - assert len(actual.index) == len(data.index) - - # check for the output sizes per split - actual_sizes = actual.groupby("split").size().to_dict() - - assert actual_sizes == expected_sizes - - def test_train_selection_validation_split_error_wrong_prop(self): - - error_msg = ( - "The sum of train_prop, selection_prop and " "validation_prop must be 1.0." - ) - train_prop = 0.7 - selection_prop = 0.3 - - self._test_train_selection_validation_split_error( - train_prop, selection_prop, error_msg - ) - - def test_train_selection_validation_split_error_zero_selection_prop(self): - - error_msg = "selection_prop cannot be zero!" - train_prop = 0.9 - selection_prop = 0.0 - - self._test_train_selection_validation_split_error( - train_prop, selection_prop, error_msg - ) - - def _test_train_selection_validation_split_error( - self, train_prop: float, selection_prop: float, error_msg: str - ): - df = pd.DataFrame() - with pytest.raises(ValueError, match=error_msg): - ( - PreProcessor.train_selection_validation_split( - df, - train_prop=train_prop, - selection_prop=selection_prop, - validation_prop=0.1, - ) - ) - - @pytest.mark.parametrize( - "injection_location, expected", - [ - (None, True), - ("categorical_data_processor", False), - ("discretizer", False), - ("target_encoder", False), - ], - ) - def test_is_valid_pipeline(self, injection_location: str, expected: bool): - - # is_valid_pipeline only checks for relevant keys atm - pipeline_dict = { - "categorical_data_processor": { - "model_type": None, - "regroup": None, - "regroup_name": None, - "keep_missing": None, - "category_size_threshold": None, - "p_value_threshold": None, - "scale_contingency_table": None, - "forced_categories": None, - }, - "discretizer": { - "n_bins": None, - "strategy": None, - "closed": None, - "auto_adapt_bins": None, - "starting_precision": None, - "label_format": None, - "change_endpoint_format": None, - }, - "target_encoder": { - "weight": None, - "imputation_strategy": None, - }, - } - - if injection_location: - pipeline_dict[injection_location]["wrong_key"] = None - - actual = PreProcessor._is_valid_pipeline(pipeline_dict) - - assert actual == expected - - @pytest.mark.parametrize( - ("continuous_vars, discrete_vars, expectation, " "expected"), - [ - ([], [], pytest.raises(ValueError), None), - ( - ["c1", "c2"], - ["d1", "d2"], - does_not_raise(), - ["d1_processed", "d2_processed", "c1_bin", "c2_bin"], - ), - (["c1", "c2"], [], does_not_raise(), ["c1_bin", "c2_bin"]), - ([], ["d1", "d2"], does_not_raise(), ["d1_processed", "d2_processed"]), - ], - ) - def test_get_variable_list( - self, - continuous_vars: list, - discrete_vars: list, - expectation: Any, - expected: list, - ): - - with expectation: - actual = PreProcessor._get_variable_list(continuous_vars, discrete_vars) - - assert actual == expected - - @pytest.mark.parametrize( - ("input, expected"), - [ - 
# example 1 - ( - pd.DataFrame({ - "ID": list(range(20)), - "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], - "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, - "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], - "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 - } - ), - pd.DataFrame({ - 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], - 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], - 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], - 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], - 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], - 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] - } - ), - ) - ] - ) - def test_fit_transform_without_id_col_name(self, input, expected): - - preprocessor = PreProcessor.from_params(model_type="classification") - - continuous_vars, discrete_vars = preprocessor.get_continuous_and_discrete_columns(input, "ID","Target") - - calculated = preprocessor.fit_transform( - input, - continuous_vars=continuous_vars, - discrete_vars=discrete_vars, - target_column_name="Target" - ) - pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) - - @pytest.mark.parametrize( - ("input, expected"), - [ - # example 1 - ( - pd.DataFrame({ - "ID": list(range(20)), - "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], - "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, - "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], - "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 - } - ), - pd.DataFrame({ - 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], - 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], - 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], - 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], - 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], - 
'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] - } - ), - ) - ] - ) - def test_fit_transform_with_id_col_name(self, input, expected): - - preprocessor = PreProcessor.from_params(model_type="classification") - - # continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target") - - calculated = preprocessor.fit_transform( - input, - continuous_vars=None, - discrete_vars=None, - target_column_name="Target", - id_col_name="ID" - ) - pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) - - @staticmethod - def mock_transform(df: pd.DataFrame, args): - """Mock the transform method.""" - df["new_column"] = "Hello World" - return df - - def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): - """Test if the train_data input is not changed when performing fit_transform.""" - train_data = pd.DataFrame( - [[1, "2", 3], [10, "20", 30], [100, "200", 300]], - columns=["foo", "bar", "baz"], - ) - preprocessor = PreProcessor.from_params( - model_type="classification", n_bins=10, weight=0.8 - ) - preprocessor._categorical_data_processor = MagicMock() - preprocessor._categorical_data_processor.transform = self.mock_transform - preprocessor._discretizer = MagicMock() - preprocessor._discretizer.transform = self.mock_transform - preprocessor._target_encoder = MagicMock() - preprocessor._target_encoder.transform = self.mock_transform - - result = preprocessor.fit_transform( - train_data, - continuous_vars=["foo"], - discrete_vars=["bar"], - target_column_name=["baz"], - ) - assert "new_column" not in train_data.columns - assert "new_column" in result.columns - - @pytest.mark.parametrize( - ("input, expected"), - [ - # example 1 - ( - pd.DataFrame( - { - "a": [1, 8, np.nan], - "b": [np.nan, 8, np.nan], - "c": [np.nan, np.nan, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - "f": [np.nan, np.nan, np.nan], - } - ), - pd.DataFrame( - { - "a": [1.0, 8.0, np.nan], - "b": [np.nan, 8.0, np.nan], - "d": [np.nan, np.nan, 5.0], - "e": [1.0, 960.0, np.nan], - } - ), - ), - # example 2 - ( - pd.DataFrame( - { - "a": [1, 8, np.nan], - "b": [np.nan, 8, np.nan], - "c": [np.nan, np.nan, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - } - ), - pd.DataFrame( - { - "a": [1.0, 8.0, np.nan], - "b": [np.nan, 8.0, np.nan], - "d": [np.nan, np.nan, 5.0], - "e": [1.0, 960.0, np.nan], - } - ), - ), - # example 3 - ( - pd.DataFrame( - { - "a": [1, 8, np.nan], - "b": [np.nan, 8, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - } - ), - pd.DataFrame( - { - "a": [1.0, 8.0, np.nan], - "b": [np.nan, 8.0, np.nan], - "d": [np.nan, np.nan, 5.0], - "e": [1.0, 960.0, np.nan], - } - ), - ), - # example 4 categorical - ( - pd.DataFrame( - { - "a": [1, 8, np.nan], - "b": [np.nan, np.nan, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - "category_1": ["A", "A", "B"], - "category_2": [np.nan, "A", "B"], - "category_3": [np.nan, np.nan, np.nan], - }, - ).astype( - { - "a": np.float64(), - "b": np.float64(), - "d": np.float64(), - "e": np.float64(), - "category_1": pd.CategoricalDtype(), - "category_2": pd.CategoricalDtype(), - "category_3": pd.CategoricalDtype(), - } - ), - pd.DataFrame( - { - "a": [1, 8, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - "category_1": ["A", "A", "B"], - "category_2": [np.nan, "A", "B"], - } - ).astype( - { - "a": np.float64(), - "d": np.float64(), - "e": 
np.float64(), - "category_1": pd.CategoricalDtype(), - "category_2": pd.CategoricalDtype(), - } - ), - ), - ], - ) - def test_drops_columns_containing_only_nan(self, input, expected): - - print(input) - output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan( - input - ) - - print(output) - print(expected) - assert output.equals(expected) +from contextlib import contextmanager +from typing import Any +from unittest.mock import MagicMock +import pytest +import numpy as np +import pandas as pd +from pytest_mock import MockerFixture + +from cobra.preprocessing.preprocessor import PreProcessor + + +@contextmanager +def does_not_raise(): + yield + + +class TestPreProcessor: + @pytest.mark.parametrize( + "train_prop, selection_prop, validation_prop, " "expected_sizes", + [ + (0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}), + (0.7, 0.3, 0.0, {"train": 7, "selection": 3}), + # Error "The sum of train_prop, selection_prop and + # validation_prop must be 1.0." should not be + # raised: + (0.7, 0.2, 0.1, {"train": 7, "selection": 2, "validation": 1}), + ], + ) + def test_train_selection_validation_split( + self, + train_prop: float, + selection_prop: float, + validation_prop: float, + expected_sizes: dict, + ): + X = np.arange(100).reshape(10, 10) + data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)]) + data.loc[:, "target"] = np.array([0] * 7 + [1] * 3) + + actual = PreProcessor.train_selection_validation_split( + data, train_prop, selection_prop, validation_prop + ) + + # check for the output schema + assert list(actual.columns) == list(data.columns) + + # check that total size of input & output is the same! + assert len(actual.index) == len(data.index) + + # check for the output sizes per split + actual_sizes = actual.groupby("split").size().to_dict() + + assert actual_sizes == expected_sizes + + def test_train_selection_validation_split_error_wrong_prop(self): + + error_msg = ( + "The sum of train_prop, selection_prop and " "validation_prop must be 1.0." + ) + train_prop = 0.7 + selection_prop = 0.3 + + self._test_train_selection_validation_split_error( + train_prop, selection_prop, error_msg + ) + + def test_train_selection_validation_split_error_zero_selection_prop(self): + + error_msg = "selection_prop cannot be zero!" 
+ train_prop = 0.9 + selection_prop = 0.0 + + self._test_train_selection_validation_split_error( + train_prop, selection_prop, error_msg + ) + + def _test_train_selection_validation_split_error( + self, train_prop: float, selection_prop: float, error_msg: str + ): + df = pd.DataFrame() + with pytest.raises(ValueError, match=error_msg): + ( + PreProcessor.train_selection_validation_split( + df, + train_prop=train_prop, + selection_prop=selection_prop, + validation_prop=0.1, + ) + ) + + @pytest.mark.parametrize( + "injection_location, expected", + [ + (None, True), + ("categorical_data_processor", False), + ("discretizer", False), + ("target_encoder", False), + ], + ) + def test_is_valid_pipeline(self, injection_location: str, expected: bool): + + # is_valid_pipeline only checks for relevant keys atm + pipeline_dict = { + "categorical_data_processor": { + "model_type": None, + "regroup": None, + "regroup_name": None, + "keep_missing": None, + "category_size_threshold": None, + "p_value_threshold": None, + "scale_contingency_table": None, + "forced_categories": None, + }, + "discretizer": { + "n_bins": None, + "strategy": None, + "closed": None, + "auto_adapt_bins": None, + "starting_precision": None, + "label_format": None, + "change_endpoint_format": None, + }, + "target_encoder": { + "weight": None, + "imputation_strategy": None, + }, + } + + if injection_location: + pipeline_dict[injection_location]["wrong_key"] = None + + actual = PreProcessor._is_valid_pipeline(pipeline_dict) + + assert actual == expected + + @pytest.mark.parametrize( + ("continuous_vars, discrete_vars, expectation, " "expected"), + [ + ([], [], pytest.raises(ValueError), None), + ( + ["c1", "c2"], + ["d1", "d2"], + does_not_raise(), + ["d1_processed", "d2_processed", "c1_bin", "c2_bin"], + ), + (["c1", "c2"], [], does_not_raise(), ["c1_bin", "c2_bin"]), + ([], ["d1", "d2"], does_not_raise(), ["d1_processed", "d2_processed"]), + ], + ) + def test_get_variable_list( + self, + continuous_vars: list, + discrete_vars: list, + expectation: Any, + expected: list, + ): + + with expectation: + actual = PreProcessor._get_variable_list(continuous_vars, discrete_vars) + + assert actual == expected + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame({ + "ID": list(range(20)), + "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], + "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, + "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], + "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 + } + ), + pd.DataFrame({ + 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], + 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], + 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], + 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B_enc': 
[0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], + 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], + 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] + } + ), + ) + ] + ) + def test_fit_transform_without_id_col_name(self, input, expected): + + preprocessor = PreProcessor.from_params(model_type="classification") + + continuous_vars, discrete_vars = preprocessor.get_continuous_and_discrete_columns(input, "ID","Target") + + calculated = preprocessor.fit_transform( + input, + continuous_vars=continuous_vars, + discrete_vars=discrete_vars, + target_column_name="Target" + ) + pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame({ + "ID": list(range(20)), + "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], + "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, + "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], + "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 + } + ), + pd.DataFrame({ + 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], + 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], + 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], + 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], + 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], + 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] + } + ), + ) + ] + ) + def test_fit_transform_with_id_col_name(self, input, expected): + + preprocessor = PreProcessor.from_params(model_type="classification") + + # continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target") + + calculated = preprocessor.fit_transform( + input, + continuous_vars=None, + discrete_vars=None, + target_column_name="Target", + id_col_name="ID" + ) + pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) + + @staticmethod + def mock_transform(df: pd.DataFrame, args): + """Mock the transform method.""" + df["new_column"] = "Hello World" + return df + + def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): + """Test if the train_data input is not changed when performing fit_transform.""" + train_data = pd.DataFrame( + [[1, "2", 3], [10, "20", 30], [100, "200", 300]], + columns=["foo", "bar", "baz"], + ) + preprocessor = PreProcessor.from_params( + model_type="classification", n_bins=10, weight=0.8 + ) + preprocessor._categorical_data_processor = MagicMock() + 
preprocessor._categorical_data_processor.transform = self.mock_transform + preprocessor._discretizer = MagicMock() + preprocessor._discretizer.transform = self.mock_transform + preprocessor._target_encoder = MagicMock() + preprocessor._target_encoder.transform = self.mock_transform + + result = preprocessor.fit_transform( + train_data, + continuous_vars=["foo"], + discrete_vars=["bar"], + target_column_name=["baz"], + ) + assert "new_column" not in train_data.columns + assert "new_column" in result.columns + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "c": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "f": [np.nan, np.nan, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 2 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "c": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 3 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 4 categorical + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "category_1": ["A", "A", "B"], + "category_2": [np.nan, "A", "B"], + "category_3": [np.nan, np.nan, np.nan], + }, + ).astype( + { + "a": np.float64(), + "b": np.float64(), + "d": np.float64(), + "e": np.float64(), + "category_1": pd.CategoricalDtype(), + "category_2": pd.CategoricalDtype(), + "category_3": pd.CategoricalDtype(), + } + ), + pd.DataFrame( + { + "a": [1, 8, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "category_1": ["A", "A", "B"], + "category_2": [np.nan, "A", "B"], + } + ).astype( + { + "a": np.float64(), + "d": np.float64(), + "e": np.float64(), + "category_1": pd.CategoricalDtype(), + "category_2": pd.CategoricalDtype(), + } + ), + ), + ], + ) + def test_drops_columns_containing_only_nan(self, input, expected): + + print(input) + output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan( + input + ) + + print(output) + print(expected) + assert output.equals(expected) diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index 51ebd79..e03992c 100644 --- a/tests/preprocessing/test_target_encoder.py +++ b/tests/preprocessing/test_target_encoder.py @@ -1,342 +1,342 @@ - -import pytest -import pandas as pd -from sklearn.exceptions import NotFittedError - -from cobra.preprocessing.target_encoder import TargetEncoder - -class TestTargetEncoder: - - def test_target_encoder_constructor_weight_value_error(self): - with pytest.raises(ValueError): - TargetEncoder(weight=-1) - - def test_target_encoder_constructor_imputation_value_error(self): - with pytest.raises(ValueError): - TargetEncoder(imputation_strategy="median") - - # Tests for attributes_attributes_to_dict and set_attributes_from_dict - def test_target_encoder_attributes_to_dict(self): - encoder = TargetEncoder() - - mapping_data = 
pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) - mapping_data.index.name = "variable" - - encoder._mapping["variable"] = mapping_data - - encoder._global_mean = 0.5 - - actual = encoder.attributes_to_dict() - - expected = {"weight": 0.0, - "imputation_strategy": "mean", - "_global_mean": 0.5, - "_mapping": {"variable": { - "negative": 0.333333, - "neutral": 0.50000, - "positive": 0.666667 - }}} - - assert actual == expected - - @pytest.mark.parametrize("attribute", - ["weight", "mapping"], - ids=["test_weight", "test_mapping"]) - def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): - encoder = TargetEncoder() - - data = {"weight": 1.0} - encoder.set_attributes_from_dict(data) - - if attribute == "weight": - actual = encoder.weight - expected = 1.0 - - assert expected == actual - elif attribute == "mapping": - actual = encoder._mapping - expected = {} - - assert expected == actual - - def test_target_encoder_set_attributes_from_dict(self): - encoder = TargetEncoder() - - data = {"weight": 0.0, - "_global_mean": 0.5, - "_mapping": {"variable": { - "negative": 0.333333, - "neutral": 0.50000, - "positive": 0.666667 - }}} - - encoder.set_attributes_from_dict(data) - - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - actual = encoder._mapping["variable"] - - pd.testing.assert_series_equal(actual, expected) - - # Tests for _fit_column: - def test_target_encoder_fit_column_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - encoder = TargetEncoder() - encoder._global_mean = 0.5 - actual = encoder._fit_column(X=df.variable, y=df.target) - - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - pd.testing.assert_series_equal(actual, expected) - - def test_target_encoder_fit_column_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - encoder = TargetEncoder() - encoder._global_mean = 0.454545 - actual = encoder._fit_column(X=df.variable, y=df.target) - - expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - pd.testing.assert_series_equal(actual, expected) - - def test_target_encoder_fit_column_global_mean_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - encoder = TargetEncoder(weight=1) - encoder._global_mean = df.target.sum() / df.target.count() # is 0.5 - - actual = encoder._fit_column(X=df.variable, y=df.target) - - expected = pd.Series(data=[0.375, 0.500, 0.625], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - pd.testing.assert_series_equal(actual, expected) - - def test_target_encoder_fit_column_global_mean_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 
'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - encoder = TargetEncoder(weight=1) - encoder._global_mean = 0.454545 - - actual = encoder._fit_column(X=df.variable, y=df.target) - - # expected new value: - # [count of the value * its mean encoding + weight (= 1) * global mean] - # / [count of the value + weight (=1)]. - expected = pd.Series(data=[(3 * -4.666667 + 1 * 0.454545) / (3 + 1), - (4 * 0.250000 + 1 * 0.454545) / (4 + 1), - (4 * 4.500000 + 1 * 0.454545) / (4 + 1)], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - pd.testing.assert_series_equal(actual, expected) - - # Tests for fit method - def test_target_encoder_fit_binary_classification(self): - # test_target_encoder_fit_column_linear_regression() tested on one - # column input as a numpy series; this test runs on a dataframe input. - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - encoder = TargetEncoder() - encoder.fit(data=df, column_names=["variable"], target_column="target") - - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - actual = encoder._mapping["variable"] - - pd.testing.assert_series_equal(actual, expected) - - def test_target_encoder_fit_linear_regression(self): - # test_target_encoder_fit_column_linear_regression() tested on one - # column input as a numpy series; this test runs on a dataframe input. - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - encoder = TargetEncoder() - encoder.fit(data=df, column_names=["variable"], target_column="target") - - expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - actual = encoder._mapping["variable"] - - pd.testing.assert_series_equal(actual, expected) - - # Tests for transform method - def test_target_encoder_transform_when_not_fitted(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - - encoder = TargetEncoder() - with pytest.raises(NotFittedError): - encoder.transform(data=df, column_names=["variable"]) - - def test_target_encoder_transform_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - - expected = df.copy() - expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, - 0.333333, 0.666667, 0.333333, 0.50000, - 0.50000, 0.50000] - - encoder = TargetEncoder() - encoder.fit(data=df, column_names=["variable"], target_column="target") - actual = encoder.transform(data=df, column_names=["variable"]) - - pd.testing.assert_frame_equal(actual, expected) - - def test_target_encoder_transform_linear_regression(self): - df = pd.DataFrame({'variable': 
['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - - expected = df.copy() - expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, - -4.666667, 4.500000, -4.666667, 0.250000, - 0.250000, 0.250000, 4.500000] - - encoder = TargetEncoder() - encoder.fit(data=df, column_names=["variable"], target_column="target") - actual = encoder.transform(data=df, column_names=["variable"]) - - pd.testing.assert_frame_equal(actual, expected) - - def test_target_encoder_transform_new_category_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - df_appended = df.append({"variable": "new", "target": 1}, - ignore_index=True) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - df_appended["variable"] = df_appended["variable"].astype("category") - - expected = df_appended.copy() - expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, - 0.333333, 0.666667, 0.333333, 0.50000, - 0.50000, 0.50000, 0.333333] - - encoder = TargetEncoder(imputation_strategy="min") - encoder.fit(data=df, column_names=["variable"], target_column="target") - actual = encoder.transform(data=df_appended, column_names=["variable"]) - - pd.testing.assert_frame_equal(actual, expected) - - def test_target_encoder_transform_new_category_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - df_appended = df.append({"variable": "new", "target": 10}, - ignore_index=True) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - df_appended["variable"] = df_appended["variable"].astype("category") - - expected = df_appended.copy() - expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, - -4.666667, 4.500000, -4.666667, 0.250000, - 0.250000, 0.250000, 4.500000, - -4.666667] # min imputation for new value - - encoder = TargetEncoder(imputation_strategy="min") - encoder.fit(data=df, column_names=["variable"], target_column="target") - actual = encoder.transform(data=df_appended, column_names=["variable"]) - - pd.testing.assert_frame_equal(actual, expected) - - # Tests for _clean_column_name: - def test_target_encoder_clean_column_name_binned_column(self): - column_name = "test_column_bin" - expected = "test_column_enc" - - encoder = TargetEncoder() - actual = encoder._clean_column_name(column_name) - - assert actual == expected - - def test_target_encoder_clean_column_name_processed_column(self): - column_name = "test_column_processed" - expected = "test_column_enc" - - encoder = TargetEncoder() - actual = encoder._clean_column_name(column_name) - - assert actual == expected - - def test_target_encoder_clean_column_name_cleaned_column(self): - column_name = "test_column_cleaned" - expected = "test_column_enc" - - encoder = TargetEncoder() - actual = encoder._clean_column_name(column_name) - - assert actual == expected - - def test_target_encoder_clean_column_other_name(self): - column_name = 
"test_column" - expected = "test_column_enc" - - encoder = TargetEncoder() - actual = encoder._clean_column_name(column_name) - - assert actual == expected + +import pytest +import pandas as pd +from sklearn.exceptions import NotFittedError + +from cobra.preprocessing.target_encoder import TargetEncoder + +class TestTargetEncoder: + + def test_target_encoder_constructor_weight_value_error(self): + with pytest.raises(ValueError): + TargetEncoder(weight=-1) + + def test_target_encoder_constructor_imputation_value_error(self): + with pytest.raises(ValueError): + TargetEncoder(imputation_strategy="median") + + # Tests for attributes_attributes_to_dict and set_attributes_from_dict + def test_target_encoder_attributes_to_dict(self): + encoder = TargetEncoder() + + mapping_data = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + mapping_data.index.name = "variable" + + encoder._mapping["variable"] = mapping_data + + encoder._global_mean = 0.5 + + actual = encoder.attributes_to_dict() + + expected = {"weight": 0.0, + "imputation_strategy": "mean", + "_global_mean": 0.5, + "_mapping": {"variable": { + "negative": 0.333333, + "neutral": 0.50000, + "positive": 0.666667 + }}} + + assert actual == expected + + @pytest.mark.parametrize("attribute", + ["weight", "mapping"], + ids=["test_weight", "test_mapping"]) + def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): + encoder = TargetEncoder() + + data = {"weight": 1.0} + encoder.set_attributes_from_dict(data) + + if attribute == "weight": + actual = encoder.weight + expected = 1.0 + + assert expected == actual + elif attribute == "mapping": + actual = encoder._mapping + expected = {} + + assert expected == actual + + def test_target_encoder_set_attributes_from_dict(self): + encoder = TargetEncoder() + + data = {"weight": 0.0, + "_global_mean": 0.5, + "_mapping": {"variable": { + "negative": 0.333333, + "neutral": 0.50000, + "positive": 0.666667 + }}} + + encoder.set_attributes_from_dict(data) + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected) + + # Tests for _fit_column: + def test_target_encoder_fit_column_binary_classification(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + encoder = TargetEncoder() + encoder._global_mean = 0.5 + actual = encoder._fit_column(X=df.variable, y=df.target) + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + + def test_target_encoder_fit_column_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder() + encoder._global_mean = 0.454545 + actual = encoder._fit_column(X=df.variable, y=df.target) + + expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + + def 
test_target_encoder_fit_column_global_mean_binary_classification(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + encoder = TargetEncoder(weight=1) + encoder._global_mean = df.target.sum() / df.target.count() # is 0.5 + + actual = encoder._fit_column(X=df.variable, y=df.target) + + expected = pd.Series(data=[0.375, 0.500, 0.625], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + + def test_target_encoder_fit_column_global_mean_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder(weight=1) + encoder._global_mean = 0.454545 + + actual = encoder._fit_column(X=df.variable, y=df.target) + + # expected new value: + # [count of the value * its mean encoding + weight (= 1) * global mean] + # / [count of the value + weight (=1)]. + expected = pd.Series(data=[(3 * -4.666667 + 1 * 0.454545) / (3 + 1), + (4 * 0.250000 + 1 * 0.454545) / (4 + 1), + (4 * 4.500000 + 1 * 0.454545) / (4 + 1)], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + + # Tests for fit method + def test_target_encoder_fit_binary_classification(self): + # test_target_encoder_fit_column_linear_regression() tested on one + # column input as a numpy series; this test runs on a dataframe input. + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected) + + def test_target_encoder_fit_linear_regression(self): + # test_target_encoder_fit_column_linear_regression() tested on one + # column input as a numpy series; this test runs on a dataframe input. 
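+        # With the default weight of 0 there is no smoothing, so each expected
+        # value is the plain per-category target mean, e.g. for "negative":
+        # (-5 - 4 - 5) / 3 = -4.666667.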
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + + expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected) + + # Tests for transform method + def test_target_encoder_transform_when_not_fitted(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + + encoder = TargetEncoder() + with pytest.raises(NotFittedError): + encoder.transform(data=df, column_names=["variable"]) + + def test_target_encoder_transform_binary_classification(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + + expected = df.copy() + expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, + 0.333333, 0.666667, 0.333333, 0.50000, + 0.50000, 0.50000] + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected) + + def test_target_encoder_transform_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + + expected = df.copy() + expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, + -4.666667, 4.500000, -4.666667, 0.250000, + 0.250000, 0.250000, 4.500000] + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected) + + def test_target_encoder_transform_new_category_binary_classification(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + df_appended = df.append({"variable": "new", "target": 1}, + ignore_index=True) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + df_appended["variable"] = df_appended["variable"].astype("category") + + expected = df_appended.copy() + expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, + 0.333333, 0.666667, 0.333333, 0.50000, + 0.50000, 0.50000, 0.333333] + + encoder = TargetEncoder(imputation_strategy="min") + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df_appended, column_names=["variable"]) + + 
pd.testing.assert_frame_equal(actual, expected) + + def test_target_encoder_transform_new_category_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + df_appended = df.append({"variable": "new", "target": 10}, + ignore_index=True) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + df_appended["variable"] = df_appended["variable"].astype("category") + + expected = df_appended.copy() + expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, + -4.666667, 4.500000, -4.666667, 0.250000, + 0.250000, 0.250000, 4.500000, + -4.666667] # min imputation for new value + + encoder = TargetEncoder(imputation_strategy="min") + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df_appended, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected) + + # Tests for _clean_column_name: + def test_target_encoder_clean_column_name_binned_column(self): + column_name = "test_column_bin" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_name_processed_column(self): + column_name = "test_column_processed" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_name_cleaned_column(self): + column_name = "test_column_cleaned" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_other_name(self): + column_name = "test_column" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected From e1248549e70505f37ccce4ddb44eb83f1a5ffee9 Mon Sep 17 00:00:00 2001 From: joostneuj <91886694+joostneuj@users.noreply.github.com> Date: Fri, 16 Jun 2023 18:15:38 +0200 Subject: [PATCH 2/4] #143 deleted notebook --- notebooks/debugging.ipynb | 1364 ------------------------------------- 1 file changed, 1364 deletions(-) delete mode 100644 notebooks/debugging.ipynb diff --git a/notebooks/debugging.ipynb b/notebooks/debugging.ipynb deleted file mode 100644 index 5dd573e..0000000 --- a/notebooks/debugging.ipynb +++ /dev/null @@ -1,1364 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 464, - "id": "23482fd8-b4c1-48f5-8c30-a0e79f7667b3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%reload_ext autoreload" - ] - }, - { - "cell_type": "code", - "execution_count": 465, - "id": "da551dc3-ffba-45e0-b87d-7b626a622b08", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.insert(0, r\"C:/projects/cobra\")" - ] - }, - { - "cell_type": "code", - "execution_count": 488, - "id": "7d2678fa-eb47-4cb5-ad1d-c5034a742f55", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import random\n", - "from cobra.preprocessing import PreProcessor\n", - "\n", - "# custom imports\n", - "from cobra.preprocessing import CategoricalDataProcessor\n", - "from cobra.preprocessing import KBinsDiscretizer\n", - "from cobra.preprocessing import TargetEncoder\n", - "import json\n" - ] - }, - { - "cell_type": "markdown", - "id": "d4d341ec-b5c3-4b00-a54f-c5b6565d2631", - "metadata": {}, - "source": [ - "### 1. Generate data" - ] - }, - { - "cell_type": "code", - "execution_count": 467, - "id": "a9563643-308b-4c6c-b358-9cbf93a0666d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "size = 5000\n", - "\n", - "# Create datetime column\n", - "dates = pd.date_range('2022-01-01', periods=size, freq='D')\n", - "\n", - "# Create categorical variables\n", - "category_values = ['Category A', 'Category B', 'Category C']\n", - "cat_var1 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", - "cat_var2 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", - "cat_var3 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", - "\n", - "# Create continuous variables with different scales and distributions\n", - "cont_var1 = pd.Series(np.random.normal(loc=0, scale=1, size=size), name='cont_var1')\n", - "cont_var2 = pd.Series(np.random.uniform(low=0, high=10, size=size), name='cont_var2')\n", - "cont_var3 = pd.Series(np.random.exponential(scale=1, size=size), name='cont_var3')\n", - "\n", - "# Create target variable\n", - "target = pd.Series(np.random.randint(2, size=size))\n", - "\n", - "# Combine into a DataFrame\n", - "df = pd.DataFrame({'DateTime': dates, 'CategoryVar1': cat_var1,\n", - " 'CategoryVar2': cat_var2, 'CategoryVar3': cat_var3,\n", - " 'cont_var1': cont_var1, 'cont_var2': cont_var2, 'cont_var3': cont_var3,\n", - " 'target': target})" - ] - }, - { - "cell_type": "code", - "execution_count": 468, - "id": "bde9235f-dc62-433d-b3d3-6bf37b2ddb52", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "DateTime datetime64[ns]\n", - "CategoryVar1 category\n", - "CategoryVar2 category\n", - "CategoryVar3 category\n", - "cont_var1 float64\n", - "cont_var2 float64\n", - "cont_var3 float64\n", - "target int32\n", - "dtype: object" - ] - }, - "execution_count": 468, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 469, - "id": "d774e959-73f4-40b4-bc20-43c3af99e593", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3target
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181
\n", - "
" - ], - "text/plain": [ - " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", - "\n", - " cont_var3 target \n", - "0 1.372659 0 \n", - "1 0.635924 1 \n", - "2 0.098091 1 \n", - "3 0.179868 0 \n", - "4 0.966818 1 " - ] - }, - "execution_count": 469, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 470, - "id": "e9c06e3a-188f-4cdc-b9cd-51d3db63e5ff", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['DateTime', 'CategoryVar1', 'CategoryVar2', 'CategoryVar3', 'cont_var1',\n", - " 'cont_var2', 'cont_var3', 'target'],\n", - " dtype='object')" - ] - }, - "execution_count": 470, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.columns" - ] - }, - { - "cell_type": "markdown", - "id": "9aae8c98-434b-4c71-abb1-29fa6d143895", - "metadata": {}, - "source": [ - "### 2. Fit preprocessor" - ] - }, - { - "cell_type": "code", - "execution_count": 521, - "id": "a32560d4-b5fe-4b90-9ea6-ede7915bba05", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "continuous_vars = ['cont_var2', 'cont_var3', 'cont_var1']\n", - "discrete_vars= ['CategoryVar1', 'CategoryVar2', 'CategoryVar3'] #, 'DateTime'] [] \n", - "target_col = \"target\"" - ] - }, - { - "cell_type": "code", - "execution_count": 522, - "id": "d6f1e21a-4a6e-4ad7-9faf-b36e6daff707", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. 
Increase the weight if needed.\n" - ] - } - ], - "source": [ - "model_type = \"classification\"\n", - "\n", - "# using all Cobra's default parameters for preprocessing here\n", - "preprocessor = PreProcessor.from_params(\n", - " model_type=model_type\n", - ")\n", - "\n", - "random.seed(1212)\n", - "basetable = preprocessor.train_selection_validation_split(data=df,\n", - " train_prop=0.6,\n", - " selection_prop=0.25,\n", - " validation_prop=0.15)" - ] - }, - { - "cell_type": "code", - "execution_count": 523, - "id": "7b673619-4eda-4aca-acd5-a125f80d3b20", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Starting to fit pipeline\n", - "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 507.38it/s]\n", - "Fitting KBinsDiscretizer took 0.006914615631103516 seconds\n", - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 240.62it/s]\n", - "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 29.42it/s]\n", - "Fitting categorical_data_processor class took 0.10196375846862793 seconds\n", - "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 558.52it/s]\n", - "Fitting TargetEncoder took 0.013732433319091797 seconds\n", - "Fitting pipeline took 0.17300176620483398 seconds\n" - ] - } - ], - "source": [ - "preprocessor.fit(basetable[basetable[\"split\"]==\"train\"],\n", - " continuous_vars=continuous_vars,\n", - " discrete_vars = discrete_vars,\n", - " target_column_name=target_col)" - ] - }, - { - "cell_type": "code", - "execution_count": 524, - "id": "c9e2c79d-c0bc-464d-b869-f8115ac67776", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 160.70it/s]\n", - "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 697.13it/s]\n", - "Transforming data took 0.0610198974609375 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590selection4.0 - 5.0...-1.3 - -0.8Category CCategory BCategory A0.5042740.4958850.5148720.4673910.4868910.523364
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241train9.0 - 10.0...0.2 - 0.5Category CCategory CCategory B0.5042740.4879520.4910000.4740480.5243550.492997
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911train7.0 - 8.0...-0.5 - -0.2Category BCategory BCategory C0.4733670.4958850.4653660.4902600.4942970.433225
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680selection0.0 - 1.0...-1.3 - -0.8Category CCategory BCategory C0.5042740.4958850.4653660.4754100.5040650.523364
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181train2.0 - 3.0...-4.0 - -1.3Category ACategory CCategory B0.4915970.4879520.4910000.4556960.4714640.562290
\n", - "

5 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", - "\n", - " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", - "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", - "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", - "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", - "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", - "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", - "\n", - " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 Category C Category B Category A \n", - "1 Category C Category C Category B \n", - "2 Category B Category B Category C \n", - "3 Category C Category B Category C \n", - "4 Category A Category C Category B \n", - "\n", - " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 0.504274 0.495885 0.514872 0.467391 \n", - "1 0.504274 0.487952 0.491000 0.474048 \n", - "2 0.473367 0.495885 0.465366 0.490260 \n", - "3 0.504274 0.495885 0.465366 0.475410 \n", - "4 0.491597 0.487952 0.491000 0.455696 \n", - "\n", - " cont_var3_enc cont_var1_enc \n", - "0 0.486891 0.523364 \n", - "1 0.524355 0.492997 \n", - "2 0.494297 0.433225 \n", - "3 0.504065 0.523364 \n", - "4 0.471464 0.562290 \n", - "\n", - "[5 rows x 21 columns]" - ] - }, - "execution_count": 524, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "basetable_transformed_orig = preprocessor.transform(basetable,\n", - " continuous_vars=continuous_vars,\n", - " discrete_vars=discrete_vars)\n", - "basetable_transformed_orig.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 525, - "id": "d70f40cc-7814-48a8-91f6-2b7297f97ccc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#preprocessor._discretizer #._bins_by_column\n", - "#preprocessor._target_encoder.attributes_to_dict()\n", - "#preprocessor._discretizer.attributes_to_dict()\n", - "#preprocessor._target_encoder.attributes_to_dict()" - ] - }, - { - "cell_type": "markdown", - "id": "baab4c1b-4200-4c96-b991-be8efc09abbb", - "metadata": {}, - "source": [ - "### 3. Serialize the preprocessor" - ] - }, - { - "cell_type": "code", - "execution_count": 526, - "id": "95b597b2-b475-4d59-b650-dcc208db1eb5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "pipeline_serialized = preprocessor.serialize_pipeline()\n", - "\n", - "with open(r\"./model_json.json\", \"w\") as file:\n", - " file.write(json.dumps(pipeline_serialized, indent=4))\n", - " \n", - "#pipeline_serialized" - ] - }, - { - "cell_type": "code", - "execution_count": 527, - "id": "c6dbd38c-ca5d-492d-815b-1af02d7de143", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Look into properties of preprocessors\n", - "#pipeline_serialized[\"target_encoder\"] #._bins_by_column" - ] - }, - { - "cell_type": "markdown", - "id": "fc339ac8-67a7-4574-811e-2b9bc4ce6a39", - "metadata": {}, - "source": [ - "### 4. 
De-serialize pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 528, - "id": "2a517ff8-d336-4bd3-abdc-2be784259564", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.\n" - ] - } - ], - "source": [ - "# Read serialized pipeline from json\n", - "with open(r\"./model_json.json\", \"r\") as file:\n", - " json_pipeline_serialized = json.load(file)\n", - "\n", - "# Create new preprocessor object from serialized pipeline\n", - "new_preprocessor = PreProcessor.from_pipeline(json_pipeline_serialized)\n", - "#new_preprocessor = PreProcessor.from_pipeline(pipeline_serialized)" - ] - }, - { - "cell_type": "code", - "execution_count": 529, - "id": "ad9442b5-7f7e-48fe-8199-528992d1f0d6", - "metadata": {}, - "outputs": [], - "source": [ - "# Look into properties of preprocessors if needed\n", - "#new_preprocessor._discretizer.attributes_to_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 530, - "id": "541986d2-8d5d-473c-8871-5e7d2da31c4a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 147.15it/s]\n", - "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 661.65it/s]\n", - "Transforming data took 0.06773138046264648 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590selection4.0 - 5.0...-1.3 - -0.8Category CCategory BCategory A0.5042740.4958850.5148720.4673910.4868910.523364
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241train9.0 - 10.0...0.2 - 0.5Category CCategory CCategory B0.5042740.4879520.4910000.4740480.5243550.492997
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911train7.0 - 8.0...-0.5 - -0.2Category BCategory BCategory C0.4733670.4958850.4653660.4902600.4942970.433225
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680selection0.0 - 1.0...-1.3 - -0.8Category CCategory BCategory C0.5042740.4958850.4653660.4754100.5040650.523364
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181train2.0 - 3.0...-4.0 - -1.3Category ACategory CCategory B0.4915970.4879520.4910000.4556960.4714640.562290
\n", - "

5 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", - "\n", - " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", - "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", - "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", - "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", - "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", - "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", - "\n", - " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 Category C Category B Category A \n", - "1 Category C Category C Category B \n", - "2 Category B Category B Category C \n", - "3 Category C Category B Category C \n", - "4 Category A Category C Category B \n", - "\n", - " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 0.504274 0.495885 0.514872 0.467391 \n", - "1 0.504274 0.487952 0.491000 0.474048 \n", - "2 0.473367 0.495885 0.465366 0.490260 \n", - "3 0.504274 0.495885 0.465366 0.475410 \n", - "4 0.491597 0.487952 0.491000 0.455696 \n", - "\n", - " cont_var3_enc cont_var1_enc \n", - "0 0.486891 0.523364 \n", - "1 0.524355 0.492997 \n", - "2 0.494297 0.433225 \n", - "3 0.504065 0.523364 \n", - "4 0.471464 0.562290 \n", - "\n", - "[5 rows x 21 columns]" - ] - }, - "execution_count": 530, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "basetable_transformed = new_preprocessor.transform(basetable,\n", - " continuous_vars=continuous_vars,\n", - " discrete_vars=discrete_vars)\n", - "basetable_transformed.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 531, - "id": "c270d856-452d-4507-a3c2-df3ae1991c36", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
0TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
1TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
2TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
3TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
..................................................................
4995TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4996TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4997TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4998TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4999TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
\n", - "

5000 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 \\\n", - "0 True True True True True \n", - "1 True True True True True \n", - "2 True True True True True \n", - "3 True True True True True \n", - "4 True True True True True \n", - "... ... ... ... ... ... \n", - "4995 True True True True True \n", - "4996 True True True True True \n", - "4997 True True True True True \n", - "4998 True True True True True \n", - "4999 True True True True True \n", - "\n", - " cont_var2 cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", - "0 True True True True True ... True \n", - "1 True True True True True ... True \n", - "2 True True True True True ... True \n", - "3 True True True True True ... True \n", - "4 True True True True True ... True \n", - "... ... ... ... ... ... ... ... \n", - "4995 True True True True True ... True \n", - "4996 True True True True True ... True \n", - "4997 True True True True True ... True \n", - "4998 True True True True True ... True \n", - "4999 True True True True True ... True \n", - "\n", - " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 True True True \n", - "1 True True True \n", - "2 True True True \n", - "3 True True True \n", - "4 True True True \n", - "... ... ... ... \n", - "4995 True True True \n", - "4996 True True True \n", - "4997 True True True \n", - "4998 True True True \n", - "4999 True True True \n", - "\n", - " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 True True True True \n", - "1 True True True True \n", - "2 True True True True \n", - "3 True True True True \n", - "4 True True True True \n", - "... ... ... ... ... \n", - "4995 True True True True \n", - "4996 True True True True \n", - "4997 True True True True \n", - "4998 True True True True \n", - "4999 True True True True \n", - "\n", - " cont_var3_enc cont_var1_enc \n", - "0 True True \n", - "1 True True \n", - "2 True True \n", - "3 True True \n", - "4 True True \n", - "... ... ... 
\n", - "4995 True True \n", - "4996 True True \n", - "4997 True True \n", - "4998 True True \n", - "4999 True True \n", - "\n", - "[5000 rows x 21 columns]" - ] - }, - "execution_count": 531, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Double check transformed basetable is the same\n", - "basetable_transformed_orig == basetable_transformed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b478d7c-46d8-4ba9-bf84-375a7cf901a8", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "cobra_venv", - "language": "python", - "name": "cobra_venv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 82b378f8eff34d7b19b8c39e28d0806564ecad05 Mon Sep 17 00:00:00 2001 From: joostneuj <91886694+joostneuj@users.noreply.github.com> Date: Fri, 16 Jun 2023 18:15:58 +0200 Subject: [PATCH 3/4] #143 delete file --- notebooks/model_json.json | 216 -------------------------------------- 1 file changed, 216 deletions(-) delete mode 100644 notebooks/model_json.json diff --git a/notebooks/model_json.json b/notebooks/model_json.json deleted file mode 100644 index fd80281..0000000 --- a/notebooks/model_json.json +++ /dev/null @@ -1,216 +0,0 @@ -{ - "metadata": { - "timestamp": "16/06/2023 18:00:26" - }, - "categorical_data_processor": { - "category_size_threshold": 5, - "forced_categories": {}, - "keep_missing": true, - "model_type": "classification", - "p_value_threshold": 0.001, - "regroup": true, - "regroup_name": "Other", - "scale_contingency_table": true, - "_cleaned_categories_by_column": { - "CategoryVar1": [], - "CategoryVar2": [], - "CategoryVar3": [] - } - }, - "discretizer": { - "auto_adapt_bins": false, - "change_endpoint_format": false, - "closed": "right", - "label_format": "{} - {}", - "n_bins": 10, - "starting_precision": 0, - "strategy": "quantile", - "_bins_by_column": { - "cont_var2": [ - [ - 0.0, - 1.0 - ], - [ - 1.0, - 2.0 - ], - [ - 2.0, - 3.0 - ], - [ - 3.0, - 4.0 - ], - [ - 4.0, - 5.0 - ], - [ - 5.0, - 6.0 - ], - [ - 6.0, - 7.0 - ], - [ - 7.0, - 8.0 - ], - [ - 8.0, - 9.0 - ], - [ - 9.0, - 10.0 - ] - ], - "cont_var3": [ - [ - 0.0, - 0.1 - ], - [ - 0.1, - 0.2 - ], - [ - 0.2, - 0.4 - ], - [ - 0.4, - 0.5 - ], - [ - 0.5, - 0.7 - ], - [ - 0.7, - 0.9 - ], - [ - 0.9, - 1.3 - ], - [ - 1.3, - 1.7 - ], - [ - 1.7, - 2.4 - ], - [ - 2.4, - 7.6 - ] - ], - "cont_var1": [ - [ - -4.0, - -1.3 - ], - [ - -1.3, - -0.8 - ], - [ - -0.8, - -0.5 - ], - [ - -0.5, - -0.2 - ], - [ - -0.2, - 0.0 - ], - [ - 0.0, - 0.2 - ], - [ - 0.2, - 0.5 - ], - [ - 0.5, - 0.8 - ], - [ - 0.8, - 1.2 - ], - [ - 1.2, - 3.7 - ] - ] - } - }, - "target_encoder": { - "imputation_strategy": "mean", - "weight": 0.0, - "_mapping": { - "CategoryVar1_processed": { - "Category A": 0.49159663865546216, - "Category B": 0.4733668341708543, - "Category C": 0.5042735042735043 - }, - "CategoryVar2_processed": { - "Category A": 0.48643410852713176, - "Category B": 0.49588477366255146, - "Category C": 0.4879518072289157 - }, - "CategoryVar3_processed": { - "Category A": 0.5148717948717949, - "Category B": 0.491, - "Category C": 0.4653658536585366 - }, - "cont_var2_bin": { - "0.0 - 1.0": 0.47540983606557374, - "1.0 - 2.0": 0.46855345911949686, - "2.0 - 3.0": 0.45569620253164556, - "3.0 - 
4.0": 0.5133333333333333, - "4.0 - 5.0": 0.4673913043478261, - "5.0 - 6.0": 0.5307443365695793, - "6.0 - 7.0": 0.5232974910394266, - "7.0 - 8.0": 0.4902597402597403, - "8.0 - 9.0": 0.5033333333333333, - "9.0 - 10.0": 0.4740484429065744 - }, - "cont_var3_bin": { - "0.0 - 0.1": 0.49429657794676807, - "0.1 - 0.2": 0.5040650406504065, - "0.2 - 0.4": 0.4897025171624714, - "0.4 - 0.5": 0.5, - "0.5 - 0.7": 0.5243553008595988, - "0.7 - 0.9": 0.4703703703703704, - "0.9 - 1.3": 0.47146401985111663, - "1.3 - 1.7": 0.4868913857677903, - "1.7 - 2.4": 0.43416370106761565, - "2.4 - 7.6": 0.5258064516129032 - }, - "cont_var1_bin": { - "-4.0 - -1.3": 0.5622895622895623, - "-1.3 - -0.8": 0.5233644859813084, - "-0.8 - -0.5": 0.4358974358974359, - "-0.5 - -0.2": 0.43322475570032576, - "-0.2 - 0.0": 0.5219123505976095, - "0.0 - 0.2": 0.4763779527559055, - "0.2 - 0.5": 0.49299719887955185, - "0.5 - 0.8": 0.5054545454545455, - "0.8 - 1.2": 0.4539249146757679, - "1.2 - 3.7": 0.4984984984984985 - } - }, - "_global_mean": 0.49 - }, - "_is_fitted": true -} \ No newline at end of file From fcdc5f3e0b022e09f5849cb5f146687ba0ef8cf0 Mon Sep 17 00:00:00 2001 From: "joost.neujens" Date: Fri, 16 Jun 2023 18:41:46 +0200 Subject: [PATCH 4/4] #143 fix: serialization-deserialization bug --- .gitignore | 1 - cobra/preprocessing/preprocessor.py | 5 - notebooks/debugging.ipynb | 530 ++++++++++++++-------------- notebooks/model_json.json | 120 +++---- 4 files changed, 325 insertions(+), 331 deletions(-) diff --git a/.gitignore b/.gitignore index 14c9262..6aa9052 100644 --- a/.gitignore +++ b/.gitignore @@ -72,7 +72,6 @@ target/ # Jupyter Notebook .ipynb_checkpoints -#*notebooks/* # pyenv .python-version diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 7f84716..fa7ddf1 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -367,10 +367,6 @@ def fit( log.info("Fitting pipeline took {} seconds".format(time.time() - start)) - def test_function(self): - return print('heleeeloooo') - - def transform( self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list ) -> pd.DataFrame: @@ -425,7 +421,6 @@ def transform( return data - def fit_transform( self, train_data: pd.DataFrame, diff --git a/notebooks/debugging.ipynb b/notebooks/debugging.ipynb index 5dd573e..f420671 100644 --- a/notebooks/debugging.ipynb +++ b/notebooks/debugging.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 464, + "execution_count": 532, "id": "23482fd8-b4c1-48f5-8c30-a0e79f7667b3", "metadata": { "tags": [] @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 465, + "execution_count": 533, "id": "da551dc3-ffba-45e0-b87d-7b626a622b08", "metadata": { "tags": [] @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 488, + "execution_count": 534, "id": "7d2678fa-eb47-4cb5-ad1d-c5034a742f55", "metadata": { "tags": [] @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 467, + "execution_count": 535, "id": "a9563643-308b-4c6c-b358-9cbf93a0666d", "metadata": { "tags": [] @@ -102,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 468, + "execution_count": 536, "id": "bde9235f-dc62-433d-b3d3-6bf37b2ddb52", "metadata": { "tags": [] @@ -122,7 +122,7 @@ "dtype: object" ] }, - "execution_count": 468, + "execution_count": 536, "metadata": {}, "output_type": "execute_result" } @@ -133,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 469, + "execution_count": 537, "id": "d774e959-73f4-40b4-bc20-43c3af99e593", 
"metadata": { "tags": [] @@ -174,34 +174,34 @@ " \n", " 0\n", " 2022-01-01\n", - " Category C\n", " Category B\n", - " Category A\n", - " -1.001645\n", - " 4.733706\n", - " 1.372659\n", - " 0\n", + " Category B\n", + " Category C\n", + " -0.247175\n", + " 8.258259\n", + " 0.039901\n", + " 1\n", " \n", " \n", " 1\n", " 2022-01-02\n", - " Category C\n", - " Category C\n", " Category B\n", - " 0.280629\n", - " 9.191129\n", - " 0.635924\n", + " Category B\n", + " Category C\n", + " 0.247006\n", + " 1.234493\n", + " 1.336691\n", " 1\n", " \n", " \n", " 2\n", " 2022-01-03\n", - " Category B\n", - " Category B\n", " Category C\n", - " -0.345219\n", - " 7.731792\n", - " 0.098091\n", + " Category A\n", + " Category B\n", + " 0.076415\n", + " 5.059058\n", + " 1.323273\n", " 1\n", " \n", " \n", @@ -209,21 +209,21 @@ " 2022-01-04\n", " Category C\n", " Category B\n", - " Category C\n", - " -1.134912\n", - " 0.205132\n", - " 0.179868\n", + " Category A\n", + " -0.306355\n", + " 8.316857\n", + " 0.077718\n", " 0\n", " \n", " \n", " 4\n", " 2022-01-05\n", - " Category A\n", + " Category C\n", " Category C\n", " Category B\n", - " -1.339645\n", - " 2.378540\n", - " 0.966818\n", + " -1.133514\n", + " 8.773722\n", + " 0.356009\n", " 1\n", " \n", " \n", @@ -232,21 +232,21 @@ ], "text/plain": [ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n", + "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n", + "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n", + "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n", + "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n", "\n", " cont_var3 target \n", - "0 1.372659 0 \n", - "1 0.635924 1 \n", - "2 0.098091 1 \n", - "3 0.179868 0 \n", - "4 0.966818 1 " + "0 0.039901 1 \n", + "1 1.336691 1 \n", + "2 1.323273 1 \n", + "3 0.077718 0 \n", + "4 0.356009 1 " ] }, - "execution_count": 469, + "execution_count": 537, "metadata": {}, "output_type": "execute_result" } @@ -257,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 470, + "execution_count": 538, "id": "e9c06e3a-188f-4cdc-b9cd-51d3db63e5ff", "metadata": { "tags": [] @@ -271,7 +271,7 @@ " dtype='object')" ] }, - "execution_count": 470, + "execution_count": 538, "metadata": {}, "output_type": "execute_result" } @@ -290,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 521, + "execution_count": 539, "id": "a32560d4-b5fe-4b90-9ea6-ede7915bba05", "metadata": { "tags": [] @@ -304,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 522, + "execution_count": 540, "id": "d6f1e21a-4a6e-4ad7-9faf-b36e6daff707", "metadata": { "tags": [] @@ -335,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 523, + "execution_count": 541, "id": "7b673619-4eda-4aca-acd5-a125f80d3b20", "metadata": { "tags": [] @@ -346,14 +346,14 @@ "output_type": "stream", "text": [ "Starting to fit pipeline\n", - "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 507.38it/s]\n", - "Fitting KBinsDiscretizer took 0.006914615631103516 seconds\n", - 
"Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 240.62it/s]\n", - "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 29.42it/s]\n", - "Fitting categorical_data_processor class took 0.10196375846862793 seconds\n", - "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 558.52it/s]\n", - "Fitting TargetEncoder took 0.013732433319091797 seconds\n", - "Fitting pipeline took 0.17300176620483398 seconds\n" + "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 251.21it/s]\n", + "Fitting KBinsDiscretizer took 0.012943267822265625 seconds\n", + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 193.52it/s]\n", + "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 26.85it/s]\n", + "Fitting categorical_data_processor class took 0.11171197891235352 seconds\n", + "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 564.66it/s]\n", + "Fitting TargetEncoder took 0.015709400177001953 seconds\n", + "Fitting pipeline took 0.1843581199645996 seconds\n" ] } ], @@ -366,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": 524, + "execution_count": 542, "id": "c9e2c79d-c0bc-464d-b869-f8115ac67776", "metadata": {}, "outputs": [ @@ -374,9 +374,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 160.70it/s]\n", - "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 697.13it/s]\n", - "Transforming data took 0.0610198974609375 seconds\n" + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 130.81it/s]\n", + "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 517.58it/s]\n", + "Transforming data took 0.06473207473754883 seconds\n" ] }, { @@ -427,122 +427,122 @@ " \n", " 0\n", " 2022-01-01\n", - " Category C\n", " Category B\n", - " Category A\n", - " -1.001645\n", - " 4.733706\n", - " 1.372659\n", - " 0\n", + " Category B\n", + " Category C\n", + " -0.247175\n", + " 8.258259\n", + " 0.039901\n", + " 1\n", " selection\n", - " 4.0 - 5.0\n", + " 8.0 - 9.0\n", " ...\n", - " -1.3 - -0.8\n", - " Category C\n", + " -0.3 - 0.0\n", " Category B\n", - " Category A\n", - " 0.504274\n", - " 0.495885\n", - " 0.514872\n", - " 0.467391\n", - " 0.486891\n", - " 0.523364\n", + " Category B\n", + " Category C\n", + " 0.505584\n", + " 0.530256\n", + " 0.517730\n", + " 0.516447\n", + " 0.514851\n", + " 0.494083\n", " \n", " \n", " 1\n", " 2022-01-02\n", - " Category C\n", - " Category C\n", " Category B\n", - " 0.280629\n", - " 9.191129\n", - " 0.635924\n", + " Category B\n", + " Category C\n", + " 0.247006\n", + " 1.234493\n", + " 1.336691\n", " 1\n", " train\n", - " 9.0 - 10.0\n", + " 1.0 - 2.0\n", " ...\n", - " 0.2 - 0.5\n", - " Category C\n", - " Category C\n", + " 0.0 - 0.3\n", " Category B\n", - " 0.504274\n", - " 0.487952\n", - " 0.491000\n", - " 0.474048\n", - " 0.524355\n", - " 0.492997\n", + " Category B\n", + " Category C\n", + " 0.505584\n", + " 0.530256\n", + " 0.517730\n", + " 0.521311\n", + " 0.517986\n", + " 0.529086\n", " \n", " \n", " 
2\n", " 2022-01-03\n", - " Category B\n", - " Category B\n", " Category C\n", - " -0.345219\n", - " 7.731792\n", - " 0.098091\n", + " Category A\n", + " Category B\n", + " 0.076415\n", + " 5.059058\n", + " 1.323273\n", " 1\n", " train\n", - " 7.0 - 8.0\n", + " 5.0 - 6.0\n", " ...\n", - " -0.5 - -0.2\n", - " Category B\n", - " Category B\n", + " 0.0 - 0.3\n", " Category C\n", - " 0.473367\n", - " 0.495885\n", - " 0.465366\n", - " 0.490260\n", - " 0.494297\n", - " 0.433225\n", + " Category A\n", + " Category B\n", + " 0.494939\n", + " 0.461386\n", + " 0.487052\n", + " 0.510903\n", + " 0.517986\n", + " 0.529086\n", " \n", " \n", " 3\n", " 2022-01-04\n", " Category C\n", " Category B\n", - " Category C\n", - " -1.134912\n", - " 0.205132\n", - " 0.179868\n", + " Category A\n", + " -0.306355\n", + " 8.316857\n", + " 0.077718\n", " 0\n", " selection\n", - " 0.0 - 1.0\n", + " 8.0 - 9.0\n", " ...\n", - " -1.3 - -0.8\n", + " -0.5 - -0.3\n", " Category C\n", " Category B\n", - " Category C\n", - " 0.504274\n", - " 0.495885\n", - " 0.465366\n", - " 0.475410\n", - " 0.504065\n", - " 0.523364\n", + " Category A\n", + " 0.494939\n", + " 0.530256\n", + " 0.488603\n", + " 0.516447\n", + " 0.514851\n", + " 0.534884\n", " \n", " \n", " 4\n", " 2022-01-05\n", - " Category A\n", + " Category C\n", " Category C\n", " Category B\n", - " -1.339645\n", - " 2.378540\n", - " 0.966818\n", + " -1.133514\n", + " 8.773722\n", + " 0.356009\n", " 1\n", " train\n", - " 2.0 - 3.0\n", + " 8.0 - 9.0\n", " ...\n", - " -4.0 - -1.3\n", - " Category A\n", + " -1.3 - -0.8\n", + " Category C\n", " Category C\n", " Category B\n", - " 0.491597\n", - " 0.487952\n", - " 0.491000\n", - " 0.455696\n", - " 0.471464\n", - " 0.562290\n", + " 0.494939\n", + " 0.502463\n", + " 0.487052\n", + " 0.516447\n", + " 0.484634\n", + " 0.461078\n", " \n", " \n", "\n", @@ -551,44 +551,44 @@ ], "text/plain": [ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n", + "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n", + "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n", + "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n", + "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n", "\n", " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", - "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", - "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", - "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", - "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", - "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", + "0 0.039901 1 selection 8.0 - 9.0 ... -0.3 - 0.0 \n", + "1 1.336691 1 train 1.0 - 2.0 ... 0.0 - 0.3 \n", + "2 1.323273 1 train 5.0 - 6.0 ... 0.0 - 0.3 \n", + "3 0.077718 0 selection 8.0 - 9.0 ... -0.5 - -0.3 \n", + "4 0.356009 1 train 8.0 - 9.0 ... 
-1.3 - -0.8 \n", "\n", " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 Category C Category B Category A \n", - "1 Category C Category C Category B \n", - "2 Category B Category B Category C \n", - "3 Category C Category B Category C \n", - "4 Category A Category C Category B \n", + "0 Category B Category B Category C \n", + "1 Category B Category B Category C \n", + "2 Category C Category A Category B \n", + "3 Category C Category B Category A \n", + "4 Category C Category C Category B \n", "\n", " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 0.504274 0.495885 0.514872 0.467391 \n", - "1 0.504274 0.487952 0.491000 0.474048 \n", - "2 0.473367 0.495885 0.465366 0.490260 \n", - "3 0.504274 0.495885 0.465366 0.475410 \n", - "4 0.491597 0.487952 0.491000 0.455696 \n", + "0 0.505584 0.530256 0.517730 0.516447 \n", + "1 0.505584 0.530256 0.517730 0.521311 \n", + "2 0.494939 0.461386 0.487052 0.510903 \n", + "3 0.494939 0.530256 0.488603 0.516447 \n", + "4 0.494939 0.502463 0.487052 0.516447 \n", "\n", " cont_var3_enc cont_var1_enc \n", - "0 0.486891 0.523364 \n", - "1 0.524355 0.492997 \n", - "2 0.494297 0.433225 \n", - "3 0.504065 0.523364 \n", - "4 0.471464 0.562290 \n", + "0 0.514851 0.494083 \n", + "1 0.517986 0.529086 \n", + "2 0.517986 0.529086 \n", + "3 0.514851 0.534884 \n", + "4 0.484634 0.461078 \n", "\n", "[5 rows x 21 columns]" ] }, - "execution_count": 524, + "execution_count": 542, "metadata": {}, "output_type": "execute_result" } @@ -602,7 +602,7 @@ }, { "cell_type": "code", - "execution_count": 525, + "execution_count": 543, "id": "d70f40cc-7814-48a8-91f6-2b7297f97ccc", "metadata": { "tags": [] @@ -625,7 +625,7 @@ }, { "cell_type": "code", - "execution_count": 526, + "execution_count": 544, "id": "95b597b2-b475-4d59-b650-dcc208db1eb5", "metadata": { "tags": [] @@ -642,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 527, + "execution_count": 545, "id": "c6dbd38c-ca5d-492d-815b-1af02d7de143", "metadata": { "tags": [] @@ -663,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 528, + "execution_count": 547, "id": "2a517ff8-d336-4bd3-abdc-2be784259564", "metadata": {}, "outputs": [ @@ -698,7 +698,7 @@ }, { "cell_type": "code", - "execution_count": 530, + "execution_count": 548, "id": "541986d2-8d5d-473c-8871-5e7d2da31c4a", "metadata": {}, "outputs": [ @@ -706,9 +706,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 147.15it/s]\n", - "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 661.65it/s]\n", - "Transforming data took 0.06773138046264648 seconds\n" + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 119.08it/s]\n", + "Applying target encoding...: 100%|█████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1009.14it/s]\n", + "Transforming data took 0.06331968307495117 seconds\n" ] }, { @@ -759,122 +759,122 @@ " \n", " 0\n", " 2022-01-01\n", - " Category C\n", " Category B\n", - " Category A\n", - " -1.001645\n", - " 4.733706\n", - " 1.372659\n", - " 0\n", + " Category B\n", + " Category C\n", + " -0.247175\n", + " 8.258259\n", + " 0.039901\n", + " 1\n", " selection\n", - " 4.0 - 5.0\n", + " 8.0 - 9.0\n", " ...\n", - " -1.3 - -0.8\n", - " Category C\n", + " -0.3 - 0.0\n", " Category B\n", - " Category A\n", - " 0.504274\n", - " 0.495885\n", - " 
0.514872\n", - " 0.467391\n", - " 0.486891\n", - " 0.523364\n", + " Category B\n", + " Category C\n", + " 0.505584\n", + " 0.530256\n", + " 0.517730\n", + " 0.516447\n", + " 0.514851\n", + " 0.494083\n", " \n", " \n", " 1\n", " 2022-01-02\n", - " Category C\n", - " Category C\n", " Category B\n", - " 0.280629\n", - " 9.191129\n", - " 0.635924\n", + " Category B\n", + " Category C\n", + " 0.247006\n", + " 1.234493\n", + " 1.336691\n", " 1\n", " train\n", - " 9.0 - 10.0\n", + " 1.0 - 2.0\n", " ...\n", - " 0.2 - 0.5\n", - " Category C\n", - " Category C\n", + " 0.0 - 0.3\n", " Category B\n", - " 0.504274\n", - " 0.487952\n", - " 0.491000\n", - " 0.474048\n", - " 0.524355\n", - " 0.492997\n", + " Category B\n", + " Category C\n", + " 0.505584\n", + " 0.530256\n", + " 0.517730\n", + " 0.521311\n", + " 0.517986\n", + " 0.529086\n", " \n", " \n", " 2\n", " 2022-01-03\n", - " Category B\n", - " Category B\n", " Category C\n", - " -0.345219\n", - " 7.731792\n", - " 0.098091\n", + " Category A\n", + " Category B\n", + " 0.076415\n", + " 5.059058\n", + " 1.323273\n", " 1\n", " train\n", - " 7.0 - 8.0\n", + " 5.0 - 6.0\n", " ...\n", - " -0.5 - -0.2\n", - " Category B\n", - " Category B\n", + " 0.0 - 0.3\n", " Category C\n", - " 0.473367\n", - " 0.495885\n", - " 0.465366\n", - " 0.490260\n", - " 0.494297\n", - " 0.433225\n", + " Category A\n", + " Category B\n", + " 0.494939\n", + " 0.461386\n", + " 0.487052\n", + " 0.510903\n", + " 0.517986\n", + " 0.529086\n", " \n", " \n", " 3\n", " 2022-01-04\n", " Category C\n", " Category B\n", - " Category C\n", - " -1.134912\n", - " 0.205132\n", - " 0.179868\n", + " Category A\n", + " -0.306355\n", + " 8.316857\n", + " 0.077718\n", " 0\n", " selection\n", - " 0.0 - 1.0\n", + " 8.0 - 9.0\n", " ...\n", - " -1.3 - -0.8\n", + " -0.5 - -0.3\n", " Category C\n", " Category B\n", - " Category C\n", - " 0.504274\n", - " 0.495885\n", - " 0.465366\n", - " 0.475410\n", - " 0.504065\n", - " 0.523364\n", + " Category A\n", + " 0.494939\n", + " 0.530256\n", + " 0.488603\n", + " 0.516447\n", + " 0.514851\n", + " 0.534884\n", " \n", " \n", " 4\n", " 2022-01-05\n", - " Category A\n", + " Category C\n", " Category C\n", " Category B\n", - " -1.339645\n", - " 2.378540\n", - " 0.966818\n", + " -1.133514\n", + " 8.773722\n", + " 0.356009\n", " 1\n", " train\n", - " 2.0 - 3.0\n", + " 8.0 - 9.0\n", " ...\n", - " -4.0 - -1.3\n", - " Category A\n", + " -1.3 - -0.8\n", + " Category C\n", " Category C\n", " Category B\n", - " 0.491597\n", - " 0.487952\n", - " 0.491000\n", - " 0.455696\n", - " 0.471464\n", - " 0.562290\n", + " 0.494939\n", + " 0.502463\n", + " 0.487052\n", + " 0.516447\n", + " 0.484634\n", + " 0.461078\n", " \n", " \n", "\n", @@ -883,44 +883,44 @@ ], "text/plain": [ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n", + "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n", + "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n", + "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n", + "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n", "\n", " cont_var3 
target split cont_var2_bin ... cont_var1_bin \\\n", - "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", - "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", - "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", - "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", - "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", + "0 0.039901 1 selection 8.0 - 9.0 ... -0.3 - 0.0 \n", + "1 1.336691 1 train 1.0 - 2.0 ... 0.0 - 0.3 \n", + "2 1.323273 1 train 5.0 - 6.0 ... 0.0 - 0.3 \n", + "3 0.077718 0 selection 8.0 - 9.0 ... -0.5 - -0.3 \n", + "4 0.356009 1 train 8.0 - 9.0 ... -1.3 - -0.8 \n", "\n", " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 Category C Category B Category A \n", - "1 Category C Category C Category B \n", - "2 Category B Category B Category C \n", - "3 Category C Category B Category C \n", - "4 Category A Category C Category B \n", + "0 Category B Category B Category C \n", + "1 Category B Category B Category C \n", + "2 Category C Category A Category B \n", + "3 Category C Category B Category A \n", + "4 Category C Category C Category B \n", "\n", " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 0.504274 0.495885 0.514872 0.467391 \n", - "1 0.504274 0.487952 0.491000 0.474048 \n", - "2 0.473367 0.495885 0.465366 0.490260 \n", - "3 0.504274 0.495885 0.465366 0.475410 \n", - "4 0.491597 0.487952 0.491000 0.455696 \n", + "0 0.505584 0.530256 0.517730 0.516447 \n", + "1 0.505584 0.530256 0.517730 0.521311 \n", + "2 0.494939 0.461386 0.487052 0.510903 \n", + "3 0.494939 0.530256 0.488603 0.516447 \n", + "4 0.494939 0.502463 0.487052 0.516447 \n", "\n", " cont_var3_enc cont_var1_enc \n", - "0 0.486891 0.523364 \n", - "1 0.524355 0.492997 \n", - "2 0.494297 0.433225 \n", - "3 0.504065 0.523364 \n", - "4 0.471464 0.562290 \n", + "0 0.514851 0.494083 \n", + "1 0.517986 0.529086 \n", + "2 0.517986 0.529086 \n", + "3 0.514851 0.534884 \n", + "4 0.484634 0.461078 \n", "\n", "[5 rows x 21 columns]" ] }, - "execution_count": 530, + "execution_count": 548, "metadata": {}, "output_type": "execute_result" } @@ -934,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": 531, + "execution_count": 549, "id": "c270d856-452d-4507-a3c2-df3ae1991c36", "metadata": {}, "outputs": [ @@ -1321,7 +1321,7 @@ "[5000 rows x 21 columns]" ] }, - "execution_count": 531, + "execution_count": 549, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/model_json.json b/notebooks/model_json.json index fd80281..0670084 100644 --- a/notebooks/model_json.json +++ b/notebooks/model_json.json @@ -1,6 +1,6 @@ { "metadata": { - "timestamp": "16/06/2023 18:00:26" + "timestamp": "16/06/2023 18:39:39" }, "categorical_data_processor": { "category_size_threshold": 5, @@ -79,10 +79,10 @@ ], [ 0.2, - 0.4 + 0.3 ], [ - 0.4, + 0.3, 0.5 ], [ @@ -95,24 +95,24 @@ ], [ 0.9, - 1.3 + 1.2 ], [ - 1.3, - 1.7 + 1.2, + 1.6 ], [ - 1.7, - 2.4 + 1.6, + 2.2 ], [ - 2.4, - 7.6 + 2.2, + 7.3 ] ], "cont_var1": [ [ - -4.0, + -3.1, -1.3 ], [ @@ -125,31 +125,31 @@ ], [ -0.5, - -0.2 + -0.3 ], [ - -0.2, + -0.3, 0.0 ], [ 0.0, - 0.2 + 0.3 ], [ - 0.2, + 0.3, 0.5 ], [ 0.5, - 0.8 + 0.9 ], [ - 0.8, - 1.2 + 0.9, + 1.3 ], [ - 1.2, - 3.7 + 1.3, + 3.3 ] ] } @@ -159,58 +159,58 @@ "weight": 0.0, "_mapping": { "CategoryVar1_processed": { - "Category A": 0.49159663865546216, - "Category B": 0.4733668341708543, - "Category C": 0.5042735042735043 + "Category A": 0.49269717624148, + "Category B": 0.5055837563451777, + "Category C": 0.4949392712550607 }, "CategoryVar2_processed": { - 
"Category A": 0.48643410852713176, - "Category B": 0.49588477366255146, - "Category C": 0.4879518072289157 + "Category A": 0.4613861386138614, + "Category B": 0.5302564102564102, + "Category C": 0.5024630541871922 }, "CategoryVar3_processed": { - "Category A": 0.5148717948717949, - "Category B": 0.491, - "Category C": 0.4653658536585366 + "Category A": 0.4886025768087215, + "Category B": 0.48705179282868527, + "Category C": 0.5177304964539007 }, "cont_var2_bin": { - "0.0 - 1.0": 0.47540983606557374, - "1.0 - 2.0": 0.46855345911949686, - "2.0 - 3.0": 0.45569620253164556, - "3.0 - 4.0": 0.5133333333333333, - "4.0 - 5.0": 0.4673913043478261, - "5.0 - 6.0": 0.5307443365695793, - "6.0 - 7.0": 0.5232974910394266, - "7.0 - 8.0": 0.4902597402597403, - "8.0 - 9.0": 0.5033333333333333, - "9.0 - 10.0": 0.4740484429065744 + "0.0 - 1.0": 0.5333333333333333, + "1.0 - 2.0": 0.521311475409836, + "2.0 - 3.0": 0.4197952218430034, + "3.0 - 4.0": 0.4781144781144781, + "4.0 - 5.0": 0.4557377049180328, + "5.0 - 6.0": 0.5109034267912772, + "6.0 - 7.0": 0.5408163265306123, + "7.0 - 8.0": 0.5050167224080268, + "8.0 - 9.0": 0.5164473684210527, + "9.0 - 10.0": 0.494949494949495 }, "cont_var3_bin": { - "0.0 - 0.1": 0.49429657794676807, - "0.1 - 0.2": 0.5040650406504065, - "0.2 - 0.4": 0.4897025171624714, - "0.4 - 0.5": 0.5, - "0.5 - 0.7": 0.5243553008595988, - "0.7 - 0.9": 0.4703703703703704, - "0.9 - 1.3": 0.47146401985111663, - "1.3 - 1.7": 0.4868913857677903, - "1.7 - 2.4": 0.43416370106761565, - "2.4 - 7.6": 0.5258064516129032 + "0.0 - 0.1": 0.5148514851485149, + "0.1 - 0.2": 0.4936708860759494, + "0.2 - 0.3": 0.50390625, + "0.3 - 0.5": 0.4846335697399527, + "0.5 - 0.7": 0.47774480712166173, + "0.7 - 0.9": 0.49407114624505927, + "0.9 - 1.2": 0.4773413897280967, + "1.2 - 1.6": 0.5179856115107914, + "1.6 - 2.2": 0.5018050541516246, + "2.2 - 7.3": 0.521311475409836 }, "cont_var1_bin": { - "-4.0 - -1.3": 0.5622895622895623, - "-1.3 - -0.8": 0.5233644859813084, - "-0.8 - -0.5": 0.4358974358974359, - "-0.5 - -0.2": 0.43322475570032576, - "-0.2 - 0.0": 0.5219123505976095, - "0.0 - 0.2": 0.4763779527559055, - "0.2 - 0.5": 0.49299719887955185, - "0.5 - 0.8": 0.5054545454545455, - "0.8 - 1.2": 0.4539249146757679, - "1.2 - 3.7": 0.4984984984984985 + "-3.1 - -1.3": 0.5152542372881356, + "-1.3 - -0.8": 0.46107784431137727, + "-0.8 - -0.5": 0.4899328859060403, + "-0.5 - -0.3": 0.5348837209302325, + "-0.3 - 0.0": 0.4940828402366864, + "0.0 - 0.3": 0.5290858725761773, + "0.3 - 0.5": 0.46396396396396394, + "0.5 - 0.9": 0.4743935309973046, + "0.9 - 1.3": 0.48201438848920863, + "1.3 - 3.3": 0.5381944444444444 } }, - "_global_mean": 0.49 + "_global_mean": 0.49766666666666665 }, "_is_fitted": true } \ No newline at end of file