From 6978bd0e1dfdacae451ee285f0e46356442a2574 Mon Sep 17 00:00:00 2001 From: "joost.neujens" Date: Fri, 16 Jun 2023 18:13:51 +0200 Subject: [PATCH 1/4] #143 fix: serialization-deserialization bug --- .gitignore | 1 + cobra/model_building/__init__.py | 26 +- cobra/preprocessing/preprocessor.py | 5 + cobra/preprocessing/target_encoder.py | 3 +- cobra/utils.py | 48 +- docs/make.bat | 70 +- notebooks/debugging.ipynb | 1364 +++++++++++++++++ notebooks/model_json.json | 216 +++ .../model_building/test_forward_selection.py | 426 ++--- tests/model_building/test_models.py | 516 +++---- .../test_categorical_data_processor.py | 626 ++++---- tests/preprocessing/test_kbins_discretizer.py | 504 +++--- tests/preprocessing/test_preprocessor.py | 796 +++++----- tests/preprocessing/test_target_encoder.py | 684 ++++----- 14 files changed, 3436 insertions(+), 1849 deletions(-) create mode 100644 notebooks/debugging.ipynb create mode 100644 notebooks/model_json.json diff --git a/.gitignore b/.gitignore index 6aa9052..14c9262 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ target/ # Jupyter Notebook .ipynb_checkpoints +#*notebooks/* # pyenv .python-version diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py index 7a646c3..768112c 100644 --- a/cobra/model_building/__init__.py +++ b/cobra/model_building/__init__.py @@ -1,13 +1,13 @@ -from .univariate_selection import compute_univariate_preselection -from .univariate_selection import get_preselected_predictors -from .univariate_selection import compute_correlations - -from .models import LogisticRegressionModel, LinearRegressionModel -from .forward_selection import ForwardFeatureSelection - -__all__ = ['compute_univariate_preselection', - 'get_preselected_predictors', - 'compute_correlations', - 'LogisticRegressionModel', - 'LinearRegressionModel', - 'ForwardFeatureSelection'] +from .univariate_selection import compute_univariate_preselection +from .univariate_selection import get_preselected_predictors +from .univariate_selection import compute_correlations + +from .models import LogisticRegressionModel, LinearRegressionModel +from .forward_selection import ForwardFeatureSelection + +__all__ = ['compute_univariate_preselection', + 'get_preselected_predictors', + 'compute_correlations', + 'LogisticRegressionModel', + 'LinearRegressionModel', + 'ForwardFeatureSelection'] diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index fa7ddf1..7f84716 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -367,6 +367,10 @@ def fit( log.info("Fitting pipeline took {} seconds".format(time.time() - start)) + def test_function(self): + return print('heleeeloooo') + + def transform( self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list ) -> pd.DataFrame: @@ -421,6 +425,7 @@ def transform( return data + def fit_transform( self, train_data: pd.DataFrame, diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py index 3eda39d..f438479 100644 --- a/cobra/preprocessing/target_encoder.py +++ b/cobra/preprocessing/target_encoder.py @@ -5,6 +5,7 @@ from tqdm.auto import tqdm from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError +import numpy as np log = logging.getLogger(__name__) @@ -123,7 +124,7 @@ def set_attributes_from_dict(self, params: dict): params["imputation_strategy"] in self.valid_imputation_strategies): self.imputation_strategy = params["imputation_strategy"] - if "_global_mean" in 
params and type(params["_global_mean"]) == float: + if "_global_mean" in params and isinstance(params["_global_mean"], (np.floating, float)): self._global_mean = params["_global_mean"] _mapping = {} diff --git a/cobra/utils.py b/cobra/utils.py index d901380..daf1156 100644 --- a/cobra/utils.py +++ b/cobra/utils.py @@ -1,24 +1,24 @@ -import logging - -# logger = logging.getLogger(__name__) -# logger.setLevel(logging.INFO) -# logger.addHandler(logging.Handler()) - - -def clean_predictor_name(predictor_name: str) -> str: - """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end - of the predictor name to return a clean version of the predictor - """ - return ( - predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") - ) - - -def log_tutorial() -> None: - logging.info( - """ - Hi, welcome to Cobra! - You can find some tutorials that explain the functioning of cobra on the PythonPredictions GitHub: - https://github.com/PythonPredictions/cobra/tree/master/tutorials - """ - ) +import logging + +# logger = logging.getLogger(__name__) +# logger.setLevel(logging.INFO) +# logger.addHandler(logging.Handler()) + + +def clean_predictor_name(predictor_name: str) -> str: + """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end + of the predictor name to return a clean version of the predictor + """ + return ( + predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "") + ) + + +def log_tutorial() -> None: + logging.info( + """ + Hi, welcome to Cobra! + You can find some tutorials that explain the functioning of cobra on the PythonPredictions GitHub: + https://github.com/PythonPredictions/cobra/tree/master/tutorials + """ + ) diff --git a/docs/make.bat b/docs/make.bat index 6fcf05b..061f32f 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,35 +1,35 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/notebooks/debugging.ipynb b/notebooks/debugging.ipynb new file mode 100644 index 0000000..5dd573e --- /dev/null +++ b/notebooks/debugging.ipynb @@ -0,0 +1,1364 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 464, + "id": "23482fd8-b4c1-48f5-8c30-a0e79f7667b3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%reload_ext autoreload" + ] + }, + { + "cell_type": "code", + "execution_count": 465, + "id": "da551dc3-ffba-45e0-b87d-7b626a622b08", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, r\"C:/projects/cobra\")" + ] + }, + { + "cell_type": "code", + "execution_count": 488, + "id": "7d2678fa-eb47-4cb5-ad1d-c5034a742f55", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "from cobra.preprocessing import PreProcessor\n", + "\n", + "# custom imports\n", + "from cobra.preprocessing import CategoricalDataProcessor\n", + "from cobra.preprocessing import KBinsDiscretizer\n", + "from cobra.preprocessing import TargetEncoder\n", + "import json\n" + ] + }, + { + "cell_type": "markdown", + "id": "d4d341ec-b5c3-4b00-a54f-c5b6565d2631", + "metadata": {}, + "source": [ + "### 1. Generate data" + ] + }, + { + "cell_type": "code", + "execution_count": 467, + "id": "a9563643-308b-4c6c-b358-9cbf93a0666d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "size = 5000\n", + "\n", + "# Create datetime column\n", + "dates = pd.date_range('2022-01-01', periods=size, freq='D')\n", + "\n", + "# Create categorical variables\n", + "category_values = ['Category A', 'Category B', 'Category C']\n", + "cat_var1 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", + "cat_var2 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", + "cat_var3 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", + "\n", + "# Create continuous variables with different scales and distributions\n", + "cont_var1 = pd.Series(np.random.normal(loc=0, scale=1, size=size), name='cont_var1')\n", + "cont_var2 = pd.Series(np.random.uniform(low=0, high=10, size=size), name='cont_var2')\n", + "cont_var3 = pd.Series(np.random.exponential(scale=1, size=size), name='cont_var3')\n", + "\n", + "# Create target variable\n", + "target = pd.Series(np.random.randint(2, size=size))\n", + "\n", + "# Combine into a DataFrame\n", + "df = pd.DataFrame({'DateTime': dates, 'CategoryVar1': cat_var1,\n", + " 'CategoryVar2': cat_var2, 'CategoryVar3': cat_var3,\n", + " 'cont_var1': cont_var1, 'cont_var2': cont_var2, 'cont_var3': cont_var3,\n", + " 'target': target})" + ] + }, + { + "cell_type": "code", + "execution_count": 468, + "id": "bde9235f-dc62-433d-b3d3-6bf37b2ddb52", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DateTime datetime64[ns]\n", + "CategoryVar1 category\n", + "CategoryVar2 category\n", + "CategoryVar3 category\n", + "cont_var1 float64\n", + 
"cont_var2 float64\n", + "cont_var3 float64\n", + "target int32\n", + "dtype: object" + ] + }, + "execution_count": 468, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 469, + "id": "d774e959-73f4-40b4-bc20-43c3af99e593", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3target
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181
\n", + "
" + ], + "text/plain": [ + " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", + "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", + "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", + "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", + "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", + "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "\n", + " cont_var3 target \n", + "0 1.372659 0 \n", + "1 0.635924 1 \n", + "2 0.098091 1 \n", + "3 0.179868 0 \n", + "4 0.966818 1 " + ] + }, + "execution_count": 469, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 470, + "id": "e9c06e3a-188f-4cdc-b9cd-51d3db63e5ff", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['DateTime', 'CategoryVar1', 'CategoryVar2', 'CategoryVar3', 'cont_var1',\n", + " 'cont_var2', 'cont_var3', 'target'],\n", + " dtype='object')" + ] + }, + "execution_count": 470, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "markdown", + "id": "9aae8c98-434b-4c71-abb1-29fa6d143895", + "metadata": {}, + "source": [ + "### 2. Fit preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 521, + "id": "a32560d4-b5fe-4b90-9ea6-ede7915bba05", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "continuous_vars = ['cont_var2', 'cont_var3', 'cont_var1']\n", + "discrete_vars= ['CategoryVar1', 'CategoryVar2', 'CategoryVar3'] #, 'DateTime'] [] \n", + "target_col = \"target\"" + ] + }, + { + "cell_type": "code", + "execution_count": 522, + "id": "d6f1e21a-4a6e-4ad7-9faf-b36e6daff707", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. 
Increase the weight if needed.\n" + ] + } + ], + "source": [ + "model_type = \"classification\"\n", + "\n", + "# using all Cobra's default parameters for preprocessing here\n", + "preprocessor = PreProcessor.from_params(\n", + " model_type=model_type\n", + ")\n", + "\n", + "random.seed(1212)\n", + "basetable = preprocessor.train_selection_validation_split(data=df,\n", + " train_prop=0.6,\n", + " selection_prop=0.25,\n", + " validation_prop=0.15)" + ] + }, + { + "cell_type": "code", + "execution_count": 523, + "id": "7b673619-4eda-4aca-acd5-a125f80d3b20", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting to fit pipeline\n", + "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 507.38it/s]\n", + "Fitting KBinsDiscretizer took 0.006914615631103516 seconds\n", + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 240.62it/s]\n", + "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 29.42it/s]\n", + "Fitting categorical_data_processor class took 0.10196375846862793 seconds\n", + "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 558.52it/s]\n", + "Fitting TargetEncoder took 0.013732433319091797 seconds\n", + "Fitting pipeline took 0.17300176620483398 seconds\n" + ] + } + ], + "source": [ + "preprocessor.fit(basetable[basetable[\"split\"]==\"train\"],\n", + " continuous_vars=continuous_vars,\n", + " discrete_vars = discrete_vars,\n", + " target_column_name=target_col)" + ] + }, + { + "cell_type": "code", + "execution_count": 524, + "id": "c9e2c79d-c0bc-464d-b869-f8115ac67776", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 160.70it/s]\n", + "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 697.13it/s]\n", + "Transforming data took 0.0610198974609375 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590selection4.0 - 5.0...-1.3 - -0.8Category CCategory BCategory A0.5042740.4958850.5148720.4673910.4868910.523364
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241train9.0 - 10.0...0.2 - 0.5Category CCategory CCategory B0.5042740.4879520.4910000.4740480.5243550.492997
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911train7.0 - 8.0...-0.5 - -0.2Category BCategory BCategory C0.4733670.4958850.4653660.4902600.4942970.433225
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680selection0.0 - 1.0...-1.3 - -0.8Category CCategory BCategory C0.5042740.4958850.4653660.4754100.5040650.523364
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181train2.0 - 3.0...-4.0 - -1.3Category ACategory CCategory B0.4915970.4879520.4910000.4556960.4714640.562290
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", + "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", + "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", + "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", + "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", + "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "\n", + " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", + "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", + "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", + "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", + "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", + "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", + "\n", + " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", + "0 Category C Category B Category A \n", + "1 Category C Category C Category B \n", + "2 Category B Category B Category C \n", + "3 Category C Category B Category C \n", + "4 Category A Category C Category B \n", + "\n", + " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", + "0 0.504274 0.495885 0.514872 0.467391 \n", + "1 0.504274 0.487952 0.491000 0.474048 \n", + "2 0.473367 0.495885 0.465366 0.490260 \n", + "3 0.504274 0.495885 0.465366 0.475410 \n", + "4 0.491597 0.487952 0.491000 0.455696 \n", + "\n", + " cont_var3_enc cont_var1_enc \n", + "0 0.486891 0.523364 \n", + "1 0.524355 0.492997 \n", + "2 0.494297 0.433225 \n", + "3 0.504065 0.523364 \n", + "4 0.471464 0.562290 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 524, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "basetable_transformed_orig = preprocessor.transform(basetable,\n", + " continuous_vars=continuous_vars,\n", + " discrete_vars=discrete_vars)\n", + "basetable_transformed_orig.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 525, + "id": "d70f40cc-7814-48a8-91f6-2b7297f97ccc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#preprocessor._discretizer #._bins_by_column\n", + "#preprocessor._target_encoder.attributes_to_dict()\n", + "#preprocessor._discretizer.attributes_to_dict()\n", + "#preprocessor._target_encoder.attributes_to_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "baab4c1b-4200-4c96-b991-be8efc09abbb", + "metadata": {}, + "source": [ + "### 3. Serialize the preprocessor" + ] + }, + { + "cell_type": "code", + "execution_count": 526, + "id": "95b597b2-b475-4d59-b650-dcc208db1eb5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pipeline_serialized = preprocessor.serialize_pipeline()\n", + "\n", + "with open(r\"./model_json.json\", \"w\") as file:\n", + " file.write(json.dumps(pipeline_serialized, indent=4))\n", + " \n", + "#pipeline_serialized" + ] + }, + { + "cell_type": "code", + "execution_count": 527, + "id": "c6dbd38c-ca5d-492d-815b-1af02d7de143", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Look into properties of preprocessors\n", + "#pipeline_serialized[\"target_encoder\"] #._bins_by_column" + ] + }, + { + "cell_type": "markdown", + "id": "fc339ac8-67a7-4574-811e-2b9bc4ce6a39", + "metadata": {}, + "source": [ + "### 4. 
De-serialize pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 528, + "id": "2a517ff8-d336-4bd3-abdc-2be784259564", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.\n" + ] + } + ], + "source": [ + "# Read serialized pipeline from json\n", + "with open(r\"./model_json.json\", \"r\") as file:\n", + " json_pipeline_serialized = json.load(file)\n", + "\n", + "# Create new preprocessor object from serialized pipeline\n", + "new_preprocessor = PreProcessor.from_pipeline(json_pipeline_serialized)\n", + "#new_preprocessor = PreProcessor.from_pipeline(pipeline_serialized)" + ] + }, + { + "cell_type": "code", + "execution_count": 529, + "id": "ad9442b5-7f7e-48fe-8199-528992d1f0d6", + "metadata": {}, + "outputs": [], + "source": [ + "# Look into properties of preprocessors if needed\n", + "#new_preprocessor._discretizer.attributes_to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 530, + "id": "541986d2-8d5d-473c-8871-5e7d2da31c4a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 147.15it/s]\n", + "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 661.65it/s]\n", + "Transforming data took 0.06773138046264648 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590selection4.0 - 5.0...-1.3 - -0.8Category CCategory BCategory A0.5042740.4958850.5148720.4673910.4868910.523364
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241train9.0 - 10.0...0.2 - 0.5Category CCategory CCategory B0.5042740.4879520.4910000.4740480.5243550.492997
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911train7.0 - 8.0...-0.5 - -0.2Category BCategory BCategory C0.4733670.4958850.4653660.4902600.4942970.433225
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680selection0.0 - 1.0...-1.3 - -0.8Category CCategory BCategory C0.5042740.4958850.4653660.4754100.5040650.523364
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181train2.0 - 3.0...-4.0 - -1.3Category ACategory CCategory B0.4915970.4879520.4910000.4556960.4714640.562290
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", + "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", + "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", + "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", + "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", + "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "\n", + " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", + "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", + "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", + "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", + "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", + "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", + "\n", + " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", + "0 Category C Category B Category A \n", + "1 Category C Category C Category B \n", + "2 Category B Category B Category C \n", + "3 Category C Category B Category C \n", + "4 Category A Category C Category B \n", + "\n", + " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", + "0 0.504274 0.495885 0.514872 0.467391 \n", + "1 0.504274 0.487952 0.491000 0.474048 \n", + "2 0.473367 0.495885 0.465366 0.490260 \n", + "3 0.504274 0.495885 0.465366 0.475410 \n", + "4 0.491597 0.487952 0.491000 0.455696 \n", + "\n", + " cont_var3_enc cont_var1_enc \n", + "0 0.486891 0.523364 \n", + "1 0.524355 0.492997 \n", + "2 0.494297 0.433225 \n", + "3 0.504065 0.523364 \n", + "4 0.471464 0.562290 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 530, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "basetable_transformed = new_preprocessor.transform(basetable,\n", + " continuous_vars=continuous_vars,\n", + " discrete_vars=discrete_vars)\n", + "basetable_transformed.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 531, + "id": "c270d856-452d-4507-a3c2-df3ae1991c36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
0TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
1TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
2TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
3TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
..................................................................
4995TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4996TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4997TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4998TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4999TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
\n", + "

5000 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 \\\n", + "0 True True True True True \n", + "1 True True True True True \n", + "2 True True True True True \n", + "3 True True True True True \n", + "4 True True True True True \n", + "... ... ... ... ... ... \n", + "4995 True True True True True \n", + "4996 True True True True True \n", + "4997 True True True True True \n", + "4998 True True True True True \n", + "4999 True True True True True \n", + "\n", + " cont_var2 cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", + "0 True True True True True ... True \n", + "1 True True True True True ... True \n", + "2 True True True True True ... True \n", + "3 True True True True True ... True \n", + "4 True True True True True ... True \n", + "... ... ... ... ... ... ... ... \n", + "4995 True True True True True ... True \n", + "4996 True True True True True ... True \n", + "4997 True True True True True ... True \n", + "4998 True True True True True ... True \n", + "4999 True True True True True ... True \n", + "\n", + " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", + "0 True True True \n", + "1 True True True \n", + "2 True True True \n", + "3 True True True \n", + "4 True True True \n", + "... ... ... ... \n", + "4995 True True True \n", + "4996 True True True \n", + "4997 True True True \n", + "4998 True True True \n", + "4999 True True True \n", + "\n", + " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", + "0 True True True True \n", + "1 True True True True \n", + "2 True True True True \n", + "3 True True True True \n", + "4 True True True True \n", + "... ... ... ... ... \n", + "4995 True True True True \n", + "4996 True True True True \n", + "4997 True True True True \n", + "4998 True True True True \n", + "4999 True True True True \n", + "\n", + " cont_var3_enc cont_var1_enc \n", + "0 True True \n", + "1 True True \n", + "2 True True \n", + "3 True True \n", + "4 True True \n", + "... ... ... 
\n", + "4995 True True \n", + "4996 True True \n", + "4997 True True \n", + "4998 True True \n", + "4999 True True \n", + "\n", + "[5000 rows x 21 columns]" + ] + }, + "execution_count": 531, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Double check transformed basetable is the same\n", + "basetable_transformed_orig == basetable_transformed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b478d7c-46d8-4ba9-bf84-375a7cf901a8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cobra_venv", + "language": "python", + "name": "cobra_venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/model_json.json b/notebooks/model_json.json new file mode 100644 index 0000000..fd80281 --- /dev/null +++ b/notebooks/model_json.json @@ -0,0 +1,216 @@ +{ + "metadata": { + "timestamp": "16/06/2023 18:00:26" + }, + "categorical_data_processor": { + "category_size_threshold": 5, + "forced_categories": {}, + "keep_missing": true, + "model_type": "classification", + "p_value_threshold": 0.001, + "regroup": true, + "regroup_name": "Other", + "scale_contingency_table": true, + "_cleaned_categories_by_column": { + "CategoryVar1": [], + "CategoryVar2": [], + "CategoryVar3": [] + } + }, + "discretizer": { + "auto_adapt_bins": false, + "change_endpoint_format": false, + "closed": "right", + "label_format": "{} - {}", + "n_bins": 10, + "starting_precision": 0, + "strategy": "quantile", + "_bins_by_column": { + "cont_var2": [ + [ + 0.0, + 1.0 + ], + [ + 1.0, + 2.0 + ], + [ + 2.0, + 3.0 + ], + [ + 3.0, + 4.0 + ], + [ + 4.0, + 5.0 + ], + [ + 5.0, + 6.0 + ], + [ + 6.0, + 7.0 + ], + [ + 7.0, + 8.0 + ], + [ + 8.0, + 9.0 + ], + [ + 9.0, + 10.0 + ] + ], + "cont_var3": [ + [ + 0.0, + 0.1 + ], + [ + 0.1, + 0.2 + ], + [ + 0.2, + 0.4 + ], + [ + 0.4, + 0.5 + ], + [ + 0.5, + 0.7 + ], + [ + 0.7, + 0.9 + ], + [ + 0.9, + 1.3 + ], + [ + 1.3, + 1.7 + ], + [ + 1.7, + 2.4 + ], + [ + 2.4, + 7.6 + ] + ], + "cont_var1": [ + [ + -4.0, + -1.3 + ], + [ + -1.3, + -0.8 + ], + [ + -0.8, + -0.5 + ], + [ + -0.5, + -0.2 + ], + [ + -0.2, + 0.0 + ], + [ + 0.0, + 0.2 + ], + [ + 0.2, + 0.5 + ], + [ + 0.5, + 0.8 + ], + [ + 0.8, + 1.2 + ], + [ + 1.2, + 3.7 + ] + ] + } + }, + "target_encoder": { + "imputation_strategy": "mean", + "weight": 0.0, + "_mapping": { + "CategoryVar1_processed": { + "Category A": 0.49159663865546216, + "Category B": 0.4733668341708543, + "Category C": 0.5042735042735043 + }, + "CategoryVar2_processed": { + "Category A": 0.48643410852713176, + "Category B": 0.49588477366255146, + "Category C": 0.4879518072289157 + }, + "CategoryVar3_processed": { + "Category A": 0.5148717948717949, + "Category B": 0.491, + "Category C": 0.4653658536585366 + }, + "cont_var2_bin": { + "0.0 - 1.0": 0.47540983606557374, + "1.0 - 2.0": 0.46855345911949686, + "2.0 - 3.0": 0.45569620253164556, + "3.0 - 4.0": 0.5133333333333333, + "4.0 - 5.0": 0.4673913043478261, + "5.0 - 6.0": 0.5307443365695793, + "6.0 - 7.0": 0.5232974910394266, + "7.0 - 8.0": 0.4902597402597403, + "8.0 - 9.0": 0.5033333333333333, + "9.0 - 10.0": 0.4740484429065744 + }, + "cont_var3_bin": { + "0.0 - 0.1": 0.49429657794676807, + "0.1 - 0.2": 0.5040650406504065, + "0.2 - 0.4": 0.4897025171624714, + 
"0.4 - 0.5": 0.5, + "0.5 - 0.7": 0.5243553008595988, + "0.7 - 0.9": 0.4703703703703704, + "0.9 - 1.3": 0.47146401985111663, + "1.3 - 1.7": 0.4868913857677903, + "1.7 - 2.4": 0.43416370106761565, + "2.4 - 7.6": 0.5258064516129032 + }, + "cont_var1_bin": { + "-4.0 - -1.3": 0.5622895622895623, + "-1.3 - -0.8": 0.5233644859813084, + "-0.8 - -0.5": 0.4358974358974359, + "-0.5 - -0.2": 0.43322475570032576, + "-0.2 - 0.0": 0.5219123505976095, + "0.0 - 0.2": 0.4763779527559055, + "0.2 - 0.5": 0.49299719887955185, + "0.5 - 0.8": 0.5054545454545455, + "0.8 - 1.2": 0.4539249146757679, + "1.2 - 3.7": 0.4984984984984985 + } + }, + "_global_mean": 0.49 + }, + "_is_fitted": true +} \ No newline at end of file diff --git a/tests/model_building/test_forward_selection.py b/tests/model_building/test_forward_selection.py index 19f7157..9383f73 100644 --- a/tests/model_building/test_forward_selection.py +++ b/tests/model_building/test_forward_selection.py @@ -1,213 +1,213 @@ - -from contextlib import contextmanager -import pytest -import pandas as pd - -from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel -from cobra.model_building.forward_selection import ForwardFeatureSelection - -@contextmanager -def does_not_raise(): - yield - -def mock_data(add_split_col: bool=False, model_type="classification"): - data = pd.DataFrame({"var1_enc": [0.42] * 10, - "var2_enc": [0.94] * 10, - "var3_enc": [0.87] * 10}) - - if model_type == "classification": - data["target"] = ([0] * 5 + [1] * 2 + [0] * 2 + [1]) - elif model_type == "regression": - data["target"] = [7, 2, 2, 9, 7, 3, 1, 4, 8, 5] - - if add_split_col: - data.loc[:, "split"] = (["train"] * 7 + ["selection"] * 3) - - return data - -def mock_model_num_pred(n_predictors, model_type="classification"): - predictors = [f"var{i + 1}_enc" for i in range(n_predictors)] - return mock_model(predictors, model_type) - -def mock_model(predictor_list, model_type="classification"): - if model_type == "classification": - model = LogisticRegressionModel() - elif model_type == "regression": - model = LinearRegressionModel() - - model.predictors = predictor_list - - return model - - -class TestForwardFeatureSelection: - - def test_get_model_from_step(self): - - forward_selection = ForwardFeatureSelection() - - with pytest.raises(ValueError): - forward_selection.get_model_from_step(2) - - @pytest.mark.parametrize("model_type", ["classification", "regression"]) - def test_compute_model_performances(self, mocker, model_type): - - data = mock_data(add_split_col=True, model_type=model_type) - - fw_selection = ForwardFeatureSelection(model_type=model_type) - fw_selection._fitted_models = [ - mock_model_num_pred(1, model_type=model_type), - mock_model_num_pred(2, model_type=model_type), - mock_model_num_pred(3, model_type=model_type) - ] - - def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the same for RMSE as it is a mock - if split == "train": - return 0.612 - else: - return 0.609 - - if model_type == "classification": - patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate" - elif model_type == "regression": - patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate" - - mocker.patch(patch_fct, mock_evaluate) - - actual = (fw_selection - .compute_model_performances(data, "target", - splits=["train", "selection"], - metric=None)) - - expected = pd.DataFrame([ - {"predictors": ["var1_enc"], - "last_added_predictor": "var1_enc", - "train_performance": 0.612, 
"selection_performance": 0.609, - "model_type": model_type}, - {"predictors": ["var1_enc", "var2_enc"], - "last_added_predictor": "var2_enc", - "train_performance": 0.612, "selection_performance": 0.609, - "model_type": model_type}, - {"predictors": ["var1_enc", "var2_enc", "var3_enc"], - "last_added_predictor": "var3_enc", - "train_performance": 0.612, "selection_performance": 0.609, - "model_type": model_type} - ]) - - pd.testing.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize("model_type", ["classification", "regression"]) - def test_ffs_train_data_assertions(self, model_type): - - fw_selection = ForwardFeatureSelection(model_type=model_type) - - with pytest.raises(AssertionError): # no split column - fw_selection.fit(pd.DataFrame(), "target", predictors=[""]) - - df = mock_data(add_split_col=True, model_type=model_type) - with pytest.raises(AssertionError): # not at least train & selection sets - fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""]) - - @pytest.mark.parametrize("model_type, max_predictors, expectation", - [("classification", 2, pytest.raises(ValueError)), - ("classification", 3, does_not_raise()), - ("classification", 5, does_not_raise()), - ("classification", 10, does_not_raise()), - ("classification", 15, does_not_raise()), - ("regression", 2, pytest.raises(ValueError)), - ("regression", 3, does_not_raise()), - ("regression", 5, does_not_raise()), - ("regression", 10, does_not_raise()), - ("regression", 15, does_not_raise()) - ]) - def test_fit(self, mocker, model_type, max_predictors: int, expectation): - - # create list of elements [var1_enc, var2_enc, ..., var10_enc] - predictors_list = [f"var{i+1}_enc" for i in range(10)] - # extract sublist [var1_enc, var5_enc, var9_enc] - forced_predictors_list = predictors_list[::4] - - ordered_output_list = (forced_predictors_list - + [pred for pred in predictors_list - if pred not in forced_predictors_list]) - - fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) - - def mock_train_model(self, train_data, target_column_name, predictors): - return mock_model(predictors, model_type=model_type) - - def mock_forward_selection(self, train_data, target_column_name, - predictors, forced_predictors): - n_models = min(max_predictors, len(predictors) + len(forced_predictors)) - - return [mock_model(ordered_output_list[:i+1], model_type=model_type) - for i in range(n_models)] - - mocker.patch("cobra.model_building.ForwardFeatureSelection._train_model", - mock_train_model) - - mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection", - mock_forward_selection) - - df = mock_data(add_split_col=True, model_type=model_type) - with expectation: - fw_selection.fit(df, "target", # data is ignored - predictors=predictors_list, - forced_predictors=forced_predictors_list, - excluded_predictors=[]) - - # for each fitted model, check number of predictors - actual = [model.predictors - for model in fw_selection._fitted_models] - - expected = [ordered_output_list[:i+1] - for i in range(min(max_predictors, - len(predictors_list)))] - - if max_predictors == len(forced_predictors_list): - expected = [forced_predictors_list] - - assert actual == expected - - @pytest.mark.parametrize("model_type, max_predictors", [("classification", 5), - ("classification", 10), - ("classification", 15), - ("regression", 5), - ("regression", 10), - ("regression", 15) - ]) - def test_forward_selection(self, mocker, model_type, max_predictors: int): - - # create list of 
elements [var1_enc, var2_c, ..., var10_enc] - predictors_list = [f"var{i+1}_enc" for i in range(10)] - - # extract sublist [var1_enc, var5_enc, var9_enc]: - forced_predictors = predictors_list[::4] - # remove these from predictors list to have clean version - predictors = [pred for pred in predictors_list - if pred not in forced_predictors] - - ordered_output_list = forced_predictors + predictors - - def mock_find_next_best_model(self, train_data, target_column_name, - candidate_predictors, - current_predictors): - return mock_model(current_predictors + candidate_predictors[0:1], model_type=model_type) - - mocker.patch(("cobra.model_building.ForwardFeatureSelection." - "_find_next_best_model"), mock_find_next_best_model) - - fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) - - fitted_models = (fw_selection. - _forward_selection(pd.DataFrame(), "target", - predictors, - forced_predictors)) - - actual = [sorted(model.predictors) for model in fitted_models] - - expected = [sorted(ordered_output_list[:i+1]) - for i in range(min(max_predictors, - len(predictors_list)))] - - assert actual == expected + +from contextlib import contextmanager +import pytest +import pandas as pd + +from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel +from cobra.model_building.forward_selection import ForwardFeatureSelection + +@contextmanager +def does_not_raise(): + yield + +def mock_data(add_split_col: bool=False, model_type="classification"): + data = pd.DataFrame({"var1_enc": [0.42] * 10, + "var2_enc": [0.94] * 10, + "var3_enc": [0.87] * 10}) + + if model_type == "classification": + data["target"] = ([0] * 5 + [1] * 2 + [0] * 2 + [1]) + elif model_type == "regression": + data["target"] = [7, 2, 2, 9, 7, 3, 1, 4, 8, 5] + + if add_split_col: + data.loc[:, "split"] = (["train"] * 7 + ["selection"] * 3) + + return data + +def mock_model_num_pred(n_predictors, model_type="classification"): + predictors = [f"var{i + 1}_enc" for i in range(n_predictors)] + return mock_model(predictors, model_type) + +def mock_model(predictor_list, model_type="classification"): + if model_type == "classification": + model = LogisticRegressionModel() + elif model_type == "regression": + model = LinearRegressionModel() + + model.predictors = predictor_list + + return model + + +class TestForwardFeatureSelection: + + def test_get_model_from_step(self): + + forward_selection = ForwardFeatureSelection() + + with pytest.raises(ValueError): + forward_selection.get_model_from_step(2) + + @pytest.mark.parametrize("model_type", ["classification", "regression"]) + def test_compute_model_performances(self, mocker, model_type): + + data = mock_data(add_split_col=True, model_type=model_type) + + fw_selection = ForwardFeatureSelection(model_type=model_type) + fw_selection._fitted_models = [ + mock_model_num_pred(1, model_type=model_type), + mock_model_num_pred(2, model_type=model_type), + mock_model_num_pred(3, model_type=model_type) + ] + + def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the same for RMSE as it is a mock + if split == "train": + return 0.612 + else: + return 0.609 + + if model_type == "classification": + patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate" + elif model_type == "regression": + patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate" + + mocker.patch(patch_fct, mock_evaluate) + + actual = (fw_selection + .compute_model_performances(data, "target", + 
splits=["train", "selection"], + metric=None)) + + expected = pd.DataFrame([ + {"predictors": ["var1_enc"], + "last_added_predictor": "var1_enc", + "train_performance": 0.612, "selection_performance": 0.609, + "model_type": model_type}, + {"predictors": ["var1_enc", "var2_enc"], + "last_added_predictor": "var2_enc", + "train_performance": 0.612, "selection_performance": 0.609, + "model_type": model_type}, + {"predictors": ["var1_enc", "var2_enc", "var3_enc"], + "last_added_predictor": "var3_enc", + "train_performance": 0.612, "selection_performance": 0.609, + "model_type": model_type} + ]) + + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("model_type", ["classification", "regression"]) + def test_ffs_train_data_assertions(self, model_type): + + fw_selection = ForwardFeatureSelection(model_type=model_type) + + with pytest.raises(AssertionError): # no split column + fw_selection.fit(pd.DataFrame(), "target", predictors=[""]) + + df = mock_data(add_split_col=True, model_type=model_type) + with pytest.raises(AssertionError): # not at least train & selection sets + fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""]) + + @pytest.mark.parametrize("model_type, max_predictors, expectation", + [("classification", 2, pytest.raises(ValueError)), + ("classification", 3, does_not_raise()), + ("classification", 5, does_not_raise()), + ("classification", 10, does_not_raise()), + ("classification", 15, does_not_raise()), + ("regression", 2, pytest.raises(ValueError)), + ("regression", 3, does_not_raise()), + ("regression", 5, does_not_raise()), + ("regression", 10, does_not_raise()), + ("regression", 15, does_not_raise()) + ]) + def test_fit(self, mocker, model_type, max_predictors: int, expectation): + + # create list of elements [var1_enc, var2_enc, ..., var10_enc] + predictors_list = [f"var{i+1}_enc" for i in range(10)] + # extract sublist [var1_enc, var5_enc, var9_enc] + forced_predictors_list = predictors_list[::4] + + ordered_output_list = (forced_predictors_list + + [pred for pred in predictors_list + if pred not in forced_predictors_list]) + + fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) + + def mock_train_model(self, train_data, target_column_name, predictors): + return mock_model(predictors, model_type=model_type) + + def mock_forward_selection(self, train_data, target_column_name, + predictors, forced_predictors): + n_models = min(max_predictors, len(predictors) + len(forced_predictors)) + + return [mock_model(ordered_output_list[:i+1], model_type=model_type) + for i in range(n_models)] + + mocker.patch("cobra.model_building.ForwardFeatureSelection._train_model", + mock_train_model) + + mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection", + mock_forward_selection) + + df = mock_data(add_split_col=True, model_type=model_type) + with expectation: + fw_selection.fit(df, "target", # data is ignored + predictors=predictors_list, + forced_predictors=forced_predictors_list, + excluded_predictors=[]) + + # for each fitted model, check number of predictors + actual = [model.predictors + for model in fw_selection._fitted_models] + + expected = [ordered_output_list[:i+1] + for i in range(min(max_predictors, + len(predictors_list)))] + + if max_predictors == len(forced_predictors_list): + expected = [forced_predictors_list] + + assert actual == expected + + @pytest.mark.parametrize("model_type, max_predictors", [("classification", 5), + ("classification", 10), + ("classification", 
15), + ("regression", 5), + ("regression", 10), + ("regression", 15) + ]) + def test_forward_selection(self, mocker, model_type, max_predictors: int): + + # create list of elements [var1_enc, var2_c, ..., var10_enc] + predictors_list = [f"var{i+1}_enc" for i in range(10)] + + # extract sublist [var1_enc, var5_enc, var9_enc]: + forced_predictors = predictors_list[::4] + # remove these from predictors list to have clean version + predictors = [pred for pred in predictors_list + if pred not in forced_predictors] + + ordered_output_list = forced_predictors + predictors + + def mock_find_next_best_model(self, train_data, target_column_name, + candidate_predictors, + current_predictors): + return mock_model(current_predictors + candidate_predictors[0:1], model_type=model_type) + + mocker.patch(("cobra.model_building.ForwardFeatureSelection." + "_find_next_best_model"), mock_find_next_best_model) + + fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors) + + fitted_models = (fw_selection. + _forward_selection(pd.DataFrame(), "target", + predictors, + forced_predictors)) + + actual = [sorted(model.predictors) for model in fitted_models] + + expected = [sorted(ordered_output_list[:i+1]) + for i in range(min(max_predictors, + len(predictors_list)))] + + assert actual == expected diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py index 7eca6e6..20fce9f 100644 --- a/tests/model_building/test_models.py +++ b/tests/model_building/test_models.py @@ -1,258 +1,258 @@ - -import numpy as np -import pandas as pd - -from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel - -def mock_data(): - return pd.DataFrame({"var1_enc": [0.42] * 10, - "var2_enc": [0.94] * 10, - "var3_enc": [0.87] * 10}) - - -def mock_score_model_classification(self, data): - return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5]) - -def mock_score_model_regression(self, data): - return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15 - -class TestLogisticRegressionModel: - - def test_evaluate(self, mocker): - - X = mock_data() - y = pd.Series([1] * 5 + [0] * 5) - - def mock_roc_auc_score(y_true, y_score): - return 0.79 - - (mocker - .patch("cobra.model_building.LogisticRegressionModel.score_model", - mock_score_model_classification)) - - (mocker - .patch("cobra.model_building.models.roc_auc_score", - mock_roc_auc_score)) - - model = LogisticRegressionModel() - actual = model.evaluate(X, y) - - assert actual == 0.79 - - def test_evaluate_cached(self): - - split = "train" - expected = 0.79 - - model = LogisticRegressionModel() - model._eval_metrics_by_split["train"] = expected - - actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) - - assert actual == expected - - def test_compute_variable_importance(self, mocker): - - def mock_pearsonr(ypred, ytrue): - return [ypred.unique()[0]] - - (mocker - .patch("cobra.model_building.LogisticRegressionModel.score_model", - mock_score_model_classification)) - - (mocker - .patch("cobra.model_building.models.stats.pearsonr", - mock_pearsonr)) - - model = LogisticRegressionModel() - model.predictors = ["var1_enc", "var2_enc", "var3_enc"] - - data = mock_data() - - actual = model.compute_variable_importance(data) - - expected = pd.DataFrame([ - {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, - {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, - {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} - 
]).sort_values(by="importance", ascending=False).reset_index(drop=True) - - pd.testing.assert_frame_equal(actual, expected) - - def test_serialize(self): - - model = LogisticRegressionModel() - actual = model.serialize() - - expected = { - "meta": "logistic-regression", - "predictors": [], - "_eval_metrics_by_split": {}, - "params": { - "C": 1000000000.0, - "class_weight": None, - "dual": False, - "fit_intercept": True, - "intercept_scaling": 1, - "l1_ratio": None, - "max_iter": 100, - "multi_class": "auto", - "n_jobs": None, - "penalty": "l2", - "random_state": 42, - "solver": "liblinear", - "tol": 0.0001, - "verbose": 0, - "warm_start": False - } - } - - assert actual == expected - - def test_deserialize(self): - - model = LogisticRegressionModel() - - model_dict = { - "meta": "logistic-regression", - "predictors": [], - "_eval_metrics_by_split": {}, - "params": { - "C": 1000000000.0, - "class_weight": None, - "dual": False, - "fit_intercept": True, - "intercept_scaling": 1, - "l1_ratio": None, - "max_iter": 100, - "multi_class": "auto", - "n_jobs": None, - "penalty": "l2", - "random_state": 42, - "solver": "liblinear", - "tol": 0.0001, - "verbose": 0, - "warm_start": False - }, - "classes_": [0, 1], - "coef_": [[0.5, 0.75]], - "intercept_": [-3], - "n_iter_": [10] - } - - model.deserialize(model_dict) - - logit = model.logit - assert logit.get_params() == model_dict["params"] - assert logit.classes_.all() == np.array(model_dict["classes_"]).all() - assert logit.n_iter_.all() == np.array(model_dict["n_iter_"]).all() - assert logit.intercept_.all() == (np.array(model_dict["intercept_"]).all()) - assert logit.coef_.all() == np.array(model_dict["coef_"]).all() - -class TestLinearRegressionModel: - - def test_evaluate(self, mocker): - - X = mock_data() - y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12) - - def mock_mean_squared_error(y_true, y_pred): - return 1.23 - - (mocker - .patch("cobra.model_building.LinearRegressionModel.score_model", - mock_score_model_regression)) - - (mocker - .patch("cobra.model_building.models.mean_squared_error", - mock_mean_squared_error)) - - model = LinearRegressionModel() - actual = model.evaluate(X, y) - - assert actual == np.sqrt(1.23) - - def test_evaluate_cached(self): - - split = "train" - expected = np.sqrt(1.23) - - model = LinearRegressionModel() - model._eval_metrics_by_split["train"] = expected - - actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) - - assert actual == expected - - def test_compute_variable_importance(self, mocker): - - def mock_pearsonr(ypred, ytrue): - return [ypred.unique()[0]] - - (mocker - .patch("cobra.model_building.LinearRegressionModel.score_model", - mock_score_model_regression)) - - (mocker - .patch("cobra.model_building.models.stats.pearsonr", - mock_pearsonr)) - - model = LinearRegressionModel() - model.predictors = ["var1_enc", "var2_enc", "var3_enc"] - - data = mock_data() - - actual = model.compute_variable_importance(data) - - expected = pd.DataFrame([ - {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, - {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, - {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} - ]).sort_values(by="importance", ascending=False).reset_index(drop=True) - - pd.testing.assert_frame_equal(actual, expected) - - def test_serialize(self): - - model = LinearRegressionModel() - actual = model.serialize() - - expected = { - "meta": "linear-regression", - "predictors": [], - 
"_eval_metrics_by_split": {}, - "params": { - "copy_X": True, - "fit_intercept": True, - "n_jobs": None, - "positive": False - } - } - - assert actual == expected - - def test_deserialize(self): - - model = LinearRegressionModel() - - model_dict = { - "meta": "linear-regression", - "predictors": [], - "_eval_metrics_by_split": {}, - "params": { - "copy_X": True, - "fit_intercept": True, - "n_jobs": None, - "positive": False - }, - "coef_": [[0.5, 0.75]], - "intercept_": [-3] - } - - model.deserialize(model_dict) - - linear = model.linear - assert linear.get_params() == model_dict["params"] - assert linear.intercept_.all() == (np.array(model_dict["intercept_"]).all()) - assert linear.coef_.all() == np.array(model_dict["coef_"]).all() - + +import numpy as np +import pandas as pd + +from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel + +def mock_data(): + return pd.DataFrame({"var1_enc": [0.42] * 10, + "var2_enc": [0.94] * 10, + "var3_enc": [0.87] * 10}) + + +def mock_score_model_classification(self, data): + return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5]) + +def mock_score_model_regression(self, data): + return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15 + +class TestLogisticRegressionModel: + + def test_evaluate(self, mocker): + + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) + + def mock_roc_auc_score(y_true, y_score): + return 0.79 + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + (mocker + .patch("cobra.model_building.models.roc_auc_score", + mock_roc_auc_score)) + + model = LogisticRegressionModel() + actual = model.evaluate(X, y) + + assert actual == 0.79 + + def test_evaluate_cached(self): + + split = "train" + expected = 0.79 + + model = LogisticRegressionModel() + model._eval_metrics_by_split["train"] = expected + + actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) + + assert actual == expected + + def test_compute_variable_importance(self, mocker): + + def mock_pearsonr(ypred, ytrue): + return [ypred.unique()[0]] + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + (mocker + .patch("cobra.model_building.models.stats.pearsonr", + mock_pearsonr)) + + model = LogisticRegressionModel() + model.predictors = ["var1_enc", "var2_enc", "var3_enc"] + + data = mock_data() + + actual = model.compute_variable_importance(data) + + expected = pd.DataFrame([ + {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, + {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, + {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} + ]).sort_values(by="importance", ascending=False).reset_index(drop=True) + + pd.testing.assert_frame_equal(actual, expected) + + def test_serialize(self): + + model = LogisticRegressionModel() + actual = model.serialize() + + expected = { + "meta": "logistic-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "C": 1000000000.0, + "class_weight": None, + "dual": False, + "fit_intercept": True, + "intercept_scaling": 1, + "l1_ratio": None, + "max_iter": 100, + "multi_class": "auto", + "n_jobs": None, + "penalty": "l2", + "random_state": 42, + "solver": "liblinear", + "tol": 0.0001, + "verbose": 0, + "warm_start": False + } + } + + assert actual == expected + + def test_deserialize(self): + + model = LogisticRegressionModel() + + model_dict = { + "meta": 
"logistic-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "C": 1000000000.0, + "class_weight": None, + "dual": False, + "fit_intercept": True, + "intercept_scaling": 1, + "l1_ratio": None, + "max_iter": 100, + "multi_class": "auto", + "n_jobs": None, + "penalty": "l2", + "random_state": 42, + "solver": "liblinear", + "tol": 0.0001, + "verbose": 0, + "warm_start": False + }, + "classes_": [0, 1], + "coef_": [[0.5, 0.75]], + "intercept_": [-3], + "n_iter_": [10] + } + + model.deserialize(model_dict) + + logit = model.logit + assert logit.get_params() == model_dict["params"] + assert logit.classes_.all() == np.array(model_dict["classes_"]).all() + assert logit.n_iter_.all() == np.array(model_dict["n_iter_"]).all() + assert logit.intercept_.all() == (np.array(model_dict["intercept_"]).all()) + assert logit.coef_.all() == np.array(model_dict["coef_"]).all() + +class TestLinearRegressionModel: + + def test_evaluate(self, mocker): + + X = mock_data() + y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12) + + def mock_mean_squared_error(y_true, y_pred): + return 1.23 + + (mocker + .patch("cobra.model_building.LinearRegressionModel.score_model", + mock_score_model_regression)) + + (mocker + .patch("cobra.model_building.models.mean_squared_error", + mock_mean_squared_error)) + + model = LinearRegressionModel() + actual = model.evaluate(X, y) + + assert actual == np.sqrt(1.23) + + def test_evaluate_cached(self): + + split = "train" + expected = np.sqrt(1.23) + + model = LinearRegressionModel() + model._eval_metrics_by_split["train"] = expected + + actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) + + assert actual == expected + + def test_compute_variable_importance(self, mocker): + + def mock_pearsonr(ypred, ytrue): + return [ypred.unique()[0]] + + (mocker + .patch("cobra.model_building.LinearRegressionModel.score_model", + mock_score_model_regression)) + + (mocker + .patch("cobra.model_building.models.stats.pearsonr", + mock_pearsonr)) + + model = LinearRegressionModel() + model.predictors = ["var1_enc", "var2_enc", "var3_enc"] + + data = mock_data() + + actual = model.compute_variable_importance(data) + + expected = pd.DataFrame([ + {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, + {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, + {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} + ]).sort_values(by="importance", ascending=False).reset_index(drop=True) + + pd.testing.assert_frame_equal(actual, expected) + + def test_serialize(self): + + model = LinearRegressionModel() + actual = model.serialize() + + expected = { + "meta": "linear-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "copy_X": True, + "fit_intercept": True, + "n_jobs": None, + "positive": False + } + } + + assert actual == expected + + def test_deserialize(self): + + model = LinearRegressionModel() + + model_dict = { + "meta": "linear-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "copy_X": True, + "fit_intercept": True, + "n_jobs": None, + "positive": False + }, + "coef_": [[0.5, 0.75]], + "intercept_": [-3] + } + + model.deserialize(model_dict) + + linear = model.linear + assert linear.get_params() == model_dict["params"] + assert linear.intercept_.all() == (np.array(model_dict["intercept_"]).all()) + assert linear.coef_.all() == np.array(model_dict["coef_"]).all() + diff --git a/tests/preprocessing/test_categorical_data_processor.py 
b/tests/preprocessing/test_categorical_data_processor.py index dd53434..73f5f4e 100644 --- a/tests/preprocessing/test_categorical_data_processor.py +++ b/tests/preprocessing/test_categorical_data_processor.py @@ -1,313 +1,313 @@ - -import pytest -import numpy as np -import pandas as pd - -from cobra.preprocessing import CategoricalDataProcessor - -class TestCategoricalDataProcessor: - - def test_attributes_to_dict(self): - - processor = CategoricalDataProcessor() - - cleaned_categories = ["a", "b", "c"] - processor._cleaned_categories_by_column = { - "variable": set(cleaned_categories) - } - - actual = processor.attributes_to_dict() - - expected = { - "model_type": "classification", - "regroup": True, - "regroup_name": "Other", - "keep_missing": True, - "category_size_threshold": 5, - "p_value_threshold": 0.001, - "scale_contingency_table": True, - "forced_categories": {}, - "_cleaned_categories_by_column": { - "variable": list(set(cleaned_categories)) - } - } - - assert actual == expected - - @pytest.mark.parametrize("attribute", - ["regroup", "regroup_name", "keep_missing", - "category_size_threshold", "p_value_threshold", - "scale_contingency_table", "forced_categories", - "_cleaned_categories_by_column"]) - def test_set_attributes_from_dict(self, attribute): - - processor = CategoricalDataProcessor() - - cleaned_categories = ["a", "b", "c"] - params = { - "regroup": True, - "regroup_name": "Other", - "keep_missing": True, - "category_size_threshold": 5, - "p_value_threshold": 0.001, - "scale_contingency_table": True, - "forced_categories": {}, - "_cleaned_categories_by_column": { - "variable": cleaned_categories - } - } - - expected = params[attribute] - - if attribute == "_cleaned_categories_by_column": - # list is transformed to a set in CategoricalDataProcessor - expected = {"variable": set(cleaned_categories)} - - processor.set_attributes_from_dict(params) - - actual = getattr(processor, attribute) - - assert actual == expected - - @pytest.mark.parametrize("scale_contingency_table, expected", - [(False, 0.01329), - (True, 0.43437)]) - def test_compute_p_value_classification(self, scale_contingency_table, expected): - - X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) - y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2)) - category = "c1" - - actual = (CategoricalDataProcessor - ._compute_p_value(X, y, category, "classification", scale_contingency_table)) - - assert pytest.approx(actual, abs=1e-5) == expected - - @pytest.mark.parametrize("seed, expected", - [(505, 0.02222), - (603, 0.89230)]) - def test_compute_p_value_regression(self, seed, expected): - - np.random.seed(seed) - - X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) - y = pd.Series(data=np.random.uniform(0, 1, 100)*5) - category = "c1" - - actual = (CategoricalDataProcessor - ._compute_p_value(X, y, category, "regression", None)) - - assert pytest.approx(actual, abs=1e-5) == expected - - def test_get_small_categories(self): - - data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5)) - incidence = 0.35 - threshold = 10 # to make it easy to manualLy compute - expected = {"c3", "c4"} - - actual = (CategoricalDataProcessor - ._get_small_categories(data, incidence, threshold)) - - assert actual == expected - - def test_replace_missings(self): - - data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]}) - expected = pd.DataFrame({"variable": ["c1", "c2", "Missing", "Missing", - "Missing"] - }) - actual = (CategoricalDataProcessor - ._replace_missings(data, ["variable"])) 
- - pd.testing.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize("cleaned_categories, expected", - [({"c1", "c2"}, - pd.Series(data=["c1", "c2", "Other", "Other"])), - ({"c1", "c2", "c3", "c4"}, - pd.Series(data=["c1", "c2", "c3", "c4"]))]) - def test_replace_categories(self, cleaned_categories, expected): - - data = pd.Series(data=["c1", "c2", "c3", "c4"]) - - actual = (CategoricalDataProcessor - ._replace_categories(data, cleaned_categories, 'Other')) - - pd.testing.assert_series_equal(actual, expected) - - def test_all_cats_not_significant(self): - # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C']} - - # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0]} - - discrete_vars = ['categorical_var'] - target_column_name = 'target' - - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) - - categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - p_value_threshold=0.0001) - - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) - - actual = categorical_data_processor.transform(data, - discrete_vars) - - pd.testing.assert_frame_equal(actual, expected) - - def test_regroup_name(self): - # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0], - 'categorical_var_processed': [ - 'A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']} - - # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0]} - - discrete_vars = ['categorical_var'] - target_column_name = 'target' - - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) - - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) - - categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - regroup_name='OTH', - p_value_threshold=0.05) - - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) - - actual = categorical_data_processor.transform(data, - discrete_vars) - - pd.testing.assert_frame_equal(actual, expected) - - def test_force_category(self): - # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C']} - - # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', - 'B', 'B', 'B', 'B', 'B', 'B', - 'C', 'C', 'C', 'C', 'C', 'C'], - 'target': [1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, - 1, 0, 1, 0, 1, 0]} - - discrete_vars = ['categorical_var'] - target_column_name = 'target' - - data = pd.DataFrame(d, columns=['categorical_var', 
'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) - - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) - - categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - forced_categories={'categorical_var': ['C']}, - p_value_threshold=0.05) - - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) - - actual = categorical_data_processor.transform(data, - discrete_vars) - - pd.testing.assert_frame_equal(actual, expected) - - def test_categorical_variable_is_constant(self): - # Expected - e = {'categorical_var': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0], - 'categorical_var_processed': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A']} - - # data -> actual - d = {'categorical_var': ['A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A', - 'A', 'A', 'A', 'A'], - 'target': [1, 1, 1, 1, - 0, 0, 0, 0, - 1, 0, 1, 0]} - - discrete_vars = ['categorical_var'] - target_column_name = 'target' - - data = pd.DataFrame(d, columns=['categorical_var', 'target']) - expected = pd.DataFrame(e, columns=['categorical_var', - 'target', - 'categorical_var_processed']) - - expected['categorical_var_processed'] = ( - expected['categorical_var_processed'].astype("category")) - - categorical_data_processor = CategoricalDataProcessor( - category_size_threshold=0, - p_value_threshold=0.0001) - - categorical_data_processor.fit(data, - discrete_vars, - target_column_name) - - actual = categorical_data_processor.transform(data, - discrete_vars) - - pd.testing.assert_frame_equal(actual, expected) + +import pytest +import numpy as np +import pandas as pd + +from cobra.preprocessing import CategoricalDataProcessor + +class TestCategoricalDataProcessor: + + def test_attributes_to_dict(self): + + processor = CategoricalDataProcessor() + + cleaned_categories = ["a", "b", "c"] + processor._cleaned_categories_by_column = { + "variable": set(cleaned_categories) + } + + actual = processor.attributes_to_dict() + + expected = { + "model_type": "classification", + "regroup": True, + "regroup_name": "Other", + "keep_missing": True, + "category_size_threshold": 5, + "p_value_threshold": 0.001, + "scale_contingency_table": True, + "forced_categories": {}, + "_cleaned_categories_by_column": { + "variable": list(set(cleaned_categories)) + } + } + + assert actual == expected + + @pytest.mark.parametrize("attribute", + ["regroup", "regroup_name", "keep_missing", + "category_size_threshold", "p_value_threshold", + "scale_contingency_table", "forced_categories", + "_cleaned_categories_by_column"]) + def test_set_attributes_from_dict(self, attribute): + + processor = CategoricalDataProcessor() + + cleaned_categories = ["a", "b", "c"] + params = { + "regroup": True, + "regroup_name": "Other", + "keep_missing": True, + "category_size_threshold": 5, + "p_value_threshold": 0.001, + "scale_contingency_table": True, + "forced_categories": {}, + "_cleaned_categories_by_column": { + "variable": cleaned_categories + } + } + + expected = params[attribute] + + if attribute == "_cleaned_categories_by_column": + # list is transformed to a set in CategoricalDataProcessor + expected = {"variable": set(cleaned_categories)} + + processor.set_attributes_from_dict(params) + + actual = getattr(processor, attribute) + + assert actual == expected + + @pytest.mark.parametrize("scale_contingency_table, expected", + [(False, 0.01329), 
+ (True, 0.43437)]) + def test_compute_p_value_classification(self, scale_contingency_table, expected): + + X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) + y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2)) + category = "c1" + + actual = (CategoricalDataProcessor + ._compute_p_value(X, y, category, "classification", scale_contingency_table)) + + assert pytest.approx(actual, abs=1e-5) == expected + + @pytest.mark.parametrize("seed, expected", + [(505, 0.02222), + (603, 0.89230)]) + def test_compute_p_value_regression(self, seed, expected): + + np.random.seed(seed) + + X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10)) + y = pd.Series(data=np.random.uniform(0, 1, 100)*5) + category = "c1" + + actual = (CategoricalDataProcessor + ._compute_p_value(X, y, category, "regression", None)) + + assert pytest.approx(actual, abs=1e-5) == expected + + def test_get_small_categories(self): + + data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5)) + incidence = 0.35 + threshold = 10 # to make it easy to manually compute + expected = {"c3", "c4"} + + actual = (CategoricalDataProcessor + ._get_small_categories(data, incidence, threshold)) + + assert actual == expected + + def test_replace_missings(self): + + data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]}) + expected = pd.DataFrame({"variable": ["c1", "c2", "Missing", "Missing", + "Missing"] + }) + actual = (CategoricalDataProcessor + ._replace_missings(data, ["variable"])) + + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("cleaned_categories, expected", + [({"c1", "c2"}, + pd.Series(data=["c1", "c2", "Other", "Other"])), + ({"c1", "c2", "c3", "c4"}, + pd.Series(data=["c1", "c2", "c3", "c4"]))]) + def test_replace_categories(self, cleaned_categories, expected): + + data = pd.Series(data=["c1", "c2", "c3", "c4"]) + + actual = (CategoricalDataProcessor + ._replace_categories(data, cleaned_categories, 'Other')) + + pd.testing.assert_series_equal(actual, expected) + + def test_all_cats_not_significant(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + p_value_threshold=0.0001) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_regroup_name(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0], + 'categorical_var_processed': [ + 'A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C',
'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + regroup_name='OTH', + p_value_threshold=0.05) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_force_category(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A', + 'B', 'B', 'B', 'B', 'B', 'B', + 'C', 'C', 'C', 'C', 'C', 'C'], + 'target': [1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + forced_categories={'categorical_var': ['C']}, + p_value_threshold=0.05) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) + + def test_categorical_variable_is_constant(self): + # Expected + e = {'categorical_var': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0], + 'categorical_var_processed': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A']} + + # data -> actual + d = {'categorical_var': ['A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A', + 'A', 'A', 'A', 'A'], + 'target': [1, 1, 1, 1, + 0, 0, 0, 0, + 1, 0, 1, 0]} + + discrete_vars = ['categorical_var'] + target_column_name = 'target' + + data = pd.DataFrame(d, columns=['categorical_var', 'target']) + expected = pd.DataFrame(e, columns=['categorical_var', + 'target', + 'categorical_var_processed']) + + expected['categorical_var_processed'] = ( + expected['categorical_var_processed'].astype("category")) + + categorical_data_processor = CategoricalDataProcessor( + category_size_threshold=0, + p_value_threshold=0.0001) + + categorical_data_processor.fit(data, + discrete_vars, + target_column_name) + + actual = categorical_data_processor.transform(data, + discrete_vars) + + pd.testing.assert_frame_equal(actual, expected) diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py index d3a643a..209d74b 100644 --- a/tests/preprocessing/test_kbins_discretizer.py +++ b/tests/preprocessing/test_kbins_discretizer.py @@ -1,252 +1,252 @@ - -from contextlib import contextmanager -import pytest -import numpy 
as np -import pandas as pd - -from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer - -@contextmanager -def does_not_raise(): - yield - - -class TestKBinsDiscretizer: - - # ---------------- Test for public methods ---------------- - def test_attributes_to_dict(self): - - discretizer = KBinsDiscretizer() - - bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] - discretizer._bins_by_column = {"variable": bins} - - actual = discretizer.attributes_to_dict() - - expected = { - "n_bins": 10, - "strategy": "quantile", - "closed": "right", - "auto_adapt_bins": False, - "starting_precision": 0, - "label_format": "{} - {}", - "change_endpoint_format": False, - "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], - [6.0, 9.0]]} - } - - assert actual == expected - - @pytest.mark.parametrize("attribute", - ["n_bins", "strategy", "closed", - "auto_adapt_bins", "starting_precision", - "label_format", "change_endpoint_format", - "_bins_by_column"]) - def test_set_attributes_from_dict(self, attribute): - - discretizer = KBinsDiscretizer() - - params = { - "n_bins": 5, - "strategy": "uniform", - "closed": "left", - "auto_adapt_bins": True, - "starting_precision": 1, - "label_format": "[,)", - "change_endpoint_format": True, - "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], - [6.0, 9.0]]} - } - - expected = params[attribute] - - if attribute == "_bins_by_column": - # list of list is transformed to a list of tuples - # in KBinsDiscretizer!!! - expected = {"variable": [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]} - - discretizer.set_attributes_from_dict(params) - - actual = getattr(discretizer, attribute) - - assert actual == expected - - # no further tests here as this is just a wrapper around _fit_column! - @pytest.mark.parametrize("strategy, expectation", - [("trees", pytest.raises(ValueError)), - ("quantile", does_not_raise())]) - def test_fit_exception(self, strategy, expectation): - discretizer = KBinsDiscretizer(strategy=strategy) - - data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - - with expectation: - discretizer.fit(data, ["variable"]) - - # no further tests here as this is just a wrapper around _transform_column! 
- @pytest.mark.parametrize("scenario, expectation", - [("raise", pytest.raises(ValueError)), - ("regular_test", does_not_raise()), - ("constant_data", does_not_raise())]) - def test_transform(self, scenario, expectation): - - discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") - - data = pd.DataFrame({"variable": ([1] * 10)}) - expected = data.copy() - - if scenario == "regular_test": - # overwrite data and expected with DataFrame containing - # a non-constant variable - data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - expected = data.copy() - - discretizer.fit(data, ["variable"]) - - categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] - expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 - + ["3.0 - 6.0"]*3 - + ["6.0 - 9.0"]*3 - + ["Missing"], - categories=categories, - ordered=True) - elif scenario == "constant_data": - discretizer.fit(data, ["variable"]) - - with expectation: - actual = discretizer.transform(data, ["variable"]) - pd.testing.assert_frame_equal(actual, expected) - - # ---------------- Test for private methods ---------------- - @pytest.mark.parametrize("n_bins, expectation", - [(1, pytest.raises(ValueError)), - (10.5, pytest.raises(ValueError)), - (2, does_not_raise())]) - def test_validate_n_bins_exception(self, n_bins, expectation): - with expectation: - assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None - - def test_transform_column(self): - - data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") - - bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] - - actual = discretizer._transform_column(data, "variable", bins) - - categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] - - expected = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) - expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 - + ["3.0 - 6.0"]*3 - + ["6.0 - 9.0"]*3 - + ["Missing"], - categories=categories, - ordered=True) - - # assert using pandas testing module - pd.testing.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize("n_bins, auto_adapt_bins, data, expected", - [(4, False, - pd.DataFrame({"variable": list(range(0, 11))}), - [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), - (8.0, 10.0)]), - (10, True, - # ints from 0-10 with 17 nan's - pd.DataFrame({"variable": list(range(0, 11)) + - ([np.nan] * 17)}), - [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), - (8.0, 10.0)]), - (10, False, - # almost constant - pd.DataFrame({"variable": [0] + ([1] * 100)}), - None), - (2, False, - pd.DataFrame({"variable": [5.4, 9.3, np.inf]}), - None)], - ids=["regular", "auto_adapt_bins", - "two bin edges", "infs"]) - def test_fit_column(self, n_bins, auto_adapt_bins, data, expected): - discretizer = KBinsDiscretizer(n_bins=n_bins, - auto_adapt_bins=auto_adapt_bins) - - actual = discretizer._fit_column(data, column_name="variable") - - assert actual == expected - - @pytest.mark.parametrize("strategy, n_bins, data, expected", - [("quantile", # strategy - 4, # n_bins - # data (ints from 0 - 10): - pd.DataFrame({"variable": list(range(0, 11))}), - [0.0, 2.5, 5, 7.5, 10.0]), # expected result - ("uniform", # strategy - 3, # n_bins - # data (ints from 0 - 9): - pd.DataFrame({"variable": list(range(0, 10))}), - [0.0, 3.0, 6.0, 9.0])], # expected result - ids=["quantile", "uniform"]) - def test_compute_bin_edges(self, strategy, n_bins, data, expected): - - discretizer = KBinsDiscretizer(strategy=strategy) - - actual = discretizer._compute_bin_edges(data, column_name="variable", - 
n_bins=n_bins, - col_min=data.variable.min(), - col_max=data.variable.max()) - - assert actual == expected - - @pytest.mark.parametrize("bin_edges, starting_precision, expected", - [([-10, 0, 1, 2], 1, 1), - ([-10, 0, 1, 1.01], 0, 2), - ([-10, 0, 1, 1.1], 1, 1), - ([-10, 0, 1, 2], -1, 0), - ([-10, 0, 10, 21], -1, -1)], - ids=["less precision", "more precision", - "equal precision", "negative start", - "round up"]) - def test_compute_minimal_precision_of_bin_edges(self, bin_edges, - starting_precision, - expected): - - discretizer = KBinsDiscretizer(starting_precision=starting_precision) - - actual = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) - - assert actual == expected - - @pytest.mark.parametrize("bin_edges, expected", - [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]), - ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]), - ([np.inf, 0.0, -np.inf], - [(np.inf, 0.0), (0.0, -np.inf)])]) - def test_compute_bins_from_edges(self, bin_edges, expected): - - discretizer = KBinsDiscretizer() - actual = discretizer._compute_bins_from_edges(bin_edges) - - assert actual == expected - - @pytest.mark.parametrize("change_endpoint_format, closed, bins, expected", - [(False, "right", [(0, 1), (1, 2), (2, 3)], - ["0 - 1", "1 - 2", "2 - 3"]), - (True, "right", [(0, 1), (1, 2), (2, 3)], - ["<= 1", "1 - 2", "> 2"]), - (True, "left", [(0, 1), (1, 2), (2, 3)], - ["< 1", "1 - 2", ">= 2"])], - ids=["standard format", "different endpoints", - "different endpoints left"]) - def test_create_bin_labels(self, change_endpoint_format, closed, - bins, expected): - - discretizer = KBinsDiscretizer( - closed=closed, - change_endpoint_format=change_endpoint_format - ) - - actual = discretizer._create_bin_labels(bins) - - assert actual == expected + +from contextlib import contextmanager +import pytest +import numpy as np +import pandas as pd + +from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer + +@contextmanager +def does_not_raise(): + yield + + +class TestKBinsDiscretizer: + + # ---------------- Test for public methods ---------------- + def test_attributes_to_dict(self): + + discretizer = KBinsDiscretizer() + + bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] + discretizer._bins_by_column = {"variable": bins} + + actual = discretizer.attributes_to_dict() + + expected = { + "n_bins": 10, + "strategy": "quantile", + "closed": "right", + "auto_adapt_bins": False, + "starting_precision": 0, + "label_format": "{} - {}", + "change_endpoint_format": False, + "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], + [6.0, 9.0]]} + } + + assert actual == expected + + @pytest.mark.parametrize("attribute", + ["n_bins", "strategy", "closed", + "auto_adapt_bins", "starting_precision", + "label_format", "change_endpoint_format", + "_bins_by_column"]) + def test_set_attributes_from_dict(self, attribute): + + discretizer = KBinsDiscretizer() + + params = { + "n_bins": 5, + "strategy": "uniform", + "closed": "left", + "auto_adapt_bins": True, + "starting_precision": 1, + "label_format": "[,)", + "change_endpoint_format": True, + "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0], + [6.0, 9.0]]} + } + + expected = params[attribute] + + if attribute == "_bins_by_column": + # list of list is transformed to a list of tuples + # in KBinsDiscretizer!!! + expected = {"variable": [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]} + + discretizer.set_attributes_from_dict(params) + + actual = getattr(discretizer, attribute) + + assert actual == expected + + # no further tests here as this is just a wrapper around _fit_column! 
+ @pytest.mark.parametrize("strategy, expectation", + [("trees", pytest.raises(ValueError)), + ("quantile", does_not_raise())]) + def test_fit_exception(self, strategy, expectation): + discretizer = KBinsDiscretizer(strategy=strategy) + + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + + with expectation: + discretizer.fit(data, ["variable"]) + + # no further tests here as this is just a wrapper around _transform_column! + @pytest.mark.parametrize("scenario, expectation", + [("raise", pytest.raises(ValueError)), + ("regular_test", does_not_raise()), + ("constant_data", does_not_raise())]) + def test_transform(self, scenario, expectation): + + discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") + + data = pd.DataFrame({"variable": ([1] * 10)}) + expected = data.copy() + + if scenario == "regular_test": + # overwrite data and expected with DataFrame containing + # a non-constant variable + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + expected = data.copy() + + discretizer.fit(data, ["variable"]) + + categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] + expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 + + ["3.0 - 6.0"]*3 + + ["6.0 - 9.0"]*3 + + ["Missing"], + categories=categories, + ordered=True) + elif scenario == "constant_data": + discretizer.fit(data, ["variable"]) + + with expectation: + actual = discretizer.transform(data, ["variable"]) + pd.testing.assert_frame_equal(actual, expected) + + # ---------------- Test for private methods ---------------- + @pytest.mark.parametrize("n_bins, expectation", + [(1, pytest.raises(ValueError)), + (10.5, pytest.raises(ValueError)), + (2, does_not_raise())]) + def test_validate_n_bins_exception(self, n_bins, expectation): + with expectation: + assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None + + def test_transform_column(self): + + data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform") + + bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)] + + actual = discretizer._transform_column(data, "variable", bins) + + categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"] + + expected = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]}) + expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4 + + ["3.0 - 6.0"]*3 + + ["6.0 - 9.0"]*3 + + ["Missing"], + categories=categories, + ordered=True) + + # assert using pandas testing module + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("n_bins, auto_adapt_bins, data, expected", + [(4, False, + pd.DataFrame({"variable": list(range(0, 11))}), + [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), + (8.0, 10.0)]), + (10, True, + # ints from 0-10 with 17 nan's + pd.DataFrame({"variable": list(range(0, 11)) + + ([np.nan] * 17)}), + [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0), + (8.0, 10.0)]), + (10, False, + # almost constant + pd.DataFrame({"variable": [0] + ([1] * 100)}), + None), + (2, False, + pd.DataFrame({"variable": [5.4, 9.3, np.inf]}), + None)], + ids=["regular", "auto_adapt_bins", + "two bin edges", "infs"]) + def test_fit_column(self, n_bins, auto_adapt_bins, data, expected): + discretizer = KBinsDiscretizer(n_bins=n_bins, + auto_adapt_bins=auto_adapt_bins) + + actual = discretizer._fit_column(data, column_name="variable") + + assert actual == expected + + @pytest.mark.parametrize("strategy, n_bins, data, expected", + [("quantile", # strategy + 4, # n_bins + # data (ints from 0 - 10): + pd.DataFrame({"variable": list(range(0, 11))}), + 
[0.0, 2.5, 5, 7.5, 10.0]), # expected result + ("uniform", # strategy + 3, # n_bins + # data (ints from 0 - 9): + pd.DataFrame({"variable": list(range(0, 10))}), + [0.0, 3.0, 6.0, 9.0])], # expected result + ids=["quantile", "uniform"]) + def test_compute_bin_edges(self, strategy, n_bins, data, expected): + + discretizer = KBinsDiscretizer(strategy=strategy) + + actual = discretizer._compute_bin_edges(data, column_name="variable", + n_bins=n_bins, + col_min=data.variable.min(), + col_max=data.variable.max()) + + assert actual == expected + + @pytest.mark.parametrize("bin_edges, starting_precision, expected", + [([-10, 0, 1, 2], 1, 1), + ([-10, 0, 1, 1.01], 0, 2), + ([-10, 0, 1, 1.1], 1, 1), + ([-10, 0, 1, 2], -1, 0), + ([-10, 0, 10, 21], -1, -1)], + ids=["less precision", "more precision", + "equal precision", "negative start", + "round up"]) + def test_compute_minimal_precision_of_bin_edges(self, bin_edges, + starting_precision, + expected): + + discretizer = KBinsDiscretizer(starting_precision=starting_precision) + + actual = discretizer._compute_minimal_precision_of_bin_edges(bin_edges) + + assert actual == expected + + @pytest.mark.parametrize("bin_edges, expected", + [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]), + ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]), + ([np.inf, 0.0, -np.inf], + [(np.inf, 0.0), (0.0, -np.inf)])]) + def test_compute_bins_from_edges(self, bin_edges, expected): + + discretizer = KBinsDiscretizer() + actual = discretizer._compute_bins_from_edges(bin_edges) + + assert actual == expected + + @pytest.mark.parametrize("change_endpoint_format, closed, bins, expected", + [(False, "right", [(0, 1), (1, 2), (2, 3)], + ["0 - 1", "1 - 2", "2 - 3"]), + (True, "right", [(0, 1), (1, 2), (2, 3)], + ["<= 1", "1 - 2", "> 2"]), + (True, "left", [(0, 1), (1, 2), (2, 3)], + ["< 1", "1 - 2", ">= 2"])], + ids=["standard format", "different endpoints", + "different endpoints left"]) + def test_create_bin_labels(self, change_endpoint_format, closed, + bins, expected): + + discretizer = KBinsDiscretizer( + closed=closed, + change_endpoint_format=change_endpoint_format + ) + + actual = discretizer._create_bin_labels(bins) + + assert actual == expected diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 08f5b63..a97a4e4 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -1,398 +1,398 @@ -from contextlib import contextmanager -from typing import Any -from unittest.mock import MagicMock -import pytest -import numpy as np -import pandas as pd -from pytest_mock import MockerFixture - -from cobra.preprocessing.preprocessor import PreProcessor - - -@contextmanager -def does_not_raise(): - yield - - -class TestPreProcessor: - @pytest.mark.parametrize( - "train_prop, selection_prop, validation_prop, " "expected_sizes", - [ - (0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}), - (0.7, 0.3, 0.0, {"train": 7, "selection": 3}), - # Error "The sum of train_prop, selection_prop and - # validation_prop must be 1.0." 
should not be - # raised: - (0.7, 0.2, 0.1, {"train": 7, "selection": 2, "validation": 1}), - ], - ) - def test_train_selection_validation_split( - self, - train_prop: float, - selection_prop: float, - validation_prop: float, - expected_sizes: dict, - ): - X = np.arange(100).reshape(10, 10) - data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)]) - data.loc[:, "target"] = np.array([0] * 7 + [1] * 3) - - actual = PreProcessor.train_selection_validation_split( - data, train_prop, selection_prop, validation_prop - ) - - # check for the output schema - assert list(actual.columns) == list(data.columns) - - # check that total size of input & output is the same! - assert len(actual.index) == len(data.index) - - # check for the output sizes per split - actual_sizes = actual.groupby("split").size().to_dict() - - assert actual_sizes == expected_sizes - - def test_train_selection_validation_split_error_wrong_prop(self): - - error_msg = ( - "The sum of train_prop, selection_prop and " "validation_prop must be 1.0." - ) - train_prop = 0.7 - selection_prop = 0.3 - - self._test_train_selection_validation_split_error( - train_prop, selection_prop, error_msg - ) - - def test_train_selection_validation_split_error_zero_selection_prop(self): - - error_msg = "selection_prop cannot be zero!" - train_prop = 0.9 - selection_prop = 0.0 - - self._test_train_selection_validation_split_error( - train_prop, selection_prop, error_msg - ) - - def _test_train_selection_validation_split_error( - self, train_prop: float, selection_prop: float, error_msg: str - ): - df = pd.DataFrame() - with pytest.raises(ValueError, match=error_msg): - ( - PreProcessor.train_selection_validation_split( - df, - train_prop=train_prop, - selection_prop=selection_prop, - validation_prop=0.1, - ) - ) - - @pytest.mark.parametrize( - "injection_location, expected", - [ - (None, True), - ("categorical_data_processor", False), - ("discretizer", False), - ("target_encoder", False), - ], - ) - def test_is_valid_pipeline(self, injection_location: str, expected: bool): - - # is_valid_pipeline only checks for relevant keys atm - pipeline_dict = { - "categorical_data_processor": { - "model_type": None, - "regroup": None, - "regroup_name": None, - "keep_missing": None, - "category_size_threshold": None, - "p_value_threshold": None, - "scale_contingency_table": None, - "forced_categories": None, - }, - "discretizer": { - "n_bins": None, - "strategy": None, - "closed": None, - "auto_adapt_bins": None, - "starting_precision": None, - "label_format": None, - "change_endpoint_format": None, - }, - "target_encoder": { - "weight": None, - "imputation_strategy": None, - }, - } - - if injection_location: - pipeline_dict[injection_location]["wrong_key"] = None - - actual = PreProcessor._is_valid_pipeline(pipeline_dict) - - assert actual == expected - - @pytest.mark.parametrize( - ("continuous_vars, discrete_vars, expectation, " "expected"), - [ - ([], [], pytest.raises(ValueError), None), - ( - ["c1", "c2"], - ["d1", "d2"], - does_not_raise(), - ["d1_processed", "d2_processed", "c1_bin", "c2_bin"], - ), - (["c1", "c2"], [], does_not_raise(), ["c1_bin", "c2_bin"]), - ([], ["d1", "d2"], does_not_raise(), ["d1_processed", "d2_processed"]), - ], - ) - def test_get_variable_list( - self, - continuous_vars: list, - discrete_vars: list, - expectation: Any, - expected: list, - ): - - with expectation: - actual = PreProcessor._get_variable_list(continuous_vars, discrete_vars) - - assert actual == expected - - @pytest.mark.parametrize( - ("input, expected"), - [ - 
# example 1 - ( - pd.DataFrame({ - "ID": list(range(20)), - "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], - "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, - "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], - "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 - } - ), - pd.DataFrame({ - 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], - 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], - 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], - 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], - 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], - 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] - } - ), - ) - ] - ) - def test_fit_transform_without_id_col_name(self, input, expected): - - preprocessor = PreProcessor.from_params(model_type="classification") - - continuous_vars, discrete_vars = preprocessor.get_continuous_and_discrete_columns(input, "ID","Target") - - calculated = preprocessor.fit_transform( - input, - continuous_vars=continuous_vars, - discrete_vars=discrete_vars, - target_column_name="Target" - ) - pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) - - @pytest.mark.parametrize( - ("input, expected"), - [ - # example 1 - ( - pd.DataFrame({ - "ID": list(range(20)), - "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], - "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, - "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], - "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 - } - ), - pd.DataFrame({ - 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], - 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], - 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], - 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], - 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], - 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], - 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], - 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], - 
'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] - } - ), - ) - ] - ) - def test_fit_transform_with_id_col_name(self, input, expected): - - preprocessor = PreProcessor.from_params(model_type="classification") - - # continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target") - - calculated = preprocessor.fit_transform( - input, - continuous_vars=None, - discrete_vars=None, - target_column_name="Target", - id_col_name="ID" - ) - pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) - - @staticmethod - def mock_transform(df: pd.DataFrame, args): - """Mock the transform method.""" - df["new_column"] = "Hello World" - return df - - def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): - """Test if the train_data input is not changed when performing fit_transform.""" - train_data = pd.DataFrame( - [[1, "2", 3], [10, "20", 30], [100, "200", 300]], - columns=["foo", "bar", "baz"], - ) - preprocessor = PreProcessor.from_params( - model_type="classification", n_bins=10, weight=0.8 - ) - preprocessor._categorical_data_processor = MagicMock() - preprocessor._categorical_data_processor.transform = self.mock_transform - preprocessor._discretizer = MagicMock() - preprocessor._discretizer.transform = self.mock_transform - preprocessor._target_encoder = MagicMock() - preprocessor._target_encoder.transform = self.mock_transform - - result = preprocessor.fit_transform( - train_data, - continuous_vars=["foo"], - discrete_vars=["bar"], - target_column_name=["baz"], - ) - assert "new_column" not in train_data.columns - assert "new_column" in result.columns - - @pytest.mark.parametrize( - ("input, expected"), - [ - # example 1 - ( - pd.DataFrame( - { - "a": [1, 8, np.nan], - "b": [np.nan, 8, np.nan], - "c": [np.nan, np.nan, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - "f": [np.nan, np.nan, np.nan], - } - ), - pd.DataFrame( - { - "a": [1.0, 8.0, np.nan], - "b": [np.nan, 8.0, np.nan], - "d": [np.nan, np.nan, 5.0], - "e": [1.0, 960.0, np.nan], - } - ), - ), - # example 2 - ( - pd.DataFrame( - { - "a": [1, 8, np.nan], - "b": [np.nan, 8, np.nan], - "c": [np.nan, np.nan, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - } - ), - pd.DataFrame( - { - "a": [1.0, 8.0, np.nan], - "b": [np.nan, 8.0, np.nan], - "d": [np.nan, np.nan, 5.0], - "e": [1.0, 960.0, np.nan], - } - ), - ), - # example 3 - ( - pd.DataFrame( - { - "a": [1, 8, np.nan], - "b": [np.nan, 8, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - } - ), - pd.DataFrame( - { - "a": [1.0, 8.0, np.nan], - "b": [np.nan, 8.0, np.nan], - "d": [np.nan, np.nan, 5.0], - "e": [1.0, 960.0, np.nan], - } - ), - ), - # example 4 categorical - ( - pd.DataFrame( - { - "a": [1, 8, np.nan], - "b": [np.nan, np.nan, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - "category_1": ["A", "A", "B"], - "category_2": [np.nan, "A", "B"], - "category_3": [np.nan, np.nan, np.nan], - }, - ).astype( - { - "a": np.float64(), - "b": np.float64(), - "d": np.float64(), - "e": np.float64(), - "category_1": pd.CategoricalDtype(), - "category_2": pd.CategoricalDtype(), - "category_3": pd.CategoricalDtype(), - } - ), - pd.DataFrame( - { - "a": [1, 8, np.nan], - "d": [np.nan, np.nan, 5], - "e": [1, 960, np.nan], - "category_1": ["A", "A", "B"], - "category_2": [np.nan, "A", "B"], - } - ).astype( - { - "a": np.float64(), - "d": np.float64(), - "e": 
np.float64(), - "category_1": pd.CategoricalDtype(), - "category_2": pd.CategoricalDtype(), - } - ), - ), - ], - ) - def test_drops_columns_containing_only_nan(self, input, expected): - - print(input) - output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan( - input - ) - - print(output) - print(expected) - assert output.equals(expected) +from contextlib import contextmanager +from typing import Any +from unittest.mock import MagicMock +import pytest +import numpy as np +import pandas as pd +from pytest_mock import MockerFixture + +from cobra.preprocessing.preprocessor import PreProcessor + + +@contextmanager +def does_not_raise(): + yield + + +class TestPreProcessor: + @pytest.mark.parametrize( + "train_prop, selection_prop, validation_prop, " "expected_sizes", + [ + (0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}), + (0.7, 0.3, 0.0, {"train": 7, "selection": 3}), + # Error "The sum of train_prop, selection_prop and + # validation_prop must be 1.0." should not be + # raised: + (0.7, 0.2, 0.1, {"train": 7, "selection": 2, "validation": 1}), + ], + ) + def test_train_selection_validation_split( + self, + train_prop: float, + selection_prop: float, + validation_prop: float, + expected_sizes: dict, + ): + X = np.arange(100).reshape(10, 10) + data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)]) + data.loc[:, "target"] = np.array([0] * 7 + [1] * 3) + + actual = PreProcessor.train_selection_validation_split( + data, train_prop, selection_prop, validation_prop + ) + + # check for the output schema + assert list(actual.columns) == list(data.columns) + + # check that total size of input & output is the same! + assert len(actual.index) == len(data.index) + + # check for the output sizes per split + actual_sizes = actual.groupby("split").size().to_dict() + + assert actual_sizes == expected_sizes + + def test_train_selection_validation_split_error_wrong_prop(self): + + error_msg = ( + "The sum of train_prop, selection_prop and " "validation_prop must be 1.0." + ) + train_prop = 0.7 + selection_prop = 0.3 + + self._test_train_selection_validation_split_error( + train_prop, selection_prop, error_msg + ) + + def test_train_selection_validation_split_error_zero_selection_prop(self): + + error_msg = "selection_prop cannot be zero!" 
+ train_prop = 0.9 + selection_prop = 0.0 + + self._test_train_selection_validation_split_error( + train_prop, selection_prop, error_msg + ) + + def _test_train_selection_validation_split_error( + self, train_prop: float, selection_prop: float, error_msg: str + ): + df = pd.DataFrame() + with pytest.raises(ValueError, match=error_msg): + ( + PreProcessor.train_selection_validation_split( + df, + train_prop=train_prop, + selection_prop=selection_prop, + validation_prop=0.1, + ) + ) + + @pytest.mark.parametrize( + "injection_location, expected", + [ + (None, True), + ("categorical_data_processor", False), + ("discretizer", False), + ("target_encoder", False), + ], + ) + def test_is_valid_pipeline(self, injection_location: str, expected: bool): + + # is_valid_pipeline only checks for relevant keys atm + pipeline_dict = { + "categorical_data_processor": { + "model_type": None, + "regroup": None, + "regroup_name": None, + "keep_missing": None, + "category_size_threshold": None, + "p_value_threshold": None, + "scale_contingency_table": None, + "forced_categories": None, + }, + "discretizer": { + "n_bins": None, + "strategy": None, + "closed": None, + "auto_adapt_bins": None, + "starting_precision": None, + "label_format": None, + "change_endpoint_format": None, + }, + "target_encoder": { + "weight": None, + "imputation_strategy": None, + }, + } + + if injection_location: + pipeline_dict[injection_location]["wrong_key"] = None + + actual = PreProcessor._is_valid_pipeline(pipeline_dict) + + assert actual == expected + + @pytest.mark.parametrize( + ("continuous_vars, discrete_vars, expectation, " "expected"), + [ + ([], [], pytest.raises(ValueError), None), + ( + ["c1", "c2"], + ["d1", "d2"], + does_not_raise(), + ["d1_processed", "d2_processed", "c1_bin", "c2_bin"], + ), + (["c1", "c2"], [], does_not_raise(), ["c1_bin", "c2_bin"]), + ([], ["d1", "d2"], does_not_raise(), ["d1_processed", "d2_processed"]), + ], + ) + def test_get_variable_list( + self, + continuous_vars: list, + discrete_vars: list, + expectation: Any, + expected: list, + ): + + with expectation: + actual = PreProcessor._get_variable_list(continuous_vars, discrete_vars) + + assert actual == expected + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame({ + "ID": list(range(20)), + "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], + "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, + "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], + "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 + } + ), + pd.DataFrame({ + 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], + 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], + 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], + 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B_enc': 
[0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], + 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], + 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] + } + ), + ) + ] + ) + def test_fit_transform_without_id_col_name(self, input, expected): + + preprocessor = PreProcessor.from_params(model_type="classification") + + continuous_vars, discrete_vars = preprocessor.get_continuous_and_discrete_columns(input, "ID","Target") + + calculated = preprocessor.fit_transform( + input, + continuous_vars=continuous_vars, + discrete_vars=discrete_vars, + target_column_name="Target" + ) + pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame({ + "ID": list(range(20)), + "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], + "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, + "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], + "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 + } + ), + pd.DataFrame({ + 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], + 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], + 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], + 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], + 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], + 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] + } + ), + ) + ] + ) + def test_fit_transform_with_id_col_name(self, input, expected): + + preprocessor = PreProcessor.from_params(model_type="classification") + + # continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target") + + calculated = preprocessor.fit_transform( + input, + continuous_vars=None, + discrete_vars=None, + target_column_name="Target", + id_col_name="ID" + ) + pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) + + @staticmethod + def mock_transform(df: pd.DataFrame, args): + """Mock the transform method.""" + df["new_column"] = "Hello World" + return df + + def test_mutable_train_data_fit_transform(self, mocker: MockerFixture): + """Test if the train_data input is not changed when performing fit_transform.""" + train_data = pd.DataFrame( + [[1, "2", 3], [10, "20", 30], [100, "200", 300]], + columns=["foo", "bar", "baz"], + ) + preprocessor = PreProcessor.from_params( + model_type="classification", n_bins=10, weight=0.8 + ) + preprocessor._categorical_data_processor = MagicMock() + 
preprocessor._categorical_data_processor.transform = self.mock_transform + preprocessor._discretizer = MagicMock() + preprocessor._discretizer.transform = self.mock_transform + preprocessor._target_encoder = MagicMock() + preprocessor._target_encoder.transform = self.mock_transform + + result = preprocessor.fit_transform( + train_data, + continuous_vars=["foo"], + discrete_vars=["bar"], + target_column_name=["baz"], + ) + assert "new_column" not in train_data.columns + assert "new_column" in result.columns + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "c": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "f": [np.nan, np.nan, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 2 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "c": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 3 + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, 8, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + } + ), + pd.DataFrame( + { + "a": [1.0, 8.0, np.nan], + "b": [np.nan, 8.0, np.nan], + "d": [np.nan, np.nan, 5.0], + "e": [1.0, 960.0, np.nan], + } + ), + ), + # example 4 categorical + ( + pd.DataFrame( + { + "a": [1, 8, np.nan], + "b": [np.nan, np.nan, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "category_1": ["A", "A", "B"], + "category_2": [np.nan, "A", "B"], + "category_3": [np.nan, np.nan, np.nan], + }, + ).astype( + { + "a": np.float64(), + "b": np.float64(), + "d": np.float64(), + "e": np.float64(), + "category_1": pd.CategoricalDtype(), + "category_2": pd.CategoricalDtype(), + "category_3": pd.CategoricalDtype(), + } + ), + pd.DataFrame( + { + "a": [1, 8, np.nan], + "d": [np.nan, np.nan, 5], + "e": [1, 960, np.nan], + "category_1": ["A", "A", "B"], + "category_2": [np.nan, "A", "B"], + } + ).astype( + { + "a": np.float64(), + "d": np.float64(), + "e": np.float64(), + "category_1": pd.CategoricalDtype(), + "category_2": pd.CategoricalDtype(), + } + ), + ), + ], + ) + def test_drops_columns_containing_only_nan(self, input, expected): + + print(input) + output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan( + input + ) + + print(output) + print(expected) + assert output.equals(expected) diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py index 51ebd79..e03992c 100644 --- a/tests/preprocessing/test_target_encoder.py +++ b/tests/preprocessing/test_target_encoder.py @@ -1,342 +1,342 @@ - -import pytest -import pandas as pd -from sklearn.exceptions import NotFittedError - -from cobra.preprocessing.target_encoder import TargetEncoder - -class TestTargetEncoder: - - def test_target_encoder_constructor_weight_value_error(self): - with pytest.raises(ValueError): - TargetEncoder(weight=-1) - - def test_target_encoder_constructor_imputation_value_error(self): - with pytest.raises(ValueError): - TargetEncoder(imputation_strategy="median") - - # Tests for attributes_attributes_to_dict and set_attributes_from_dict - def test_target_encoder_attributes_to_dict(self): - encoder = TargetEncoder() - - mapping_data = 
pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) - mapping_data.index.name = "variable" - - encoder._mapping["variable"] = mapping_data - - encoder._global_mean = 0.5 - - actual = encoder.attributes_to_dict() - - expected = {"weight": 0.0, - "imputation_strategy": "mean", - "_global_mean": 0.5, - "_mapping": {"variable": { - "negative": 0.333333, - "neutral": 0.50000, - "positive": 0.666667 - }}} - - assert actual == expected - - @pytest.mark.parametrize("attribute", - ["weight", "mapping"], - ids=["test_weight", "test_mapping"]) - def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): - encoder = TargetEncoder() - - data = {"weight": 1.0} - encoder.set_attributes_from_dict(data) - - if attribute == "weight": - actual = encoder.weight - expected = 1.0 - - assert expected == actual - elif attribute == "mapping": - actual = encoder._mapping - expected = {} - - assert expected == actual - - def test_target_encoder_set_attributes_from_dict(self): - encoder = TargetEncoder() - - data = {"weight": 0.0, - "_global_mean": 0.5, - "_mapping": {"variable": { - "negative": 0.333333, - "neutral": 0.50000, - "positive": 0.666667 - }}} - - encoder.set_attributes_from_dict(data) - - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - actual = encoder._mapping["variable"] - - pd.testing.assert_series_equal(actual, expected) - - # Tests for _fit_column: - def test_target_encoder_fit_column_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - encoder = TargetEncoder() - encoder._global_mean = 0.5 - actual = encoder._fit_column(X=df.variable, y=df.target) - - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - pd.testing.assert_series_equal(actual, expected) - - def test_target_encoder_fit_column_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - encoder = TargetEncoder() - encoder._global_mean = 0.454545 - actual = encoder._fit_column(X=df.variable, y=df.target) - - expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - pd.testing.assert_series_equal(actual, expected) - - def test_target_encoder_fit_column_global_mean_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - encoder = TargetEncoder(weight=1) - encoder._global_mean = df.target.sum() / df.target.count() # is 0.5 - - actual = encoder._fit_column(X=df.variable, y=df.target) - - expected = pd.Series(data=[0.375, 0.500, 0.625], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - pd.testing.assert_series_equal(actual, expected) - - def test_target_encoder_fit_column_global_mean_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 
'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - encoder = TargetEncoder(weight=1) - encoder._global_mean = 0.454545 - - actual = encoder._fit_column(X=df.variable, y=df.target) - - # expected new value: - # [count of the value * its mean encoding + weight (= 1) * global mean] - # / [count of the value + weight (=1)]. - expected = pd.Series(data=[(3 * -4.666667 + 1 * 0.454545) / (3 + 1), - (4 * 0.250000 + 1 * 0.454545) / (4 + 1), - (4 * 4.500000 + 1 * 0.454545) / (4 + 1)], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - - pd.testing.assert_series_equal(actual, expected) - - # Tests for fit method - def test_target_encoder_fit_binary_classification(self): - # test_target_encoder_fit_column_linear_regression() tested on one - # column input as a numpy series; this test runs on a dataframe input. - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - encoder = TargetEncoder() - encoder.fit(data=df, column_names=["variable"], target_column="target") - - expected = pd.Series(data=[0.333333, 0.50000, 0.666667], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - actual = encoder._mapping["variable"] - - pd.testing.assert_series_equal(actual, expected) - - def test_target_encoder_fit_linear_regression(self): - # test_target_encoder_fit_column_linear_regression() tested on one - # column input as a numpy series; this test runs on a dataframe input. - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - encoder = TargetEncoder() - encoder.fit(data=df, column_names=["variable"], target_column="target") - - expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], - index=["negative", "neutral", "positive"]) - expected.index.name = "variable" - actual = encoder._mapping["variable"] - - pd.testing.assert_series_equal(actual, expected) - - # Tests for transform method - def test_target_encoder_transform_when_not_fitted(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - - encoder = TargetEncoder() - with pytest.raises(NotFittedError): - encoder.transform(data=df, column_names=["variable"]) - - def test_target_encoder_transform_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - - expected = df.copy() - expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, - 0.333333, 0.666667, 0.333333, 0.50000, - 0.50000, 0.50000] - - encoder = TargetEncoder() - encoder.fit(data=df, column_names=["variable"], target_column="target") - actual = encoder.transform(data=df, column_names=["variable"]) - - pd.testing.assert_frame_equal(actual, expected) - - def test_target_encoder_transform_linear_regression(self): - df = pd.DataFrame({'variable': 
['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - - expected = df.copy() - expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, - -4.666667, 4.500000, -4.666667, 0.250000, - 0.250000, 0.250000, 4.500000] - - encoder = TargetEncoder() - encoder.fit(data=df, column_names=["variable"], target_column="target") - actual = encoder.transform(data=df, column_names=["variable"]) - - pd.testing.assert_frame_equal(actual, expected) - - def test_target_encoder_transform_new_category_binary_classification(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - - df_appended = df.append({"variable": "new", "target": 1}, - ignore_index=True) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - df_appended["variable"] = df_appended["variable"].astype("category") - - expected = df_appended.copy() - expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, - 0.333333, 0.666667, 0.333333, 0.50000, - 0.50000, 0.50000, 0.333333] - - encoder = TargetEncoder(imputation_strategy="min") - encoder.fit(data=df, column_names=["variable"], target_column="target") - actual = encoder.transform(data=df_appended, column_names=["variable"]) - - pd.testing.assert_frame_equal(actual, expected) - - def test_target_encoder_transform_new_category_linear_regression(self): - df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', - 'neutral', 'negative', 'positive', - 'negative', 'neutral', 'neutral', - 'neutral', 'positive'], - 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) - - df_appended = df.append({"variable": "new", "target": 10}, - ignore_index=True) - - # inputs of TargetEncoder will be of dtype category - df["variable"] = df["variable"].astype("category") - df_appended["variable"] = df_appended["variable"].astype("category") - - expected = df_appended.copy() - expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, - -4.666667, 4.500000, -4.666667, 0.250000, - 0.250000, 0.250000, 4.500000, - -4.666667] # min imputation for new value - - encoder = TargetEncoder(imputation_strategy="min") - encoder.fit(data=df, column_names=["variable"], target_column="target") - actual = encoder.transform(data=df_appended, column_names=["variable"]) - - pd.testing.assert_frame_equal(actual, expected) - - # Tests for _clean_column_name: - def test_target_encoder_clean_column_name_binned_column(self): - column_name = "test_column_bin" - expected = "test_column_enc" - - encoder = TargetEncoder() - actual = encoder._clean_column_name(column_name) - - assert actual == expected - - def test_target_encoder_clean_column_name_processed_column(self): - column_name = "test_column_processed" - expected = "test_column_enc" - - encoder = TargetEncoder() - actual = encoder._clean_column_name(column_name) - - assert actual == expected - - def test_target_encoder_clean_column_name_cleaned_column(self): - column_name = "test_column_cleaned" - expected = "test_column_enc" - - encoder = TargetEncoder() - actual = encoder._clean_column_name(column_name) - - assert actual == expected - - def test_target_encoder_clean_column_other_name(self): - column_name = 
"test_column" - expected = "test_column_enc" - - encoder = TargetEncoder() - actual = encoder._clean_column_name(column_name) - - assert actual == expected + +import pytest +import pandas as pd +from sklearn.exceptions import NotFittedError + +from cobra.preprocessing.target_encoder import TargetEncoder + +class TestTargetEncoder: + + def test_target_encoder_constructor_weight_value_error(self): + with pytest.raises(ValueError): + TargetEncoder(weight=-1) + + def test_target_encoder_constructor_imputation_value_error(self): + with pytest.raises(ValueError): + TargetEncoder(imputation_strategy="median") + + # Tests for attributes_attributes_to_dict and set_attributes_from_dict + def test_target_encoder_attributes_to_dict(self): + encoder = TargetEncoder() + + mapping_data = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + mapping_data.index.name = "variable" + + encoder._mapping["variable"] = mapping_data + + encoder._global_mean = 0.5 + + actual = encoder.attributes_to_dict() + + expected = {"weight": 0.0, + "imputation_strategy": "mean", + "_global_mean": 0.5, + "_mapping": {"variable": { + "negative": 0.333333, + "neutral": 0.50000, + "positive": 0.666667 + }}} + + assert actual == expected + + @pytest.mark.parametrize("attribute", + ["weight", "mapping"], + ids=["test_weight", "test_mapping"]) + def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute): + encoder = TargetEncoder() + + data = {"weight": 1.0} + encoder.set_attributes_from_dict(data) + + if attribute == "weight": + actual = encoder.weight + expected = 1.0 + + assert expected == actual + elif attribute == "mapping": + actual = encoder._mapping + expected = {} + + assert expected == actual + + def test_target_encoder_set_attributes_from_dict(self): + encoder = TargetEncoder() + + data = {"weight": 0.0, + "_global_mean": 0.5, + "_mapping": {"variable": { + "negative": 0.333333, + "neutral": 0.50000, + "positive": 0.666667 + }}} + + encoder.set_attributes_from_dict(data) + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected) + + # Tests for _fit_column: + def test_target_encoder_fit_column_binary_classification(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + encoder = TargetEncoder() + encoder._global_mean = 0.5 + actual = encoder._fit_column(X=df.variable, y=df.target) + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + + def test_target_encoder_fit_column_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder() + encoder._global_mean = 0.454545 + actual = encoder._fit_column(X=df.variable, y=df.target) + + expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + + def 
test_target_encoder_fit_column_global_mean_binary_classification(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + encoder = TargetEncoder(weight=1) + encoder._global_mean = df.target.sum() / df.target.count() # is 0.5 + + actual = encoder._fit_column(X=df.variable, y=df.target) + + expected = pd.Series(data=[0.375, 0.500, 0.625], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + + def test_target_encoder_fit_column_global_mean_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder(weight=1) + encoder._global_mean = 0.454545 + + actual = encoder._fit_column(X=df.variable, y=df.target) + + # expected new value: + # [count of the value * its mean encoding + weight (= 1) * global mean] + # / [count of the value + weight (=1)]. + expected = pd.Series(data=[(3 * -4.666667 + 1 * 0.454545) / (3 + 1), + (4 * 0.250000 + 1 * 0.454545) / (4 + 1), + (4 * 4.500000 + 1 * 0.454545) / (4 + 1)], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + + pd.testing.assert_series_equal(actual, expected) + + # Tests for fit method + def test_target_encoder_fit_binary_classification(self): + # test_target_encoder_fit_column_linear_regression() tested on one + # column input as a numpy series; this test runs on a dataframe input. + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + + expected = pd.Series(data=[0.333333, 0.50000, 0.666667], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected) + + def test_target_encoder_fit_linear_regression(self): + # test_target_encoder_fit_column_linear_regression() tested on one + # column input as a numpy series; this test runs on a dataframe input. 
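+        # With the default weight of 0 there is no smoothing, so each expected
+        # value is the plain per-category target mean, e.g. for "negative":
+        # (-5 - 4 - 5) / 3 = -4.666667.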
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + + expected = pd.Series(data=[-4.666667, 0.250000, 4.500000], + index=["negative", "neutral", "positive"]) + expected.index.name = "variable" + actual = encoder._mapping["variable"] + + pd.testing.assert_series_equal(actual, expected) + + # Tests for transform method + def test_target_encoder_transform_when_not_fitted(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + + encoder = TargetEncoder() + with pytest.raises(NotFittedError): + encoder.transform(data=df, column_names=["variable"]) + + def test_target_encoder_transform_binary_classification(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + + expected = df.copy() + expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, + 0.333333, 0.666667, 0.333333, 0.50000, + 0.50000, 0.50000] + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected) + + def test_target_encoder_transform_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + + expected = df.copy() + expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, + -4.666667, 4.500000, -4.666667, 0.250000, + 0.250000, 0.250000, 4.500000] + + encoder = TargetEncoder() + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected) + + def test_target_encoder_transform_new_category_binary_classification(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + + df_appended = df.append({"variable": "new", "target": 1}, + ignore_index=True) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + df_appended["variable"] = df_appended["variable"].astype("category") + + expected = df_appended.copy() + expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000, + 0.333333, 0.666667, 0.333333, 0.50000, + 0.50000, 0.50000, 0.333333] + + encoder = TargetEncoder(imputation_strategy="min") + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df_appended, column_names=["variable"]) + + 
pd.testing.assert_frame_equal(actual, expected) + + def test_target_encoder_transform_new_category_linear_regression(self): + df = pd.DataFrame({'variable': ['positive', 'positive', 'negative', + 'neutral', 'negative', 'positive', + 'negative', 'neutral', 'neutral', + 'neutral', 'positive'], + 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]}) + + df_appended = df.append({"variable": "new", "target": 10}, + ignore_index=True) + + # inputs of TargetEncoder will be of dtype category + df["variable"] = df["variable"].astype("category") + df_appended["variable"] = df_appended["variable"].astype("category") + + expected = df_appended.copy() + expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000, + -4.666667, 4.500000, -4.666667, 0.250000, + 0.250000, 0.250000, 4.500000, + -4.666667] # min imputation for new value + + encoder = TargetEncoder(imputation_strategy="min") + encoder.fit(data=df, column_names=["variable"], target_column="target") + actual = encoder.transform(data=df_appended, column_names=["variable"]) + + pd.testing.assert_frame_equal(actual, expected) + + # Tests for _clean_column_name: + def test_target_encoder_clean_column_name_binned_column(self): + column_name = "test_column_bin" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_name_processed_column(self): + column_name = "test_column_processed" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_name_cleaned_column(self): + column_name = "test_column_cleaned" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected + + def test_target_encoder_clean_column_other_name(self): + column_name = "test_column" + expected = "test_column_enc" + + encoder = TargetEncoder() + actual = encoder._clean_column_name(column_name) + + assert actual == expected From e1248549e70505f37ccce4ddb44eb83f1a5ffee9 Mon Sep 17 00:00:00 2001 From: joostneuj <91886694+joostneuj@users.noreply.github.com> Date: Fri, 16 Jun 2023 18:15:38 +0200 Subject: [PATCH 2/4] #143 deleted notebook --- notebooks/debugging.ipynb | 1364 ------------------------------------- 1 file changed, 1364 deletions(-) delete mode 100644 notebooks/debugging.ipynb diff --git a/notebooks/debugging.ipynb b/notebooks/debugging.ipynb deleted file mode 100644 index 5dd573e..0000000 --- a/notebooks/debugging.ipynb +++ /dev/null @@ -1,1364 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 464, - "id": "23482fd8-b4c1-48f5-8c30-a0e79f7667b3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "%reload_ext autoreload" - ] - }, - { - "cell_type": "code", - "execution_count": 465, - "id": "da551dc3-ffba-45e0-b87d-7b626a622b08", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.insert(0, r\"C:/projects/cobra\")" - ] - }, - { - "cell_type": "code", - "execution_count": 488, - "id": "7d2678fa-eb47-4cb5-ad1d-c5034a742f55", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import random\n", - "from cobra.preprocessing import PreProcessor\n", - "\n", - "# custom imports\n", - "from cobra.preprocessing import CategoricalDataProcessor\n", - "from cobra.preprocessing import KBinsDiscretizer\n", - "from cobra.preprocessing import TargetEncoder\n", - "import json\n" - ] - }, - { - "cell_type": "markdown", - "id": "d4d341ec-b5c3-4b00-a54f-c5b6565d2631", - "metadata": {}, - "source": [ - "### 1. Generate data" - ] - }, - { - "cell_type": "code", - "execution_count": 467, - "id": "a9563643-308b-4c6c-b358-9cbf93a0666d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "size = 5000\n", - "\n", - "# Create datetime column\n", - "dates = pd.date_range('2022-01-01', periods=size, freq='D')\n", - "\n", - "# Create categorical variables\n", - "category_values = ['Category A', 'Category B', 'Category C']\n", - "cat_var1 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", - "cat_var2 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", - "cat_var3 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n", - "\n", - "# Create continuous variables with different scales and distributions\n", - "cont_var1 = pd.Series(np.random.normal(loc=0, scale=1, size=size), name='cont_var1')\n", - "cont_var2 = pd.Series(np.random.uniform(low=0, high=10, size=size), name='cont_var2')\n", - "cont_var3 = pd.Series(np.random.exponential(scale=1, size=size), name='cont_var3')\n", - "\n", - "# Create target variable\n", - "target = pd.Series(np.random.randint(2, size=size))\n", - "\n", - "# Combine into a DataFrame\n", - "df = pd.DataFrame({'DateTime': dates, 'CategoryVar1': cat_var1,\n", - " 'CategoryVar2': cat_var2, 'CategoryVar3': cat_var3,\n", - " 'cont_var1': cont_var1, 'cont_var2': cont_var2, 'cont_var3': cont_var3,\n", - " 'target': target})" - ] - }, - { - "cell_type": "code", - "execution_count": 468, - "id": "bde9235f-dc62-433d-b3d3-6bf37b2ddb52", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "DateTime datetime64[ns]\n", - "CategoryVar1 category\n", - "CategoryVar2 category\n", - "CategoryVar3 category\n", - "cont_var1 float64\n", - "cont_var2 float64\n", - "cont_var3 float64\n", - "target int32\n", - "dtype: object" - ] - }, - "execution_count": 468, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 469, - "id": "d774e959-73f4-40b4-bc20-43c3af99e593", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3target
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181
\n", - "
" - ], - "text/plain": [ - " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", - "\n", - " cont_var3 target \n", - "0 1.372659 0 \n", - "1 0.635924 1 \n", - "2 0.098091 1 \n", - "3 0.179868 0 \n", - "4 0.966818 1 " - ] - }, - "execution_count": 469, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 470, - "id": "e9c06e3a-188f-4cdc-b9cd-51d3db63e5ff", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['DateTime', 'CategoryVar1', 'CategoryVar2', 'CategoryVar3', 'cont_var1',\n", - " 'cont_var2', 'cont_var3', 'target'],\n", - " dtype='object')" - ] - }, - "execution_count": 470, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.columns" - ] - }, - { - "cell_type": "markdown", - "id": "9aae8c98-434b-4c71-abb1-29fa6d143895", - "metadata": {}, - "source": [ - "### 2. Fit preprocessor" - ] - }, - { - "cell_type": "code", - "execution_count": 521, - "id": "a32560d4-b5fe-4b90-9ea6-ede7915bba05", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "continuous_vars = ['cont_var2', 'cont_var3', 'cont_var1']\n", - "discrete_vars= ['CategoryVar1', 'CategoryVar2', 'CategoryVar3'] #, 'DateTime'] [] \n", - "target_col = \"target\"" - ] - }, - { - "cell_type": "code", - "execution_count": 522, - "id": "d6f1e21a-4a6e-4ad7-9faf-b36e6daff707", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. 
Increase the weight if needed.\n" - ] - } - ], - "source": [ - "model_type = \"classification\"\n", - "\n", - "# using all Cobra's default parameters for preprocessing here\n", - "preprocessor = PreProcessor.from_params(\n", - " model_type=model_type\n", - ")\n", - "\n", - "random.seed(1212)\n", - "basetable = preprocessor.train_selection_validation_split(data=df,\n", - " train_prop=0.6,\n", - " selection_prop=0.25,\n", - " validation_prop=0.15)" - ] - }, - { - "cell_type": "code", - "execution_count": 523, - "id": "7b673619-4eda-4aca-acd5-a125f80d3b20", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Starting to fit pipeline\n", - "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 507.38it/s]\n", - "Fitting KBinsDiscretizer took 0.006914615631103516 seconds\n", - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 240.62it/s]\n", - "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 29.42it/s]\n", - "Fitting categorical_data_processor class took 0.10196375846862793 seconds\n", - "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 558.52it/s]\n", - "Fitting TargetEncoder took 0.013732433319091797 seconds\n", - "Fitting pipeline took 0.17300176620483398 seconds\n" - ] - } - ], - "source": [ - "preprocessor.fit(basetable[basetable[\"split\"]==\"train\"],\n", - " continuous_vars=continuous_vars,\n", - " discrete_vars = discrete_vars,\n", - " target_column_name=target_col)" - ] - }, - { - "cell_type": "code", - "execution_count": 524, - "id": "c9e2c79d-c0bc-464d-b869-f8115ac67776", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 160.70it/s]\n", - "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 697.13it/s]\n", - "Transforming data took 0.0610198974609375 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590selection4.0 - 5.0...-1.3 - -0.8Category CCategory BCategory A0.5042740.4958850.5148720.4673910.4868910.523364
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241train9.0 - 10.0...0.2 - 0.5Category CCategory CCategory B0.5042740.4879520.4910000.4740480.5243550.492997
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911train7.0 - 8.0...-0.5 - -0.2Category BCategory BCategory C0.4733670.4958850.4653660.4902600.4942970.433225
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680selection0.0 - 1.0...-1.3 - -0.8Category CCategory BCategory C0.5042740.4958850.4653660.4754100.5040650.523364
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181train2.0 - 3.0...-4.0 - -1.3Category ACategory CCategory B0.4915970.4879520.4910000.4556960.4714640.562290
\n", - "

5 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", - "\n", - " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", - "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", - "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", - "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", - "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", - "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", - "\n", - " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 Category C Category B Category A \n", - "1 Category C Category C Category B \n", - "2 Category B Category B Category C \n", - "3 Category C Category B Category C \n", - "4 Category A Category C Category B \n", - "\n", - " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 0.504274 0.495885 0.514872 0.467391 \n", - "1 0.504274 0.487952 0.491000 0.474048 \n", - "2 0.473367 0.495885 0.465366 0.490260 \n", - "3 0.504274 0.495885 0.465366 0.475410 \n", - "4 0.491597 0.487952 0.491000 0.455696 \n", - "\n", - " cont_var3_enc cont_var1_enc \n", - "0 0.486891 0.523364 \n", - "1 0.524355 0.492997 \n", - "2 0.494297 0.433225 \n", - "3 0.504065 0.523364 \n", - "4 0.471464 0.562290 \n", - "\n", - "[5 rows x 21 columns]" - ] - }, - "execution_count": 524, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "basetable_transformed_orig = preprocessor.transform(basetable,\n", - " continuous_vars=continuous_vars,\n", - " discrete_vars=discrete_vars)\n", - "basetable_transformed_orig.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 525, - "id": "d70f40cc-7814-48a8-91f6-2b7297f97ccc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#preprocessor._discretizer #._bins_by_column\n", - "#preprocessor._target_encoder.attributes_to_dict()\n", - "#preprocessor._discretizer.attributes_to_dict()\n", - "#preprocessor._target_encoder.attributes_to_dict()" - ] - }, - { - "cell_type": "markdown", - "id": "baab4c1b-4200-4c96-b991-be8efc09abbb", - "metadata": {}, - "source": [ - "### 3. Serialize the preprocessor" - ] - }, - { - "cell_type": "code", - "execution_count": 526, - "id": "95b597b2-b475-4d59-b650-dcc208db1eb5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "pipeline_serialized = preprocessor.serialize_pipeline()\n", - "\n", - "with open(r\"./model_json.json\", \"w\") as file:\n", - " file.write(json.dumps(pipeline_serialized, indent=4))\n", - " \n", - "#pipeline_serialized" - ] - }, - { - "cell_type": "code", - "execution_count": 527, - "id": "c6dbd38c-ca5d-492d-815b-1af02d7de143", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Look into properties of preprocessors\n", - "#pipeline_serialized[\"target_encoder\"] #._bins_by_column" - ] - }, - { - "cell_type": "markdown", - "id": "fc339ac8-67a7-4574-811e-2b9bc4ce6a39", - "metadata": {}, - "source": [ - "### 4. 
De-serialize pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 528, - "id": "2a517ff8-d336-4bd3-abdc-2be784259564", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.\n" - ] - } - ], - "source": [ - "# Read serialized pipeline from json\n", - "with open(r\"./model_json.json\", \"r\") as file:\n", - " json_pipeline_serialized = json.load(file)\n", - "\n", - "# Create new preprocessor object from serialized pipeline\n", - "new_preprocessor = PreProcessor.from_pipeline(json_pipeline_serialized)\n", - "#new_preprocessor = PreProcessor.from_pipeline(pipeline_serialized)" - ] - }, - { - "cell_type": "code", - "execution_count": 529, - "id": "ad9442b5-7f7e-48fe-8199-528992d1f0d6", - "metadata": {}, - "outputs": [], - "source": [ - "# Look into properties of preprocessors if needed\n", - "#new_preprocessor._discretizer.attributes_to_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 530, - "id": "541986d2-8d5d-473c-8871-5e7d2da31c4a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 147.15it/s]\n", - "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 661.65it/s]\n", - "Transforming data took 0.06773138046264648 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
02022-01-01Category CCategory BCategory A-1.0016454.7337061.3726590selection4.0 - 5.0...-1.3 - -0.8Category CCategory BCategory A0.5042740.4958850.5148720.4673910.4868910.523364
12022-01-02Category CCategory CCategory B0.2806299.1911290.6359241train9.0 - 10.0...0.2 - 0.5Category CCategory CCategory B0.5042740.4879520.4910000.4740480.5243550.492997
22022-01-03Category BCategory BCategory C-0.3452197.7317920.0980911train7.0 - 8.0...-0.5 - -0.2Category BCategory BCategory C0.4733670.4958850.4653660.4902600.4942970.433225
32022-01-04Category CCategory BCategory C-1.1349120.2051320.1798680selection0.0 - 1.0...-1.3 - -0.8Category CCategory BCategory C0.5042740.4958850.4653660.4754100.5040650.523364
42022-01-05Category ACategory CCategory B-1.3396452.3785400.9668181train2.0 - 3.0...-4.0 - -1.3Category ACategory CCategory B0.4915970.4879520.4910000.4556960.4714640.562290
\n", - "

5 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", - "\n", - " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", - "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", - "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", - "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", - "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", - "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", - "\n", - " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 Category C Category B Category A \n", - "1 Category C Category C Category B \n", - "2 Category B Category B Category C \n", - "3 Category C Category B Category C \n", - "4 Category A Category C Category B \n", - "\n", - " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 0.504274 0.495885 0.514872 0.467391 \n", - "1 0.504274 0.487952 0.491000 0.474048 \n", - "2 0.473367 0.495885 0.465366 0.490260 \n", - "3 0.504274 0.495885 0.465366 0.475410 \n", - "4 0.491597 0.487952 0.491000 0.455696 \n", - "\n", - " cont_var3_enc cont_var1_enc \n", - "0 0.486891 0.523364 \n", - "1 0.524355 0.492997 \n", - "2 0.494297 0.433225 \n", - "3 0.504065 0.523364 \n", - "4 0.471464 0.562290 \n", - "\n", - "[5 rows x 21 columns]" - ] - }, - "execution_count": 530, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "basetable_transformed = new_preprocessor.transform(basetable,\n", - " continuous_vars=continuous_vars,\n", - " discrete_vars=discrete_vars)\n", - "basetable_transformed.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 531, - "id": "c270d856-452d-4507-a3c2-df3ae1991c36", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateTimeCategoryVar1CategoryVar2CategoryVar3cont_var1cont_var2cont_var3targetsplitcont_var2_bin...cont_var1_binCategoryVar1_processedCategoryVar2_processedCategoryVar3_processedCategoryVar1_encCategoryVar2_encCategoryVar3_enccont_var2_enccont_var3_enccont_var1_enc
0TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
1TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
2TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
3TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
..................................................................
4995TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4996TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4997TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4998TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
4999TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue...TrueTrueTrueTrueTrueTrueTrueTrueTrueTrue
\n", - "

5000 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 \\\n", - "0 True True True True True \n", - "1 True True True True True \n", - "2 True True True True True \n", - "3 True True True True True \n", - "4 True True True True True \n", - "... ... ... ... ... ... \n", - "4995 True True True True True \n", - "4996 True True True True True \n", - "4997 True True True True True \n", - "4998 True True True True True \n", - "4999 True True True True True \n", - "\n", - " cont_var2 cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", - "0 True True True True True ... True \n", - "1 True True True True True ... True \n", - "2 True True True True True ... True \n", - "3 True True True True True ... True \n", - "4 True True True True True ... True \n", - "... ... ... ... ... ... ... ... \n", - "4995 True True True True True ... True \n", - "4996 True True True True True ... True \n", - "4997 True True True True True ... True \n", - "4998 True True True True True ... True \n", - "4999 True True True True True ... True \n", - "\n", - " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 True True True \n", - "1 True True True \n", - "2 True True True \n", - "3 True True True \n", - "4 True True True \n", - "... ... ... ... \n", - "4995 True True True \n", - "4996 True True True \n", - "4997 True True True \n", - "4998 True True True \n", - "4999 True True True \n", - "\n", - " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 True True True True \n", - "1 True True True True \n", - "2 True True True True \n", - "3 True True True True \n", - "4 True True True True \n", - "... ... ... ... ... \n", - "4995 True True True True \n", - "4996 True True True True \n", - "4997 True True True True \n", - "4998 True True True True \n", - "4999 True True True True \n", - "\n", - " cont_var3_enc cont_var1_enc \n", - "0 True True \n", - "1 True True \n", - "2 True True \n", - "3 True True \n", - "4 True True \n", - "... ... ... 
\n", - "4995 True True \n", - "4996 True True \n", - "4997 True True \n", - "4998 True True \n", - "4999 True True \n", - "\n", - "[5000 rows x 21 columns]" - ] - }, - "execution_count": 531, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Double check transformed basetable is the same\n", - "basetable_transformed_orig == basetable_transformed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b478d7c-46d8-4ba9-bf84-375a7cf901a8", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "cobra_venv", - "language": "python", - "name": "cobra_venv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 82b378f8eff34d7b19b8c39e28d0806564ecad05 Mon Sep 17 00:00:00 2001 From: joostneuj <91886694+joostneuj@users.noreply.github.com> Date: Fri, 16 Jun 2023 18:15:58 +0200 Subject: [PATCH 3/4] #143 delete file --- notebooks/model_json.json | 216 -------------------------------------- 1 file changed, 216 deletions(-) delete mode 100644 notebooks/model_json.json diff --git a/notebooks/model_json.json b/notebooks/model_json.json deleted file mode 100644 index fd80281..0000000 --- a/notebooks/model_json.json +++ /dev/null @@ -1,216 +0,0 @@ -{ - "metadata": { - "timestamp": "16/06/2023 18:00:26" - }, - "categorical_data_processor": { - "category_size_threshold": 5, - "forced_categories": {}, - "keep_missing": true, - "model_type": "classification", - "p_value_threshold": 0.001, - "regroup": true, - "regroup_name": "Other", - "scale_contingency_table": true, - "_cleaned_categories_by_column": { - "CategoryVar1": [], - "CategoryVar2": [], - "CategoryVar3": [] - } - }, - "discretizer": { - "auto_adapt_bins": false, - "change_endpoint_format": false, - "closed": "right", - "label_format": "{} - {}", - "n_bins": 10, - "starting_precision": 0, - "strategy": "quantile", - "_bins_by_column": { - "cont_var2": [ - [ - 0.0, - 1.0 - ], - [ - 1.0, - 2.0 - ], - [ - 2.0, - 3.0 - ], - [ - 3.0, - 4.0 - ], - [ - 4.0, - 5.0 - ], - [ - 5.0, - 6.0 - ], - [ - 6.0, - 7.0 - ], - [ - 7.0, - 8.0 - ], - [ - 8.0, - 9.0 - ], - [ - 9.0, - 10.0 - ] - ], - "cont_var3": [ - [ - 0.0, - 0.1 - ], - [ - 0.1, - 0.2 - ], - [ - 0.2, - 0.4 - ], - [ - 0.4, - 0.5 - ], - [ - 0.5, - 0.7 - ], - [ - 0.7, - 0.9 - ], - [ - 0.9, - 1.3 - ], - [ - 1.3, - 1.7 - ], - [ - 1.7, - 2.4 - ], - [ - 2.4, - 7.6 - ] - ], - "cont_var1": [ - [ - -4.0, - -1.3 - ], - [ - -1.3, - -0.8 - ], - [ - -0.8, - -0.5 - ], - [ - -0.5, - -0.2 - ], - [ - -0.2, - 0.0 - ], - [ - 0.0, - 0.2 - ], - [ - 0.2, - 0.5 - ], - [ - 0.5, - 0.8 - ], - [ - 0.8, - 1.2 - ], - [ - 1.2, - 3.7 - ] - ] - } - }, - "target_encoder": { - "imputation_strategy": "mean", - "weight": 0.0, - "_mapping": { - "CategoryVar1_processed": { - "Category A": 0.49159663865546216, - "Category B": 0.4733668341708543, - "Category C": 0.5042735042735043 - }, - "CategoryVar2_processed": { - "Category A": 0.48643410852713176, - "Category B": 0.49588477366255146, - "Category C": 0.4879518072289157 - }, - "CategoryVar3_processed": { - "Category A": 0.5148717948717949, - "Category B": 0.491, - "Category C": 0.4653658536585366 - }, - "cont_var2_bin": { - "0.0 - 1.0": 0.47540983606557374, - "1.0 - 2.0": 0.46855345911949686, - "2.0 - 3.0": 0.45569620253164556, - "3.0 - 
4.0": 0.5133333333333333, - "4.0 - 5.0": 0.4673913043478261, - "5.0 - 6.0": 0.5307443365695793, - "6.0 - 7.0": 0.5232974910394266, - "7.0 - 8.0": 0.4902597402597403, - "8.0 - 9.0": 0.5033333333333333, - "9.0 - 10.0": 0.4740484429065744 - }, - "cont_var3_bin": { - "0.0 - 0.1": 0.49429657794676807, - "0.1 - 0.2": 0.5040650406504065, - "0.2 - 0.4": 0.4897025171624714, - "0.4 - 0.5": 0.5, - "0.5 - 0.7": 0.5243553008595988, - "0.7 - 0.9": 0.4703703703703704, - "0.9 - 1.3": 0.47146401985111663, - "1.3 - 1.7": 0.4868913857677903, - "1.7 - 2.4": 0.43416370106761565, - "2.4 - 7.6": 0.5258064516129032 - }, - "cont_var1_bin": { - "-4.0 - -1.3": 0.5622895622895623, - "-1.3 - -0.8": 0.5233644859813084, - "-0.8 - -0.5": 0.4358974358974359, - "-0.5 - -0.2": 0.43322475570032576, - "-0.2 - 0.0": 0.5219123505976095, - "0.0 - 0.2": 0.4763779527559055, - "0.2 - 0.5": 0.49299719887955185, - "0.5 - 0.8": 0.5054545454545455, - "0.8 - 1.2": 0.4539249146757679, - "1.2 - 3.7": 0.4984984984984985 - } - }, - "_global_mean": 0.49 - }, - "_is_fitted": true -} \ No newline at end of file From fcdc5f3e0b022e09f5849cb5f146687ba0ef8cf0 Mon Sep 17 00:00:00 2001 From: "joost.neujens" Date: Fri, 16 Jun 2023 18:41:46 +0200 Subject: [PATCH 4/4] #143 fix: serialization-deserialization bug --- .gitignore | 1 - cobra/preprocessing/preprocessor.py | 5 - notebooks/debugging.ipynb | 530 ++++++++++++++-------------- notebooks/model_json.json | 120 +++---- 4 files changed, 325 insertions(+), 331 deletions(-) diff --git a/.gitignore b/.gitignore index 14c9262..6aa9052 100644 --- a/.gitignore +++ b/.gitignore @@ -72,7 +72,6 @@ target/ # Jupyter Notebook .ipynb_checkpoints -#*notebooks/* # pyenv .python-version diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index 7f84716..fa7ddf1 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -367,10 +367,6 @@ def fit( log.info("Fitting pipeline took {} seconds".format(time.time() - start)) - def test_function(self): - return print('heleeeloooo') - - def transform( self, data: pd.DataFrame, continuous_vars: list, discrete_vars: list ) -> pd.DataFrame: @@ -425,7 +421,6 @@ def transform( return data - def fit_transform( self, train_data: pd.DataFrame, diff --git a/notebooks/debugging.ipynb b/notebooks/debugging.ipynb index 5dd573e..f420671 100644 --- a/notebooks/debugging.ipynb +++ b/notebooks/debugging.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 464, + "execution_count": 532, "id": "23482fd8-b4c1-48f5-8c30-a0e79f7667b3", "metadata": { "tags": [] @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 465, + "execution_count": 533, "id": "da551dc3-ffba-45e0-b87d-7b626a622b08", "metadata": { "tags": [] @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 488, + "execution_count": 534, "id": "7d2678fa-eb47-4cb5-ad1d-c5034a742f55", "metadata": { "tags": [] @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 467, + "execution_count": 535, "id": "a9563643-308b-4c6c-b358-9cbf93a0666d", "metadata": { "tags": [] @@ -102,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 468, + "execution_count": 536, "id": "bde9235f-dc62-433d-b3d3-6bf37b2ddb52", "metadata": { "tags": [] @@ -122,7 +122,7 @@ "dtype: object" ] }, - "execution_count": 468, + "execution_count": 536, "metadata": {}, "output_type": "execute_result" } @@ -133,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 469, + "execution_count": 537, "id": "d774e959-73f4-40b4-bc20-43c3af99e593", 
"metadata": { "tags": [] @@ -174,34 +174,34 @@ " \n", " 0\n", " 2022-01-01\n", - " Category C\n", " Category B\n", - " Category A\n", - " -1.001645\n", - " 4.733706\n", - " 1.372659\n", - " 0\n", + " Category B\n", + " Category C\n", + " -0.247175\n", + " 8.258259\n", + " 0.039901\n", + " 1\n", " \n", " \n", " 1\n", " 2022-01-02\n", - " Category C\n", - " Category C\n", " Category B\n", - " 0.280629\n", - " 9.191129\n", - " 0.635924\n", + " Category B\n", + " Category C\n", + " 0.247006\n", + " 1.234493\n", + " 1.336691\n", " 1\n", " \n", " \n", " 2\n", " 2022-01-03\n", - " Category B\n", - " Category B\n", " Category C\n", - " -0.345219\n", - " 7.731792\n", - " 0.098091\n", + " Category A\n", + " Category B\n", + " 0.076415\n", + " 5.059058\n", + " 1.323273\n", " 1\n", " \n", " \n", @@ -209,21 +209,21 @@ " 2022-01-04\n", " Category C\n", " Category B\n", - " Category C\n", - " -1.134912\n", - " 0.205132\n", - " 0.179868\n", + " Category A\n", + " -0.306355\n", + " 8.316857\n", + " 0.077718\n", " 0\n", " \n", " \n", " 4\n", " 2022-01-05\n", - " Category A\n", + " Category C\n", " Category C\n", " Category B\n", - " -1.339645\n", - " 2.378540\n", - " 0.966818\n", + " -1.133514\n", + " 8.773722\n", + " 0.356009\n", " 1\n", " \n", " \n", @@ -232,21 +232,21 @@ ], "text/plain": [ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n", + "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n", + "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n", + "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n", + "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n", "\n", " cont_var3 target \n", - "0 1.372659 0 \n", - "1 0.635924 1 \n", - "2 0.098091 1 \n", - "3 0.179868 0 \n", - "4 0.966818 1 " + "0 0.039901 1 \n", + "1 1.336691 1 \n", + "2 1.323273 1 \n", + "3 0.077718 0 \n", + "4 0.356009 1 " ] }, - "execution_count": 469, + "execution_count": 537, "metadata": {}, "output_type": "execute_result" } @@ -257,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 470, + "execution_count": 538, "id": "e9c06e3a-188f-4cdc-b9cd-51d3db63e5ff", "metadata": { "tags": [] @@ -271,7 +271,7 @@ " dtype='object')" ] }, - "execution_count": 470, + "execution_count": 538, "metadata": {}, "output_type": "execute_result" } @@ -290,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 521, + "execution_count": 539, "id": "a32560d4-b5fe-4b90-9ea6-ede7915bba05", "metadata": { "tags": [] @@ -304,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 522, + "execution_count": 540, "id": "d6f1e21a-4a6e-4ad7-9faf-b36e6daff707", "metadata": { "tags": [] @@ -335,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 523, + "execution_count": 541, "id": "7b673619-4eda-4aca-acd5-a125f80d3b20", "metadata": { "tags": [] @@ -346,14 +346,14 @@ "output_type": "stream", "text": [ "Starting to fit pipeline\n", - "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 507.38it/s]\n", - "Fitting KBinsDiscretizer took 0.006914615631103516 seconds\n", - 
"Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 240.62it/s]\n", - "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 29.42it/s]\n", - "Fitting categorical_data_processor class took 0.10196375846862793 seconds\n", - "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 558.52it/s]\n", - "Fitting TargetEncoder took 0.013732433319091797 seconds\n", - "Fitting pipeline took 0.17300176620483398 seconds\n" + "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 251.21it/s]\n", + "Fitting KBinsDiscretizer took 0.012943267822265625 seconds\n", + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 193.52it/s]\n", + "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 26.85it/s]\n", + "Fitting categorical_data_processor class took 0.11171197891235352 seconds\n", + "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 564.66it/s]\n", + "Fitting TargetEncoder took 0.015709400177001953 seconds\n", + "Fitting pipeline took 0.1843581199645996 seconds\n" ] } ], @@ -366,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": 524, + "execution_count": 542, "id": "c9e2c79d-c0bc-464d-b869-f8115ac67776", "metadata": {}, "outputs": [ @@ -374,9 +374,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 160.70it/s]\n", - "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 697.13it/s]\n", - "Transforming data took 0.0610198974609375 seconds\n" + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 130.81it/s]\n", + "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 517.58it/s]\n", + "Transforming data took 0.06473207473754883 seconds\n" ] }, { @@ -427,122 +427,122 @@ " \n", " 0\n", " 2022-01-01\n", - " Category C\n", " Category B\n", - " Category A\n", - " -1.001645\n", - " 4.733706\n", - " 1.372659\n", - " 0\n", + " Category B\n", + " Category C\n", + " -0.247175\n", + " 8.258259\n", + " 0.039901\n", + " 1\n", " selection\n", - " 4.0 - 5.0\n", + " 8.0 - 9.0\n", " ...\n", - " -1.3 - -0.8\n", - " Category C\n", + " -0.3 - 0.0\n", " Category B\n", - " Category A\n", - " 0.504274\n", - " 0.495885\n", - " 0.514872\n", - " 0.467391\n", - " 0.486891\n", - " 0.523364\n", + " Category B\n", + " Category C\n", + " 0.505584\n", + " 0.530256\n", + " 0.517730\n", + " 0.516447\n", + " 0.514851\n", + " 0.494083\n", " \n", " \n", " 1\n", " 2022-01-02\n", - " Category C\n", - " Category C\n", " Category B\n", - " 0.280629\n", - " 9.191129\n", - " 0.635924\n", + " Category B\n", + " Category C\n", + " 0.247006\n", + " 1.234493\n", + " 1.336691\n", " 1\n", " train\n", - " 9.0 - 10.0\n", + " 1.0 - 2.0\n", " ...\n", - " 0.2 - 0.5\n", - " Category C\n", - " Category C\n", + " 0.0 - 0.3\n", " Category B\n", - " 0.504274\n", - " 0.487952\n", - " 0.491000\n", - " 0.474048\n", - " 0.524355\n", - " 0.492997\n", + " Category B\n", + " Category C\n", + " 0.505584\n", + " 0.530256\n", + " 0.517730\n", + " 0.521311\n", + " 0.517986\n", + " 0.529086\n", " \n", " \n", " 
2\n", " 2022-01-03\n", - " Category B\n", - " Category B\n", " Category C\n", - " -0.345219\n", - " 7.731792\n", - " 0.098091\n", + " Category A\n", + " Category B\n", + " 0.076415\n", + " 5.059058\n", + " 1.323273\n", " 1\n", " train\n", - " 7.0 - 8.0\n", + " 5.0 - 6.0\n", " ...\n", - " -0.5 - -0.2\n", - " Category B\n", - " Category B\n", + " 0.0 - 0.3\n", " Category C\n", - " 0.473367\n", - " 0.495885\n", - " 0.465366\n", - " 0.490260\n", - " 0.494297\n", - " 0.433225\n", + " Category A\n", + " Category B\n", + " 0.494939\n", + " 0.461386\n", + " 0.487052\n", + " 0.510903\n", + " 0.517986\n", + " 0.529086\n", " \n", " \n", " 3\n", " 2022-01-04\n", " Category C\n", " Category B\n", - " Category C\n", - " -1.134912\n", - " 0.205132\n", - " 0.179868\n", + " Category A\n", + " -0.306355\n", + " 8.316857\n", + " 0.077718\n", " 0\n", " selection\n", - " 0.0 - 1.0\n", + " 8.0 - 9.0\n", " ...\n", - " -1.3 - -0.8\n", + " -0.5 - -0.3\n", " Category C\n", " Category B\n", - " Category C\n", - " 0.504274\n", - " 0.495885\n", - " 0.465366\n", - " 0.475410\n", - " 0.504065\n", - " 0.523364\n", + " Category A\n", + " 0.494939\n", + " 0.530256\n", + " 0.488603\n", + " 0.516447\n", + " 0.514851\n", + " 0.534884\n", " \n", " \n", " 4\n", " 2022-01-05\n", - " Category A\n", + " Category C\n", " Category C\n", " Category B\n", - " -1.339645\n", - " 2.378540\n", - " 0.966818\n", + " -1.133514\n", + " 8.773722\n", + " 0.356009\n", " 1\n", " train\n", - " 2.0 - 3.0\n", + " 8.0 - 9.0\n", " ...\n", - " -4.0 - -1.3\n", - " Category A\n", + " -1.3 - -0.8\n", + " Category C\n", " Category C\n", " Category B\n", - " 0.491597\n", - " 0.487952\n", - " 0.491000\n", - " 0.455696\n", - " 0.471464\n", - " 0.562290\n", + " 0.494939\n", + " 0.502463\n", + " 0.487052\n", + " 0.516447\n", + " 0.484634\n", + " 0.461078\n", " \n", " \n", "\n", @@ -551,44 +551,44 @@ ], "text/plain": [ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n", + "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n", + "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n", + "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n", + "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n", "\n", " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n", - "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", - "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", - "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", - "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", - "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", + "0 0.039901 1 selection 8.0 - 9.0 ... -0.3 - 0.0 \n", + "1 1.336691 1 train 1.0 - 2.0 ... 0.0 - 0.3 \n", + "2 1.323273 1 train 5.0 - 6.0 ... 0.0 - 0.3 \n", + "3 0.077718 0 selection 8.0 - 9.0 ... -0.5 - -0.3 \n", + "4 0.356009 1 train 8.0 - 9.0 ... 
-1.3 - -0.8 \n", "\n", " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 Category C Category B Category A \n", - "1 Category C Category C Category B \n", - "2 Category B Category B Category C \n", - "3 Category C Category B Category C \n", - "4 Category A Category C Category B \n", + "0 Category B Category B Category C \n", + "1 Category B Category B Category C \n", + "2 Category C Category A Category B \n", + "3 Category C Category B Category A \n", + "4 Category C Category C Category B \n", "\n", " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 0.504274 0.495885 0.514872 0.467391 \n", - "1 0.504274 0.487952 0.491000 0.474048 \n", - "2 0.473367 0.495885 0.465366 0.490260 \n", - "3 0.504274 0.495885 0.465366 0.475410 \n", - "4 0.491597 0.487952 0.491000 0.455696 \n", + "0 0.505584 0.530256 0.517730 0.516447 \n", + "1 0.505584 0.530256 0.517730 0.521311 \n", + "2 0.494939 0.461386 0.487052 0.510903 \n", + "3 0.494939 0.530256 0.488603 0.516447 \n", + "4 0.494939 0.502463 0.487052 0.516447 \n", "\n", " cont_var3_enc cont_var1_enc \n", - "0 0.486891 0.523364 \n", - "1 0.524355 0.492997 \n", - "2 0.494297 0.433225 \n", - "3 0.504065 0.523364 \n", - "4 0.471464 0.562290 \n", + "0 0.514851 0.494083 \n", + "1 0.517986 0.529086 \n", + "2 0.517986 0.529086 \n", + "3 0.514851 0.534884 \n", + "4 0.484634 0.461078 \n", "\n", "[5 rows x 21 columns]" ] }, - "execution_count": 524, + "execution_count": 542, "metadata": {}, "output_type": "execute_result" } @@ -602,7 +602,7 @@ }, { "cell_type": "code", - "execution_count": 525, + "execution_count": 543, "id": "d70f40cc-7814-48a8-91f6-2b7297f97ccc", "metadata": { "tags": [] @@ -625,7 +625,7 @@ }, { "cell_type": "code", - "execution_count": 526, + "execution_count": 544, "id": "95b597b2-b475-4d59-b650-dcc208db1eb5", "metadata": { "tags": [] @@ -642,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 527, + "execution_count": 545, "id": "c6dbd38c-ca5d-492d-815b-1af02d7de143", "metadata": { "tags": [] @@ -663,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 528, + "execution_count": 547, "id": "2a517ff8-d336-4bd3-abdc-2be784259564", "metadata": {}, "outputs": [ @@ -698,7 +698,7 @@ }, { "cell_type": "code", - "execution_count": 530, + "execution_count": 548, "id": "541986d2-8d5d-473c-8871-5e7d2da31c4a", "metadata": {}, "outputs": [ @@ -706,9 +706,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 147.15it/s]\n", - "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 661.65it/s]\n", - "Transforming data took 0.06773138046264648 seconds\n" + "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 119.08it/s]\n", + "Applying target encoding...: 100%|█████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1009.14it/s]\n", + "Transforming data took 0.06331968307495117 seconds\n" ] }, { @@ -759,122 +759,122 @@ " \n", " 0\n", " 2022-01-01\n", - " Category C\n", " Category B\n", - " Category A\n", - " -1.001645\n", - " 4.733706\n", - " 1.372659\n", - " 0\n", + " Category B\n", + " Category C\n", + " -0.247175\n", + " 8.258259\n", + " 0.039901\n", + " 1\n", " selection\n", - " 4.0 - 5.0\n", + " 8.0 - 9.0\n", " ...\n", - " -1.3 - -0.8\n", - " Category C\n", + " -0.3 - 0.0\n", " Category B\n", - " Category A\n", - " 0.504274\n", - " 0.495885\n", - " 
0.514872\n", - " 0.467391\n", - " 0.486891\n", - " 0.523364\n", + " Category B\n", + " Category C\n", + " 0.505584\n", + " 0.530256\n", + " 0.517730\n", + " 0.516447\n", + " 0.514851\n", + " 0.494083\n", " \n", " \n", " 1\n", " 2022-01-02\n", - " Category C\n", - " Category C\n", " Category B\n", - " 0.280629\n", - " 9.191129\n", - " 0.635924\n", + " Category B\n", + " Category C\n", + " 0.247006\n", + " 1.234493\n", + " 1.336691\n", " 1\n", " train\n", - " 9.0 - 10.0\n", + " 1.0 - 2.0\n", " ...\n", - " 0.2 - 0.5\n", - " Category C\n", - " Category C\n", + " 0.0 - 0.3\n", " Category B\n", - " 0.504274\n", - " 0.487952\n", - " 0.491000\n", - " 0.474048\n", - " 0.524355\n", - " 0.492997\n", + " Category B\n", + " Category C\n", + " 0.505584\n", + " 0.530256\n", + " 0.517730\n", + " 0.521311\n", + " 0.517986\n", + " 0.529086\n", " \n", " \n", " 2\n", " 2022-01-03\n", - " Category B\n", - " Category B\n", " Category C\n", - " -0.345219\n", - " 7.731792\n", - " 0.098091\n", + " Category A\n", + " Category B\n", + " 0.076415\n", + " 5.059058\n", + " 1.323273\n", " 1\n", " train\n", - " 7.0 - 8.0\n", + " 5.0 - 6.0\n", " ...\n", - " -0.5 - -0.2\n", - " Category B\n", - " Category B\n", + " 0.0 - 0.3\n", " Category C\n", - " 0.473367\n", - " 0.495885\n", - " 0.465366\n", - " 0.490260\n", - " 0.494297\n", - " 0.433225\n", + " Category A\n", + " Category B\n", + " 0.494939\n", + " 0.461386\n", + " 0.487052\n", + " 0.510903\n", + " 0.517986\n", + " 0.529086\n", " \n", " \n", " 3\n", " 2022-01-04\n", " Category C\n", " Category B\n", - " Category C\n", - " -1.134912\n", - " 0.205132\n", - " 0.179868\n", + " Category A\n", + " -0.306355\n", + " 8.316857\n", + " 0.077718\n", " 0\n", " selection\n", - " 0.0 - 1.0\n", + " 8.0 - 9.0\n", " ...\n", - " -1.3 - -0.8\n", + " -0.5 - -0.3\n", " Category C\n", " Category B\n", - " Category C\n", - " 0.504274\n", - " 0.495885\n", - " 0.465366\n", - " 0.475410\n", - " 0.504065\n", - " 0.523364\n", + " Category A\n", + " 0.494939\n", + " 0.530256\n", + " 0.488603\n", + " 0.516447\n", + " 0.514851\n", + " 0.534884\n", " \n", " \n", " 4\n", " 2022-01-05\n", - " Category A\n", + " Category C\n", " Category C\n", " Category B\n", - " -1.339645\n", - " 2.378540\n", - " 0.966818\n", + " -1.133514\n", + " 8.773722\n", + " 0.356009\n", " 1\n", " train\n", - " 2.0 - 3.0\n", + " 8.0 - 9.0\n", " ...\n", - " -4.0 - -1.3\n", - " Category A\n", + " -1.3 - -0.8\n", + " Category C\n", " Category C\n", " Category B\n", - " 0.491597\n", - " 0.487952\n", - " 0.491000\n", - " 0.455696\n", - " 0.471464\n", - " 0.562290\n", + " 0.494939\n", + " 0.502463\n", + " 0.487052\n", + " 0.516447\n", + " 0.484634\n", + " 0.461078\n", " \n", " \n", "\n", @@ -883,44 +883,44 @@ ], "text/plain": [ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n", - "0 2022-01-01 Category C Category B Category A -1.001645 4.733706 \n", - "1 2022-01-02 Category C Category C Category B 0.280629 9.191129 \n", - "2 2022-01-03 Category B Category B Category C -0.345219 7.731792 \n", - "3 2022-01-04 Category C Category B Category C -1.134912 0.205132 \n", - "4 2022-01-05 Category A Category C Category B -1.339645 2.378540 \n", + "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n", + "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n", + "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n", + "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n", + "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n", "\n", " cont_var3 
target split cont_var2_bin ... cont_var1_bin \\\n", - "0 1.372659 0 selection 4.0 - 5.0 ... -1.3 - -0.8 \n", - "1 0.635924 1 train 9.0 - 10.0 ... 0.2 - 0.5 \n", - "2 0.098091 1 train 7.0 - 8.0 ... -0.5 - -0.2 \n", - "3 0.179868 0 selection 0.0 - 1.0 ... -1.3 - -0.8 \n", - "4 0.966818 1 train 2.0 - 3.0 ... -4.0 - -1.3 \n", + "0 0.039901 1 selection 8.0 - 9.0 ... -0.3 - 0.0 \n", + "1 1.336691 1 train 1.0 - 2.0 ... 0.0 - 0.3 \n", + "2 1.323273 1 train 5.0 - 6.0 ... 0.0 - 0.3 \n", + "3 0.077718 0 selection 8.0 - 9.0 ... -0.5 - -0.3 \n", + "4 0.356009 1 train 8.0 - 9.0 ... -1.3 - -0.8 \n", "\n", " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n", - "0 Category C Category B Category A \n", - "1 Category C Category C Category B \n", - "2 Category B Category B Category C \n", - "3 Category C Category B Category C \n", - "4 Category A Category C Category B \n", + "0 Category B Category B Category C \n", + "1 Category B Category B Category C \n", + "2 Category C Category A Category B \n", + "3 Category C Category B Category A \n", + "4 Category C Category C Category B \n", "\n", " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n", - "0 0.504274 0.495885 0.514872 0.467391 \n", - "1 0.504274 0.487952 0.491000 0.474048 \n", - "2 0.473367 0.495885 0.465366 0.490260 \n", - "3 0.504274 0.495885 0.465366 0.475410 \n", - "4 0.491597 0.487952 0.491000 0.455696 \n", + "0 0.505584 0.530256 0.517730 0.516447 \n", + "1 0.505584 0.530256 0.517730 0.521311 \n", + "2 0.494939 0.461386 0.487052 0.510903 \n", + "3 0.494939 0.530256 0.488603 0.516447 \n", + "4 0.494939 0.502463 0.487052 0.516447 \n", "\n", " cont_var3_enc cont_var1_enc \n", - "0 0.486891 0.523364 \n", - "1 0.524355 0.492997 \n", - "2 0.494297 0.433225 \n", - "3 0.504065 0.523364 \n", - "4 0.471464 0.562290 \n", + "0 0.514851 0.494083 \n", + "1 0.517986 0.529086 \n", + "2 0.517986 0.529086 \n", + "3 0.514851 0.534884 \n", + "4 0.484634 0.461078 \n", "\n", "[5 rows x 21 columns]" ] }, - "execution_count": 530, + "execution_count": 548, "metadata": {}, "output_type": "execute_result" } @@ -934,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": 531, + "execution_count": 549, "id": "c270d856-452d-4507-a3c2-df3ae1991c36", "metadata": {}, "outputs": [ @@ -1321,7 +1321,7 @@ "[5000 rows x 21 columns]" ] }, - "execution_count": 531, + "execution_count": 549, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/model_json.json b/notebooks/model_json.json index fd80281..0670084 100644 --- a/notebooks/model_json.json +++ b/notebooks/model_json.json @@ -1,6 +1,6 @@ { "metadata": { - "timestamp": "16/06/2023 18:00:26" + "timestamp": "16/06/2023 18:39:39" }, "categorical_data_processor": { "category_size_threshold": 5, @@ -79,10 +79,10 @@ ], [ 0.2, - 0.4 + 0.3 ], [ - 0.4, + 0.3, 0.5 ], [ @@ -95,24 +95,24 @@ ], [ 0.9, - 1.3 + 1.2 ], [ - 1.3, - 1.7 + 1.2, + 1.6 ], [ - 1.7, - 2.4 + 1.6, + 2.2 ], [ - 2.4, - 7.6 + 2.2, + 7.3 ] ], "cont_var1": [ [ - -4.0, + -3.1, -1.3 ], [ @@ -125,31 +125,31 @@ ], [ -0.5, - -0.2 + -0.3 ], [ - -0.2, + -0.3, 0.0 ], [ 0.0, - 0.2 + 0.3 ], [ - 0.2, + 0.3, 0.5 ], [ 0.5, - 0.8 + 0.9 ], [ - 0.8, - 1.2 + 0.9, + 1.3 ], [ - 1.2, - 3.7 + 1.3, + 3.3 ] ] } @@ -159,58 +159,58 @@ "weight": 0.0, "_mapping": { "CategoryVar1_processed": { - "Category A": 0.49159663865546216, - "Category B": 0.4733668341708543, - "Category C": 0.5042735042735043 + "Category A": 0.49269717624148, + "Category B": 0.5055837563451777, + "Category C": 0.4949392712550607 }, "CategoryVar2_processed": { - 
"Category A": 0.48643410852713176, - "Category B": 0.49588477366255146, - "Category C": 0.4879518072289157 + "Category A": 0.4613861386138614, + "Category B": 0.5302564102564102, + "Category C": 0.5024630541871922 }, "CategoryVar3_processed": { - "Category A": 0.5148717948717949, - "Category B": 0.491, - "Category C": 0.4653658536585366 + "Category A": 0.4886025768087215, + "Category B": 0.48705179282868527, + "Category C": 0.5177304964539007 }, "cont_var2_bin": { - "0.0 - 1.0": 0.47540983606557374, - "1.0 - 2.0": 0.46855345911949686, - "2.0 - 3.0": 0.45569620253164556, - "3.0 - 4.0": 0.5133333333333333, - "4.0 - 5.0": 0.4673913043478261, - "5.0 - 6.0": 0.5307443365695793, - "6.0 - 7.0": 0.5232974910394266, - "7.0 - 8.0": 0.4902597402597403, - "8.0 - 9.0": 0.5033333333333333, - "9.0 - 10.0": 0.4740484429065744 + "0.0 - 1.0": 0.5333333333333333, + "1.0 - 2.0": 0.521311475409836, + "2.0 - 3.0": 0.4197952218430034, + "3.0 - 4.0": 0.4781144781144781, + "4.0 - 5.0": 0.4557377049180328, + "5.0 - 6.0": 0.5109034267912772, + "6.0 - 7.0": 0.5408163265306123, + "7.0 - 8.0": 0.5050167224080268, + "8.0 - 9.0": 0.5164473684210527, + "9.0 - 10.0": 0.494949494949495 }, "cont_var3_bin": { - "0.0 - 0.1": 0.49429657794676807, - "0.1 - 0.2": 0.5040650406504065, - "0.2 - 0.4": 0.4897025171624714, - "0.4 - 0.5": 0.5, - "0.5 - 0.7": 0.5243553008595988, - "0.7 - 0.9": 0.4703703703703704, - "0.9 - 1.3": 0.47146401985111663, - "1.3 - 1.7": 0.4868913857677903, - "1.7 - 2.4": 0.43416370106761565, - "2.4 - 7.6": 0.5258064516129032 + "0.0 - 0.1": 0.5148514851485149, + "0.1 - 0.2": 0.4936708860759494, + "0.2 - 0.3": 0.50390625, + "0.3 - 0.5": 0.4846335697399527, + "0.5 - 0.7": 0.47774480712166173, + "0.7 - 0.9": 0.49407114624505927, + "0.9 - 1.2": 0.4773413897280967, + "1.2 - 1.6": 0.5179856115107914, + "1.6 - 2.2": 0.5018050541516246, + "2.2 - 7.3": 0.521311475409836 }, "cont_var1_bin": { - "-4.0 - -1.3": 0.5622895622895623, - "-1.3 - -0.8": 0.5233644859813084, - "-0.8 - -0.5": 0.4358974358974359, - "-0.5 - -0.2": 0.43322475570032576, - "-0.2 - 0.0": 0.5219123505976095, - "0.0 - 0.2": 0.4763779527559055, - "0.2 - 0.5": 0.49299719887955185, - "0.5 - 0.8": 0.5054545454545455, - "0.8 - 1.2": 0.4539249146757679, - "1.2 - 3.7": 0.4984984984984985 + "-3.1 - -1.3": 0.5152542372881356, + "-1.3 - -0.8": 0.46107784431137727, + "-0.8 - -0.5": 0.4899328859060403, + "-0.5 - -0.3": 0.5348837209302325, + "-0.3 - 0.0": 0.4940828402366864, + "0.0 - 0.3": 0.5290858725761773, + "0.3 - 0.5": 0.46396396396396394, + "0.5 - 0.9": 0.4743935309973046, + "0.9 - 1.3": 0.48201438848920863, + "1.3 - 3.3": 0.5381944444444444 } }, - "_global_mean": 0.49 + "_global_mean": 0.49766666666666665 }, "_is_fitted": true } \ No newline at end of file