diff --git a/cobra/model_building/__init__.py b/cobra/model_building/__init__.py
index 7a646c3..768112c 100644
--- a/cobra/model_building/__init__.py
+++ b/cobra/model_building/__init__.py
@@ -1,13 +1,13 @@
-from .univariate_selection import compute_univariate_preselection
-from .univariate_selection import get_preselected_predictors
-from .univariate_selection import compute_correlations
-
-from .models import LogisticRegressionModel, LinearRegressionModel
-from .forward_selection import ForwardFeatureSelection
-
-__all__ = ['compute_univariate_preselection',
- 'get_preselected_predictors',
- 'compute_correlations',
- 'LogisticRegressionModel',
- 'LinearRegressionModel',
- 'ForwardFeatureSelection']
+from .univariate_selection import compute_univariate_preselection
+from .univariate_selection import get_preselected_predictors
+from .univariate_selection import compute_correlations
+
+from .models import LogisticRegressionModel, LinearRegressionModel
+from .forward_selection import ForwardFeatureSelection
+
+__all__ = ['compute_univariate_preselection',
+ 'get_preselected_predictors',
+ 'compute_correlations',
+ 'LogisticRegressionModel',
+ 'LinearRegressionModel',
+ 'ForwardFeatureSelection']
diff --git a/cobra/preprocessing/target_encoder.py b/cobra/preprocessing/target_encoder.py
index 3eda39d..f438479 100644
--- a/cobra/preprocessing/target_encoder.py
+++ b/cobra/preprocessing/target_encoder.py
@@ -5,6 +5,7 @@
from tqdm.auto import tqdm
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
+import numpy as np
log = logging.getLogger(__name__)
@@ -123,7 +124,7 @@ def set_attributes_from_dict(self, params: dict):
params["imputation_strategy"] in self.valid_imputation_strategies):
self.imputation_strategy = params["imputation_strategy"]
- if "_global_mean" in params and type(params["_global_mean"]) == float:
+ if "_global_mean" in params and isinstance(params["_global_mean"], (np.floating, float)):
self._global_mean = params["_global_mean"]
_mapping = {}
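
Reviewer note on the hunk above: `_global_mean` is not always a plain Python `float`. When the attribute dict comes straight from a fitted encoder (rather than from a `json.dumps`/`json.load` round trip, as exercised in the debugging notebook added below), it is typically a `numpy.float64`, which the strict `type(...) == float` comparison silently rejects, while the widened `isinstance` check accepts it. A minimal sketch of the distinction (illustrative only, not part of the diff):

```python
import numpy as np

x = np.float64(0.4976)

print(type(x) == float)                     # False: np.float64 is a subclass of float, not float itself
print(isinstance(x, float))                 # True: np.float64 inherits from Python float
print(isinstance(x, (np.floating, float)))  # True; np.floating also covers e.g. np.float32,
                                            # which is *not* a float subclass
```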
diff --git a/cobra/utils.py b/cobra/utils.py
index d901380..daf1156 100644
--- a/cobra/utils.py
+++ b/cobra/utils.py
@@ -1,24 +1,24 @@
-import logging
-
-# logger = logging.getLogger(__name__)
-# logger.setLevel(logging.INFO)
-# logger.addHandler(logging.Handler())
-
-
-def clean_predictor_name(predictor_name: str) -> str:
- """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end
- of the predictor name to return a clean version of the predictor
- """
- return (
- predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "")
- )
-
-
-def log_tutorial() -> None:
- logging.info(
- """
- Hi, welcome to Cobra!
- You can find some tutorials that explain the functioning of cobra on the PythonPredictions GitHub:
- https://github.com/PythonPredictions/cobra/tree/master/tutorials
- """
- )
+import logging
+
+# logger = logging.getLogger(__name__)
+# logger.setLevel(logging.INFO)
+# logger.addHandler(logging.Handler())
+
+
+def clean_predictor_name(predictor_name: str) -> str:
+ """Strip the redundant suffix (e.g. "_enc" or "_bin") off from the end
+ of the predictor name to return a clean version of the predictor
+ """
+ return (
+ predictor_name.replace("_enc", "").replace("_bin", "").replace("_processed", "")
+ )
+
+
+def log_tutorial() -> None:
+ logging.info(
+ """
+ Hi, welcome to Cobra!
+ You can find some tutorials that explain the functioning of cobra on the PythonPredictions GitHub:
+ https://github.com/PythonPredictions/cobra/tree/master/tutorials
+ """
+ )
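
Reviewer note: for reference, the behavior of `clean_predictor_name` re-added above (a quick illustrative sketch, not part of the diff):

```python
from cobra.utils import clean_predictor_name

print(clean_predictor_name("age_enc"))           # -> "age"
print(clean_predictor_name("income_bin"))        # -> "income"
print(clean_predictor_name("region_processed"))  # -> "region"
```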
diff --git a/docs/make.bat b/docs/make.bat
index 6fcf05b..061f32f 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -1,35 +1,35 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.If you don't have Sphinx installed, grab it from
- echo.https://www.sphinx-doc.org/
- exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.https://www.sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/notebooks/debugging.ipynb b/notebooks/debugging.ipynb
new file mode 100644
index 0000000..f420671
--- /dev/null
+++ b/notebooks/debugging.ipynb
@@ -0,0 +1,1364 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 532,
+ "id": "23482fd8-b4c1-48f5-8c30-a0e79f7667b3",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "%reload_ext autoreload"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 533,
+ "id": "da551dc3-ffba-45e0-b87d-7b626a622b08",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.insert(0, r\"C:/projects/cobra\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 534,
+ "id": "7d2678fa-eb47-4cb5-ad1d-c5034a742f55",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import random\n",
+ "from cobra.preprocessing import PreProcessor\n",
+ "\n",
+ "# custom imports\n",
+ "from cobra.preprocessing import CategoricalDataProcessor\n",
+ "from cobra.preprocessing import KBinsDiscretizer\n",
+ "from cobra.preprocessing import TargetEncoder\n",
+ "import json\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4d341ec-b5c3-4b00-a54f-c5b6565d2631",
+ "metadata": {},
+ "source": [
+ "### 1. Generate data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 535,
+ "id": "a9563643-308b-4c6c-b358-9cbf93a0666d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "size = 5000\n",
+ "\n",
+ "# Create datetime column\n",
+ "dates = pd.date_range('2022-01-01', periods=size, freq='D')\n",
+ "\n",
+ "# Create categorical variables\n",
+ "category_values = ['Category A', 'Category B', 'Category C']\n",
+ "cat_var1 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n",
+ "cat_var2 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n",
+ "cat_var3 = pd.Series(np.random.choice(category_values, size=size), dtype='category')\n",
+ "\n",
+ "# Create continuous variables with different scales and distributions\n",
+ "cont_var1 = pd.Series(np.random.normal(loc=0, scale=1, size=size), name='cont_var1')\n",
+ "cont_var2 = pd.Series(np.random.uniform(low=0, high=10, size=size), name='cont_var2')\n",
+ "cont_var3 = pd.Series(np.random.exponential(scale=1, size=size), name='cont_var3')\n",
+ "\n",
+ "# Create target variable\n",
+ "target = pd.Series(np.random.randint(2, size=size))\n",
+ "\n",
+ "# Combine into a DataFrame\n",
+ "df = pd.DataFrame({'DateTime': dates, 'CategoryVar1': cat_var1,\n",
+ " 'CategoryVar2': cat_var2, 'CategoryVar3': cat_var3,\n",
+ " 'cont_var1': cont_var1, 'cont_var2': cont_var2, 'cont_var3': cont_var3,\n",
+ " 'target': target})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 536,
+ "id": "bde9235f-dc62-433d-b3d3-6bf37b2ddb52",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DateTime datetime64[ns]\n",
+ "CategoryVar1 category\n",
+ "CategoryVar2 category\n",
+ "CategoryVar3 category\n",
+ "cont_var1 float64\n",
+ "cont_var2 float64\n",
+ "cont_var3 float64\n",
+ "target int32\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 536,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 537,
+ "id": "d774e959-73f4-40b4-bc20-43c3af99e593",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " DateTime | \n",
+ " CategoryVar1 | \n",
+ " CategoryVar2 | \n",
+ " CategoryVar3 | \n",
+ " cont_var1 | \n",
+ " cont_var2 | \n",
+ " cont_var3 | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2022-01-01 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " -0.247175 | \n",
+ " 8.258259 | \n",
+ " 0.039901 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2022-01-02 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " 0.247006 | \n",
+ " 1.234493 | \n",
+ " 1.336691 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2022-01-03 | \n",
+ " Category C | \n",
+ " Category A | \n",
+ " Category B | \n",
+ " 0.076415 | \n",
+ " 5.059058 | \n",
+ " 1.323273 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2022-01-04 | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " Category A | \n",
+ " -0.306355 | \n",
+ " 8.316857 | \n",
+ " 0.077718 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2022-01-05 | \n",
+ " Category C | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " -1.133514 | \n",
+ " 8.773722 | \n",
+ " 0.356009 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n",
+ "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n",
+ "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n",
+ "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n",
+ "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n",
+ "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n",
+ "\n",
+ " cont_var3 target \n",
+ "0 0.039901 1 \n",
+ "1 1.336691 1 \n",
+ "2 1.323273 1 \n",
+ "3 0.077718 0 \n",
+ "4 0.356009 1 "
+ ]
+ },
+ "execution_count": 537,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 538,
+ "id": "e9c06e3a-188f-4cdc-b9cd-51d3db63e5ff",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['DateTime', 'CategoryVar1', 'CategoryVar2', 'CategoryVar3', 'cont_var1',\n",
+ " 'cont_var2', 'cont_var3', 'target'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 538,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9aae8c98-434b-4c71-abb1-29fa6d143895",
+ "metadata": {},
+ "source": [
+ "### 2. Fit preprocessor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 539,
+ "id": "a32560d4-b5fe-4b90-9ea6-ede7915bba05",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "continuous_vars = ['cont_var2', 'cont_var3', 'cont_var1']\n",
+ "discrete_vars= ['CategoryVar1', 'CategoryVar2', 'CategoryVar3'] #, 'DateTime'] [] \n",
+ "target_col = \"target\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 540,
+ "id": "d6f1e21a-4a6e-4ad7-9faf-b36e6daff707",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.\n"
+ ]
+ }
+ ],
+ "source": [
+ "model_type = \"classification\"\n",
+ "\n",
+ "# using all Cobra's default parameters for preprocessing here\n",
+ "preprocessor = PreProcessor.from_params(\n",
+ " model_type=model_type\n",
+ ")\n",
+ "\n",
+ "random.seed(1212)\n",
+ "basetable = preprocessor.train_selection_validation_split(data=df,\n",
+ " train_prop=0.6,\n",
+ " selection_prop=0.25,\n",
+ " validation_prop=0.15)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 541,
+ "id": "7b673619-4eda-4aca-acd5-a125f80d3b20",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Starting to fit pipeline\n",
+ "Computing discretization bins...: 100%|█████████████████████████████████████████████████| 3/3 [00:00<00:00, 251.21it/s]\n",
+ "Fitting KBinsDiscretizer took 0.012943267822265625 seconds\n",
+ "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 193.52it/s]\n",
+ "Fitting category regrouping...: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 26.85it/s]\n",
+ "Fitting categorical_data_processor class took 0.11171197891235352 seconds\n",
+ "Fitting target encoding...: 100%|███████████████████████████████████████████████████████| 6/6 [00:00<00:00, 564.66it/s]\n",
+ "Fitting TargetEncoder took 0.015709400177001953 seconds\n",
+ "Fitting pipeline took 0.1843581199645996 seconds\n"
+ ]
+ }
+ ],
+ "source": [
+ "preprocessor.fit(basetable[basetable[\"split\"]==\"train\"],\n",
+ " continuous_vars=continuous_vars,\n",
+ " discrete_vars = discrete_vars,\n",
+ " target_column_name=target_col)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 542,
+ "id": "c9e2c79d-c0bc-464d-b869-f8115ac67776",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 130.81it/s]\n",
+ "Applying target encoding...: 100%|██████████████████████████████████████████████████████| 6/6 [00:00<00:00, 517.58it/s]\n",
+ "Transforming data took 0.06473207473754883 seconds\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " DateTime | \n",
+ " CategoryVar1 | \n",
+ " CategoryVar2 | \n",
+ " CategoryVar3 | \n",
+ " cont_var1 | \n",
+ " cont_var2 | \n",
+ " cont_var3 | \n",
+ " target | \n",
+ " split | \n",
+ " cont_var2_bin | \n",
+ " ... | \n",
+ " cont_var1_bin | \n",
+ " CategoryVar1_processed | \n",
+ " CategoryVar2_processed | \n",
+ " CategoryVar3_processed | \n",
+ " CategoryVar1_enc | \n",
+ " CategoryVar2_enc | \n",
+ " CategoryVar3_enc | \n",
+ " cont_var2_enc | \n",
+ " cont_var3_enc | \n",
+ " cont_var1_enc | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2022-01-01 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " -0.247175 | \n",
+ " 8.258259 | \n",
+ " 0.039901 | \n",
+ " 1 | \n",
+ " selection | \n",
+ " 8.0 - 9.0 | \n",
+ " ... | \n",
+ " -0.3 - 0.0 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " 0.505584 | \n",
+ " 0.530256 | \n",
+ " 0.517730 | \n",
+ " 0.516447 | \n",
+ " 0.514851 | \n",
+ " 0.494083 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2022-01-02 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " 0.247006 | \n",
+ " 1.234493 | \n",
+ " 1.336691 | \n",
+ " 1 | \n",
+ " train | \n",
+ " 1.0 - 2.0 | \n",
+ " ... | \n",
+ " 0.0 - 0.3 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " 0.505584 | \n",
+ " 0.530256 | \n",
+ " 0.517730 | \n",
+ " 0.521311 | \n",
+ " 0.517986 | \n",
+ " 0.529086 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2022-01-03 | \n",
+ " Category C | \n",
+ " Category A | \n",
+ " Category B | \n",
+ " 0.076415 | \n",
+ " 5.059058 | \n",
+ " 1.323273 | \n",
+ " 1 | \n",
+ " train | \n",
+ " 5.0 - 6.0 | \n",
+ " ... | \n",
+ " 0.0 - 0.3 | \n",
+ " Category C | \n",
+ " Category A | \n",
+ " Category B | \n",
+ " 0.494939 | \n",
+ " 0.461386 | \n",
+ " 0.487052 | \n",
+ " 0.510903 | \n",
+ " 0.517986 | \n",
+ " 0.529086 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2022-01-04 | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " Category A | \n",
+ " -0.306355 | \n",
+ " 8.316857 | \n",
+ " 0.077718 | \n",
+ " 0 | \n",
+ " selection | \n",
+ " 8.0 - 9.0 | \n",
+ " ... | \n",
+ " -0.5 - -0.3 | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " Category A | \n",
+ " 0.494939 | \n",
+ " 0.530256 | \n",
+ " 0.488603 | \n",
+ " 0.516447 | \n",
+ " 0.514851 | \n",
+ " 0.534884 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2022-01-05 | \n",
+ " Category C | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " -1.133514 | \n",
+ " 8.773722 | \n",
+ " 0.356009 | \n",
+ " 1 | \n",
+ " train | \n",
+ " 8.0 - 9.0 | \n",
+ " ... | \n",
+ " -1.3 - -0.8 | \n",
+ " Category C | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " 0.494939 | \n",
+ " 0.502463 | \n",
+ " 0.487052 | \n",
+ " 0.516447 | \n",
+ " 0.484634 | \n",
+ " 0.461078 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n",
+ "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n",
+ "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n",
+ "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n",
+ "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n",
+ "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n",
+ "\n",
+ " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n",
+ "0 0.039901 1 selection 8.0 - 9.0 ... -0.3 - 0.0 \n",
+ "1 1.336691 1 train 1.0 - 2.0 ... 0.0 - 0.3 \n",
+ "2 1.323273 1 train 5.0 - 6.0 ... 0.0 - 0.3 \n",
+ "3 0.077718 0 selection 8.0 - 9.0 ... -0.5 - -0.3 \n",
+ "4 0.356009 1 train 8.0 - 9.0 ... -1.3 - -0.8 \n",
+ "\n",
+ " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n",
+ "0 Category B Category B Category C \n",
+ "1 Category B Category B Category C \n",
+ "2 Category C Category A Category B \n",
+ "3 Category C Category B Category A \n",
+ "4 Category C Category C Category B \n",
+ "\n",
+ " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n",
+ "0 0.505584 0.530256 0.517730 0.516447 \n",
+ "1 0.505584 0.530256 0.517730 0.521311 \n",
+ "2 0.494939 0.461386 0.487052 0.510903 \n",
+ "3 0.494939 0.530256 0.488603 0.516447 \n",
+ "4 0.494939 0.502463 0.487052 0.516447 \n",
+ "\n",
+ " cont_var3_enc cont_var1_enc \n",
+ "0 0.514851 0.494083 \n",
+ "1 0.517986 0.529086 \n",
+ "2 0.517986 0.529086 \n",
+ "3 0.514851 0.534884 \n",
+ "4 0.484634 0.461078 \n",
+ "\n",
+ "[5 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 542,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "basetable_transformed_orig = preprocessor.transform(basetable,\n",
+ " continuous_vars=continuous_vars,\n",
+ " discrete_vars=discrete_vars)\n",
+ "basetable_transformed_orig.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 543,
+ "id": "d70f40cc-7814-48a8-91f6-2b7297f97ccc",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "#preprocessor._discretizer #._bins_by_column\n",
+ "#preprocessor._target_encoder.attributes_to_dict()\n",
+ "#preprocessor._discretizer.attributes_to_dict()\n",
+ "#preprocessor._target_encoder.attributes_to_dict()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "baab4c1b-4200-4c96-b991-be8efc09abbb",
+ "metadata": {},
+ "source": [
+ "### 3. Serialize the preprocessor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 544,
+ "id": "95b597b2-b475-4d59-b650-dcc208db1eb5",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "pipeline_serialized = preprocessor.serialize_pipeline()\n",
+ "\n",
+ "with open(r\"./model_json.json\", \"w\") as file:\n",
+ " file.write(json.dumps(pipeline_serialized, indent=4))\n",
+ " \n",
+ "#pipeline_serialized"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 545,
+ "id": "c6dbd38c-ca5d-492d-815b-1af02d7de143",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Look into properties of preprocessors\n",
+ "#pipeline_serialized[\"target_encoder\"] #._bins_by_column"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc339ac8-67a7-4574-811e-2b9bc4ce6a39",
+ "metadata": {},
+ "source": [
+ "### 4. De-serialize pipeline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 547,
+ "id": "2a517ff8-d336-4bd3-abdc-2be784259564",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "The target encoder's additive smoothing weight is set to 0. This disables smoothing and may make the encoding prone to overfitting. Increase the weight if needed.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Read serialized pipeline from json\n",
+ "with open(r\"./model_json.json\", \"r\") as file:\n",
+ " json_pipeline_serialized = json.load(file)\n",
+ "\n",
+ "# Create new preprocessor object from serialized pipeline\n",
+ "new_preprocessor = PreProcessor.from_pipeline(json_pipeline_serialized)\n",
+ "#new_preprocessor = PreProcessor.from_pipeline(pipeline_serialized)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 529,
+ "id": "ad9442b5-7f7e-48fe-8199-528992d1f0d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Look into properties of preprocessors if needed\n",
+ "#new_preprocessor._discretizer.attributes_to_dict()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 548,
+ "id": "541986d2-8d5d-473c-8871-5e7d2da31c4a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Discretizing columns...: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 119.08it/s]\n",
+ "Applying target encoding...: 100%|█████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1009.14it/s]\n",
+ "Transforming data took 0.06331968307495117 seconds\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " DateTime | \n",
+ " CategoryVar1 | \n",
+ " CategoryVar2 | \n",
+ " CategoryVar3 | \n",
+ " cont_var1 | \n",
+ " cont_var2 | \n",
+ " cont_var3 | \n",
+ " target | \n",
+ " split | \n",
+ " cont_var2_bin | \n",
+ " ... | \n",
+ " cont_var1_bin | \n",
+ " CategoryVar1_processed | \n",
+ " CategoryVar2_processed | \n",
+ " CategoryVar3_processed | \n",
+ " CategoryVar1_enc | \n",
+ " CategoryVar2_enc | \n",
+ " CategoryVar3_enc | \n",
+ " cont_var2_enc | \n",
+ " cont_var3_enc | \n",
+ " cont_var1_enc | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2022-01-01 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " -0.247175 | \n",
+ " 8.258259 | \n",
+ " 0.039901 | \n",
+ " 1 | \n",
+ " selection | \n",
+ " 8.0 - 9.0 | \n",
+ " ... | \n",
+ " -0.3 - 0.0 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " 0.505584 | \n",
+ " 0.530256 | \n",
+ " 0.517730 | \n",
+ " 0.516447 | \n",
+ " 0.514851 | \n",
+ " 0.494083 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2022-01-02 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " 0.247006 | \n",
+ " 1.234493 | \n",
+ " 1.336691 | \n",
+ " 1 | \n",
+ " train | \n",
+ " 1.0 - 2.0 | \n",
+ " ... | \n",
+ " 0.0 - 0.3 | \n",
+ " Category B | \n",
+ " Category B | \n",
+ " Category C | \n",
+ " 0.505584 | \n",
+ " 0.530256 | \n",
+ " 0.517730 | \n",
+ " 0.521311 | \n",
+ " 0.517986 | \n",
+ " 0.529086 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2022-01-03 | \n",
+ " Category C | \n",
+ " Category A | \n",
+ " Category B | \n",
+ " 0.076415 | \n",
+ " 5.059058 | \n",
+ " 1.323273 | \n",
+ " 1 | \n",
+ " train | \n",
+ " 5.0 - 6.0 | \n",
+ " ... | \n",
+ " 0.0 - 0.3 | \n",
+ " Category C | \n",
+ " Category A | \n",
+ " Category B | \n",
+ " 0.494939 | \n",
+ " 0.461386 | \n",
+ " 0.487052 | \n",
+ " 0.510903 | \n",
+ " 0.517986 | \n",
+ " 0.529086 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2022-01-04 | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " Category A | \n",
+ " -0.306355 | \n",
+ " 8.316857 | \n",
+ " 0.077718 | \n",
+ " 0 | \n",
+ " selection | \n",
+ " 8.0 - 9.0 | \n",
+ " ... | \n",
+ " -0.5 - -0.3 | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " Category A | \n",
+ " 0.494939 | \n",
+ " 0.530256 | \n",
+ " 0.488603 | \n",
+ " 0.516447 | \n",
+ " 0.514851 | \n",
+ " 0.534884 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2022-01-05 | \n",
+ " Category C | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " -1.133514 | \n",
+ " 8.773722 | \n",
+ " 0.356009 | \n",
+ " 1 | \n",
+ " train | \n",
+ " 8.0 - 9.0 | \n",
+ " ... | \n",
+ " -1.3 - -0.8 | \n",
+ " Category C | \n",
+ " Category C | \n",
+ " Category B | \n",
+ " 0.494939 | \n",
+ " 0.502463 | \n",
+ " 0.487052 | \n",
+ " 0.516447 | \n",
+ " 0.484634 | \n",
+ " 0.461078 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 cont_var2 \\\n",
+ "0 2022-01-01 Category B Category B Category C -0.247175 8.258259 \n",
+ "1 2022-01-02 Category B Category B Category C 0.247006 1.234493 \n",
+ "2 2022-01-03 Category C Category A Category B 0.076415 5.059058 \n",
+ "3 2022-01-04 Category C Category B Category A -0.306355 8.316857 \n",
+ "4 2022-01-05 Category C Category C Category B -1.133514 8.773722 \n",
+ "\n",
+ " cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n",
+ "0 0.039901 1 selection 8.0 - 9.0 ... -0.3 - 0.0 \n",
+ "1 1.336691 1 train 1.0 - 2.0 ... 0.0 - 0.3 \n",
+ "2 1.323273 1 train 5.0 - 6.0 ... 0.0 - 0.3 \n",
+ "3 0.077718 0 selection 8.0 - 9.0 ... -0.5 - -0.3 \n",
+ "4 0.356009 1 train 8.0 - 9.0 ... -1.3 - -0.8 \n",
+ "\n",
+ " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n",
+ "0 Category B Category B Category C \n",
+ "1 Category B Category B Category C \n",
+ "2 Category C Category A Category B \n",
+ "3 Category C Category B Category A \n",
+ "4 Category C Category C Category B \n",
+ "\n",
+ " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n",
+ "0 0.505584 0.530256 0.517730 0.516447 \n",
+ "1 0.505584 0.530256 0.517730 0.521311 \n",
+ "2 0.494939 0.461386 0.487052 0.510903 \n",
+ "3 0.494939 0.530256 0.488603 0.516447 \n",
+ "4 0.494939 0.502463 0.487052 0.516447 \n",
+ "\n",
+ " cont_var3_enc cont_var1_enc \n",
+ "0 0.514851 0.494083 \n",
+ "1 0.517986 0.529086 \n",
+ "2 0.517986 0.529086 \n",
+ "3 0.514851 0.534884 \n",
+ "4 0.484634 0.461078 \n",
+ "\n",
+ "[5 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 548,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "basetable_transformed = new_preprocessor.transform(basetable,\n",
+ " continuous_vars=continuous_vars,\n",
+ " discrete_vars=discrete_vars)\n",
+ "basetable_transformed.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 549,
+ "id": "c270d856-452d-4507-a3c2-df3ae1991c36",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " DateTime | \n",
+ " CategoryVar1 | \n",
+ " CategoryVar2 | \n",
+ " CategoryVar3 | \n",
+ " cont_var1 | \n",
+ " cont_var2 | \n",
+ " cont_var3 | \n",
+ " target | \n",
+ " split | \n",
+ " cont_var2_bin | \n",
+ " ... | \n",
+ " cont_var1_bin | \n",
+ " CategoryVar1_processed | \n",
+ " CategoryVar2_processed | \n",
+ " CategoryVar3_processed | \n",
+ " CategoryVar1_enc | \n",
+ " CategoryVar2_enc | \n",
+ " CategoryVar3_enc | \n",
+ " cont_var2_enc | \n",
+ " cont_var3_enc | \n",
+ " cont_var1_enc | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 4995 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4996 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4997 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4998 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4999 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " ... | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5000 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " DateTime CategoryVar1 CategoryVar2 CategoryVar3 cont_var1 \\\n",
+ "0 True True True True True \n",
+ "1 True True True True True \n",
+ "2 True True True True True \n",
+ "3 True True True True True \n",
+ "4 True True True True True \n",
+ "... ... ... ... ... ... \n",
+ "4995 True True True True True \n",
+ "4996 True True True True True \n",
+ "4997 True True True True True \n",
+ "4998 True True True True True \n",
+ "4999 True True True True True \n",
+ "\n",
+ " cont_var2 cont_var3 target split cont_var2_bin ... cont_var1_bin \\\n",
+ "0 True True True True True ... True \n",
+ "1 True True True True True ... True \n",
+ "2 True True True True True ... True \n",
+ "3 True True True True True ... True \n",
+ "4 True True True True True ... True \n",
+ "... ... ... ... ... ... ... ... \n",
+ "4995 True True True True True ... True \n",
+ "4996 True True True True True ... True \n",
+ "4997 True True True True True ... True \n",
+ "4998 True True True True True ... True \n",
+ "4999 True True True True True ... True \n",
+ "\n",
+ " CategoryVar1_processed CategoryVar2_processed CategoryVar3_processed \\\n",
+ "0 True True True \n",
+ "1 True True True \n",
+ "2 True True True \n",
+ "3 True True True \n",
+ "4 True True True \n",
+ "... ... ... ... \n",
+ "4995 True True True \n",
+ "4996 True True True \n",
+ "4997 True True True \n",
+ "4998 True True True \n",
+ "4999 True True True \n",
+ "\n",
+ " CategoryVar1_enc CategoryVar2_enc CategoryVar3_enc cont_var2_enc \\\n",
+ "0 True True True True \n",
+ "1 True True True True \n",
+ "2 True True True True \n",
+ "3 True True True True \n",
+ "4 True True True True \n",
+ "... ... ... ... ... \n",
+ "4995 True True True True \n",
+ "4996 True True True True \n",
+ "4997 True True True True \n",
+ "4998 True True True True \n",
+ "4999 True True True True \n",
+ "\n",
+ " cont_var3_enc cont_var1_enc \n",
+ "0 True True \n",
+ "1 True True \n",
+ "2 True True \n",
+ "3 True True \n",
+ "4 True True \n",
+ "... ... ... \n",
+ "4995 True True \n",
+ "4996 True True \n",
+ "4997 True True \n",
+ "4998 True True \n",
+ "4999 True True \n",
+ "\n",
+ "[5000 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 549,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Double check transformed basetable is the same\n",
+ "basetable_transformed_orig == basetable_transformed"
+ ]
+ },
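+ {
+ "cell_type": "markdown",
+ "id": "added-equals-note",
+ "metadata": {},
+ "source": [
+ "A more compact round-trip check (an added sketch, not part of the original debugging session): `DataFrame.equals` reduces the element-wise comparison above to a single boolean."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "added-equals-check",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Single boolean instead of a 5000 x 21 table of True values\n",
+ "basetable_transformed_orig.equals(basetable_transformed)"
+ ]
+ },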
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b478d7c-46d8-4ba9-bf84-375a7cf901a8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "cobra_venv",
+ "language": "python",
+ "name": "cobra_venv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/model_json.json b/notebooks/model_json.json
new file mode 100644
index 0000000..0670084
--- /dev/null
+++ b/notebooks/model_json.json
@@ -0,0 +1,216 @@
+{
+ "metadata": {
+ "timestamp": "16/06/2023 18:39:39"
+ },
+ "categorical_data_processor": {
+ "category_size_threshold": 5,
+ "forced_categories": {},
+ "keep_missing": true,
+ "model_type": "classification",
+ "p_value_threshold": 0.001,
+ "regroup": true,
+ "regroup_name": "Other",
+ "scale_contingency_table": true,
+ "_cleaned_categories_by_column": {
+ "CategoryVar1": [],
+ "CategoryVar2": [],
+ "CategoryVar3": []
+ }
+ },
+ "discretizer": {
+ "auto_adapt_bins": false,
+ "change_endpoint_format": false,
+ "closed": "right",
+ "label_format": "{} - {}",
+ "n_bins": 10,
+ "starting_precision": 0,
+ "strategy": "quantile",
+ "_bins_by_column": {
+ "cont_var2": [
+ [
+ 0.0,
+ 1.0
+ ],
+ [
+ 1.0,
+ 2.0
+ ],
+ [
+ 2.0,
+ 3.0
+ ],
+ [
+ 3.0,
+ 4.0
+ ],
+ [
+ 4.0,
+ 5.0
+ ],
+ [
+ 5.0,
+ 6.0
+ ],
+ [
+ 6.0,
+ 7.0
+ ],
+ [
+ 7.0,
+ 8.0
+ ],
+ [
+ 8.0,
+ 9.0
+ ],
+ [
+ 9.0,
+ 10.0
+ ]
+ ],
+ "cont_var3": [
+ [
+ 0.0,
+ 0.1
+ ],
+ [
+ 0.1,
+ 0.2
+ ],
+ [
+ 0.2,
+ 0.3
+ ],
+ [
+ 0.3,
+ 0.5
+ ],
+ [
+ 0.5,
+ 0.7
+ ],
+ [
+ 0.7,
+ 0.9
+ ],
+ [
+ 0.9,
+ 1.2
+ ],
+ [
+ 1.2,
+ 1.6
+ ],
+ [
+ 1.6,
+ 2.2
+ ],
+ [
+ 2.2,
+ 7.3
+ ]
+ ],
+ "cont_var1": [
+ [
+ -3.1,
+ -1.3
+ ],
+ [
+ -1.3,
+ -0.8
+ ],
+ [
+ -0.8,
+ -0.5
+ ],
+ [
+ -0.5,
+ -0.3
+ ],
+ [
+ -0.3,
+ 0.0
+ ],
+ [
+ 0.0,
+ 0.3
+ ],
+ [
+ 0.3,
+ 0.5
+ ],
+ [
+ 0.5,
+ 0.9
+ ],
+ [
+ 0.9,
+ 1.3
+ ],
+ [
+ 1.3,
+ 3.3
+ ]
+ ]
+ }
+ },
+ "target_encoder": {
+ "imputation_strategy": "mean",
+ "weight": 0.0,
+ "_mapping": {
+ "CategoryVar1_processed": {
+ "Category A": 0.49269717624148,
+ "Category B": 0.5055837563451777,
+ "Category C": 0.4949392712550607
+ },
+ "CategoryVar2_processed": {
+ "Category A": 0.4613861386138614,
+ "Category B": 0.5302564102564102,
+ "Category C": 0.5024630541871922
+ },
+ "CategoryVar3_processed": {
+ "Category A": 0.4886025768087215,
+ "Category B": 0.48705179282868527,
+ "Category C": 0.5177304964539007
+ },
+ "cont_var2_bin": {
+ "0.0 - 1.0": 0.5333333333333333,
+ "1.0 - 2.0": 0.521311475409836,
+ "2.0 - 3.0": 0.4197952218430034,
+ "3.0 - 4.0": 0.4781144781144781,
+ "4.0 - 5.0": 0.4557377049180328,
+ "5.0 - 6.0": 0.5109034267912772,
+ "6.0 - 7.0": 0.5408163265306123,
+ "7.0 - 8.0": 0.5050167224080268,
+ "8.0 - 9.0": 0.5164473684210527,
+ "9.0 - 10.0": 0.494949494949495
+ },
+ "cont_var3_bin": {
+ "0.0 - 0.1": 0.5148514851485149,
+ "0.1 - 0.2": 0.4936708860759494,
+ "0.2 - 0.3": 0.50390625,
+ "0.3 - 0.5": 0.4846335697399527,
+ "0.5 - 0.7": 0.47774480712166173,
+ "0.7 - 0.9": 0.49407114624505927,
+ "0.9 - 1.2": 0.4773413897280967,
+ "1.2 - 1.6": 0.5179856115107914,
+ "1.6 - 2.2": 0.5018050541516246,
+ "2.2 - 7.3": 0.521311475409836
+ },
+ "cont_var1_bin": {
+ "-3.1 - -1.3": 0.5152542372881356,
+ "-1.3 - -0.8": 0.46107784431137727,
+ "-0.8 - -0.5": 0.4899328859060403,
+ "-0.5 - -0.3": 0.5348837209302325,
+ "-0.3 - 0.0": 0.4940828402366864,
+ "0.0 - 0.3": 0.5290858725761773,
+ "0.3 - 0.5": 0.46396396396396394,
+ "0.5 - 0.9": 0.4743935309973046,
+ "0.9 - 1.3": 0.48201438848920863,
+ "1.3 - 3.3": 0.5381944444444444
+ }
+ },
+ "_global_mean": 0.49766666666666665
+ },
+ "_is_fitted": true
+}
\ No newline at end of file
diff --git a/tests/model_building/test_forward_selection.py b/tests/model_building/test_forward_selection.py
index 19f7157..9383f73 100644
--- a/tests/model_building/test_forward_selection.py
+++ b/tests/model_building/test_forward_selection.py
@@ -1,213 +1,213 @@
-
-from contextlib import contextmanager
-import pytest
-import pandas as pd
-
-from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel
-from cobra.model_building.forward_selection import ForwardFeatureSelection
-
-@contextmanager
-def does_not_raise():
- yield
-
-def mock_data(add_split_col: bool=False, model_type="classification"):
- data = pd.DataFrame({"var1_enc": [0.42] * 10,
- "var2_enc": [0.94] * 10,
- "var3_enc": [0.87] * 10})
-
- if model_type == "classification":
- data["target"] = ([0] * 5 + [1] * 2 + [0] * 2 + [1])
- elif model_type == "regression":
- data["target"] = [7, 2, 2, 9, 7, 3, 1, 4, 8, 5]
-
- if add_split_col:
- data.loc[:, "split"] = (["train"] * 7 + ["selection"] * 3)
-
- return data
-
-def mock_model_num_pred(n_predictors, model_type="classification"):
- predictors = [f"var{i + 1}_enc" for i in range(n_predictors)]
- return mock_model(predictors, model_type)
-
-def mock_model(predictor_list, model_type="classification"):
- if model_type == "classification":
- model = LogisticRegressionModel()
- elif model_type == "regression":
- model = LinearRegressionModel()
-
- model.predictors = predictor_list
-
- return model
-
-
-class TestForwardFeatureSelection:
-
- def test_get_model_from_step(self):
-
- forward_selection = ForwardFeatureSelection()
-
- with pytest.raises(ValueError):
- forward_selection.get_model_from_step(2)
-
- @pytest.mark.parametrize("model_type", ["classification", "regression"])
- def test_compute_model_performances(self, mocker, model_type):
-
- data = mock_data(add_split_col=True, model_type=model_type)
-
- fw_selection = ForwardFeatureSelection(model_type=model_type)
- fw_selection._fitted_models = [
- mock_model_num_pred(1, model_type=model_type),
- mock_model_num_pred(2, model_type=model_type),
- mock_model_num_pred(3, model_type=model_type)
- ]
-
- def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the same for RMSE as it is a mock
- if split == "train":
- return 0.612
- else:
- return 0.609
-
- if model_type == "classification":
- patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate"
- elif model_type == "regression":
- patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate"
-
- mocker.patch(patch_fct, mock_evaluate)
-
- actual = (fw_selection
- .compute_model_performances(data, "target",
- splits=["train", "selection"],
- metric=None))
-
- expected = pd.DataFrame([
- {"predictors": ["var1_enc"],
- "last_added_predictor": "var1_enc",
- "train_performance": 0.612, "selection_performance": 0.609,
- "model_type": model_type},
- {"predictors": ["var1_enc", "var2_enc"],
- "last_added_predictor": "var2_enc",
- "train_performance": 0.612, "selection_performance": 0.609,
- "model_type": model_type},
- {"predictors": ["var1_enc", "var2_enc", "var3_enc"],
- "last_added_predictor": "var3_enc",
- "train_performance": 0.612, "selection_performance": 0.609,
- "model_type": model_type}
- ])
-
- pd.testing.assert_frame_equal(actual, expected)
-
- @pytest.mark.parametrize("model_type", ["classification", "regression"])
- def test_ffs_train_data_assertions(self, model_type):
-
- fw_selection = ForwardFeatureSelection(model_type=model_type)
-
- with pytest.raises(AssertionError): # no split column
- fw_selection.fit(pd.DataFrame(), "target", predictors=[""])
-
- df = mock_data(add_split_col=True, model_type=model_type)
- with pytest.raises(AssertionError): # not at least train & selection sets
- fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""])
-
- @pytest.mark.parametrize("model_type, max_predictors, expectation",
- [("classification", 2, pytest.raises(ValueError)),
- ("classification", 3, does_not_raise()),
- ("classification", 5, does_not_raise()),
- ("classification", 10, does_not_raise()),
- ("classification", 15, does_not_raise()),
- ("regression", 2, pytest.raises(ValueError)),
- ("regression", 3, does_not_raise()),
- ("regression", 5, does_not_raise()),
- ("regression", 10, does_not_raise()),
- ("regression", 15, does_not_raise())
- ])
- def test_fit(self, mocker, model_type, max_predictors: int, expectation):
-
- # create list of elements [var1_enc, var2_enc, ..., var10_enc]
- predictors_list = [f"var{i+1}_enc" for i in range(10)]
- # extract sublist [var1_enc, var5_enc, var9_enc]
- forced_predictors_list = predictors_list[::4]
-
- ordered_output_list = (forced_predictors_list
- + [pred for pred in predictors_list
- if pred not in forced_predictors_list])
-
- fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors)
-
- def mock_train_model(self, train_data, target_column_name, predictors):
- return mock_model(predictors, model_type=model_type)
-
- def mock_forward_selection(self, train_data, target_column_name,
- predictors, forced_predictors):
- n_models = min(max_predictors, len(predictors) + len(forced_predictors))
-
- return [mock_model(ordered_output_list[:i+1], model_type=model_type)
- for i in range(n_models)]
-
- mocker.patch("cobra.model_building.ForwardFeatureSelection._train_model",
- mock_train_model)
-
- mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection",
- mock_forward_selection)
-
- df = mock_data(add_split_col=True, model_type=model_type)
- with expectation:
- fw_selection.fit(df, "target", # data is ignored
- predictors=predictors_list,
- forced_predictors=forced_predictors_list,
- excluded_predictors=[])
-
- # for each fitted model, check number of predictors
- actual = [model.predictors
- for model in fw_selection._fitted_models]
-
- expected = [ordered_output_list[:i+1]
- for i in range(min(max_predictors,
- len(predictors_list)))]
-
- if max_predictors == len(forced_predictors_list):
- expected = [forced_predictors_list]
-
- assert actual == expected
-
- @pytest.mark.parametrize("model_type, max_predictors", [("classification", 5),
- ("classification", 10),
- ("classification", 15),
- ("regression", 5),
- ("regression", 10),
- ("regression", 15)
- ])
- def test_forward_selection(self, mocker, model_type, max_predictors: int):
-
- # create list of elements [var1_enc, var2_c, ..., var10_enc]
- predictors_list = [f"var{i+1}_enc" for i in range(10)]
-
- # extract sublist [var1_enc, var5_enc, var9_enc]:
- forced_predictors = predictors_list[::4]
- # remove these from predictors list to have clean version
- predictors = [pred for pred in predictors_list
- if pred not in forced_predictors]
-
- ordered_output_list = forced_predictors + predictors
-
- def mock_find_next_best_model(self, train_data, target_column_name,
- candidate_predictors,
- current_predictors):
- return mock_model(current_predictors + candidate_predictors[0:1], model_type=model_type)
-
- mocker.patch(("cobra.model_building.ForwardFeatureSelection."
- "_find_next_best_model"), mock_find_next_best_model)
-
- fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors)
-
- fitted_models = (fw_selection.
- _forward_selection(pd.DataFrame(), "target",
- predictors,
- forced_predictors))
-
- actual = [sorted(model.predictors) for model in fitted_models]
-
- expected = [sorted(ordered_output_list[:i+1])
- for i in range(min(max_predictors,
- len(predictors_list)))]
-
- assert actual == expected
+
+from contextlib import contextmanager
+import pytest
+import pandas as pd
+
+from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel
+from cobra.model_building.forward_selection import ForwardFeatureSelection
+
+@contextmanager
+def does_not_raise():
+ yield
+
+def mock_data(add_split_col: bool=False, model_type="classification"):
+ data = pd.DataFrame({"var1_enc": [0.42] * 10,
+ "var2_enc": [0.94] * 10,
+ "var3_enc": [0.87] * 10})
+
+ if model_type == "classification":
+ data["target"] = ([0] * 5 + [1] * 2 + [0] * 2 + [1])
+ elif model_type == "regression":
+ data["target"] = [7, 2, 2, 9, 7, 3, 1, 4, 8, 5]
+
+ if add_split_col:
+ data.loc[:, "split"] = (["train"] * 7 + ["selection"] * 3)
+
+ return data
+
+def mock_model_num_pred(n_predictors, model_type="classification"):
+ predictors = [f"var{i + 1}_enc" for i in range(n_predictors)]
+ return mock_model(predictors, model_type)
+
+def mock_model(predictor_list, model_type="classification"):
+ if model_type == "classification":
+ model = LogisticRegressionModel()
+ elif model_type == "regression":
+ model = LinearRegressionModel()
+
+ model.predictors = predictor_list
+
+ return model
+
+
+class TestForwardFeatureSelection:
+
+ def test_get_model_from_step(self):
+
+ forward_selection = ForwardFeatureSelection()
+
+ with pytest.raises(ValueError):
+ forward_selection.get_model_from_step(2)
+
+ @pytest.mark.parametrize("model_type", ["classification", "regression"])
+ def test_compute_model_performances(self, mocker, model_type):
+
+ data = mock_data(add_split_col=True, model_type=model_type)
+
+ fw_selection = ForwardFeatureSelection(model_type=model_type)
+ fw_selection._fitted_models = [
+ mock_model_num_pred(1, model_type=model_type),
+ mock_model_num_pred(2, model_type=model_type),
+ mock_model_num_pred(3, model_type=model_type)
+ ]
+
+ def mock_evaluate(self, X, y, split, metric): # values on an AUC scale; as a mock it returns the same numbers when the metric is RMSE
+ if split == "train":
+ return 0.612
+ else:
+ return 0.609
+
+ if model_type == "classification":
+ patch_fct = "cobra.model_building.forward_selection.LogisticRegressionModel.evaluate"
+ elif model_type == "regression":
+ patch_fct = "cobra.model_building.forward_selection.LinearRegressionModel.evaluate"
+
+ mocker.patch(patch_fct, mock_evaluate)
+
+ actual = (fw_selection
+ .compute_model_performances(data, "target",
+ splits=["train", "selection"],
+ metric=None))
+
+ expected = pd.DataFrame([
+ {"predictors": ["var1_enc"],
+ "last_added_predictor": "var1_enc",
+ "train_performance": 0.612, "selection_performance": 0.609,
+ "model_type": model_type},
+ {"predictors": ["var1_enc", "var2_enc"],
+ "last_added_predictor": "var2_enc",
+ "train_performance": 0.612, "selection_performance": 0.609,
+ "model_type": model_type},
+ {"predictors": ["var1_enc", "var2_enc", "var3_enc"],
+ "last_added_predictor": "var3_enc",
+ "train_performance": 0.612, "selection_performance": 0.609,
+ "model_type": model_type}
+ ])
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ @pytest.mark.parametrize("model_type", ["classification", "regression"])
+ def test_ffs_train_data_assertions(self, model_type):
+
+ fw_selection = ForwardFeatureSelection(model_type=model_type)
+
+ with pytest.raises(AssertionError): # no split column
+ fw_selection.fit(pd.DataFrame(), "target", predictors=[""])
+
+ df = mock_data(add_split_col=True, model_type=model_type)
+ with pytest.raises(AssertionError): # not at least train & selection sets
+ fw_selection.fit(df[df["split"] == "train"], "target", predictors=[""])
+
+ @pytest.mark.parametrize("model_type, max_predictors, expectation",
+ [("classification", 2, pytest.raises(ValueError)),
+ ("classification", 3, does_not_raise()),
+ ("classification", 5, does_not_raise()),
+ ("classification", 10, does_not_raise()),
+ ("classification", 15, does_not_raise()),
+ ("regression", 2, pytest.raises(ValueError)),
+ ("regression", 3, does_not_raise()),
+ ("regression", 5, does_not_raise()),
+ ("regression", 10, does_not_raise()),
+ ("regression", 15, does_not_raise())
+ ])
+ def test_fit(self, mocker, model_type, max_predictors: int, expectation):
+
+ # create list of elements [var1_enc, var2_enc, ..., var10_enc]
+ predictors_list = [f"var{i+1}_enc" for i in range(10)]
+ # extract sublist [var1_enc, var5_enc, var9_enc]
+ forced_predictors_list = predictors_list[::4]
+
+ ordered_output_list = (forced_predictors_list
+ + [pred for pred in predictors_list
+ if pred not in forced_predictors_list])
+
+ fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors)
+
+ def mock_train_model(self, train_data, target_column_name, predictors):
+ return mock_model(predictors, model_type=model_type)
+
+ def mock_forward_selection(self, train_data, target_column_name,
+ predictors, forced_predictors):
+ n_models = min(max_predictors, len(predictors) + len(forced_predictors))
+
+ return [mock_model(ordered_output_list[:i+1], model_type=model_type)
+ for i in range(n_models)]
+
+ mocker.patch("cobra.model_building.ForwardFeatureSelection._train_model",
+ mock_train_model)
+
+ mocker.patch("cobra.model_building.ForwardFeatureSelection._forward_selection",
+ mock_forward_selection)
+
+ df = mock_data(add_split_col=True, model_type=model_type)
+ with expectation:
+ fw_selection.fit(df, "target", # data is ignored
+ predictors=predictors_list,
+ forced_predictors=forced_predictors_list,
+ excluded_predictors=[])
+
+ # for each fitted model, check number of predictors
+ actual = [model.predictors
+ for model in fw_selection._fitted_models]
+
+ expected = [ordered_output_list[:i+1]
+ for i in range(min(max_predictors,
+ len(predictors_list)))]
+
+ if max_predictors == len(forced_predictors_list):
+ expected = [forced_predictors_list]
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("model_type, max_predictors", [("classification", 5),
+ ("classification", 10),
+ ("classification", 15),
+ ("regression", 5),
+ ("regression", 10),
+ ("regression", 15)
+ ])
+ def test_forward_selection(self, mocker, model_type, max_predictors: int):
+
+ # create list of elements [var1_enc, var2_enc, ..., var10_enc]
+ predictors_list = [f"var{i+1}_enc" for i in range(10)]
+
+ # extract sublist [var1_enc, var5_enc, var9_enc]:
+ forced_predictors = predictors_list[::4]
+ # remove these from predictors list to have clean version
+ predictors = [pred for pred in predictors_list
+ if pred not in forced_predictors]
+
+ ordered_output_list = forced_predictors + predictors
+
+ def mock_find_next_best_model(self, train_data, target_column_name,
+ candidate_predictors,
+ current_predictors):
+ return mock_model(current_predictors + candidate_predictors[0:1], model_type=model_type)
+
+ mocker.patch(("cobra.model_building.ForwardFeatureSelection."
+ "_find_next_best_model"), mock_find_next_best_model)
+
+ fw_selection = ForwardFeatureSelection(model_type=model_type, max_predictors=max_predictors)
+
+ fitted_models = (fw_selection.
+ _forward_selection(pd.DataFrame(), "target",
+ predictors,
+ forced_predictors))
+
+ actual = [sorted(model.predictors) for model in fitted_models]
+
+ expected = [sorted(ordered_output_list[:i+1])
+ for i in range(min(max_predictors,
+ len(predictors_list)))]
+
+ assert actual == expected
diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py
index 7eca6e6..20fce9f 100644
--- a/tests/model_building/test_models.py
+++ b/tests/model_building/test_models.py
@@ -1,258 +1,258 @@
-
-import numpy as np
-import pandas as pd
-
-from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel
-
-def mock_data():
- return pd.DataFrame({"var1_enc": [0.42] * 10,
- "var2_enc": [0.94] * 10,
- "var3_enc": [0.87] * 10})
-
-
-def mock_score_model_classification(self, data):
- return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5])
-
-def mock_score_model_regression(self, data):
- return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15
-
-class TestLogisticRegressionModel:
-
- def test_evaluate(self, mocker):
-
- X = mock_data()
- y = pd.Series([1] * 5 + [0] * 5)
-
- def mock_roc_auc_score(y_true, y_score):
- return 0.79
-
- (mocker
- .patch("cobra.model_building.LogisticRegressionModel.score_model",
- mock_score_model_classification))
-
- (mocker
- .patch("cobra.model_building.models.roc_auc_score",
- mock_roc_auc_score))
-
- model = LogisticRegressionModel()
- actual = model.evaluate(X, y)
-
- assert actual == 0.79
-
- def test_evaluate_cached(self):
-
- split = "train"
- expected = 0.79
-
- model = LogisticRegressionModel()
- model._eval_metrics_by_split["train"] = expected
-
- actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split)
-
- assert actual == expected
-
- def test_compute_variable_importance(self, mocker):
-
- def mock_pearsonr(ypred, ytrue):
- return [ypred.unique()[0]]
-
- (mocker
- .patch("cobra.model_building.LogisticRegressionModel.score_model",
- mock_score_model_classification))
-
- (mocker
- .patch("cobra.model_building.models.stats.pearsonr",
- mock_pearsonr))
-
- model = LogisticRegressionModel()
- model.predictors = ["var1_enc", "var2_enc", "var3_enc"]
-
- data = mock_data()
-
- actual = model.compute_variable_importance(data)
-
- expected = pd.DataFrame([
- {"predictor": "var1", "importance": data["var1_enc"].unique()[0]},
- {"predictor": "var2", "importance": data["var2_enc"].unique()[0]},
- {"predictor": "var3", "importance": data["var3_enc"].unique()[0]}
- ]).sort_values(by="importance", ascending=False).reset_index(drop=True)
-
- pd.testing.assert_frame_equal(actual, expected)
-
- def test_serialize(self):
-
- model = LogisticRegressionModel()
- actual = model.serialize()
-
- expected = {
- "meta": "logistic-regression",
- "predictors": [],
- "_eval_metrics_by_split": {},
- "params": {
- "C": 1000000000.0,
- "class_weight": None,
- "dual": False,
- "fit_intercept": True,
- "intercept_scaling": 1,
- "l1_ratio": None,
- "max_iter": 100,
- "multi_class": "auto",
- "n_jobs": None,
- "penalty": "l2",
- "random_state": 42,
- "solver": "liblinear",
- "tol": 0.0001,
- "verbose": 0,
- "warm_start": False
- }
- }
-
- assert actual == expected
-
- def test_deserialize(self):
-
- model = LogisticRegressionModel()
-
- model_dict = {
- "meta": "logistic-regression",
- "predictors": [],
- "_eval_metrics_by_split": {},
- "params": {
- "C": 1000000000.0,
- "class_weight": None,
- "dual": False,
- "fit_intercept": True,
- "intercept_scaling": 1,
- "l1_ratio": None,
- "max_iter": 100,
- "multi_class": "auto",
- "n_jobs": None,
- "penalty": "l2",
- "random_state": 42,
- "solver": "liblinear",
- "tol": 0.0001,
- "verbose": 0,
- "warm_start": False
- },
- "classes_": [0, 1],
- "coef_": [[0.5, 0.75]],
- "intercept_": [-3],
- "n_iter_": [10]
- }
-
- model.deserialize(model_dict)
-
- logit = model.logit
- assert logit.get_params() == model_dict["params"]
- assert logit.classes_.all() == np.array(model_dict["classes_"]).all()
- assert logit.n_iter_.all() == np.array(model_dict["n_iter_"]).all()
- assert logit.intercept_.all() == (np.array(model_dict["intercept_"]).all())
- assert logit.coef_.all() == np.array(model_dict["coef_"]).all()
-
-class TestLinearRegressionModel:
-
- def test_evaluate(self, mocker):
-
- X = mock_data()
- y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12)
-
- def mock_mean_squared_error(y_true, y_pred):
- return 1.23
-
- (mocker
- .patch("cobra.model_building.LinearRegressionModel.score_model",
- mock_score_model_regression))
-
- (mocker
- .patch("cobra.model_building.models.mean_squared_error",
- mock_mean_squared_error))
-
- model = LinearRegressionModel()
- actual = model.evaluate(X, y)
-
- assert actual == np.sqrt(1.23)
-
- def test_evaluate_cached(self):
-
- split = "train"
- expected = np.sqrt(1.23)
-
- model = LinearRegressionModel()
- model._eval_metrics_by_split["train"] = expected
-
- actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split)
-
- assert actual == expected
-
- def test_compute_variable_importance(self, mocker):
-
- def mock_pearsonr(ypred, ytrue):
- return [ypred.unique()[0]]
-
- (mocker
- .patch("cobra.model_building.LinearRegressionModel.score_model",
- mock_score_model_regression))
-
- (mocker
- .patch("cobra.model_building.models.stats.pearsonr",
- mock_pearsonr))
-
- model = LinearRegressionModel()
- model.predictors = ["var1_enc", "var2_enc", "var3_enc"]
-
- data = mock_data()
-
- actual = model.compute_variable_importance(data)
-
- expected = pd.DataFrame([
- {"predictor": "var1", "importance": data["var1_enc"].unique()[0]},
- {"predictor": "var2", "importance": data["var2_enc"].unique()[0]},
- {"predictor": "var3", "importance": data["var3_enc"].unique()[0]}
- ]).sort_values(by="importance", ascending=False).reset_index(drop=True)
-
- pd.testing.assert_frame_equal(actual, expected)
-
- def test_serialize(self):
-
- model = LinearRegressionModel()
- actual = model.serialize()
-
- expected = {
- "meta": "linear-regression",
- "predictors": [],
- "_eval_metrics_by_split": {},
- "params": {
- "copy_X": True,
- "fit_intercept": True,
- "n_jobs": None,
- "positive": False
- }
- }
-
- assert actual == expected
-
- def test_deserialize(self):
-
- model = LinearRegressionModel()
-
- model_dict = {
- "meta": "linear-regression",
- "predictors": [],
- "_eval_metrics_by_split": {},
- "params": {
- "copy_X": True,
- "fit_intercept": True,
- "n_jobs": None,
- "positive": False
- },
- "coef_": [[0.5, 0.75]],
- "intercept_": [-3]
- }
-
- model.deserialize(model_dict)
-
- linear = model.linear
- assert linear.get_params() == model_dict["params"]
- assert linear.intercept_.all() == (np.array(model_dict["intercept_"]).all())
- assert linear.coef_.all() == np.array(model_dict["coef_"]).all()
-
+
+import numpy as np
+import pandas as pd
+
+from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel
+
+def mock_data():
+ return pd.DataFrame({"var1_enc": [0.42] * 10,
+ "var2_enc": [0.94] * 10,
+ "var3_enc": [0.87] * 10})
+
+
+def mock_score_model_classification(self, data):
+ return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5])
+
+def mock_score_model_regression(self, data):
+ return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15
+
+class TestLogisticRegressionModel:
+
+ def test_evaluate(self, mocker):
+
+ X = mock_data()
+ y = pd.Series([1] * 5 + [0] * 5)
+
+ def mock_roc_auc_score(y_true, y_score):
+ return 0.79
+
+ (mocker
+ .patch("cobra.model_building.LogisticRegressionModel.score_model",
+ mock_score_model_classification))
+
+ (mocker
+ .patch("cobra.model_building.models.roc_auc_score",
+ mock_roc_auc_score))
+
+ model = LogisticRegressionModel()
+ actual = model.evaluate(X, y)
+
+ assert actual == 0.79
+
+ def test_evaluate_cached(self):
+
+ split = "train"
+ expected = 0.79
+
+ model = LogisticRegressionModel()
+ model._eval_metrics_by_split["train"] = expected
+
+ actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split)
+
+ assert actual == expected
+
+ def test_compute_variable_importance(self, mocker):
+
+ def mock_pearsonr(ypred, ytrue):
+ return [ypred.unique()[0]]
+
+ (mocker
+ .patch("cobra.model_building.LogisticRegressionModel.score_model",
+ mock_score_model_classification))
+
+ (mocker
+ .patch("cobra.model_building.models.stats.pearsonr",
+ mock_pearsonr))
+
+ model = LogisticRegressionModel()
+ model.predictors = ["var1_enc", "var2_enc", "var3_enc"]
+
+ data = mock_data()
+
+ actual = model.compute_variable_importance(data)
+
+ expected = pd.DataFrame([
+ {"predictor": "var1", "importance": data["var1_enc"].unique()[0]},
+ {"predictor": "var2", "importance": data["var2_enc"].unique()[0]},
+ {"predictor": "var3", "importance": data["var3_enc"].unique()[0]}
+ ]).sort_values(by="importance", ascending=False).reset_index(drop=True)
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ def test_serialize(self):
+
+ model = LogisticRegressionModel()
+ actual = model.serialize()
+
+ expected = {
+ "meta": "logistic-regression",
+ "predictors": [],
+ "_eval_metrics_by_split": {},
+ "params": {
+ "C": 1000000000.0,
+ "class_weight": None,
+ "dual": False,
+ "fit_intercept": True,
+ "intercept_scaling": 1,
+ "l1_ratio": None,
+ "max_iter": 100,
+ "multi_class": "auto",
+ "n_jobs": None,
+ "penalty": "l2",
+ "random_state": 42,
+ "solver": "liblinear",
+ "tol": 0.0001,
+ "verbose": 0,
+ "warm_start": False
+ }
+ }
+
+ assert actual == expected
+
+ def test_deserialize(self):
+
+ model = LogisticRegressionModel()
+
+ model_dict = {
+ "meta": "logistic-regression",
+ "predictors": [],
+ "_eval_metrics_by_split": {},
+ "params": {
+ "C": 1000000000.0,
+ "class_weight": None,
+ "dual": False,
+ "fit_intercept": True,
+ "intercept_scaling": 1,
+ "l1_ratio": None,
+ "max_iter": 100,
+ "multi_class": "auto",
+ "n_jobs": None,
+ "penalty": "l2",
+ "random_state": 42,
+ "solver": "liblinear",
+ "tol": 0.0001,
+ "verbose": 0,
+ "warm_start": False
+ },
+ "classes_": [0, 1],
+ "coef_": [[0.5, 0.75]],
+ "intercept_": [-3],
+ "n_iter_": [10]
+ }
+
+ model.deserialize(model_dict)
+
+ logit = model.logit
+ assert logit.get_params() == model_dict["params"]
+ np.testing.assert_array_equal(logit.classes_, np.array(model_dict["classes_"]))
+ np.testing.assert_array_equal(logit.n_iter_, np.array(model_dict["n_iter_"]))
+ np.testing.assert_array_equal(logit.intercept_, np.array(model_dict["intercept_"]))
+ np.testing.assert_array_equal(logit.coef_, np.array(model_dict["coef_"]))
+
+class TestLinearRegressionModel:
+
+ def test_evaluate(self, mocker):
+
+ X = mock_data()
+ y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12)
+
+ def mock_mean_squared_error(y_true, y_pred):
+ return 1.23
+
+ (mocker
+ .patch("cobra.model_building.LinearRegressionModel.score_model",
+ mock_score_model_regression))
+
+ (mocker
+ .patch("cobra.model_building.models.mean_squared_error",
+ mock_mean_squared_error))
+
+ model = LinearRegressionModel()
+ actual = model.evaluate(X, y)
+
+ assert actual == np.sqrt(1.23)
+
+ def test_evaluate_cached(self):
+
+ split = "train"
+ expected = np.sqrt(1.23)
+
+ model = LinearRegressionModel()
+ model._eval_metrics_by_split["train"] = expected
+
+ actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split)
+
+ assert actual == expected
+
+ def test_compute_variable_importance(self, mocker):
+
+ def mock_pearsonr(ypred, ytrue):
+ return [ypred.unique()[0]]
+
+ (mocker
+ .patch("cobra.model_building.LinearRegressionModel.score_model",
+ mock_score_model_regression))
+
+ (mocker
+ .patch("cobra.model_building.models.stats.pearsonr",
+ mock_pearsonr))
+
+ model = LinearRegressionModel()
+ model.predictors = ["var1_enc", "var2_enc", "var3_enc"]
+
+ data = mock_data()
+
+ actual = model.compute_variable_importance(data)
+
+ expected = pd.DataFrame([
+ {"predictor": "var1", "importance": data["var1_enc"].unique()[0]},
+ {"predictor": "var2", "importance": data["var2_enc"].unique()[0]},
+ {"predictor": "var3", "importance": data["var3_enc"].unique()[0]}
+ ]).sort_values(by="importance", ascending=False).reset_index(drop=True)
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ def test_serialize(self):
+
+ model = LinearRegressionModel()
+ actual = model.serialize()
+
+ expected = {
+ "meta": "linear-regression",
+ "predictors": [],
+ "_eval_metrics_by_split": {},
+ "params": {
+ "copy_X": True,
+ "fit_intercept": True,
+ "n_jobs": None,
+ "positive": False
+ }
+ }
+
+ assert actual == expected
+
+ def test_deserialize(self):
+
+ model = LinearRegressionModel()
+
+ model_dict = {
+ "meta": "linear-regression",
+ "predictors": [],
+ "_eval_metrics_by_split": {},
+ "params": {
+ "copy_X": True,
+ "fit_intercept": True,
+ "n_jobs": None,
+ "positive": False
+ },
+ "coef_": [[0.5, 0.75]],
+ "intercept_": [-3]
+ }
+
+ model.deserialize(model_dict)
+
+ linear = model.linear
+ assert linear.get_params() == model_dict["params"]
+ np.testing.assert_array_equal(linear.intercept_, np.array(model_dict["intercept_"]))
+ np.testing.assert_array_equal(linear.coef_, np.array(model_dict["coef_"]))
+
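Note on the deserialize assertions above: the removed versions compared `arr.all()` on both sides, which reduces each array to a single boolean and can pass even when the arrays differ, so the re-added versions use numpy's element-wise test helper instead. A minimal sketch of the difference, assuming only numpy:

    import numpy as np

    a = np.array([1.0, 2.0])
    b = np.array([2.0, 1.0])

    # Weak check: both arrays are all-truthy, so this reduces to
    # True == True and passes even though a and b differ element-wise.
    assert a.all() == b.all()

    # Strict check: compares shape and elements, and raises an
    # AssertionError with a readable diff on mismatch.
    np.testing.assert_array_equal(a, np.array([1.0, 2.0]))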
diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py
index dd53434..73f5f4e 100644
--- a/tests/preprocessing/test_categorical_data_processor.py
+++ b/tests/preprocessing/test_categorical_data_processor.py
@@ -1,313 +1,313 @@
-
-import pytest
-import numpy as np
-import pandas as pd
-
-from cobra.preprocessing import CategoricalDataProcessor
-
-class TestCategoricalDataProcessor:
-
- def test_attributes_to_dict(self):
-
- processor = CategoricalDataProcessor()
-
- cleaned_categories = ["a", "b", "c"]
- processor._cleaned_categories_by_column = {
- "variable": set(cleaned_categories)
- }
-
- actual = processor.attributes_to_dict()
-
- expected = {
- "model_type": "classification",
- "regroup": True,
- "regroup_name": "Other",
- "keep_missing": True,
- "category_size_threshold": 5,
- "p_value_threshold": 0.001,
- "scale_contingency_table": True,
- "forced_categories": {},
- "_cleaned_categories_by_column": {
- "variable": list(set(cleaned_categories))
- }
- }
-
- assert actual == expected
-
- @pytest.mark.parametrize("attribute",
- ["regroup", "regroup_name", "keep_missing",
- "category_size_threshold", "p_value_threshold",
- "scale_contingency_table", "forced_categories",
- "_cleaned_categories_by_column"])
- def test_set_attributes_from_dict(self, attribute):
-
- processor = CategoricalDataProcessor()
-
- cleaned_categories = ["a", "b", "c"]
- params = {
- "regroup": True,
- "regroup_name": "Other",
- "keep_missing": True,
- "category_size_threshold": 5,
- "p_value_threshold": 0.001,
- "scale_contingency_table": True,
- "forced_categories": {},
- "_cleaned_categories_by_column": {
- "variable": cleaned_categories
- }
- }
-
- expected = params[attribute]
-
- if attribute == "_cleaned_categories_by_column":
- # list is transformed to a set in CategoricalDataProcessor
- expected = {"variable": set(cleaned_categories)}
-
- processor.set_attributes_from_dict(params)
-
- actual = getattr(processor, attribute)
-
- assert actual == expected
-
- @pytest.mark.parametrize("scale_contingency_table, expected",
- [(False, 0.01329),
- (True, 0.43437)])
- def test_compute_p_value_classification(self, scale_contingency_table, expected):
-
- X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10))
- y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2))
- category = "c1"
-
- actual = (CategoricalDataProcessor
- ._compute_p_value(X, y, category, "classification", scale_contingency_table))
-
- assert pytest.approx(actual, abs=1e-5) == expected
-
- @pytest.mark.parametrize("seed, expected",
- [(505, 0.02222),
- (603, 0.89230)])
- def test_compute_p_value_regression(self, seed, expected):
-
- np.random.seed(seed)
-
- X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10))
- y = pd.Series(data=np.random.uniform(0, 1, 100)*5)
- category = "c1"
-
- actual = (CategoricalDataProcessor
- ._compute_p_value(X, y, category, "regression", None))
-
- assert pytest.approx(actual, abs=1e-5) == expected
-
- def test_get_small_categories(self):
-
- data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5))
- incidence = 0.35
- threshold = 10 # to make it easy to manualLy compute
- expected = {"c3", "c4"}
-
- actual = (CategoricalDataProcessor
- ._get_small_categories(data, incidence, threshold))
-
- assert actual == expected
-
- def test_replace_missings(self):
-
- data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]})
- expected = pd.DataFrame({"variable": ["c1", "c2", "Missing", "Missing",
- "Missing"]
- })
- actual = (CategoricalDataProcessor
- ._replace_missings(data, ["variable"]))
-
- pd.testing.assert_frame_equal(actual, expected)
-
- @pytest.mark.parametrize("cleaned_categories, expected",
- [({"c1", "c2"},
- pd.Series(data=["c1", "c2", "Other", "Other"])),
- ({"c1", "c2", "c3", "c4"},
- pd.Series(data=["c1", "c2", "c3", "c4"]))])
- def test_replace_categories(self, cleaned_categories, expected):
-
- data = pd.Series(data=["c1", "c2", "c3", "c4"])
-
- actual = (CategoricalDataProcessor
- ._replace_categories(data, cleaned_categories, 'Other'))
-
- pd.testing.assert_series_equal(actual, expected)
-
- def test_all_cats_not_significant(self):
- # Expected
- e = {'categorical_var': ['A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B',
- 'C', 'C', 'C', 'C'],
- 'target': [1, 1, 1, 1,
- 0, 0, 0, 0,
- 1, 0, 1, 0],
- 'categorical_var_processed': ['A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B',
- 'C', 'C', 'C', 'C']}
-
- # data -> actual
- d = {'categorical_var': ['A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B',
- 'C', 'C', 'C', 'C'],
- 'target': [1, 1, 1, 1,
- 0, 0, 0, 0,
- 1, 0, 1, 0]}
-
- discrete_vars = ['categorical_var']
- target_column_name = 'target'
-
- data = pd.DataFrame(d, columns=['categorical_var', 'target'])
- expected = pd.DataFrame(e, columns=['categorical_var',
- 'target',
- 'categorical_var_processed'])
-
- categorical_data_processor = CategoricalDataProcessor(
- category_size_threshold=0,
- p_value_threshold=0.0001)
-
- categorical_data_processor.fit(data,
- discrete_vars,
- target_column_name)
-
- actual = categorical_data_processor.transform(data,
- discrete_vars)
-
- pd.testing.assert_frame_equal(actual, expected)
-
- def test_regroup_name(self):
- # Expected
- e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B', 'B', 'B',
- 'C', 'C', 'C', 'C', 'C', 'C'],
- 'target': [1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0,
- 1, 0, 1, 0, 1, 0],
- 'categorical_var_processed': [
- 'A', 'A', 'A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B', 'B', 'B',
- 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']}
-
- # data -> actual
- d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B', 'B', 'B',
- 'C', 'C', 'C', 'C', 'C', 'C'],
- 'target': [1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0,
- 1, 0, 1, 0, 1, 0]}
-
- discrete_vars = ['categorical_var']
- target_column_name = 'target'
-
- data = pd.DataFrame(d, columns=['categorical_var', 'target'])
- expected = pd.DataFrame(e, columns=['categorical_var',
- 'target',
- 'categorical_var_processed'])
-
- expected['categorical_var_processed'] = (
- expected['categorical_var_processed'].astype("category"))
-
- categorical_data_processor = CategoricalDataProcessor(
- category_size_threshold=0,
- regroup_name='OTH',
- p_value_threshold=0.05)
-
- categorical_data_processor.fit(data,
- discrete_vars,
- target_column_name)
-
- actual = categorical_data_processor.transform(data,
- discrete_vars)
-
- pd.testing.assert_frame_equal(actual, expected)
-
- def test_force_category(self):
- # Expected
- e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B', 'B', 'B',
- 'C', 'C', 'C', 'C', 'C', 'C'],
- 'target': [1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0,
- 1, 0, 1, 0, 1, 0],
- 'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B', 'B', 'B',
- 'C', 'C', 'C', 'C', 'C', 'C']}
-
- # data -> actual
- d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
- 'B', 'B', 'B', 'B', 'B', 'B',
- 'C', 'C', 'C', 'C', 'C', 'C'],
- 'target': [1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0,
- 1, 0, 1, 0, 1, 0]}
-
- discrete_vars = ['categorical_var']
- target_column_name = 'target'
-
- data = pd.DataFrame(d, columns=['categorical_var', 'target'])
- expected = pd.DataFrame(e, columns=['categorical_var',
- 'target',
- 'categorical_var_processed'])
-
- expected['categorical_var_processed'] = (
- expected['categorical_var_processed'].astype("category"))
-
- categorical_data_processor = CategoricalDataProcessor(
- category_size_threshold=0,
- forced_categories={'categorical_var': ['C']},
- p_value_threshold=0.05)
-
- categorical_data_processor.fit(data,
- discrete_vars,
- target_column_name)
-
- actual = categorical_data_processor.transform(data,
- discrete_vars)
-
- pd.testing.assert_frame_equal(actual, expected)
-
- def test_categorical_variable_is_constant(self):
- # Expected
- e = {'categorical_var': ['A', 'A', 'A', 'A',
- 'A', 'A', 'A', 'A',
- 'A', 'A', 'A', 'A'],
- 'target': [1, 1, 1, 1,
- 0, 0, 0, 0,
- 1, 0, 1, 0],
- 'categorical_var_processed': ['A', 'A', 'A', 'A',
- 'A', 'A', 'A', 'A',
- 'A', 'A', 'A', 'A']}
-
- # data -> actual
- d = {'categorical_var': ['A', 'A', 'A', 'A',
- 'A', 'A', 'A', 'A',
- 'A', 'A', 'A', 'A'],
- 'target': [1, 1, 1, 1,
- 0, 0, 0, 0,
- 1, 0, 1, 0]}
-
- discrete_vars = ['categorical_var']
- target_column_name = 'target'
-
- data = pd.DataFrame(d, columns=['categorical_var', 'target'])
- expected = pd.DataFrame(e, columns=['categorical_var',
- 'target',
- 'categorical_var_processed'])
-
- expected['categorical_var_processed'] = (
- expected['categorical_var_processed'].astype("category"))
-
- categorical_data_processor = CategoricalDataProcessor(
- category_size_threshold=0,
- p_value_threshold=0.0001)
-
- categorical_data_processor.fit(data,
- discrete_vars,
- target_column_name)
-
- actual = categorical_data_processor.transform(data,
- discrete_vars)
-
- pd.testing.assert_frame_equal(actual, expected)
+
+import pytest
+import numpy as np
+import pandas as pd
+
+from cobra.preprocessing import CategoricalDataProcessor
+
+class TestCategoricalDataProcessor:
+
+ def test_attributes_to_dict(self):
+
+ processor = CategoricalDataProcessor()
+
+ cleaned_categories = ["a", "b", "c"]
+ processor._cleaned_categories_by_column = {
+ "variable": set(cleaned_categories)
+ }
+
+ actual = processor.attributes_to_dict()
+
+ expected = {
+ "model_type": "classification",
+ "regroup": True,
+ "regroup_name": "Other",
+ "keep_missing": True,
+ "category_size_threshold": 5,
+ "p_value_threshold": 0.001,
+ "scale_contingency_table": True,
+ "forced_categories": {},
+ "_cleaned_categories_by_column": {
+ "variable": list(set(cleaned_categories))
+ }
+ }
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("attribute",
+ ["regroup", "regroup_name", "keep_missing",
+ "category_size_threshold", "p_value_threshold",
+ "scale_contingency_table", "forced_categories",
+ "_cleaned_categories_by_column"])
+ def test_set_attributes_from_dict(self, attribute):
+
+ processor = CategoricalDataProcessor()
+
+ cleaned_categories = ["a", "b", "c"]
+ params = {
+ "regroup": True,
+ "regroup_name": "Other",
+ "keep_missing": True,
+ "category_size_threshold": 5,
+ "p_value_threshold": 0.001,
+ "scale_contingency_table": True,
+ "forced_categories": {},
+ "_cleaned_categories_by_column": {
+ "variable": cleaned_categories
+ }
+ }
+
+ expected = params[attribute]
+
+ if attribute == "_cleaned_categories_by_column":
+ # list is transformed to a set in CategoricalDataProcessor
+ expected = {"variable": set(cleaned_categories)}
+
+ processor.set_attributes_from_dict(params)
+
+ actual = getattr(processor, attribute)
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("scale_contingency_table, expected",
+ [(False, 0.01329),
+ (True, 0.43437)])
+ def test_compute_p_value_classification(self, scale_contingency_table, expected):
+
+ X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10))
+ y = pd.Series(data=([0]*35 + [1]*35 + [0]*15 + [1]*5 + [0]*8 + [1]*2))
+ category = "c1"
+
+ actual = (CategoricalDataProcessor
+ ._compute_p_value(X, y, category, "classification", scale_contingency_table))
+
+ assert pytest.approx(actual, abs=1e-5) == expected
+
+ @pytest.mark.parametrize("seed, expected",
+ [(505, 0.02222),
+ (603, 0.89230)])
+ def test_compute_p_value_regression(self, seed, expected):
+
+ np.random.seed(seed)
+
+ X = pd.Series(data=(["c1"]*70 + ["c2"]*20 + ["c3"]*10))
+ y = pd.Series(data=np.random.uniform(0, 1, 100)*5)
+ category = "c1"
+
+ actual = (CategoricalDataProcessor
+ ._compute_p_value(X, y, category, "regression", None))
+
+ assert pytest.approx(actual, abs=1e-5) == expected
+
+ def test_get_small_categories(self):
+
+ data = pd.Series(data=(["c1"]*50 + ["c2"]*25 + ["c3"]*15 + ["c4"]*5))
+ incidence = 0.35
+ threshold = 10 # to make it easy to manually compute
+ expected = {"c3", "c4"}
+
+ actual = (CategoricalDataProcessor
+ ._get_small_categories(data, incidence, threshold))
+
+ assert actual == expected
+
+ def test_replace_missings(self):
+
+ data = pd.DataFrame({"variable": ["c1", "c2", np.nan, "", " "]})
+ expected = pd.DataFrame({"variable": ["c1", "c2", "Missing", "Missing",
+ "Missing"]
+ })
+ actual = (CategoricalDataProcessor
+ ._replace_missings(data, ["variable"]))
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ @pytest.mark.parametrize("cleaned_categories, expected",
+ [({"c1", "c2"},
+ pd.Series(data=["c1", "c2", "Other", "Other"])),
+ ({"c1", "c2", "c3", "c4"},
+ pd.Series(data=["c1", "c2", "c3", "c4"]))])
+ def test_replace_categories(self, cleaned_categories, expected):
+
+ data = pd.Series(data=["c1", "c2", "c3", "c4"])
+
+ actual = (CategoricalDataProcessor
+ ._replace_categories(data, cleaned_categories, 'Other'))
+
+ pd.testing.assert_series_equal(actual, expected)
+
+ def test_all_cats_not_significant(self):
+ # Expected
+ e = {'categorical_var': ['A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B',
+ 'C', 'C', 'C', 'C'],
+ 'target': [1, 1, 1, 1,
+ 0, 0, 0, 0,
+ 1, 0, 1, 0],
+ 'categorical_var_processed': ['A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B',
+ 'C', 'C', 'C', 'C']}
+
+ # data -> actual
+ d = {'categorical_var': ['A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B',
+ 'C', 'C', 'C', 'C'],
+ 'target': [1, 1, 1, 1,
+ 0, 0, 0, 0,
+ 1, 0, 1, 0]}
+
+ discrete_vars = ['categorical_var']
+ target_column_name = 'target'
+
+ data = pd.DataFrame(d, columns=['categorical_var', 'target'])
+ expected = pd.DataFrame(e, columns=['categorical_var',
+ 'target',
+ 'categorical_var_processed'])
+
+ categorical_data_processor = CategoricalDataProcessor(
+ category_size_threshold=0,
+ p_value_threshold=0.0001)
+
+ categorical_data_processor.fit(data,
+ discrete_vars,
+ target_column_name)
+
+ actual = categorical_data_processor.transform(data,
+ discrete_vars)
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ def test_regroup_name(self):
+ # Expected
+ e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B', 'B', 'B',
+ 'C', 'C', 'C', 'C', 'C', 'C'],
+ 'target': [1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ 1, 0, 1, 0, 1, 0],
+ 'categorical_var_processed': [
+ 'A', 'A', 'A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B', 'B', 'B',
+ 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']}
+
+ # data -> actual
+ d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B', 'B', 'B',
+ 'C', 'C', 'C', 'C', 'C', 'C'],
+ 'target': [1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ 1, 0, 1, 0, 1, 0]}
+
+ discrete_vars = ['categorical_var']
+ target_column_name = 'target'
+
+ data = pd.DataFrame(d, columns=['categorical_var', 'target'])
+ expected = pd.DataFrame(e, columns=['categorical_var',
+ 'target',
+ 'categorical_var_processed'])
+
+ expected['categorical_var_processed'] = (
+ expected['categorical_var_processed'].astype("category"))
+
+ categorical_data_processor = CategoricalDataProcessor(
+ category_size_threshold=0,
+ regroup_name='OTH',
+ p_value_threshold=0.05)
+
+ categorical_data_processor.fit(data,
+ discrete_vars,
+ target_column_name)
+
+ actual = categorical_data_processor.transform(data,
+ discrete_vars)
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ def test_force_category(self):
+ # Expected
+ e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B', 'B', 'B',
+ 'C', 'C', 'C', 'C', 'C', 'C'],
+ 'target': [1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ 1, 0, 1, 0, 1, 0],
+ 'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B', 'B', 'B',
+ 'C', 'C', 'C', 'C', 'C', 'C']}
+
+ # data -> actual
+ d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
+ 'B', 'B', 'B', 'B', 'B', 'B',
+ 'C', 'C', 'C', 'C', 'C', 'C'],
+ 'target': [1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ 1, 0, 1, 0, 1, 0]}
+
+ discrete_vars = ['categorical_var']
+ target_column_name = 'target'
+
+ data = pd.DataFrame(d, columns=['categorical_var', 'target'])
+ expected = pd.DataFrame(e, columns=['categorical_var',
+ 'target',
+ 'categorical_var_processed'])
+
+ expected['categorical_var_processed'] = (
+ expected['categorical_var_processed'].astype("category"))
+
+ categorical_data_processor = CategoricalDataProcessor(
+ category_size_threshold=0,
+ forced_categories={'categorical_var': ['C']},
+ p_value_threshold=0.05)
+
+ categorical_data_processor.fit(data,
+ discrete_vars,
+ target_column_name)
+
+ actual = categorical_data_processor.transform(data,
+ discrete_vars)
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ def test_categorical_variable_is_constant(self):
+ # Expected
+ e = {'categorical_var': ['A', 'A', 'A', 'A',
+ 'A', 'A', 'A', 'A',
+ 'A', 'A', 'A', 'A'],
+ 'target': [1, 1, 1, 1,
+ 0, 0, 0, 0,
+ 1, 0, 1, 0],
+ 'categorical_var_processed': ['A', 'A', 'A', 'A',
+ 'A', 'A', 'A', 'A',
+ 'A', 'A', 'A', 'A']}
+
+ # data -> actual
+ d = {'categorical_var': ['A', 'A', 'A', 'A',
+ 'A', 'A', 'A', 'A',
+ 'A', 'A', 'A', 'A'],
+ 'target': [1, 1, 1, 1,
+ 0, 0, 0, 0,
+ 1, 0, 1, 0]}
+
+ discrete_vars = ['categorical_var']
+ target_column_name = 'target'
+
+ data = pd.DataFrame(d, columns=['categorical_var', 'target'])
+ expected = pd.DataFrame(e, columns=['categorical_var',
+ 'target',
+ 'categorical_var_processed'])
+
+ expected['categorical_var_processed'] = (
+ expected['categorical_var_processed'].astype("category"))
+
+ categorical_data_processor = CategoricalDataProcessor(
+ category_size_threshold=0,
+ p_value_threshold=0.0001)
+
+ categorical_data_processor.fit(data,
+ discrete_vars,
+ target_column_name)
+
+ actual = categorical_data_processor.transform(data,
+ discrete_vars)
+
+ pd.testing.assert_frame_equal(actual, expected)
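For orientation, the behaviour these CategoricalDataProcessor tests pin down can be condensed into a short usage sketch. The parameter and column names are taken from the tests above; the data is illustrative only, not a documented example:

    import pandas as pd
    from cobra.preprocessing import CategoricalDataProcessor

    data = pd.DataFrame({
        "categorical_var": ["A"] * 6 + ["B"] * 6 + ["C"] * 6,
        "target": [1] * 6 + [0] * 6 + [1, 0, 1, 0, 1, 0],
    })

    processor = CategoricalDataProcessor(category_size_threshold=0,
                                         p_value_threshold=0.05)
    processor.fit(data, ["categorical_var"], "target")

    # Adds a "categorical_var_processed" column; categories whose relation
    # to the target is not significant at p_value_threshold are regrouped
    # under regroup_name (default "Other").
    result = processor.transform(data, ["categorical_var"])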
diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py
index d3a643a..209d74b 100644
--- a/tests/preprocessing/test_kbins_discretizer.py
+++ b/tests/preprocessing/test_kbins_discretizer.py
@@ -1,252 +1,252 @@
-
-from contextlib import contextmanager
-import pytest
-import numpy as np
-import pandas as pd
-
-from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer
-
-@contextmanager
-def does_not_raise():
- yield
-
-
-class TestKBinsDiscretizer:
-
- # ---------------- Test for public methods ----------------
- def test_attributes_to_dict(self):
-
- discretizer = KBinsDiscretizer()
-
- bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]
- discretizer._bins_by_column = {"variable": bins}
-
- actual = discretizer.attributes_to_dict()
-
- expected = {
- "n_bins": 10,
- "strategy": "quantile",
- "closed": "right",
- "auto_adapt_bins": False,
- "starting_precision": 0,
- "label_format": "{} - {}",
- "change_endpoint_format": False,
- "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0],
- [6.0, 9.0]]}
- }
-
- assert actual == expected
-
- @pytest.mark.parametrize("attribute",
- ["n_bins", "strategy", "closed",
- "auto_adapt_bins", "starting_precision",
- "label_format", "change_endpoint_format",
- "_bins_by_column"])
- def test_set_attributes_from_dict(self, attribute):
-
- discretizer = KBinsDiscretizer()
-
- params = {
- "n_bins": 5,
- "strategy": "uniform",
- "closed": "left",
- "auto_adapt_bins": True,
- "starting_precision": 1,
- "label_format": "[,)",
- "change_endpoint_format": True,
- "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0],
- [6.0, 9.0]]}
- }
-
- expected = params[attribute]
-
- if attribute == "_bins_by_column":
- # list of list is transformed to a list of tuples
- # in KBinsDiscretizer!!!
- expected = {"variable": [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]}
-
- discretizer.set_attributes_from_dict(params)
-
- actual = getattr(discretizer, attribute)
-
- assert actual == expected
-
- # no further tests here as this is just a wrapper around _fit_column!
- @pytest.mark.parametrize("strategy, expectation",
- [("trees", pytest.raises(ValueError)),
- ("quantile", does_not_raise())])
- def test_fit_exception(self, strategy, expectation):
- discretizer = KBinsDiscretizer(strategy=strategy)
-
- data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
-
- with expectation:
- discretizer.fit(data, ["variable"])
-
- # no further tests here as this is just a wrapper around _transform_column!
- @pytest.mark.parametrize("scenario, expectation",
- [("raise", pytest.raises(ValueError)),
- ("regular_test", does_not_raise()),
- ("constant_data", does_not_raise())])
- def test_transform(self, scenario, expectation):
-
- discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform")
-
- data = pd.DataFrame({"variable": ([1] * 10)})
- expected = data.copy()
-
- if scenario == "regular_test":
- # overwrite data and expected with DataFrame containing
- # a non-constant variable
- data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
- expected = data.copy()
-
- discretizer.fit(data, ["variable"])
-
- categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"]
- expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4
- + ["3.0 - 6.0"]*3
- + ["6.0 - 9.0"]*3
- + ["Missing"],
- categories=categories,
- ordered=True)
- elif scenario == "constant_data":
- discretizer.fit(data, ["variable"])
-
- with expectation:
- actual = discretizer.transform(data, ["variable"])
- pd.testing.assert_frame_equal(actual, expected)
-
- # ---------------- Test for private methods ----------------
- @pytest.mark.parametrize("n_bins, expectation",
- [(1, pytest.raises(ValueError)),
- (10.5, pytest.raises(ValueError)),
- (2, does_not_raise())])
- def test_validate_n_bins_exception(self, n_bins, expectation):
- with expectation:
- assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None
-
- def test_transform_column(self):
-
- data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
- discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform")
-
- bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]
-
- actual = discretizer._transform_column(data, "variable", bins)
-
- categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"]
-
- expected = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
- expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4
- + ["3.0 - 6.0"]*3
- + ["6.0 - 9.0"]*3
- + ["Missing"],
- categories=categories,
- ordered=True)
-
- # assert using pandas testing module
- pd.testing.assert_frame_equal(actual, expected)
-
- @pytest.mark.parametrize("n_bins, auto_adapt_bins, data, expected",
- [(4, False,
- pd.DataFrame({"variable": list(range(0, 11))}),
- [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0),
- (8.0, 10.0)]),
- (10, True,
- # ints from 0-10 with 17 nan's
- pd.DataFrame({"variable": list(range(0, 11)) +
- ([np.nan] * 17)}),
- [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0),
- (8.0, 10.0)]),
- (10, False,
- # almost constant
- pd.DataFrame({"variable": [0] + ([1] * 100)}),
- None),
- (2, False,
- pd.DataFrame({"variable": [5.4, 9.3, np.inf]}),
- None)],
- ids=["regular", "auto_adapt_bins",
- "two bin edges", "infs"])
- def test_fit_column(self, n_bins, auto_adapt_bins, data, expected):
- discretizer = KBinsDiscretizer(n_bins=n_bins,
- auto_adapt_bins=auto_adapt_bins)
-
- actual = discretizer._fit_column(data, column_name="variable")
-
- assert actual == expected
-
- @pytest.mark.parametrize("strategy, n_bins, data, expected",
- [("quantile", # strategy
- 4, # n_bins
- # data (ints from 0 - 10):
- pd.DataFrame({"variable": list(range(0, 11))}),
- [0.0, 2.5, 5, 7.5, 10.0]), # expected result
- ("uniform", # strategy
- 3, # n_bins
- # data (ints from 0 - 9):
- pd.DataFrame({"variable": list(range(0, 10))}),
- [0.0, 3.0, 6.0, 9.0])], # expected result
- ids=["quantile", "uniform"])
- def test_compute_bin_edges(self, strategy, n_bins, data, expected):
-
- discretizer = KBinsDiscretizer(strategy=strategy)
-
- actual = discretizer._compute_bin_edges(data, column_name="variable",
- n_bins=n_bins,
- col_min=data.variable.min(),
- col_max=data.variable.max())
-
- assert actual == expected
-
- @pytest.mark.parametrize("bin_edges, starting_precision, expected",
- [([-10, 0, 1, 2], 1, 1),
- ([-10, 0, 1, 1.01], 0, 2),
- ([-10, 0, 1, 1.1], 1, 1),
- ([-10, 0, 1, 2], -1, 0),
- ([-10, 0, 10, 21], -1, -1)],
- ids=["less precision", "more precision",
- "equal precision", "negative start",
- "round up"])
- def test_compute_minimal_precision_of_bin_edges(self, bin_edges,
- starting_precision,
- expected):
-
- discretizer = KBinsDiscretizer(starting_precision=starting_precision)
-
- actual = discretizer._compute_minimal_precision_of_bin_edges(bin_edges)
-
- assert actual == expected
-
- @pytest.mark.parametrize("bin_edges, expected",
- [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]),
- ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]),
- ([np.inf, 0.0, -np.inf],
- [(np.inf, 0.0), (0.0, -np.inf)])])
- def test_compute_bins_from_edges(self, bin_edges, expected):
-
- discretizer = KBinsDiscretizer()
- actual = discretizer._compute_bins_from_edges(bin_edges)
-
- assert actual == expected
-
- @pytest.mark.parametrize("change_endpoint_format, closed, bins, expected",
- [(False, "right", [(0, 1), (1, 2), (2, 3)],
- ["0 - 1", "1 - 2", "2 - 3"]),
- (True, "right", [(0, 1), (1, 2), (2, 3)],
- ["<= 1", "1 - 2", "> 2"]),
- (True, "left", [(0, 1), (1, 2), (2, 3)],
- ["< 1", "1 - 2", ">= 2"])],
- ids=["standard format", "different endpoints",
- "different endpoints left"])
- def test_create_bin_labels(self, change_endpoint_format, closed,
- bins, expected):
-
- discretizer = KBinsDiscretizer(
- closed=closed,
- change_endpoint_format=change_endpoint_format
- )
-
- actual = discretizer._create_bin_labels(bins)
-
- assert actual == expected
+
+from contextlib import contextmanager
+import pytest
+import numpy as np
+import pandas as pd
+
+from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer
+
+@contextmanager
+def does_not_raise():
+ yield
+
+
+class TestKBinsDiscretizer:
+
+ # ---------------- Test for public methods ----------------
+ def test_attributes_to_dict(self):
+
+ discretizer = KBinsDiscretizer()
+
+ bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]
+ discretizer._bins_by_column = {"variable": bins}
+
+ actual = discretizer.attributes_to_dict()
+
+ expected = {
+ "n_bins": 10,
+ "strategy": "quantile",
+ "closed": "right",
+ "auto_adapt_bins": False,
+ "starting_precision": 0,
+ "label_format": "{} - {}",
+ "change_endpoint_format": False,
+ "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0],
+ [6.0, 9.0]]}
+ }
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("attribute",
+ ["n_bins", "strategy", "closed",
+ "auto_adapt_bins", "starting_precision",
+ "label_format", "change_endpoint_format",
+ "_bins_by_column"])
+ def test_set_attributes_from_dict(self, attribute):
+
+ discretizer = KBinsDiscretizer()
+
+ params = {
+ "n_bins": 5,
+ "strategy": "uniform",
+ "closed": "left",
+ "auto_adapt_bins": True,
+ "starting_precision": 1,
+ "label_format": "[,)",
+ "change_endpoint_format": True,
+ "_bins_by_column": {"variable": [[0.0, 3.0], [3.0, 6.0],
+ [6.0, 9.0]]}
+ }
+
+ expected = params[attribute]
+
+ if attribute == "_bins_by_column":
+ # a list of lists is transformed to a list of tuples
+ # in KBinsDiscretizer
+ expected = {"variable": [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]}
+
+ discretizer.set_attributes_from_dict(params)
+
+ actual = getattr(discretizer, attribute)
+
+ assert actual == expected
+
+ # no further tests here as this is just a wrapper around _fit_column!
+ @pytest.mark.parametrize("strategy, expectation",
+ [("trees", pytest.raises(ValueError)),
+ ("quantile", does_not_raise())])
+ def test_fit_exception(self, strategy, expectation):
+ discretizer = KBinsDiscretizer(strategy=strategy)
+
+ data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
+
+ with expectation:
+ discretizer.fit(data, ["variable"])
+
+ # no further tests here as this is just a wrapper around _transform_column!
+ @pytest.mark.parametrize("scenario, expectation",
+ [("raise", pytest.raises(ValueError)),
+ ("regular_test", does_not_raise()),
+ ("constant_data", does_not_raise())])
+ def test_transform(self, scenario, expectation):
+
+ discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform")
+
+ data = pd.DataFrame({"variable": ([1] * 10)})
+ expected = data.copy()
+
+ if scenario == "regular_test":
+ # overwrite data and expected with DataFrame containing
+ # a non-constant variable
+ data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
+ expected = data.copy()
+
+ discretizer.fit(data, ["variable"])
+
+ categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"]
+ expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4
+ + ["3.0 - 6.0"]*3
+ + ["6.0 - 9.0"]*3
+ + ["Missing"],
+ categories=categories,
+ ordered=True)
+ elif scenario == "constant_data":
+ discretizer.fit(data, ["variable"])
+
+ with expectation:
+ actual = discretizer.transform(data, ["variable"])
+ pd.testing.assert_frame_equal(actual, expected)
+
+ # ---------------- Test for private methods ----------------
+ @pytest.mark.parametrize("n_bins, expectation",
+ [(1, pytest.raises(ValueError)),
+ (10.5, pytest.raises(ValueError)),
+ (2, does_not_raise())])
+ def test_validate_n_bins_exception(self, n_bins, expectation):
+ with expectation:
+ assert KBinsDiscretizer()._validate_n_bins(n_bins=n_bins) is None
+
+ def test_transform_column(self):
+
+ data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
+ discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform")
+
+ bins = [(0.0, 3.0), (3.0, 6.0), (6.0, 9.0)]
+
+ actual = discretizer._transform_column(data, "variable", bins)
+
+ categories = ["0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0", "Missing"]
+
+ expected = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})
+ expected["variable_bin"] = pd.Categorical(["0.0 - 3.0"]*4
+ + ["3.0 - 6.0"]*3
+ + ["6.0 - 9.0"]*3
+ + ["Missing"],
+ categories=categories,
+ ordered=True)
+
+ # assert using pandas testing module
+ pd.testing.assert_frame_equal(actual, expected)
+
+ @pytest.mark.parametrize("n_bins, auto_adapt_bins, data, expected",
+ [(4, False,
+ pd.DataFrame({"variable": list(range(0, 11))}),
+ [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0),
+ (8.0, 10.0)]),
+ (10, True,
+ # ints from 0-10 with 17 NaNs
+ pd.DataFrame({"variable": list(range(0, 11)) +
+ ([np.nan] * 17)}),
+ [(0.0, 2.0), (2.0, 5.0), (5.0, 8.0),
+ (8.0, 10.0)]),
+ (10, False,
+ # almost constant
+ pd.DataFrame({"variable": [0] + ([1] * 100)}),
+ None),
+ (2, False,
+ pd.DataFrame({"variable": [5.4, 9.3, np.inf]}),
+ None)],
+ ids=["regular", "auto_adapt_bins",
+ "two bin edges", "infs"])
+ def test_fit_column(self, n_bins, auto_adapt_bins, data, expected):
+ discretizer = KBinsDiscretizer(n_bins=n_bins,
+ auto_adapt_bins=auto_adapt_bins)
+
+ actual = discretizer._fit_column(data, column_name="variable")
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("strategy, n_bins, data, expected",
+ [("quantile", # strategy
+ 4, # n_bins
+ # data (ints from 0 - 10):
+ pd.DataFrame({"variable": list(range(0, 11))}),
+ [0.0, 2.5, 5, 7.5, 10.0]), # expected result
+ ("uniform", # strategy
+ 3, # n_bins
+ # data (ints from 0 - 9):
+ pd.DataFrame({"variable": list(range(0, 10))}),
+ [0.0, 3.0, 6.0, 9.0])], # expected result
+ ids=["quantile", "uniform"])
+ def test_compute_bin_edges(self, strategy, n_bins, data, expected):
+
+ discretizer = KBinsDiscretizer(strategy=strategy)
+
+ actual = discretizer._compute_bin_edges(data, column_name="variable",
+ n_bins=n_bins,
+ col_min=data.variable.min(),
+ col_max=data.variable.max())
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("bin_edges, starting_precision, expected",
+ [([-10, 0, 1, 2], 1, 1),
+ ([-10, 0, 1, 1.01], 0, 2),
+ ([-10, 0, 1, 1.1], 1, 1),
+ ([-10, 0, 1, 2], -1, 0),
+ ([-10, 0, 10, 21], -1, -1)],
+ ids=["less precision", "more precision",
+ "equal precision", "negative start",
+ "round up"])
+ def test_compute_minimal_precision_of_bin_edges(self, bin_edges,
+ starting_precision,
+ expected):
+
+ discretizer = KBinsDiscretizer(starting_precision=starting_precision)
+
+ actual = discretizer._compute_minimal_precision_of_bin_edges(bin_edges)
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("bin_edges, expected",
+ [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]),
+ ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]),
+ ([np.inf, 0.0, -np.inf],
+ [(np.inf, 0.0), (0.0, -np.inf)])])
+ def test_compute_bins_from_edges(self, bin_edges, expected):
+
+ discretizer = KBinsDiscretizer()
+ actual = discretizer._compute_bins_from_edges(bin_edges)
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("change_endpoint_format, closed, bins, expected",
+ [(False, "right", [(0, 1), (1, 2), (2, 3)],
+ ["0 - 1", "1 - 2", "2 - 3"]),
+ (True, "right", [(0, 1), (1, 2), (2, 3)],
+ ["<= 1", "1 - 2", "> 2"]),
+ (True, "left", [(0, 1), (1, 2), (2, 3)],
+ ["< 1", "1 - 2", ">= 2"])],
+ ids=["standard format", "different endpoints",
+ "different endpoints left"])
+ def test_create_bin_labels(self, change_endpoint_format, closed,
+ bins, expected):
+
+ discretizer = KBinsDiscretizer(
+ closed=closed,
+ change_endpoint_format=change_endpoint_format
+ )
+
+ actual = discretizer._create_bin_labels(bins)
+
+ assert actual == expected
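Similarly, a condensed sketch of the KBinsDiscretizer behaviour exercised above; data and parameters mirror the regular case of test_transform, so treat this as an illustration distilled from the tests rather than API documentation:

    import numpy as np
    import pandas as pd
    from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer

    data = pd.DataFrame({"variable": list(range(0, 10)) + [np.nan]})

    discretizer = KBinsDiscretizer(n_bins=3, strategy="uniform")
    discretizer.fit(data, ["variable"])

    # Adds an ordered categorical "variable_bin" column with labels such as
    # "0.0 - 3.0", "3.0 - 6.0", "6.0 - 9.0"; NaN rows land in "Missing".
    binned = discretizer.transform(data, ["variable"])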
diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py
index 08f5b63..a97a4e4 100644
--- a/tests/preprocessing/test_preprocessor.py
+++ b/tests/preprocessing/test_preprocessor.py
@@ -1,398 +1,398 @@
-from contextlib import contextmanager
-from typing import Any
-from unittest.mock import MagicMock
-import pytest
-import numpy as np
-import pandas as pd
-from pytest_mock import MockerFixture
-
-from cobra.preprocessing.preprocessor import PreProcessor
-
-
-@contextmanager
-def does_not_raise():
- yield
-
-
-class TestPreProcessor:
- @pytest.mark.parametrize(
- "train_prop, selection_prop, validation_prop, " "expected_sizes",
- [
- (0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}),
- (0.7, 0.3, 0.0, {"train": 7, "selection": 3}),
- # Error "The sum of train_prop, selection_prop and
- # validation_prop must be 1.0." should not be
- # raised:
- (0.7, 0.2, 0.1, {"train": 7, "selection": 2, "validation": 1}),
- ],
- )
- def test_train_selection_validation_split(
- self,
- train_prop: float,
- selection_prop: float,
- validation_prop: float,
- expected_sizes: dict,
- ):
- X = np.arange(100).reshape(10, 10)
- data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)])
- data.loc[:, "target"] = np.array([0] * 7 + [1] * 3)
-
- actual = PreProcessor.train_selection_validation_split(
- data, train_prop, selection_prop, validation_prop
- )
-
- # check for the output schema
- assert list(actual.columns) == list(data.columns)
-
- # check that total size of input & output is the same!
- assert len(actual.index) == len(data.index)
-
- # check for the output sizes per split
- actual_sizes = actual.groupby("split").size().to_dict()
-
- assert actual_sizes == expected_sizes
-
- def test_train_selection_validation_split_error_wrong_prop(self):
-
- error_msg = (
- "The sum of train_prop, selection_prop and " "validation_prop must be 1.0."
- )
- train_prop = 0.7
- selection_prop = 0.3
-
- self._test_train_selection_validation_split_error(
- train_prop, selection_prop, error_msg
- )
-
- def test_train_selection_validation_split_error_zero_selection_prop(self):
-
- error_msg = "selection_prop cannot be zero!"
- train_prop = 0.9
- selection_prop = 0.0
-
- self._test_train_selection_validation_split_error(
- train_prop, selection_prop, error_msg
- )
-
- def _test_train_selection_validation_split_error(
- self, train_prop: float, selection_prop: float, error_msg: str
- ):
- df = pd.DataFrame()
- with pytest.raises(ValueError, match=error_msg):
- (
- PreProcessor.train_selection_validation_split(
- df,
- train_prop=train_prop,
- selection_prop=selection_prop,
- validation_prop=0.1,
- )
- )
-
- @pytest.mark.parametrize(
- "injection_location, expected",
- [
- (None, True),
- ("categorical_data_processor", False),
- ("discretizer", False),
- ("target_encoder", False),
- ],
- )
- def test_is_valid_pipeline(self, injection_location: str, expected: bool):
-
- # is_valid_pipeline only checks for relevant keys atm
- pipeline_dict = {
- "categorical_data_processor": {
- "model_type": None,
- "regroup": None,
- "regroup_name": None,
- "keep_missing": None,
- "category_size_threshold": None,
- "p_value_threshold": None,
- "scale_contingency_table": None,
- "forced_categories": None,
- },
- "discretizer": {
- "n_bins": None,
- "strategy": None,
- "closed": None,
- "auto_adapt_bins": None,
- "starting_precision": None,
- "label_format": None,
- "change_endpoint_format": None,
- },
- "target_encoder": {
- "weight": None,
- "imputation_strategy": None,
- },
- }
-
- if injection_location:
- pipeline_dict[injection_location]["wrong_key"] = None
-
- actual = PreProcessor._is_valid_pipeline(pipeline_dict)
-
- assert actual == expected
-
- @pytest.mark.parametrize(
- ("continuous_vars, discrete_vars, expectation, " "expected"),
- [
- ([], [], pytest.raises(ValueError), None),
- (
- ["c1", "c2"],
- ["d1", "d2"],
- does_not_raise(),
- ["d1_processed", "d2_processed", "c1_bin", "c2_bin"],
- ),
- (["c1", "c2"], [], does_not_raise(), ["c1_bin", "c2_bin"]),
- ([], ["d1", "d2"], does_not_raise(), ["d1_processed", "d2_processed"]),
- ],
- )
- def test_get_variable_list(
- self,
- continuous_vars: list,
- discrete_vars: list,
- expectation: Any,
- expected: list,
- ):
-
- with expectation:
- actual = PreProcessor._get_variable_list(continuous_vars, discrete_vars)
-
- assert actual == expected
-
- @pytest.mark.parametrize(
- ("input, expected"),
- [
- # example 1
- (
- pd.DataFrame({
- "ID": list(range(20)),
- "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
- "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
- "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
- "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
- }
- ),
- pd.DataFrame({
- 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
- 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
- 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
- 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
- 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
- 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
- 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
- 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
- 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
- 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
- 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
- }
- ),
- )
- ]
- )
- def test_fit_transform_without_id_col_name(self, input, expected):
-
- preprocessor = PreProcessor.from_params(model_type="classification")
-
- continuous_vars, discrete_vars = preprocessor.get_continuous_and_discrete_columns(input, "ID","Target")
-
- calculated = preprocessor.fit_transform(
- input,
- continuous_vars=continuous_vars,
- discrete_vars=discrete_vars,
- target_column_name="Target"
- )
- pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)
-
- @pytest.mark.parametrize(
- ("input, expected"),
- [
- # example 1
- (
- pd.DataFrame({
- "ID": list(range(20)),
- "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
- "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
- "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
- "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
- }
- ),
- pd.DataFrame({
- 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
- 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
- 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
- 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
- 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
- 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
- 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
- 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
- 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
- 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
- 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
- }
- ),
- )
- ]
- )
- def test_fit_transform_with_id_col_name(self, input, expected):
-
- preprocessor = PreProcessor.from_params(model_type="classification")
-
- # continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target")
-
- calculated = preprocessor.fit_transform(
- input,
- continuous_vars=None,
- discrete_vars=None,
- target_column_name="Target",
- id_col_name="ID"
- )
- pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)
-
- @staticmethod
- def mock_transform(df: pd.DataFrame, args):
- """Mock the transform method."""
- df["new_column"] = "Hello World"
- return df
-
- def test_mutable_train_data_fit_transform(self, mocker: MockerFixture):
- """Test if the train_data input is not changed when performing fit_transform."""
- train_data = pd.DataFrame(
- [[1, "2", 3], [10, "20", 30], [100, "200", 300]],
- columns=["foo", "bar", "baz"],
- )
- preprocessor = PreProcessor.from_params(
- model_type="classification", n_bins=10, weight=0.8
- )
- preprocessor._categorical_data_processor = MagicMock()
- preprocessor._categorical_data_processor.transform = self.mock_transform
- preprocessor._discretizer = MagicMock()
- preprocessor._discretizer.transform = self.mock_transform
- preprocessor._target_encoder = MagicMock()
- preprocessor._target_encoder.transform = self.mock_transform
-
- result = preprocessor.fit_transform(
- train_data,
- continuous_vars=["foo"],
- discrete_vars=["bar"],
- target_column_name=["baz"],
- )
- assert "new_column" not in train_data.columns
- assert "new_column" in result.columns
-
- @pytest.mark.parametrize(
- ("input, expected"),
- [
- # example 1
- (
- pd.DataFrame(
- {
- "a": [1, 8, np.nan],
- "b": [np.nan, 8, np.nan],
- "c": [np.nan, np.nan, np.nan],
- "d": [np.nan, np.nan, 5],
- "e": [1, 960, np.nan],
- "f": [np.nan, np.nan, np.nan],
- }
- ),
- pd.DataFrame(
- {
- "a": [1.0, 8.0, np.nan],
- "b": [np.nan, 8.0, np.nan],
- "d": [np.nan, np.nan, 5.0],
- "e": [1.0, 960.0, np.nan],
- }
- ),
- ),
- # example 2
- (
- pd.DataFrame(
- {
- "a": [1, 8, np.nan],
- "b": [np.nan, 8, np.nan],
- "c": [np.nan, np.nan, np.nan],
- "d": [np.nan, np.nan, 5],
- "e": [1, 960, np.nan],
- }
- ),
- pd.DataFrame(
- {
- "a": [1.0, 8.0, np.nan],
- "b": [np.nan, 8.0, np.nan],
- "d": [np.nan, np.nan, 5.0],
- "e": [1.0, 960.0, np.nan],
- }
- ),
- ),
- # example 3
- (
- pd.DataFrame(
- {
- "a": [1, 8, np.nan],
- "b": [np.nan, 8, np.nan],
- "d": [np.nan, np.nan, 5],
- "e": [1, 960, np.nan],
- }
- ),
- pd.DataFrame(
- {
- "a": [1.0, 8.0, np.nan],
- "b": [np.nan, 8.0, np.nan],
- "d": [np.nan, np.nan, 5.0],
- "e": [1.0, 960.0, np.nan],
- }
- ),
- ),
- # example 4 categorical
- (
- pd.DataFrame(
- {
- "a": [1, 8, np.nan],
- "b": [np.nan, np.nan, np.nan],
- "d": [np.nan, np.nan, 5],
- "e": [1, 960, np.nan],
- "category_1": ["A", "A", "B"],
- "category_2": [np.nan, "A", "B"],
- "category_3": [np.nan, np.nan, np.nan],
- },
- ).astype(
- {
- "a": np.float64(),
- "b": np.float64(),
- "d": np.float64(),
- "e": np.float64(),
- "category_1": pd.CategoricalDtype(),
- "category_2": pd.CategoricalDtype(),
- "category_3": pd.CategoricalDtype(),
- }
- ),
- pd.DataFrame(
- {
- "a": [1, 8, np.nan],
- "d": [np.nan, np.nan, 5],
- "e": [1, 960, np.nan],
- "category_1": ["A", "A", "B"],
- "category_2": [np.nan, "A", "B"],
- }
- ).astype(
- {
- "a": np.float64(),
- "d": np.float64(),
- "e": np.float64(),
- "category_1": pd.CategoricalDtype(),
- "category_2": pd.CategoricalDtype(),
- }
- ),
- ),
- ],
- )
- def test_drops_columns_containing_only_nan(self, input, expected):
-
- print(input)
- output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(
- input
- )
-
- print(output)
- print(expected)
- assert output.equals(expected)
+from contextlib import contextmanager
+from typing import Any
+from unittest.mock import MagicMock
+import pytest
+import numpy as np
+import pandas as pd
+from pytest_mock import MockerFixture
+
+from cobra.preprocessing.preprocessor import PreProcessor
+
+
+@contextmanager
+def does_not_raise():
+ yield
+
+
+class TestPreProcessor:
+ @pytest.mark.parametrize(
+ "train_prop, selection_prop, validation_prop, " "expected_sizes",
+ [
+ (0.6, 0.2, 0.2, {"train": 6, "selection": 2, "validation": 2}),
+ (0.7, 0.3, 0.0, {"train": 7, "selection": 3}),
+ # Error "The sum of train_prop, selection_prop and
+ # validation_prop must be 1.0." should not be
+ # raised:
+ (0.7, 0.2, 0.1, {"train": 7, "selection": 2, "validation": 1}),
+ ],
+ )
+ def test_train_selection_validation_split(
+ self,
+ train_prop: float,
+ selection_prop: float,
+ validation_prop: float,
+ expected_sizes: dict,
+ ):
+ X = np.arange(100).reshape(10, 10)
+ data = pd.DataFrame(X, columns=[f"c{i+1}" for i in range(10)])
+ data.loc[:, "target"] = np.array([0] * 7 + [1] * 3)
+
+ actual = PreProcessor.train_selection_validation_split(
+ data, train_prop, selection_prop, validation_prop
+ )
+
+ # check for the output schema
+ assert list(actual.columns) == list(data.columns)
+
+ # check that total size of input & output is the same!
+ assert len(actual.index) == len(data.index)
+
+ # check for the output sizes per split
+ actual_sizes = actual.groupby("split").size().to_dict()
+
+ assert actual_sizes == expected_sizes
+
+ def test_train_selection_validation_split_error_wrong_prop(self):
+
+ error_msg = (
+ "The sum of train_prop, selection_prop and " "validation_prop must be 1.0."
+ )
+ train_prop = 0.7
+ selection_prop = 0.3
+
+ self._test_train_selection_validation_split_error(
+ train_prop, selection_prop, error_msg
+ )
+
+ def test_train_selection_validation_split_error_zero_selection_prop(self):
+
+ error_msg = "selection_prop cannot be zero!"
+ train_prop = 0.9
+ selection_prop = 0.0
+
+ self._test_train_selection_validation_split_error(
+ train_prop, selection_prop, error_msg
+ )
+
+ def _test_train_selection_validation_split_error(
+ self, train_prop: float, selection_prop: float, error_msg: str
+ ):
+ df = pd.DataFrame()
+ with pytest.raises(ValueError, match=error_msg):
+ (
+ PreProcessor.train_selection_validation_split(
+ df,
+ train_prop=train_prop,
+ selection_prop=selection_prop,
+ validation_prop=0.1,
+ )
+ )
+
+ @pytest.mark.parametrize(
+ "injection_location, expected",
+ [
+ (None, True),
+ ("categorical_data_processor", False),
+ ("discretizer", False),
+ ("target_encoder", False),
+ ],
+ )
+ def test_is_valid_pipeline(self, injection_location: str, expected: bool):
+
+ # _is_valid_pipeline only checks for the relevant keys at the moment
+ pipeline_dict = {
+ "categorical_data_processor": {
+ "model_type": None,
+ "regroup": None,
+ "regroup_name": None,
+ "keep_missing": None,
+ "category_size_threshold": None,
+ "p_value_threshold": None,
+ "scale_contingency_table": None,
+ "forced_categories": None,
+ },
+ "discretizer": {
+ "n_bins": None,
+ "strategy": None,
+ "closed": None,
+ "auto_adapt_bins": None,
+ "starting_precision": None,
+ "label_format": None,
+ "change_endpoint_format": None,
+ },
+ "target_encoder": {
+ "weight": None,
+ "imputation_strategy": None,
+ },
+ }
+
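+        # Injecting an unexpected key into any sub-dict should invalidate the pipeline.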
+ if injection_location:
+ pipeline_dict[injection_location]["wrong_key"] = None
+
+ actual = PreProcessor._is_valid_pipeline(pipeline_dict)
+
+ assert actual == expected
+
+ @pytest.mark.parametrize(
+ ("continuous_vars, discrete_vars, expectation, " "expected"),
+ [
+ ([], [], pytest.raises(ValueError), None),
+ (
+ ["c1", "c2"],
+ ["d1", "d2"],
+ does_not_raise(),
+ ["d1_processed", "d2_processed", "c1_bin", "c2_bin"],
+ ),
+ (["c1", "c2"], [], does_not_raise(), ["c1_bin", "c2_bin"]),
+ ([], ["d1", "d2"], does_not_raise(), ["d1_processed", "d2_processed"]),
+ ],
+ )
+ def test_get_variable_list(
+ self,
+ continuous_vars: list,
+ discrete_vars: list,
+ expectation: Any,
+ expected: list,
+ ):
+
+ with expectation:
+ actual = PreProcessor._get_variable_list(continuous_vars, discrete_vars)
+
+ assert actual == expected
+
+ @pytest.mark.parametrize(
+ ("input, expected"),
+ [
+ # example 1
+ (
+ pd.DataFrame({
+ "ID": list(range(20)),
+ "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
+ "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
+ "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
+ "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
+ }
+ ),
+ pd.DataFrame({
+ 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
+ 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
+ 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
+ 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
+ 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
+ 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
+ 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
+ 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
+ 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
+ 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
+ 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
+ }
+ ),
+ )
+ ]
+ )
+ def test_fit_transform_without_id_col_name(self, input, expected):
+
+ preprocessor = PreProcessor.from_params(model_type="classification")
+
+        continuous_vars, discrete_vars = preprocessor.get_continuous_and_discrete_columns(
+            input, "ID", "Target"
+        )
+
+ calculated = preprocessor.fit_transform(
+ input,
+ continuous_vars=continuous_vars,
+ discrete_vars=discrete_vars,
+ target_column_name="Target"
+ )
+ pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)
+
+ @pytest.mark.parametrize(
+ ("input, expected"),
+ [
+ # example 1
+ (
+ pd.DataFrame({
+ "ID": list(range(20)),
+ "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8],
+ "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5,
+ "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17],
+ "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5
+ }
+ ),
+ pd.DataFrame({
+ 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
+ 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
+ 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
+ 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17],
+ 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
+ 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'],
+ 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'],
+ 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8],
+ 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0],
+ 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5],
+ 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0]
+ }
+ ),
+ )
+ ]
+ )
+ def test_fit_transform_with_id_col_name(self, input, expected):
+
+ preprocessor = PreProcessor.from_params(model_type="classification")
+
+        # No explicit continuous_vars/discrete_vars here: passing id_col_name="ID"
+        # lets fit_transform derive them itself (cf. the test above, which calls
+        # get_continuous_and_discrete_columns explicitly).
+
+ calculated = preprocessor.fit_transform(
+ input,
+ continuous_vars=None,
+ discrete_vars=None,
+ target_column_name="Target",
+ id_col_name="ID"
+ )
+ pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False)
+
+ @staticmethod
+    def mock_transform(df: pd.DataFrame, *args):
+ """Mock the transform method."""
+ df["new_column"] = "Hello World"
+ return df
+
+    def test_mutable_train_data_fit_transform(self):
+        """Test that the train_data input is not changed by fit_transform."""
+ train_data = pd.DataFrame(
+ [[1, "2", 3], [10, "20", 30], [100, "200", 300]],
+ columns=["foo", "bar", "baz"],
+ )
+ preprocessor = PreProcessor.from_params(
+ model_type="classification", n_bins=10, weight=0.8
+ )
+ preprocessor._categorical_data_processor = MagicMock()
+ preprocessor._categorical_data_processor.transform = self.mock_transform
+ preprocessor._discretizer = MagicMock()
+ preprocessor._discretizer.transform = self.mock_transform
+ preprocessor._target_encoder = MagicMock()
+ preprocessor._target_encoder.transform = self.mock_transform
+
+ result = preprocessor.fit_transform(
+ train_data,
+ continuous_vars=["foo"],
+ discrete_vars=["bar"],
+ target_column_name=["baz"],
+ )
+ assert "new_column" not in train_data.columns
+ assert "new_column" in result.columns
+
+ @pytest.mark.parametrize(
+ ("input, expected"),
+ [
+ # example 1
+ (
+ pd.DataFrame(
+ {
+ "a": [1, 8, np.nan],
+ "b": [np.nan, 8, np.nan],
+ "c": [np.nan, np.nan, np.nan],
+ "d": [np.nan, np.nan, 5],
+ "e": [1, 960, np.nan],
+ "f": [np.nan, np.nan, np.nan],
+ }
+ ),
+ pd.DataFrame(
+ {
+ "a": [1.0, 8.0, np.nan],
+ "b": [np.nan, 8.0, np.nan],
+ "d": [np.nan, np.nan, 5.0],
+ "e": [1.0, 960.0, np.nan],
+ }
+ ),
+ ),
+ # example 2
+ (
+ pd.DataFrame(
+ {
+ "a": [1, 8, np.nan],
+ "b": [np.nan, 8, np.nan],
+ "c": [np.nan, np.nan, np.nan],
+ "d": [np.nan, np.nan, 5],
+ "e": [1, 960, np.nan],
+ }
+ ),
+ pd.DataFrame(
+ {
+ "a": [1.0, 8.0, np.nan],
+ "b": [np.nan, 8.0, np.nan],
+ "d": [np.nan, np.nan, 5.0],
+ "e": [1.0, 960.0, np.nan],
+ }
+ ),
+ ),
+ # example 3
+ (
+ pd.DataFrame(
+ {
+ "a": [1, 8, np.nan],
+ "b": [np.nan, 8, np.nan],
+ "d": [np.nan, np.nan, 5],
+ "e": [1, 960, np.nan],
+ }
+ ),
+ pd.DataFrame(
+ {
+ "a": [1.0, 8.0, np.nan],
+ "b": [np.nan, 8.0, np.nan],
+ "d": [np.nan, np.nan, 5.0],
+ "e": [1.0, 960.0, np.nan],
+ }
+ ),
+ ),
+ # example 4 categorical
+ (
+ pd.DataFrame(
+ {
+ "a": [1, 8, np.nan],
+ "b": [np.nan, np.nan, np.nan],
+ "d": [np.nan, np.nan, 5],
+ "e": [1, 960, np.nan],
+ "category_1": ["A", "A", "B"],
+ "category_2": [np.nan, "A", "B"],
+ "category_3": [np.nan, np.nan, np.nan],
+ },
+ ).astype(
+ {
+ "a": np.float64(),
+ "b": np.float64(),
+ "d": np.float64(),
+ "e": np.float64(),
+ "category_1": pd.CategoricalDtype(),
+ "category_2": pd.CategoricalDtype(),
+ "category_3": pd.CategoricalDtype(),
+ }
+ ),
+ pd.DataFrame(
+ {
+ "a": [1, 8, np.nan],
+ "d": [np.nan, np.nan, 5],
+ "e": [1, 960, np.nan],
+ "category_1": ["A", "A", "B"],
+ "category_2": [np.nan, "A", "B"],
+ }
+ ).astype(
+ {
+ "a": np.float64(),
+ "d": np.float64(),
+ "e": np.float64(),
+ "category_1": pd.CategoricalDtype(),
+ "category_2": pd.CategoricalDtype(),
+ }
+ ),
+ ),
+ ],
+ )
+ def test_drops_columns_containing_only_nan(self, input, expected):
+
+        output = PreProcessor._check_nan_columns_and_drop_columns_containing_only_nan(
+            input
+        )
+
+ assert output.equals(expected)
diff --git a/tests/preprocessing/test_target_encoder.py b/tests/preprocessing/test_target_encoder.py
index 51ebd79..e03992c 100644
--- a/tests/preprocessing/test_target_encoder.py
+++ b/tests/preprocessing/test_target_encoder.py
@@ -1,342 +1,342 @@
-
-import pytest
-import pandas as pd
-from sklearn.exceptions import NotFittedError
-
-from cobra.preprocessing.target_encoder import TargetEncoder
-
-class TestTargetEncoder:
-
- def test_target_encoder_constructor_weight_value_error(self):
- with pytest.raises(ValueError):
- TargetEncoder(weight=-1)
-
- def test_target_encoder_constructor_imputation_value_error(self):
- with pytest.raises(ValueError):
- TargetEncoder(imputation_strategy="median")
-
- # Tests for attributes_attributes_to_dict and set_attributes_from_dict
- def test_target_encoder_attributes_to_dict(self):
- encoder = TargetEncoder()
-
- mapping_data = pd.Series(data=[0.333333, 0.50000, 0.666667],
- index=["negative", "neutral", "positive"])
- mapping_data.index.name = "variable"
-
- encoder._mapping["variable"] = mapping_data
-
- encoder._global_mean = 0.5
-
- actual = encoder.attributes_to_dict()
-
- expected = {"weight": 0.0,
- "imputation_strategy": "mean",
- "_global_mean": 0.5,
- "_mapping": {"variable": {
- "negative": 0.333333,
- "neutral": 0.50000,
- "positive": 0.666667
- }}}
-
- assert actual == expected
-
- @pytest.mark.parametrize("attribute",
- ["weight", "mapping"],
- ids=["test_weight", "test_mapping"])
- def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute):
- encoder = TargetEncoder()
-
- data = {"weight": 1.0}
- encoder.set_attributes_from_dict(data)
-
- if attribute == "weight":
- actual = encoder.weight
- expected = 1.0
-
- assert expected == actual
- elif attribute == "mapping":
- actual = encoder._mapping
- expected = {}
-
- assert expected == actual
-
- def test_target_encoder_set_attributes_from_dict(self):
- encoder = TargetEncoder()
-
- data = {"weight": 0.0,
- "_global_mean": 0.5,
- "_mapping": {"variable": {
- "negative": 0.333333,
- "neutral": 0.50000,
- "positive": 0.666667
- }}}
-
- encoder.set_attributes_from_dict(data)
-
- expected = pd.Series(data=[0.333333, 0.50000, 0.666667],
- index=["negative", "neutral", "positive"])
- expected.index.name = "variable"
-
- actual = encoder._mapping["variable"]
-
- pd.testing.assert_series_equal(actual, expected)
-
- # Tests for _fit_column:
- def test_target_encoder_fit_column_binary_classification(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral'],
- 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
-
- encoder = TargetEncoder()
- encoder._global_mean = 0.5
- actual = encoder._fit_column(X=df.variable, y=df.target)
-
- expected = pd.Series(data=[0.333333, 0.50000, 0.666667],
- index=["negative", "neutral", "positive"])
- expected.index.name = "variable"
-
- pd.testing.assert_series_equal(actual, expected)
-
- def test_target_encoder_fit_column_linear_regression(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral', 'positive'],
- 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
-
- encoder = TargetEncoder()
- encoder._global_mean = 0.454545
- actual = encoder._fit_column(X=df.variable, y=df.target)
-
- expected = pd.Series(data=[-4.666667, 0.250000, 4.500000],
- index=["negative", "neutral", "positive"])
- expected.index.name = "variable"
-
- pd.testing.assert_series_equal(actual, expected)
-
- def test_target_encoder_fit_column_global_mean_binary_classification(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral'],
- 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
-
- encoder = TargetEncoder(weight=1)
- encoder._global_mean = df.target.sum() / df.target.count() # is 0.5
-
- actual = encoder._fit_column(X=df.variable, y=df.target)
-
- expected = pd.Series(data=[0.375, 0.500, 0.625],
- index=["negative", "neutral", "positive"])
- expected.index.name = "variable"
-
- pd.testing.assert_series_equal(actual, expected)
-
- def test_target_encoder_fit_column_global_mean_linear_regression(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral', 'positive'],
- 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
-
- encoder = TargetEncoder(weight=1)
- encoder._global_mean = 0.454545
-
- actual = encoder._fit_column(X=df.variable, y=df.target)
-
- # expected new value:
- # [count of the value * its mean encoding + weight (= 1) * global mean]
- # / [count of the value + weight (=1)].
- expected = pd.Series(data=[(3 * -4.666667 + 1 * 0.454545) / (3 + 1),
- (4 * 0.250000 + 1 * 0.454545) / (4 + 1),
- (4 * 4.500000 + 1 * 0.454545) / (4 + 1)],
- index=["negative", "neutral", "positive"])
- expected.index.name = "variable"
-
- pd.testing.assert_series_equal(actual, expected)
-
- # Tests for fit method
- def test_target_encoder_fit_binary_classification(self):
- # test_target_encoder_fit_column_linear_regression() tested on one
- # column input as a numpy series; this test runs on a dataframe input.
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral'],
- 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
-
- encoder = TargetEncoder()
- encoder.fit(data=df, column_names=["variable"], target_column="target")
-
- expected = pd.Series(data=[0.333333, 0.50000, 0.666667],
- index=["negative", "neutral", "positive"])
- expected.index.name = "variable"
- actual = encoder._mapping["variable"]
-
- pd.testing.assert_series_equal(actual, expected)
-
- def test_target_encoder_fit_linear_regression(self):
- # test_target_encoder_fit_column_linear_regression() tested on one
- # column input as a numpy series; this test runs on a dataframe input.
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral', 'positive'],
- 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
-
- encoder = TargetEncoder()
- encoder.fit(data=df, column_names=["variable"], target_column="target")
-
- expected = pd.Series(data=[-4.666667, 0.250000, 4.500000],
- index=["negative", "neutral", "positive"])
- expected.index.name = "variable"
- actual = encoder._mapping["variable"]
-
- pd.testing.assert_series_equal(actual, expected)
-
- # Tests for transform method
- def test_target_encoder_transform_when_not_fitted(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral'],
- 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
-
- # inputs of TargetEncoder will be of dtype category
- df["variable"] = df["variable"].astype("category")
-
- encoder = TargetEncoder()
- with pytest.raises(NotFittedError):
- encoder.transform(data=df, column_names=["variable"])
-
- def test_target_encoder_transform_binary_classification(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral'],
- 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
-
- # inputs of TargetEncoder will be of dtype category
- df["variable"] = df["variable"].astype("category")
-
- expected = df.copy()
- expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000,
- 0.333333, 0.666667, 0.333333, 0.50000,
- 0.50000, 0.50000]
-
- encoder = TargetEncoder()
- encoder.fit(data=df, column_names=["variable"], target_column="target")
- actual = encoder.transform(data=df, column_names=["variable"])
-
- pd.testing.assert_frame_equal(actual, expected)
-
- def test_target_encoder_transform_linear_regression(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral', 'positive'],
- 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
-
- # inputs of TargetEncoder will be of dtype category
- df["variable"] = df["variable"].astype("category")
-
- expected = df.copy()
- expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000,
- -4.666667, 4.500000, -4.666667, 0.250000,
- 0.250000, 0.250000, 4.500000]
-
- encoder = TargetEncoder()
- encoder.fit(data=df, column_names=["variable"], target_column="target")
- actual = encoder.transform(data=df, column_names=["variable"])
-
- pd.testing.assert_frame_equal(actual, expected)
-
- def test_target_encoder_transform_new_category_binary_classification(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral'],
- 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
-
- df_appended = df.append({"variable": "new", "target": 1},
- ignore_index=True)
-
- # inputs of TargetEncoder will be of dtype category
- df["variable"] = df["variable"].astype("category")
- df_appended["variable"] = df_appended["variable"].astype("category")
-
- expected = df_appended.copy()
- expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000,
- 0.333333, 0.666667, 0.333333, 0.50000,
- 0.50000, 0.50000, 0.333333]
-
- encoder = TargetEncoder(imputation_strategy="min")
- encoder.fit(data=df, column_names=["variable"], target_column="target")
- actual = encoder.transform(data=df_appended, column_names=["variable"])
-
- pd.testing.assert_frame_equal(actual, expected)
-
- def test_target_encoder_transform_new_category_linear_regression(self):
- df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
- 'neutral', 'negative', 'positive',
- 'negative', 'neutral', 'neutral',
- 'neutral', 'positive'],
- 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
-
- df_appended = df.append({"variable": "new", "target": 10},
- ignore_index=True)
-
- # inputs of TargetEncoder will be of dtype category
- df["variable"] = df["variable"].astype("category")
- df_appended["variable"] = df_appended["variable"].astype("category")
-
- expected = df_appended.copy()
- expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000,
- -4.666667, 4.500000, -4.666667, 0.250000,
- 0.250000, 0.250000, 4.500000,
- -4.666667] # min imputation for new value
-
- encoder = TargetEncoder(imputation_strategy="min")
- encoder.fit(data=df, column_names=["variable"], target_column="target")
- actual = encoder.transform(data=df_appended, column_names=["variable"])
-
- pd.testing.assert_frame_equal(actual, expected)
-
- # Tests for _clean_column_name:
- def test_target_encoder_clean_column_name_binned_column(self):
- column_name = "test_column_bin"
- expected = "test_column_enc"
-
- encoder = TargetEncoder()
- actual = encoder._clean_column_name(column_name)
-
- assert actual == expected
-
- def test_target_encoder_clean_column_name_processed_column(self):
- column_name = "test_column_processed"
- expected = "test_column_enc"
-
- encoder = TargetEncoder()
- actual = encoder._clean_column_name(column_name)
-
- assert actual == expected
-
- def test_target_encoder_clean_column_name_cleaned_column(self):
- column_name = "test_column_cleaned"
- expected = "test_column_enc"
-
- encoder = TargetEncoder()
- actual = encoder._clean_column_name(column_name)
-
- assert actual == expected
-
- def test_target_encoder_clean_column_other_name(self):
- column_name = "test_column"
- expected = "test_column_enc"
-
- encoder = TargetEncoder()
- actual = encoder._clean_column_name(column_name)
-
- assert actual == expected
+import pytest
+import pandas as pd
+from sklearn.exceptions import NotFittedError
+
+from cobra.preprocessing.target_encoder import TargetEncoder
+
+class TestTargetEncoder:
+
+ def test_target_encoder_constructor_weight_value_error(self):
+ with pytest.raises(ValueError):
+ TargetEncoder(weight=-1)
+
+ def test_target_encoder_constructor_imputation_value_error(self):
+ with pytest.raises(ValueError):
+ TargetEncoder(imputation_strategy="median")
+
+    # Tests for attributes_to_dict and set_attributes_from_dict
+ def test_target_encoder_attributes_to_dict(self):
+ encoder = TargetEncoder()
+
+ mapping_data = pd.Series(data=[0.333333, 0.50000, 0.666667],
+ index=["negative", "neutral", "positive"])
+ mapping_data.index.name = "variable"
+
+ encoder._mapping["variable"] = mapping_data
+
+ encoder._global_mean = 0.5
+
+ actual = encoder.attributes_to_dict()
+
+ expected = {"weight": 0.0,
+ "imputation_strategy": "mean",
+ "_global_mean": 0.5,
+ "_mapping": {"variable": {
+ "negative": 0.333333,
+ "neutral": 0.50000,
+ "positive": 0.666667
+ }}}
+
+ assert actual == expected
+
+ @pytest.mark.parametrize("attribute",
+ ["weight", "mapping"],
+ ids=["test_weight", "test_mapping"])
+ def test_target_encoder_set_attributes_from_dict_unfitted(self, attribute):
+ encoder = TargetEncoder()
+
+ data = {"weight": 1.0}
+ encoder.set_attributes_from_dict(data)
+
+ if attribute == "weight":
+ actual = encoder.weight
+ expected = 1.0
+
+ assert expected == actual
+ elif attribute == "mapping":
+ actual = encoder._mapping
+ expected = {}
+
+ assert expected == actual
+
+ def test_target_encoder_set_attributes_from_dict(self):
+ encoder = TargetEncoder()
+
+ data = {"weight": 0.0,
+ "_global_mean": 0.5,
+ "_mapping": {"variable": {
+ "negative": 0.333333,
+ "neutral": 0.50000,
+ "positive": 0.666667
+ }}}
+
+ encoder.set_attributes_from_dict(data)
+
+ expected = pd.Series(data=[0.333333, 0.50000, 0.666667],
+ index=["negative", "neutral", "positive"])
+ expected.index.name = "variable"
+
+ actual = encoder._mapping["variable"]
+
+ pd.testing.assert_series_equal(actual, expected)
+
+ # Tests for _fit_column:
+ def test_target_encoder_fit_column_binary_classification(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral'],
+ 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
+
+ encoder = TargetEncoder()
+ encoder._global_mean = 0.5
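+        # _global_mean is normally computed inside fit(); it is set by hand here
+        # because _fit_column is called directly.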
+ actual = encoder._fit_column(X=df.variable, y=df.target)
+
+ expected = pd.Series(data=[0.333333, 0.50000, 0.666667],
+ index=["negative", "neutral", "positive"])
+ expected.index.name = "variable"
+
+ pd.testing.assert_series_equal(actual, expected)
+
+ def test_target_encoder_fit_column_linear_regression(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral', 'positive'],
+ 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
+
+ encoder = TargetEncoder()
+ encoder._global_mean = 0.454545
+ actual = encoder._fit_column(X=df.variable, y=df.target)
+
+ expected = pd.Series(data=[-4.666667, 0.250000, 4.500000],
+ index=["negative", "neutral", "positive"])
+ expected.index.name = "variable"
+
+ pd.testing.assert_series_equal(actual, expected)
+
+ def test_target_encoder_fit_column_global_mean_binary_classification(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral'],
+ 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
+
+ encoder = TargetEncoder(weight=1)
+ encoder._global_mean = df.target.sum() / df.target.count() # is 0.5
+
+ actual = encoder._fit_column(X=df.variable, y=df.target)
+
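+        # With weight=1 each category mean is shrunk towards the global mean,
+        # e.g. "negative" occurs 3x with category mean 1/3, giving
+        # (3 * 1/3 + 1 * 0.5) / (3 + 1) = 0.375.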
+ expected = pd.Series(data=[0.375, 0.500, 0.625],
+ index=["negative", "neutral", "positive"])
+ expected.index.name = "variable"
+
+ pd.testing.assert_series_equal(actual, expected)
+
+ def test_target_encoder_fit_column_global_mean_linear_regression(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral', 'positive'],
+ 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
+
+ encoder = TargetEncoder(weight=1)
+ encoder._global_mean = 0.454545
+
+ actual = encoder._fit_column(X=df.variable, y=df.target)
+
+ # expected new value:
+ # [count of the value * its mean encoding + weight (= 1) * global mean]
+ # / [count of the value + weight (=1)].
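+        # e.g. "negative": (3 * -4.666667 + 1 * 0.454545) / (3 + 1) ≈ -3.386364.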
+ expected = pd.Series(data=[(3 * -4.666667 + 1 * 0.454545) / (3 + 1),
+ (4 * 0.250000 + 1 * 0.454545) / (4 + 1),
+ (4 * 4.500000 + 1 * 0.454545) / (4 + 1)],
+ index=["negative", "neutral", "positive"])
+ expected.index.name = "variable"
+
+ pd.testing.assert_series_equal(actual, expected)
+
+ # Tests for fit method
+ def test_target_encoder_fit_binary_classification(self):
+        # test_target_encoder_fit_column_binary_classification() tested the same
+        # logic on a single column passed as a pandas Series; this test runs on
+        # a dataframe input.
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral'],
+ 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
+
+ encoder = TargetEncoder()
+ encoder.fit(data=df, column_names=["variable"], target_column="target")
+
+ expected = pd.Series(data=[0.333333, 0.50000, 0.666667],
+ index=["negative", "neutral", "positive"])
+ expected.index.name = "variable"
+ actual = encoder._mapping["variable"]
+
+ pd.testing.assert_series_equal(actual, expected)
+
+ def test_target_encoder_fit_linear_regression(self):
+        # test_target_encoder_fit_column_linear_regression() tested the same
+        # logic on a single column passed as a pandas Series; this test runs on
+        # a dataframe input.
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral', 'positive'],
+ 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
+
+ encoder = TargetEncoder()
+ encoder.fit(data=df, column_names=["variable"], target_column="target")
+
+ expected = pd.Series(data=[-4.666667, 0.250000, 4.500000],
+ index=["negative", "neutral", "positive"])
+ expected.index.name = "variable"
+ actual = encoder._mapping["variable"]
+
+ pd.testing.assert_series_equal(actual, expected)
+
+ # Tests for transform method
+ def test_target_encoder_transform_when_not_fitted(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral'],
+ 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
+
+ # inputs of TargetEncoder will be of dtype category
+ df["variable"] = df["variable"].astype("category")
+
+ encoder = TargetEncoder()
+ with pytest.raises(NotFittedError):
+ encoder.transform(data=df, column_names=["variable"])
+
+ def test_target_encoder_transform_binary_classification(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral'],
+ 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
+
+ # inputs of TargetEncoder will be of dtype category
+ df["variable"] = df["variable"].astype("category")
+
+ expected = df.copy()
+ expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000,
+ 0.333333, 0.666667, 0.333333, 0.50000,
+ 0.50000, 0.50000]
+
+ encoder = TargetEncoder()
+ encoder.fit(data=df, column_names=["variable"], target_column="target")
+ actual = encoder.transform(data=df, column_names=["variable"])
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ def test_target_encoder_transform_linear_regression(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral', 'positive'],
+ 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
+
+ # inputs of TargetEncoder will be of dtype category
+ df["variable"] = df["variable"].astype("category")
+
+ expected = df.copy()
+ expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000,
+ -4.666667, 4.500000, -4.666667, 0.250000,
+ 0.250000, 0.250000, 4.500000]
+
+ encoder = TargetEncoder()
+ encoder.fit(data=df, column_names=["variable"], target_column="target")
+ actual = encoder.transform(data=df, column_names=["variable"])
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ def test_target_encoder_transform_new_category_binary_classification(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral'],
+ 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
+
+ df_appended = df.append({"variable": "new", "target": 1},
+ ignore_index=True)
+
+ # inputs of TargetEncoder will be of dtype category
+ df["variable"] = df["variable"].astype("category")
+ df_appended["variable"] = df_appended["variable"].astype("category")
+
+ expected = df_appended.copy()
+ expected["variable_enc"] = [0.666667, 0.666667, 0.333333, 0.50000,
+ 0.333333, 0.666667, 0.333333, 0.50000,
+ 0.50000, 0.50000, 0.333333]
+
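+        # With imputation_strategy="min", the unseen "new" category is encoded
+        # with the minimum fitted mapping value: min(0.333333, 0.5, 0.666667).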
+ encoder = TargetEncoder(imputation_strategy="min")
+ encoder.fit(data=df, column_names=["variable"], target_column="target")
+ actual = encoder.transform(data=df_appended, column_names=["variable"])
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ def test_target_encoder_transform_new_category_linear_regression(self):
+ df = pd.DataFrame({'variable': ['positive', 'positive', 'negative',
+ 'neutral', 'negative', 'positive',
+ 'negative', 'neutral', 'neutral',
+ 'neutral', 'positive'],
+ 'target': [5, 4, -5, 0, -4, 5, -5, 0, 1, 0, 4]})
+
+ df_appended = df.append({"variable": "new", "target": 10},
+ ignore_index=True)
+
+ # inputs of TargetEncoder will be of dtype category
+ df["variable"] = df["variable"].astype("category")
+ df_appended["variable"] = df_appended["variable"].astype("category")
+
+ expected = df_appended.copy()
+ expected["variable_enc"] = [4.500000, 4.500000, -4.666667, 0.250000,
+ -4.666667, 4.500000, -4.666667, 0.250000,
+ 0.250000, 0.250000, 4.500000,
+ -4.666667] # min imputation for new value
+
+ encoder = TargetEncoder(imputation_strategy="min")
+ encoder.fit(data=df, column_names=["variable"], target_column="target")
+ actual = encoder.transform(data=df_appended, column_names=["variable"])
+
+ pd.testing.assert_frame_equal(actual, expected)
+
+ # Tests for _clean_column_name:
+ def test_target_encoder_clean_column_name_binned_column(self):
+ column_name = "test_column_bin"
+ expected = "test_column_enc"
+
+ encoder = TargetEncoder()
+ actual = encoder._clean_column_name(column_name)
+
+ assert actual == expected
+
+ def test_target_encoder_clean_column_name_processed_column(self):
+ column_name = "test_column_processed"
+ expected = "test_column_enc"
+
+ encoder = TargetEncoder()
+ actual = encoder._clean_column_name(column_name)
+
+ assert actual == expected
+
+ def test_target_encoder_clean_column_name_cleaned_column(self):
+ column_name = "test_column_cleaned"
+ expected = "test_column_enc"
+
+ encoder = TargetEncoder()
+ actual = encoder._clean_column_name(column_name)
+
+ assert actual == expected
+
+ def test_target_encoder_clean_column_other_name(self):
+ column_name = "test_column"
+ expected = "test_column_enc"
+
+ encoder = TargetEncoder()
+ actual = encoder._clean_column_name(column_name)
+
+ assert actual == expected