From e2b17027866928f09d2d4fb677801f2afb21712c Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Fri, 12 May 2023 12:03:41 +0200
Subject: [PATCH 1/3] chore: refactor compute_univariate_preselection

---
 cobra/model_building/univariate_selection.py | 99 ++++++++++---------
 .../test_univariate_selection.py             | 21 ++--
 2 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index 2db4abb..72baaa3 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -65,66 +65,69 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     result = []
 
     if model_type == "classification":
-        for predictor in predictors:
-
-            cleaned_predictor = utils.clean_predictor_name(predictor)
-
-            auc_train = roc_auc_score(
-                y_true=target_enc_train_data[target_column],
-                y_score=target_enc_train_data[predictor])
-
-            auc_selection = roc_auc_score(
-                y_true=target_enc_selection_data[target_column],
-                y_score=target_enc_selection_data[predictor])
-
-            result.append({"predictor": cleaned_predictor,
-                           "AUC train": auc_train,
-                           "AUC selection": auc_selection})
-
-        df_auc = pd.DataFrame(result)
-
+        scoring_method = roc_auc_score
+        kwargs = {}
+        scoring_method_str = "AUC"
+    else:
+        scoring_method = mean_squared_error
+        kwargs = {"squared": False}
+        scoring_method_str = "RMSE"
+
+    for predictor in predictors:
+        cleaned_predictor = utils.clean_predictor_name(predictor)
+
+        score_train = scoring_method(
+            target_enc_train_data[target_column],
+            target_enc_train_data[predictor],
+            **kwargs
+        )
+
+        score_selection = scoring_method(
+            target_enc_selection_data[target_column],
+            target_enc_selection_data[predictor],
+            **kwargs
+        )
+
+        result.append(
+            {
+                "predictor": cleaned_predictor,
+                f"{scoring_method_str} train": score_train,
+                f"{scoring_method_str} selection": score_selection
+            }
+        )
+
+    df_score = pd.DataFrame(result)
+
+    # TODO: This should be `if scoring method is error based` instead of classification vs regression
+    if model_type == "classification":
         # Filter based on min. AUC
-        auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold
+        score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] > preselect_auc_threshold
 
         # Identify those variables for which the AUC difference between train
         # and selection is within a user-defined ratio
-        auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"])
-                         < preselect_overtrain_threshold)
+        score_overtrain = (
+            (df_score[f"{scoring_method_str} train"] - df_score[f"{scoring_method_str} selection"])
+            < preselect_overtrain_threshold
+        )
 
-        df_auc["preselection"] = auc_thresh & auc_overtrain
-
-        df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True)
-
-    elif model_type == "regression":
-        for predictor in predictors:
-            cleaned_predictor = utils.clean_predictor_name(predictor)
-
-            rmse_train = sqrt(mean_squared_error(
-                y_true=target_enc_train_data[target_column],
-                y_pred=target_enc_train_data[predictor]))
-
-            rmse_selection = sqrt(mean_squared_error(
-                y_true=target_enc_selection_data[target_column],
-                y_pred=target_enc_selection_data[predictor]))
-
-            result.append({"predictor": cleaned_predictor,
-                           "RMSE train": rmse_train,
-                           "RMSE selection": rmse_selection})
-
-        df_rmse = pd.DataFrame(result)
+        df_score["preselection"] = score_thresh & score_overtrain
+
+        df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=False).reset_index(drop=True)
+    else:
+        # What if they fill in something else than `regression`?
         # Filter based on max. RMSE
-        rmse_thresh = df_rmse.loc[:, "RMSE selection"] < preselect_rmse_threshold
+        score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] < preselect_rmse_threshold
 
         # Identify those variables for which the RMSE difference between train
         # and selection is within a user-defined ratio
-        rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"])  # flip subtraction vs. AUC
-                          < preselect_overtrain_threshold)
-
-        df_rmse["preselection"] = rmse_thresh & rmse_overtrain
+        score_overtrain = (
+            (df_score[f"{scoring_method_str} selection"] - df_score[f"{scoring_method_str} train"])  # flip subtraction vs. AUC
+            < preselect_overtrain_threshold
+        )
 
-        df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True)  # lower is better
+        df_score["preselection"] = score_thresh & score_overtrain
+        df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=True).reset_index(drop=True)  # lower is better
 
     return df_out
 
 def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py
index c69a4de..8bd16cc 100644
--- a/tests/model_building/test_univariate_selection.py
+++ b/tests/model_building/test_univariate_selection.py
@@ -1,8 +1,11 @@
 import pandas as pd
+import pytest
 
 from cobra.model_building import univariate_selection
 
 
+
+@pytest.fixture
 def mock_data():
     return pd.DataFrame({"var1_enc": [0.42] * 10,
                          "var2_enc": [0.94] * 10,
@@ -10,9 +13,8 @@
 
 
 class TestUnivariateSelection:
 
-    def test_preselection_classification(self):
-
-        X = mock_data()
+    def test_preselection_classification(self, mock_data: pd.DataFrame):
+        X = mock_data
         y = pd.DataFrame([1] * 5 + [0] * 5, columns=["target"])
         basetable = pd.concat([y, X], axis=1)
@@ -29,14 +31,11 @@ def test_preselection_classification(self):
 
         assert all(c in df_auc.columns for c in ["AUC train", "AUC selection"])
 
-        preselected_predictors = (univariate_selection
-                                  .get_preselected_predictors(df_auc))
-
+        preselected_predictors = univariate_selection.get_preselected_predictors(df_auc)
         assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"]
 
-    def test_preselection_regression(self):
-
-        X = mock_data()
+    def test_preselection_regression(self, mock_data: pd.DataFrame):
+        X = mock_data
         y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2],
                          columns=["target"])
         basetable = pd.concat([y, X], axis=1)
@@ -53,7 +52,5 @@ def test_preselection_regression(self):
 
         assert all(c in df_rmse.columns for c in ["RMSE train", "RMSE selection"])
 
-        preselected_predictors = (univariate_selection
-                                  .get_preselected_predictors(df_rmse))
-
+        preselected_predictors = univariate_selection.get_preselected_predictors(df_rmse)
         assert preselected_predictors == ["var2_enc", "var3_enc"]

From 17f49c7cc5410a428cab60833e390c2dd69590a7 Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Fri, 12 May 2023 12:04:36 +0200
Subject: [PATCH 2/3] chore: rearrange newlines

---
 cobra/model_building/univariate_selection.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index 72baaa3..99f4567 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -1,10 +1,10 @@
 import pandas as pd
 
 from sklearn.metrics import roc_auc_score, mean_squared_error
-from numpy import sqrt
 
 import cobra.utils as utils
 
+
 def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
                                     target_enc_selection_data: pd.DataFrame,
                                     predictors: list,
@@ -99,6 +99,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     df_score = pd.DataFrame(result)
 
     # TODO: This should be `if scoring method is error based` instead of classification vs regression
+    # This opens the door to customised scoring methods
     if model_type == "classification":
         # Filter based on min. AUC
         score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] > preselect_auc_threshold
@@ -130,6 +131,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
         df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=True).reset_index(drop=True)  # lower is better
 
     return df_out
 
+
 def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
     """Wrapper function to extract a list of predictors from df_metric.
@@ -157,6 +159,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
 
     return [col + "_enc" for col in predictor_list]
 
+
 def compute_correlations(target_enc_train_data: pd.DataFrame,
                          predictors: list) -> pd.DataFrame:
     """Given a DataFrame and a list of predictors, compute the correlations

From 9c4772b636383e86085d9849561545d13aa577dd Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Fri, 26 May 2023 15:56:12 +0200
Subject: [PATCH 3/3] chore: create 2 methods to filter based on error or score based methods

---
 cobra/model_building/univariate_selection.py | 76 +++++++++--------
 .../test_univariate_selection.py             | 82 +++++++++++++++++++
 2 files changed, 124 insertions(+), 34 deletions(-)

diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index 99f4567..7f2ae94 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -5,15 +5,16 @@
 import cobra.utils as utils
 
 
-def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
-                                    target_enc_selection_data: pd.DataFrame,
-                                    predictors: list,
-                                    target_column: str,
-                                    model_type: str = "classification",
-                                    preselect_auc_threshold: float = 0.053,
-                                    preselect_rmse_threshold: float = 5,
-                                    preselect_overtrain_threshold: float = 0.05
-                                    ) -> pd.DataFrame:
+def compute_univariate_preselection(
+    target_enc_train_data: pd.DataFrame,
+    target_enc_selection_data: pd.DataFrame,
+    predictors: list,
+    target_column: str,
+    model_type: str = "classification",
+    preselect_auc_threshold: float = 0.053,
+    preselect_rmse_threshold: float = 5,
+    preselect_overtrain_threshold: float = 0.05
+) -> pd.DataFrame:
     """Perform a preselection of predictors based on an AUC (in case of
     classification) or a RMSE (in case of regression) threshold of a
     univariate model on a train and selection dataset and return a DataFrame
@@ -64,6 +65,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     """
     result = []
 
+    # TODO: Change this to `if is_error_metric` or similar
    if model_type == "classification":
         scoring_method = roc_auc_score
         kwargs = {}
@@ -98,37 +100,43 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     df_score = pd.DataFrame(result)
 
-    # TODO: This should be `if scoring method is error based` instead of classification vs regression
+    # TODO: This should be `if error_metric` instead of classification vs regression
     # This opens the door to customised scoring methods
     if model_type == "classification":
-        # Filter based on min. AUC
-        score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] > preselect_auc_threshold
-
-        # Identify those variables for which the AUC difference between train
-        # and selection is within a user-defined ratio
-        score_overtrain = (
-            (df_score[f"{scoring_method_str} train"] - df_score[f"{scoring_method_str} selection"])
-            < preselect_overtrain_threshold
-        )
-
-        df_score["preselection"] = score_thresh & score_overtrain
-
-        df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=False).reset_index(drop=True)
+        df_out = filter_preselection_score_based(df_score, preselect_auc_threshold, preselect_overtrain_threshold, scoring_method_str)
     else:
         # What if they fill in something else than `regression`?
-        # Filter based on max. RMSE
-        score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] < preselect_rmse_threshold
-
-        # Identify those variables for which the RMSE difference between train
-        # and selection is within a user-defined ratio
-        score_overtrain = (
-            (df_score[f"{scoring_method_str} selection"] - df_score[f"{scoring_method_str} train"])  # flip subtraction vs. AUC
-            < preselect_overtrain_threshold
-        )
+        df_out = filter_preselection_error_based(df_score, preselect_rmse_threshold, preselect_overtrain_threshold, scoring_method_str)
+    return df_out
+
+
+def filter_preselection_error_based(df: pd.DataFrame, preselect_threshold: float, preselect_overtrain: float, scoring_method: str) -> pd.DataFrame:
+    """Filter the dataframe based on the given thresholds for error-based metrics."""
+    score_thresh = df.loc[:, f"{scoring_method} selection"] < preselect_threshold
+
+    # Identify those variables for which the error metric difference between train
+    # and selection is within a user-defined ratio
+    score_overtrain = (
+        (df[f"{scoring_method} selection"] - df[f"{scoring_method} train"])
+        < preselect_overtrain
+    )
+    df["preselection"] = score_thresh & score_overtrain
+    df_out = df.sort_values(by=f"{scoring_method} selection", ascending=True).reset_index(drop=True)
+    return df_out
+
 
-        df_score["preselection"] = score_thresh & score_overtrain
+def filter_preselection_score_based(df: pd.DataFrame, preselect_threshold: float, preselect_overtrain: float, scoring_method: str) -> pd.DataFrame:
+    """Filter the dataframe based on the given thresholds for scoring-based metrics."""
+    score_thresh = df.loc[:, f"{scoring_method} selection"] > preselect_threshold
 
-        df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=True).reset_index(drop=True)  # lower is better
+    # Identify those variables for which the score difference between train
+    # and selection is within a user-defined ratio
+    score_overtrain = (
+        (df[f"{scoring_method} train"] - df[f"{scoring_method} selection"])
+        < preselect_overtrain
+    )
+    df["preselection"] = score_thresh & score_overtrain
+    df_out = df.sort_values(by=f"{scoring_method} selection", ascending=False).reset_index(drop=True)
     return df_out
diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py
index 8bd16cc..f119ec5 100644
--- a/tests/model_building/test_univariate_selection.py
+++ b/tests/model_building/test_univariate_selection.py
@@ -54,3 +54,85 @@ def test_preselection_regression(self, mock_data: pd.DataFrame):
 
         preselected_predictors = univariate_selection.get_preselected_predictors(df_rmse)
         assert preselected_predictors == ["var2_enc", "var3_enc"]
+
+    def test_filter_preselection_error_based(self):
+        """Test filtering preselection data for an error-based metric."""
+        test_input = pd.DataFrame(
+            [
+                [0.1, 0.1],
+                [0.2, 0.2],
+                [0.3, 0.6],
+                [0.4, 0.4],
+                [0.5, 0.5],
+                [0.6, 0.6],
+                [0.7, 0.7],
+                [0.8, 0.8],
+                [0.9, 0.9],
+                [1.0, 1.0],
+            ],
+            columns=["RMSE train", "RMSE selection"]
+        )
+        result = univariate_selection.filter_preselection_error_based(
+            test_input,
+            preselect_threshold=0.65,
+            preselect_overtrain=0.2,
+            scoring_method="RMSE"
+        )
+
+        target = pd.DataFrame(
+            [
+                [0.1, 0.1, True],
+                [0.2, 0.2, True],
+                [0.4, 0.4, True],
+                [0.5, 0.5, True],
+                [0.3, 0.6, False],
+                [0.6, 0.6, True],
+                [0.7, 0.7, False],
+                [0.8, 0.8, False],
+                [0.9, 0.9, False],
+                [1.0, 1.0, False],
+            ],
+            columns=["RMSE train", "RMSE selection", "preselection"]
+        )
+        assert target.equals(result)
+
+    def test_filter_preselection_score_based(self):
+        """Test filtering preselection data for a score-based metric."""
+        test_input = pd.DataFrame(
+            [
+                [0.1, 0.1],
+                [0.2, 0.2],
+                [0.3, 0.6],
+                [0.4, 0.4],
+                [0.5, 0.5],
+                [0.6, 0.6],
+                [0.7, 0.7],
+                [0.8, 0.8],
+                [0.9, 0.9],
+                [1.0, 0.7],
+            ],
+            columns=["AUC train", "AUC selection"]
+        )
+        result = univariate_selection.filter_preselection_score_based(
+            test_input,
+            preselect_threshold=0.65,
+            preselect_overtrain=0.2,
+            scoring_method="AUC"
+        )
+
+        target = pd.DataFrame(
+            [
+                [0.9, 0.9, True],
+                [0.8, 0.8, True],
+                [0.7, 0.7, True],
+                [1.0, 0.7, False],
+                [0.3, 0.6, False],
+                [0.6, 0.6, False],
+                [0.5, 0.5, False],
+                [0.4, 0.4, False],
+                [0.2, 0.2, False],
+                [0.1, 0.1, False],
+            ],
+            columns=["AUC train", "AUC selection", "preselection"]
+        )
+        assert target.equals(result)