From e2b17027866928f09d2d4fb677801f2afb21712c Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Fri, 12 May 2023 12:03:41 +0200
Subject: [PATCH 1/3] chore: refactor compute_univariate_preselection

---
 cobra/model_building/univariate_selection.py | 99 ++++++++++---------
 .../test_univariate_selection.py             | 21 ++--
 2 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index 2db4abb..72baaa3 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -65,66 +65,69 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     result = []
 
     if model_type == "classification":
-        for predictor in predictors:
-
-            cleaned_predictor = utils.clean_predictor_name(predictor)
-
-            auc_train = roc_auc_score(
-                y_true=target_enc_train_data[target_column],
-                y_score=target_enc_train_data[predictor])
-
-            auc_selection = roc_auc_score(
-                y_true=target_enc_selection_data[target_column],
-                y_score=target_enc_selection_data[predictor])
-
-            result.append({"predictor": cleaned_predictor,
-                           "AUC train": auc_train,
-                           "AUC selection": auc_selection})
-
-        df_auc = pd.DataFrame(result)
-
+        scoring_method = roc_auc_score
+        kwargs = {}
+        scoring_method_str = "AUC"
+    else:
+        scoring_method = mean_squared_error
+        kwargs = {"squared": False}
+        scoring_method_str = "RMSE"
+
+    for predictor in predictors:
+        cleaned_predictor = utils.clean_predictor_name(predictor)
+
+        score_train = scoring_method(
+            target_enc_train_data[target_column],
+            target_enc_train_data[predictor],
+            **kwargs
+        )
+
+        score_selection = scoring_method(
+            target_enc_selection_data[target_column],
+            target_enc_selection_data[predictor],
+            **kwargs
+        )
+
+        result.append(
+            {
+                "predictor": cleaned_predictor,
+                f"{scoring_method_str} train": score_train,
+                f"{scoring_method_str} selection": score_selection
+            }
+        )
+
+    df_score = pd.DataFrame(result)
+
+    # TODO: This should be `if scoring method is error based` instead of classification vs regression
+    if model_type == "classification":
         # Filter based on min. AUC
-        auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold
+        score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] > preselect_auc_threshold
 
         # Identify those variables for which the AUC difference between train
         # and selection is within a user-defined ratio
-        auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"])
-                         < preselect_overtrain_threshold)
+        score_overtrain = (
+            (df_score[f"{scoring_method_str} train"] - df_score[f"{scoring_method_str} selection"])
+            < preselect_overtrain_threshold
+        )
 
-        df_auc["preselection"] = auc_thresh & auc_overtrain
-
-        df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True)
-
-    elif model_type == "regression":
-        for predictor in predictors:
-            cleaned_predictor = utils.clean_predictor_name(predictor)
-
-            rmse_train = sqrt(mean_squared_error(
-                y_true=target_enc_train_data[target_column],
-                y_pred=target_enc_train_data[predictor]))
-
-            rmse_selection = sqrt(mean_squared_error(
-                y_true=target_enc_selection_data[target_column],
-                y_pred=target_enc_selection_data[predictor]))
-
-            result.append({"predictor": cleaned_predictor,
-                           "RMSE train": rmse_train,
-                           "RMSE selection": rmse_selection})
-
-        df_rmse = pd.DataFrame(result)
+        df_score["preselection"] = score_thresh & score_overtrain
+
+        df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=False).reset_index(drop=True)
+    else:
+        # What if they fill in something else than `regression`?
         # Filter based on max. RMSE
-        rmse_thresh = df_rmse.loc[:, "RMSE selection"] < preselect_rmse_threshold
+        score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] < preselect_rmse_threshold
 
         # Identify those variables for which the RMSE difference between train
         # and selection is within a user-defined ratio
-        rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"])  # flip subtraction vs. AUC
-                          < preselect_overtrain_threshold)
-
-        df_rmse["preselection"] = rmse_thresh & rmse_overtrain
+        score_overtrain = (
+            (df_score[f"{scoring_method_str} selection"] - df_score[f"{scoring_method_str} train"])  # flip subtraction vs. AUC
+            < preselect_overtrain_threshold
+        )
 
-        df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True)  # lower is better
+        df_score["preselection"] = score_thresh & score_overtrain
+        df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=True).reset_index(drop=True)  # lower is better
 
     return df_out
 
 def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py
index c69a4de..8bd16cc 100644
--- a/tests/model_building/test_univariate_selection.py
+++ b/tests/model_building/test_univariate_selection.py
@@ -1,8 +1,11 @@
 import pandas as pd
+import pytest
 
 from cobra.model_building import univariate_selection
 
 
+
+@pytest.fixture
 def mock_data():
     return pd.DataFrame({"var1_enc": [0.42] * 10,
                          "var2_enc": [0.94] * 10,
@@ -10,9 +13,8 @@
 
 
 class TestUnivariateSelection:
 
-    def test_preselection_classification(self):
-
-        X = mock_data()
+    def test_preselection_classification(self, mock_data: pd.DataFrame):
+        X = mock_data
         y = pd.DataFrame([1] * 5 + [0] * 5, columns=["target"])
         basetable = pd.concat([y, X], axis=1)
@@ -29,14 +31,11 @@ def test_preselection_classification(self):
 
         assert all(c in df_auc.columns for c in ["AUC train", "AUC selection"])
 
-        preselected_predictors = (univariate_selection
-                                  .get_preselected_predictors(df_auc))
-
+        preselected_predictors = univariate_selection.get_preselected_predictors(df_auc)
         assert preselected_predictors == ["var1_enc", "var2_enc", "var3_enc"]
 
-    def test_preselection_regression(self):
-
-        X = mock_data()
+    def test_preselection_regression(self, mock_data: pd.DataFrame):
+        X = mock_data
         y = pd.DataFrame([6.0, 9.0, 4.2, 5.5, 0.7, 1.9, 8.7, 8.0, 2.0, 7.2],
                          columns=["target"])
         basetable = pd.concat([y, X], axis=1)
@@ -53,7 +52,5 @@ def test_preselection_regression(self):
 
         assert all(c in df_rmse.columns for c in ["RMSE train", "RMSE selection"])
 
-        preselected_predictors = (univariate_selection
-                                  .get_preselected_predictors(df_rmse))
-
+        preselected_predictors = univariate_selection.get_preselected_predictors(df_rmse)
         assert preselected_predictors == ["var2_enc", "var3_enc"]

From 17f49c7cc5410a428cab60833e390c2dd69590a7 Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Fri, 12 May 2023 12:04:36 +0200
Subject: [PATCH 2/3] chore: rearrange newlines

---
 cobra/model_building/univariate_selection.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index 72baaa3..99f4567 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -1,10 +1,10 @@
 import pandas as pd
 
 from sklearn.metrics import roc_auc_score, mean_squared_error
-from numpy import sqrt
 
 import cobra.utils as utils
 
+
 def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
                                     target_enc_selection_data: pd.DataFrame,
                                     predictors: list,
@@ -99,6 +99,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     df_score = pd.DataFrame(result)
 
     # TODO: This should be `if scoring method is error based` instead of classification vs regression
+    # This opens the door to customised scoring methods
     if model_type == "classification":
         # Filter based on min. AUC
         score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] > preselect_auc_threshold
@@ -130,6 +131,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
         df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=True).reset_index(drop=True)  # lower is better
 
     return df_out
 
+
 def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
     """Wrapper function to extract a list of predictors from df_metric.
@@ -157,6 +159,7 @@ def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
 
     return [col + "_enc" for col in predictor_list]
 
+
 def compute_correlations(target_enc_train_data: pd.DataFrame,
                          predictors: list) -> pd.DataFrame:
     """Given a DataFrame and a list of predictors, compute the correlations

From 9c4772b636383e86085d9849561545d13aa577dd Mon Sep 17 00:00:00 2001
From: ZlaTanskY
Date: Fri, 26 May 2023 15:56:12 +0200
Subject: [PATCH 3/3] chore: create 2 methods to filter based on error or score based methods

---
 cobra/model_building/univariate_selection.py | 76 +++++++++--------
 .../test_univariate_selection.py             | 82 +++++++++++++++++++
 2 files changed, 124 insertions(+), 34 deletions(-)

diff --git a/cobra/model_building/univariate_selection.py b/cobra/model_building/univariate_selection.py
index 99f4567..7f2ae94 100644
--- a/cobra/model_building/univariate_selection.py
+++ b/cobra/model_building/univariate_selection.py
@@ -5,15 +5,16 @@
 import cobra.utils as utils
 
 
-def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
-                                    target_enc_selection_data: pd.DataFrame,
-                                    predictors: list,
-                                    target_column: str,
-                                    model_type: str = "classification",
-                                    preselect_auc_threshold: float = 0.053,
-                                    preselect_rmse_threshold: float = 5,
-                                    preselect_overtrain_threshold: float = 0.05
-                                    ) -> pd.DataFrame:
+def compute_univariate_preselection(
+    target_enc_train_data: pd.DataFrame,
+    target_enc_selection_data: pd.DataFrame,
+    predictors: list,
+    target_column: str,
+    model_type: str = "classification",
+    preselect_auc_threshold: float = 0.053,
+    preselect_rmse_threshold: float = 5,
+    preselect_overtrain_threshold: float = 0.05
+) -> pd.DataFrame:
     """Perform a preselection of predictors based on an AUC (in case of
     classification) or a RMSE (in case of regression) threshold of a
     univariate model on a train and selection dataset and return a DataFrame
@@ -64,6 +65,7 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     """
     result = []
 
+    # TODO: Change this to `if is_error_metric` or similar
    if model_type == "classification":
         scoring_method = roc_auc_score
         kwargs = {}
@@ -98,37 +100,43 @@ def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
     df_score = pd.DataFrame(result)
 
-    # TODO: This should be `if scoring method is error based` instead of classification vs regression
+    # TODO: This should be `if error_metric` instead of classification vs regression
     # This opens the door to customised scoring methods
     if model_type == "classification":
-        # Filter based on min. AUC
-        score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] > preselect_auc_threshold
-
-        # Identify those variables for which the AUC difference between train
-        # and selection is within a user-defined ratio
-        score_overtrain = (
-            (df_score[f"{scoring_method_str} train"] - df_score[f"{scoring_method_str} selection"])
-            < preselect_overtrain_threshold
-        )
-
-        df_score["preselection"] = score_thresh & score_overtrain
-
-        df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=False).reset_index(drop=True)
+        df_out = filter_preselection_score_based(df_score, preselect_auc_threshold, preselect_overtrain_threshold, scoring_method_str)
     else:
         # What if they fill in something else than `regression`?
-        # Filter based on max. RMSE
-        score_thresh = df_score.loc[:, f"{scoring_method_str} selection"] < preselect_rmse_threshold
-
-        # Identify those variables for which the RMSE difference between train
-        # and selection is within a user-defined ratio
-        score_overtrain = (
-            (df_score[f"{scoring_method_str} selection"] - df_score[f"{scoring_method_str} train"])  # flip subtraction vs. AUC
-            < preselect_overtrain_threshold
-        )
+        df_out = filter_preselection_error_based(df_score, preselect_rmse_threshold, preselect_overtrain_threshold, scoring_method_str)
+    return df_out
+
+
+def filter_preselection_error_based(df: pd.DataFrame, preselect_threshold: float, preselect_overtrain: float, scoring_method: str) -> pd.DataFrame:
+    """Filter the dataframe based on the given thresholds for error-based metrics."""
+    score_thresh = df.loc[:, f"{scoring_method} selection"] < preselect_threshold
+
+    # Identify those variables for which the error metric difference between train
+    # and selection is within a user-defined ratio
+    score_overtrain = (
+        (df[f"{scoring_method} selection"] - df[f"{scoring_method} train"])
+        < preselect_overtrain
+    )
+    df["preselection"] = score_thresh & score_overtrain
+    df_out = df.sort_values(by=f"{scoring_method} selection", ascending=True).reset_index(drop=True)
+    return df_out
+
 
-        df_score["preselection"] = score_thresh & score_overtrain
+def filter_preselection_score_based(df: pd.DataFrame, preselect_threshold: float, preselect_overtrain: float, scoring_method: str) -> pd.DataFrame:
+    """Filter the dataframe based on the given thresholds for scoring-based metrics."""
+    score_thresh = df.loc[:, f"{scoring_method} selection"] > preselect_threshold
 
-        df_out = df_score.sort_values(by=f"{scoring_method_str} selection", ascending=True).reset_index(drop=True)  # lower is better
+    # Identify those variables for which the score difference between train
+    # and selection is within a user-defined ratio
+    score_overtrain = (
+        (df[f"{scoring_method} train"] - df[f"{scoring_method} selection"])
+        < preselect_overtrain
+    )
+    df["preselection"] = score_thresh & score_overtrain
+    df_out = df.sort_values(by=f"{scoring_method} selection", ascending=False).reset_index(drop=True)
     return df_out
diff --git a/tests/model_building/test_univariate_selection.py b/tests/model_building/test_univariate_selection.py
index 8bd16cc..f119ec5 100644
--- a/tests/model_building/test_univariate_selection.py
+++ b/tests/model_building/test_univariate_selection.py
@@ -54,3 +54,85 @@ def test_preselection_regression(self, mock_data: pd.DataFrame):
 
         preselected_predictors = univariate_selection.get_preselected_predictors(df_rmse)
         assert preselected_predictors == ["var2_enc", "var3_enc"]
+
+    def test_filter_preselection_error_based(self):
+        """Test filtering preselection data for an error-based metric."""
+        test_input = pd.DataFrame(
+            [
+                [0.1, 0.1],
+                [0.2, 0.2],
+                [0.3, 0.6],
+                [0.4, 0.4],
+                [0.5, 0.5],
+                [0.6, 0.6],
+                [0.7, 0.7],
+                [0.8, 0.8],
+                [0.9, 0.9],
+                [1.0, 1.0],
+            ],
+            columns=["RMSE train", "RMSE selection"]
+        )
+        result = univariate_selection.filter_preselection_error_based(
+            test_input,
+            preselect_threshold=0.65,
+            preselect_overtrain=0.2,
+            scoring_method="RMSE"
+        )
+
+        target = pd.DataFrame(
+            [
+                [0.1, 0.1, True],
+                [0.2, 0.2, True],
+                [0.4, 0.4, True],
+                [0.5, 0.5, True],
+                [0.3, 0.6, False],
+                [0.6, 0.6, True],
+                [0.7, 0.7, False],
+                [0.8, 0.8, False],
+                [0.9, 0.9, False],
+                [1.0, 1.0, False],
+            ],
+            columns=["RMSE train", "RMSE selection", "preselection"]
+        )
+        assert target.equals(result)
+
+    def test_filter_preselection_score_based(self):
+        """Test filtering preselection data for a score-based metric."""
+        test_input = pd.DataFrame(
+            [
+                [0.1, 0.1],
+                [0.2, 0.2],
+                [0.3, 0.6],
+                [0.4, 0.4],
+                [0.5, 0.5],
+                [0.6, 0.6],
+                [0.7, 0.7],
+                [0.8, 0.8],
+                [0.9, 0.9],
+                [1.0, 0.7],
+            ],
+            columns=["AUC train", "AUC selection"]
+        )
+        result = univariate_selection.filter_preselection_score_based(
+            test_input,
+            preselect_threshold=0.65,
+            preselect_overtrain=0.2,
+            scoring_method="AUC"
+        )
+
+        target = pd.DataFrame(
+            [
+                [0.9, 0.9, True],
+                [0.8, 0.8, True],
+                [0.7, 0.7, True],
+                [1.0, 0.7, False],
+                [0.3, 0.6, False],
+                [0.6, 0.6, False],
+                [0.5, 0.5, False],
+                [0.4, 0.4, False],
+                [0.2, 0.2, False],
+                [0.1, 0.1, False],
+            ],
+            columns=["AUC train", "AUC selection", "preselection"]
+        )
+        assert target.equals(result)