From 47327a30a8da69d7509daa737762ba38dec1c0aa Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Fri, 8 Apr 2022 09:49:38 +0200 Subject: [PATCH 1/4] Support evaluating and forward feature selecting with a custom metric. --- cobra/evaluation/evaluator.py | 20 ++- cobra/model_building/forward_selection.py | 136 +++++++++++--- cobra/model_building/models.py | 208 +++++++++++++++++----- 3 files changed, 283 insertions(+), 81 deletions(-) diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index b694a33..66073ff 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -154,7 +154,7 @@ def _compute_scalar_metrics(y_true: np.ndarray, "matthews_corrcoef": matthews_corrcoef(y_true, y_pred_b), "lift at {}".format(lift_at): np.round(ClassificationEvaluator ._compute_lift(y_true=y_true, - y_pred=y_pred, + y_score=y_pred, lift_at=lift_at), 2) }) @@ -493,7 +493,7 @@ def _compute_lift_per_bin(y_true: np.ndarray, """ lifts = [ClassificationEvaluator._compute_lift(y_true=y_true, - y_pred=y_pred, + y_score=y_pred, lift_at=perc_lift) for perc_lift in np.linspace(1/n_bins, 1, num=n_bins, endpoint=True)] @@ -502,15 +502,23 @@ def _compute_lift_per_bin(y_true: np.ndarray, return x_labels, lifts, y_true.mean() @staticmethod - def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, + def _compute_lift(y_true: np.ndarray, + y_score: np.ndarray, lift_at: float=0.05) -> float: - """Calculates lift given two arrays on specified level. + """ + Calculate the lift metric for evaluation of the classifier model, + given the ground truth labels and the prediction scores + (scores or probabilities indicating the likelihood of the observations + being positive). + The lift metric is computed at a certain top level percentage - meaning + the selection percentage of observations from the evaluation data set, + after ordering them on descending prediction score. Parameters ---------- y_true : np.ndarray True binary target data labels. - y_pred : np.ndarray + y_score : np.ndarray Target scores of the model. lift_at : float, optional At what top level percentage the lift should be computed. @@ -523,7 +531,7 @@ def _compute_lift(y_true: np.ndarray, y_pred: np.ndarray, # Make sure it is numpy array y_true_ = np.array(y_true) - y_pred_ = np.array(y_pred) + y_pred_ = np.array(y_score) # Make sure it has correct shape y_true_ = y_true_.reshape(len(y_true_), 1) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index 29e06b3..e3bfa08 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -14,10 +14,16 @@ class ForwardFeatureSelection: algorithm. Predictors are sequentially added to the model, starting with the one that - has the highest univariate predictive power, and then proceeding with those that - jointly lead to the best fit, optimizing for selection AUC or RMSE. Interaction - effects are not explicitly modeled, yet they are implicitly present given the - feature selection and the underlying feature correlation structure. + has the highest univariate predictive power, and then proceeding with those + that jointly lead to the best fit, optimizing (tuning) for model + performance on the selection set, measured with AUC (default for + classification), RMSE (default for regression) or a custom metric (when + passing the metric parameter and possibly also metric_args and + metric_kwargs.) 
+ + Interaction effects are not explicitly modeled, yet they are implicitly + present given the feature selection and the underlying feature + correlation structure. Attributes ---------- @@ -33,12 +39,61 @@ class ForwardFeatureSelection: Whether or not the model coefficients should all be positive (no sign flips). self._fitted_models : list List of fitted models. + metric : Callable (function), optional + Function that evaluates the model's performance, by calculating a + certain evaluation metric. + If the metric is not provided, the default metric AUC is used for + evaluating the model. + The metric functions from sklearn can be used, see + https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. + You can also pass a custom function. + Examples for classification: + - ClassificationEvaluator._compute_lift(y_true=y_true, + y_score=y_pred, + lift_at=0.05) + - return_on_investment(y_true, y_pred, + cost_of_letter=2.10, + success_rate_of_letter=0.25, + average_return_for_successful_letter=75.0) + Examples for regression: + - sklearn.metrics.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average') + - overall_estimated_commission_earned(y_true, y_pred, + avg_prob_buy_if_err_lower_than_20K=0.25, + avg_prob_buy_if_err_higher_than_20K=0.05, + pct_commission_on_buy=0.05) + Any metric function you provide here should be a function taking + y_true, y_pred and/or y_score arguments, of numpy array type, + and optionally also additional arguments, which you can pass + through the metric_args and metric_kwargs parameters. + If you are unsure which arguments of your metric function are + args/kwargs, then run inspect.getfullargspec(your_metric_function). + metric_args : dict, optional + Arguments (for example: lift_at=0.05) to be passed to the metric + function when evaluating the model's performance. + Example metric function in which this is required: + ClassificationEvaluator._compute_lift(y_true=y_true, + y_score=y_pred, + lift_at=0.05) + metric_kwargs : dict, optional + Keyword arguments (for example: normalize=True) to be passed to the + metric function when evaluating the model's performance. + Example metric function in which this is required (from + scikit-learn): + def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None) + higher_is_better : bool, optional + Whether the model is performing better if the chosen evaluation + metric results in a higher score (higher_is_better=True), + or worse (higher_is_better=False, meaning "lower is better"). """ def __init__(self, model_type: str="classification", max_predictors: int=50, - pos_only: bool=True): + pos_only: bool=True, + metric: Optional[Callable] = None, + metric_args: Optional[dict] = None, + metric_kwargs: Optional[dict] = None, + higher_is_better: Optional[bool] = None): self.model_type = model_type if model_type == "classification": @@ -49,6 +104,37 @@ def __init__(self, self.max_predictors = max_predictors self.pos_only = pos_only + if higher_is_better is None: + if metric is None: + if self.MLModel == LogisticRegressionModel: + # If no custom evaluation metric is chosen, + # the LogisticRegressionModel uses AUC as default metric, + # so "higher is better" evaluation logic is applied on the + # evaluation scores. + self.higher_is_better = True + elif self.MLModel == LinearRegressionModel: + # If no custom evaluation metric is chosen, + # the LinearRegressionModel uses RMSE as default metric, + # so "lower is better" evaluation logic is applied on the + # evaluation scores. 
+ self.higher_is_better = False + else: + raise ValueError("The configured machine learning model is " + "not the standard logistic regression or " + "linear regression model. " + "Therefore, please fill the metric and " + "higher_is_better arguments.") + else: + raise ValueError("You chose a custom evaluation metric. " + "Please fill the higher_is_better argument.") + else: + self.higher_is_better = higher_is_better + + self.metric = metric + self.metric_args = metric_args + self.metric_kwargs = metric_kwargs + + self._fitted_models = [] def get_model_from_step(self, step: int): @@ -77,8 +163,7 @@ def get_model_from_step(self, step: int): def compute_model_performances(self, data: pd.DataFrame, target_column_name: str, - splits: list=["train", "selection", "validation"], - metric: Optional[Callable]=None, + splits: list=["train", "selection", "validation"] ) -> pd.DataFrame: """Compute for each model the performance for different sets (e.g. train-selection-validation) and return them along with a list of @@ -94,13 +179,6 @@ def compute_model_performances(self, data: pd.DataFrame, Name of the target column. splits : list, optional List of splits to compute performance on. - metric: Callable (function), optional - Function that computes an evaluation metric to evaluate the model's - performances, instead of the default metric (AUC for - classification, RMSE for regression). - The function should require y_true and y_pred arguments. - Metric functions from sklearn can be used, for example, see - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. Returns ------- @@ -126,8 +204,9 @@ def compute_model_performances(self, data: pd.DataFrame, data[data["split"] == split], data[data["split"] == split][target_column_name], split=split, # parameter used for caching - metric=metric - ) + metric=self.metric, + metric_args=self.metric_args, + metric_kwargs=self.metric_kwargs) for split in splits }) @@ -290,14 +369,14 @@ def _find_next_best_model(self, """ # placeholders best_model = None - if self.MLModel == LogisticRegressionModel: - best_performance = -1 # AUC metric is used - elif self.MLModel == LinearRegressionModel: - best_performance = float("inf") # RMSE metric is used + + # Set the performance intially with the worst possible value, + # depending on whether higher_is_better is true or false for the + # chosen evaluation metric. + if self.higher_is_better: + best_performance = -float("inf") else: - raise ValueError("No metric comparison method has been configured " - "for the given model_type specified as " - "ForwardFeatureSelection argument.") + best_performance = float("inf") fit_data = train_data[train_data["split"] == "train"] # data to fit the models with sel_data = train_data[train_data["split"] == "selection"] # data to compare the models with @@ -311,19 +390,20 @@ def _find_next_best_model(self, performance = (model .evaluate(sel_data[current_predictors + [pred]], sel_data[target_column_name], - split="selection")) + split="selection", + metric=self.metric, + metric_args=self.metric_args, + metric_kwargs=self.metric_kwargs)) if self.pos_only and (not (model.get_coef() >= 0).all()): continue # Check if the model is better than the current best model # and if it is, replace the current best. 
- if self.MLModel == LogisticRegressionModel \ - and performance > best_performance: # AUC metric is used + if self.higher_is_better and performance > best_performance: best_performance = performance best_model = model - elif self.MLModel == LinearRegressionModel \ - and performance < best_performance: # RMSE metric is used + elif not self.higher_is_better and performance < best_performance: best_performance = performance best_model = model diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 3a921c0..3a22a68 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -1,4 +1,4 @@ - +import inspect from typing import Callable, Optional # third party imports @@ -22,11 +22,11 @@ class LogisticRegressionModel: Attributes ---------- logit : LogisticRegression - scikit-learn logistic regression model. + The scikit-learn logistic regression model that is trained and + afterwards used for making predictions. predictors : list List of predictors used in the model. """ - def __init__(self): self.logit = LogisticRegression(fit_intercept=True, C=1e9, solver='liblinear', random_state=42) @@ -148,8 +148,10 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: return self.logit.predict_proba(X[self.predictors])[:, 1] def evaluate(self, X: pd.DataFrame, y: pd.Series, - split: str=None, - metric: Optional[Callable]=None) -> float: + split: str = None, + metric: Optional[Callable] = None, + metric_args: Optional[dict] = None, + metric_kwargs: Optional[dict] = None) -> float: """Evaluate the model on a given dataset (X, y). The optional split parameter is to indicate that the dataset belongs to (train, selection, validation), so that the computation on these sets @@ -163,39 +165,97 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, Dataset containing the target of each observation. split : str, optional Split name of the dataset (e.g. "train", "selection", or "validation"). - metric: Callable (function), optional - Function that computes an evaluation metric to evaluate the model's - performances, instead of the default metric (AUC). - The function should require y_true and y_pred (binary output) arguments. - Metric functions from sklearn can be used, for example, see + metric : Callable (function), optional + Function that evaluates the model's performance, by calculating a + certain evaluation metric. + If the metric is not provided, the default metric AUC is used for + evaluating the model. + The metric functions from sklearn can be used, see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. + You can also pass a custom function. + Examples: + - ClassificationEvaluator._compute_lift(y_true=y_true, + y_score=y_pred, + lift_at=0.05) + - return_on_investment(y_true, y_pred, + cost_of_letter=2.10, + success_rate_of_letter=0.25, + average_return_for_successful_letter=75.0) + Any metric function you provide here should be a function taking + y_true, y_pred and/or y_score arguments, of numpy array type, + and optionally also additional arguments, which you can pass + through the metric_args and metric_kwargs parameters. + If you are unsure which arguments of your metric function are + args/kwargs, then run inspect.getfullargspec(your_metric_function). + metric_args : dict, optional + Arguments (for example: lift_at=0.05) to be passed to the metric + function when evaluating the model's performance. 
+ Example metric function in which this is required: + ClassificationEvaluator._compute_lift(y_true=y_true, + y_score=y_pred, + lift_at=0.05) + metric_kwargs : dict, optional + Keyword arguments (for example: normalize=True) to be passed to the + metric function when evaluating the model's performance. + Example metric function in which this is required (from + scikit-learn): + def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None) Returns ------- float The performance score of the model (AUC by default). """ - if metric is not None: # decouple from _eval_metrics_by_split attribute - y_pred = self.score_model(X) + y_score = self.score_model(X) - fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred) - cutoff = (ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, thresholds)) - y_pred_b = np.array([0 if pred <= cutoff else 1 for pred in y_pred]) - - performance = metric(y_true=y, y_pred=y_pred_b) + if metric is None: + # No custom evaluation metric was chosen. We use AUC as default + # evaluation metric: + performance = roc_auc_score(y_true=y, y_score=y_score) + else: + # A custom evaluation metric was chosen. With the default + # metric AUC, the performance could be scored over all possible + # thresholds and based on y_score; + # now, with any evaluation metric possibly being used, y_pred may + # be required instead of y_score, which requires determining the + # optimal threshold first and then calculating y_pred. + fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_score) + cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, + thresholds) + y_pred = np.array([0 if score <= cutoff + else 1 + for score in y_score]) + + # Compute the model performance with the chosen metric function, + # pass all arguments this function could potentially need, + # including optional keyword arguments that were passed when + # initializing this model. + args = { + "y_true": y, + "y_pred": y_pred, + "y_score": y_score, + "y_proba": y_score + } + if metric_args is not None and isinstance(metric_args, dict): + args = {**args, **metric_args} + args = { + arg: val + for arg, val in args.items() + # we can't provide too much arguments vs. the args of the + # metric's signature: + if arg in inspect.getfullargspec(metric).args + } + if metric_kwargs is None: + metric_kwargs = {} + performance = metric(**args, **metric_kwargs) + + if split is None: return performance else: - if (split is None) or (split not in self._eval_metrics_by_split): - y_pred = self.score_model(X) - performance = roc_auc_score(y_true=y, y_score=y_pred) - - if split is None: - return performance - else: - self._eval_metrics_by_split[split] = performance - - return self._eval_metrics_by_split[split] + if split not in self._eval_metrics_by_split: + self._eval_metrics_by_split[split] = performance # caching + return self._eval_metrics_by_split[split] def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: """Compute the importance of each predictor in the model and return @@ -376,7 +436,9 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str=None, - metric: Optional[Callable]=None) -> float: + metric: Optional[Callable] = None, + metric_args: Optional[dict] = None, + metric_kwargs: Optional[dict] = None) -> float: """Evaluate the model on a given dataset (X, y). 
The optional split parameter is to indicate that the dataset belongs to (train, selection, validation), so that the computation on these sets @@ -390,34 +452,86 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, Dataset containing the target of each observation. split : str, optional Split name of the dataset (e.g. "train", "selection", or "validation"). - metric: Callable (function), optional - Function that computes an evaluation metric to evaluate the model's - performances, instead of the default metric (RMSE). - The function should require y_true and y_pred arguments. - Metric functions from sklearn can be used, for example, see + metric : Callable (function), optional + Function that evaluates the model's performance, by calculating a + certain evaluation metric. + If the metric is not provided, the default metric AUC is used for + evaluating the model. + The metric functions from sklearn can be used, see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. + You can also pass a custom function. + Examples: + - sklearn.metrics.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average') + - overall_estimated_commission_earned(y_true, y_pred, + avg_prob_buy_if_err_lower_than_20K=0.25, + avg_prob_buy_if_err_higher_than_20K=0.05, + pct_commission_on_buy=0.05) + Any metric function you provide here should be a function taking + y_true, y_pred and/or y_score arguments, of numpy array type, + and optionally also additional arguments, which you can pass + through the metric_args and metric_kwargs parameters. + If you are unsure which arguments of your metric function are + args/kwargs, then run inspect.getfullargspec(your_metric_function). + metric_args : dict, optional + Arguments (for example: lift_at=0.05) to be passed to the metric + function when evaluating the model's performance. + Example metric function in which this is required: + overall_estimated_commission_earned(y_true, y_pred, + avg_prob_buy_if_err_lower_than_20K=0.25, + avg_prob_buy_if_err_higher_than_20K=0.05, + pct_commission_on_buy=0.05) + metric_kwargs : dict, optional + Keyword arguments (for example: normalize=True) to be passed to the + metric function when evaluating the model's performance. + Example metric function in which this is required (from + scikit-learn): + sklearn.metrics.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average') Returns ------- float The performance score of the model (RMSE by default). """ - if metric is not None: # decouple from _eval_metrics_by_split attribute - y_pred = self.score_model(X) - performance = metric(y_true=y, y_pred=y_pred) + y_pred = self.score_model(X) - return performance - else: - if (split is None) or (split not in self._eval_metrics_by_split): - y_pred = self.score_model(X) - performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred)) + if metric is None: + # No custom evaluation metric was chosen. We use RMSE as default + # evaluation metric: + performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred)) - if split is None: - return performance - else: - self._eval_metrics_by_split[split] = performance - - return self._eval_metrics_by_split[split] + else: + # A custom evaluation metric was chosen. With the default + # metric RMSE, the performance could be scored over all possible + # based on y_score; + # now, with any evaluation metric possibly being used, y_pred may + # be required instead of y_score, so we'll first calculate y_pred. 
+ # Compute the model performance with the chosen metric function, + # pass all arguments this function could potentially need, + # including optional keyword arguments that were passed when + # initializing this model. + args = { + "y_true": y, + "y_pred": y_pred + } + if metric_args is not None and isinstance(metric_args, dict): + args = {**args, **metric_args} + args = { + arg: val + for arg, val in args.items() + # we can't provide too much arguments vs. the args of the + # metric's signature: + if arg in inspect.getfullargspec(metric).args + } + if metric_kwargs is None: + metric_kwargs = {} + performance = metric(**args, **metric_kwargs) + + if split is None: + return performance + else: + if split not in self._eval_metrics_by_split: + self._eval_metrics_by_split[split] = performance # caching + return self._eval_metrics_by_split[split] def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: """Compute the importance of each predictor in the model and return From 6e6873768b10c43b49502b2551ba476bfff4dbb6 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Wed, 25 May 2022 18:24:54 +0200 Subject: [PATCH 2/4] Revision of code based on Sam's review --- README.rst | 14 ++++- cobra/evaluation/evaluator.py | 6 +-- cobra/model_building/forward_selection.py | 32 +++-------- cobra/model_building/models.py | 53 +++++++++---------- .../model_building/test_forward_selection.py | 12 +++-- 5 files changed, 54 insertions(+), 63 deletions(-) diff --git a/README.rst b/README.rst index 71a1e42..344c14b 100644 --- a/README.rst +++ b/README.rst @@ -68,7 +68,17 @@ Documentation and extra material Contributing to Cobra ===================== -We'd love you to contribute to the development of Cobra! There are many ways in which you can contribute, the most common of which is to contribute to the source code or documentation of the project. However, there are many other ways you can contribute (report issues, improve code coverage by adding unit tests, ...). -We use GitHub issues to track all bugs and feature requests. Feel free to open an issue in case you found a bug or in case you wish to see a new feature added. +We'd love you to contribute to the development of Cobra! + +There are many ways in which you can contribute. + +* We much appreciate contributions to the source code or documentation of this + project. +* However, there are many other ways you can contribute (report issues, + improve code coverage by adding unit tests,...). + +We use GitHub issues to track all bugs and feature requests. +Feel free to open an issue in case you found a bug or in case you wish to see +a new feature added. For more details, check out our `wiki `_. 
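The docstrings added in this patch series repeatedly cite return_on_investment(y_true, y_pred, cost_of_letter=2.10, success_rate_of_letter=0.25, average_return_for_successful_letter=75.0) as an example of a custom classification metric, without showing its body. A minimal sketch of what such a function could look like follows; the profit formula itself is only an illustrative assumption, while the signature pattern (y_true and y_pred as numpy arrays, extra parameters supplied through metric_args or metric_kwargs) is what the new metric parameter expects:

    import numpy as np

    def return_on_investment(y_true: np.ndarray,
                             y_pred: np.ndarray,
                             cost_of_letter: float = 2.10,
                             success_rate_of_letter: float = 0.25,
                             average_return_for_successful_letter: float = 75.0) -> float:
        """Estimated net profit of mailing every customer flagged as positive.

        The formula below is a simplifying assumption for illustration only.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        n_letters_sent = y_pred.sum()
        # Letters that land with customers who are actually interested:
        n_reachable = np.logical_and(y_true == 1, y_pred == 1).sum()
        expected_revenue = (n_reachable
                            * success_rate_of_letter
                            * average_return_for_successful_letter)
        return float(expected_revenue - n_letters_sent * cost_of_letter)

Since a higher profit means a better model, such a metric would be combined with higher_is_better=True when configuring the forward feature selection.
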
diff --git a/cobra/evaluation/evaluator.py b/cobra/evaluation/evaluator.py index 66073ff..664985c 100644 --- a/cobra/evaluation/evaluator.py +++ b/cobra/evaluation/evaluator.py @@ -531,14 +531,14 @@ def _compute_lift(y_true: np.ndarray, # Make sure it is numpy array y_true_ = np.array(y_true) - y_pred_ = np.array(y_score) + y_score_ = np.array(y_score) # Make sure it has correct shape y_true_ = y_true_.reshape(len(y_true_), 1) - y_pred_ = y_pred_.reshape(len(y_pred_), 1) + y_score_ = y_score_.reshape(len(y_score_), 1) # Merge data together - y_data = np.hstack([y_true_, y_pred_]) + y_data = np.hstack([y_true_, y_score_]) # Calculate necessary variables nrows = len(y_data) diff --git a/cobra/model_building/forward_selection.py b/cobra/model_building/forward_selection.py index e3bfa08..6c57bc5 100644 --- a/cobra/model_building/forward_selection.py +++ b/cobra/model_building/forward_selection.py @@ -42,37 +42,17 @@ class ForwardFeatureSelection: metric : Callable (function), optional Function that evaluates the model's performance, by calculating a certain evaluation metric. - If the metric is not provided, the default metric AUC is used for - evaluating the model. - The metric functions from sklearn can be used, see - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. - You can also pass a custom function. - Examples for classification: - - ClassificationEvaluator._compute_lift(y_true=y_true, - y_score=y_pred, - lift_at=0.05) - - return_on_investment(y_true, y_pred, - cost_of_letter=2.10, - success_rate_of_letter=0.25, - average_return_for_successful_letter=75.0) - Examples for regression: - - sklearn.metrics.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average') - - overall_estimated_commission_earned(y_true, y_pred, - avg_prob_buy_if_err_lower_than_20K=0.25, - avg_prob_buy_if_err_higher_than_20K=0.05, - pct_commission_on_buy=0.05) - Any metric function you provide here should be a function taking - y_true, y_pred and/or y_score arguments, of numpy array type, - and optionally also additional arguments, which you can pass - through the metric_args and metric_kwargs parameters. - If you are unsure which arguments of your metric function are - args/kwargs, then run inspect.getfullargspec(your_metric_function). + For more details about the possibilities here, refer to the + documentation of the metric parameter in the evaluate() function of + either models.LogisticRegressionModel or models.LinearRegressionModel, + depending on which model you are going to use in this forward feature + selection. metric_args : dict, optional Arguments (for example: lift_at=0.05) to be passed to the metric function when evaluating the model's performance. Example metric function in which this is required: ClassificationEvaluator._compute_lift(y_true=y_true, - y_score=y_pred, + y_score=y_score, lift_at=0.05) metric_kwargs : dict, optional Keyword arguments (for example: normalize=True) to be passed to the diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 3a22a68..66fda17 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -173,6 +173,8 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, The metric functions from sklearn can be used, see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. You can also pass a custom function. + The variables that we provide and your function can possibly take in + are y_true, y_pred, y_score and y_prob. 
Examples: - ClassificationEvaluator._compute_lift(y_true=y_true, y_score=y_pred, @@ -214,35 +216,36 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None) performance = roc_auc_score(y_true=y, y_score=y_score) else: - # A custom evaluation metric was chosen. With the default - # metric AUC, the performance could be scored over all possible - # thresholds and based on y_score; - # now, with any evaluation metric possibly being used, y_pred may - # be required instead of y_score, which requires determining the - # optimal threshold first and then calculating y_pred. - fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_score) - cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, tpr, - thresholds) - y_pred = np.array([0 if score <= cutoff - else 1 - for score in y_score]) - + # A custom evaluation metric was chosen. # Compute the model performance with the chosen metric function, # pass all arguments this function could potentially need, # including optional keyword arguments that were passed when # initializing this model. args = { "y_true": y, - "y_pred": y_pred, "y_score": y_score, - "y_proba": y_score + "y_prob": y_score } + if "y_pred" in inspect.getfullargspec(metric).args: + # With the default metric AUC, the performance could be + # scored over all possible thresholds and based on y_score; + # now, with any evaluation metric possibly being used, y_pred + # may be required instead of y_score, which requires determining + # the optimal threshold first and then calculating y_pred. + fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_score) + cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, + tpr, + thresholds) + args["y_pred"] = np.array([0 if score <= cutoff + else 1 + for score in y_score]) + if metric_args is not None and isinstance(metric_args, dict): args = {**args, **metric_args} args = { arg: val for arg, val in args.items() - # we can't provide too much arguments vs. the args of the + # we can't provide too many arguments vs. the args of the # metric's signature: if arg in inspect.getfullargspec(metric).args } @@ -254,7 +257,7 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None) return performance else: if split not in self._eval_metrics_by_split: - self._eval_metrics_by_split[split] = performance # caching + self._eval_metrics_by_split[split] = performance # caching return self._eval_metrics_by_split[split] def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: @@ -455,11 +458,13 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, metric : Callable (function), optional Function that evaluates the model's performance, by calculating a certain evaluation metric. - If the metric is not provided, the default metric AUC is used for + If the metric is not provided, the default metric RMSE is used for evaluating the model. The metric functions from sklearn can be used, see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. You can also pass a custom function. + The variables that we provide and your function can possibly take in + are y_true and y_pred. Examples: - sklearn.metrics.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average') - overall_estimated_commission_earned(y_true, y_pred, @@ -500,15 +505,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred)) else: - # A custom evaluation metric was chosen. 
With the default - # metric RMSE, the performance could be scored over all possible - # based on y_score; - # now, with any evaluation metric possibly being used, y_pred may - # be required instead of y_score, so we'll first calculate y_pred. - # Compute the model performance with the chosen metric function, - # pass all arguments this function could potentially need, - # including optional keyword arguments that were passed when - # initializing this model. + # A custom evaluation metric was chosen. args = { "y_true": y, "y_pred": y_pred @@ -518,7 +515,7 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, args = { arg: val for arg, val in args.items() - # we can't provide too much arguments vs. the args of the + # we can't provide too many arguments vs. the args of the # metric's signature: if arg in inspect.getfullargspec(metric).args } diff --git a/tests/model_building/test_forward_selection.py b/tests/model_building/test_forward_selection.py index 19f7157..cee9448 100644 --- a/tests/model_building/test_forward_selection.py +++ b/tests/model_building/test_forward_selection.py @@ -1,4 +1,3 @@ - from contextlib import contextmanager import pytest import pandas as pd @@ -6,10 +5,12 @@ from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel from cobra.model_building.forward_selection import ForwardFeatureSelection + @contextmanager def does_not_raise(): yield + def mock_data(add_split_col: bool=False, model_type="classification"): data = pd.DataFrame({"var1_enc": [0.42] * 10, "var2_enc": [0.94] * 10, @@ -25,10 +26,12 @@ def mock_data(add_split_col: bool=False, model_type="classification"): return data + def mock_model_num_pred(n_predictors, model_type="classification"): predictors = [f"var{i + 1}_enc" for i in range(n_predictors)] return mock_model(predictors, model_type) + def mock_model(predictor_list, model_type="classification"): if model_type == "classification": model = LogisticRegressionModel() @@ -61,7 +64,9 @@ def test_compute_model_performances(self, mocker, model_type): mock_model_num_pred(3, model_type=model_type) ] - def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the same for RMSE as it is a mock + def mock_evaluate(self, X, y, + split, metric, metric_args, metric_kwargs): + # on AUC scale, but gives the same for RMSE as it is a mock if split == "train": return 0.612 else: @@ -76,8 +81,7 @@ def mock_evaluate(self, X, y, split, metric): # on AUC scale, but gives the sam actual = (fw_selection .compute_model_performances(data, "target", - splits=["train", "selection"], - metric=None)) + splits=["train", "selection"])) expected = pd.DataFrame([ {"predictors": ["var1_enc"], From 46001eec9d6fec429c1a04a08157674005b57c20 Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Wed, 1 Jun 2022 14:27:18 +0200 Subject: [PATCH 3/4] model evaluation: fixing failed unit tests, new unit tests + models: reworking into base + child classes (had too much common code). 
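
For reference, a sketch of how the metric plumbing looks after this rework,
from the caller's point of view (illustrative only: basetable, the "target"
column and the elided fitting step are placeholders, not code taken from the
repository):

    from cobra.evaluation import ClassificationEvaluator
    from cobra.model_building.forward_selection import ForwardFeatureSelection

    fw_selection = ForwardFeatureSelection(
        model_type="classification",
        metric=ClassificationEvaluator._compute_lift,  # custom selection metric
        metric_args={"lift_at": 0.05},                 # forwarded to the metric
        higher_is_better=True)  # must be set whenever a custom metric is given

    # ... fit the forward selection on the preprocessed basetable as usual ...

    # The configured metric (and its arguments) is reused when reporting
    # performances per split, instead of being passed to
    # compute_model_performances() directly as in earlier versions:
    performances = fw_selection.compute_model_performances(
        basetable, "target", splits=["train", "selection", "validation"])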
--- cobra/model_building/models.py | 581 +++++++++++++++------------- tests/model_building/test_models.py | 511 ++++++++++++++++++++---- 2 files changed, 744 insertions(+), 348 deletions(-) diff --git a/cobra/model_building/models.py b/cobra/model_building/models.py index 66fda17..ef71dd0 100644 --- a/cobra/model_building/models.py +++ b/cobra/model_building/models.py @@ -1,4 +1,5 @@ import inspect +from abc import abstractmethod from typing import Callable, Optional # third party imports @@ -14,26 +15,23 @@ import cobra.utils as utils from cobra.evaluation import ClassificationEvaluator -class LogisticRegressionModel: - """Wrapper around the LogisticRegression class, with additional methods - implemented such as evaluation (using AUC), getting a list of coefficients, - a dictionary of coefficients per predictor, ... for convenience. + +class Model: + """Base class for all models provided in Cobra. Attributes ---------- - logit : LogisticRegression - The scikit-learn logistic regression model that is trained and - afterwards used for making predictions. + linear : LinearRegression + scikit-learn linear regression model. predictors : list List of predictors used in the model. """ + def __init__(self): - self.logit = LogisticRegression(fit_intercept=True, C=1e9, - solver='liblinear', random_state=42) self._is_fitted = False - # placeholder to keep track of a list of predictors self.predictors = [] - self._eval_metrics_by_split = {} + # Cache for model performance per dataset split: + self._performance_per_split = {} def serialize(self) -> dict: """Serialize model as JSON. @@ -44,20 +42,9 @@ def serialize(self) -> dict: Dictionary containing the serialized JSON. """ serialized_model = { - "meta": "logistic-regression", "predictors": self.predictors, - "_eval_metrics_by_split": self._eval_metrics_by_split, - "params": self.logit.get_params() + "_performance_per_split": self._performance_per_split, } - - if self._is_fitted: - serialized_model.update({ - "classes_": self.logit.classes_.tolist(), - "coef_": self.logit.coef_.tolist(), - "intercept_": self.logit.intercept_.tolist(), - "n_iter_": self.logit.n_iter_.tolist(), - }) - return serialized_model def deserialize(self, model_dict: dict): @@ -77,48 +64,19 @@ def deserialize(self, model_dict: dict): if not self._is_valid_dict(model_dict): raise ValueError("No valid serialized model") - self.logit = LogisticRegression() - self.logit.set_params(**model_dict["params"]) - self.logit.classes_ = np.array(model_dict["classes_"]) - self.logit.coef_ = np.array(model_dict["coef_"]) - self.logit.intercept_ = np.array(model_dict["intercept_"]) - self.logit.n_iter_ = np.array(model_dict["intercept_"]) self.predictors = model_dict["predictors"] - self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] - - def get_coef(self) -> np.array: - """Returns the model coefficients. - - Returns - ------- - np.array - Array of model coefficients. - """ - return self.logit.coef_[0] - - def get_intercept(self) -> float: - """Returns the intercept of the model. - - Returns - ------- - float - Intercept of the model. - """ - return self.logit.intercept_[0] - - def get_coef_by_predictor(self) -> dict: - """Returns a dictionary mapping predictor (key) to coefficient (value). - Returns - ------- - dict - A map ``{predictor: coefficient}``. 
- """ - return dict(zip(self.predictors, self.logit.coef_[0])) + if "_performance_per_split" in model_dict.keys(): + self._performance_per_split = model_dict["_performance_per_split"] + elif "_eval_metrics_by_split" in model_dict.keys(): # backwards compat. + self._performance_per_split = model_dict["_eval_metrics_by_split"] + @abstractmethod def fit(self, X_train: pd.DataFrame, y_train: pd.Series): """Fit the model. + Any child class should call _after_fit() at the end. + Parameters ---------- X_train : pd.DataFrame @@ -126,10 +84,9 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series): y_train : pd.Series Target of train data. """ - self.predictors = list(X_train.columns) - self.logit.fit(X_train, y_train) - self._is_fitted = True + pass + @abstractmethod def score_model(self, X: pd.DataFrame) -> np.ndarray: """Score a model on a (new) dataset. @@ -143,19 +100,18 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: np.ndarray Score (i.e. predicted probabilities) of the model for each observation. """ - # We select predictor columns (self.predictors) here to - # ensure we have the proper predictors and the proper order - return self.logit.predict_proba(X[self.predictors])[:, 1] + pass def evaluate(self, X: pd.DataFrame, y: pd.Series, split: str = None, metric: Optional[Callable] = None, metric_args: Optional[dict] = None, metric_kwargs: Optional[dict] = None) -> float: - """Evaluate the model on a given dataset (X, y). The optional split - parameter is to indicate that the dataset belongs to - (train, selection, validation), so that the computation on these sets - can be cached! + """Evaluate the model on a given dataset (X, y). + + Caching of the calculated model performance can be enabled with the + split argument for faster execution, and a custom metric can be + configured for the calculation of the model performance. Parameters ---------- @@ -164,18 +120,36 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, y : pd.Series Dataset containing the target of each observation. split : str, optional - Split name of the dataset (e.g. "train", "selection", or "validation"). + Name of the split of the dataset ("train", "selection", + or "validation") for which the model performance must be calculated. + Providing this argument will enable caching the calculated model + performance for the given dataset split, such that every time the + performance for the same split is requested again, + the performance must not be re-calculated. metric : Callable (function), optional Function that evaluates the model's performance, by calculating a certain evaluation metric. - If the metric is not provided, the default metric AUC is used for - evaluating the model. + If the metric is not provided, the default metric RMSE is used for + evaluating the model for a regression task, or AUC for a + classification task. The metric functions from sklearn can be used, see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. You can also pass a custom function. The variables that we provide and your function can possibly take in - are y_true, y_pred, y_score and y_prob. - Examples: + are y_true, y_pred, y_score and y_prob arguments, of numpy array + type, and optionally also additional arguments, which you can pass + through the metric_args and metric_kwargs parameters. + If you are unsure which arguments of your metric function are + args/kwargs, then run inspect.getfullargspec(your_metric_function). 
+ + Example functions for regression: + - sklearn.metrics.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average') + - overall_estimated_commission_earned(y_true, y_pred, + avg_prob_buy_if_err_lower_than_20K=0.25, + avg_prob_buy_if_err_higher_than_20K=0.05, + pct_commission_on_buy=0.05) + + Example functions for classification: - ClassificationEvaluator._compute_lift(y_true=y_true, y_score=y_pred, lift_at=0.05) @@ -183,37 +157,38 @@ def evaluate(self, X: pd.DataFrame, y: pd.Series, cost_of_letter=2.10, success_rate_of_letter=0.25, average_return_for_successful_letter=75.0) - Any metric function you provide here should be a function taking - y_true, y_pred and/or y_score arguments, of numpy array type, - and optionally also additional arguments, which you can pass - through the metric_args and metric_kwargs parameters. - If you are unsure which arguments of your metric function are - args/kwargs, then run inspect.getfullargspec(your_metric_function). + metric_args : dict, optional - Arguments (for example: lift_at=0.05) to be passed to the metric - function when evaluating the model's performance. - Example metric function in which this is required: - ClassificationEvaluator._compute_lift(y_true=y_true, - y_score=y_pred, - lift_at=0.05) + Arguments to be passed to the metric function when + evaluating the model's performance. + See the documentation of the metric function above. + Example arguments are: lift_at for + ClassificationEvaluator._compute_lift() + or pct_commission_on_buy for overall_estimated_commission_earned(). metric_kwargs : dict, optional Keyword arguments (for example: normalize=True) to be passed to the metric function when evaluating the model's performance. - Example metric function in which this is required (from - scikit-learn): - def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None) + See the documentation of the metric function above. + An example keyword argument is sample_weight for + sklearn.metrics.r2_score(). Returns ------- float - The performance score of the model (AUC by default). + The performance score of the model (RMSE by default for + regression, AUC by default for classification). """ + # If the performance is requested for a certain split of the dataset + # and it has been pre-calculated already (it is available in the cache), + # return it immediately from the cache: + if split is not None and split in self._performance_per_split: + return self._performance_per_split[split] + y_score = self.score_model(X) if metric is None: - # No custom evaluation metric was chosen. We use AUC as default - # evaluation metric: - performance = roc_auc_score(y_true=y, y_score=y_score) + # No custom evaluation metric was chosen. + performance = self._evaluate_with_default_metric(y, y_score) else: # A custom evaluation metric was chosen. @@ -221,25 +196,9 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None) # pass all arguments this function could potentially need, # including optional keyword arguments that were passed when # initializing this model. - args = { - "y_true": y, - "y_score": y_score, - "y_prob": y_score - } - if "y_pred" in inspect.getfullargspec(metric).args: - # With the default metric AUC, the performance could be - # scored over all possible thresholds and based on y_score; - # now, with any evaluation metric possibly being used, y_pred - # may be required instead of y_score, which requires determining - # the optimal threshold first and then calculating y_pred. 
- fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_score) - cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, - tpr, - thresholds) - args["y_pred"] = np.array([0 if score <= cutoff - else 1 - for score in y_score]) - + args = self._prepare_args_for_custom_evaluation_metric(y, + y_score, + metric) if metric_args is not None and isinstance(metric_args, dict): args = {**args, **metric_args} args = { @@ -256,9 +215,9 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None) if split is None: return performance else: - if split not in self._eval_metrics_by_split: - self._eval_metrics_by_split[split] = performance # caching - return self._eval_metrics_by_split[split] + # Cache the model performance for the given dataset split: + self._performance_per_split[split] = performance + return self._performance_per_split[split] def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: """Compute the importance of each predictor in the model and return @@ -281,7 +240,7 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: utils.clean_predictor_name(predictor): stats.pearsonr( data[predictor], y_pred - )[0] + )[0] for predictor in self.predictors } @@ -292,42 +251,72 @@ def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: return (df.sort_values(by="importance", ascending=False) .reset_index(drop=True)) - def _is_valid_dict(self, model_dict: dict) -> bool: + def _is_valid_dict(self, model_dict: dict): + """Verify whether a model dictionary, previously stored as JSON, + contains valid information, before constructing a model class from it. - if ("meta" not in model_dict - or model_dict["meta"] != "logistic-regression"): - return False + Parameters + ---------- + model_dict : dict + Serialized JSON file as a dict. - attr = ["classes_", "coef_", "intercept_", "n_iter_", "predictors"] - for key in attr: - if not (key in model_dict or type(model_dict[key]) != list): + Raises + ------ + ValueError + In case JSON file is no valid serialized model. + """ + for key in ["predictors", "params"]: + if key not in model_dict: return False - if ("params" not in model_dict - or "_eval_metrics_by_split" not in model_dict): + if "_performance_per_split" not in model_dict \ + and "_eval_metrics_by_split" not in model_dict: # backw compat. return False return True + @abstractmethod + def _evaluate_with_default_metric(self, y_true, y_score): + pass + + def _after_fit(self, X_train: pd.DataFrame, y_train: pd.Series): + """Fit the model. + + Parameters + ---------- + X_train : pd.DataFrame + Predictors of train data. + y_train : pd.Series + Target of train data. + """ + self.predictors = list(X_train.columns) + self._is_fitted = True + + @abstractmethod + def _prepare_args_for_custom_evaluation_metric(self, y_true, y_score, + metric): + pass -class LinearRegressionModel: - """Wrapper around the LinearRegression class, with additional methods - implemented such as evaluation (using RMSE), getting a list of coefficients, - a dictionary of coefficients per predictor, ... for convenience. + +class LogisticRegressionModel(Model): + """Wrapper around sklearn's LogisticRegression class, with additional + methods implemented such as evaluation (using AUC), getting a list of + coefficients, a dictionary of coefficients per predictor, ... for + convenience. + + See also the documentation of the base class (Model). Attributes ---------- - linear : LinearRegression - scikit-learn linear regression model. 
- predictors : list - List of predictors used in the model. + logit : LogisticRegression + The scikit-learn logistic regression model that is trained and + afterwards used for making predictions. """ def __init__(self): - self.linear = LinearRegression(fit_intercept=True) - self._is_fitted = False - self.predictors = [] # placeholder to keep track of a list of predictors - self._eval_metrics_by_split = {} + super().__init__() + self.logit = LogisticRegression(fit_intercept=True, C=1e9, + solver='liblinear', random_state=42) def serialize(self) -> dict: """Serialize model as JSON. @@ -337,17 +326,16 @@ def serialize(self) -> dict: dict Dictionary containing the serialized JSON. """ - serialized_model = { - "meta": "linear-regression", - "predictors": self.predictors, - "_eval_metrics_by_split": self._eval_metrics_by_split, - "params": self.linear.get_params() - } + serialized_model = super().serialize() + serialized_model["meta"] = "logistic-regression" + serialized_model["params"] = self.logit.get_params() if self._is_fitted: serialized_model.update({ - "coef_": self.linear.coef_.tolist(), - "intercept_": self.linear.intercept_.tolist() + "classes_": self.logit.classes_.tolist(), + "coef_": self.logit.coef_.tolist(), + "intercept_": self.logit.intercept_.tolist(), + "n_iter_": self.logit.n_iter_.tolist(), }) return serialized_model @@ -365,16 +353,13 @@ def deserialize(self, model_dict: dict): ValueError In case JSON file is no valid serialized model. """ - - if not self._is_valid_dict(model_dict): - raise ValueError("No valid serialized model") - - self.linear = LinearRegression() - self.linear.set_params(**model_dict["params"]) - self.linear.coef_ = np.array(model_dict["coef_"]) - self.linear.intercept_ = np.array(model_dict["intercept_"]) - self.predictors = model_dict["predictors"] - self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"] + super().deserialize(model_dict) + self.logit = LogisticRegression() + self.logit.set_params(**model_dict["params"]) + self.logit.classes_ = np.array(model_dict["classes_"]) + self.logit.coef_ = np.array(model_dict["coef_"]) + self.logit.intercept_ = np.array(model_dict["intercept_"]) + self.logit.n_iter_ = np.array(model_dict["intercept_"]) def get_coef(self) -> np.array: """Returns the model coefficients. @@ -384,7 +369,7 @@ def get_coef(self) -> np.array: np.array Array of model coefficients. """ - return self.linear.coef_ + return self.logit.coef_[0] def get_intercept(self) -> float: """Returns the intercept of the model. @@ -394,7 +379,7 @@ def get_intercept(self) -> float: float Intercept of the model. """ - return self.linear.intercept_[0] + return self.logit.intercept_[0] def get_coef_by_predictor(self) -> dict: """Returns a dictionary mapping predictor (key) to coefficient (value). @@ -404,7 +389,7 @@ def get_coef_by_predictor(self) -> dict: dict A map ``{predictor: coefficient}``. """ - return dict(zip(self.predictors, self.linear.coef_)) + return dict(zip(self.predictors, self.logit.coef_[0])) def fit(self, X_train: pd.DataFrame, y_train: pd.Series): """Fit the model. @@ -416,9 +401,8 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series): y_train : pd.Series Target of train data. """ - self.predictors = list(X_train.columns) - self.linear.fit(X_train, y_train) - self._is_fitted = True + self.logit.fit(X_train, y_train) + super()._after_fit(X_train, y_train) def score_model(self, X: pd.DataFrame) -> np.ndarray: """Score a model on a (new) dataset. 
@@ -431,150 +415,201 @@ def score_model(self, X: pd.DataFrame) -> np.ndarray: Returns ------- np.ndarray - Score of the model for each observation. + Score (i.e. predicted probabilities) of the model for each observation. """ # We select predictor columns (self.predictors) here to # ensure we have the proper predictors and the proper order - return self.linear.predict(X[self.predictors]) + return self.logit.predict_proba(X[self.predictors])[:, 1] - def evaluate(self, X: pd.DataFrame, y: pd.Series, - split: str=None, - metric: Optional[Callable] = None, - metric_args: Optional[dict] = None, - metric_kwargs: Optional[dict] = None) -> float: - """Evaluate the model on a given dataset (X, y). The optional split - parameter is to indicate that the dataset belongs to - (train, selection, validation), so that the computation on these sets - can be cached! + def _is_valid_dict(self, model_dict: dict) -> bool: + if not super()._is_valid_dict(model_dict): + return False - Parameters - ---------- - X : pd.DataFrame - Dataset containing the predictor values for each observation. - y : pd.Series - Dataset containing the target of each observation. - split : str, optional - Split name of the dataset (e.g. "train", "selection", or "validation"). - metric : Callable (function), optional - Function that evaluates the model's performance, by calculating a - certain evaluation metric. - If the metric is not provided, the default metric RMSE is used for - evaluating the model. - The metric functions from sklearn can be used, see - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics. - You can also pass a custom function. - The variables that we provide and your function can possibly take in - are y_true and y_pred. - Examples: - - sklearn.metrics.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average') - - overall_estimated_commission_earned(y_true, y_pred, - avg_prob_buy_if_err_lower_than_20K=0.25, - avg_prob_buy_if_err_higher_than_20K=0.05, - pct_commission_on_buy=0.05) - Any metric function you provide here should be a function taking - y_true, y_pred and/or y_score arguments, of numpy array type, - and optionally also additional arguments, which you can pass - through the metric_args and metric_kwargs parameters. - If you are unsure which arguments of your metric function are - args/kwargs, then run inspect.getfullargspec(your_metric_function). - metric_args : dict, optional - Arguments (for example: lift_at=0.05) to be passed to the metric - function when evaluating the model's performance. - Example metric function in which this is required: - overall_estimated_commission_earned(y_true, y_pred, - avg_prob_buy_if_err_lower_than_20K=0.25, - avg_prob_buy_if_err_higher_than_20K=0.05, - pct_commission_on_buy=0.05) - metric_kwargs : dict, optional - Keyword arguments (for example: normalize=True) to be passed to the - metric function when evaluating the model's performance. 
- Example metric function in which this is required (from - scikit-learn): - sklearn.metrics.r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average') + if ("meta" not in model_dict + or model_dict["meta"] != "logistic-regression"): + return False + + attr = ["classes_", "coef_", "intercept_", "n_iter_"] + for key in attr: + if not (key in model_dict or type(model_dict[key]) != list): + return False + + return True + + def _evaluate_with_default_metric(self, y_true, y_score): + """We use AUC as default evaluation metric.""" + return roc_auc_score(y_true=y_true, y_score=y_score) + + def _prepare_args_for_custom_evaluation_metric(self, y_true, y_score, + metric): + args = { + "y_true": y_true, + "y_score": y_score, + "y_prob": y_score + } + + if "y_pred" in inspect.getfullargspec(metric).args: + # With the default metric AUC, the performance could be + # scored over all possible thresholds and based on y_score; + # now, with any evaluation metric possibly being used, y_pred + # may be required instead of y_score, which requires determining + # the optimal threshold first and then calculating y_pred. + fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_score) + cutoff = ClassificationEvaluator._compute_optimal_cutoff(fpr, + tpr, + thresholds) + args["y_pred"] = np.array([0 if score <= cutoff + else 1 + for score in y_score]) + return args + + +class LinearRegressionModel(Model): + """Wrapper around sckit-learn's LinearRegression class, + with additional methods implemented such as evaluation (using RMSE), + getting a list of coefficients, a dictionary of coefficients per + predictor,... for convenience. + + See also the documentation of the base class (Model). + + Attributes + ---------- + linear : LinearRegression + scikit-learn linear regression model. + """ + + def __init__(self): + super().__init__() + self.linear = LinearRegression(fit_intercept=True) + + def serialize(self) -> dict: + """Serialize model as JSON. Returns ------- - float - The performance score of the model (RMSE by default). + dict + Dictionary containing the serialized JSON. """ - y_pred = self.score_model(X) + serialized_model = super().serialize() + serialized_model["meta"] = "linear-regression" + serialized_model["params"] = self.linear.get_params() - if metric is None: - # No custom evaluation metric was chosen. We use RMSE as default - # evaluation metric: - performance = sqrt(mean_squared_error(y_true=y, y_pred=y_pred)) - - else: - # A custom evaluation metric was chosen. - args = { - "y_true": y, - "y_pred": y_pred - } - if metric_args is not None and isinstance(metric_args, dict): - args = {**args, **metric_args} - args = { - arg: val - for arg, val in args.items() - # we can't provide too many arguments vs. the args of the - # metric's signature: - if arg in inspect.getfullargspec(metric).args - } - if metric_kwargs is None: - metric_kwargs = {} - performance = metric(**args, **metric_kwargs) + if self._is_fitted: + serialized_model.update({ + "coef_": self.linear.coef_.tolist(), + "intercept_": self.linear.intercept_.tolist() + }) - if split is None: - return performance - else: - if split not in self._eval_metrics_by_split: - self._eval_metrics_by_split[split] = performance # caching - return self._eval_metrics_by_split[split] + return serialized_model - def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame: - """Compute the importance of each predictor in the model and return - it as a DataFrame. 
+ def deserialize(self, model_dict: dict): + """Deserialize a model previously stored as JSON. Parameters ---------- - data : pd.DataFrame - Data to score the model. + model_dict : dict + Serialized JSON file as a dict. + + Raises + ------ + ValueError + In case JSON file is no valid serialized model. + """ + super().deserialize(model_dict) + self.linear = LinearRegression() + self.linear.set_params(**model_dict["params"]) + self.linear.coef_ = np.array(model_dict["coef_"]) + self.linear.intercept_ = np.array(model_dict["intercept_"]) + + def get_coef(self) -> np.array: + """Returns the model coefficients. Returns ------- - pd.DataFrame - DataFrame containing columns predictor and importance. + np.array + Array of model coefficients. """ + return self.linear.coef_ - y_pred = self.score_model(data) + def get_intercept(self) -> float: + """Returns the intercept of the model. - importance_by_variable = { - utils.clean_predictor_name(predictor): stats.pearsonr( - data[predictor], - y_pred - )[0] - for predictor in self.predictors - } + Returns + ------- + float + Intercept of the model. + """ + return self.linear.intercept_[0] - df = pd.DataFrame.from_dict(importance_by_variable, - orient="index").reset_index() - df.columns = ["predictor", "importance"] + def get_coef_by_predictor(self) -> dict: + """Returns a dictionary mapping predictor (key) to coefficient (value). - return (df.sort_values(by="importance", ascending=False) - .reset_index(drop=True)) + Returns + ------- + dict + A map ``{predictor: coefficient}``. + """ + return dict(zip(self.predictors, self.linear.coef_)) + + def fit(self, X_train: pd.DataFrame, y_train: pd.Series): + """Fit the model. + + Parameters + ---------- + X_train : pd.DataFrame + Predictors of train data. + y_train : pd.Series + Target of train data. + """ + self.linear.fit(X_train, y_train) + super()._after_fit(X_train, y_train) + + def score_model(self, X: pd.DataFrame) -> np.ndarray: + """Score a model on a (new) dataset. + + Parameters + ---------- + X : pd.DataFrame + Dataset of predictors to score the model. + + Returns + ------- + np.ndarray + Score of the model for each observation. 
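The serialization helpers introduced for LinearRegressionModel can be exercised end to end. A minimal round-trip sketch, not part of this patch, assuming that fitting records the training columns as `model.predictors` (as the serialized "predictors" field suggests):

    import json
    import pandas as pd
    from cobra.model_building.models import LinearRegressionModel

    X = pd.DataFrame({"var1_enc": [0.1, 0.4, 0.8, 0.3],
                      "var2_enc": [0.9, 0.2, 0.5, 0.7]})
    y = pd.Series([10.0, 4.0, 12.0, 6.0])

    model = LinearRegressionModel()
    model.fit(X, y)

    # Persist the fitted model as JSON and restore it into a fresh instance.
    payload = json.dumps(model.serialize())
    restored = LinearRegressionModel()
    restored.deserialize(json.loads(payload))

    print(restored.get_coef_by_predictor())
    # The restored model is expected to reproduce the original scores:
    print(restored.score_model(X) - model.score_model(X))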
+ """ + # We select predictor columns (self.predictors) here to + # ensure we have the proper predictors and the proper order + return self.linear.predict(X[self.predictors]) def _is_valid_dict(self, model_dict: dict) -> bool: + if not super()._is_valid_dict(model_dict): + return False + if ("meta" not in model_dict or model_dict["meta"] != "linear-regression"): return False - attr = ["coef_", "intercept_", "predictors"] + attr = ["coef_", "intercept_"] for key in attr: if not (key in model_dict or type(model_dict[key]) != list): return False - if ("params" not in model_dict - or "_eval_metrics_by_split" not in model_dict): - return False - return True + + def _evaluate_with_default_metric(self, y_true, y_score): + """We use RMSE as default evaluation metric.""" + return sqrt( + mean_squared_error(y_true=y_true, + # LinearRegressionModel actually returns y_pred + # inside y_score: + y_pred=y_score) + ) + + def _prepare_args_for_custom_evaluation_metric(self, y_true, y_score, + metric): + return { + "y_true": y_true, + # LinearRegressionModel actually returns y_pred inside y_score: + "y_pred": y_score + } diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py index 4e4db78..4e822c7 100644 --- a/tests/model_building/test_models.py +++ b/tests/model_building/test_models.py @@ -1,55 +1,141 @@ - import numpy as np import pandas as pd -from cobra.model_building.models import LogisticRegressionModel, LinearRegressionModel +from cobra.model_building.models import LogisticRegressionModel, \ + LinearRegressionModel, Model + def mock_data(): return pd.DataFrame({"var1_enc": [0.42] * 10, "var2_enc": [0.94] * 10, "var3_enc": [0.87] * 10}) +mock_score_model_output = np.array( + [0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5] +) + +def mock_score_model(self, data): + return mock_score_model_output + +default_metric_output = 0.17 +def mock_evaluate_with_default_metric(self, y_true, y_score): + return default_metric_output + +def mock_prepare_args_for_custom_evaluation_metric(self, y_true, y_score, + metric): + return { + "y_true": y_true, + "y_score": y_score + } + +class TestModel: + def test_evaluate_returns_precalculated_performance_from_cache(self, + mocker): + """Test whether evaluate() returns a performance, as calculated + earlier on the same dataset split, from its internal cache.""" + expected = 0.79 -def mock_score_model_classification(self, data): - return np.array([0.5, 0.8, 0.2, 0.9, 0.1, 0.7, 0.3, 0.6, 0.4, 0.5]) - -def mock_score_model_regression(self, data): - return np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15 + model = Model() + model._performance_per_split["train"] = expected # setting the cache -class TestLogisticRegressionModel: + # passing empty dataframes as input, instead of those that would + # enable exact re-calculation of the performance, so we know for sure + # that the cache has provided the answer. 
+ actual = model.evaluate(pd.DataFrame(), + pd.Series(dtype="float64"), + split="train") - def test_evaluate(self, mocker): + assert actual == expected + def test_evaluate_with_default_metric(self, mocker): X = mock_data() y = pd.Series([1] * 5 + [0] * 5) - def mock_roc_auc_score(y_true, y_score): - return 0.79 - (mocker - .patch("cobra.model_building.LogisticRegressionModel.score_model", - mock_score_model_classification)) + .patch("cobra.model_building.models.Model.score_model", + mock_score_model)) (mocker - .patch("cobra.model_building.models.roc_auc_score", - mock_roc_auc_score)) + .patch("cobra.model_building.models" + ".Model._evaluate_with_default_metric", + mock_evaluate_with_default_metric)) - model = LogisticRegressionModel() - actual = model.evaluate(X, y) + model = Model() + actual = model.evaluate(X, y) # metric=None implied + assert actual == default_metric_output - assert actual == 0.79 + def test_evaluate_with_custom_metric(self, mocker): + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) - def test_evaluate_cached(self): + (mocker + .patch("cobra.model_building.models.Model.score_model", + mock_score_model)) + + # we won't test all combinations of args & kwargs that can occur when + # calling the custom metric function (basic args y_true and y_score + + # no args and no kwargs / same + args but no kwargs / etc.), just the + # combination of basic args, args and kwargs. + # The child classes will additionally test with some common metrics + # used with these child models. + (mocker + .patch("cobra.model_building.models" + ".Model._prepare_args_for_custom_evaluation_metric", + mock_prepare_args_for_custom_evaluation_metric)) + + expected_arg1 = 14 + expected_kwarg1 = 17 + expected_custom_metric_output = 0.56 + def some_custom_metric(y_true, y_score, + arg1=None, + *, kwarg1=None): + if not np.array_equal(y_true, y): + raise ValueError("evaluate() did not succeed in correctly " + "passing y_true to the custom metric.") + if not np.array_equal(y_score, mock_score_model_output): + raise ValueError("evaluate() did not succeed in correctly " + "passing y_score to the custom metric.") + if arg1 != expected_arg1: + raise ValueError("evaluate() did not succeed in correctly " + "passing additional arguments to the custom " + "metric.") + if kwarg1 != expected_kwarg1: + raise ValueError("evaluate() did not succeed in correctly " + "passing additional keyword arguments to " + "the custom metric.") + return expected_custom_metric_output + + model = Model() + actual = model.evaluate(X, y, + metric=some_custom_metric, + metric_args={ + "arg1": expected_arg1 + }, + metric_kwargs={ + "kwarg1": expected_kwarg1 + }) + assert actual == expected_custom_metric_output + + def test_evaluate_caches_performance(self, mocker): + """Test whether the evaluate() function caches the calculated model + performance for a certain dataset split.""" + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) - split = "train" - expected = 0.79 + (mocker + .patch("cobra.model_building.models.Model.score_model", + mock_score_model)) - model = LogisticRegressionModel() - model._eval_metrics_by_split["train"] = expected + (mocker + .patch("cobra.model_building.models" + ".Model._evaluate_with_default_metric", + mock_evaluate_with_default_metric)) - actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) + model = Model() + actual = model.evaluate(X, y, + split="train") - assert actual == expected + assert model._performance_per_split["train"] == default_metric_output def 
test_compute_variable_importance(self, mocker): @@ -79,15 +165,240 @@ def mock_pearsonr(ypred, ytrue): pd.testing.assert_frame_equal(actual, expected) + +mock_score_model_classification_output = mock_score_model_output + +def mock_score_model_classification(self, data): + return mock_score_model_classification_output + + +class TestLogisticRegressionModel: + + # The following are more like integration tests, which verify + # Model.evaluate() with a few examples of metrics that cover the most use + # cases that Cobra developers will use when developing a regression model: + + def test_evaluate_no_metric_specified(self, mocker): + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + def mock_roc_auc_score(y_true, y_score): + # mocking sklearn.metrics.roc_auc_score, as instantiated in + # models.py. + if not np.array_equal(y_true, y): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_true " + "argument.") + if not np.array_equal(y_score, + mock_score_model_classification_output): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_score " + "argument.") + + return 0.79 + + (mocker + .patch("cobra.model_building.models.roc_auc_score", + mock_roc_auc_score)) + + model = LogisticRegressionModel() + actual = model.evaluate(X, y) # implied: metric=None (default value). + + assert actual == 0.79 + + def test_evaluate_metric_specified_requiring_y_score(self, mocker): + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + def top_k_accuracy_score(y_true, y_score, + *, k=2, normalize=True, + sample_weight=None, labels=None): + # mimicking sklearn.metrics.top_k_accuracy_score. 
+ if not np.array_equal(y_true, y): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_true " + "argument.") + if not np.array_equal(y_score, + mock_score_model_classification_output): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_score " + "argument.") + + return 0.14 + + model = LogisticRegressionModel() + actual = model.evaluate(X, y, + metric=top_k_accuracy_score) + + assert actual == 0.14 + + def test_evaluate_metric_specified_requiring_y_prob(self, mocker): + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + def brier_score_loss(y_true, y_prob, + *, sample_weight=None, pos_label=None): + # mimicking sklearn.metrics.brier_score_loss + if not np.array_equal(y_true, y): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_true " + "argument.") + if not np.array_equal(y_prob, + mock_score_model_classification_output): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_prob " + "argument.") + + return 0.14 + + model = LogisticRegressionModel() + actual = model.evaluate(X, y, + metric=brier_score_loss) + + assert actual == 0.14 + + def test_evaluate_metric_specified_requiring_y_pred(self, mocker): + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + def f1_score(y_true, y_pred, + *, labels=None, pos_label=1, average='binary', + sample_weight=None, zero_division='warn'): + if not np.array_equal(y_true, y): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_true " + "argument.") + if not np.array_equal(y_pred, + np.zeros( + (len(mock_score_model_classification_output),) + )): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_pred " + "argument.") + + return 0.14 + + # We don't mock roc_curve, mocking the optimal_cutoff (see below) is + # enough to guarantee that y_pred will have a pre-determined value + # that we can test for. + + def mock_compute_optimal_cutoff(fpr: np.ndarray, tpr: np.ndarray, + thresholds: np.ndarray) -> float: + # Let's return a threshold so high, that all scores will end up + # below it, which will result in a y_pred being equal to np.zeros(). 
+ return float("inf") + + (mocker + .patch("cobra.evaluation.evaluator.ClassificationEvaluator" + "._compute_optimal_cutoff", + mock_compute_optimal_cutoff)) + + model = LogisticRegressionModel() + actual = model.evaluate(X, y, + metric=f1_score) + + assert actual == 0.14 + + def test_evaluate_metric_specified_with_additional_metric_args(self, + mocker): + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + def compute_lift(y_true, + y_score, + lift_at=0.05): + # Mimicking ClassificationEvaluator._compute_lift() + if not np.array_equal(y_true, y): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_true " + "argument.") + if not np.array_equal(y_score, + mock_score_model_classification_output): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_score " + "argument.") + if lift_at != 0.22: + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the lift_at argument to " + "the metric function.") + return 0.14 + + model = LogisticRegressionModel() + actual = model.evaluate(X, y, + metric=compute_lift, + metric_args={ + "lift_at": 0.22 + }) + + assert actual == 0.14 + + def test_evaluate_metric_specified_with_additional_metric_kwargs(self, + mocker): + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) + + (mocker + .patch("cobra.model_building.LogisticRegressionModel.score_model", + mock_score_model_classification)) + + def top_k_accuracy_score(y_true, y_score, + *, k=2, normalize=True, + sample_weight=None, labels=None): + # mimicking sklearn.metrics.top_k_accuracy_score. + if not np.array_equal(y_true, y): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_true " + "argument.") + if not np.array_equal(y_score, + mock_score_model_classification_output): + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the correct y_score " + "argument.") + if k != 100: + raise ValueError("LogisticRegressionModel.evaluate() did not " + "succeed in passing the kwarg k to the " + "metric function.") + + return 0.14 + + model = LogisticRegressionModel() + actual = model.evaluate(X, y, + metric=top_k_accuracy_score, + metric_kwargs={ + "k": 100 + }) + + assert actual == 0.14 + def test_serialize(self): model = LogisticRegressionModel() actual = model.serialize() expected = { - "meta": "logistic-regression", "predictors": [], - "_eval_metrics_by_split": {}, + "_performance_per_split": {}, + "meta": "logistic-regression", "params": { "C": 1000000000.0, "class_weight": None, @@ -116,7 +427,7 @@ def test_deserialize(self): model_dict = { "meta": "logistic-regression", "predictors": [], - "_eval_metrics_by_split": {}, + "_performance_per_split": {}, "params": { "C": 1000000000.0, "class_weight": None, @@ -149,68 +460,93 @@ def test_deserialize(self): assert logit.intercept_.all() == (np.array(model_dict["intercept_"]).all()) assert logit.coef_.all() == np.array(model_dict["coef_"]).all() -class TestLinearRegressionModel: - - def test_evaluate(self, mocker): - - X = mock_data() - y = pd.Series(np.array([0.6, 0.1, 0.2, 0.9, 0.8, 0.3, 0.2, 0.4, 0.9, 0.5])*12) - - def mock_mean_squared_error(y_true, y_pred): - return 1.23 + def test_deserialize_backwards_compat_for_eval_metrics_by_split(self): - (mocker - .patch("cobra.model_building.LinearRegressionModel.score_model", - 
mock_score_model_regression)) + model = LogisticRegressionModel() - (mocker - .patch("cobra.model_building.models.mean_squared_error", - mock_mean_squared_error)) + model_dict = { + "meta": "logistic-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "C": 1000000000.0, + "class_weight": None, + "dual": False, + "fit_intercept": True, + "intercept_scaling": 1, + "l1_ratio": None, + "max_iter": 100, + "multi_class": "auto", + "n_jobs": None, + "penalty": "l2", + "random_state": 42, + "solver": "liblinear", + "tol": 0.0001, + "verbose": 0, + "warm_start": False + }, + "classes_": [0, 1], + "coef_": [[0.5, 0.75]], + "intercept_": [-3], + "n_iter_": [10] + } - model = LinearRegressionModel() - actual = model.evaluate(X, y) + model.deserialize(model_dict) - assert actual == np.sqrt(1.23) + logit = model.logit + assert logit.get_params() == model_dict["params"] + assert logit.classes_.all() == np.array(model_dict["classes_"]).all() + assert logit.n_iter_.all() == np.array(model_dict["n_iter_"]).all() + assert logit.intercept_.all() == (np.array(model_dict["intercept_"]).all()) + assert logit.coef_.all() == np.array(model_dict["coef_"]).all() - def test_evaluate_cached(self): - split = "train" - expected = np.sqrt(1.23) +mock_score_model_regression_output = np.array([0.7, 0.2, 0.2, 0.9, 0.7, 0.3, 0.1, 0.4, 0.8, 0.5])*15 - model = LinearRegressionModel() - model._eval_metrics_by_split["train"] = expected +def mock_score_model_regression(self, data): + return mock_score_model_regression_output - actual = model.evaluate(pd.DataFrame(), pd.Series(dtype="float64"), split) - assert actual == expected +class TestLinearRegressionModel: - def test_compute_variable_importance(self, mocker): + # The following are more like integration tests, which verify + # Model.evaluate() with a few examples of metrics that cover the most use + # cases that Cobra developers will use when developing a regression model: - def mock_pearsonr(ypred, ytrue): - return [ypred.unique()[0]] + def test_evaluate_no_metric_specified(self, mocker): + X = mock_data() + y = pd.Series([1] * 5 + [0] * 5) (mocker - .patch("cobra.model_building.LinearRegressionModel.score_model", + .patch("cobra.model_building.models.LinearRegressionModel.score_model", mock_score_model_regression)) - (mocker - .patch("cobra.model_building.models.stats.pearsonr", - mock_pearsonr)) + def mock_evaluate_with_default_metric(self, y_true, y_score): + if not np.array_equal(y_true, y): + raise ValueError("LinearRegressionModel.evaluate() did not " + "succeed in passing the correct y_true " + "argument.") + if not np.array_equal(y_score, + mock_score_model_regression_output): + raise ValueError("LinearRegressionModel.evaluate() did not " + "succeed in passing the correct y_score " + "argument.") - model = LinearRegressionModel() - model.predictors = ["var1_enc", "var2_enc", "var3_enc"] + return 0.79 - data = mock_data() + (mocker + .patch("cobra.model_building.models.LinearRegressionModel." + "_evaluate_with_default_metric", + mock_evaluate_with_default_metric)) - actual = model.compute_variable_importance(data) + model = LinearRegressionModel() + actual = model.evaluate(X, y) # implied: metric=None (default value). 
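The backwards-compatibility tests added here feed a dict that still uses the pre-rename `_eval_metrics_by_split` key; the code that accepts it lives in the (de)serialization logic outside this hunk. One hypothetical way such a mapping could look (a sketch, not the actual cobra implementation):

    def _restore_performance_cache(model, model_dict):
        # Accept both the new key and the pre-rename legacy key.
        cache = model_dict.get("_performance_per_split",
                               model_dict.get("_eval_metrics_by_split", {}))
        model._performance_per_split = dict(cache)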
- expected = pd.DataFrame([ - {"predictor": "var1", "importance": data["var1_enc"].unique()[0]}, - {"predictor": "var2", "importance": data["var2_enc"].unique()[0]}, - {"predictor": "var3", "importance": data["var3_enc"].unique()[0]} - ]).sort_values(by="importance", ascending=False).reset_index(drop=True) + assert actual == 0.79 - pd.testing.assert_frame_equal(actual, expected) + # no integration tests for specific custom metrics for this model, + # the tests for LogisticRegressionModel were already elaborate to cover + # all pathways in Model.evaluate(). def test_serialize(self): @@ -218,14 +554,14 @@ def test_serialize(self): actual = model.serialize() expected = { - "meta": "linear-regression", "predictors": [], - "_eval_metrics_by_split": {}, + "_performance_per_split": {}, + "meta": "linear-regression", "params": { "copy_X": True, "fit_intercept": True, "n_jobs": None, - "normalize": "deprecated", + "normalize": False, "positive": False } } @@ -239,7 +575,7 @@ def test_deserialize(self): model_dict = { "meta": "linear-regression", "predictors": [], - "_eval_metrics_by_split": {}, + "_performance_per_split": {}, "params": { "copy_X": True, "fit_intercept": True, @@ -258,3 +594,28 @@ def test_deserialize(self): assert linear.intercept_.all() == (np.array(model_dict["intercept_"]).all()) assert linear.coef_.all() == np.array(model_dict["coef_"]).all() + def test_deserialize_backwards_compat_for_eval_metrics_by_split(self): + + model = LinearRegressionModel() + + model_dict = { + "meta": "linear-regression", + "predictors": [], + "_eval_metrics_by_split": {}, + "params": { + "copy_X": True, + "fit_intercept": True, + "n_jobs": None, + "normalize": "deprecated", + "positive": False + }, + "coef_": [[0.5, 0.75]], + "intercept_": [-3] + } + + model.deserialize(model_dict) + + linear = model.linear + assert linear.get_params() == model_dict["params"] + assert linear.intercept_.all() == (np.array(model_dict["intercept_"]).all()) + assert linear.coef_.all() == np.array(model_dict["coef_"]).all() From 24442e220aad1ec0ec0f8064f72aed19b9f86a3e Mon Sep 17 00:00:00 2001 From: Sander Vanden Hautte Date: Wed, 1 Jun 2022 14:38:00 +0200 Subject: [PATCH 4/4] Fix for breaking unit test during CI/CD (my local sklearn had way older version). --- tests/model_building/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py index 4e822c7..f26085d 100644 --- a/tests/model_building/test_models.py +++ b/tests/model_building/test_models.py @@ -561,7 +561,7 @@ def test_serialize(self): "copy_X": True, "fit_intercept": True, "n_jobs": None, - "normalize": False, + "normalize": "deprecated", "positive": False } }
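As the follow-up commit shows, the expected value of LinearRegression's `normalize` parameter depends on the installed scikit-learn version: False before 1.0, the "deprecated" sentinel from 1.0 until the parameter's removal in 1.2. If the test must pass on both old and new environments, a version-aware expectation is one option; a sketch, not part of this patch:

    import sklearn

    sk_version = tuple(int(part) for part in sklearn.__version__.split(".")[:2])

    expected_params = {
        "copy_X": True,
        "fit_intercept": True,
        "n_jobs": None,
        "positive": False,
    }
    if sk_version < (1, 2):
        # 'normalize' still exists below 1.2; it defaults to the "deprecated"
        # sentinel from 1.0 on, and to False before that.
        expected_params["normalize"] = "deprecated" if sk_version >= (1, 0) else False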