52 changes: 30 additions & 22 deletions salmon/building.py
@@ -1,6 +1,7 @@
import numpy as np
import math
"""Contains the logic for automatic model building (i.e. stepwise regression)."""
"""Contains the logic for automatic model building
(i.e. stepwise regression)."""

from abc import ABC, abstractmethod

@@ -29,10 +30,11 @@ def __str__(self):
return "{} | {}".format(type(self).__name__, self._score)

def compare(self, other):
"""Return true if self is better than other based on 'higher_is_better'."""
assert(type(self) is type(other))
"""Return true if self is better than other
based on 'higher_is_better'."""
assert isinstance(self, type(other))
# make sure we are not comparing different types of scores

if self.higher_is_better:
return self._score < other._score
else:
@@ -42,7 +44,7 @@ def compare(self, other):
class RSquared(Score):

def __init__(self, model, adjusted=False):
self.adjusted=adjusted
self.adjusted = adjusted

super(RSquared, self).__init__(
model=model,
@@ -51,7 +53,7 @@ def __init__(self, model, adjusted=False):

def __str__(self):
return "R^2 ({}adjusted) | {}".format(
"" if self.adjusted else "un",
"" if self.adjusted else "un",
self._score,
)

@@ -147,7 +149,8 @@ def compute(self):
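# The return below matches the textbook Bayesian information criterion
# (an inference from the formula, since the enclosing class is collapsed
# here): BIC = ln(n) * p - 2 * ln(L), with n observations, p parameters,
# and likelihood L.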

return math.log(n) * p - 2 * log_likelihood

"""All metrics that are supported by default."""

"""All metrics that are supported by default."""
_metrics = dict(
r_squared=RSquared,
r_squared_adjusted=lambda model: RSquared(model=model, adjusted=True),
@@ -167,9 +170,9 @@ def stepwise(
verbose=False,
):
"""Perform forward or backward stepwise regression.

Arguments:
full_model - A model object that contains all of the terms to be
full_model - A model object that contains all of the terms to be
considered for the procedure.
metric_name - A string containing the name of the metric to use in
the procedure. Options include: "r_squared", "r_squared_adjusted",
@@ -187,7 +190,7 @@
Default is False.

Returns:
A dict summarizing the search, including (at least) the search
direction ("forward") and the best metric found ("metric").
"""

if data is not None:
@@ -200,10 +203,13 @@
data = full_model.training_data

if ex_terms is None or re_term is None:
raise AssertionError("The full model must be fit prior to undergoing a stepwise procedure.")
raise AssertionError(
"The full model must be fit prior to \
undergoing a stepwise procedure.")

if metric_name not in _metrics:
raise KeyError("Metric '{}' not supported. The following metrics are supported: {}".format(
raise KeyError("Metric '{}' not supported. The \
following metrics are supported: {}".format(
metric_name,
list(_metrics.keys())
))
@@ -218,13 +224,12 @@
best_model = full_model

best_metric = metric_func(best_model)


while len(ex_term_list) > 0:
best_potential_metric = metric_func(None)
best_potential_model = None
best_idx = None

if forward and not naive:
ex_term_list_expression = None
for t in ex_term_list:
@@ -233,9 +238,9 @@
else:
ex_term_list_expression = ex_term_list_expression + t
# Find all terms that do not depend on other terms
leaves = set(term for term in ex_term_list if not \
term.contains(ex_term_list_expression - term))
leaves = set(term for term in ex_term_list if not
term.contains(ex_term_list_expression - term))

for i, term in enumerate(ex_term_list):
try:
if forward:
@@ -244,7 +249,7 @@
if term not in leaves:
continue
potential_model = LinearModel(
best_model.given_ex + term,
best_model.given_ex + term,
re_term,
)
else:
@@ -268,7 +273,8 @@
if verbose:
print(potential_model)
print(potential_metric)
print("Current best potential model" if best_idx == i else "Not current best potential")
print("Current best potential model" if best_idx ==
i else "Not current best potential")
print()

except np.linalg.linalg.LinAlgError:
@@ -278,19 +284,21 @@
best_metric = best_potential_metric
best_model = best_potential_model
if verbose:
print("!!! New model found. Now including", ex_term_list[best_idx])
print(
"!!! New model found. Now including",
ex_term_list[best_idx])
print()
del ex_term_list[best_idx]
else:
if verbose:
print("!!! No potential models better than prior. Exiting search.")
print("!!! No potential models better \
than prior. Exiting search.")
print()
break
else:
if verbose:
print("!!! Exhausted all potential terms. None left to consider.")


return dict(
forward=forward,
metric=best_metric,
76 changes: 41 additions & 35 deletions salmon/comparison.py
@@ -4,9 +4,10 @@
import numpy as np
import pandas as pd

def anova(model1, model2 = None):

def anova(model1, model2=None):
"""Perform inference by comparing two models.

User-facing function to execute an Analysis of Variance for one or two
models. If only one model is given, a general F-test is executed
on all of the coefficients. If two models are given, a partial
@@ -27,8 +28,11 @@ def anova(model1, model2 = None):
elif is_subset(model2, model1):
return _anova_models(model2, model1)
else:
raise Exception("Parameters must either be one model or two models where one is a subset of the other.")

raise Exception(
"Parameters must either be one model or two\
models where one is a subset of the other.")
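# A hypothetical usage sketch (variable names are illustrative, not
# taken from this code):
#
#   full = LinearModel(X1 + X2, Y)
#   full.fit(df)
#   reduced = LinearModel(X1, Y)
#   reduced.fit(df)
#   anova(full, reduced)  # partial F-test on the extra terms
#   anova(full)           # global F-test on all coefficients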


def is_subset(model1, model2):
"""Checks if model1 contains all the terms of model2. In other words,
checks if model2 is a subset of model1.
@@ -37,18 +41,19 @@ def is_subset(model1, model2):
model1 - A Model object that has been fit on some data.
model2 - A Model object that has been fit on some data.

Returns:
Returns:
A boolean value that is True if model2 is a subset of model1,
False if model2 is not a subset of model1.
False if model2 is not a subset of model1.
"""
if not model1.given_re.__sim__(model2.given_re):
# Models should both have the same response variable
return False

terms1 = set(model1.ex.get_terms())
terms2 = set(model2.ex.get_terms())
return terms2.issubset(terms1)
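# For example (illustrative, not from this code): if model_a regresses
# Y on {X1, X2} and model_b regresses Y on {X1}, then
# is_subset(model_a, model_b) is True and is_subset(model_b, model_a)
# is False.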


def _calc_stats(numer_ss, numer_df, denom_ss, denom_df):
"""Given the appropriate sum of squares for the numerator and the mean sum
of squares for the denominator (with respective degrees of freedom) this
@@ -62,7 +67,7 @@ def _calc_stats(numer_ss, numer_df, denom_ss, denom_df):

Returns:
A tuple of two values: element 0 contains the F statistic
calculated, and element 1 contains the associated p-value for the
generated F-statistic.
"""
@@ -72,6 +77,7 @@ def _calc_stats(numer_ss, numer_df, denom_ss, denom_df):
p_val = f.sf(f_val, numer_df, denom_df)
return f_val, p_val


def _process_term(orig_model, term):
"""Obtains needed sum of squared residuals of a model fitted without a
specified term/coefficient.
@@ -88,6 +94,7 @@ def _process_term(orig_model, term):
new_model.fit(orig_model.training_data)
return new_model.get_sse(), new_model.get_ssr()


def _extract_dfs(model, dict_out=False):
"""Obtains the different degrees of freedom for a model in reference to an
F-test.
@@ -114,6 +121,7 @@
else:
return reg_df, error_df, total_df
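# Under the usual OLS accounting (an assumption here; the computation
# itself is not shown above): with n observations and p regression
# terms, reg_df = p, error_df = n - p - 1, and total_df = n - 1, so
# reg_df + error_df == total_df.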


def _anova_terms(model):
"""Perform a global F-test by analyzing all possible models when you leave
one coefficient out while fitting.
@@ -127,27 +135,27 @@ def _anova_terms(model):
the associated tests performed.
"""
full_reg_df, full_error_df, total_df = _extract_dfs(model)

# Full model values
full_sse = model.get_sse() # sum of squared errors
full_ssr = model.get_ssr() # sum of squares explained by model
full_sst = model.get_sst()
full_sst = model.get_sst()

global_f_val, global_p_val = _calc_stats(
full_ssr,
full_reg_df,
full_sse,
full_error_df,
)

# Calculate the general terms now
indices = ["Global Test"]
sses = [full_sse]
ssrs = [full_ssr]
f_vals = [global_f_val]
p_vals = [global_p_val]
dfs = [full_reg_df]

terms = model.ex.get_terms()
for term in terms:
term_df = term.get_dof()
@@ -164,22 +172,23 @@ def _anova_terms(model):
dfs.append(term_df)
f_vals.append(reduced_f_val)
p_vals.append(reduced_p_val)

# Finish off the dataframe's values
indices.append("Error")
sses.append("")
ssrs.append("")
dfs.append(full_error_df)
f_vals.append("")
p_vals.append("")

return pd.DataFrame({
"DF" : dfs,
"SS Err.": sses,
"SS Reg." : ssrs,
"F" : f_vals,
"p" : p_vals
}, index = indices, columns = ["DF", "SS Err.", "SS Reg.", "F", "p"])
"DF": dfs,
"SS Err.": sses,
"SS Reg.": ssrs,
"F": f_vals,
"p": p_vals
}, index=indices, columns=["DF", "SS Err.", "SS Reg.", "F", "p"])


def _anova_models(full_model, reduced_model):
"""Performs a partial F-test to compare two models.
@@ -197,34 +206,31 @@
"""
full_label = str(full_model)
reduced_label = str(reduced_model)

f_reg_df, f_error_df, f_total_df = _extract_dfs(full_model)
r_reg_df, r_error_df, r_total_df = _extract_dfs(reduced_model)

f_sse, f_ssr = full_model.get_sse(), full_model.get_ssr()
r_sse, r_ssr = reduced_model.get_sse(), reduced_model.get_ssr()
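# The statistic computed below is the standard partial F-test: the
# extra error explained by the full model, per extra degree of freedom,
# over the full model's mean squared error:
#
#   F = ((r_sse - f_sse) / (r_error_df - f_error_df)) / (f_sse / f_error_df)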

f_val, p_val = _calc_stats(
r_sse - f_sse,
r_error_df - f_error_df,
f_sse,
f_error_df,
)

indices = ["Full Model", "- Reduced Model", "Error"]
df = [f_reg_df, f_reg_df - r_reg_df, f_error_df]
ssrs = [f_ssr, r_ssr, ""]
ssrs = [f_ssr, r_ssr, ""]
sses = [f_sse, r_sse, ""]
f = ["", f_val, ""]
p = ["", p_val, ""]

return pd.DataFrame({
"DF" : df,
"SS Err." : sses,
"SS Reg." : ssrs,
"F" : f,
"p" : p},
index = indices, columns = ["DF", "SS Err.", "SS Reg.", "F", "p"])



"DF": df,
"SS Err.": sses,
"SS Reg.": ssrs,
"F": f,
"p": p},
index=indices, columns=["DF", "SS Err.", "SS Reg.", "F", "p"])