52 changes: 30 additions & 22 deletions salmon/building.py
@@ -1,6 +1,7 @@
import numpy as np
import math
"""Contains the logic for automatic model building (i.e. stepwise regression)."""
"""Contains the logic for automatic model building
(i.e. stepwise regression)."""

from abc import ABC, abstractmethod

@@ -29,10 +30,11 @@ def __str__(self):
return "{} | {}".format(type(self).__name__, self._score)

def compare(self, other):
"""Return true if self is better than other based on 'higher_is_better'."""
assert(type(self) is type(other))
"""Return true if self is better than other
based on 'higher_is_better'."""
assert isinstance(self, type(other))
# make sure we are not comparing different types of scores

if self.higher_is_better:
return self._score < other._score
else:
@@ -42,7 +44,7 @@ def compare(self, other):
class RSquared(Score):

def __init__(self, model, adjusted=False):
self.adjusted=adjusted
self.adjusted = adjusted

super(RSquared, self).__init__(
model=model,
@@ -51,7 +53,7 @@ def __init__(self, model, adjusted=False):

def __str__(self):
return "R^2 ({}adjusted) | {}".format(
"" if self.adjusted else "un",
"" if self.adjusted else "un",
self._score,
)

@@ -147,7 +149,8 @@ def compute(self):
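# The return below matches the textbook Bayesian information criterion
# (an inference from the formula, since the enclosing class is collapsed
# here): BIC = ln(n) * p - 2 * ln(L), with n observations, p parameters,
# and likelihood L.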

return math.log(n) * p - 2 * log_likelihood

"""All metrics that are supported by default."""

"""All metrics that are supported by default."""
_metrics = dict(
r_squared=RSquared,
r_squared_adjusted=lambda model: RSquared(model=model, adjusted=True),
@@ -167,9 +170,9 @@ def stepwise(
verbose=False,
):
"""Perform forward or backward stepwise regression.

Arguments:
full_model - A model object that contains all of the terms to be
full_model - A model object that contains all of the terms to be
considered for the procedure.
metric_name - A string containing the name of the metric to use in
the procedure. Options include: "r_squared", "r_squared_adjusted",
@@ -187,7 +190,7 @@
Default is False.

Returns:
A dict summarizing the search, including (at least) the search
direction ("forward") and the best metric found ("metric").
"""

if data is not None:
@@ -200,10 +203,13 @@
data = full_model.training_data

if ex_terms is None or re_term is None:
raise AssertionError("The full model must be fit prior to undergoing a stepwise procedure.")
raise AssertionError(
"The full model must be fit prior to \
undergoing a stepwise procedure.")

if metric_name not in _metrics:
raise KeyError("Metric '{}' not supported. The following metrics are supported: {}".format(
raise KeyError("Metric '{}' not supported. The \
following metrics are supported: {}".format(
metric_name,
list(_metrics.keys())
))
@@ -218,13 +224,12 @@
best_model = full_model

best_metric = metric_func(best_model)


while len(ex_term_list) > 0:
best_potential_metric = metric_func(None)
best_potential_model = None
best_idx = None

if forward and not naive:
ex_term_list_expression = None
for t in ex_term_list:
@@ -233,9 +238,9 @@
else:
ex_term_list_expression = ex_term_list_expression + t
# Find all terms that do not depend on other terms
leaves = set(term for term in ex_term_list if not \
term.contains(ex_term_list_expression - term))
leaves = set(term for term in ex_term_list if not
term.contains(ex_term_list_expression - term))

for i, term in enumerate(ex_term_list):
try:
if forward:
@@ -244,7 +249,7 @@
if term not in leaves:
continue
potential_model = LinearModel(
best_model.given_ex + term,
best_model.given_ex + term,
re_term,
)
else:
@@ -268,7 +273,8 @@
if verbose:
print(potential_model)
print(potential_metric)
print("Current best potential model" if best_idx == i else "Not current best potential")
print("Current best potential model" if best_idx ==
i else "Not current best potential")
print()

except np.linalg.linalg.LinAlgError:
@@ -278,19 +284,21 @@
best_metric = best_potential_metric
best_model = best_potential_model
if verbose:
print("!!! New model found. Now including", ex_term_list[best_idx])
print(
"!!! New model found. Now including",
ex_term_list[best_idx])
print()
del ex_term_list[best_idx]
else:
if verbose:
print("!!! No potential models better than prior. Exiting search.")
print("!!! No potential models better \
than prior. Exiting search.")
print()
break
else:
if verbose:
print("!!! Exhausted all potential terms. None left to consider.")


return dict(
forward=forward,
metric=best_metric,
76 changes: 41 additions & 35 deletions salmon/comparison.py
@@ -4,9 +4,10 @@
import numpy as np
import pandas as pd

def anova(model1, model2 = None):

def anova(model1, model2=None):
"""Perform inference by comparing two models.

User-facing function to execute an Analysis of Variance for one or two
models. If only one model is given, a general F-test is executed
on all of the coefficients. If two models are given, a partial
@@ -27,8 +28,11 @@ def anova(model1, model2 = None):
elif is_subset(model2, model1):
return _anova_models(model2, model1)
else:
raise Exception("Parameters must either be one model or two models where one is a subset of the other.")

raise Exception(
"Parameters must either be one model or two\
models where one is a subset of the other.")
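# A hypothetical usage sketch (variable names are illustrative, not
# taken from this code):
#
#   full = LinearModel(X1 + X2, Y)
#   full.fit(df)
#   reduced = LinearModel(X1, Y)
#   reduced.fit(df)
#   anova(full, reduced)  # partial F-test on the extra terms
#   anova(full)           # global F-test on all coefficients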


def is_subset(model1, model2):
"""Checks if model1 contains all the terms of model2. In other words,
checks if model2 is a subset of model1.
@@ -37,18 +41,19 @@ def is_subset(model1, model2):
model1 - A Model object that has been fit on some data.
model2 - A Model object that has been fit on some data.

Returns:
Returns:
A boolean value that is True if model2 is a subset of model1,
False if model2 is not a subset of model1.
False if model2 is not a subset of model1.
"""
if not model1.given_re.__sim__(model2.given_re):
# Models should both have the same response variable
return False

terms1 = set(model1.ex.get_terms())
terms2 = set(model2.ex.get_terms())
return terms2.issubset(terms1)
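# For example (illustrative, not from this code): if model_a regresses
# Y on {X1, X2} and model_b regresses Y on {X1}, then
# is_subset(model_a, model_b) is True and is_subset(model_b, model_a)
# is False.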


def _calc_stats(numer_ss, numer_df, denom_ss, denom_df):
"""Given the appropriate sum of squares for the numerator and the mean sum
of squares for the denominator (with respective degrees of freedom) this
@@ -62,7 +67,7 @@ def _calc_stats(numer_ss, numer_df, denom_ss, denom_df):

Returns:
A tuple of two values: element 0 contains the F statistic
calculated, and element 1 contains the associated p-value for the
generated F-statistic.
"""
@@ -72,6 +77,7 @@ def _calc_stats(numer_ss, numer_df, denom_ss, denom_df):
p_val = f.sf(f_val, numer_df, denom_df)
return f_val, p_val


def _process_term(orig_model, term):
"""Obtains needed sum of squared residuals of a model fitted without a
specified term/coefficient.
@@ -88,6 +94,7 @@ def _process_term(orig_model, term):
new_model.fit(orig_model.training_data)
return new_model.get_sse(), new_model.get_ssr()


def _extract_dfs(model, dict_out=False):
"""Obtains the different degrees of freedom for a model in reference to an
F-test.
@@ -114,6 +121,7 @@
else:
return reg_df, error_df, total_df
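# Under the usual OLS accounting (an assumption here; the computation
# itself is not shown above): with n observations and p regression
# terms, reg_df = p, error_df = n - p - 1, and total_df = n - 1, so
# reg_df + error_df == total_df.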


def _anova_terms(model):
"""Perform a global F-test by analyzing all possible models when you leave
one coefficient out while fitting.
@@ -127,27 +135,27 @@ def _anova_terms(model):
the associated tests performed.
"""
full_reg_df, full_error_df, total_df = _extract_dfs(model)

# Full model values
full_sse = model.get_sse() # sum of squared errors
full_ssr = model.get_ssr() # sum of squares explained by model
full_sst = model.get_sst()
full_sst = model.get_sst()

global_f_val, global_p_val = _calc_stats(
full_ssr,
full_reg_df,
full_sse,
full_error_df,
)

# Calculate the general terms now
indices = ["Global Test"]
sses = [full_sse]
ssrs = [full_ssr]
f_vals = [global_f_val]
p_vals = [global_p_val]
dfs = [full_reg_df]

terms = model.ex.get_terms()
for term in terms:
term_df = term.get_dof()
@@ -164,22 +172,23 @@ def _anova_terms(model):
dfs.append(term_df)
f_vals.append(reduced_f_val)
p_vals.append(reduced_p_val)

# Finish off the dataframe's values
indices.append("Error")
sses.append("")
ssrs.append("")
dfs.append(full_error_df)
f_vals.append("")
p_vals.append("")

return pd.DataFrame({
"DF" : dfs,
"SS Err.": sses,
"SS Reg." : ssrs,
"F" : f_vals,
"p" : p_vals
}, index = indices, columns = ["DF", "SS Err.", "SS Reg.", "F", "p"])
"DF": dfs,
"SS Err.": sses,
"SS Reg.": ssrs,
"F": f_vals,
"p": p_vals
}, index=indices, columns=["DF", "SS Err.", "SS Reg.", "F", "p"])


def _anova_models(full_model, reduced_model):
"""Performs a partial F-test to compare two models.
@@ -197,34 +206,31 @@
"""
full_label = str(full_model)
reduced_label = str(reduced_model)

f_reg_df, f_error_df, f_total_df = _extract_dfs(full_model)
r_reg_df, r_error_df, r_total_df = _extract_dfs(reduced_model)

f_sse, f_ssr = full_model.get_sse(), full_model.get_ssr()
r_sse, r_ssr = reduced_model.get_sse(), reduced_model.get_ssr()
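# The statistic computed below is the standard partial F-test: the
# extra error explained by the full model, per extra degree of freedom,
# over the full model's mean squared error:
#
#   F = ((r_sse - f_sse) / (r_error_df - f_error_df)) / (f_sse / f_error_df)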

f_val, p_val = _calc_stats(
r_sse - f_sse,
r_error_df - f_error_df,
f_sse,
f_error_df,
)

indices = ["Full Model", "- Reduced Model", "Error"]
df = [f_reg_df, f_reg_df - r_reg_df, f_error_df]
ssrs = [f_ssr, r_ssr, ""]
ssrs = [f_ssr, r_ssr, ""]
sses = [f_sse, r_sse, ""]
f = ["", f_val, ""]
p = ["", p_val, ""]

return pd.DataFrame({
"DF" : df,
"SS Err." : sses,
"SS Reg." : ssrs,
"F" : f,
"p" : p},
index = indices, columns = ["DF", "SS Err.", "SS Reg.", "F", "p"])



"DF": df,
"SS Err.": sses,
"SS Reg.": ssrs,
"F": f,
"p": p},
index=indices, columns=["DF", "SS Err.", "SS Reg.", "F", "p"])