diff --git a/salmon/building.py b/salmon/building.py index 994ae63..7cea22a 100644 --- a/salmon/building.py +++ b/salmon/building.py @@ -1,6 +1,7 @@ import numpy as np import math -"""Contains the logic for automatic model building (i.e. stepwise regression).""" +"""Contains the logic for automatic model building +(i.e. stepwise regression).""" from abc import ABC, abstractmethod @@ -29,10 +30,11 @@ def __str__(self): return "{} | {}".format(type(self).__name__, self._score) def compare(self, other): - """Return true if self is better than other based on 'higher_is_better'.""" - assert(type(self) is type(other)) + """Return true if self is better than other + based on 'higher_is_better'.""" + assert(isinstance(self, type(other))) # make sure we are not comparing different types of scores - + if self.higher_is_better: return self._score < other._score else: @@ -42,7 +44,7 @@ def compare(self, other): class RSquared(Score): def __init__(self, model, adjusted=False): - self.adjusted=adjusted + self.adjusted = adjusted super(RSquared, self).__init__( model=model, @@ -51,7 +53,7 @@ def __init__(self, model, adjusted=False): def __str__(self): return "R^2 ({}adjusted) | {}".format( - "" if self.adjusted else "un", + "" if self.adjusted else "un", self._score, ) @@ -147,7 +149,8 @@ def compute(self): return math.log(n) * p - 2 * log_likelihood -"""All metrics that are supported by default.""" + +"""All metrics that are supported by default.""" _metrics = dict( r_squared=RSquared, r_squared_adjusted=lambda model: RSquared(model=model, adjusted=True), @@ -167,9 +170,9 @@ def stepwise( verbose=False, ): """Perform forward or backward stepwise regression. - + Arguments: - full_model - A model object that contains all of the terms to be + full_model - A model object that contains all of the terms to be considered for the procedure. metric_name - A string containing the name of the metric to use in the procedure. Options include: "r_squared", "r_squared_adjusted", @@ -187,7 +190,7 @@ def stepwise( Default is False. Returns: - + """ if data is not None: @@ -200,10 +203,13 @@ def stepwise( data = full_model.training_data if ex_terms is None or re_term is None: - raise AssertionError("The full model must be fit prior to undergoing a stepwise procedure.") + raise AssertionError( + "The full model must be fit prior to \ + undergoing a stepwise procedure.") if metric_name not in _metrics: - raise KeyError("Metric '{}' not supported. The following metrics are supported: {}".format( + raise KeyError("Metric '{}' not supported. The \ + following metrics are supported: {}".format( metric_name, list(_metrics.keys()) )) @@ -218,13 +224,12 @@ def stepwise( best_model = full_model best_metric = metric_func(best_model) - while len(ex_term_list) > 0: best_potential_metric = metric_func(None) best_potential_model = None best_idx = None - + if forward and not naive: ex_term_list_expression = None for t in ex_term_list: @@ -233,9 +238,9 @@ def stepwise( else: ex_term_list_expression = ex_term_list_expression + t # Find all terms that do not depend on other terms - leaves = set(term for term in ex_term_list if not \ - term.contains(ex_term_list_expression - term)) - + leaves = set(term for term in ex_term_list if not + term.contains(ex_term_list_expression - term)) + for i, term in enumerate(ex_term_list): try: if forward: @@ -244,7 +249,7 @@ def stepwise( if term not in leaves: continue potential_model = LinearModel( - best_model.given_ex + term, + best_model.given_ex + term, re_term, ) else: @@ -268,7 +273,8 @@ def stepwise( if verbose: print(potential_model) print(potential_metric) - print("Current best potential model" if best_idx == i else "Not current best potential") + print("Current best potential model" if best_idx == + i else "Not current best potential") print() except np.linalg.linalg.LinAlgError: @@ -278,19 +284,21 @@ def stepwise( best_metric = best_potential_metric best_model = best_potential_model if verbose: - print("!!! New model found. Now including", ex_term_list[best_idx]) + print( + "!!! New model found. Now including", + ex_term_list[best_idx]) print() del ex_term_list[best_idx] else: if verbose: - print("!!! No potential models better than prior. Exiting search.") + print("!!! No potential models better \ + than prior. Exiting search.") print() break else: if verbose: print("!!! Exhausted all potential terms. None left to consider.") - return dict( forward=forward, metric=best_metric, diff --git a/salmon/comparison.py b/salmon/comparison.py index 70c8e55..5d65f91 100644 --- a/salmon/comparison.py +++ b/salmon/comparison.py @@ -4,9 +4,10 @@ import numpy as np import pandas as pd -def anova(model1, model2 = None): + +def anova(model1, model2=None): """Perform inference by comparing two models. - + User-facing function to execute an Analysis of Variance for one or two models. Should only model be given, then a general F-test will be executed on all of the coefficients. Should two models be given, then a partial @@ -27,8 +28,11 @@ def anova(model1, model2 = None): elif is_subset(model2, model1): return _anova_models(model2, model1) else: - raise Exception("Parameters must either be one model or two models where one is a subset of the other.") - + raise Exception( + "Parameters must either be one model or two\ + models where one is a subset of the other.") + + def is_subset(model1, model2): """Checks if model1 contains all the terms of model2. In other words, checks if model2 is a subset of model1. @@ -37,18 +41,19 @@ def is_subset(model1, model2): model1 - A Model object that has been fit on some data. model2 - A Model object that has been fit on some data. - Returns: + Returns: A boolean value that is True if model2 is a subset of model1, - False if model2 is not a subset of model1. + False if model2 is not a subset of model1. """ if not model1.given_re.__sim__(model2.given_re): # Models should both have the same response variable return False - + terms1 = set(model1.ex.get_terms()) terms2 = set(model2.ex.get_terms()) return terms2.issubset(terms1) + def _calc_stats(numer_ss, numer_df, denom_ss, denom_df): """Given the appropriate sum of squares for the numerator and the mean sum of squares for the denominator (with respective degrees of freedom) this @@ -62,7 +67,7 @@ def _calc_stats(numer_ss, numer_df, denom_ss, denom_df): Returns: A tuple of three values: Element 0 contains the mean sum of squares for - the numerator, element 1 contains the F statistic calculated, and + the numerator, element 1 contains the F statistic calculated, and element 2 contains the associated p-value for the generated F-statistic. """ @@ -72,6 +77,7 @@ def _calc_stats(numer_ss, numer_df, denom_ss, denom_df): p_val = f.sf(f_val, numer_df, denom_df) return f_val, p_val + def _process_term(orig_model, term): """Obtains needed sum of squared residuals of a model fitted without a specified term/coefficient. @@ -88,6 +94,7 @@ def _process_term(orig_model, term): new_model.fit(orig_model.training_data) return new_model.get_sse(), new_model.get_ssr() + def _extract_dfs(model, dict_out=False): """Obtains the different degrees of freedom for a model in reference to an F-test. @@ -114,6 +121,7 @@ def _extract_dfs(model, dict_out=False): else: return reg_df, error_df, total_df + def _anova_terms(model): """Perform a global F-test by analyzing all possible models when you leave one coefficient out while fitting. @@ -127,19 +135,19 @@ def _anova_terms(model): the associated tests performed. """ full_reg_df, full_error_df, total_df = _extract_dfs(model) - + # Full model values full_sse = model.get_sse() # sum of squared errors full_ssr = model.get_ssr() # sum of squares explained by model - full_sst = model.get_sst() - + full_sst = model.get_sst() + global_f_val, global_p_val = _calc_stats( full_ssr, full_reg_df, full_sse, full_error_df, ) - + # Calculate the general terms now indices = ["Global Test"] sses = [full_ssr] @@ -147,7 +155,7 @@ def _anova_terms(model): f_vals = [global_f_val] p_vals = [global_p_val] dfs = [full_reg_df] - + terms = model.ex.get_terms() for term in terms: term_df = term.get_dof() @@ -164,7 +172,7 @@ def _anova_terms(model): dfs.append(term_df) f_vals.append(reduced_f_val) p_vals.append(reduced_p_val) - + # Finish off the dataframe's values indices.append("Error") sses.append("") @@ -172,14 +180,15 @@ def _anova_terms(model): dfs.append(full_error_df) f_vals.append("") p_vals.append("") - + return pd.DataFrame({ - "DF" : dfs, - "SS Err.": sses, - "SS Reg." : ssrs, - "F" : f_vals, - "p" : p_vals - }, index = indices, columns = ["DF", "SS Err.", "SS Reg.", "F", "p"]) + "DF": dfs, + "SS Err.": sses, + "SS Reg.": ssrs, + "F": f_vals, + "p": p_vals + }, index=indices, columns=["DF", "SS Err.", "SS Reg.", "F", "p"]) + def _anova_models(full_model, reduced_model): """Performs a partial F-test to compare two models. @@ -197,34 +206,31 @@ def _anova_models(full_model, reduced_model): """ full_label = str(full_model) reduced_label = str(reduced_model) - + f_reg_df, f_error_df, f_total_df = _extract_dfs(full_model) r_reg_df, r_error_df, r_total_df = _extract_dfs(reduced_model) - + f_sse, f_ssr = full_model.get_sse(), full_model.get_ssr() r_sse, r_ssr = reduced_model.get_sse(), reduced_model.get_ssr() - + f_val, p_val = _calc_stats( r_sse - f_sse, r_error_df - f_error_df, f_sse, f_error_df, ) - + indices = ["Full Model", "- Reduced Model", "Error"] df = [f_reg_df, f_reg_df - r_reg_df, f_error_df] - ssrs = [f_ssr, r_ssr, ""] + ssrs = [f_ssr, r_ssr, ""] sses = [f_sse, r_sse, ""] f = ["", f_val, ""] p = ["", p_val, ""] return pd.DataFrame({ - "DF" : df, - "SS Err." : sses, - "SS Reg." : ssrs, - "F" : f, - "p" : p}, - index = indices, columns = ["DF", "SS Err.", "SS Reg.", "F", "p"]) - - - + "DF": df, + "SS Err.": sses, + "SS Reg.": ssrs, + "F": f, + "p": p}, + index=indices, columns=["DF", "SS Err.", "SS Reg.", "F", "p"]) diff --git a/salmon/expression.py b/salmon/expression.py index 4077bfb..9255725 100644 --- a/salmon/expression.py +++ b/salmon/expression.py @@ -7,12 +7,13 @@ from abc import ABC, abstractmethod from scipy.special import binom -from . import transformation as _t +from . import transformation as _t _supported_encodings = ['one-hot'] -# if set to True, has expression representation equivalent to __str__ representation -# useful for debugging +"""if set to True, has expression representation + equivalent to __str__ representation + useful for debugging""" STR_AS_REPR = False @@ -29,12 +30,13 @@ def __new__(cls, input_array, columns=None): def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments - if obj is None: return + if obj is None: + return self.columns = getattr(obj, 'columns', None) - + def get_column(self, colname): for i, column in enumerate(self.columns): - if column == colname: + if column == colname: return self[:, i] raise KeyError("Column %s not in LightDataFrame." % colname) @@ -43,8 +45,8 @@ def get_column(self, colname): class Expression(ABC): """The parent abstract class that all subsequent representations of coefficients stem from and Model objects utilize.""" - - def __init__(self, scale = 1): + + def __init__(self, scale=1): """Create an expression. Cannot be done directly, only through inheritance. @@ -57,7 +59,7 @@ def __init__(self, scale = 1): """ self.scale = scale - + @abstractmethod def __str__(self): """Represent an Expression object as a String utilizing standard @@ -70,7 +72,7 @@ def __repr__(self): return self.__str__() else: return object.__repr__(self) - + def __eq__(self, other): """Check if an Expression is equivalent to another object. @@ -84,7 +86,7 @@ def __eq__(self, other): if isinstance(other, Expression): return self.scale == other.scale return False - + def __sim__(self, other): """Check if an Expression is similar to another object. This is done for the purposes of algebraic simplification. @@ -103,7 +105,7 @@ def __sim__(self, other): other_copy.scale = 1 return self_copy == other_copy return False - + def __hash__(self): """Hash an Expression for the purposes of storing Expressions in sets and dictionaries. @@ -116,8 +118,8 @@ def __hash__(self): @abstractmethod def copy(self): """Creates a deep copy of an expression""" - pass - + pass + @abstractmethod def interpret(self, data): """Cast a Var object or any nested Var objects in a functional manner @@ -128,7 +130,7 @@ def interpret(self, data): Returns: A similar object that is either Quantitative or Categorical object. - """ + """ pass def transform(self, transformation): @@ -143,15 +145,19 @@ def transform(self, transformation): """ if isinstance(transformation, str): if transformation in _t._default_transformations: - transformation = _t._default_transformations[transformation](None) + transformation = _t._default_transformations[transformation]( + None) else: - raise Exception("Transformation specified is not a default function.") + raise Exception( + "Transformation specified is not a default function.") elif not isinstance(transformation, _t.Transformation): - raise Exception("A Transformation or String are needed when transforming a variable.") - + raise Exception( + "A Transformation or String are needed when \ + transforming a variable.") + self_copy = self.copy() return TransVar(self_copy, transformation) - + def __add__(self, other): """Form a combination of two Expressions by the addition operator (+). @@ -183,17 +189,19 @@ def __add__(self, other): # single term expressions return Combination((self, other)) else: - raise Exception("Expressions do not support addition with the given arguments.") - + raise Exception( + "Expressions do not support addition \ + with the given arguments.") + def __radd__(self, other): """See __add__. """ return self.__add__(other) - + def __sub__(self, other): """Subtract one Expression object from another. This is a special case of addition. See __add__. """ return self + -1 * other - + def __mul__(self, other): """Multiply two Expression objects together. When possible, simplification will be done. This is called with the multiplication @@ -221,15 +229,18 @@ def __mul__(self, other): return other.__mul__(self) elif isinstance(other, Combination): self_copy = self.copy() - return Combination(terms=(self_copy*t for t in other.get_terms())) + return Combination( + terms=(self_copy * t for t in other.get_terms())) elif isinstance(other, Expression): - if self.__sim__(other): # Consolidate + if self.__sim__(other): # Consolidate return self.copy() ** 2 - else: # Explicit interaction + else: # Explicit interaction return Interaction(terms=(self, other)) else: - raise Exception("Expressions do not support multiplication with the given arguments.") - + raise Exception( + "Expressions do not support multiplication \ + with the given arguments.") + def __rmul__(self, other): """See __mul__.""" if isinstance(other, (int, float)): @@ -251,13 +262,13 @@ def __truediv__(self, other): def __rtruediv__(self, other): """See __truediv__. """ return other * (self ** -1) - - def __pow__(self, other): - """Raise an Expression object to a constant power. - + + def __pow__(self, other): + """Raise an Expression object to a constant power. + Arguments: other - A real value. - + Returns: An Expression object that represents the original object raised to a constant power. More than likely this will either be a PowerVar @@ -270,7 +281,7 @@ def __pow__(self, other): return self elif other == 0: return 1 - elif isinstance(other, (int, float)): # Proper power + elif isinstance(other, (int, float)): # Proper power return PowerVar(self, other) else: raise Exception("Can only raise to a constant power.") @@ -282,26 +293,27 @@ def __and__(self, other): def __xor__(self, other): """Returns an interaction with main effects up to a specified power. - + If the base object is not a Combination, then the default __pow__ logic is used.""" return self ** other # see Combination's __xor__ for other logic - + def descale(self): """Remove scaling on an Expression. Done in a functional manner so original object is unaffected.""" ret_exp = self.copy() ret_exp._descale() return ret_exp - - def _descale(self): + + def _descale(self): """Helper function for descaling Expressions.""" - # Overwrite on expression containers such as Interactions and Combinations + # Overwrite on expression containers such as Interactions and + # Combinations self.scale = 1 - + @abstractmethod - def evaluate(self, data, fit = True): + def evaluate(self, data, fit=True): """Given data, apply the appropriate transformations, combinations, and interactions. @@ -310,7 +322,7 @@ def evaluate(self, data, fit = True): Variable objects. fit - A flag to reference when evaluating the data to know when to overwrite Categorical levels. - + Returns: An appropriate DataFrame consisting of specified columns for each term in the Expression. @@ -318,34 +330,34 @@ def evaluate(self, data, fit = True): # Transform the incoming data depending on what type of variable the # data is represented by pass - + def reduce(self): """Obtain the base Quantitative, Categorical, Constant, and Varable terms. - + Returns: A dictionary mapping the respective types of terms to a set collection of the unique terms present. """ return self._reduce({ - "Q":set(), - "C":set(), - "V":set(), + "Q": set(), + "C": set(), + "V": set(), "Constant": None, }) - + @abstractmethod def _reduce(self, ret_dict): """A helper method for the reduce function to allow for recursion.""" # Reduce an expression to a dictionary containing lists of unique # Quantitative and Categorical Variables pass - + @abstractmethod def get_terms(self): """Return a list of the top level terms.""" pass - + @abstractmethod def get_dof(self): """Return the degrees of freedom for an expression (not including @@ -354,10 +366,10 @@ def get_dof(self): def untransform(self, data): """Untransform data by inverting the applied operations to it for - the purposes of plotting. - + the purposes of plotting. + Returns: - A DataFrame object with appropriately inverted columns. + A DataFrame object with appropriately inverted columns. """ return data * (1 / self.scale) @@ -365,23 +377,24 @@ def untransform_name(self): """Get the untransformed name of the model after inversions have been applied.""" return str(self * (1 / self.scale)) - + @abstractmethod def contains(self, other): """Returns true if this object is contains the other.""" pass + class Var(Expression): """A base object that represents a generic single term. If a model consists - of Y ~ X1 + X2 + X3, X2 would be a single term, as would X1 and X3. - + of Y ~ X1 + X2 + X3, X2 would be a single term, as would X1 and X3. + This is more general when compared to a Quantitative or Categorical object, as it does not impose any restrictions on the data it represents. Upon model fitting, a Var will check to see if the data it represents is - better suited for being a Quantitative variable or a Categorical one. + better suited for being a Quantitative variable or a Categorical one. """ - def __init__(self, name, scale = 1): + def __init__(self, name, scale=1): """Creates a Var object. Arguments: @@ -390,50 +403,52 @@ def __init__(self, name, scale = 1): representing. scale - A real value that will multiplicatively scale the term. """ - super().__init__(scale = scale) + super().__init__(scale=scale) self.name = name - + def __eq__(self, other): if isinstance(other, Var): if self.name == other.name: return super().__eq__(other) return False - + def __hash__(self): return hash((self.name, self.scale)) - + def __str__(self): if self.scale != 1: - return "{1}*{0}".format(self.name, self.scale) + return "{1}*{0}".format(self.name, self.scale) else: return self.name - + def copy(self): return Var(self.name, self.scale) - + def interpret(self, data): - if ('float' in data[self.name].dtype.name or \ + if ('float' in data[self.name].dtype.name or 'int' in data[self.name].dtype.name): return Quantitative(self.name, self.scale) else: return Categorical(self.name) - + def evaluate(self, data, fit=True): - raise NotImplementedError("Must call interpret prior to evaluating data for variables.") - + raise NotImplementedError( + "Must call interpret prior to evaluating data for variables.") + def _reduce(self, ret_dict): ret_dict["V"].add(self) return ret_dict - + def get_terms(self): return [self] - + def get_dof(self): return 1 - + def contains(self, other): return False # impossible for a simple variable to contain another + class TransVar(Expression): """Represents an Expression that has a Transformation object applied to it. @@ -442,8 +457,8 @@ class TransVar(Expression): pre-transformation was, the new TransVar object will be treated as a single term. """ - - def __init__(self, var, transformation, scale = 1): + + def __init__(self, var, transformation, scale=1): """Creates a TransVar object. Arguments: @@ -454,32 +469,32 @@ def __init__(self, var, transformation, scale = 1): self.scale = scale self.var = var self.transformation = transformation - + def __str__(self): base = self.transformation.compose(str(self.var)) if self.scale == 1: return base else: return "{}*{}".format(self.scale, base) - + def copy(self): return TransVar( self.var.copy(), self.transformation.copy(), self.scale, ) - + def __eq__(self, other): if isinstance(other, TransVar): return self.var == other.var and \ - self.transformation == other.transformation and \ - self.scale == other.scale - + self.transformation == other.transformation and \ + self.scale == other.scale + return False - + def __hash__(self): return hash((self.var, self.scale, self.transformation)) - + def __add__(self, other): if isinstance(other, TransVar) and self.var == other.var and \ self.transformation == other.transformation: @@ -490,16 +505,16 @@ def __add__(self, other): ) else: return Combination((self, other)) - + def interpret(self, data): self.var = self.var.interpret(data) return self - + def _descale(self): self.scale = 1 self.var._descale() - - def evaluate(self, data, fit = True): + + def evaluate(self, data, fit=True): base_data = self.var.evaluate(data, fit).sum(axis=1) transformed_data = self.scale * self.transformation.transform( values=base_data, @@ -509,18 +524,18 @@ def evaluate(self, data, fit = True): transformed_data[:, np.newaxis], columns=[str(self)], ) - + def _reduce(self, ret_dict): return self.var._reduce(ret_dict) - + def get_terms(self): return [self] - + def get_dof(self): return 1 def untransform(self, data): - unscaled = data * (1 / self.scale) + unscaled = data * (1 / self.scale) inverted = self.transformation.invert(unscaled) return self.var.untransform(inverted) @@ -532,13 +547,15 @@ def contains(self, other): return any(self.contains(other_term) for other_term in other.terms) return self.var.__eq__(other) or self.var.contains(other) - + + class PowerVar(TransVar): """A special case of the TransVar as a Power transformation can easily be distributed across terms. As such, special consideration is taken into account when multiplying two PowerVar objects. """ - def __init__(self, var, power, scale = 1): + + def __init__(self, var, power, scale=1): """Creates a PowerVar object. Arguments: @@ -551,21 +568,21 @@ def __init__(self, var, power, scale = 1): self.var.scale = 1 self.transformation = _t.Power(power) self.power = power - + def __eq__(self, other): if isinstance(other, PowerVar): return self.var == other.var and \ - self.power == other.power and \ - self.scale == other.scale + self.power == other.power and \ + self.scale == other.scale return False - + def copy(self): return PowerVar(self.var, self.power, self.scale) - + def __hash__(self): return hash((self.var, self.scale, self.transformation, self.power)) - + def __add__(self, other): if self.__sim__(other): return PowerVar( @@ -575,7 +592,7 @@ def __add__(self, other): ) else: return Combination((self, other)) - + def __mul__(self, other): if isinstance(other, PowerVar): if self.var.__sim__(other.var): @@ -598,20 +615,20 @@ def __mul__(self, other): return Constant(scalar) else: return PowerVar(self.var.copy(), power=power, scale=scalar) - + return super().__mul__(other) - + def __rmul__(self, other): return self.__mul__(other) - + def __pow__(self, other): if isinstance(other, int): return PowerVar(self.var.copy(), self.power * other) return super().__pow__(other) - + def get_terms(self): return [self] - + def get_dof(self): return 1 @@ -626,12 +643,13 @@ def contains(self, other): return self.var.contains(other) else: return self.var.__eq__(other) or self.var.contains(other) - + + class Quantitative(Var): """A base term that inherits from Var. This specifically models Quantitative terms.""" - - def __init__(self, name, scale = 1): + + def __init__(self, name, scale=1): """Creates a Quantitative object. Arguments: @@ -640,66 +658,67 @@ def __init__(self, name, scale = 1): representing. scale - A real value that will multiplicatively scale the term. """ - super().__init__(name = name, scale = scale) + super().__init__(name=name, scale=scale) self.name = name - + def copy(self): return Quantitative(self.name, self.scale) - + def interpret(self, data): return self - - def evaluate(self, data, fit = True): + + def evaluate(self, data, fit=True): transformed_data = self.scale * data[self.name].values return LightDataFrame( - transformed_data[:, np.newaxis], + transformed_data[:, np.newaxis], columns=[self.name], ) - + def _reduce(self, ret_dict): ret_dict["Q"].add(self) return ret_dict - + def get_terms(self): return [self] - + def get_dof(self): return 1 - + + class Constant(Expression): """Represents a standalone term for constant values.""" - - def __init__(self, scale = 1): + + def __init__(self, scale=1): """Create a Constant object. Arguments: scale - A real value that IS the constant value. """ self.scale = scale - + def __str__(self): return str(self.scale) - + def __eq__(self, other): if isinstance(other, Constant): return self.scale == other.scale return False - + def __sim__(self, other): return isinstance(other, Constant) - + def __hash__(self): return hash((self.scale)) - + def copy(self): return Constant(self.scale) - + def interpret(self, data): return self - + def __pow__(self, other): return Constant(self.scale ** other) - + def __mul__(self, other): if isinstance(other, Expression): ret_exp = other.copy() @@ -707,11 +726,11 @@ def __mul__(self, other): return ret_exp elif isinstance(other, (int, float)): return Constant(self.scale * other) - + def __rmul__(self, other): return self.__mul__(other) - - def evaluate(self, data, fit = True): + + def evaluate(self, data, fit=True): n = len(data) if self.scale == 0: return LightDataFrame(np.empty(shape=(n, 0)), columns=[]) @@ -720,14 +739,14 @@ def evaluate(self, data, fit = True): np.full((n, 1), self.scale), columns=[str(self)], ) - + def _reduce(self, ret_dict): ret_dict['Constant'] = self.scale return ret_dict - + def get_terms(self): return list() - + def get_dof(self): if self.scale == 0: return 0 @@ -736,7 +755,8 @@ def get_dof(self): def contains(self, other): return False - + + class Categorical(Var): """The other base term that stems from the Var class. Represents solely categorical data.""" @@ -753,20 +773,24 @@ def __init__(self, name, encoding='one-hot', levels=None, baseline=None): levels - A list object that holds all values to be considered as different levels during encoding. Any left out will be treated similarly as the baseline. A value of None will have levels - learned upon fitting. - baseline - A list of objects to be collectively treated as a baseline. + learned upon fitting. + baseline - A list of objects to be collectively + treated as a baseline. """ self.scale = 1 self.name = name if encoding not in _supported_encodings: - raise Exception("Method " + str(encoding) + " not supported for Categorical variables.") + raise Exception( + "Method " + + str(encoding) + + " not supported for Categorical variables.") self.encoding = encoding self.levels = levels self.baseline = baseline - + def __str__(self): return self.name - + def copy(self): return Categorical( self.name, @@ -774,23 +798,22 @@ def copy(self): None if self.levels is None else self.levels[:], self.baseline, ) - + def interpret(self, data): return self - - #def transform(self, transformation): + + # def transform(self, transformation): # raise Exception("Categorical variables cannot be transformed.") - + def set_baseline(self, value): if isinstance(value, collections.Iterable) and \ not isinstance(value, str): self.baseline = value else: self.baseline = [value] - + def _set_levels(self, data, override_baseline=True): - unique_values = data[self.name].unique() - unique_values.sort() + unique_values = sorted(data[self.name].unique()) if self.levels is None: self.levels = unique_values[:] if self.baseline is None: @@ -805,10 +828,9 @@ def _set_levels(self, data, override_baseline=True): self.levels = list(self.levels) for element in diff: self.levels.append(element) - - + def _one_hot_encode(self, data): - # keep track of the levels, including which ones are not in + # keep track of the levels, including which ones are not in # the baseline mapping = {} keep = [] @@ -818,42 +840,43 @@ def _one_hot_encode(self, data): if level not in self.baseline: keep.append(i) columns.append("%s{%s}" % (self.name, level)) - + # define mapping of levels to integer codes codes = data[self.name].map(mapping) - # create dummy matrix by taking the appropriate rows from an + # create dummy matrix by taking the appropriate rows from an # identity matrix dummy_mat = np.eye(len(self.levels))[:, keep].take(codes, axis=0) - + return LightDataFrame(dummy_mat, columns=columns) - + def evaluate(self, data, fit=True): if self.levels is None or self.baseline is None or fit: self._set_levels(data) - + if self.encoding == 'one-hot': return self._one_hot_encode(data) else: raise NotImplementedError() - + def _reduce(self, ret_dict): ret_dict["C"].add(self) return ret_dict - + def get_terms(self): return [self] - + def get_dof(self): return len(self.levels) - len(self.baseline) - + + class Interaction(Expression): """An Interaction models two or more terms multiplied together. An Interaction is treated as a single term. If multiple terms are the result of a multiplication, then the result will be a Combination of Interactions. """ - def __init__(self, terms, scale = 1): + def __init__(self, terms, scale=1): """Create an Interaction object. Arguments: @@ -863,29 +886,30 @@ def __init__(self, terms, scale = 1): self.scale = scale if any(not isinstance(t, Expression) for t in terms): - raise Exception("Interaction takes only Expressions for initialization.") - + raise Exception( + "Interaction takes only Expressions for initialization.") + self.terms = set() for term in terms: self._add_term(term) - + def __eq__(self, other): if isinstance(other, Interaction): if len(self.terms.difference(other.terms)) == 0 and \ len(other.terms.difference(self.terms)) == 0: return super().__eq__(other) return False - + def __hash__(self): return hash((frozenset(self.terms), self.scale)) - + def __str__(self): - base = "(" + ")(".join(sorted(str(term) for term in self.terms)) + ")" + base = "(" + ")(".join(sorted(str(term) for term in self.terms)) + ")" if self.scale == 1: return base else: return "{}*{}".format(self.scale, base) - + def _add_term(self, other_term): other_term = other_term.copy() if isinstance(other_term, PowerVar): @@ -894,8 +918,8 @@ def _add_term(self, other_term): base_term = other_term similar_term = None for term in self.terms: - if term.__sim__(base_term) or (isinstance(term, PowerVar) and \ - term.var.__sim__(base_term)): + if term.__sim__(base_term) or (isinstance(term, PowerVar) and + term.var.__sim__(base_term)): similar_term = term break @@ -905,20 +929,19 @@ def _add_term(self, other_term): self.terms.remove(similar_term) self.terms.add(similar_term * other_term) else: - self.terms.add(other_term) - + self.terms.add(other_term) + def _add_terms(self, other_terms): for term in other_terms: self._add_term(term) - def copy(self): return Interaction({term.copy() for term in self.terms}, self.scale) - + def interpret(self, data): self.terms = set(term.interpret(data) for term in self.terms) return self - + def __mul__(self, other): if isinstance(other, Interaction): self_copy = self.copy() @@ -930,14 +953,15 @@ def __mul__(self, other): return self_copy elif isinstance(other, Combination): self_copy = self.copy() - return Combination(terms=(self_copy*t for t in other.get_terms())) + return Combination( + terms=(self_copy * t for t in other.get_terms())) elif isinstance(other, Expression): self_copy = self.copy() self_copy._add_term(other) return self_copy else: return super().__mul__(other) - + def __pow__(self, other): if isinstance(other, (int, float)): ret_int = Interaction(()) @@ -945,46 +969,47 @@ def __pow__(self, other): ret_int._add_term(term ** other) return ret_int return super().__pow__(other) - + def _descale(self): self.scale = 1 for term in self.terms: term._descale() - + def evaluate(self, data, fit=True): trans_data_sets = [term.evaluate(data, fit) for term in self.terms] # rename columns in sets for data_set in trans_data_sets: data_set.columns = ["({})".format(col) for col in data_set.columns] - + base_set = trans_data_sets[0] for other_set in trans_data_sets[1:]: new_data = [] new_columns = [] for base_col in base_set.columns: for other_col in other_set.columns: - new_data.append(base_set.get_column(base_col) * \ - other_set.get_column(other_col)) + new_data.append(base_set.get_column(base_col) * + other_set.get_column(other_col)) new_columns.append(base_col + other_col) - + base_set = LightDataFrame( np.column_stack(new_data), columns=new_columns, ) - + return base_set - + def _reduce(self, ret_dict): for term in self.terms: ret_dict = term._reduce(ret_dict) - + return ret_dict - + def get_terms(self): return [self] - + def get_dof(self): - return reduce(lambda x,y: x*y, (term.get_dof() for term in self.terms)) + return reduce(lambda x, y: x * y, (term.get_dof() + for term in self.terms)) def contains(self, other): if isinstance(other, Combination): @@ -996,18 +1021,19 @@ def contains(self, other): else: other_terms = [other] all_terms_found = True - for o_t in other_terms: + for o_t in other_terms: single_term_found = False for s_t in self_terms: single_term_found = single_term_found or s_t.__eq__(o_t) single_term_found = single_term_found or s_t.contains(o_t) all_terms_found = all_terms_found and single_term_found return all_terms_found - + + class Combination(Expression): """A Combination models several single terms added together. """ - def __init__(self, terms, scale = 1): + def __init__(self, terms, scale=1): """Create a Combination object. Arguments: @@ -1015,16 +1041,19 @@ def __init__(self, terms, scale = 1): scale - A real value that will multiplicatively scale the term. """ self.scale = scale - - terms = [Constant(t) if isinstance(t, (int, float)) else t for t in terms] + + terms = [ + Constant(t) if isinstance( + t, (int, float)) else t for t in terms] if any(not isinstance(t, Expression) for t in terms): - raise Exception("Combination takes only Expressions for initialization.") - + raise Exception( + "Combination takes only Expressions for initialization.") + self.terms = set() for term in terms: self._add_term(term) - + def __eq__(self, other): if isinstance(other, Combination): if len(self.terms.difference(other.terms)) == 0 and \ @@ -1034,9 +1063,9 @@ def __eq__(self, other): def __hash__(self): return hash((frozenset(self.terms), self.scale)) - + def __str__(self): - base = "+".join(sorted(str(term) for term in self.terms)) + base = "+".join(sorted(str(term) for term in self.terms)) if self.scale == 1: return base else: @@ -1066,7 +1095,7 @@ def __xor__(self, other): if len(term) == 1: processed_terms.add(next(iter(term))) else: - processed_terms.add(reduce(lambda x,y: x * y, term)) + processed_terms.add(reduce(lambda x, y: x * y, term)) if len(new_terms) == 0: return Constant(1) @@ -1074,36 +1103,36 @@ def __xor__(self, other): return Combination(processed_terms) else: return self ** other - + def _add_term(self, other_term): if isinstance(other_term, (int, float)): other_term = Constant(other_term) - + similar_term = None for term in self.terms: if term.__sim__(other_term): similar_term = term break - + if similar_term is not None: self.terms.remove(similar_term) addition_result = similar_term + other_term if addition_result != Constant(0): self.terms.add(addition_result) else: - self.terms.add(other_term) - + self.terms.add(other_term) + def _add_terms(self, other_terms): for term in other_terms: self._add_term(term) - + def copy(self): return Combination({term.copy() for term in self.terms}, self.scale) - + def interpret(self, data): self.terms = [term.interpret(data) for term in self.terms] return self - + def __add__(self, other): if isinstance(other, Combination): ret_comb = self.copy() @@ -1115,10 +1144,10 @@ def __add__(self, other): return ret_comb else: return super().__add__(other) - + def __radd__(self, other): return self.__add__(other) - + def __mul__(self, other): if isinstance(other, Combination): ret_comb = Combination(()) @@ -1132,40 +1161,41 @@ def __mul__(self, other): return ret_comb else: return super().__mul__(other) - + def __pow__(self, other): if isinstance(other, int) and other > 0: terms = [self.scale * term for term in self.terms] return MultinomialExpansion(terms, other) else: return super().__pow__(other) - + def _descale(self): self.scale = 1 for term in self.terms: term._descale() - - def evaluate(self, data, fit = True): + + def evaluate(self, data, fit=True): dataframes = [] columns = [] for term in self.terms: df = term.evaluate(data, fit) dataframes.append(df) columns.extend(df.columns) - + return LightDataFrame(np.hstack(dataframes), columns=columns) - + def _reduce(self, ret_dict): for term in self.terms: ret_dict = term._reduce(ret_dict) - + return ret_dict - + def get_terms(self): return self.terms - + def get_dof(self): - return reduce(lambda x,y: x+y, (term.get_dof() for term in self.terms)) + return reduce(lambda x, y: x + y, (term.get_dof() + for term in self.terms)) def contains(self, other): self_terms = self.terms @@ -1174,13 +1204,15 @@ def contains(self, other): else: other_terms = [other] for s_t in self_terms: - for o_t in other_terms: + for o_t in other_terms: if s_t.__eq__(o_t) or s_t.contains(o_t): return True return False - + + def MultinomialCoef(params): - """Calculate the coefficients necessary when raising polynomials to a power. + """Calculate the coefficients necessary + when raising polynomials to a power. Arguments: params - A list of powers of individual terms. @@ -1190,7 +1222,8 @@ def MultinomialCoef(params): """ if len(params) == 1: return 1 - return binom(sum(params), params[-1]) * MultinomialCoef(params[:-1]) + return binom(sum(params), params[-1]) * MultinomialCoef(params[:-1]) + def MultinomialExpansion(terms, power): """Raise a collection of single terms (polynomial) to a power. @@ -1204,22 +1237,27 @@ def MultinomialExpansion(terms, power): to the specified power. """ combination_terms = [] - generators = [range(power+1) for _ in range(len(terms))] - for powers in product(*generators): # Inefficient, optimize later + generators = [range(power + 1) for _ in range(len(terms))] + for powers in product(*generators): # Inefficient, optimize later if sum(powers) == power: interaction_terms = [int(MultinomialCoef(powers))] for term, term_power in zip(terms, powers): if term_power == 0: continue interaction_terms.append(term ** term_power) - combination_terms.append(reduce(lambda x,y: x * y, interaction_terms)) - return reduce(lambda x,y: x + y, combination_terms) - + combination_terms.append( + reduce( + lambda x, + y: x * y, + interaction_terms)) + return reduce(lambda x, y: x + y, combination_terms) + + def Poly(var, power): """A quick way to create a standard polynomial from one base expression. Arguments: - var - An Expression object. + var - An Expression object. power - An integer to raise var to the power of. Returns: @@ -1230,17 +1268,17 @@ def Poly(var, power): """ if isinstance(var, str): var = Q(var) - + if not isinstance(power, int) or power < 0: raise Exception("Must raise to a non-negative integer power.") elif power == 0: return Constant(1) else: - terms = [var ** i for i in range(1, power+1)] - return reduce(lambda x,y: x + y, terms) - - -# Transformations + terms = [var ** i for i in range(1, power + 1)] + return reduce(lambda x, y: x + y, terms) + + +# Transformations Log = lambda var: var.transform("log") Log10 = lambda var: var.transform("log10") Sin = lambda var: var.transform("sin") @@ -1252,6 +1290,7 @@ def Poly(var, power): Center = lambda var: var.transform("center") Identity = lambda var: var.transform("identity") + # Aliases V = Var Q = Quantitative diff --git a/salmon/model.py b/salmon/model.py index 72006a1..9a51bf0 100644 --- a/salmon/model.py +++ b/salmon/model.py @@ -253,7 +253,7 @@ def _fit(self, data): if self.intercept: cols.append("Intercept") coef_ = np.append(coef_, y_offset - (X_offsets * coef_).sum()) - cov_coef_intercept = -1*np.dot(self.cov_, X_offsets) + cov_coef_intercept = -1 * np.dot(self.cov_, X_offsets) var_intercept = self.resid_var_ / self.n var_intercept -= (X_offsets * cov_coef_intercept).sum() @@ -454,7 +454,7 @@ def r_squared(self, X=None, y=None, adjusted=False, **kwargs): def score(self, X=None, y=None, adjusted=False, **kwargs): """Wrapper for sklearn api for cross fold validation. - + See LinearModel.r_squared. """ return self.r_squared(X, y, adjusted, **kwargs) @@ -745,7 +745,7 @@ def _plot_one_quant( max_x = max(max_x - diff, max_x + diff) # TODO: Check if min() and max() are necessary here - plot_objs['x'] = {"min" : min_x, "max" : max_x, "name" : x_name} + plot_objs['x'] = {"min": min_x, "max": max_x, "name": x_name} # Quantitative inputs line_x = pd.DataFrame({x_name: np.linspace(min_x, max_x, 100)}) @@ -836,7 +836,7 @@ def _plot_one_quant_zero_cats( if original_y_space: y_train_vals = self.re.untransform(y_train_vals) - ax.scatter(x, y_train_vals, c = "black", alpha = alpha) + ax.scatter(x, y_train_vals, c="black", alpha=alpha) def _plot_band( self, @@ -851,7 +851,7 @@ def _plot_band( """A helper function to plot the confidence or prediction bands for a model. By default will plot prediction bands.""" x_name = plot_objs['x']['name'] - X_new = self.ex.evaluate(line_x, fit = False) + X_new = self.ex.evaluate(line_x, fit=False) if self.intercept: n, _ = X_new.shape X_new = np.hstack((X_new, np.ones((n, 1)))) @@ -876,7 +876,6 @@ def _plot_band( alpha=0.3, ) - def _plot_one_quant_some_cats( self, x, @@ -898,7 +897,6 @@ def _plot_one_quant_some_cats( x_name = plot_objs['x']['name'] y_name = plot_objs['y']['name'] - plots = [] labels = [] linestyles = [':', '-.', '--', '-'] @@ -909,19 +907,19 @@ def _plot_one_quant_some_cats( # cartesian product of all combinations level_combinations = product(*levels) - dummy_data = line_x.copy() # rest of columns set in next few lines + dummy_data = line_x.copy() # rest of columns set in next few lines y_train_vals = self.y_train_ if original_y_space: y_train_vals = self.re.untransform(y_train_vals) for level_set in level_combinations: - label = [] # To be used in legend - for (cat,level) in zip(cats,level_set): - dummy_data[str(cat)] = level # set dummy data for prediction + label = [] # To be used in legend + for (cat, level) in zip(cats, level_set): + dummy_data[str(cat)] = level # set dummy data for prediction label.append(str(level)) - line_type = linestyles.pop() # rotate through line styles + line_type = linestyles.pop() # rotate through line styles linestyles.insert(0, line_type) line_y = self.predict(dummy_data, for_plot=True) @@ -930,15 +928,17 @@ def _plot_one_quant_some_cats( y_vals_to_plot = self.re.untransform(y_vals) else: y_vals_to_plot = y_vals - plot, = ax.plot(dummy_data[x_name], y_vals_to_plot, linestyle=line_type) + plot, = ax.plot( + dummy_data[x_name], y_vals_to_plot, linestyle=line_type) plots.append(plot) labels.append(", ".join(label)) - if categorize_residuals: - indices_to_use = pd.Series([True] * len(x)) # gradually gets filtered out - for (cat,level) in zip(cats,level_set): - indices_to_use = indices_to_use & (self.training_data[str(cat)] == level) + indices_to_use = pd.Series( + [True] * len(x)) # gradually gets filtered out + for (cat, level) in zip(cats, level_set): + indices_to_use = indices_to_use & ( + self.training_data[str(cat)] == level) ax.scatter( x[indices_to_use], y_train_vals[indices_to_use], @@ -1015,7 +1015,7 @@ def partial_plots(self, alpha=0.5, **kwargs): A tuple containing the matplotlib (figure, list of axes) for the partial plots. """ - #terms = self.ex.flatten(separate_interactions = False) + # terms = self.ex.flatten(separate_interactions = False) terms = self.ex.get_terms() fig, axs = plt.subplots(1, len(terms), **kwargs) @@ -1023,7 +1023,7 @@ def partial_plots(self, alpha=0.5, **kwargs): xi = terms[i] - sans_xi = Combination(terms[:i] + terms[i+1:]) + sans_xi = Combination(terms[:i] + terms[i + 1:]) yaxis = LinearModel(sans_xi, self.re) xaxis = LinearModel(sans_xi, xi) @@ -1038,7 +1038,7 @@ def partial_plots(self, alpha=0.5, **kwargs): @staticmethod def ones_column(data): """Helper function to create a column of ones for the intercept.""" - return pd.DataFrame({"Intercept" : np.repeat(1, data.shape[0])}) + return pd.DataFrame({"Intercept": np.repeat(1, data.shape[0])}) def plot_residual_diagnostics(self, **kwargs): """Produce a matrix of four diagnostic plots: @@ -1074,7 +1074,7 @@ def residual_quantile_plot(self, ax=None): A rendered matplotlib axis object. """ if ax is None: - _, ax = plt.subplots(1,1) + _, ax = plt.subplots(1, 1) stats.probplot(self.residuals_, dist="norm", plot=ax) ax.set_title("Residual Q-Q Plot") @@ -1090,7 +1090,7 @@ def residual_fitted_plot(self, ax=None): A rendered matplotlib axis object. """ if ax is None: - _, ax = plt.subplots(1,1) + _, ax = plt.subplots(1, 1) ax.scatter(self.fitted_, self.residuals_) ax.set_title("Fitted Values v. Residuals") @@ -1109,7 +1109,7 @@ def residual_histogram(self, ax=None): A rendered matplotlib axis object. """ if ax is None: - _, ax = plt.subplots(1,1) + _, ax = plt.subplots(1, 1) ax.hist(self.residuals_) ax.set_title("Histogram of Residuals") @@ -1128,7 +1128,7 @@ def residual_order_plot(self, ax=None): A rendered matplotlib axis object. """ if ax is None: - _, ax = plt.subplots(1,1) + _, ax = plt.subplots(1, 1) ax.plot(self.training_data.index, self.residuals_, "o-") ax.set_title("Order v. Residuals") @@ -1136,4 +1136,3 @@ def residual_order_plot(self, ax=None): ax.set_ylabel("Residual") return ax - diff --git a/salmon/test.py b/salmon/test.py index d72ff8a..4f376bf 100644 --- a/salmon/test.py +++ b/salmon/test.py @@ -3,22 +3,27 @@ from .model import * import pandas as pd -def floatComparison(a, b, eps = 0.0001): - if isinstance(a, (pd.Series, pd.DataFrame)) or isinstance(b, (pd.Series, pd.DataFrame)): + +def floatComparison(a, b, eps=0.0001): + if isinstance(a, (pd.Series, pd.DataFrame)) or isinstance( + b, (pd.Series, pd.DataFrame)): return (a - eps < b) & (b < a + eps) else: return (a - eps < b) and (b < a + eps) # Expressions + + class TestVarMethods(unittest.TestCase): - + def test_str(self): self.assertEqual(str(Var("Test Case")), "Test Case") - + def test_mul(self): self.assertEqual(str(Var("Var1") * Var("Var2")), "(Var1)(Var2)") - self.assertEqual(str(Var("Test") * Poly("Single", 2)), "(Single)(Test)+(Single^2)(Test)") - + self.assertEqual(str(Var("Test") * Poly("Single", 2)), + "(Single)(Test)+(Single^2)(Test)") + def test_imul(self): v = Var("Test") v *= Var("Other") @@ -26,38 +31,40 @@ def test_imul(self): v = Var("Test") v *= Poly("Single", 2) self.assertEqual(str(v), "(Single)(Test)+(Single^2)(Test)") - + def test_flatten(self): flattened = Var("Test").get_terms() self.assertEqual(len(flattened), 1) self.assertEqual(str(flattened[0]), "Test") - + def test_copy(self): orig = Var("A") copy = orig.copy() self.assertFalse(orig is copy) self.assertEqual(str(orig), str(copy)) - + def test_interpret(self): - data = pd.DataFrame({"A" : [1,2,3], "B" : ["hello", "test", "goodbye"]}) + data = pd.DataFrame( + {"A": [1, 2, 3], "B": ["hello", "test", "goodbye"]}) var1 = Var("A") var2 = Var("B") self.assertTrue(isinstance(var1.interpret(data), Quantitative)) self.assertTrue(isinstance(var2.interpret(data), Categorical)) - + + class TestQuantitativeMethods(unittest.TestCase): - + def test_str(self): - var = 4*(Quantitative("A")+3)**2 + var = 4 * (Quantitative("A") + 3)**2 self.assertEqual(str(var), "4*(6*A+9+A^2)") - var = 4*Sin(Quantitative("A")+3) + var = 4 * Sin(Quantitative("A") + 3) self.assertEqual(str(var), "4*sin(3+A)") - + def test_add(self): var1 = Quantitative("A") self.assertEqual(str(var1 + 3), "3+A") self.assertEqual(str(var1 + var1), "2*A") - + def test_iadd(self): var = Quantitative("A") var += 3 @@ -65,13 +72,13 @@ def test_iadd(self): var += -3 var += var self.assertEqual(str(var), "2*A") - + def test_mul(self): var = Quantitative("A") self.assertEqual(str(var * 3), "3*A") self.assertEqual(str(var * var), "A^2") self.assertEqual(str(var * Quantitative("B")), "(A)(B)") - + def test_imul(self): var1 = Quantitative("A") var1 *= 3 @@ -82,20 +89,20 @@ def test_imul(self): var3 = Quantitative("A") var3 *= Quantitative("B") self.assertEqual(str(var3), "(A)(B)") - + def test_pow(self): var = Quantitative("A") self.assertEqual(str(var ** 4), "A^4") with self.assertRaises(Exception): var ** "bad arg" - + def test_ipow(self): var = Quantitative("A") var **= 3 self.assertEqual(str(var), "A^3") with self.assertRaises(Exception): var *= "bad arg" - + def test_transform(self): var = Quantitative("A") var = var.transform("sin") @@ -115,12 +122,13 @@ def test_interpret(self): var = Quantitative("A") self.assertEqual(var.interpret(None), var) + class TestCategoricalMethods(unittest.TestCase): - + def test_str(self): var = Categorical("A") self.assertEqual(str(var), "A") - + def test_copy(self): orig = Categorical("A") copy = orig.copy() @@ -130,20 +138,21 @@ def test_copy(self): def test_interpret(self): var = Categorical("A") self.assertEqual(var.interpret(None), var) - + + class TestInteractionMethods(unittest.TestCase): - + def test_str(self): inter = Interaction(terms=(Quantitative("A"), Quantitative("B"))) self.assertEqual(str(inter), "(A)(B)") - + def test_pow(self): inter = Interaction(terms=(Quantitative("A"), Quantitative("B"))) inter = inter ** 3 self.assertEqual(str(inter), "(A^3)(B^3)") with self.assertRaises(Exception): inter ** "log" - + def test_flatten(self): inter = Interaction(terms=(Quantitative("A"), Quantitative("B"))) ls1 = inter.get_terms() @@ -151,87 +160,97 @@ def test_flatten(self): self.assertEqual(ls1[0], inter) ls2 = list(inter.terms) self.assertEqual(len(ls2), 2) - self.assertTrue((str(ls2[0])=="A" and str(ls2[1])=="B") or (str(ls2[0])=="B" and str(ls2[1])=="A")) - + self.assertTrue((str(ls2[0]) == "A" and str(ls2[1]) == "B") or ( + str(ls2[0]) == "B" and str(ls2[1]) == "A")) + def test_copy(self): orig = Interaction(terms=(Var("A"), Var("B"))) copy = orig.copy() self.assertFalse(orig is copy) self.assertEqual(str(orig), str(copy)) - + def test_interpret(self): old = Var("A") * Var("B") - data = pd.DataFrame({"A" : [1], "B" : ["cat"]}) + data = pd.DataFrame({"A": [1], "B": ["cat"]}) old.interpret(data) for v in old.terms: if v.name == "B": self.assertTrue(isinstance(v, Categorical)) else: self.assertTrue(isinstance(v, Quantitative)) - + + class TestCombinationMethods(unittest.TestCase): - + def test_str(self): comb = Var("A") + Var("B") self.assertEqual(str(comb), "A+B") - + def test_mul(self): comb = Var("A") + Var("B") self.assertEqual(str(comb * Var("C")), "(A)(C)+(B)(C)") with self.assertRaises(Exception): comb * "bad arg" - + def test_imul(self): comb = Var("A") + Var("B") comb *= Var("C") self.assertEqual(str(comb), "(A)(C)+(B)(C)") with self.assertRaises(Exception): comb *= "bad arg" - + def test_pow(self): comb = Var("A") + Var("B") self.assertEqual(str(comb ** 2), "2*(A)(B)+A^2+B^2") - + def test_flatten(self): comb = Var("A") + Var("B") ls = list(comb.get_terms()) self.assertEqual(len(ls), 2) - self.assertTrue((str(ls[0])=="A" and str(ls[1])=="B") or (str(ls[0])=="B" and str(ls[1])=="A")) + self.assertTrue((str(ls[0]) == "A" and str(ls[1]) == "B") or ( + str(ls[0]) == "B" and str(ls[1]) == "A")) - def test_copy(self): orig = Interaction(terms=(Var("A"), Var("B"))) copy = orig.copy() self.assertFalse(orig is copy) self.assertEqual(str(orig), str(copy)) - + def test_interpret(self): old = Var("A") + Var("B") - data = pd.DataFrame({"A" : [1], "B" : ["cat"]}) + data = pd.DataFrame({"A": [1], "B": ["cat"]}) old.interpret(data) for v in old.get_terms(): if v.name == "B": self.assertTrue(isinstance(v, Categorical)) else: self.assertTrue(isinstance(v, Quantitative)) - -iris = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv") -# Model +iris = pd.read_csv( + "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv") + + +# Model class TestModelMethods(unittest.TestCase): - + def test_fit(self): levels = ["virginica", "setosa", "versicolor"] - explanatory = Q("petal_width") + C("species", levels = levels) + explanatory = Q("petal_width") + C("species", levels=levels) response = Q("sepal_width") model = LinearModel(explanatory, response) results = model.fit(iris)[["Coefficient", "SE", "t", "p"]].sort_index() - expected = pd.DataFrame({"Coefficient" : [1.36214962108944, 0.795582615454372, 1.86172822073969, 0.35290783081806], - "SE" : [0.248101683560026, 0.120657012161229, 0.223217037772943, 0.10358416791633], - "t" : [5.49, 6.59, 8.34, 3.41], - "p" : [0, 0, 0, 0.0008]}, - index = ["Intercept", "petal_width", "species{setosa}", "species{versicolor}"]).sort_index() + expected = pd.DataFrame({"Coefficient": [1.36214962108944, + 0.795582615454372, + 1.86172822073969, + 0.35290783081806], + "SE": [0.248101683560026, 0.120657012161229, + 0.223217037772943, 0.10358416791633], + "t": [5.49, 6.59, 8.34, 3.41], + "p": [0, 0, 0, 0.0008]}, + index=["Intercept", "petal_width", + "species{setosa}", + "species{versicolor}"]).sort_index() diff = results - expected diff["Coefficient"] = floatComparison(0, diff["Coefficient"], 0.000001) diff["SE"] = floatComparison(0, diff["SE"], 0.000001) @@ -239,37 +258,44 @@ def test_fit(self): diff["p"] = floatComparison(0, diff["p"], 0.0001) self.assertTrue(all(diff.apply(all, 1))) - + def test_predict(self): levels = ["virginica", "setosa", "versicolor"] explanatory = Q("petal_width") + C("species", levels=levels) response = Q("sepal_width") model = LinearModel(explanatory, response) model.fit(iris) - newData = pd.DataFrame({"petal_width" : [1,2], "species" : ["setosa", "virginica"]}) + newData = pd.DataFrame( + {"petal_width": [1, 2], "species": ["setosa", "virginica"]}) pred = model.predict(newData) - expected = pd.DataFrame({"Predicted" : [4.019461, 3.306224]}) + expected = pd.DataFrame({"Predicted": [4.019461, 3.306224]}) diff = floatComparison(0, pred - expected, 0.00001) self.assertTrue(all(diff)) - + def test_plot(self): - model = LinearModel(Q("petal_width") + Q("petal_length"), Q("sepal_length")) + model = LinearModel( + Q("petal_width") + + Q("petal_length"), + Q("sepal_length")) model.fit(iris) with self.assertRaises(Exception): model.plot() - + def test_residual_plots(self): - model = LinearModel(Q("petal_width") + Q("petal_length"), Q("sepal_length")) + model = LinearModel( + Q("petal_width") + + Q("petal_length"), + Q("sepal_length")) model.fit(iris) plots = model.residual_plots() self.assertEqual(len(plots), 2) - + def test_ones_column(self): ones = LinearModel.ones_column(iris) self.assertEqual(len(ones), 150) self.assertTrue(all(ones["Intercept"] == 1)) - - ''' + + ''' def test_extract_columns(self): self.assertEqual() with self.assertRaises(Exception): @@ -280,15 +306,15 @@ def test_extract_columns(self): pass with self.assertRaises(Exception): pass - + def test_transform(self): self.assertEqual() with self.assertRaises(Exception): pass with self.assertRaises(Exception): - pass + pass ''' - + + if __name__ == "__main__": unittest.main() - diff --git a/salmon/transformation.py b/salmon/transformation.py index c3a33d1..bf8ecf5 100644 --- a/salmon/transformation.py +++ b/salmon/transformation.py @@ -12,7 +12,7 @@ class Transformation(): transformations on data as well as some helper information for printing and visualizing. """ - + def __init__(self, func, pattern, name, inverse=None): """Creates a Transformation object. @@ -26,13 +26,13 @@ def __init__(self, func, pattern, name, inverse=None): self.pattern = pattern self.inverse = inverse self.name = name - + def __str__(self): """Returns the given pattern for debugging. """ return self.pattern def __eq__(self, other): - """Check if two objects are equivalent. + """Check if two objects are equivalent. Arguments: other - An object. @@ -55,18 +55,18 @@ def __hash__(self): """ return hash((self.pattern, self.name)) - + def compose(self, inner): - """Applies specific data to the pattern for printing. - + """Applies specific data to the pattern for printing. + Arguments: inner - An object to be injested by str.format() - + Returns: A processed and evaluated str representing the Trasnformation. """ return self.pattern.format(inner) - + def transform(self, values, training=True): """Apply the function to the data. @@ -76,10 +76,10 @@ def transform(self, values, training=True): training or not. Default is True. Returns: - A trasnformed Series. + A trasnformed Series. """ return self.func(values) - + def copy(self): """Returns a deep copy of the Transformation.""" return Transformation(self.func, self.pattern, self.name, self.inverse) @@ -94,10 +94,14 @@ def invert(self, data): A Series object that has the inverted data. """ if self.inverse is None: - raise Exception("Inverse not defined for " + self.name + " transformation.") + raise Exception( + "Inverse not defined for " + + self.name + + " transformation.") return self.inverse(data) + class Center(Transformation): """A specific type of Trasnformation for centering data so that it has a mean of 0.""" @@ -107,13 +111,13 @@ def __init__(self): self.pattern = "{0}-E({0})" self.past_mean = 0 self.name = "Center" - + def transform(self, values, training=True): if training: self.past_mean = values.mean() - + return values - self.past_mean - + def copy(self): ret_val = Center() ret_val.past_mean = self.past_mean @@ -122,24 +126,25 @@ def copy(self): def invert(self, data): return data + self.past_mean + class Standardize(Transformation): """A specific type of Transformation that standardizes the data so that it has a mean of 0 and standard deviation of 1.""" - + def __init__(self): """Create a Standardize object. """ self.pattern = "({0}-E({0}))/Std({0})" self.past_mean = 0 self.past_std = 1 self.name = "Standardize" - - def transform(self, values, training = True): + + def transform(self, values, training=True): if training: self.past_mean = values.mean() self.past_std = values.std() - - return (values - self.past_mean) / self.past_std - + + return (values - self.past_mean) / self.past_std + def copy(self): ret_val = Standardize() ret_val.past_mean, ret_val.past_std = self.past_mean, self.past_std @@ -147,7 +152,8 @@ def copy(self): def invert(self, data): return (data * self.past_std) + self.past_mean - + + # Aliases for common Transformations Sin = lambda i: Transformation(np.sin, "sin({})", "Sine") Cos = lambda i: Transformation(np.cos, "cos({})", "Cosine") @@ -162,12 +168,12 @@ def invert(self, data): Power = lambda i: Transformation(lambda x: x ** i, "{}^" + str(i), "Power", lambda x: x ** (1/i) if i % 2 == 1 else x.clip(0, None) ** (1/i)) _default_transformations = { - "sin" : Sin, - "cos" : Cos, - "log" : Log, - "log10" : Log10, - "exp" : Exp, - "standardize" : Std, + "sin": Sin, + "cos": Cos, + "log": Log, + "log10": Log10, + "exp": Exp, + "standardize": Std, "center": Cen, "identity": Identity -} \ No newline at end of file +}