Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions chainladder/development/barnzehn.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ class BarnettZehnwirth(TweedieGLM):

Parameters
----------
drop: tuple or list of tuples
Drops specific origin/development combination(s)
drop_valuation: str or list of str (default = None)
Drops specific valuation periods. str must be date convertible.
formula: formula-like
A patsy formula describing the independent variables, X of the GLM
response: str
Expand All @@ -31,7 +35,9 @@ class BarnettZehnwirth(TweedieGLM):

"""

def __init__(self, formula='C(origin) + development', response=None):
def __init__(self, drop=None,drop_valuation=None,formula='C(origin) + development', response=None):
self.drop = drop
self.drop_valuation = drop_valuation
self.formula = formula
self.response = response

Expand All @@ -50,7 +56,7 @@ def fit(self, X, y=None, sample_weight=None):
self.model_ = DevelopmentML(Pipeline(steps=[
('design_matrix', PatsyFormula(self.formula)),
('model', LinearRegression(fit_intercept=False))]),
y_ml=response, fit_incrementals=False).fit(tri)
y_ml=response, fit_incrementals=True, drop=self.drop, drop_valuation = self.drop_valuation, weighted_step = 'model').fit(X = tri, sample_weight = sample_weight)
resid = tri - self.model_.triangle_ml_[
self.model_.triangle_ml_.valuation <= tri.valuation_date]
self.mse_resid_ = (resid**2).sum(0).sum(1).sum(2).sum() / (
Expand All @@ -77,10 +83,11 @@ def transform(self, X):
X_new = X.copy()
X_ml = self.model_._prep_X_ml(X.cum_to_incr().log())
y_ml = self.model_.estimator_ml.predict(X_ml)
triangle_ml = self.model_._get_triangle_ml(X_ml, y_ml)
triangle_ml, predicted_data = self.model_._get_triangle_ml(X_ml, y_ml)
backend = "cupy" if X.array_backend == "cupy" else "numpy"
triangle_ml.is_cumulative = False
X_new.ldf_ = triangle_ml.exp().incr_to_cum().link_ratio.set_backend(backend)
X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
X_new._set_slicers()
X_new.predicted_data_ = predicted_data
return X_new
25 changes: 16 additions & 9 deletions chainladder/development/glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,16 @@ class TweedieGLM(DevelopmentBase):

Parameters
----------
drop: tuple or list of tuples
Drops specific origin/development combination(s)
drop_valuation: str or list of str (default = None)
Drops specific valuation periods. str must be date convertible.
design_matrix: formula-like
A patsy formula describing the independent variables, X of the GLM
response: str
Column name for the reponse variable of the GLM. If ommitted, then the
first column of the Triangle will be used.
weight: str
Column name of any weight to use in the GLM. If none specified, then an
unweighted regression will be performed.
power: float, default=0
power: float, default=1
The power determines the underlying target distribution according
to the following table:
+-------+------------------------+
Expand All @@ -52,7 +53,7 @@ class TweedieGLM(DevelopmentBase):
regularization strength. ``alpha = 0`` is equivalent to unpenalized
GLMs. In this case, the design matrix `X` must have full column rank
(no collinearities).
link: {'auto', 'identity', 'log'}, default='auto'
link: {'auto', 'identity', 'log'}, default='log'
The link function of the GLM, i.e. mapping from linear predictor
`X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets
the link depending on the chosen family as follows:
Expand All @@ -78,10 +79,11 @@ class TweedieGLM(DevelopmentBase):
"""

def __init__(self, design_matrix='C(development) + C(origin)',
response=None, weight=None, power=1.0, alpha=1.0, link='log',
max_iter=100, tol=0.0001, warm_start=False, verbose=0):
response=None, power=1.0, alpha=1.0, link='log',
max_iter=100, tol=0.0001, warm_start=False, verbose=0, drop=None,drop_valuation=None):
self.drop = drop
self.drop_valuation = drop_valuation
self.response=response
self.weight=weight
self.design_matrix = design_matrix
self.power=power
self.alpha=alpha
Expand All @@ -93,13 +95,18 @@ def __init__(self, design_matrix='C(development) + C(origin)',

def fit(self, X, y=None, sample_weight=None):
response = X.columns[0] if not self.response else self.response
if sample_weight is None:
weight = None
else:
weight = 'model'
self.model_ = DevelopmentML(Pipeline(steps=[
('design_matrix', PatsyFormula(self.design_matrix)),
('model', TweedieRegressor(
link=self.link, power=self.power, max_iter=self.max_iter,
tol=self.tol, warm_start=self.warm_start,
verbose=self.verbose, fit_intercept=False))]),
y_ml=response, weight_ml=self.weight).fit(X)
y_ml=response, weighted_step = weight,
drop=self.drop, drop_valuation=self.drop_valuation).fit(X = X, sample_weight = sample_weight)
return self

@property
Expand Down
53 changes: 40 additions & 13 deletions chainladder/development/learning.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,14 @@ class DevelopmentML(DevelopmentBase):
Time Series aspects of the model. Predictions from one development period
get used as featues in the next development period. Lags should be negative
integers.
weight_step: str
Step name within estimator_ml that is weighted
drop: tuple or list of tuples
Drops specific origin/development combination(s)
drop_valuation: str or list of str (default = None)
Drops specific valuation periods. str must be date convertible.
fit_incrementals:
Whether the response variable should be converted to an incremental basis
for fitting.
Whether the response variable should be converted to an incremental basis for fitting.

Attributes
----------
Expand All @@ -48,11 +53,13 @@ class DevelopmentML(DevelopmentBase):
"""

def __init__(self, estimator_ml=None, y_ml=None, autoregressive=False,
weight_ml=None, fit_incrementals=True):
weighted_step=None,drop=None,drop_valuation=None,fit_incrementals=True):
self.estimator_ml=estimator_ml
self.y_ml=y_ml
self.weight_ml = weight_ml
self.autoregressive=autoregressive
self.weighted_step = weighted_step
self.autoregressive = autoregressive
self.drop = drop
self.drop_valuation = drop_valuation
self.fit_incrementals = fit_incrementals

def _get_y_names(self):
Expand Down Expand Up @@ -124,7 +131,7 @@ def _get_triangle_ml(self, df, preds=None):
return Triangle(
out, origin='origin', development='valuation',
index=self._key_labels, columns=self._get_y_names(),
cumulative=not self.fit_incrementals).dropna()
cumulative=not self.fit_incrementals).dropna(), out

def _prep_X_ml(self, X):
""" Preps Triangle data ahead of the pipeline """
Expand All @@ -139,14 +146,25 @@ def _prep_X_ml(self, X):
df_base = X.incr_to_cum().to_frame(
keepdims=True, implicit_axis=True, origin_as_datetime=True
).reset_index().iloc[:, :-1]
df = df_base.merge(X.cum_to_incr().to_frame(
df = df_base.merge(X_.to_frame(
keepdims=True, implicit_axis=True, origin_as_datetime=True
).reset_index(), how='left',
on=list(df_base.columns)).fillna(0)
df['origin'] = df['origin'].map(self.origin_encoder_)
df['valuation'] = df['valuation'].map(self.valuation_encoder_)
return df

def _prep_w_ml(self,X,sample_weight=None):
weight_base = (~np.isnan(X.values)).astype(float)
weight = weight_base.copy()
if self.drop is not None:
weight = weight * self._drop_func(X)
if self.drop_valuation is not None:
weight = weight * self._drop_valuation_func(X)
if sample_weight is not None:
weight = weight * sample_weight.values
return weight.flatten()[weight_base.flatten()>0]

def fit(self, X, y=None, sample_weight=None):
"""Fit the model with X.

Expand All @@ -156,8 +174,8 @@ def fit(self, X, y=None, sample_weight=None):
Set of LDFs to which the estimator will be applied.
y : None
Ignored, use y_ml to set a reponse variable for the ML algorithm
sample_weight : None
Ignored
sample_weight : Triangle-like
Weights to use in the regression

Returns
-------
Expand All @@ -178,10 +196,18 @@ def fit(self, X, y=None, sample_weight=None):
(pd.Series(val).rank()-1)/{'Y':1, 'S': 2, 'Q':4, 'M': 12}[X.development_grain]))
df = self._prep_X_ml(X)
self.df_ = df
weight = self._prep_w_ml(X,sample_weight)
self.weight_ = weight
if self.weighted_step == None:
sample_weights = {}
elif isinstance(self.weighted_step, list):
sample_weights = {x + '__sample_weight':weight for x in self.weighted_step}
else:
sample_weights = {self.weighted_step + '__sample_weight':weight}
# Fit model
self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze())
self.estimator_ml.fit(df, self.y_ml_.fit_transform(df).squeeze(),**sample_weights)
#return selffit_incrementals
self.triangle_ml_ = self._get_triangle_ml(df)
self.triangle_ml_, self.predicted_data_ = self._get_triangle_ml(df)
return self

@property
Expand All @@ -206,9 +232,10 @@ def transform(self, X):
X_new = X.copy()
X_ml = self._prep_X_ml(X)
y_ml=self.estimator_ml.predict(X_ml)
triangle_ml = self._get_triangle_ml(X_ml, y_ml)
triangle_ml, predicted_data = self._get_triangle_ml(X_ml, y_ml)
backend = "cupy" if X.array_backend == "cupy" else "numpy"
X_new.ldf_ = triangle_ml.incr_to_cum().link_ratio.set_backend(backend)
X_new.ldf_.valuation_date = pd.to_datetime(options.ULT_VAL)
X_new._set_slicers()
return X_new
X_new.predicted_data_ = predicted_data
return X_new
37 changes: 35 additions & 2 deletions chainladder/development/tests/test_barnzehn.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import numpy as np
import chainladder as cl
import pytest
abc = cl.load_sample('abc')

def test_basic_bz():
abc = cl.load_sample('abc')
assert np.all(
np.around(cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(abc).coef_.T.values,3).flatten()
== np.array([11.837,0.179,0.345,0.378,0.405,0.427,0.431,0.66,0.963,1.157,1.278,0.251,-0.056,-0.449,-0.829,-1.169,-1.508,-1.798,-2.023,-2.238,-2.428])
Expand All @@ -12,4 +12,37 @@ def test_basic_bz():
def test_multiple_triangle_exception():
d = cl.load_sample("usauto")
with pytest.raises(ValueError):
cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)
cl.BarnettZehnwirth(formula='C(origin)+C(development)').fit(d)

def test_drops():
'''
this function tests the passing in a basic drop_valuation
'''
assert np.all(
np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit_transform(abc).ldf_.values,3)
== np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit_transform(abc).ldf_.values,3)
)
assert np.all(
np.around(cl.BarnettZehnwirth(formula='C(development)',drop_valuation='1979').fit(abc).triangle_ml_.values,3)
== np.around(cl.BarnettZehnwirth(formula='C(development)',drop = [('1977',36),('1978',24),('1979',12)]).fit(abc).triangle_ml_.values,3)
)

def test_bz_2008():
'''
this function tests the drop parameter by recreating the example in the 2008 BZ paper, section 4.1
'''
exposure=np.array([[2.2], [2.4], [2.2], [2.0], [1.9], [1.6], [1.6], [1.8], [2.2], [2.5], [2.6]])
abc_adj = abc/exposure

origin_buckets = [2,3,5]
dev_buckets = [(24,36),(36,48),(48,84),(84,108),(108,9999)]
val_buckets = [(1,8),(8,9),(9,999)]

origin_formula = '+'.join([f'I(origin >= {x})' for x in origin_buckets])
dev_formula = '+'.join([f'I((np.minimum({x[1]-12},development) - np.minimum({x[0]-12},development))/12)' for x in dev_buckets])
val_formula = '+'.join([f'I(np.minimum({x[1]-1},valuation) - np.minimum({x[0]-1},valuation))' for x in val_buckets])
model=cl.BarnettZehnwirth(formula=origin_formula + '+' + dev_formula + '+' + val_formula, drop=('1982',72)).fit(abc_adj)
assert np.all(
np.around(model.coef_.values,4).flatten()
== np.array([11.1579,0.1989,0.0703,0.0919,0.1871,-0.3771,-0.4465,-0.3727,-0.3154,0.0432,0.0858,0.1464])
)
6 changes: 6 additions & 0 deletions chainladder/development/tests/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,9 @@ def test_basic_odp_cl(genins):
(cl.Chainladder().fit(genins).ultimate_ -
cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins)).ultimate_) /
genins.latest_diagonal).max()< 1e-2

def test_sample_weight(genins):
assert abs(
(cl.Chainladder().fit(genins).ultimate_ -
cl.Chainladder().fit(cl.TweedieGLM().fit_transform(genins,sample_weight=genins/genins)).ultimate_) /
genins.latest_diagonal).max()< 1e-2
19 changes: 19 additions & 0 deletions chainladder/development/tests/test_learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import chainladder as cl
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from chainladder.utils.utility_functions import PatsyFormula

def test_incremental(genins):
response = [genins.columns[0]]
model = cl.DevelopmentML(Pipeline(steps=[
('design_matrix', PatsyFormula('C(development)')),
('model', LinearRegression(fit_intercept=False))]),
y_ml=response,fit_incrementals=False).fit(genins)
assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2

def test_misc(genins):
model = cl.DevelopmentML(Pipeline(steps=[
('design_matrix', PatsyFormula('C(development)')),
('model', LinearRegression(fit_intercept=False))]),
weighted_step = ['model'], fit_incrementals=False).fit(genins, sample_weight=genins/genins)
assert abs(model.triangle_ml_.loc[:,:,'2010',:] - genins.mean()).max() < 1e2