From 5e0b2b22f05a22a7712d298b2e0c398c5570a226 Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Sun, 5 Nov 2017 16:28:15 -0800 Subject: [PATCH 01/24] ENH: One sample confidence test and intervals for shift model. No comments/tests yet. --- permute/core.py | 238 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 237 insertions(+), 1 deletion(-) diff --git a/permute/core.py b/permute/core.py index 26a0b848..899da5d6 100644 --- a/permute/core.py +++ b/permute/core.py @@ -520,7 +520,8 @@ def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", } stats = { 'mean': lambda u: np.mean(u), - 't': lambda u: ttest_1samp(u, 0)[0] + 't': lambda u: ttest_1samp(u, 0)[0], + 'median': lambda u: np.median(u) } if callable(stat): tst_fun = stat @@ -528,6 +529,7 @@ def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", tst_fun = stats[stat] tst = tst_fun(z) + n = len(z) if keep_dist: dist = [] @@ -539,3 +541,237 @@ def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", hits = np.sum([(tst_fun(z * (1 - 2 * prng.binomial(1, .5, size=n)))) >= tst for i in range(reps)]) return thePvalue[alternative](hits / reps), tst + + +def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", + keep_dist=False, seed=None, shift = None): + r""" + One-sided or two-sided, one-sample permutation test for the mean, + with p-value estimated by simulated random sampling with + reps replications. + + Alternatively, a permutation test for equality of means of two paired + samples. + + This function assumed a shift model. 
+ + Tests the hypothesis that x is distributed symmetrically symmetric about 0 + (or x and y have the same center) against the alternative that x comes from + a population with mean + + (a) greater than 0 (greater than that of the population from which y comes), + if side = 'greater' + (b) less than 0 (less than that of the population from which y comes), + if side = 'less' + (c) different from 0 (different from that of the population from which y comes), + if side = 'two-sided' + + If ``keep_dist``, return the distribution of values of the test statistic; + otherwise, return only the number of permutations for which the value of + the test statistic and p-value. + + Parameters + ---------- + x : array-like + Sample 1 + y : array-like + Sample 2. Must preserve the order of pairs with x. + If None, x is taken to be the one sample. + reps : int + number of repetitions + stat : {'mean', 't'} + The test statistic. The statistic is computed based on either z = x or + z = x - y, if y is specified. + + (a) If stat == 'mean', the test statistic is mean(z). + (b) If stat == 't', the test statistic is the t-statistic-- + but the p-value is still estimated by the randomization, + approximating the permutation distribution. + (c) If stat is a function (a callable object), the test statistic is + that function. The function should take a permutation of the + data and compute the test function from it. 
For instance, if the + test statistic is the maximum absolute value, $\max_i |z_i|$, + the test statistic could be written: + + f = lambda u: np.max(abs(u)) + alternative : {'greater', 'less', 'two-sided'} + The alternative hypothesis to test + keep_dist : bool + flag for whether to store and return the array of values + of the irr test statistic + seed : RandomState instance or {None, int, RandomState instance} + If None, the pseudorandom number generator is the RandomState + instance used by `np.random`; + If int, seed is the seed used by the random number generator; + If RandomState instance, seed is the pseudorandom number generator + shift : float + Assumption of symmetry around the shift, d. + + Returns + ------- + float + the estimated p-value + float + the test statistic + list + The distribution of test statistics. + These values are only returned if `keep_dist` == True + """ + prng = get_prng(seed) + + if y is None: + z = x + elif len(x) != len(y): + raise ValueError('x and y must be pairs') + else: + z = np.array(x) - np.array(y) + + thePvalue = { + 'greater': lambda p: p, + 'less': lambda p: 1 - p, + 'two-sided': lambda p: 2 * np.min([p, 1 - p]) + } + stats = { + 'mean': lambda u: np.mean(u), + 't': lambda u: ttest_1samp(u, 0)[0], + 'median': lambda u: np.median(u) + } + if callable(stat): + tst_fun = stat + else: + tst_fun = stats[stat] + + tst = tst_fun(z) + + if shift is None: + shift = float(0) + + n = len(z) + if keep_dist: + dist = [] + for i in range(reps): + dist.append(tst_fun(shift + (z - shift) * (1 - 2 * prng.binomial(1, .5, size=n)))) + hits = np.sum(dist >= tst) + return thePvalue[alternative](hits / reps), tst, dist + else: + hits = np.sum([(tst_fun(shift + (z - shift) * (1 - 2 * prng.binomial(1, .5, size=n)))) >= tst + for i in range(reps)]) + return thePvalue[alternative](hits / reps), tst + + + + +def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None, + reps=10**4, stat="mean"): + """ + One-sided or two-sided 
confidence interval for the parameter determining + the treatment effect. The default is the "shift model", where we are + interested in the parameter d such that x is equal in distribution to + y + d. In general, if we have some family of invertible functions parameterized + by d, we'd like to find d such that x is equal in distribution to f(y, d). + + Parameters + ---------- + x : array-like + Sample 1 + y : array-like + Sample 2 + cl : float in (0, 1) + The desired confidence level. Default 0.95. + alternative : {"two-sided", "lower", "upper"} + Indicates the alternative hypothesis. + seed : RandomState instance or {None, int, RandomState instance} + If None, the pseudorandom number generator is the RandomState + instance used by `np.random`; + If int, seed is the seed used by the random number generator; + If RandomState instance, seed is the pseudorandom number generator + reps : int + number of repetitions in two_sample + stat : {'mean', 't'} + The test statistic. + + (a) If stat == 'mean', the test statistic is (mean(x) - mean(y)) + (equivalently, sum(x), since those are monotonically related) + (b) If stat == 't', the test statistic is the two-sample t-statistic-- + but the p-value is still estimated by the randomization, + approximating the permutation distribution. + The t-statistic is computed using scipy.stats.ttest_ind + (c) If stat is a function (a callable object), the test statistic is + that function. The function should take a permutation of the pooled + data and compute the test function from it. For instance, if the + test statistic is the Kolmogorov-Smirnov distance between the + empirical distributions of the two samples, $\max_t |F_x(t) - F_y(t)|$, + the test statistic could be written: + + f = lambda u: np.max( \ + [abs(sum(u[:len(x)]<=v)/len(x)-sum(u[len(x):]<=v)/len(y)) for v in u]\ + ) + shift : float + The relationship between x and y under the null hypothesis. + + (a) If None, the relationship is assumed to be additive (e.g. 
x = y+d) + (b) A tuple containing the function and its inverse $(f, f^{-1})$, so + $x_i = f(y_i, d)$ and $y_i = f^{-1}(x_i, d)$ + + Returns + ------- + tuple + the estimated confidence limits + + Notes + ----- + xtol : float + Tolerance in brentq + rtol : float + Tolerance in brentq + maxiter : int + Maximum number of iterations in brentq + """ + assert alternative in ("two-sided", "lower", "upper") + + if shift is None: + shift_limit = max(abs(max(x) - min(y)), abs(max(y) - min(x))) + # FIXME: unused observed + # observed = np.mean(x) - np.mean(y) + elif isinstance(shift, tuple): + assert (callable(shift[0])), "Supply f and finverse in shift tuple" + assert (callable(shift[1])), "Supply f and finverse in shift tuple" + f = shift[0] + finverse = shift[1] + # Check that f is increasing in d; this is very ad hoc! + assert (f(5, 1) < f(5, 2)), "f must be increasing in the parameter d" + shift_limit = max(abs(fsolve(lambda d: f(max(y), d) - min(x), 0)), + abs(fsolve(lambda d: f(min(y), d) - max(x), 0))) + # FIXME: unused observed + # observed = fsolve(lambda d: np.mean(x) - np.mean(f(y, d)), 0) + else: + raise ValueError("Bad input for shift") + ci_low = -shift_limit + ci_upp = shift_limit + max_around_zero = np.max([np.max(x), np.max(-x)]) + ci_low = -max_around_zero + ci_upp = max_around_zero + + if alternative == 'two-sided': + cl = 1 - (1 - cl) / 2 + + if alternative != "upper": + # if shift is None: + # g = lambda q: cl - one_sample_shift(x, y, alternative="less", seed=seed, reps=reps, stat=stat, shift=q)[0] + # else: + g = lambda q: cl - one_sample_shift(x, y, alternative="less", seed=seed, \ + reps=reps, stat=stat, shift=q)[0] + ci_low = brentq(g, -2 * max_around_zero, 2 * max_around_zero) + + if alternative != "lower": + # if shift is None: + # g = lambda q: cl - one_sample_shift(x, y, alternative="greater", seed=seed, reps=reps, stat=stat, shift=q)[0] + # else: + g = lambda q: cl - one_sample_shift(x, y, alternative="greater", seed=seed, \ + reps=reps, 
stat=stat, shift=q)[0] + ci_upp = brentq(g, -2 * max_around_zero, 2 * max_around_zero) + + return ci_low, ci_upp + + + From 38bb698adad91446aa77dd92f60b5553305e1c7a Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Mon, 6 Nov 2017 21:29:59 -0800 Subject: [PATCH 02/24] EDH one sample tests --- permute/core.py | 6 +++--- permute/tests/test_core.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/permute/core.py b/permute/core.py index 899da5d6..51ea2d2b 100644 --- a/permute/core.py +++ b/permute/core.py @@ -553,7 +553,7 @@ def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", Alternatively, a permutation test for equality of means of two paired samples. - This function assumed a shift model. + This function assumed a shift model, assuming symmetry around a certain shifted mean. Tests the hypothesis that x is distributed symmetrically symmetric about 0 (or x and y have the same center) against the alternative that x comes from @@ -665,7 +665,7 @@ def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None reps=10**4, stat="mean"): """ One-sided or two-sided confidence interval for the parameter determining - the treatment effect. The default is the "shift model", where we are + a statistic of a univariate sample. The default is the "shift model", where we are interested in the parameter d such that x is equal in distribution to y + d. In general, if we have some family of invertible functions parameterized by d, we'd like to find d such that x is equal in distribution to f(y, d). 
@@ -730,7 +730,7 @@ def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None assert alternative in ("two-sided", "lower", "upper") if shift is None: - shift_limit = max(abs(max(x) - min(y)), abs(max(y) - min(x))) + shift_limit = np.max([np.max(x), np.max(-x)]) # FIXME: unused observed # observed = np.mean(x) - np.mean(y) elif isinstance(shift, tuple): diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index a3668f8b..6ac8b35f 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -217,3 +217,40 @@ def test_one_sample(): res = one_sample(x, y, seed=42, reps=100, stat="t", alternative="less") np.testing.assert_almost_equal(res[0], 0.05) np.testing.assert_almost_equal(res[1], -1.4491883) + + +def test_one_sample_shift(): + prng = RandomState(42) + + x = np.array(range(5)) + y = x - 1 + + # case 1: one sample only + res = one_sample_shift(x, seed=42, reps=100) + np.testing.assert_almost_equal(res[0], 0.05999999) + np.testing.assert_equal(res[1], 2) + + # case 2: paired sample + res = one_sample(x, y, seed=42, reps=100) + np.testing.assert_equal(res[0], 0.02) + np.testing.assert_equal(res[1], 1) + + # case 3: break it - supply x and y, but not paired + y = np.append(y, 10) + assert_raises(ValueError, one_sample, x, y) + + # case 4: say keep_dist=True + res = one_sample(x, seed=42, reps=100, keep_dist=True) + np.testing.assert_almost_equal(res[0], 0.05999999) + np.testing.assert_equal(res[1], 2) + np.testing.assert_equal(min(res[2]), -2) + np.testing.assert_equal(max(res[2]), 2) + np.testing.assert_equal(np.median(res[2]), 0) + + # case 5: use t as test statistic + y = x + prng.normal(size=5) + res = one_sample(x, y, seed=42, reps=100, stat="t", alternative="less") + np.testing.assert_almost_equal(res[0], 0.05) + np.testing.assert_almost_equal(res[1], -1.4491883) + +def test_one_sample_conf_int(): From 76941a3ea424dba19cad9a94d9ce9bde9dfe191b Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Tue, 14 Nov 2017 
01:23:58 -0800 Subject: [PATCH 03/24] WIP one sample confidence intervals and tests assuming symmetry --- permute/core.py | 56 ++++++++++++++++++++------------ permute/tests/test_core.py | 66 +++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 21 deletions(-) diff --git a/permute/core.py b/permute/core.py index 899da5d6..abbd6f33 100644 --- a/permute/core.py +++ b/permute/core.py @@ -542,7 +542,6 @@ def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", for i in range(reps)]) return thePvalue[alternative](hits / reps), tst - def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", keep_dist=False, seed=None, shift = None): r""" @@ -553,7 +552,8 @@ def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", Alternatively, a permutation test for equality of means of two paired samples. - This function assumed a shift model. + This function assumed a shift model. Given a shift (float), this test will + find the Tests the hypothesis that x is distributed symmetrically symmetric about 0 (or x and y have the same center) against the alternative that x comes from @@ -662,13 +662,11 @@ def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None, - reps=10**4, stat="mean"): + reps=10**4, stat="mean", shift=None): """ - One-sided or two-sided confidence interval for the parameter determining - the treatment effect. The default is the "shift model", where we are - interested in the parameter d such that x is equal in distribution to - y + d. In general, if we have some family of invertible functions parameterized - by d, we'd like to find d such that x is equal in distribution to f(y, d). + One-sided or two-sided confidence interval for a test statistic of a sample with + or paired sample. The default is the two-sided confidence interval for the mean of a sample x. 
+ Giving Parameters ---------- @@ -729,8 +727,15 @@ def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None """ assert alternative in ("two-sided", "lower", "upper") + if y is None: + z = x + elif len(x) != len(y): + raise ValueError('x and y must be pairs') + else: + z = np.array(x) - np.array(y) + if shift is None: - shift_limit = max(abs(max(x) - min(y)), abs(max(y) - min(x))) + shift_limit = max(z) - min(z) # FIXME: unused observed # observed = np.mean(x) - np.mean(y) elif isinstance(shift, tuple): @@ -740,17 +745,28 @@ def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None finverse = shift[1] # Check that f is increasing in d; this is very ad hoc! assert (f(5, 1) < f(5, 2)), "f must be increasing in the parameter d" - shift_limit = max(abs(fsolve(lambda d: f(max(y), d) - min(x), 0)), - abs(fsolve(lambda d: f(min(y), d) - max(x), 0))) + + shift_limit = max(abs(fsolve(lambda d: f(max(z), d) - min(z), 0)), + abs(fsolve(lambda d: f(min(z), d) - max(z), 0))) # FIXME: unused observed # observed = fsolve(lambda d: np.mean(x) - np.mean(f(y, d)), 0) else: raise ValueError("Bad input for shift") - ci_low = -shift_limit - ci_upp = shift_limit - max_around_zero = np.max([np.max(x), np.max(-x)]) - ci_low = -max_around_zero - ci_upp = max_around_zero + + stats = { + 'mean': lambda u: np.mean(u), + 't': lambda u: ttest_1samp(u, 0)[0], + 'median': lambda u: np.median(u) + } + if callable(stat): + tst_fun = stat + else: + tst_fun = stats[stat] + + tst = tst_fun(z) + + ci_low = tst - shift_limit + ci_upp = tst + shift_limit if alternative == 'two-sided': cl = 1 - (1 - cl) / 2 @@ -759,17 +775,17 @@ def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None # if shift is None: # g = lambda q: cl - one_sample_shift(x, y, alternative="less", seed=seed, reps=reps, stat=stat, shift=q)[0] # else: - g = lambda q: cl - one_sample_shift(x, y, alternative="less", seed=seed, \ + g = lambda q: cl - one_sample_shift(z, 
alternative="less", seed=seed, \ reps=reps, stat=stat, shift=q)[0] - ci_low = brentq(g, -2 * max_around_zero, 2 * max_around_zero) + ci_low = brentq(g, tst - 2 * shift_limit, tst + 2 * shift_limit) if alternative != "lower": # if shift is None: # g = lambda q: cl - one_sample_shift(x, y, alternative="greater", seed=seed, reps=reps, stat=stat, shift=q)[0] # else: - g = lambda q: cl - one_sample_shift(x, y, alternative="greater", seed=seed, \ + g = lambda q: cl - one_sample_shift(z, alternative="greater", seed=seed, \ reps=reps, stat=stat, shift=q)[0] - ci_upp = brentq(g, -2 * max_around_zero, 2 * max_around_zero) + ci_upp = brentq(g, tst - 2 * shift_limit, tst + 2 * shift_limit) return ci_low, ci_upp diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index a3668f8b..6959ef6e 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -12,7 +12,9 @@ two_sample, two_sample_shift, two_sample_conf_int, - one_sample) + one_sample, + one_sample_shift, + one_sample_conf_int) def test_corr(): @@ -217,3 +219,65 @@ def test_one_sample(): res = one_sample(x, y, seed=42, reps=100, stat="t", alternative="less") np.testing.assert_almost_equal(res[0], 0.05) np.testing.assert_almost_equal(res[1], -1.4491883) + + +def test_one_sample_shift(): + prng = RandomState(42) + + x = np.array(range(10)) + y = x - 1 + + # case 1: one sample without shift. 
should work the same as test_one_sample + res0 = one_sample_shift(x, seed=prng, reps=100, shift=3) + res1 = one_sample_shift(x, seed=prng, reps=100, shift=7) + np.testing.assert_almost_equal(res0[0], 0.07) + np.testing.assert_equal(res0[1], np.mean(x)) + np.testing.assert_almost_equal(res1[0], 1) + + # case 2: paired sample + res = one_sample_shift(x, y, seed=42, reps=100, shift=1, keep_dist=True) + np.testing.assert_almost_equal(res[0], 1) + np.testing.assert_equal(res[1], 1) + dist_unique = np.unique(res[2]) # Distribution should be all 1's + np.testing.assert_equal(len(dist_unique), 1) + np.testing.assert_equal(dist_unique[0], 1) + + # case 3: + x = np.array(range(5)) + res = one_sample_shift(x, seed=42, reps=100, stat="t", alternative="less", shift = 0.1) + np.testing.assert_almost_equal(res[0], 0.93999999999999995) + np.testing.assert_almost_equal(res[1], 2.8284271247461898) + +@attr('slow') +def test_one_sample_conf_int(): + prng = RandomState(42) + + x = np.array(range(10)) + res = one_sample_conf_int(x, seed=prng) + expected_ci = (2.2696168, 6.6684788) + np.testing.assert_almost_equal(res, expected_ci) + res = one_sample_conf_int(x, seed=prng, alternative="upper") + expected_ci = (-4.5, 6.255232305502077) + np.testing.assert_almost_equal(res, expected_ci) + res = one_sample_conf_int(x, seed=prng, alternative="lower") + expected_ci = (2.7680067828582393, 13.5) + np.testing.assert_almost_equal(res, expected_ci) + + norm = prng.normal(0, 1, size=100) + shift = prng.normal(1, 1, size=100) + diff = norm - shift + res = one_sample_conf_int(norm, diff, seed=prng) + expected_ci = (0.8, 1.2) + np.testing.assert_almost_equal(res, expected_ci, decimal=1) + + + # Specify shift with a function pair + shift = (lambda u, d: u + d, lambda u, d: u - d) + res = one_sample_conf_int(x, seed=5, shift=shift) + np.testing.assert_almost_equal(res, (2.333333333333319, 6.749999999999943)) + + # Specify shift with a multiplicative pair + shift = (lambda u, d: u * d, lambda u, d: u / 
d) + res = one_sample_conf_int(norm, seed=5, shift=shift) + np.testing.assert_almost_equal(res, (-0.0653441, 0.309073 )) + From 6ebd63191ea241919637b7e8b8ea23873380da8a Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Tue, 14 Nov 2017 01:38:50 -0800 Subject: [PATCH 04/24] removed some merge conflicts --- permute/core.py | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/permute/core.py b/permute/core.py index b14f9b57..e4ffd465 100644 --- a/permute/core.py +++ b/permute/core.py @@ -552,12 +552,8 @@ def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", Alternatively, a permutation test for equality of means of two paired samples. -<<<<<<< HEAD - This function assumed a shift model, assuming symmetry around a certain shifted mean. -======= This function assumed a shift model. Given a shift (float), this test will find the ->>>>>>> one-sample Tests the hypothesis that x is distributed symmetrically symmetric about 0 (or x and y have the same center) against the alternative that x comes from @@ -668,17 +664,8 @@ def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None, reps=10**4, stat="mean", shift=None): """ -<<<<<<< HEAD - One-sided or two-sided confidence interval for the parameter determining - a statistic of a univariate sample. The default is the "shift model", where we are - interested in the parameter d such that x is equal in distribution to - y + d. In general, if we have some family of invertible functions parameterized - by d, we'd like to find d such that x is equal in distribution to f(y, d). -======= One-sided or two-sided confidence interval for a test statistic of a sample with or paired sample. The default is the two-sided confidence interval for the mean of a sample x. 
- Giving ->>>>>>> one-sample Parameters ---------- @@ -747,13 +734,8 @@ def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None z = np.array(x) - np.array(y) if shift is None: -<<<<<<< HEAD - shift_limit = np.max([np.max(x), np.max(-x)]) -======= shift_limit = max(z) - min(z) ->>>>>>> one-sample - # FIXME: unused observed - # observed = np.mean(x) - np.mean(y) + elif isinstance(shift, tuple): assert (callable(shift[0])), "Supply f and finverse in shift tuple" assert (callable(shift[1])), "Supply f and finverse in shift tuple" From 6c05b138cbe12d2fd32afda79222d52ea2b9d13b Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Tue, 14 Nov 2017 01:48:10 -0800 Subject: [PATCH 05/24] removed some merge conflicts, improved coverage a little --- permute/tests/test_core.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index 41e26619..e4f1e034 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -248,6 +248,10 @@ def test_one_sample_shift(): np.testing.assert_almost_equal(res[0], 0.93999999999999995) np.testing.assert_almost_equal(res[1], 2.8284271247461898) + # case 4: break it - supply x and y, but not paired + y = np.append(y, 10) + assert_raises(ValueError, one_sample_shift, x, y) + @attr('slow') def test_one_sample_conf_int(): prng = RandomState(42) @@ -280,3 +284,7 @@ def test_one_sample_conf_int(): shift = (lambda u, d: u * d, lambda u, d: u / d) res = one_sample_conf_int(norm, seed=5, shift=shift) np.testing.assert_almost_equal(res, (-0.0653441, 0.309073 )) + + # case 3: break it - supply x and y, but not paired + y = np.append(x, 10) + assert_raises(ValueError, one_sample_conf_int, x, y) From 769e5d75fdbd6add99ec4e0b553596f9dc831e14 Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Mon, 20 Nov 2017 19:48:28 -0800 Subject: [PATCH 06/24] Attempt to raise coverage. 
Adding initial one sample percentile test for symmetric distributions --- permute/core.py | 65 ++++++++++++++++++++++++++++++++++++++ permute/tests/test_core.py | 43 +++++++++++++++++++++++-- 2 files changed, 105 insertions(+), 3 deletions(-) diff --git a/permute/core.py b/permute/core.py index e4ffd465..7b5c9cbb 100644 --- a/permute/core.py +++ b/permute/core.py @@ -10,6 +10,7 @@ from scipy.stats import ttest_ind, ttest_1samp from .utils import get_prng, potential_outcomes +from .binomialp import binomial_p def corr(x, y, reps=10**4, seed=None): @@ -432,6 +433,67 @@ def two_sample_conf_int(x, y, cl=0.95, alternative="two-sided", seed=None, return ci_low, ci_upp +""" +ROUGH DRAFT: One sample test for percentile. +def one_sample_percentile(x, y=None, p=50, alternative="greater", keep_dist=False, seed=None): + One-sided or two-sided test for the percentile P of a population distribution. + assuming a there is an P/100 chance for each value of the sample to be within the + Pth percentile or not. + + This test defaults to P=50, a test for a symmetrical distribution. + /"/"/" + Parameters + ---------- + x : array-like + Sample 1 + y : array-like + Sample 2. Must preserve the order of pairs with x. + If None, x is taken to be the one sample. + p: int in [0,100] + Percentile of interest to test. + reps : int + number of repetitions + alternative : {'greater', 'less', 'two-sided'} + The alternative hypothesis to test + keep_dist : bool + flag for whether to store and return the array of values + of the irr test statistic + seed : RandomState instance or {None, int, RandomState instance} + If None, the pseudorandom number generator is the RandomState + instance used by `np.random`; + If int, seed is the seed used by the random number generator; + If RandomState instance, seed is the pseudorandom number generator + + Returns + ------- + float + the estimated p-value + float + the test statistic + list + The distribution of test statistics. 
+ These values are only returned if `keep_dist` == True + /"/"/" + prng = get_prng(seed) + + if y is None: + z = x + elif len(x) != len(y): + raise ValueError('x and y must be pairs') + else: + z = np.array(x) - np.array(y) + + if not 0 <= p <= 100: + raise ValueError('p must be between 0 and 100') + + return binomial_p(np.sum(x < np.percentile(x, p)), len(x), p/100, alternative=alternative, keep_dist=keep_dist, seed=seed) +""" + +""" +ROUGH DRAFT: One sample confidence intervals for percentiles +def one_sample_p_int(x, y=None, p=50, cl=0.95, alternative="two-sided", seed=None): + return binom_conf_interval(len(x), x, cl=0.95, alternative="two-sided", p=p/100) +""" def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", keep_dist=False, seed=None): @@ -518,11 +580,13 @@ def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", 'less': lambda p: 1 - p, 'two-sided': lambda p: 2 * np.min([p, 1 - p]) } + stats = { 'mean': lambda u: np.mean(u), 't': lambda u: ttest_1samp(u, 0)[0], 'median': lambda u: np.median(u) } + if callable(stat): tst_fun = stat else: @@ -542,6 +606,7 @@ def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", for i in range(reps)]) return thePvalue[alternative](hits / reps), tst + def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", keep_dist=False, seed=None, shift = None): r""" diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index e4f1e034..a12c3f17 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -220,6 +220,11 @@ def test_one_sample(): np.testing.assert_almost_equal(res[0], 0.05) np.testing.assert_almost_equal(res[1], -1.4491883) + # case 6: use median as test statistic + res = one_sample(x, seed=42, reps=100, stat="median") + np.testing.assert_almost_equal(res[0], 0.14) + np.testing.assert_equal(res[1], 2) + def test_one_sample_shift(): prng = RandomState(42) @@ -242,9 +247,9 @@ def test_one_sample_shift(): 
np.testing.assert_equal(len(dist_unique), 1) np.testing.assert_equal(dist_unique[0], 1) - # case 3: + # case 3: t test statistic x = np.array(range(5)) - res = one_sample_shift(x, seed=42, reps=100, stat="t", alternative="less", shift = 0.1) + res = one_sample_shift(x, seed=42, reps=100, stat="t", alternative="less", shift=0.1) np.testing.assert_almost_equal(res[0], 0.93999999999999995) np.testing.assert_almost_equal(res[1], 2.8284271247461898) @@ -252,10 +257,18 @@ def test_one_sample_shift(): y = np.append(y, 10) assert_raises(ValueError, one_sample_shift, x, y) + # case 5: use median as test statistic + x = np.array(range(10)) + res = one_sample_shift(x, seed=42, reps=100, stat="median", shift=4.5) + np.testing.assert_almost_equal(res[0], 0.53) + np.testing.assert_equal(res[1], 4.5) + + @attr('slow') def test_one_sample_conf_int(): prng = RandomState(42) + # Standard confidenceinterval x = np.array(range(10)) res = one_sample_conf_int(x, seed=prng) expected_ci = (2.2696168, 6.6684788) @@ -267,6 +280,7 @@ def test_one_sample_conf_int(): expected_ci = (2.7680067828582393, 13.5) np.testing.assert_almost_equal(res, expected_ci) + # Normal distribution shift centered at 1 norm = prng.normal(0, 1, size=100) shift = prng.normal(1, 1, size=100) diff = norm - shift @@ -285,6 +299,29 @@ def test_one_sample_conf_int(): res = one_sample_conf_int(norm, seed=5, shift=shift) np.testing.assert_almost_equal(res, (-0.0653441, 0.309073 )) - # case 3: break it - supply x and y, but not paired + # break it - supply x and y, but not paired y = np.append(x, 10) assert_raises(ValueError, one_sample_conf_int, x, y) + + # Testing with sample statistic of median + res = one_sample_conf_int(x, seed=42, stat="median") + np.testing.assert_almost_equal(res, (2.499999999999531, 6.500000000000918)) + + # Testing with t statistic + prng = RandomState(42) + x = np.arange(20) + y = x + prng.normal(size=20) + res = one_sample_conf_int(x, y, seed=prng, stat='t') + 
np.testing.assert_almost_equal(res[0], -0.27271209581516753) + np.testing.assert_almost_equal(res[0], 0.6217655068645991) + + +@raises(AssertionError) +def test_one_sample_conf_int_bad_shift(): + # Break it with a bad shift + x = np.array(range(5)) + y = np.array(range(1, 6)) + shift = (lambda u, d: -d * u, lambda u, d: -u / d) + one_sample_conf_int(x, y, seed=5, shift=shift) + + From 0ab6cbad4e24aea21c828a819912d1849ec861ae Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Mon, 20 Nov 2017 21:12:18 -0800 Subject: [PATCH 07/24] fix error with test: --- permute/tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index a12c3f17..cd24a63e 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -313,7 +313,7 @@ def test_one_sample_conf_int(): y = x + prng.normal(size=20) res = one_sample_conf_int(x, y, seed=prng, stat='t') np.testing.assert_almost_equal(res[0], -0.27271209581516753) - np.testing.assert_almost_equal(res[0], 0.6217655068645991) + np.testing.assert_almost_equal(res[1], 0.6217655068645991) @raises(AssertionError) From 05e90ee042b76b75fdebfaccd2463dfabdcd0baa Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Mon, 20 Nov 2017 21:52:08 -0800 Subject: [PATCH 08/24] Sped up tests a little bit --- permute/tests/test_core.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index cd24a63e..058c05ff 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -304,16 +304,15 @@ def test_one_sample_conf_int(): assert_raises(ValueError, one_sample_conf_int, x, y) # Testing with sample statistic of median - res = one_sample_conf_int(x, seed=42, stat="median") - np.testing.assert_almost_equal(res, (2.499999999999531, 6.500000000000918)) + res = one_sample_conf_int(x, seed=42, reps=100, stat="median") + np.testing.assert_almost_equal(res, (2.499999999999458, 
6.999999999999045)) # Testing with t statistic prng = RandomState(42) x = np.arange(20) y = x + prng.normal(size=20) - res = one_sample_conf_int(x, y, seed=prng, stat='t') - np.testing.assert_almost_equal(res[0], -0.27271209581516753) - np.testing.assert_almost_equal(res[1], 0.6217655068645991) + res = one_sample_conf_int(x, y, reps=100, seed=prng, stat='t') + np.testing.assert_almost_equal(res, (-0.3018817477447192, 0.6510547144565948)) @raises(AssertionError) From b2ea3af69f1e913e7d7876944ac53a00fa51faf8 Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Mon, 20 Nov 2017 23:18:11 -0800 Subject: [PATCH 09/24] Dont think this helps --- permute/tests/test_core.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index 058c05ff..c7b86688 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -232,13 +232,18 @@ def test_one_sample_shift(): x = np.array(range(10)) y = x - 1 - # case 1: one sample without shift. 
should work the same as test_one_sample + # case 1: res0 = one_sample_shift(x, seed=prng, reps=100, shift=3) res1 = one_sample_shift(x, seed=prng, reps=100, shift=7) np.testing.assert_almost_equal(res0[0], 0.07) np.testing.assert_equal(res0[1], np.mean(x)) np.testing.assert_almost_equal(res1[0], 1) + # one_sample_shift with no shift is the same as a shift of 0 + zero = one_sample_shift(x, seed=42, reps=100) + zero_reg = one_sample(x, seed=42, reps=100) + np.testing.assert_almost_equal(zero, zero_reg) + # case 2: paired sample res = one_sample_shift(x, y, seed=42, reps=100, shift=1, keep_dist=True) np.testing.assert_almost_equal(res[0], 1) From af65f08168b577a53583143bc245eebfc2c81c42 Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Tue, 21 Nov 2017 00:12:19 -0800 Subject: [PATCH 10/24] alternate test stats --- permute/tests/test_core.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index c7b86688..4b946cd1 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -225,6 +225,12 @@ def test_one_sample(): np.testing.assert_almost_equal(res[0], 0.14) np.testing.assert_equal(res[1], 2) + # case 7: Test statistic is a function + pcntl = lambda x: np.percentile(x, 20) + res = one_sample(x, seed=42, reps=100, stat=pcntl) + np.testing.assert_almost_equal(res[0], 0.059999999999) + np.testing.assert_almost_equal(res[1], 0.8) + def test_one_sample_shift(): prng = RandomState(42) @@ -268,6 +274,12 @@ def test_one_sample_shift(): np.testing.assert_almost_equal(res[0], 0.53) np.testing.assert_equal(res[1], 4.5) + # case 6: Test statistic is a function + pcntl = lambda x: np.percentile(x, 20) + res = one_sample_shift(x, seed=42, reps=100, stat=pcntl, shift=2) + np.testing.assert_almost_equal(res[0], 0.029999999999999999) + np.testing.assert_almost_equal(res[1], 1.8) + @attr('slow') def test_one_sample_conf_int(): @@ -319,6 +331,11 @@ def test_one_sample_conf_int(): res = 
one_sample_conf_int(x, y, reps=100, seed=prng, stat='t') np.testing.assert_almost_equal(res, (-0.3018817477447192, 0.6510547144565948)) + min_func = lambda x: np.min(x) + res = one_sample_conf_int(x, y, reps=100, seed=42, stat=min_func) + np.testing.assert_almost_equal(res, (-0.5084626431347878, 0.167033714575861)) + + @raises(AssertionError) def test_one_sample_conf_int_bad_shift(): From 03233116709f920cb338828a7305d35a295cef1e Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Mon, 27 Nov 2017 16:51:56 -0800 Subject: [PATCH 11/24] developed one sample percentile and confidence intervals --- permute/core.py | 83 +++++++++++++++++++++++++++++--------- permute/tests/test_core.py | 45 +++++++++++++++++++-- 2 files changed, 105 insertions(+), 23 deletions(-) diff --git a/permute/core.py b/permute/core.py index 7b5c9cbb..3d9c42c7 100644 --- a/permute/core.py +++ b/permute/core.py @@ -9,7 +9,7 @@ from scipy.optimize import brentq, fsolve from scipy.stats import ttest_ind, ttest_1samp -from .utils import get_prng, potential_outcomes +from .utils import get_prng, potential_outcomes, binom_conf_interval from .binomialp import binomial_p @@ -433,15 +433,19 @@ def two_sample_conf_int(x, y, cl=0.95, alternative="two-sided", seed=None, return ci_low, ci_upp -""" -ROUGH DRAFT: One sample test for percentile. -def one_sample_percentile(x, y=None, p=50, alternative="greater", keep_dist=False, seed=None): + +#ROUGH DRAFT: One sample test for percentile. +def one_sample_percentile(x, y=None, p=50, reps=10**5, alternative="greater", keep_dist=False, seed=None): + r""" One-sided or two-sided test for the percentile P of a population distribution. - assuming a there is an P/100 chance for each value of the sample to be within the - Pth percentile or not. + assuming there is an P/100 chance for each value of the sample to be in the + Pth percentile. - This test defaults to P=50, a test for a symmetrical distribution. 
- /"/"/" + The null hypothesis is that there is an equal P/100 chance for any value of the + sample to lie at or below the sample Pth percentile. + + This test defaults to P=50. + Parameters ---------- x : array-like @@ -469,12 +473,10 @@ def one_sample_percentile(x, y=None, p=50, alternative="greater", keep_dist=Fals float the estimated p-value float - the test statistic + the test statistic: Number of values at or below percentile P of sample list - The distribution of test statistics. - These values are only returned if `keep_dist` == True - /"/"/" - prng = get_prng(seed) + distribution of test statistics (only if keep_dist == True) + """ if y is None: z = x @@ -486,14 +488,55 @@ def one_sample_percentile(x, y=None, p=50, alternative="greater", keep_dist=Fals if not 0 <= p <= 100: raise ValueError('p must be between 0 and 100') - return binomial_p(np.sum(x < np.percentile(x, p)), len(x), p/100, alternative=alternative, keep_dist=keep_dist, seed=seed) -""" + return binomial_p(np.sum(z <= np.percentile(z, p)), len(z), p/100, reps=reps, \ + alternative=alternative, keep_dist=keep_dist, seed=seed) + + + +# ROUGH DRAFT: One sample confidence intervals for percentiles +def one_sample_percentile_conf_int(x, y=None, p=50, cl=0.95, alternative="two-sided", seed=None): + """ + Confidence intervals for a percentile P of the population distribution of a + univariate or paired sample. + + Compute a confidence interval for a binomial p, the probability of success in each trial. + + Parameters + ---------- + n : int + The number of Bernoulli trials. + x : int + The number of successes. + cl : float in (0, 1) + The desired confidence level. + alternative : {"two-sided", "lower", "upper"} + Indicates the alternative hypothesis. + p : int in [0,100] + Desired percentile of interest to test. + kwargs : dict + Key word arguments + + Returns + ------- + tuple + lower and upper confidence level with coverage (approximately) + 1-alpha. 
+ + """ + + if y is None: + z = x + elif len(x) != len(y): + raise ValueError('x and y must be pairs') + else: + z = np.array(x) - np.array(y) + + if not 0 <= p <= 100: + raise ValueError('p must be between 0 and 100') + + conf_int = binom_conf_interval(len(z), np.sum(z <= np.percentile(z, p)), cl=cl, alternative="two-sided", p=p/100) + return (np.percentile(z, conf_int[0]*100), np.percentile(z, conf_int[1]*100)) -""" -ROUGH DRAFT: One sample confidence intervals for percentiles -def one_sample_p_int(x, y=None, p=50, cl=0.95, alternative="two-sided", seed=None): - return binom_conf_interval(len(x), x, cl=0.95, alternative="two-sided", p=p/100) -""" def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", keep_dist=False, seed=None): diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index 4b946cd1..7d083b38 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -14,7 +14,9 @@ two_sample_conf_int, one_sample, one_sample_shift, - one_sample_conf_int) + one_sample_conf_int, + one_sample_percentile, + one_sample_percentile_conf_int) def test_corr(): @@ -134,7 +136,7 @@ def test_two_sample_shift(): # Define a lambda function f = lambda u, v: np.max(u) - np.max(v) - res = two_sample(x, y, seed=42, stat=f, reps=100) + res = two_sample_shift(x, y, seed=42, stat=f, reps=100) expected = (1, -3.2730653690015465) np.testing.assert_equal(res[0], expected[0]) np.testing.assert_equal(res[1], expected[1]) @@ -336,7 +338,6 @@ def test_one_sample_conf_int(): np.testing.assert_almost_equal(res, (-0.5084626431347878, 0.167033714575861)) - @raises(AssertionError) def test_one_sample_conf_int_bad_shift(): # Break it with a bad shift @@ -346,3 +347,41 @@ def test_one_sample_conf_int_bad_shift(): one_sample_conf_int(x, y, seed=5, shift=shift) +def test_one_sample_percentile(): + prng = RandomState(42) + x = np.arange(0, 100) + y = x + prng.normal(size=100) + res = one_sample_percentile(x=x, y=y, p=50, seed=42, reps=100) + 
np.testing.assert_almost_equal(res[0], 0.53925999999999996) + np.testing.assert_equal(res[1], 50) + + y = np.append(y, 10) + np.testing.assert_raises(ValueError, one_sample, x, y) + + np.testing.assert_raises(ValueError, one_sample, x, p=101) + + # Skewed sample + x_skew = np.array(list(x) + [80] * 50) + res = one_sample_percentile(x=x_skew, p=80, seed=42, reps=100) + np.testing.assert_almost_equal(res[0], 0.01) + np.testing.assert_equal(res[1], 131) + +@attr('slow') +def test_one_sample_percentile_conf_int(): + x = np.arange(0, 100) + res = one_sample_percentile_conf_int(x, seed=42, p=50) + np.testing.assert_almost_equal(res[0], 39.43379182082676) + np.testing.assert_almost_equal(res[1], 59.56620817917322) + + + y = np.append(y, 10) + np.testing.assert_raises(ValueError, one_sample, x, y) + + np.testing.assert_raises(ValueError, one_sample, x, p=101) + + prng = RandomState(42) + y = x + prng.normal(size=100) + res = one_sample_percentile_conf_int(x=x, y=y, p=20, seed=42, reps=100) + expected_ci = (-0.94850907410172258, -0.34506992048853352) + np.testing.assert_almost_equal(res, expected_ci) + From c6df5222ac2f6535a412aeadf1c3ab952d88f682 Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Mon, 27 Nov 2017 16:56:19 -0800 Subject: [PATCH 12/24] developed one sample percentile and confidence intervals --- permute/tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index 7d083b38..50958301 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -352,7 +352,7 @@ def test_one_sample_percentile(): x = np.arange(0, 100) y = x + prng.normal(size=100) res = one_sample_percentile(x=x, y=y, p=50, seed=42, reps=100) - np.testing.assert_almost_equal(res[0], 0.53925999999999996) + np.testing.assert_almost_equal(res[0], 0.52) np.testing.assert_equal(res[1], 50) y = np.append(y, 10) @@ -374,7 +374,7 @@ def test_one_sample_percentile_conf_int(): 
np.testing.assert_almost_equal(res[1], 59.56620817917322) - y = np.append(y, 10) + y = np.append(x, 10) np.testing.assert_raises(ValueError, one_sample, x, y) np.testing.assert_raises(ValueError, one_sample, x, p=101) From 6dc8aa59267c68b670031753ba80a45e2d717d76 Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Mon, 27 Nov 2017 17:10:30 -0800 Subject: [PATCH 13/24] Fixed tests for one sample percentile confidence tests and intervals --- permute/tests/test_core.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index 50958301..6664dee3 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -136,7 +136,7 @@ def test_two_sample_shift(): # Define a lambda function f = lambda u, v: np.max(u) - np.max(v) - res = two_sample_shift(x, y, seed=42, stat=f, reps=100) + res = two_sample(x, y, seed=42, stat=f, reps=100) expected = (1, -3.2730653690015465) np.testing.assert_equal(res[0], expected[0]) np.testing.assert_equal(res[1], expected[1]) @@ -356,9 +356,9 @@ def test_one_sample_percentile(): np.testing.assert_equal(res[1], 50) y = np.append(y, 10) - np.testing.assert_raises(ValueError, one_sample, x, y) + np.testing.assert_raises(ValueError, one_sample_percentile, x, y) - np.testing.assert_raises(ValueError, one_sample, x, p=101) + np.testing.assert_raises(ValueError, one_sample_percentile, x, p=101) # Skewed sample x_skew = np.array(list(x) + [80] * 50) @@ -375,13 +375,13 @@ def test_one_sample_percentile_conf_int(): y = np.append(x, 10) - np.testing.assert_raises(ValueError, one_sample, x, y) + np.testing.assert_raises(ValueError, one_sample_percentile_conf_int, x, y) - np.testing.assert_raises(ValueError, one_sample, x, p=101) + np.testing.assert_raises(ValueError, one_sample_percentile_conf_int, x, p=101) prng = RandomState(42) y = x + prng.normal(size=100) - res = one_sample_percentile_conf_int(x=x, y=y, p=20, seed=42, reps=100) + res = 
one_sample_percentile_conf_int(x=x, y=y, p=20, seed=42) expected_ci = (-0.94850907410172258, -0.34506992048853352) np.testing.assert_almost_equal(res, expected_ci) From be94bad413a69b8cd434016fca8bd73c43b39e7b Mon Sep 17 00:00:00 2001 From: Michael Zheng Date: Mon, 27 Nov 2017 22:44:04 -0800 Subject: [PATCH 14/24] hoeffding --- permute/hoeffding.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 permute/hoeffding.py diff --git a/permute/hoeffding.py b/permute/hoeffding.py new file mode 100644 index 00000000..b79a36c3 --- /dev/null +++ b/permute/hoeffding.py @@ -0,0 +1,38 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) + +import numpy as np + +def hoeffding_conf_int(x, N, cl=0.95, lower_bound=min(x), upper_bound=max(x), alernative="one-sided"): + + r""" + Confidence interval for the mean of bounded, independent observations + derived by inverting Hoeffding's inequality. This method uses the + assumption that the observations have the same bounds. + + Parameters + ---------- + x : array-like + list of observations + N : int + population size + cl : float in (0, 1) + the desired confidence level. Default 0.95. 
+ lower_bound : float + lower bound for the observations + upper_bound : float + upper bound for the observations + alternative : {'greater', 'less', 'two-sided'} + alternative hypothesis to test (default: 'greater') + + Returns + ------- + tuple + the confidence limits + """ + x_bar = np.mean(x) + tau_sq = N*math.pow(upper_bound - lower_bound) + alpha = 1-cl + hCrit = np.sqrt(-math.log(alpha/2)*tau_sq/(2*N**2)) + ci_low, ci_upp = x_bar - hCrit, x_bar + hCrit + return ci_low, ci_upp \ No newline at end of file From 3e1ab19490eb1d243c38a5fa222b064b2a073fa8 Mon Sep 17 00:00:00 2001 From: Michael Zheng Date: Thu, 30 Nov 2017 10:32:41 -0800 Subject: [PATCH 15/24] heoffding cases --- permute/hoeffding.py | 53 +++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/permute/hoeffding.py b/permute/hoeffding.py index b79a36c3..219ff968 100644 --- a/permute/hoeffding.py +++ b/permute/hoeffding.py @@ -2,13 +2,13 @@ print_function, unicode_literals) import numpy as np +import math -def hoeffding_conf_int(x, N, cl=0.95, lower_bound=min(x), upper_bound=max(x), alernative="one-sided"): - +def hoeffding_conf_int(x, N, lower_bound, upper_bound, cl=0.95, alternative="one-sided"): r""" Confidence interval for the mean of bounded, independent observations - derived by inverting Hoeffding's inequality. This method uses the - assumption that the observations have the same bounds. + derived by inverting Hoeffding's inequality. This method uses the + assumption that the observations have the same bounds. Parameters ---------- @@ -17,11 +17,11 @@ def hoeffding_conf_int(x, N, cl=0.95, lower_bound=min(x), upper_bound=max(x), al N : int population size cl : float in (0, 1) - the desired confidence level. Default 0.95. - lower_bound : float - lower bound for the observations - upper_bound : float - upper bound for the observations + the desired confidence level. Default 0.95. 
+ lower_bound : scalar or array-like + lower bound for the observations + upper_bound : scalar or array-like + upper bound for the observations alternative : {'greater', 'less', 'two-sided'} alternative hypothesis to test (default: 'greater') @@ -29,10 +29,31 @@ def hoeffding_conf_int(x, N, cl=0.95, lower_bound=min(x), upper_bound=max(x), al ------- tuple the confidence limits - """ - x_bar = np.mean(x) - tau_sq = N*math.pow(upper_bound - lower_bound) - alpha = 1-cl - hCrit = np.sqrt(-math.log(alpha/2)*tau_sq/(2*N**2)) - ci_low, ci_upp = x_bar - hCrit, x_bar + hCrit - return ci_low, ci_upp \ No newline at end of file + """ + #Check if Bounds are Scalar or Array-Like + if (isinstance(lower_bound, float) or isinstance(lower_bound, int)) and (isinstance(upper_bound, float) or isinstance(upper_bound, int)): + tau_sq = N*((upper_bound - lower_bound)**2) + max_upper = upper_bound + min_lower = lower_bound + elif isinstance(lower_bound, tuple) and isinstance(lower_bound, tuple): + if len(lower_bound) != N or len(upper_bound) != N: + raise ValueError("Bad Bound Input Length") + if np.sum(upper_bound > lower_bound) != N: + raise ValueError("Invalid Upper and Lower bounds") + else: + tau_sq = np.sum(upper_bound - lower_bound)**2 + + max_upper = np.mean(upper_bound) + min_lower = np.mean(lower_bound) + + x_bar = np.mean(x) + hCrit = np.sqrt(-math.log((1-cl)/2)*tau_sq/(2*N**2)) + ci_low, ci_upp = x_bar - hCrit, x_bar + hCrit + + #Truncated Bounds of the Hoeffding Confidence Interval + if ci_upp > max_upper: + ci_upp = max_upper + if ci_low < min_lower: + ci_low = min_lower + + return ci_low, ci_upp \ No newline at end of file From f538e968c1e1552830a013ca6f8e582627b4b71d Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Wed, 13 Dec 2017 02:16:33 -0800 Subject: [PATCH 16/24] Major changes to one sample tests. Merged one_sample_shift into one_sample with 'shifted' center parameter named 'center'. 
Changed function signatures of one_sample_percentile, one_sample_percentile_ci (once one_sample_percentile_conf_int) --- permute/core.py | 209 +++++++++---------------------------- permute/tests/test_core.py | 80 +++++++------- 2 files changed, 89 insertions(+), 200 deletions(-) diff --git a/permute/core.py b/permute/core.py index 3d9c42c7..6f4b1ddf 100644 --- a/permute/core.py +++ b/permute/core.py @@ -7,7 +7,7 @@ import numpy as np from scipy.optimize import brentq, fsolve -from scipy.stats import ttest_ind, ttest_1samp +from scipy.stats import ttest_ind, ttest_1samp, binom from .utils import get_prng, potential_outcomes, binom_conf_interval from .binomialp import binomial_p @@ -434,8 +434,7 @@ def two_sample_conf_int(x, y, cl=0.95, alternative="two-sided", seed=None, return ci_low, ci_upp -#ROUGH DRAFT: One sample test for percentile. -def one_sample_percentile(x, y=None, p=50, reps=10**5, alternative="greater", keep_dist=False, seed=None): +def one_sample_percentile(x, x_p, p=50, alternative="less"): r""" One-sided or two-sided test for the percentile P of a population distribution. assuming there is an P/100 chance for each value of the sample to be in the @@ -450,9 +449,8 @@ def one_sample_percentile(x, y=None, p=50, reps=10**5, alternative="greater", ke ---------- x : array-like Sample 1 - y : array-like - Sample 2. Must preserve the order of pairs with x. - If None, x is taken to be the one sample. + x_p : numeric + Null Hypothesis p: int in [0,100] Percentile of interest to test. 
reps : int @@ -473,28 +471,27 @@ def one_sample_percentile(x, y=None, p=50, reps=10**5, alternative="greater", ke float the estimated p-value float - the test statistic: Number of values at or below percentile P of sample + the test statistic: Number of values at or below x_p in the samples list distribution of test statistics (only if keep_dist == True) """ - if y is None: - z = x - elif len(x) != len(y): - raise ValueError('x and y must be pairs') - else: - z = np.array(x) - np.array(y) - if not 0 <= p <= 100: raise ValueError('p must be between 0 and 100') - return binomial_p(np.sum(z <= np.percentile(z, p)), len(z), p/100, reps=reps, \ - alternative=alternative, keep_dist=keep_dist, seed=seed) + thePvalue = { + 'greater': lambda p: 1 - p, + 'less': lambda p: p, + 'two-sided': lambda p: 2 * np.min([p, 1 - p]) + } + + num_under = np.sum(x <= x_p) + p_value = binom.cdf(num_under, n=len(x), p=p/100) + return thePvalue[alternative](p_value), num_under -# ROUGH DRAFT: One sample confidence intervals for percentiles -def one_sample_percentile_conf_int(x, y=None, p=50, cl=0.95, alternative="two-sided", seed=None): +def one_sample_percentile_ci(x, p=50, cl=0.95, alternative="two-sided", seed=None, reps=10**5): """ Confidence intervals for a percentile P of the population distribution of a univariate or paired sample. 
@@ -524,134 +521,37 @@ def one_sample_percentile_conf_int(x, y=None, p=50, cl=0.95, alternative="two-si """ - if y is None: - z = x - elif len(x) != len(y): - raise ValueError('x and y must be pairs') - else: - z = np.array(x) - np.array(y) - if not 0 <= p <= 100: raise ValueError('p must be between 0 and 100') - conf_int = binom_conf_interval(len(z), np.sum(z <= np.percentile(z, p)), cl=cl, alternative="two-sided", p=p/100) - return (np.percentile(z, conf_int[0]*100), np.percentile(z, conf_int[1]*100)) - - -def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", - keep_dist=False, seed=None): - r""" - One-sided or two-sided, one-sample permutation test for the mean, - with p-value estimated by simulated random sampling with - reps replications. - - Alternatively, a permutation test for equality of means of two paired - samples. - - Tests the hypothesis that x is distributed symmetrically symmetric about 0 - (or x and y have the same center) against the alternative that x comes from - a population with mean - - (a) greater than 0 (greater than that of the population from which y comes), - if side = 'greater' - (b) less than 0 (less than that of the population from which y comes), - if side = 'less' - (c) different from 0 (different from that of the population from which y comes), - if side = 'two-sided' - - If ``keep_dist``, return the distribution of values of the test statistic; - otherwise, return only the number of permutations for which the value of - the test statistic and p-value. - - Parameters - ---------- - x : array-like - Sample 1 - y : array-like - Sample 2. Must preserve the order of pairs with x. - If None, x is taken to be the one sample. - reps : int - number of repetitions - stat : {'mean', 't'} - The test statistic. The statistic is computed based on either z = x or - z = x - y, if y is specified. - - (a) If stat == 'mean', the test statistic is mean(z). 
- (b) If stat == 't', the test statistic is the t-statistic-- - but the p-value is still estimated by the randomization, - approximating the permutation distribution. - (c) If stat is a function (a callable object), the test statistic is - that function. The function should take a permutation of the - data and compute the test function from it. For instance, if the - test statistic is the maximum absolute value, $\max_i |z_i|$, - the test statistic could be written: + #shift_limit = np.max([np.max(x) - np.median(x), np.median(x) - np.min(x)] + x = np.sort(x) + ci_low = np.min(x) + ci_upp = np.max(x) - f = lambda u: np.max(abs(u)) - alternative : {'greater', 'less', 'two-sided'} - The alternative hypothesis to test - keep_dist : bool - flag for whether to store and return the array of values - of the irr test statistic - seed : RandomState instance or {None, int, RandomState instance} - If None, the pseudorandom number generator is the RandomState - instance used by `np.random`; - If int, seed is the seed used by the random number generator; - If RandomState instance, seed is the pseudorandom number generator - - - Returns - ------- - float - the estimated p-value - float - the test statistic - list - The distribution of test statistics. 
- These values are only returned if `keep_dist` == True - """ - prng = get_prng(seed) + if alternative == 'two-sided': + cl = 1 - (1 - cl) / 2 - if y is None: - z = x - elif len(x) != len(y): - raise ValueError('x and y must be pairs') - else: - z = np.array(x) - np.array(y) + if alternative != "upper": + g = lambda q: cl - one_sample_percentile(x, q, p=p, alternative="greater")[0] + ci_low = brentq(g, np.min(x), np.max(x)) + ci_low = x[np.searchsorted(x, ci_low) - 1] - thePvalue = { - 'greater': lambda p: p, - 'less': lambda p: 1 - p, - 'two-sided': lambda p: 2 * np.min([p, 1 - p]) - } + if alternative != "lower": + g = lambda q: cl - one_sample_percentile(x, q, p=p, alternative="less")[0] + ci_upp = brentq(g, np.min(x), np.max(x)) + ci_upp = x[np.searchsorted(x, ci_upp)] - stats = { - 'mean': lambda u: np.mean(u), - 't': lambda u: ttest_1samp(u, 0)[0], - 'median': lambda u: np.median(u) - } - if callable(stat): - tst_fun = stat - else: - tst_fun = stats[stat] + return ci_low, ci_upp - tst = tst_fun(z) - n = len(z) - if keep_dist: - dist = [] - for i in range(reps): - dist.append(tst_fun(z * (1 - 2 * prng.binomial(1, .5, size=n)))) - hits = np.sum(dist >= tst) - return thePvalue[alternative](hits / reps), tst, dist - else: - hits = np.sum([(tst_fun(z * (1 - 2 * prng.binomial(1, .5, size=n)))) >= tst - for i in range(reps)]) - return thePvalue[alternative](hits / reps), tst + # conf_int = binom_conf_interval(len(z), np.sum(z <= np.percentile(z, p)), cl=cl, alternative="two-sided", p=p/100) + # return (np.percentile(z, conf_int[0]*100), np.percentile(z, conf_int[1]*100)) -def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", - keep_dist=False, seed=None, shift = None): +def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", + keep_dist=False, seed=None, center = None): r""" One-sided or two-sided, one-sample permutation test for the mean, with p-value estimated by simulated random sampling with @@ -660,18 +560,15 @@ def 
one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", Alternatively, a permutation test for equality of means of two paired samples. - This function assumed a shift model. Given a shift (float), this test will - find the - - Tests the hypothesis that x is distributed symmetrically symmetric about 0 - (or x and y have the same center) against the alternative that x comes from + Tests the hypothesis that x is distributed symmetrically symmetric about + CENTER (or x and y have the same center) against the alternative that x comes from a population with mean - (a) greater than 0 (greater than that of the population from which y comes), + (a) greater than CENTER (greater than that of the population from which y comes), if side = 'greater' - (b) less than 0 (less than that of the population from which y comes), + (b) less than CENTER (less than that of the population from which y comes), if side = 'less' - (c) different from 0 (different from that of the population from which y comes), + (c) different from CENTER (different from that of the population from which y comes), if side = 'two-sided' If ``keep_dist``, return the distribution of values of the test statistic; @@ -712,8 +609,8 @@ def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", instance used by `np.random`; If int, seed is the seed used by the random number generator; If RandomState instance, seed is the pseudorandom number generator - shift : float - Assumption of symmetry around the shift, d. + center : float + Assumption of symmetry around this center. 
Returns ------- @@ -751,18 +648,18 @@ def one_sample_shift(x, y=None, reps=10**5, stat='mean', alternative="greater", tst = tst_fun(z) - if shift is None: - shift = float(0) + if center is None: + center = float(0) n = len(z) if keep_dist: dist = [] for i in range(reps): - dist.append(tst_fun(shift + (z - shift) * (1 - 2 * prng.binomial(1, .5, size=n)))) + dist.append(tst_fun(center + (z - center) * (1 - 2 * prng.binomial(1, .5, size=n)))) hits = np.sum(dist >= tst) return thePvalue[alternative](hits / reps), tst, dist else: - hits = np.sum([(tst_fun(shift + (z - shift) * (1 - 2 * prng.binomial(1, .5, size=n)))) >= tst + hits = np.sum([(tst_fun(center + (z - center) * (1 - 2 * prng.binomial(1, .5, size=n)))) >= tst for i in range(reps)]) return thePvalue[alternative](hits / reps), tst @@ -792,7 +689,7 @@ def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None If RandomState instance, seed is the pseudorandom number generator reps : int number of repetitions in two_sample - stat : {'mean', 't'} + stat : {'mean', 't'} The test statistic. 
(a) If stat == 'mean', the test statistic is (mean(x) - mean(y)) @@ -878,19 +775,13 @@ def one_sample_conf_int(x, y = None, cl=0.95, alternative="two-sided", seed=None cl = 1 - (1 - cl) / 2 if alternative != "upper": - # if shift is None: - # g = lambda q: cl - one_sample_shift(x, y, alternative="less", seed=seed, reps=reps, stat=stat, shift=q)[0] - # else: - g = lambda q: cl - one_sample_shift(z, alternative="less", seed=seed, \ - reps=reps, stat=stat, shift=q)[0] + g = lambda q: cl - one_sample(z, alternative="less", seed=seed, \ + reps=reps, stat=stat, center=q)[0] ci_low = brentq(g, tst - 2 * shift_limit, tst + 2 * shift_limit) if alternative != "lower": - # if shift is None: - # g = lambda q: cl - one_sample_shift(x, y, alternative="greater", seed=seed, reps=reps, stat=stat, shift=q)[0] - # else: - g = lambda q: cl - one_sample_shift(z, alternative="greater", seed=seed, \ - reps=reps, stat=stat, shift=q)[0] + g = lambda q: cl - one_sample(z, alternative="greater", seed=seed, \ + reps=reps, stat=stat, center=q)[0] ci_upp = brentq(g, tst - 2 * shift_limit, tst + 2 * shift_limit) return ci_low, ci_upp diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index 6664dee3..858649fb 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -6,6 +6,7 @@ import numpy as np from numpy.random import RandomState +from scipy.stats import binom from ..core import (corr, @@ -13,10 +14,10 @@ two_sample_shift, two_sample_conf_int, one_sample, - one_sample_shift, + one_sample, one_sample_conf_int, one_sample_percentile, - one_sample_percentile_conf_int) + one_sample_percentile_ci) def test_corr(): @@ -233,27 +234,20 @@ def test_one_sample(): np.testing.assert_almost_equal(res[0], 0.059999999999) np.testing.assert_almost_equal(res[1], 0.8) - -def test_one_sample_shift(): prng = RandomState(42) x = np.array(range(10)) y = x - 1 # case 1: - res0 = one_sample_shift(x, seed=prng, reps=100, shift=3) - res1 = one_sample_shift(x, seed=prng, reps=100, 
shift=7) + res0 = one_sample(x, seed=prng, reps=100, center=3) + res1 = one_sample(x, seed=prng, reps=100, center=7) np.testing.assert_almost_equal(res0[0], 0.07) np.testing.assert_equal(res0[1], np.mean(x)) np.testing.assert_almost_equal(res1[0], 1) - # one_sample_shift with no shift is the same as a shift of 0 - zero = one_sample_shift(x, seed=42, reps=100) - zero_reg = one_sample(x, seed=42, reps=100) - np.testing.assert_almost_equal(zero, zero_reg) - # case 2: paired sample - res = one_sample_shift(x, y, seed=42, reps=100, shift=1, keep_dist=True) + res = one_sample(x, y, seed=42, reps=100, center=1, keep_dist=True) np.testing.assert_almost_equal(res[0], 1) np.testing.assert_equal(res[1], 1) dist_unique = np.unique(res[2]) # Distribution should be all 1's @@ -262,23 +256,23 @@ def test_one_sample_shift(): # case 3: t test statistic x = np.array(range(5)) - res = one_sample_shift(x, seed=42, reps=100, stat="t", alternative="less", shift=0.1) + res = one_sample(x, seed=42, reps=100, stat="t", alternative="less", center=0.1) np.testing.assert_almost_equal(res[0], 0.93999999999999995) np.testing.assert_almost_equal(res[1], 2.8284271247461898) # case 4: break it - supply x and y, but not paired y = np.append(y, 10) - assert_raises(ValueError, one_sample_shift, x, y) + assert_raises(ValueError, one_sample, x, y) # case 5: use median as test statistic x = np.array(range(10)) - res = one_sample_shift(x, seed=42, reps=100, stat="median", shift=4.5) + res = one_sample(x, seed=42, reps=100, stat="median", center=4.5) np.testing.assert_almost_equal(res[0], 0.53) np.testing.assert_equal(res[1], 4.5) # case 6: Test statistic is a function pcntl = lambda x: np.percentile(x, 20) - res = one_sample_shift(x, seed=42, reps=100, stat=pcntl, shift=2) + res = one_sample(x, seed=42, reps=100, stat=pcntl, center=2) np.testing.assert_almost_equal(res[0], 0.029999999999999999) np.testing.assert_almost_equal(res[1], 1.8) @@ -287,7 +281,7 @@ def test_one_sample_shift(): def 
test_one_sample_conf_int(): prng = RandomState(42) - # Standard confidenceinterval + # Standard confidence interval x = np.array(range(10)) res = one_sample_conf_int(x, seed=prng) expected_ci = (2.2696168, 6.6684788) @@ -349,39 +343,43 @@ def test_one_sample_conf_int_bad_shift(): def test_one_sample_percentile(): prng = RandomState(42) - x = np.arange(0, 100) - y = x + prng.normal(size=100) - res = one_sample_percentile(x=x, y=y, p=50, seed=42, reps=100) - np.testing.assert_almost_equal(res[0], 0.52) + x = np.arange(1, 101) + res = one_sample_percentile(x, 50, p=50, alternative="less") np.testing.assert_equal(res[1], 50) + expected_pval = binom.cdf(res[1], len(x), 0.5) + np.testing.assert_equal(res[0], expected_pval) - y = np.append(y, 10) - np.testing.assert_raises(ValueError, one_sample_percentile, x, y) + res = one_sample_percentile(x, 75, p=70, alternative="greater") + np.testing.assert_equal(res[1], 75) + expected_pval = 1 - binom.cdf(res[1], len(x), 0.7) + np.testing.assert_equal(res[0], expected_pval) - np.testing.assert_raises(ValueError, one_sample_percentile, x, p=101) + res = one_sample_percentile(x, 20, p=30, alternative="two-sided") + np.testing.assert_equal(res[1], 20) + expected_pval = 2 * binom.cdf(res[1], len(x), 0.3) + np.testing.assert_equal(res[0], expected_pval) - # Skewed sample - x_skew = np.array(list(x) + [80] * 50) - res = one_sample_percentile(x=x_skew, p=80, seed=42, reps=100) - np.testing.assert_almost_equal(res[0], 0.01) - np.testing.assert_equal(res[1], 131) + np.testing.assert_raises(ValueError, one_sample_percentile, x, x_p = 50, p=101) -@attr('slow') -def test_one_sample_percentile_conf_int(): +def test_one_sample_percentile_ci(): x = np.arange(0, 100) - res = one_sample_percentile_conf_int(x, seed=42, p=50) - np.testing.assert_almost_equal(res[0], 39.43379182082676) - np.testing.assert_almost_equal(res[1], 59.56620817917322) + res = one_sample_percentile_ci(x) + np.testing.assert_equal(res[0], 39) + 
np.testing.assert_almost_equal(res[1], 59) + res = one_sample_percentile_ci(x, p=50, alternative="upper") + np.testing.assert_equal(res[0], 0) + np.testing.assert_equal(res[1], 58) - y = np.append(x, 10) - np.testing.assert_raises(ValueError, one_sample_percentile_conf_int, x, y) + res = one_sample_percentile_ci(x, p=50, alternative="lower") + np.testing.assert_equal(res[0], 40) + np.testing.assert_equal(res[1], 99) - np.testing.assert_raises(ValueError, one_sample_percentile_conf_int, x, p=101) + y = np.append(x, 10) + np.testing.assert_raises(ValueError, one_sample_percentile_ci, x, p=101) prng = RandomState(42) - y = x + prng.normal(size=100) - res = one_sample_percentile_conf_int(x=x, y=y, p=20, seed=42) - expected_ci = (-0.94850907410172258, -0.34506992048853352) - np.testing.assert_almost_equal(res, expected_ci) + z = prng.normal(0, 5, 100) + res = one_sample_percentile_ci(z, p=50) + expected_ci = (-1.546061879256073, 0.55461294854933041) From b6cbb2b2e62d3a2d17ae8c8f07b5003d2a0013ac Mon Sep 17 00:00:00 2001 From: Michael Zheng Date: Thu, 14 Dec 2017 20:28:45 -0800 Subject: [PATCH 17/24] hoeffding and tests, regression --- permute/hoeffding.py | 31 +++++++++++++++------- permute/regcoeff.py | 47 +++++++++++++++++++++++++++++++++ permute/tests/test_hoeffding.py | 39 +++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 9 deletions(-) create mode 100644 permute/regcoeff.py create mode 100644 permute/tests/test_hoeffding.py diff --git a/permute/hoeffding.py b/permute/hoeffding.py index 219ff968..91b8f340 100644 --- a/permute/hoeffding.py +++ b/permute/hoeffding.py @@ -22,8 +22,8 @@ def hoeffding_conf_int(x, N, lower_bound, upper_bound, cl=0.95, alternative="one lower bound for the observations upper_bound : scalar or array-like upper bound for the observations - alternative : {'greater', 'less', 'two-sided'} - alternative hypothesis to test (default: 'greater') + alternative : {'one-sided', 'two-sided'} + alternative hypothesis to test (default: 
'one-sided') Returns ------- @@ -32,23 +32,36 @@ def hoeffding_conf_int(x, N, lower_bound, upper_bound, cl=0.95, alternative="one """ #Check if Bounds are Scalar or Array-Like if (isinstance(lower_bound, float) or isinstance(lower_bound, int)) and (isinstance(upper_bound, float) or isinstance(upper_bound, int)): + if max(x) > upper_bound or min(x) < lower_bound: + raise ValueError("x values not contained in bounds") tau_sq = N*((upper_bound - lower_bound)**2) max_upper = upper_bound min_lower = lower_bound - elif isinstance(lower_bound, tuple) and isinstance(lower_bound, tuple): + elif isinstance(lower_bound, np.ndarray) and isinstance(lower_bound, np.ndarray): + #Makes sure the bounds are valid, same length, makes sense, and values are in the bounds if len(lower_bound) != N or len(upper_bound) != N: raise ValueError("Bad Bound Input Length") - if np.sum(upper_bound > lower_bound) != N: + if np.sum(upper_bound >= lower_bound) != N: raise ValueError("Invalid Upper and Lower bounds") + if np.sum((x <= upper_bound) and (x >= lower_bound)) != N: + raise ValueError("x values not contained in bounds") else: - tau_sq = np.sum(upper_bound - lower_bound)**2 - + tau_sq = np.sum((upper_bound - lower_bound)**2) max_upper = np.mean(upper_bound) min_lower = np.mean(lower_bound) + else: + raise ValueError("Invalid Upper and Lower Bounds") x_bar = np.mean(x) - hCrit = np.sqrt(-math.log((1-cl)/2)*tau_sq/(2*N**2)) - ci_low, ci_upp = x_bar - hCrit, x_bar + hCrit + + if alternative == "one-sided": + hCrit = np.sqrt(-math.log(1-cl)*tau_sq/(2*N**2)) + ci_low = x_bar - hCrit + ci_upp = max_upper + + if alternative == "two-sided": + hCrit = np.sqrt(-math.log((1-cl)/2)*tau_sq/(2*N**2)) + ci_low, ci_upp = x_bar - hCrit, x_bar + hCrit #Truncated Bounds of the Hoeffding Confidence Interval if ci_upp > max_upper: @@ -56,4 +69,4 @@ def hoeffding_conf_int(x, N, lower_bound, upper_bound, cl=0.95, alternative="one if ci_low < min_lower: ci_low = min_lower - return ci_low, ci_upp \ No newline at 
end of file + return ci_low, ci_upp diff --git a/permute/regcoeff.py b/permute/regcoeff.py new file mode 100644 index 00000000..54e5a96d --- /dev/null +++ b/permute/regcoeff.py @@ -0,0 +1,47 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) + +import numpy as np +import math +from scipy.stats import linregress + +from utils import get_prng + +def reg_coeff(x, y, reps=10**5, cl=0.95, alternative="one-sided", seed=None): + r""" + Testing the coefficients of a linear regression. The assumption of the linear regression + is that + + Parameters + ---------- + x : array-like + list of observations + N : int + population size + cl : float in (0, 1) + the desired confidence level. Default 0.95. + lower_bound : scalar or array-like + lower bound for the observations + upper_bound : scalar or array-like + upper bound for the observations + alternative : {'greater', 'less', 'two-sided'} + alternative hypothesis to test (default: 'greater') + + Returns + ------- + tuple + the confidence limits + """ + true_slope = abs(linregress(x, y)[0]) + + prng = get_prng(seed) + slopes = [] + for _ in range(reps): + shuffle_x = np.random.permutation(x) + slope = linregress(shuffle_x, y)[0] + slopes.append(slope) + + p_value = sum(abs(np.array(slopes)) > true_slope)/float(reps) + + return p_value + diff --git a/permute/tests/test_hoeffding.py b/permute/tests/test_hoeffding.py new file mode 100644 index 00000000..eaf14dd4 --- /dev/null +++ b/permute/tests/test_hoeffding.py @@ -0,0 +1,39 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) + +from nose.plugins.attrib import attr +from nose.tools import assert_raises, raises + +import numpy as np +from numpy.random import RandomState + + +from ..hoeffding import hoeffding_conf_int + +def test_hoeffding(): + values = np.array([0.5]*10) + res = hoeffding_conf_int(values, 10, 0, 1, cl=0.95, alternative="two-sided") + expected_ci = (0.112977243, 1) + 
np.testing.assert_almost_equal(res, expected_ci) + + res = hoeffding_conf_int(values, 10, 0, 1, cl=0.95) + expected_ci = (0.070530591653262475, 0.92946940834673752) + np.testing.assert_almost_equal(res, expected_ci) + + lb = np.array(list([0]*10)) + ub = np.array(list([1]*10)) + res = hoeffding_conf_int(values, 10, lb, up, cl=0.95, alternative="two-sided") + expected_ci = (0.070530591653262475, 0.92946940834673752) + np.testing.assert_almost_equal(res, expected_ci) + +@raises(ValueError) +def test_hoeffding_conf_int_bad_bound(): + # Break it with a bad shift + values = np.array(range(10)) + res = hoeffding_conf_int(values, 0, 9, 5, cl=0.95, alternative="two-sided") + + + + + + \ No newline at end of file From cef709f6d88ba39abdd30bcce12703723f604a9b Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Thu, 14 Dec 2017 20:33:23 -0800 Subject: [PATCH 18/24] Changes to documentation of one_sample_percentile and one_sample_percentile_ci --- permute/core.py | 51 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/permute/core.py b/permute/core.py index 6f4b1ddf..1d1e8867 100644 --- a/permute/core.py +++ b/permute/core.py @@ -436,35 +436,27 @@ def two_sample_conf_int(x, y, cl=0.95, alternative="two-sided", seed=None, def one_sample_percentile(x, x_p, p=50, alternative="less"): r""" - One-sided or two-sided test for the percentile P of a population distribution. - assuming there is an P/100 chance for each value of the sample to be in the - Pth percentile. + One-sided or two-sided test for the true Pth percentile of a population distribution. - The null hypothesis is that there is an equal P/100 chance for any value of the - sample to lie at or below the sample Pth percentile. + The null hypothesis is that the given percentile is the true Pth percentile. + There is an equal P/100 chance for any value in a sample, of the same length as X, + and drawn from the population, to lie at or below the Pth percentile. 
- This test defaults to P=50. + The p-value returned is the probability of observing a test statistic + as extreme as X_P under the null hypothesis. + + This test defaults to P=50, the 50th percentile. Parameters ---------- x : array-like - Sample 1 + Sample x_p : numeric - Null Hypothesis + An estimate of the Pth percentile of the distribution. p: int in [0,100] Percentile of interest to test. - reps : int - number of repetitions alternative : {'greater', 'less', 'two-sided'} The alternative hypothesis to test - keep_dist : bool - flag for whether to store and return the array of values - of the irr test statistic - seed : RandomState instance or {None, int, RandomState instance} - If None, the pseudorandom number generator is the RandomState - instance used by `np.random`; - If int, seed is the seed used by the random number generator; - If RandomState instance, seed is the pseudorandom number generator Returns ------- @@ -472,8 +464,6 @@ def one_sample_percentile(x, x_p, p=50, alternative="less"): the estimated p-value float the test statistic: Number of values at or below x_p in the samples - list - distribution of test statistics (only if keep_dist == True) """ if not 0 <= p <= 100: @@ -491,27 +481,22 @@ def one_sample_percentile(x, x_p, p=50, alternative="less"): return thePvalue[alternative](p_value), num_under -def one_sample_percentile_ci(x, p=50, cl=0.95, alternative="two-sided", seed=None, reps=10**5): +def one_sample_percentile_ci(x, p=50, cl=0.95, alternative="two-sided"): """ Confidence intervals for a percentile P of the population distribution of a - univariate or paired sample. - - Compute a confidence interval for a binomial p, the probability of success in each trial. + univariate or paired sample of a confidence level CL solely based on a given + sample X drawn from the population. Parameters ---------- - n : int - The number of Bernoulli trials. - x : int - The number of successes. 
+ x : array-like + The sample cl : float in (0, 1) The desired confidence level. alternative : {"two-sided", "lower", "upper"} Indicates the alternative hypothesis. p : int in [0,100] - Desired percentile of interest to test. - kwargs : dict - Key word arguments + Desired percentile of interest to get a confidence interval of. Returns ------- @@ -524,7 +509,6 @@ def one_sample_percentile_ci(x, p=50, cl=0.95, alternative="two-sided", seed=Non if not 0 <= p <= 100: raise ValueError('p must be between 0 and 100') - #shift_limit = np.max([np.max(x) - np.median(x), np.median(x) - np.min(x)] x = np.sort(x) ci_low = np.min(x) ci_upp = np.max(x) @@ -547,9 +531,6 @@ def one_sample_percentile_ci(x, p=50, cl=0.95, alternative="two-sided", seed=Non - # conf_int = binom_conf_interval(len(z), np.sum(z <= np.percentile(z, p)), cl=cl, alternative="two-sided", p=p/100) - # return (np.percentile(z, conf_int[0]*100), np.percentile(z, conf_int[1]*100)) - def one_sample(x, y=None, reps=10**5, stat='mean', alternative="greater", keep_dist=False, seed=None, center = None): r""" From a21a801e0e2b096bea3366334b9853e24586e2c2 Mon Sep 17 00:00:00 2001 From: Michael Zheng Date: Thu, 14 Dec 2017 21:25:00 -0800 Subject: [PATCH 19/24] tests for hoeff and reg --- permute/hoeffding.py | 2 +- permute/regcoeff.py | 5 +---- permute/tests/test_hoeffding.py | 38 +++++++++++++++++++++++++++------ permute/tests/test_regcoeff.py | 19 +++++++++++++++++ 4 files changed, 52 insertions(+), 12 deletions(-) create mode 100644 permute/tests/test_regcoeff.py diff --git a/permute/hoeffding.py b/permute/hoeffding.py index 91b8f340..1ec91e23 100644 --- a/permute/hoeffding.py +++ b/permute/hoeffding.py @@ -43,7 +43,7 @@ def hoeffding_conf_int(x, N, lower_bound, upper_bound, cl=0.95, alternative="one raise ValueError("Bad Bound Input Length") if np.sum(upper_bound >= lower_bound) != N: raise ValueError("Invalid Upper and Lower bounds") - if np.sum((x <= upper_bound) and (x >= lower_bound)) != N: + if np.sum((x <= 
upper_bound)) != N or np.sum((x >= lower_bound)) != N: raise ValueError("x values not contained in bounds") else: tau_sq = np.sum((upper_bound - lower_bound)**2) diff --git a/permute/regcoeff.py b/permute/regcoeff.py index 54e5a96d..a70410b8 100644 --- a/permute/regcoeff.py +++ b/permute/regcoeff.py @@ -5,9 +5,7 @@ import math from scipy.stats import linregress -from utils import get_prng - -def reg_coeff(x, y, reps=10**5, cl=0.95, alternative="one-sided", seed=None): +def reg_coeff(x, y, reps=10**5, cl=0.95): r""" Testing the coefficients of a linear regression. The assumption of the linear regression is that @@ -34,7 +32,6 @@ def reg_coeff(x, y, reps=10**5, cl=0.95, alternative="one-sided", seed=None): """ true_slope = abs(linregress(x, y)[0]) - prng = get_prng(seed) slopes = [] for _ in range(reps): shuffle_x = np.random.permutation(x) diff --git a/permute/tests/test_hoeffding.py b/permute/tests/test_hoeffding.py index eaf14dd4..5095b217 100644 --- a/permute/tests/test_hoeffding.py +++ b/permute/tests/test_hoeffding.py @@ -13,24 +13,48 @@ def test_hoeffding(): values = np.array([0.5]*10) res = hoeffding_conf_int(values, 10, 0, 1, cl=0.95, alternative="two-sided") - expected_ci = (0.112977243, 1) + expected_ci = (0.070530591653262475, 0.92946940834673752) np.testing.assert_almost_equal(res, expected_ci) res = hoeffding_conf_int(values, 10, 0, 1, cl=0.95) - expected_ci = (0.070530591653262475, 0.92946940834673752) + expected_ci = (0.11297724397950515, 1) np.testing.assert_almost_equal(res, expected_ci) lb = np.array(list([0]*10)) ub = np.array(list([1]*10)) - res = hoeffding_conf_int(values, 10, lb, up, cl=0.95, alternative="two-sided") + res = hoeffding_conf_int(values, 10, lb, ub, cl=0.95, alternative="two-sided") expected_ci = (0.070530591653262475, 0.92946940834673752) np.testing.assert_almost_equal(res, expected_ci) -@raises(ValueError) + def test_hoeffding_conf_int_bad_bound(): - # Break it with a bad shift - values = np.array(range(10)) - res = 
hoeffding_conf_int(values, 0, 9, 5, cl=0.95, alternative="two-sided") + # Break it with incorrect upper and lower bounds + x = np.array(range(10)) + assert_raises(ValueError, hoeffding_conf_int, x, 10, 5, 6) + + # Invalid upper and lower bounds + upper_bound = 1 + lower_bound = np.array([10, 10, 10]) + assert_raises(ValueError, hoeffding_conf_int, x, 10, lower_bound, upper_bound) + + # Bad bound input length + upper_bound = np.array([10, 10]) + lower_bound = np.array([10, 10]) + assert_raises(ValueError, hoeffding_conf_int, x, 10, lower_bound, upper_bound) + + # bounds are not value + upper_bound = np.array([10]*10) + lower_bound = np.array([20]*10) + assert_raises(ValueError, hoeffding_conf_int, x, 10, lower_bound, upper_bound) + + # x not in range of bounds + x = np.array([5]*10) + upper_bound = np.array([3]*10) + lower_bound = np.array([2]*10) + assert_raises(ValueError, hoeffding_conf_int, x, 10, lower_bound, upper_bound) + + + diff --git a/permute/tests/test_regcoeff.py b/permute/tests/test_regcoeff.py new file mode 100644 index 00000000..bc6174ea --- /dev/null +++ b/permute/tests/test_regcoeff.py @@ -0,0 +1,19 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) + +from nose.plugins.attrib import attr +from nose.tools import assert_raises, raises + +import numpy as np +from numpy.random import RandomState + +from ..regcoeff import reg_coeff + +def test_regcoeff(): + values = np.array([0.5]*10) + x = np.array(range(10)) + y = np.array(range(10)) + + res = reg_coeff(x, y, cl = 0.95) + expected = 0 + np.testing.assert_almost_equal(res, expected) \ No newline at end of file From 5584bb821c909865becf7d9ad4fd6f5c19627d9f Mon Sep 17 00:00:00 2001 From: Andrew Linxie Date: Thu, 14 Dec 2017 22:34:07 -0800 Subject: [PATCH 20/24] Added prng to regression tests --- permute/regcoeff.py | 11 ++++++++--- permute/tests/test_regcoeff.py | 8 +++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/permute/regcoeff.py 
b/permute/regcoeff.py index a70410b8..1c2ea31f 100644 --- a/permute/regcoeff.py +++ b/permute/regcoeff.py @@ -4,8 +4,9 @@ import numpy as np import math from scipy.stats import linregress +from .utils import get_prng -def reg_coeff(x, y, reps=10**5, cl=0.95): +def reg_coeff(x, y, reps=10**5, cl=0.95, seed=None): r""" Testing the coefficients of a linear regression. The assumption of the linear regression is that @@ -30,11 +31,15 @@ def reg_coeff(x, y, reps=10**5, cl=0.95): tuple the confidence limits """ - true_slope = abs(linregress(x, y)[0]) + if len(x) != len(y): + raise ValueError("Input Vector lengths do not match") + + true_slope = abs(linregress(x, y)[0]) + prng = get_prng(seed) slopes = [] for _ in range(reps): - shuffle_x = np.random.permutation(x) + shuffle_x = prng.permutation(x) slope = linregress(shuffle_x, y)[0] slopes.append(slope) diff --git a/permute/tests/test_regcoeff.py b/permute/tests/test_regcoeff.py index bc6174ea..42f3a981 100644 --- a/permute/tests/test_regcoeff.py +++ b/permute/tests/test_regcoeff.py @@ -13,7 +13,9 @@ def test_regcoeff(): values = np.array([0.5]*10) x = np.array(range(10)) y = np.array(range(10)) - - res = reg_coeff(x, y, cl = 0.95) + res = reg_coeff(x, y, cl = 0.95, seed=42) expected = 0 - np.testing.assert_almost_equal(res, expected) \ No newline at end of file + np.testing.assert_almost_equal(res, expected) + + y.append(10) + assert_raises(ValueError, reg_coeff, x, y) \ No newline at end of file From 932df1c60391241e6fe3199082d36c648b04e76c Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Fri, 15 Dec 2017 03:50:33 -0800 Subject: [PATCH 21/24] Implemented two_sample_fit but only KS is supported. 
--- permute/core.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/permute/core.py b/permute/core.py index 1d1e8867..9c8b0c16 100644 --- a/permute/core.py +++ b/permute/core.py @@ -6,6 +6,7 @@ print_function, unicode_literals) import numpy as np +import math from scipy.optimize import brentq, fsolve from scipy.stats import ttest_ind, ttest_1samp, binom @@ -188,7 +189,9 @@ # dictionary stats = { 'mean': lambda u, v: np.mean(u) - np.mean(v), - 't': lambda u, v: ttest_ind(u, v, equal_var=True)[0] + 't': lambda u, v: ttest_ind(u, v, equal_var=True)[0], + 'KS': lambda u, v: np.max([abs(sum(u <= a) / len(x) - sum(v <= a) / len(y)) + for a in set(u).intersection(set(y))]) } if callable(stat): tst_fun = stat @@ -206,6 +209,36 @@ else: return res, observed_tst +def two_sample_fit(x, y, alpha, method='KS'): + ''' + Apply permutation methods to test the goodness of fit of samples X and Y. + Currently only the Kolmogorov-Smirnov (KS) test is implemented. + + Parameters + ---------- + x: array-like. + sample X + y: array-like. + sample Y + alpha: float in [0, 1] + type I error + + Returns + ------- + boolean + True means two samples come from the same underlying distribution. + False means they do not. + + Notes + ----- + method + This will be the interface function invoking two_sample + for all goodness of fit test statistics for future implementations. 
+ ''' + _, stats = two_sample(x, y, stat='KS') + c = math.sqrt((len(x) + len(y))) / (len(x) * len(y)) + c *= math.sqrt(-0.5 * math.log(alpha * 0.5)) + return stats <= c def two_sample_shift(x, y, reps=10**5, stat='mean', alternative="greater", keep_dist=False, seed=None, shift=None): From 1a66639d7813db002df08b4a9a5b2d9102dfd46c Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Fri, 15 Dec 2017 03:54:46 -0800 Subject: [PATCH 22/24] added kaplan-ward and sprt --- permute/kaplanward.py | 110 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 permute/kaplanward.py diff --git a/permute/kaplanward.py b/permute/kaplanward.py new file mode 100644 index 00000000..2c163778 --- /dev/null +++ b/permute/kaplanward.py @@ -0,0 +1,110 @@ +import math +import numpy as np +import scipy as sp + +def kaplan_wald_CI(x, level=0.99, lo=0, hi=float('inf'), gamma=0.95, xtol=1.e-32): + ''' + Calculates the confidence interval for the mean of a nonnegative random variable + using the Kaplan-Wald method. + + Parameters + ---------- + x : array-like + your input sample + level : float in (0, 1) + indicates your desired confidence level + lo : float in [0, float('inf')] + the lower bound of your random variable, must be nonnegative + hi : float in (lo, float('inf')) + the upper bound of your random variable, optional + + Returns + ------- + tuple + The estimated confidence interval. If the upper bound is not specified, + only the lower bound of the confidence interval will be returned. 
+ + Notes + ----- + gamma : float in (0, 1) + the variable introduced in Kaplan-Wald method to hedge against small values + xtol : float + Tolerance in brentq + ''' + alpha = 1.0 - level + if not (0 < level < 1): + raise ValueError('CI level must be between 0 and 1.') + if any(x < 0): + raise ValueError('Data x must be nonnegative.') + if lo > hi or any(x < lo) or any(x > hi): + raise ValueError('Data x is not in the specified range.') + def find_t(data, start): + f = lambda t: (np.max(np.cumsum(np.log(gamma*data/t + 1-gamma))) + np.log(alpha)) + start, end = start, np.mean(data) + if f(start) * f(end) <= 0.0: + return sp.optimize.brentq(f, start, end, xtol=xtol) + else: + return start + lo = find_t(x, lo + xtol) + if hi != float('inf'): + hi = find_t(hi - x, xtol) + np.mean(x) + return (lo, hi) + + +def sprt_proportion(x, p0, p1, alpha, beta, start=1, plot=False): + ''' + The model aims to determine the quality of a batch of products by minimal sampling. + The idea is to sample the batch sequentially until a decision can be made whether + the batch conforms to specification and can be accepted or that it should be rejected. + + Parameters + ---------- + x : array-like. your input sample. + each value should indicate success or failure. + alpha: float in [0, 1] + type I error + beta: float in [0, 1] + type II error + p0: float in [0, 1]. proportion to accept null hypothesis + the proportion of positives below which + a decision that the proportion is null can be made + p1: float in [0, 1]. 
proportion to reject null hypothesis + the proportion of positives above which + a decision that the proportion is null can be made + + Returns + ------- + tuple + tuple[0] can be'Reject' the null, 'Accept' the null, + or 'Unknown' based on the data + tuple[1] is the log likelihood ratio for SPRT when a + decision is made or running out of data + tuple[2] is the index of the trail where a decision is made + + Notes + ----- + start + If the last batch of data gives unknown decision, + one can feed the tuple[1] into the param start to continue the SPRT. + ''' + if not (0 <= p0 <= 1 and 0 <= p1 <= 1): + raise ValueError('Proportion must lie in [0, 1]') + if not (0 < alpha < 1 and 0 < beta < 1): + raise ValueError('Error control param must lie in (0, 1)') + lr = start + s_fac = math.log(p1 / p0) + f_fac = math.log((1.0-p1) / (1.0-p0)) + s_bnd = math.log((1-beta) / alpha) + f_bnd = math.log(beta / (1-alpha)) + decision = 'Unknown' + for idx, trail in enumerate(x, 1): + trail = int(trail) + if trail != 0 and trail != 1: + raise ValueError('every trail in the data must be a truthy/falsy value.') + lr += s_fac if trail == 1 else f_fac + if lr >= s_bnd: + decision = 'Reject' + break + elif lr <= f_bnd: + decision = 'Accept' + return (decision, lr, idx) From e413f7350bbeeca6ca91b9d48a136f8c442cd738 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Fri, 15 Dec 2017 16:41:35 -0800 Subject: [PATCH 23/24] Fixed test_regcoeff syntax error --- permute/tests/test_regcoeff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/permute/tests/test_regcoeff.py b/permute/tests/test_regcoeff.py index 42f3a981..c595b520 100644 --- a/permute/tests/test_regcoeff.py +++ b/permute/tests/test_regcoeff.py @@ -17,5 +17,5 @@ def test_regcoeff(): expected = 0 np.testing.assert_almost_equal(res, expected) - y.append(10) - assert_raises(ValueError, reg_coeff, x, y) \ No newline at end of file + y = np.array(range(11)) + assert_raises(ValueError, reg_coeff, x, y) From 
e3ece9dcff0c110b3af0b02285cf55cd82055940 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Fri, 15 Dec 2017 17:20:01 -0800 Subject: [PATCH 24/24] Fixed tests for two_sample_fit --- permute/core.py | 2 +- permute/tests/test_core.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/permute/core.py b/permute/core.py index 9c8b0c16..b1b0d756 100644 --- a/permute/core.py +++ b/permute/core.py @@ -191,7 +191,7 @@ def two_sample(x, y, reps=10**5, stat='mean', alternative="greater", 'mean': lambda u, v: np.mean(u) - np.mean(v), 't': lambda u, v: ttest_ind(u, v, equal_var=True)[0], 'KS': lambda u, v: np.max([abs(sum(u <= a) / len(x) - sum(v <= a) / len(y)) - for a in set(u).intersection(set(y))]) + for a in set(u).union(set(y))]) } if callable(stat): tst_fun = stat diff --git a/permute/tests/test_core.py b/permute/tests/test_core.py index 858649fb..965858fe 100644 --- a/permute/tests/test_core.py +++ b/permute/tests/test_core.py @@ -13,6 +13,7 @@ two_sample, two_sample_shift, two_sample_conf_int, + two_sample_fit, one_sample, one_sample, one_sample_conf_int, @@ -95,6 +96,17 @@ def test_two_sample(): np.testing.assert_equal(res[0], expected[0]) np.testing.assert_equal(res[1], expected[1]) +def test_two_sample_fit(): + x = np.array(range(10)) + y = np.array(range(10)) + alpha = 0.01 + res = two_sample_fit(x, y, alpha) + np.testing.assert_equal(res, True) + + y = np.array(range(10, 20)) + alpha = 0.1 + res = two_sample_fit(x, y, alpha) + np.testing.assert_equal(res, False) def test_two_sample_shift(): prng = RandomState(42)