From 83a15bb5b8cda17bddda6bb373392e99deec00ad Mon Sep 17 00:00:00 2001 From: ennanco Date: Thu, 24 Feb 2022 19:30:06 +0100 Subject: [PATCH 01/21] changes in functions generate_samples and generate_counts to allow generate more than one feature --- examples/plot_vertical.py | 13 +++++++++++++ upsetplot/data.py | 25 +++++++++++++++---------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/examples/plot_vertical.py b/examples/plot_vertical.py index 4108fc5..2b9816a 100644 --- a/examples/plot_vertical.py +++ b/examples/plot_vertical.py @@ -25,3 +25,16 @@ plot(example, orientation='vertical', show_counts='%d', show_percentages=True) plt.suptitle('With counts and percentages shown') plt.show() + +######################################################################### + +from upsetplot import plotting + +# An UpSetplot with additional plots on vertical and tuning some visual parameters +fig = plotting.UpSet(example, orientation='vertical', show_counts=True, facecolor="grey", element_size=75) +fig.add_catplot('swarm', 'value', palette='colorblind') +fig.add_catplot('swarm', 'value1', palette='colorblind') +fig.add_catplot('swarm', 'value2', palette='colorblind') +fig.plot() +plt.show() + diff --git a/upsetplot/data.py b/upsetplot/data.py index ca8605d..2b75f37 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -8,7 +8,7 @@ import numpy as np -def generate_samples(seed=0, n_samples=10000, n_categories=3): +def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): """Generate artificial samples assigned to set intersections Parameters @@ -25,6 +25,7 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3): DataFrame Field 'value' is a weight or score for each element. Field 'index' is a unique id for each element. + Field(s) 'value{i}' additional values for multiple-feature samples Index includes a boolean indicator mask for each category. Note: Further fields may be added in future versions. @@ -35,18 +36,21 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3): corresponding to these samples. """ rng = np.random.RandomState(seed) - df = pd.DataFrame({'value': np.zeros(n_samples)}) + df = pd.DataFrame(np.zeros((n_samples, len_samples))) + valuename_lst = [f'value{i}' if i >0 else 'value' for i in range(len_samples)] + df.columns = valuename_lst + for i in range(n_categories): - r = rng.rand(n_samples) - df['cat%d' % i] = r > rng.rand() - df['value'] += r + r = rng.rand(n_samples, len_samples) + df[f'cat{i}'] = r[:,0] > rng.rand() + df[valuename_lst] += r df.reset_index(inplace=True) - df.set_index(['cat%d' % i for i in range(n_categories)], inplace=True) + df.set_index([f'cat{i}' for i in range(n_categories)], inplace=True) return df -def generate_counts(seed=0, n_samples=10000, n_categories=3): +def generate_counts(seed=0, n_samples=10000, n_categories=3, len_samples=1): """Generate artificial counts corresponding to set intersections Parameters @@ -69,8 +73,9 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3): derived from. """ df = generate_samples(seed=seed, n_samples=n_samples, - n_categories=n_categories) - return df.value.groupby(level=list(range(n_categories))).count() + n_categories=n_categories, len_samples=len_samples) + df.drop('index', axis=1, inplace=True) + return df.groupby(level=list(range(n_categories))).count() def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False): @@ -79,7 +84,7 @@ def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False): DeprecationWarning) if aggregated: return generate_counts(seed=seed, n_samples=n_samples, - n_categories=n_sets) + n_categories=n_sets)['value'] else: return generate_samples(seed=seed, n_samples=n_samples, n_categories=n_sets)['value'] From 5dab6084599a42c75638d17773b837be17ad92af Mon Sep 17 00:00:00 2001 From: ennanco Date: Mon, 28 Feb 2022 20:10:43 +0100 Subject: [PATCH 02/21] added unitary tests for generate_samples and generate_counts funtions --- upsetplot/data.py | 2 ++ upsetplot/tests/test_data.py | 61 ++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index 2b75f37..72dd86f 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -61,6 +61,8 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, len_samples=1): Number of samples to generate statistics over n_categories : int Number of categories (named "cat0", "cat1", ...) to generate + len_samples: int + Number of features for each sample (value, value1, value2, ...) Returns ------- diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index 5937762..35b0856 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -7,6 +7,7 @@ assert_index_equal) from upsetplot import (from_memberships, from_contents, from_indicators, generate_data) +from upsetplot.data import generate_samples, generate_counts @pytest.mark.parametrize('typ', [set, list, tuple, iter]) @@ -206,7 +207,61 @@ def test_from_indicators_equivalence(indicators, data): assert_frame_equal(from_indicators(indicators, data), from_memberships([[], ["cat1"], []], data)) +class TestGenerateData: + def test_generate_data_warning(self): + ''' + Check the warning araised by the function + ''' + with pytest.warns(DeprecationWarning): + generate_data() + + def test_generate_default(self): + ''' + Check that the generated data by default, fullfills the + correct dimensions of the data + ''' + result = generate_data() + assert len(result.index[0]) == 3 + assert result.shape == (10_000,) + + def test_generate_samples_reproductibility(self): + ''' + This test explores the reproducibility of the results + when a random seed has been set + ''' + import numpy as np + seed = np.random.randint(0,100) + assert generate_samples(seed=seed).equals(generate_samples(seed=seed)) + + @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) + @pytest.mark.parametrize("n_categories", [1,3]) + @pytest.mark.parametrize("len_samples", [1,3]) + def test_generate_samples_shapes(self, n_samples,n_categories, len_samples): + ''' + Check the generations of different sample sizes with different + arguments + NOTICE: the generate_samples funcition has one extra column due to index, + unless it is unused and it is removed + ''' + result = generate_samples(n_samples=n_samples, + n_categories=n_categories, + len_samples=len_samples) + + if type(result.index[0]) is tuple: + assert len(result.index[0]) == n_categories + else: + assert result.index.is_boolean() + + assert result.shape == (n_samples, len_samples+1) + + @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) + @pytest.mark.parametrize("len_samples", [1,3]) + def test_generate_counts (self, n_samples, len_samples): + ''' + Test of the function generate_counts which internally uses generate_samples + ''' + result = generate_counts(n_samples=n_samples, len_samples=len_samples) + assert len(result.columns) == len_samples + assert (result.sum(axis=0) == n_samples).all() + -def test_generate_data_warning(): - with pytest.warns(DeprecationWarning): - generate_data() From 3eb9c65b50df1595c52706b6f610242f809858c9 Mon Sep 17 00:00:00 2001 From: ennanco Date: Wed, 2 Mar 2022 14:07:55 +0100 Subject: [PATCH 03/21] Repaired problems with some tests --- upsetplot/tests/test_upsetplot.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/upsetplot/tests/test_upsetplot.py b/upsetplot/tests/test_upsetplot.py index a1631f3..86cbca6 100644 --- a/upsetplot/tests/test_upsetplot.py +++ b/upsetplot/tests/test_upsetplot.py @@ -38,7 +38,8 @@ def get_all_texts(mpl_artist): @pytest.mark.parametrize('sort_by', ['cardinality', 'degree', None]) @pytest.mark.parametrize('sort_categories_by', [None, 'cardinality']) def test_process_data_series(x, sort_by, sort_categories_by): - assert x.name == 'value' + assert 'value' in x.columns + x = x.value for subset_size in ['auto', 'sum', 'count']: for sum_over in ['abc', False]: with pytest.raises(ValueError, match='sum_over is not applicable'): @@ -93,7 +94,7 @@ def test_process_data_series(x, sort_by, sort_categories_by): @pytest.mark.parametrize('x', [ generate_samples()['value'], - generate_counts(), + generate_counts()['value'], ]) def test_subset_size_series(x): kw = {'sort_by': 'cardinality', @@ -195,7 +196,7 @@ def test_process_data_frame(x, sort_by, sort_categories_by): @pytest.mark.parametrize('x', [ generate_samples()['value'], - generate_counts(), + generate_counts()['value'], ]) def test_subset_size_frame(x): kw = {'sort_by': 'cardinality', @@ -248,7 +249,7 @@ def test_not_unique(sort_by, sort_categories_by): 'sort_categories_by': sort_categories_by, 'subset_size': 'sum', 'sum_over': None} - Xagg = generate_counts() + Xagg = generate_counts().value total1, df1, intersections1, totals1 = _process_data(Xagg, **kw) Xunagg = generate_samples()['value'] Xunagg.loc[:] = 1 @@ -375,7 +376,7 @@ def _count_descendants(el): @pytest.mark.parametrize('orientation', ['horizontal', 'vertical']) def test_show_counts(orientation): fig = matplotlib.figure.Figure() - X = generate_counts(n_samples=10000) + X = generate_counts(n_samples=10000).value plot(X, fig, orientation=orientation) n_artists_no_sizes = _count_descendants(fig) @@ -416,7 +417,7 @@ def test_show_counts(orientation): def test_add_catplot(): pytest.importorskip('seaborn') - X = generate_counts(n_samples=100) + X = generate_counts(n_samples=100).value upset = UpSet(X) # smoke test upset.add_catplot('violin') @@ -430,7 +431,7 @@ def test_add_catplot(): # check the above add_catplot did not break the state upset.plot(fig) - X = generate_counts(n_samples=100) + X = generate_counts(n_samples=100).value X.name = 'foo' X = X.to_frame() upset = UpSet(X, subset_size='count') From e40d6b9d01b45460b9dbaf042ced44f9f769e0a7 Mon Sep 17 00:00:00 2001 From: ennanco Date: Wed, 2 Mar 2022 14:26:20 +0100 Subject: [PATCH 04/21] Repaired several examples due to the inclussion of the new generate_samples --- README.rst | 2 +- examples/plot_hide.py | 2 +- examples/plot_vertical.py | 5 ++--- upsetplot/tests/test_upsetplot.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 1f312d5..b80405a 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,7 @@ categories, by having multiple boolean indices, like ``example`` in the following:: >>> from upsetplot import generate_counts - >>> example = generate_counts() + >>> example = generate_counts().value >>> example cat0 cat1 cat2 False False False 56 diff --git a/examples/plot_hide.py b/examples/plot_hide.py index a42b7f5..fe1de41 100644 --- a/examples/plot_hide.py +++ b/examples/plot_hide.py @@ -10,7 +10,7 @@ from matplotlib import pyplot as plt from upsetplot import generate_counts, plot -example = generate_counts() +example = generate_counts().value plot(example, show_counts=True) plt.suptitle('Nothing hidden') diff --git a/examples/plot_vertical.py b/examples/plot_vertical.py index 2b9816a..24c23e5 100644 --- a/examples/plot_vertical.py +++ b/examples/plot_vertical.py @@ -7,7 +7,7 @@ """ from matplotlib import pyplot as plt -from upsetplot import generate_counts, plot +from upsetplot import generate_counts, plot, plotting example = generate_counts() plot(example, orientation='vertical') @@ -28,9 +28,8 @@ ######################################################################### -from upsetplot import plotting - # An UpSetplot with additional plots on vertical and tuning some visual parameters +example = generate_counts(len_samples=3) fig = plotting.UpSet(example, orientation='vertical', show_counts=True, facecolor="grey", element_size=75) fig.add_catplot('swarm', 'value', palette='colorblind') fig.add_catplot('swarm', 'value1', palette='colorblind') diff --git a/upsetplot/tests/test_upsetplot.py b/upsetplot/tests/test_upsetplot.py index 86cbca6..a40da36 100644 --- a/upsetplot/tests/test_upsetplot.py +++ b/upsetplot/tests/test_upsetplot.py @@ -799,7 +799,7 @@ def _make_facecolor_list(colors): []), ]) def test_style_subsets(kwarg_list, expected_subset_styles, expected_legend): - data = generate_counts() + data = generate_counts().value upset = UpSet(data, facecolor="blue") for kw in kwarg_list: upset.style_subsets(**kw) From 7f5f918b5696f6f26de5af1eb8ee572f68c46a1b Mon Sep 17 00:00:00 2001 From: ennanco Date: Wed, 2 Mar 2022 18:16:37 +0100 Subject: [PATCH 05/21] Change the string format to make it compatible with Python v2 --- upsetplot/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index 72dd86f..a980d92 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -37,7 +37,7 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): """ rng = np.random.RandomState(seed) df = pd.DataFrame(np.zeros((n_samples, len_samples))) - valuename_lst = [f'value{i}' if i >0 else 'value' for i in range(len_samples)] + valuename_lst = ['value%d'%i if i >0 else 'value' for i in range(len_samples)] df.columns = valuename_lst for i in range(n_categories): @@ -46,7 +46,7 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): df[valuename_lst] += r df.reset_index(inplace=True) - df.set_index([f'cat{i}' for i in range(n_categories)], inplace=True) + df.set_index(['cat%d'%i for i in range(n_categories)], inplace=True) return df From e0d9df0c28765e41930d4388cc2950bc08d5ac78 Mon Sep 17 00:00:00 2001 From: ennanco Date: Wed, 2 Mar 2022 18:24:43 +0100 Subject: [PATCH 06/21] Adding compatibility in generate_samples for python v2 --- upsetplot/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index a980d92..1150157 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -42,7 +42,7 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): for i in range(n_categories): r = rng.rand(n_samples, len_samples) - df[f'cat{i}'] = r[:,0] > rng.rand() + df['cat%d'%i] = r[:,0] > rng.rand() df[valuename_lst] += r df.reset_index(inplace=True) From 4e18668fafed103dfebd3543cc0f6bd604e4d466 Mon Sep 17 00:00:00 2001 From: ennanco Date: Sat, 19 Mar 2022 20:08:33 +0100 Subject: [PATCH 07/21] Adding adaptations to made it retrocompatible with the examples --- README.rst | 2 +- examples/plot_hide.py | 2 +- examples/plot_vertical.py | 2 +- upsetplot/data.py | 11 ++++++----- upsetplot/tests/test_data.py | 9 +++++---- upsetplot/tests/test_upsetplot.py | 16 +++++++--------- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/README.rst b/README.rst index b80405a..1f312d5 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,7 @@ categories, by having multiple boolean indices, like ``example`` in the following:: >>> from upsetplot import generate_counts - >>> example = generate_counts().value + >>> example = generate_counts() >>> example cat0 cat1 cat2 False False False 56 diff --git a/examples/plot_hide.py b/examples/plot_hide.py index fe1de41..a42b7f5 100644 --- a/examples/plot_hide.py +++ b/examples/plot_hide.py @@ -10,7 +10,7 @@ from matplotlib import pyplot as plt from upsetplot import generate_counts, plot -example = generate_counts().value +example = generate_counts() plot(example, show_counts=True) plt.suptitle('Nothing hidden') diff --git a/examples/plot_vertical.py b/examples/plot_vertical.py index 24c23e5..2466346 100644 --- a/examples/plot_vertical.py +++ b/examples/plot_vertical.py @@ -29,7 +29,7 @@ ######################################################################### # An UpSetplot with additional plots on vertical and tuning some visual parameters -example = generate_counts(len_samples=3) +example = generate_counts(extra_columns=2) fig = plotting.UpSet(example, orientation='vertical', show_counts=True, facecolor="grey", element_size=75) fig.add_catplot('swarm', 'value', palette='colorblind') fig.add_catplot('swarm', 'value1', palette='colorblind') diff --git a/upsetplot/data.py b/upsetplot/data.py index 1150157..95023bf 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -37,20 +37,20 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): """ rng = np.random.RandomState(seed) df = pd.DataFrame(np.zeros((n_samples, len_samples))) - valuename_lst = ['value%d'%i if i >0 else 'value' for i in range(len_samples)] + valuename_lst = [f'value{i}' if i >0 else 'value' for i in range(len_samples)] df.columns = valuename_lst for i in range(n_categories): r = rng.rand(n_samples, len_samples) - df['cat%d'%i] = r[:,0] > rng.rand() + df[f'cat{i}'] = r[:,0] > rng.rand() df[valuename_lst] += r df.reset_index(inplace=True) - df.set_index(['cat%d'%i for i in range(n_categories)], inplace=True) + df.set_index([f'cat{i}' for i in range(n_categories)], inplace=True) return df -def generate_counts(seed=0, n_samples=10000, n_categories=3, len_samples=1): +def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): """Generate artificial counts corresponding to set intersections Parameters @@ -75,8 +75,9 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, len_samples=1): derived from. """ df = generate_samples(seed=seed, n_samples=n_samples, - n_categories=n_categories, len_samples=len_samples) + n_categories=n_categories, len_samples=1+extra_columns) df.drop('index', axis=1, inplace=True) + df = df if extra_columns > 0 else df.value return df.groupby(level=list(range(n_categories))).count() diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index 35b0856..70cbded 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -255,13 +255,14 @@ def test_generate_samples_shapes(self, n_samples,n_categories, len_samples): assert result.shape == (n_samples, len_samples+1) @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) - @pytest.mark.parametrize("len_samples", [1,3]) - def test_generate_counts (self, n_samples, len_samples): + @pytest.mark.parametrize("extra_columns", [0,2]) + def test_generate_counts (self, n_samples, extra_columns): ''' Test of the function generate_counts which internally uses generate_samples ''' - result = generate_counts(n_samples=n_samples, len_samples=len_samples) - assert len(result.columns) == len_samples + result = generate_counts(n_samples=n_samples, extra_columns=extra_columns) + if extra_columns: + assert len(result.columns) == extra_columns + 1 assert (result.sum(axis=0) == n_samples).all() diff --git a/upsetplot/tests/test_upsetplot.py b/upsetplot/tests/test_upsetplot.py index a40da36..b58e7db 100644 --- a/upsetplot/tests/test_upsetplot.py +++ b/upsetplot/tests/test_upsetplot.py @@ -38,8 +38,6 @@ def get_all_texts(mpl_artist): @pytest.mark.parametrize('sort_by', ['cardinality', 'degree', None]) @pytest.mark.parametrize('sort_categories_by', [None, 'cardinality']) def test_process_data_series(x, sort_by, sort_categories_by): - assert 'value' in x.columns - x = x.value for subset_size in ['auto', 'sum', 'count']: for sum_over in ['abc', False]: with pytest.raises(ValueError, match='sum_over is not applicable'): @@ -94,7 +92,7 @@ def test_process_data_series(x, sort_by, sort_categories_by): @pytest.mark.parametrize('x', [ generate_samples()['value'], - generate_counts()['value'], + generate_counts(), ]) def test_subset_size_series(x): kw = {'sort_by': 'cardinality', @@ -196,7 +194,7 @@ def test_process_data_frame(x, sort_by, sort_categories_by): @pytest.mark.parametrize('x', [ generate_samples()['value'], - generate_counts()['value'], + generate_counts(), ]) def test_subset_size_frame(x): kw = {'sort_by': 'cardinality', @@ -249,7 +247,7 @@ def test_not_unique(sort_by, sort_categories_by): 'sort_categories_by': sort_categories_by, 'subset_size': 'sum', 'sum_over': None} - Xagg = generate_counts().value + Xagg = generate_counts() total1, df1, intersections1, totals1 = _process_data(Xagg, **kw) Xunagg = generate_samples()['value'] Xunagg.loc[:] = 1 @@ -376,7 +374,7 @@ def _count_descendants(el): @pytest.mark.parametrize('orientation', ['horizontal', 'vertical']) def test_show_counts(orientation): fig = matplotlib.figure.Figure() - X = generate_counts(n_samples=10000).value + X = generate_counts(n_samples=10000) plot(X, fig, orientation=orientation) n_artists_no_sizes = _count_descendants(fig) @@ -417,7 +415,7 @@ def test_show_counts(orientation): def test_add_catplot(): pytest.importorskip('seaborn') - X = generate_counts(n_samples=100).value + X = generate_counts(n_samples=100) upset = UpSet(X) # smoke test upset.add_catplot('violin') @@ -431,7 +429,7 @@ def test_add_catplot(): # check the above add_catplot did not break the state upset.plot(fig) - X = generate_counts(n_samples=100).value + X = generate_counts(n_samples=100) X.name = 'foo' X = X.to_frame() upset = UpSet(X, subset_size='count') @@ -799,7 +797,7 @@ def _make_facecolor_list(colors): []), ]) def test_style_subsets(kwarg_list, expected_subset_styles, expected_legend): - data = generate_counts().value + data = generate_counts() upset = UpSet(data, facecolor="blue") for kw in kwarg_list: upset.style_subsets(**kw) From b0c9c7bd357d5350ca9764bdc602fdb01555f36e Mon Sep 17 00:00:00 2001 From: ennanco Date: Sat, 19 Mar 2022 21:30:53 +0100 Subject: [PATCH 08/21] Fixing style --- examples/plot_vertical.py | 11 +++++++---- upsetplot/data.py | 8 +++++--- upsetplot/tests/test_data.py | 18 +++++++++++------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/examples/plot_vertical.py b/examples/plot_vertical.py index 2466346..d192a5a 100644 --- a/examples/plot_vertical.py +++ b/examples/plot_vertical.py @@ -27,13 +27,16 @@ plt.show() ######################################################################### - -# An UpSetplot with additional plots on vertical and tuning some visual parameters +""" + An UpSetplot with additional plots on vertical + and tuning some visual parameters +""" example = generate_counts(extra_columns=2) -fig = plotting.UpSet(example, orientation='vertical', show_counts=True, facecolor="grey", element_size=75) +fig = plotting.UpSet(example, orientation='vertical', + show_counts=True, facecolor="grey", + element_size=75) fig.add_catplot('swarm', 'value', palette='colorblind') fig.add_catplot('swarm', 'value1', palette='colorblind') fig.add_catplot('swarm', 'value2', palette='colorblind') fig.plot() plt.show() - diff --git a/upsetplot/data.py b/upsetplot/data.py index 95023bf..65d0f9f 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -37,12 +37,13 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): """ rng = np.random.RandomState(seed) df = pd.DataFrame(np.zeros((n_samples, len_samples))) - valuename_lst = [f'value{i}' if i >0 else 'value' for i in range(len_samples)] + valuename_lst = [f'value{i}' if i > 0 else 'value' for i in + range(len_samples)] df.columns = valuename_lst for i in range(n_categories): r = rng.rand(n_samples, len_samples) - df[f'cat{i}'] = r[:,0] > rng.rand() + df[f'cat{i}'] = r[:, 0] > rng.rand() df[valuename_lst] += r df.reset_index(inplace=True) @@ -75,7 +76,8 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): derived from. """ df = generate_samples(seed=seed, n_samples=n_samples, - n_categories=n_categories, len_samples=1+extra_columns) + n_categories=n_categories, + len_samples=1+extra_columns) df.drop('index', axis=1, inplace=True) df = df if extra_columns > 0 else df.value return df.groupby(level=list(range(n_categories))).count() diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index 70cbded..6ab50d1 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -207,6 +207,7 @@ def test_from_indicators_equivalence(indicators, data): assert_frame_equal(from_indicators(indicators, data), from_memberships([[], ["cat1"], []], data)) + class TestGenerateData: def test_generate_data_warning(self): ''' @@ -230,18 +231,19 @@ def test_generate_samples_reproductibility(self): when a random seed has been set ''' import numpy as np - seed = np.random.randint(0,100) + seed = np.random.randint(0, 100) assert generate_samples(seed=seed).equals(generate_samples(seed=seed)) @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) @pytest.mark.parametrize("n_categories", [1,3]) @pytest.mark.parametrize("len_samples", [1,3]) - def test_generate_samples_shapes(self, n_samples,n_categories, len_samples): + def test_generate_samples_shapes(self, n_samples,n_categories, + len_samples): ''' Check the generations of different sample sizes with different arguments - NOTICE: the generate_samples funcition has one extra column due to index, - unless it is unused and it is removed + NOTICE: the generate_samples funcition has one extra + column due to index, unless it is unused and it is removed ''' result = generate_samples(n_samples=n_samples, n_categories=n_categories, @@ -252,15 +254,17 @@ def test_generate_samples_shapes(self, n_samples,n_categories, len_samples): else: assert result.index.is_boolean() - assert result.shape == (n_samples, len_samples+1) + assert result.shape == (n_samples, len_samples + 1) @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) @pytest.mark.parametrize("extra_columns", [0,2]) def test_generate_counts (self, n_samples, extra_columns): ''' - Test of the function generate_counts which internally uses generate_samples + Test of the function generate_counts + which internally uses generate_samples ''' - result = generate_counts(n_samples=n_samples, extra_columns=extra_columns) + result = generate_counts(n_samples=n_samples, + extra_columns=extra_columns) if extra_columns: assert len(result.columns) == extra_columns + 1 assert (result.sum(axis=0) == n_samples).all() From 3374d0d1b28dda118b639f4b278ef726d3773ba0 Mon Sep 17 00:00:00 2001 From: ennanco Date: Mon, 21 Mar 2022 09:41:28 +0100 Subject: [PATCH 09/21] Fixing test_data.py according to python style sheet --- upsetplot/tests/test_data.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index 6ab50d1..203e60d 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -226,18 +226,18 @@ def test_generate_default(self): assert result.shape == (10_000,) def test_generate_samples_reproductibility(self): - ''' - This test explores the reproducibility of the results - when a random seed has been set - ''' + ''' + This test explores the reproducibility of the results + when a random seed has been set + ''' import numpy as np seed = np.random.randint(0, 100) assert generate_samples(seed=seed).equals(generate_samples(seed=seed)) @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) - @pytest.mark.parametrize("n_categories", [1,3]) - @pytest.mark.parametrize("len_samples", [1,3]) - def test_generate_samples_shapes(self, n_samples,n_categories, + @pytest.mark.parametrize("n_categories", [1, 3]) + @pytest.mark.parametrize("len_samples", [1, 3]) + def test_generate_samples_shapes(self, n_samples, n_categories, len_samples): ''' Check the generations of different sample sizes with different @@ -257,8 +257,8 @@ def test_generate_samples_shapes(self, n_samples,n_categories, assert result.shape == (n_samples, len_samples + 1) @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) - @pytest.mark.parametrize("extra_columns", [0,2]) - def test_generate_counts (self, n_samples, extra_columns): + @pytest.mark.parametrize("extra_columns", [0, 2]) + def test_generate_counts(self, n_samples, extra_columns): ''' Test of the function generate_counts which internally uses generate_samples @@ -268,5 +268,3 @@ def test_generate_counts (self, n_samples, extra_columns): if extra_columns: assert len(result.columns) == extra_columns + 1 assert (result.sum(axis=0) == n_samples).all() - - From 9c546e07eb04350effdf289a8897beb5bb927140 Mon Sep 17 00:00:00 2001 From: ennanco Date: Mon, 21 Mar 2022 14:23:02 +0100 Subject: [PATCH 10/21] Fixing indentation --- upsetplot/tests/test_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index 203e60d..fdb9b0e 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -230,9 +230,9 @@ def test_generate_samples_reproductibility(self): This test explores the reproducibility of the results when a random seed has been set ''' - import numpy as np - seed = np.random.randint(0, 100) - assert generate_samples(seed=seed).equals(generate_samples(seed=seed)) + import numpy as np + seed = np.random.randint(0, 100) + assert generate_samples(seed=seed).equals(generate_samples(seed=seed)) @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) @pytest.mark.parametrize("n_categories", [1, 3]) From 7805d3fee6e0726ba3fe4a8d408c5428128a32ea Mon Sep 17 00:00:00 2001 From: ennanco Date: Mon, 21 Mar 2022 14:32:54 +0100 Subject: [PATCH 11/21] Fixing indentation --- upsetplot/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index 65d0f9f..fd6b8ea 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -77,7 +77,7 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): """ df = generate_samples(seed=seed, n_samples=n_samples, n_categories=n_categories, - len_samples=1+extra_columns) + len_samples=1 + extra_columns) df.drop('index', axis=1, inplace=True) df = df if extra_columns > 0 else df.value return df.groupby(level=list(range(n_categories))).count() From 4cab536efd2e4c6af8e05ed70a54a3dde8b22a38 Mon Sep 17 00:00:00 2001 From: ennanco Date: Tue, 22 Mar 2022 09:34:28 +0100 Subject: [PATCH 12/21] Fixing doctring in generete_counts and changing generate_samples for consistency to use extra_columns in order to keep retrocompatibility --- upsetplot/data.py | 25 ++++++++++++++++++++----- upsetplot/tests/test_data.py | 8 ++++---- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index fd6b8ea..c228ef1 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -8,7 +8,7 @@ import numpy as np -def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): +def generate_samples(seed=0, n_samples=10000, n_categories=3, extra_columns=0): """Generate artificial samples assigned to set intersections Parameters @@ -19,6 +19,9 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): Number of samples to generate n_categories : int Number of categories (named "cat0", "cat1", ...) to generate + extra_columns : int + If a vector is required,this would indicated the number of additional + columns (named "value", "value1", "value2", ... ) Returns ------- @@ -35,7 +38,9 @@ def generate_samples(seed=0, n_samples=10000, n_categories=3, len_samples=1): generate_counts : Generates the counts for each subset of categories corresponding to these samples. """ + assert extra_columns >= 0, 'extra_columns parameter should be possitive' rng = np.random.RandomState(seed) + len_samples = 1 + extra_columns df = pd.DataFrame(np.zeros((n_samples, len_samples))) valuename_lst = [f'value{i}' if i > 0 else 'value' for i in range(len_samples)] @@ -62,22 +67,32 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): Number of samples to generate statistics over n_categories : int Number of categories (named "cat0", "cat1", ...) to generate - len_samples: int - Number of features for each sample (value, value1, value2, ...) + extra_coulmns: int + Number of addiotional features to be use to generate each sample (value, + value1, value2, ...) Returns ------- Series - Counts indexed by boolean indicator mask for each category. + (Default) When extra_columns is 0, counts indexed by boolean + indicator mask for each category. + DataFrame + When extra_columns is greater than 0, counts indexed boolean indicator mask + for each category and return the sum for each value. It includes the folling + fields: + Index includes a boolean indicator mask for each category. + Field(s) 'value{i}' counts the number of elements in that category according + to the mask See Also -------- generate_samples : Generates a DataFrame of samples that these counts are derived from. """ + assert extra_columns >= 0, 'extra_columns parameter should be possitive' df = generate_samples(seed=seed, n_samples=n_samples, n_categories=n_categories, - len_samples=1 + extra_columns) + extra_columns=extra_columns) df.drop('index', axis=1, inplace=True) df = df if extra_columns > 0 else df.value return df.groupby(level=list(range(n_categories))).count() diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index fdb9b0e..ca22adb 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -236,9 +236,9 @@ def test_generate_samples_reproductibility(self): @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) @pytest.mark.parametrize("n_categories", [1, 3]) - @pytest.mark.parametrize("len_samples", [1, 3]) + @pytest.mark.parametrize("extra_columns", [0, 2]) def test_generate_samples_shapes(self, n_samples, n_categories, - len_samples): + extra_columns): ''' Check the generations of different sample sizes with different arguments @@ -247,14 +247,14 @@ def test_generate_samples_shapes(self, n_samples, n_categories, ''' result = generate_samples(n_samples=n_samples, n_categories=n_categories, - len_samples=len_samples) + extra_columns=extra_columns) if type(result.index[0]) is tuple: assert len(result.index[0]) == n_categories else: assert result.index.is_boolean() - assert result.shape == (n_samples, len_samples + 1) + assert result.shape == (n_samples, extra_columns + 2) @pytest.mark.parametrize("n_samples", [100, 1_000, 10_000]) @pytest.mark.parametrize("extra_columns", [0, 2]) From b1d0ec6f5ecd954f833c73c1c3c3e04876cdb2a8 Mon Sep 17 00:00:00 2001 From: ennanco Date: Tue, 22 Mar 2022 09:43:49 +0100 Subject: [PATCH 13/21] Fixing spacing style in some comments --- upsetplot/data.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index c228ef1..fb6d1e6 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -68,8 +68,8 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): n_categories : int Number of categories (named "cat0", "cat1", ...) to generate extra_coulmns: int - Number of addiotional features to be use to generate each sample (value, - value1, value2, ...) + Number of addiotional features to be use to generate each + sample (value, value1, value2, ...) Returns ------- @@ -77,12 +77,12 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): (Default) When extra_columns is 0, counts indexed by boolean indicator mask for each category. DataFrame - When extra_columns is greater than 0, counts indexed boolean indicator mask - for each category and return the sum for each value. It includes the folling - fields: + When extra_columns is greater than 0, counts indexed boolean + indicator mask for each category and return the sum for each + value. It includes the folling fields: Index includes a boolean indicator mask for each category. - Field(s) 'value{i}' counts the number of elements in that category according - to the mask + Field(s) 'value{i}' counts the number of elements in that + category accordingto the mask See Also -------- From f64fb19ab1782820cba9429a02d5d0738bfcd006 Mon Sep 17 00:00:00 2001 From: ennanco Date: Tue, 22 Mar 2022 13:31:33 +0100 Subject: [PATCH 14/21] Adding unitary test for generate_data --- upsetplot/data.py | 2 +- upsetplot/tests/test_data.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index fb6d1e6..d2900fe 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -104,7 +104,7 @@ def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False): DeprecationWarning) if aggregated: return generate_counts(seed=seed, n_samples=n_samples, - n_categories=n_sets)['value'] + n_categories=n_sets) else: return generate_samples(seed=seed, n_samples=n_samples, n_categories=n_sets)['value'] diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index ca22adb..9a9c583 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -3,11 +3,11 @@ import pandas as pd import numpy as np from distutils.version import LooseVersion -from pandas.util.testing import (assert_series_equal, assert_frame_equal, +from pandas.testing import (assert_series_equal, assert_frame_equal, assert_index_equal) from upsetplot import (from_memberships, from_contents, from_indicators, generate_data) -from upsetplot.data import generate_samples, generate_counts +from upsetplot.data import generate_samples, generate_counts, generate_data @pytest.mark.parametrize('typ', [set, list, tuple, iter]) @@ -268,3 +268,15 @@ def test_generate_counts(self, n_samples, extra_columns): if extra_columns: assert len(result.columns) == extra_columns + 1 assert (result.sum(axis=0) == n_samples).all() + + @pytest.mark.parametrize("aggregated", [True, False]) + def test_generate_data(self, aggregated): + ''' + Test the return of the deprecated method + generate_data + ''' + data = generate_data(aggregated=aggregated) + if aggregated: + assert data.equals(generate_counts()) + else: + assert data.equals(generate_samples().value) From 5443acb9272d7a3699a7618249cb6f6c95694002 Mon Sep 17 00:00:00 2001 From: ennanco Date: Tue, 22 Mar 2022 13:37:16 +0100 Subject: [PATCH 15/21] Adding unitary test for generate_data --- upsetplot/tests/test_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index 9a9c583..7f475d4 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -4,10 +4,11 @@ import numpy as np from distutils.version import LooseVersion from pandas.testing import (assert_series_equal, assert_frame_equal, - assert_index_equal) + assert_index_equal) from upsetplot import (from_memberships, from_contents, from_indicators, generate_data) -from upsetplot.data import generate_samples, generate_counts, generate_data +from upsetplot.data import (generate_samples, generate_counts, + generate_data) @pytest.mark.parametrize('typ', [set, list, tuple, iter]) From 5731bc644064612777005c5fcd8a7e3d0b8c82b3 Mon Sep 17 00:00:00 2001 From: ennanco Date: Tue, 22 Mar 2022 18:07:35 +0100 Subject: [PATCH 16/21] Adding unitary test for generate_data --- upsetplot/tests/test_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py index 7f475d4..4bd3634 100644 --- a/upsetplot/tests/test_data.py +++ b/upsetplot/tests/test_data.py @@ -7,8 +7,7 @@ assert_index_equal) from upsetplot import (from_memberships, from_contents, from_indicators, generate_data) -from upsetplot.data import (generate_samples, generate_counts, - generate_data) +from upsetplot.data import (generate_samples, generate_counts) @pytest.mark.parametrize('typ', [set, list, tuple, iter]) From 062e3374400af0e5dd8677dd65c5fb6a791762fa Mon Sep 17 00:00:00 2001 From: Enrique Fernandez-Blanco Date: Mon, 2 Jan 2023 10:01:16 +0100 Subject: [PATCH 17/21] Update upsetplot/data.py Co-authored-by: Joel Nothman --- upsetplot/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index d2900fe..042f399 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -67,7 +67,7 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): Number of samples to generate statistics over n_categories : int Number of categories (named "cat0", "cat1", ...) to generate - extra_coulmns: int + extra_columns: int Number of addiotional features to be use to generate each sample (value, value1, value2, ...) From 3d884c4e0fb1d909008bad1001ad34e8a586876e Mon Sep 17 00:00:00 2001 From: Enrique Fernandez-Blanco Date: Mon, 2 Jan 2023 10:01:35 +0100 Subject: [PATCH 18/21] Update upsetplot/data.py Co-authored-by: Joel Nothman --- upsetplot/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index 042f399..4dc5918 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -73,7 +73,7 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): Returns ------- - Series + Series or DataFrame (Default) When extra_columns is 0, counts indexed by boolean indicator mask for each category. DataFrame From b000f157072e336a2519382c5ca0aa7fba4a4cc0 Mon Sep 17 00:00:00 2001 From: Enrique Fernandez-Blanco Date: Mon, 2 Jan 2023 10:02:04 +0100 Subject: [PATCH 19/21] Update upsetplot/data.py Co-authored-by: Joel Nothman --- upsetplot/data.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index 4dc5918..2200bc9 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -74,15 +74,10 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): Returns ------- Series or DataFrame - (Default) When extra_columns is 0, counts indexed by boolean - indicator mask for each category. - DataFrame - When extra_columns is greater than 0, counts indexed boolean - indicator mask for each category and return the sum for each - value. It includes the folling fields: - Index includes a boolean indicator mask for each category. - Field(s) 'value{i}' counts the number of elements in that - category accordingto the mask + A Series of counts indexed by boolean indicator mask for each category, + when ``extra_columns`` is 0. Otherwise a DataFrame with column ``value`` + equivalent to the value produced when ``extra_columns`` is 0, as well as + further random variables ``value1``, ``value2``, for extra columns. See Also -------- From 684be8c01a364f15239f37182ba516b57a2e5828 Mon Sep 17 00:00:00 2001 From: Enrique Fernandez-Blanco Date: Mon, 2 Jan 2023 10:02:14 +0100 Subject: [PATCH 20/21] Update upsetplot/data.py Co-authored-by: Joel Nothman --- upsetplot/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upsetplot/data.py b/upsetplot/data.py index 2200bc9..4e9fe62 100644 --- a/upsetplot/data.py +++ b/upsetplot/data.py @@ -68,7 +68,7 @@ def generate_counts(seed=0, n_samples=10000, n_categories=3, extra_columns=0): n_categories : int Number of categories (named "cat0", "cat1", ...) to generate extra_columns: int - Number of addiotional features to be use to generate each + Number of additional features to be use to generate each sample (value, value1, value2, ...) Returns From ce55bd046c05e1e01b8be2140fcd4d4e045a4b50 Mon Sep 17 00:00:00 2001 From: Enrique Fernandez-Blanco Date: Mon, 2 Jan 2023 10:02:35 +0100 Subject: [PATCH 21/21] Update examples/plot_vertical.py Co-authored-by: Joel Nothman --- examples/plot_vertical.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/plot_vertical.py b/examples/plot_vertical.py index d192a5a..c7ffca6 100644 --- a/examples/plot_vertical.py +++ b/examples/plot_vertical.py @@ -27,10 +27,8 @@ plt.show() ######################################################################### -""" - An UpSetplot with additional plots on vertical - and tuning some visual parameters -""" +# An UpSetplot with additional plots on vertical +# and tuning some visual parameters example = generate_counts(extra_columns=2) fig = plotting.UpSet(example, orientation='vertical', show_counts=True, facecolor="grey",