diff --git a/.circleci/config.yml b/.circleci/config.yml
index e57663f..64cc55b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -16,6 +16,7 @@ jobs:
             source activate testenv
             conda install --yes pip numpy scipy scikit-learn pandas numba matplotlib sphinx sphinx_rtd_theme numpydoc pillow
             pip install sphinx-gallery
+            pip install hypothesis
             pip install .
             cd doc
             make html
diff --git a/.travis.yml b/.travis.yml
index fafdf9d..75a00f1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,6 +37,7 @@ install:
     fi
   - conda install --yes pandas numba
   - pip install codecov
+  - pip install hypothesis
   - pip install coverage
   - pip install coveralls
   - pip install .
diff --git a/appveyor.yml b/appveyor.yml
index 9f67dde..0f906ef 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -25,6 +25,7 @@ install:
   - "conda install --yes pip numpy==%NUMPY_VERSION% scipy==%SCIPY_VERSION% scikit-learn==%SKLEARN_VERSION% nose pytest pytest-cov"
   - conda install --yes numba pandas
   - pip install codecov
+  - pip install hypothesis
   - pip install .
 
 test_script:
diff --git a/vectorizers/tests/test_common.py b/vectorizers/tests/test_common.py
index ae6252a..9f52324 100644
--- a/vectorizers/tests/test_common.py
+++ b/vectorizers/tests/test_common.py
@@ -1,4 +1,8 @@
 import pytest
+from hypothesis import given, example, note, settings
+import hypothesis.strategies as st
+from hypothesis.strategies import composite
+
 from sklearn.utils.estimator_checks import check_estimator
 
 
@@ -159,7 +163,9 @@ def test_LabeledTreeCooccurrenceVectorizer_reduced_vocab():
 @pytest.mark.parametrize("max_token_occurrences", [None, 2])
 @pytest.mark.parametrize("min_document_occurrences", [None, 1])
 @pytest.mark.parametrize("max_document_frequency", [None, 0.7])
-@pytest.mark.parametrize("window_orientation", ["before", "after", "symmetric", "directional"])
+@pytest.mark.parametrize(
+    "window_orientation", ["before", "after", "symmetric", "directional"]
+)
 @pytest.mark.parametrize("window_radius", [1, 2])
 @pytest.mark.parametrize("kernel_function", ["harmonic", "flat"])
 def test_equality_of_CooccurrenceVectorizers(
@@ -275,28 +281,39 @@ def test_triangle_kernel():
     assert kernel[1] == 3.0
 
 
-def test_flat_kernel():
-    kernel = flat_kernel([0] * np.random.randint(2, 10), 0.0)
+@given(st.integers(min_value=2, max_value=30))
+@settings(deadline=None)
+def test_flat_kernel(length):
+    kernel = flat_kernel([0] * length, 0.0)
     assert np.all(kernel == 1.0)
 
 
-def test_ngrams_of():
-    for ngram_size in (1, 2, 4):
-        tokens = np.random.randint(10, size=np.random.poisson(5 + ngram_size))
-        ngrams = ngrams_of(tokens, ngram_size)
-        if len(tokens) >= ngram_size:
-            assert len(ngrams) == len(tokens) - (ngram_size - 1)
-        else:
-            assert len(ngrams) == 0
-        assert np.all(
-            [ngrams[i][0] == tokens[i] for i in range(len(tokens) - (ngram_size - 1))]
-        )
-        assert np.all(
-            [
-                ngrams[i][-1] == tokens[i + (ngram_size - 1)]
-                for i in range(len(tokens) - (ngram_size - 1))
-            ]
+@given(ngram_size=st.integers(min_value=1, max_value=6), data=st.data())
+@settings(deadline=None)
+def test_ngrams_of(ngram_size, data):
+    tokens = np.array(
+        data.draw(
+            st.lists(
+                st.integers(min_value=0, max_value=10),
+                min_size=3 + ngram_size,
+                max_size=10 + ngram_size,
+            )
         )
+    )
+    ngrams = ngrams_of(tokens, ngram_size)
+    if len(tokens) >= ngram_size:
+        assert len(ngrams) == len(tokens) - (ngram_size - 1)
+    else:
+        assert len(ngrams) == 0
+    assert np.all(
+        [ngrams[i][0] == tokens[i] for i in range(len(tokens) - (ngram_size - 1))]
+    )
+    assert np.all(
+        [
+            ngrams[i][-1] == tokens[i + (ngram_size - 1)]
+            for i in range(len(tokens) - (ngram_size - 1))
+        ]
+    )
 
 
 def test_find_bin_boundaries_min():
@@ -315,7 +332,7 @@ def test_find_boundaries_all_dupes():
 
 
 def test_token_cooccurrence_vectorizer_basic():
-    vectorizer = TokenCooccurrenceVectorizer(window_orientation='symmetric')
+    vectorizer = TokenCooccurrenceVectorizer(window_orientation="symmetric")
     result = vectorizer.fit_transform(token_data)
     transform = vectorizer.transform(token_data)
     assert (result != transform).nnz == 0
@@ -365,7 +382,7 @@ def test_token_cooccurrence_vectorizer_column_order():
 
 
 def test_token_cooccurrence_vectorizer_transform():
-    vectorizer = TokenCooccurrenceVectorizer(window_orientation='symmetric')
+    vectorizer = TokenCooccurrenceVectorizer(window_orientation="symmetric")
     result = vectorizer.fit_transform(text_token_data_subset)
     transform = vectorizer.transform(text_token_data)
     assert result.shape == transform.shape
diff --git a/vectorizers/tests/test_distances.py b/vectorizers/tests/test_distances.py
index 49b58cd..cb0582c 100644
--- a/vectorizers/tests/test_distances.py
+++ b/vectorizers/tests/test_distances.py
@@ -1,4 +1,7 @@
 import pytest
+import hypothesis.extra.numpy as hyp_np
+from hypothesis import given, settings
+import hypothesis.strategies as st
 import numpy as np
 import scipy.sparse
 
@@ -11,6 +14,19 @@
     sparse_jensen_shannon_divergence,
 )
 
+TestDataStrategy = hyp_np.arrays(
+    dtype=np.float,
+    elements=st.floats(min_value=0, max_value=1),
+    unique=True,
+    shape=(10, 50),
+)
+TestDataStrategy_100 = hyp_np.arrays(
+    dtype=np.float,
+    elements=st.floats(min_value=0, max_value=1),
+    unique=True,
+    shape=(10, 100),
+)
+
 
 def test_hellinger():
     assert hellinger(np.array([0.0, 0.0]), np.array([0.0, 0.0])) == 0.0
@@ -70,8 +86,9 @@ def test_sparse_hellinger():
 
 # Test using inequalities with Hellinger distance from Wikipedia
 # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance
-def test_total_variation():
-    test_data = np.random.random(size=(10, 50))
+@given(TestDataStrategy)
+@settings(deadline=None)
+def test_total_variation(test_data):
     test_data = normalize(test_data, norm="l1")
     for i in range(test_data.shape[0]):
         for j in range(i + 1, test_data.shape[0]):
@@ -83,8 +100,9 @@ def test_total_variation():
 
 # Test using inequalities with Hellinger distance from Wikipedia
 # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance
-def test_sparse_total_variation():
-    test_data = np.random.random(size=(10, 100))
+@given(TestDataStrategy_100)
+@settings(deadline=None)
+def test_sparse_total_variation(test_data):
     # sparsify
     test_data[test_data <= 0.5] = 0.0
     test_data = scipy.sparse.csr_matrix(test_data)
@@ -108,8 +126,9 @@ def test_sparse_total_variation():
             assert tvd <= np.sqrt(2) * hd
 
 
-def test_jensen_shannon():
-    test_data = np.random.random(size=(10, 50))
+@given(TestDataStrategy)
+@settings(deadline=None)
+def test_jensen_shannon(test_data):
     test_data = normalize(test_data, norm="l1")
     for i in range(test_data.shape[0]):
         for j in range(i + 1, test_data.shape[0]):
@@ -123,8 +142,9 @@ def test_jensen_shannon():
             assert np.isclose(d, jensen_shannon_divergence(p, q))
 
 
-def test_sparse_jensen_shannon():
-    test_data = np.random.random(size=(10, 100))
+@given(TestDataStrategy_100)
+@settings(deadline=None)
+def test_sparse_jensen_shannon(test_data):
     # sparsify
     test_data[test_data <= 0.5] = 0.0
     sparse_test_data = scipy.sparse.csr_matrix(test_data)