1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -16,6 +16,7 @@ jobs:
 source activate testenv
 conda install --yes pip numpy scipy scikit-learn pandas numba matplotlib sphinx sphinx_rtd_theme numpydoc pillow
 pip install sphinx-gallery
+pip install hypothesis
 pip install .
 cd doc
 make html
1 change: 1 addition & 0 deletions .travis.yml
@@ -37,6 +37,7 @@ install:
 fi
 - conda install --yes pandas numba
 - pip install codecov
+- pip install hypothesis
 - pip install coverage
 - pip install coveralls
 - pip install .
1 change: 1 addition & 0 deletions appveyor.yml
@@ -25,6 +25,7 @@ install:
 - "conda install --yes pip numpy==%NUMPY_VERSION% scipy==%SCIPY_VERSION% scikit-learn==%SKLEARN_VERSION% nose pytest pytest-cov"
 - conda install --yes numba pandas
 - pip install codecov
+- pip install hypothesis
 - pip install .

 test_script:
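Note: all three CI configurations gain the same `pip install hypothesis` line, since the reworked tests below import the hypothesis property-based testing library. For orientation, the smallest useful hypothesis test looks something like this (a generic sketch, not code from this PR):

from hypothesis import given
import hypothesis.strategies as st

@given(st.lists(st.integers()))
def test_sorting_is_idempotent(xs):
    # hypothesis generates many input lists and shrinks any failing
    # case to a minimal counterexample before reporting it
    assert sorted(sorted(xs)) == sorted(xs)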
59 changes: 38 additions & 21 deletions vectorizers/tests/test_common.py
@@ -1,4 +1,8 @@
 import pytest
+from hypothesis import given, example, note, settings
+import hypothesis.strategies as st
+from hypothesis.strategies import composite
+

 from sklearn.utils.estimator_checks import check_estimator

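Note: `composite` is imported here, though no `@composite` strategy appears in the hunks shown; presumably it is used elsewhere in the suite or reserved for follow-up work. For reference, a composite strategy bundles several dependent draws into one reusable generator, along these lines (a hypothetical `token_lists` strategy, not from this PR):

from hypothesis.strategies import composite, integers, lists

@composite
def token_lists(draw, max_token=10):
    # draw a length first, then a token list of exactly that length
    length = draw(integers(min_value=1, max_value=20))
    return draw(
        lists(integers(min_value=0, max_value=max_token), min_size=length, max_size=length)
    )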
@@ -159,7 +163,9 @@ def test_LabeledTreeCooccurrenceVectorizer_reduced_vocab():
 @pytest.mark.parametrize("max_token_occurrences", [None, 2])
 @pytest.mark.parametrize("min_document_occurrences", [None, 1])
 @pytest.mark.parametrize("max_document_frequency", [None, 0.7])
-@pytest.mark.parametrize("window_orientation", ["before", "after", "symmetric", "directional"])
+@pytest.mark.parametrize(
+    "window_orientation", ["before", "after", "symmetric", "directional"]
+)
 @pytest.mark.parametrize("window_radius", [1, 2])
 @pytest.mark.parametrize("kernel_function", ["harmonic", "flat"])
 def test_equality_of_CooccurrenceVectorizers(
@@ -275,28 +281,39 @@ def test_triangle_kernel():
     assert kernel[1] == 3.0


-def test_flat_kernel():
-    kernel = flat_kernel([0] * np.random.randint(2, 10), 0.0)
+@given(st.integers(min_value=2, max_value=30))
+@settings(deadline=None)
+def test_flat_kernel(length):
+    kernel = flat_kernel([0] * length, 0.0)
     assert np.all(kernel == 1.0)


-def test_ngrams_of():
-    for ngram_size in (1, 2, 4):
-        tokens = np.random.randint(10, size=np.random.poisson(5 + ngram_size))
-        ngrams = ngrams_of(tokens, ngram_size)
-        if len(tokens) >= ngram_size:
-            assert len(ngrams) == len(tokens) - (ngram_size - 1)
-        else:
-            assert len(ngrams) == 0
-        assert np.all(
-            [ngrams[i][0] == tokens[i] for i in range(len(tokens) - (ngram_size - 1))]
-        )
-        assert np.all(
-            [
-                ngrams[i][-1] == tokens[i + (ngram_size - 1)]
-                for i in range(len(tokens) - (ngram_size - 1))
-            ]
+@given(ngram_size=st.integers(min_value=1, max_value=6), data=st.data())
+@settings(deadline=None)
+def test_ngrams_of(ngram_size, data):
+    tokens = np.array(
+        data.draw(
+            st.lists(
+                st.integers(min_value=0, max_value=10),
+                min_size=3 + ngram_size,
+                max_size=10 + ngram_size,
+            )
+        )
+    )
+    ngrams = ngrams_of(tokens, ngram_size)
+    if len(tokens) >= ngram_size:
+        assert len(ngrams) == len(tokens) - (ngram_size - 1)
+    else:
+        assert len(ngrams) == 0
+    assert np.all(
+        [ngrams[i][0] == tokens[i] for i in range(len(tokens) - (ngram_size - 1))]
+    )
+    assert np.all(
+        [
+            ngrams[i][-1] == tokens[i + (ngram_size - 1)]
+            for i in range(len(tokens) - (ngram_size - 1))
+        ]
     )


 def test_find_bin_boundaries_min():
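Note: the rewritten `test_ngrams_of` uses the `st.data()` strategy, which lets a test draw values interactively so that one drawn parameter (`ngram_size`) can shape the bounds of a later draw (the token-list length). A minimal sketch of that pattern, independent of this PR:

from hypothesis import given
import hypothesis.strategies as st

@given(n=st.integers(min_value=1, max_value=5), data=st.data())
def test_later_draw_depends_on_n(n, data):
    # the drawn list's size bounds depend on the previously drawn n
    xs = data.draw(st.lists(st.integers(), min_size=n, max_size=2 * n))
    assert n <= len(xs) <= 2 * n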
@@ -315,7 +332,7 @@ def test_find_boundaries_all_dupes():


 def test_token_cooccurrence_vectorizer_basic():
-    vectorizer = TokenCooccurrenceVectorizer(window_orientation='symmetric')
+    vectorizer = TokenCooccurrenceVectorizer(window_orientation="symmetric")
     result = vectorizer.fit_transform(token_data)
     transform = vectorizer.transform(token_data)
     assert (result != transform).nnz == 0
@@ -365,7 +382,7 @@ def test_token_cooccurrence_vectorizer_column_order():


 def test_token_cooccurrence_vectorizer_transform():
-    vectorizer = TokenCooccurrenceVectorizer(window_orientation='symmetric')
+    vectorizer = TokenCooccurrenceVectorizer(window_orientation="symmetric")
     result = vectorizer.fit_transform(text_token_data_subset)
     transform = vectorizer.transform(text_token_data)
     assert result.shape == transform.shape
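Note: every converted test pairs `@given` with `@settings(deadline=None)`. By default hypothesis fails any single example that exceeds its per-example deadline (200 ms in recent releases); disabling it is the usual workaround for numba-jitted code like these kernels, where the first example pays the JIT compilation cost. The decorator order used here, with `@given` outermost, works; so does the reverse:

from hypothesis import given, settings
import hypothesis.strategies as st

@given(st.integers(min_value=2, max_value=30))
@settings(deadline=None)  # first example may trigger JIT compilation, so skip the time limit
def test_tolerates_slow_first_example(length):
    assert length >= 2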
36 changes: 28 additions & 8 deletions vectorizers/tests/test_distances.py
@@ -1,4 +1,7 @@
 import pytest
+import hypothesis.extra.numpy as hyp_np
+from hypothesis import given, settings
+import hypothesis.strategies as st

 import numpy as np
 import scipy.sparse
@@ -11,6 +14,19 @@
     sparse_jensen_shannon_divergence,
 )

+TestDataStrategy = hyp_np.arrays(
+    dtype=np.float,
+    elements=st.floats(min_value=0, max_value=1),
+    unique=True,
+    shape=(10, 50),
+)
+TestDataStrategy_100 = hyp_np.arrays(
+    dtype=np.float,
+    elements=st.floats(min_value=0, max_value=1),
+    unique=True,
+    shape=(10, 100),
+)
+

 def test_hellinger():
     assert hellinger(np.array([0.0, 0.0]), np.array([0.0, 0.0])) == 0.0
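Note: `hypothesis.extra.numpy.arrays` replaces the previous `np.random.random(size=...)` calls, so each test now receives a reproducible, shrinkable 10×50 or 10×100 array of unique floats in [0, 1]. A standalone sketch of the same idiom (written with `np.float64`, the current spelling of the `np.float` alias used above, which NumPy 1.24 removed):

import numpy as np
import hypothesis.extra.numpy as hyp_np
import hypothesis.strategies as st
from hypothesis import given, settings

unit_rows = hyp_np.arrays(
    dtype=np.float64,  # np.float in the PR; np.float64 on current NumPy
    elements=st.floats(min_value=0, max_value=1),
    unique=True,  # no duplicate values anywhere in the array
    shape=(10, 50),
)

@given(unit_rows)
@settings(deadline=None)
def test_rows_lie_in_unit_interval(data):
    assert data.shape == (10, 50)
    assert ((data >= 0) & (data <= 1)).all()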
@@ -70,8 +86,9 @@ def test_sparse_hellinger():

 # Test using inequalities with Hellinger distance from Wikipedia
 # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance
-def test_total_variation():
-    test_data = np.random.random(size=(10, 50))
+@given(TestDataStrategy)
+@settings(deadline=None)
+def test_total_variation(test_data):
     test_data = normalize(test_data, norm="l1")
     for i in range(test_data.shape[0]):
         for j in range(i + 1, test_data.shape[0]):
@@ -83,8 +100,9 @@ def test_total_variation():

 # Test using inequalities with Hellinger distance from Wikipedia
 # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance
-def test_sparse_total_variation():
-    test_data = np.random.random(size=(10, 100))
+@given(TestDataStrategy_100)
+@settings(deadline=None)
+def test_sparse_total_variation(test_data):
     # sparsify
     test_data[test_data <= 0.5] = 0.0
     test_data = scipy.sparse.csr_matrix(test_data)
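Note: because `@given` now supplies a dense array, the sparse tests sparsify it by zeroing every entry at or below 0.5 before building the CSR matrix, so roughly half the entries become zeros that CSR then drops. The step in isolation, with made-up values in [0, 1]:

import numpy as np
import scipy.sparse

test_data = np.array([[0.2, 0.9], [0.6, 0.1]])
test_data[test_data <= 0.5] = 0.0  # zero out roughly half the entries
sparse_test_data = scipy.sparse.csr_matrix(test_data)
assert sparse_test_data.nnz == 2  # only the values above 0.5 survive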
@@ -108,8 +126,9 @@ def test_sparse_total_variation():
             assert tvd <= np.sqrt(2) * hd


-def test_jensen_shannon():
-    test_data = np.random.random(size=(10, 50))
+@given(TestDataStrategy)
+@settings(deadline=None)
+def test_jensen_shannon(test_data):
     test_data = normalize(test_data, norm="l1")
     for i in range(test_data.shape[0]):
         for j in range(i + 1, test_data.shape[0]):
@@ -123,8 +142,9 @@ def test_jensen_shannon():
             assert np.isclose(d, jensen_shannon_divergence(p, q))


-def test_sparse_jensen_shannon():
-    test_data = np.random.random(size=(10, 100))
+@given(TestDataStrategy_100)
+@settings(deadline=None)
+def test_sparse_jensen_shannon(test_data):
     # sparsify
     test_data[test_data <= 0.5] = 0.0
     sparse_test_data = scipy.sparse.csr_matrix(test_data)
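Note: the total-variation tests check the Wikipedia inequalities cited in the comments, H(p, q)² ≤ δ(p, q) ≤ √2 · H(p, q), for every pair of rows (the upper bound is visible above as `assert tvd <= np.sqrt(2) * hd`). A plain-numpy sketch of the quantities being compared, using reference helpers of mine rather than the library's implementations:

import numpy as np

def hellinger_ref(p, q):
    # H(p, q) = ||sqrt(p) - sqrt(q)||_2 / sqrt(2)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def total_variation_ref(p, q):
    # delta(p, q) = (1/2) * ||p - q||_1
    return 0.5 * np.abs(p - q).sum()

rng = np.random.default_rng(0)
p, q = rng.random(50), rng.random(50)
p, q = p / p.sum(), q / q.sum()  # l1-normalize into probability vectors

hd, tvd = hellinger_ref(p, q), total_variation_ref(p, q)
assert hd ** 2 <= tvd <= np.sqrt(2) * hd  # the inequality chain under test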