1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -16,6 +16,7 @@ jobs:
 source activate testenv
 conda install --yes pip numpy scipy scikit-learn pandas numba matplotlib sphinx sphinx_rtd_theme numpydoc pillow
 pip install sphinx-gallery
+pip install hypothesis
 pip install .
 cd doc
 make html
1 change: 1 addition & 0 deletions .travis.yml
@@ -37,6 +37,7 @@ install:
 fi
 - conda install --yes pandas numba
 - pip install codecov
+- pip install hypothesis
 - pip install coverage
 - pip install coveralls
 - pip install .
1 change: 1 addition & 0 deletions appveyor.yml
@@ -25,6 +25,7 @@ install:
 - "conda install --yes pip numpy==%NUMPY_VERSION% scipy==%SCIPY_VERSION% scikit-learn==%SKLEARN_VERSION% nose pytest pytest-cov"
 - conda install --yes numba pandas
 - pip install codecov
+- pip install hypothesis
 - pip install .

 test_script:
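Note: all three CI configurations gain the same `pip install hypothesis` line, since the reworked tests below import the hypothesis property-based testing library. For orientation, the smallest useful hypothesis test looks something like this (a generic sketch, not code from this PR):

from hypothesis import given
import hypothesis.strategies as st

@given(st.lists(st.integers()))
def test_sorting_is_idempotent(xs):
    # hypothesis generates many input lists and shrinks any failing
    # case to a minimal counterexample before reporting it
    assert sorted(sorted(xs)) == sorted(xs)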
59 changes: 38 additions & 21 deletions vectorizers/tests/test_common.py
@@ -1,4 +1,8 @@
 import pytest
+from hypothesis import given, example, note, settings
+import hypothesis.strategies as st
+from hypothesis.strategies import composite
+

 from sklearn.utils.estimator_checks import check_estimator

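Note: `composite` is imported here, though no `@composite` strategy appears in the hunks shown; presumably it is used elsewhere in the suite or reserved for follow-up work. For reference, a composite strategy bundles several dependent draws into one reusable generator, along these lines (a hypothetical `token_lists` strategy, not from this PR):

from hypothesis.strategies import composite, integers, lists

@composite
def token_lists(draw, max_token=10):
    # draw a length first, then a token list of exactly that length
    length = draw(integers(min_value=1, max_value=20))
    return draw(
        lists(integers(min_value=0, max_value=max_token), min_size=length, max_size=length)
    )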
@@ -159,7 +163,9 @@ def test_LabeledTreeCooccurrenceVectorizer_reduced_vocab():
 @pytest.mark.parametrize("max_token_occurrences", [None, 2])
 @pytest.mark.parametrize("min_document_occurrences", [None, 1])
 @pytest.mark.parametrize("max_document_frequency", [None, 0.7])
-@pytest.mark.parametrize("window_orientation", ["before", "after", "symmetric", "directional"])
+@pytest.mark.parametrize(
+    "window_orientation", ["before", "after", "symmetric", "directional"]
+)
 @pytest.mark.parametrize("window_radius", [1, 2])
 @pytest.mark.parametrize("kernel_function", ["harmonic", "flat"])
 def test_equality_of_CooccurrenceVectorizers(
@@ -275,28 +281,39 @@ def test_triangle_kernel():
     assert kernel[1] == 3.0


-def test_flat_kernel():
-    kernel = flat_kernel([0] * np.random.randint(2, 10), 0.0)
+@given(st.integers(min_value=2, max_value=30))
+@settings(deadline=None)
+def test_flat_kernel(length):
+    kernel = flat_kernel([0] * length, 0.0)
     assert np.all(kernel == 1.0)


-def test_ngrams_of():
-    for ngram_size in (1, 2, 4):
-        tokens = np.random.randint(10, size=np.random.poisson(5 + ngram_size))
-        ngrams = ngrams_of(tokens, ngram_size)
-        if len(tokens) >= ngram_size:
-            assert len(ngrams) == len(tokens) - (ngram_size - 1)
-        else:
-            assert len(ngrams) == 0
-        assert np.all(
-            [ngrams[i][0] == tokens[i] for i in range(len(tokens) - (ngram_size - 1))]
-        )
-        assert np.all(
-            [
-                ngrams[i][-1] == tokens[i + (ngram_size - 1)]
-                for i in range(len(tokens) - (ngram_size - 1))
-            ]
+@given(ngram_size=st.integers(min_value=1, max_value=6), data=st.data())
+@settings(deadline=None)
+def test_ngrams_of(ngram_size, data):
+    tokens = np.array(
+        data.draw(
+            st.lists(
+                st.integers(min_value=0, max_value=10),
+                min_size=3 + ngram_size,
+                max_size=10 + ngram_size,
+            )
+        )
+    )
+    ngrams = ngrams_of(tokens, ngram_size)
+    if len(tokens) >= ngram_size:
+        assert len(ngrams) == len(tokens) - (ngram_size - 1)
+    else:
+        assert len(ngrams) == 0
+    assert np.all(
+        [ngrams[i][0] == tokens[i] for i in range(len(tokens) - (ngram_size - 1))]
+    )
+    assert np.all(
+        [
+            ngrams[i][-1] == tokens[i + (ngram_size - 1)]
+            for i in range(len(tokens) - (ngram_size - 1))
+        ]
     )


 def test_find_bin_boundaries_min():
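Note: the rewritten `test_ngrams_of` uses the `st.data()` strategy, which lets a test draw values interactively so that one drawn parameter (`ngram_size`) can shape the bounds of a later draw (the token-list length). A minimal sketch of that pattern, independent of this PR:

from hypothesis import given
import hypothesis.strategies as st

@given(n=st.integers(min_value=1, max_value=5), data=st.data())
def test_later_draw_depends_on_n(n, data):
    # the drawn list's size bounds depend on the previously drawn n
    xs = data.draw(st.lists(st.integers(), min_size=n, max_size=2 * n))
    assert n <= len(xs) <= 2 * n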
@@ -315,7 +332,7 @@ def test_find_boundaries_all_dupes():


 def test_token_cooccurrence_vectorizer_basic():
-    vectorizer = TokenCooccurrenceVectorizer(window_orientation='symmetric')
+    vectorizer = TokenCooccurrenceVectorizer(window_orientation="symmetric")
     result = vectorizer.fit_transform(token_data)
     transform = vectorizer.transform(token_data)
     assert (result != transform).nnz == 0
@@ -365,7 +382,7 @@ def test_token_cooccurrence_vectorizer_column_order():


 def test_token_cooccurrence_vectorizer_transform():
-    vectorizer = TokenCooccurrenceVectorizer(window_orientation='symmetric')
+    vectorizer = TokenCooccurrenceVectorizer(window_orientation="symmetric")
     result = vectorizer.fit_transform(text_token_data_subset)
     transform = vectorizer.transform(text_token_data)
     assert result.shape == transform.shape
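Note: every converted test pairs `@given` with `@settings(deadline=None)`. By default hypothesis fails any single example that exceeds its per-example deadline (200 ms in recent releases); disabling it is the usual workaround for numba-jitted code like these kernels, where the first example pays the JIT compilation cost. The decorator order used here, with `@given` outermost, works; so does the reverse:

from hypothesis import given, settings
import hypothesis.strategies as st

@given(st.integers(min_value=2, max_value=30))
@settings(deadline=None)  # first example may trigger JIT compilation, so skip the time limit
def test_tolerates_slow_first_example(length):
    assert length >= 2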
36 changes: 28 additions & 8 deletions vectorizers/tests/test_distances.py
@@ -1,4 +1,7 @@
 import pytest
+import hypothesis.extra.numpy as hyp_np
+from hypothesis import given, settings
+import hypothesis.strategies as st

 import numpy as np
 import scipy.sparse
@@ -11,6 +14,19 @@
     sparse_jensen_shannon_divergence,
 )

+TestDataStrategy = hyp_np.arrays(
+    dtype=np.float,
+    elements=st.floats(min_value=0, max_value=1),
+    unique=True,
+    shape=(10, 50),
+)
+TestDataStrategy_100 = hyp_np.arrays(
+    dtype=np.float,
+    elements=st.floats(min_value=0, max_value=1),
+    unique=True,
+    shape=(10, 100),
+)
+

 def test_hellinger():
     assert hellinger(np.array([0.0, 0.0]), np.array([0.0, 0.0])) == 0.0
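Note: `hypothesis.extra.numpy.arrays` replaces the previous `np.random.random(size=...)` calls, so each test now receives a reproducible, shrinkable 10×50 or 10×100 array of unique floats in [0, 1]. A standalone sketch of the same idiom (written with `np.float64`, the current spelling of the `np.float` alias used above, which NumPy 1.24 removed):

import numpy as np
import hypothesis.extra.numpy as hyp_np
import hypothesis.strategies as st
from hypothesis import given, settings

unit_rows = hyp_np.arrays(
    dtype=np.float64,  # np.float in the PR; np.float64 on current NumPy
    elements=st.floats(min_value=0, max_value=1),
    unique=True,  # no duplicate values anywhere in the array
    shape=(10, 50),
)

@given(unit_rows)
@settings(deadline=None)
def test_rows_lie_in_unit_interval(data):
    assert data.shape == (10, 50)
    assert ((data >= 0) & (data <= 1)).all()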
@@ -70,8 +86,9 @@ def test_sparse_hellinger():

 # Test using inequalities with Hellinger distance from Wikipedia
 # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance
-def test_total_variation():
-    test_data = np.random.random(size=(10, 50))
+@given(TestDataStrategy)
+@settings(deadline=None)
+def test_total_variation(test_data):
     test_data = normalize(test_data, norm="l1")
     for i in range(test_data.shape[0]):
         for j in range(i + 1, test_data.shape[0]):
@@ -83,8 +100,9 @@ def test_total_variation():

 # Test using inequalities with Hellinger distance from Wikipedia
 # https://en.wikipedia.org/wiki/Hellinger_distance#Connection_with_the_statistical_distance
-def test_sparse_total_variation():
-    test_data = np.random.random(size=(10, 100))
+@given(TestDataStrategy_100)
+@settings(deadline=None)
+def test_sparse_total_variation(test_data):
     # sparsify
     test_data[test_data <= 0.5] = 0.0
     test_data = scipy.sparse.csr_matrix(test_data)
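Note: because `@given` now supplies a dense array, the sparse tests sparsify it by zeroing every entry at or below 0.5 before building the CSR matrix, so roughly half the entries become zeros that CSR then drops. The step in isolation, with made-up values in [0, 1]:

import numpy as np
import scipy.sparse

test_data = np.array([[0.2, 0.9], [0.6, 0.1]])
test_data[test_data <= 0.5] = 0.0  # zero out roughly half the entries
sparse_test_data = scipy.sparse.csr_matrix(test_data)
assert sparse_test_data.nnz == 2  # only the values above 0.5 survive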
@@ -108,8 +126,9 @@ def test_sparse_total_variation():
             assert tvd <= np.sqrt(2) * hd


-def test_jensen_shannon():
-    test_data = np.random.random(size=(10, 50))
+@given(TestDataStrategy)
+@settings(deadline=None)
+def test_jensen_shannon(test_data):
     test_data = normalize(test_data, norm="l1")
     for i in range(test_data.shape[0]):
         for j in range(i + 1, test_data.shape[0]):
@@ -123,8 +142,9 @@ def test_jensen_shannon():
             assert np.isclose(d, jensen_shannon_divergence(p, q))


-def test_sparse_jensen_shannon():
-    test_data = np.random.random(size=(10, 100))
+@given(TestDataStrategy_100)
+@settings(deadline=None)
+def test_sparse_jensen_shannon(test_data):
     # sparsify
     test_data[test_data <= 0.5] = 0.0
     sparse_test_data = scipy.sparse.csr_matrix(test_data)
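Note: the total-variation tests check the Wikipedia inequalities cited in the comments, H(p, q)² ≤ δ(p, q) ≤ √2 · H(p, q), for every pair of rows (the upper bound is visible above as `assert tvd <= np.sqrt(2) * hd`). A plain-numpy sketch of the quantities being compared, using reference helpers of mine rather than the library's implementations:

import numpy as np

def hellinger_ref(p, q):
    # H(p, q) = ||sqrt(p) - sqrt(q)||_2 / sqrt(2)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def total_variation_ref(p, q):
    # delta(p, q) = (1/2) * ||p - q||_1
    return 0.5 * np.abs(p - q).sum()

rng = np.random.default_rng(0)
p, q = rng.random(50), rng.random(50)
p, q = p / p.sum(), q / q.sum()  # l1-normalize into probability vectors

hd, tvd = hellinger_ref(p, q), total_variation_ref(p, q)
assert hd ** 2 <= tvd <= np.sqrt(2) * hd  # the inequality chain under test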