class AffineTestCases(unittest.TestCase):
    """Tests for the ``Affine`` measure: raw scores, accessors, mutators and
    rejection of non-string input.

    Negative tests use ``unittest``'s own ``assertRaises`` rather than the
    ``@raises`` decorator pulled in by ``from nose.tools import *``, so the
    class only depends on the standard library and the measure under test.
    """

    def setUp(self):
        """Build one default measure and two customised ones per test."""

        def exact_match(s1, s2):
            # 1 for identical characters, 0 otherwise.
            return int(s1 == s2)

        self.affine = Affine()
        self.affine_with_params1 = Affine(gap_start=2, gap_continuation=0.5)
        self.sim_func = exact_match
        self.affine_with_params2 = Affine(gap_continuation=0.2,
                                          sim_func=self.sim_func)

    # ---- valid input ----------------------------------------------------

    def test_valid_input(self):
        """Raw scores for ASCII strings under default and custom settings."""
        self.assertAlmostEqual(
            self.affine.get_raw_score('dva', 'deeva'), 1.5)
        self.assertAlmostEqual(
            self.affine_with_params1.get_raw_score('dva', 'deeve'), -0.5)
        self.assertAlmostEqual(
            self.affine_with_params2.get_raw_score('AAAGAATTCA', 'AAATCA'),
            4.4)
        self.assertAlmostEqual(
            self.affine_with_params2.get_raw_score(' ', ' '), 1)
        self.assertEqual(self.affine.get_raw_score('', 'deeva'), 0)

    def test_valid_input_non_ascii(self):
        """Same expected score for explicit-unicode, plain and UTF-8 byte
        spellings of the second string."""
        self.assertAlmostEqual(
            self.affine.get_raw_score(u'dva', u'dáóva'), 1.5)
        self.assertAlmostEqual(
            self.affine.get_raw_score('dva', 'dáóva'), 1.5)
        self.assertAlmostEqual(
            self.affine.get_raw_score('dva', b'd\xc3\xa1\xc3\xb3va'), 1.5)

    # ---- accessors ------------------------------------------------------

    def test_get_gap_start(self):
        self.assertEqual(self.affine_with_params1.get_gap_start(), 2)

    def test_get_gap_continuation(self):
        self.assertEqual(
            self.affine_with_params2.get_gap_continuation(), 0.2)

    def test_get_sim_func(self):
        self.assertEqual(
            self.affine_with_params2.get_sim_func(), self.sim_func)

    # ---- mutators: each checks the score before and after the change ----

    def test_set_gap_start(self):
        af = Affine(gap_start=1)
        self.assertEqual(af.get_gap_start(), 1)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.5)
        self.assertEqual(af.set_gap_start(2), True)
        self.assertEqual(af.get_gap_start(), 2)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 0.5)

    def test_set_gap_continuation(self):
        af = Affine(gap_continuation=0.3)
        self.assertEqual(af.get_gap_continuation(), 0.3)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.7)
        self.assertEqual(af.set_gap_continuation(0.7), True)
        self.assertEqual(af.get_gap_continuation(), 0.7)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.3)

    def test_set_sim_func(self):
        def fn1(s1, s2):
            return int(1 if s1 == s2 else 0)

        def fn2(s1, s2):
            return int(2 if s1 == s2 else -1)

        af = Affine(sim_func=fn1)
        self.assertEqual(af.get_sim_func(), fn1)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.5)
        self.assertEqual(af.set_sim_func(fn2), True)
        self.assertEqual(af.get_sim_func(), fn2)
        self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 4.5)

    # ---- invalid input: every call must raise TypeError -----------------

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          None, 'MARHTA')

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          'MARHTA', None)

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          'MARHTA', 12.90)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          12.90, 'MARTHA')

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          None, None)

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.affine.get_raw_score,
                          12.90, 12.90)
class BagDistanceTestCases(unittest.TestCase):
    """Tests for ``BagDistance``: raw distance, normalised similarity and
    rejection of non-string input.

    Expectations are kept in tables so each pair is listed once per score
    type. Float similarities are compared with ``assertAlmostEqual`` instead
    of exact equality, and negative tests use ``assertRaises`` from
    ``unittest`` rather than nose's ``@raises``.
    """

    # (string1, string2, expected raw bag distance)
    _RAW_CASES = (
        ('a', '', 1),
        ('', 'a', 1),
        ('abc', '', 3),
        ('', 'abc', 3),
        ('', '', 0),
        ('a', 'a', 0),
        ('abc', 'abc', 0),
        ('a', 'ab', 1),
        ('b', 'ab', 1),
        ('ac', 'abc', 1),
        ('abcdefg', 'xabxcdxxefxgx', 6),
        ('ab', 'a', 1),
        ('ab', 'b', 1),
        ('abc', 'ac', 1),
        ('xabxcdxxefxgx', 'abcdefg', 6),
        ('a', 'b', 1),
        ('ab', 'ac', 1),
        ('ac', 'bc', 1),
        ('abc', 'axc', 1),
        ('xabxcdxxefxgx', '1ab2cd34ef5g6', 6),
        ('example', 'samples', 2),
        ('sturgeon', 'urgently', 2),
        ('bag_distance', 'frankenstein', 6),
        ('distance', 'difference', 5),
        ('java was neat', 'scala is great', 6),
    )

    # (string1, string2, expected similarity)
    _SIM_CASES = (
        ('a', '', 0.0),
        ('', 'a', 0.0),
        ('abc', '', 0.0),
        ('', 'abc', 0.0),
        ('', '', 1.0),
        ('a', 'a', 1.0),
        ('abc', 'abc', 1.0),
        ('a', 'ab', 1.0 - (1.0 / 2.0)),
        ('b', 'ab', 1.0 - (1.0 / 2.0)),
        ('ac', 'abc', 1.0 - (1.0 / 3.0)),
        ('abcdefg', 'xabxcdxxefxgx', 1.0 - (6.0 / 13.0)),
        ('ab', 'a', 1.0 - (1.0 / 2.0)),
        ('ab', 'b', 1.0 - (1.0 / 2.0)),
        ('abc', 'ac', 1.0 - (1.0 / 3.0)),
        ('xabxcdxxefxgx', 'abcdefg', 1.0 - (6.0 / 13.0)),
        ('a', 'b', 0.0),
        ('ab', 'ac', 1.0 - (1.0 / 2.0)),
        ('ac', 'bc', 1.0 - (1.0 / 2.0)),
        ('abc', 'axc', 1.0 - (1.0 / 3.0)),
        ('xabxcdxxefxgx', '1ab2cd34ef5g6', 1.0 - (6.0 / 13.0)),
        ('example', 'samples', 1.0 - (2.0 / 7.0)),
        ('sturgeon', 'urgently', 1.0 - (2.0 / 8.0)),
        ('bag_distance', 'frankenstein', 1.0 - (6.0 / 12.0)),
        ('distance', 'difference', 1.0 - (5.0 / 10.0)),
        ('java was neat', 'scala is great', 1.0 - (6.0 / 14.0)),
    )

    def setUp(self):
        self.bd = BagDistance()

    def test_valid_input_raw_score(self):
        """Raw distance matches the table for every string pair."""
        for left, right, expected in self._RAW_CASES:
            self.assertEqual(self.bd.get_raw_score(left, right), expected,
                             msg='raw score of %r vs %r' % (left, right))

    def test_valid_input_sim_score(self):
        """Similarity matches the table for every string pair."""
        for left, right, expected in self._SIM_CASES:
            # Floats: compare approximately, not bit-for-bit.
            self.assertAlmostEqual(
                self.bd.get_sim_score(left, right), expected,
                msg='sim score of %r vs %r' % (left, right))

    # ---- invalid input: get_raw_score must raise TypeError --------------

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.bd.get_raw_score, 'a', None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.bd.get_raw_score, None, 'b')

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.bd.get_raw_score, None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.bd.get_raw_score, 'MARHTA', 12.90)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.bd.get_raw_score, 12.90, 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.bd.get_raw_score, 12.90, 12.90)

    # ---- invalid input: get_sim_score must raise TypeError --------------

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.bd.get_sim_score, 'a', None)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.bd.get_sim_score, None, 'b')

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.bd.get_sim_score, None, None)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.bd.get_sim_score, 'MARHTA', 12.90)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.bd.get_sim_score, 12.90, 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.bd.get_sim_score, 12.90, 12.90)
class CosineTestCases(unittest.TestCase):
    """Tests for ``Cosine``: raw score, similarity score and rejection of
    inputs that are not token collections.

    The raw and similarity tests assert the same expectations, so both read
    from one table. Float results are compared with ``assertAlmostEqual``,
    and negative tests use ``unittest``'s ``assertRaises`` instead of nose's
    ``@raises`` decorator.
    """

    def setUp(self):
        self.cos = Cosine()

    @staticmethod
    def _expected_scores():
        """Return fresh ``(tokens1, tokens2, expected)`` triples.

        Built on every call so no list or set is shared between tests.
        """
        half = 1.0 / (math.sqrt(2) * math.sqrt(2))
        return [
            (['data', 'science'], ['data'],
             1.0 / (math.sqrt(2) * math.sqrt(1))),
            (['data', 'science'], ['science', 'good'], half),
            ([], ['data'], 0.0),
            (['data', 'data', 'science'], ['data', 'management'], half),
            (['data', 'management'], ['data', 'data', 'science'], half),
            ([], [], 1.0),
            (set(), set(), 1.0),
            # Repeated elements in the literals collapse in the sets.
            ({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8},
             3.0 / (math.sqrt(4) * math.sqrt(7))),
        ]

    def test_valid_input_raw_score(self):
        for first, second, expected in self._expected_scores():
            self.assertAlmostEqual(
                self.cos.get_raw_score(first, second), expected,
                msg='raw score of %r vs %r' % (first, second))

    def test_valid_input_sim_score(self):
        for first, second, expected in self._expected_scores():
            self.assertAlmostEqual(
                self.cos.get_sim_score(first, second), expected,
                msg='sim score of %r vs %r' % (first, second))

    # ---- invalid input: get_raw_score must raise TypeError --------------

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, 1, 1)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, ['a'], None)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, None, ['b'])

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score, None, None)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score,
                          ['MARHTA'], 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score,
                          'MARHTA', ['MARTHA'])

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.cos.get_raw_score,
                          'MARTHA', 'MARTHA')

    # ---- invalid input: get_sim_score must raise TypeError --------------

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, 1, 1)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, ['a'], None)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, None, ['b'])

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score, None, None)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score,
                          ['MARHTA'], 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score,
                          'MARHTA', ['MARTHA'])

    def test_invalid_input7_sim_score(self):
        self.assertRaises(TypeError, self.cos.get_sim_score,
                          'MARTHA', 'MARTHA')
class DiceTestCases(unittest.TestCase):
    """Tests for ``Dice``: raw score, similarity score and rejection of
    inputs that are not token collections.

    Both score tests assert the same expectations, so they read from one
    table. Float results are compared with ``assertAlmostEqual``, and
    negative tests use ``unittest``'s ``assertRaises`` instead of nose's
    ``@raises`` decorator.
    """

    def setUp(self):
        self.dice = Dice()

    @staticmethod
    def _expected_scores():
        """Return fresh ``(tokens1, tokens2, expected)`` triples.

        Built on every call so no list or set is shared between tests.
        """
        return [
            (['data', 'science'], ['data'], 2 * 1.0 / 3.0),
            (['data', 'science'], ['science', 'good'], 2 * 1.0 / 4.0),
            ([], ['data'], 0),
            (['data', 'data', 'science'], ['data', 'management'],
             2 * 1.0 / 4.0),
            (['data', 'management'], ['data', 'data', 'science'],
             2 * 1.0 / 4.0),
            ([], [], 1.0),
            (['a', 'b'], ['b', 'a'], 1.0),
            (set(), set(), 1.0),
            # Repeated elements in the literals collapse in the sets.
            ({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}, 2 * 3.0 / 11.0),
        ]

    def test_valid_input_raw_score(self):
        for first, second, expected in self._expected_scores():
            self.assertAlmostEqual(
                self.dice.get_raw_score(first, second), expected,
                msg='raw score of %r vs %r' % (first, second))

    def test_valid_input_sim_score(self):
        for first, second, expected in self._expected_scores():
            self.assertAlmostEqual(
                self.dice.get_sim_score(first, second), expected,
                msg='sim score of %r vs %r' % (first, second))

    # ---- invalid input: get_raw_score must raise TypeError --------------

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, 1, 1)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, ['a'], None)

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, None, ['b'])

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score, None, None)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score,
                          None, 'MARHTA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score,
                          'MARHTA', None)

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.dice.get_raw_score,
                          'MARHTA', 'MARTHA')

    # ---- invalid input: get_sim_score must raise TypeError --------------

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, 1, 1)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, ['a'], None)

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, None, ['b'])

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score, None, None)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score,
                          None, 'MARHTA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score,
                          'MARHTA', None)

    def test_invalid_input7_sim_score(self):
        self.assertRaises(TypeError, self.dice.get_sim_score,
                          'MARHTA', 'MARTHA')
class EditexTestCases(unittest.TestCase):
    """Tests for ``Editex``: accessors, mutators, raw distance, normalised
    similarity and rejection of non-string input.

    Float similarities are compared with ``assertAlmostEqual`` rather than
    exact equality, and negative tests use ``unittest``'s ``assertRaises``
    instead of nose's ``@raises`` decorator.
    """

    def setUp(self):
        """Create the default measure plus six parameter variations."""
        self.ed = Editex()
        self.ed_with_params1 = Editex(match_cost=2)
        self.ed_with_params2 = Editex(mismatch_cost=2)
        self.ed_with_params3 = Editex(mismatch_cost=1)
        self.ed_with_params4 = Editex(mismatch_cost=3, group_cost=2)
        self.ed_with_params5 = Editex(mismatch_cost=3, group_cost=2,
                                      local=True)
        self.ed_with_params6 = Editex(local=True)

    # ---- accessors ------------------------------------------------------

    def test_get_match_cost(self):
        self.assertEqual(self.ed_with_params1.get_match_cost(), 2)

    def test_get_group_cost(self):
        self.assertEqual(self.ed_with_params4.get_group_cost(), 2)

    def test_get_mismatch_cost(self):
        self.assertEqual(self.ed_with_params4.get_mismatch_cost(), 3)

    def test_get_local(self):
        self.assertEqual(self.ed_with_params5.get_local(), True)

    # ---- mutators: each checks the score before and after the change ----

    def test_set_match_cost(self):
        ed = Editex(match_cost=2)
        self.assertEqual(ed.get_match_cost(), 2)
        self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 12)
        self.assertEqual(ed.set_match_cost(4), True)
        self.assertEqual(ed.get_match_cost(), 4)
        self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 14)

    def test_set_group_cost(self):
        ed = Editex(group_cost=1)
        self.assertEqual(ed.get_group_cost(), 1)
        self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 3)
        self.assertEqual(ed.set_group_cost(2), True)
        self.assertEqual(ed.get_group_cost(), 2)
        self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 4)

    def test_set_mismatch_cost(self):
        ed = Editex(mismatch_cost=2)
        self.assertEqual(ed.get_mismatch_cost(), 2)
        self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 3)
        self.assertEqual(ed.set_mismatch_cost(4), True)
        self.assertEqual(ed.get_mismatch_cost(), 4)
        self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 5)

    def test_set_local(self):
        ed = Editex(local=False)
        self.assertEqual(ed.get_local(), False)
        self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 3)
        self.assertEqual(ed.set_local(True), True)
        self.assertEqual(ed.get_local(), True)
        # The expected score for this pair is 3 in both modes.
        self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 3)

    # ---- valid input ----------------------------------------------------

    def test_valid_input_raw_score(self):
        """Raw distance for each (measure, string1, string2) combination."""
        cases = [
            (self.ed, 'MARTHA', 'MARTHA', 0),
            (self.ed, 'MARTHA', 'MARHTA', 3),
            (self.ed, 'ALIE', 'ALI', 1),
            (self.ed_with_params1, 'ALIE', 'ALI', 7),
            (self.ed_with_params2, 'ALIE', 'ALIF', 2),
            (self.ed_with_params3, 'ALIE', 'ALIF', 1),
            (self.ed_with_params4, 'ALIP', 'ALIF', 2),
            (self.ed_with_params4, 'ALIe', 'ALIF', 3),
            (self.ed_with_params5, 'WALIW', 'HALIH', 6),
            (self.ed_with_params6, 'niall', 'nihal', 2),
            (self.ed_with_params6, 'nihal', 'niall', 2),
            (self.ed_with_params6, 'neal', 'nihl', 3),
            (self.ed_with_params6, 'nihl', 'neal', 3),
            (self.ed, '', '', 0),
            (self.ed, '', 'MARTHA', 12),
            (self.ed, 'MARTHA', '', 12),
        ]
        for measure, left, right, expected in cases:
            self.assertEqual(measure.get_raw_score(left, right), expected,
                             msg='raw score of %r vs %r' % (left, right))

    def test_valid_input_sim_score(self):
        """Similarity for each (measure, string1, string2) combination."""
        cases = [
            (self.ed, 'MARTHA', 'MARTHA', 1.0),
            (self.ed, 'MARTHA', 'MARHTA', 1.0 - (3.0 / 12.0)),
            (self.ed, 'ALIE', 'ALI', 1.0 - (1.0 / 8.0)),
            (self.ed_with_params1, 'ALIE', 'ALI', 1.0 - (7.0 / 8.0)),
            (self.ed_with_params2, 'ALIE', 'ALIF', 1.0 - (2.0 / 8.0)),
            (self.ed_with_params3, 'ALIE', 'ALIF', 1.0 - (1.0 / 4.0)),
            (self.ed_with_params4, 'ALIP', 'ALIF', 1.0 - (2.0 / 12.0)),
            (self.ed_with_params4, 'ALIe', 'ALIF', 1.0 - (3.0 / 12.0)),
            (self.ed_with_params5, 'WALIW', 'HALIH', 1.0 - (6.0 / 15.0)),
            (self.ed_with_params6, 'niall', 'nihal', 1.0 - (2.0 / 10.0)),
            (self.ed_with_params6, 'nihal', 'niall', 1.0 - (2.0 / 10.0)),
            (self.ed_with_params6, 'neal', 'nihl', 1.0 - (3.0 / 8.0)),
            (self.ed_with_params6, 'nihl', 'neal', 1.0 - (3.0 / 8.0)),
            (self.ed, '', '', 1.0),
        ]
        for measure, left, right, expected in cases:
            # Floats: compare approximately, not bit-for-bit.
            self.assertAlmostEqual(
                measure.get_sim_score(left, right), expected,
                msg='sim score of %r vs %r' % (left, right))

    # ---- invalid input: get_raw_score must raise TypeError --------------

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.ed.get_raw_score, None, 'MARHTA')

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.ed.get_raw_score, 'MARHTA', None)

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.ed.get_raw_score, None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.ed.get_raw_score, 'MARHTA', 12.90)

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.ed.get_raw_score, 12.90, 'MARTHA')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.ed.get_raw_score, 12.90, 12.90)

    # ---- invalid input: get_sim_score must raise TypeError --------------

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.ed.get_sim_score, None, 'MARHTA')

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.ed.get_sim_score, 'MARHTA', None)

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.ed.get_sim_score, None, None)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.ed.get_sim_score, 'MARHTA', 12.90)

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.ed.get_sim_score, 12.90, 'MARTHA')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.ed.get_sim_score, 12.90, 12.90)
class GeneralizedJaccardTestCases(unittest.TestCase):
    """Tests for ``GeneralizedJaccard``: accessors, mutators, raw and
    similarity scores, non-ASCII tokens and invalid input.

    The long token lists used by several tests are named once below. Float
    scores are compared with ``assertAlmostEqual`` instead of exact equality
    against 16-digit literals, and negative tests use ``unittest``'s
    ``assertRaises`` instead of nose's ``@raises`` decorator.
    """

    # Stored as tuples and copied to lists per use, so every call gets its
    # own list as in the original inline literals.
    _AFFILIATION_ABBREV = ('Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,',
                           'University', 'of', 'California,', 'San', 'Diego')
    _AFFILIATION_TYPOS = ('Comp', 'Sci.', 'and', 'Engr', 'Dept.,',
                          'Universty', 'of', 'Cal,', 'San', 'Deigo')
    _AFFILIATION_TARGET = ('Department', 'of', 'Computer', 'Science,',
                           'Univ.', 'Calif.,', 'San', 'Diego')

    def setUp(self):
        self.gen_jac = GeneralizedJaccard()
        self.jw_fn = JaroWinkler().get_raw_score
        self.gen_jac_with_jw = GeneralizedJaccard(sim_func=self.jw_fn)
        self.gen_jac_with_jw_08 = GeneralizedJaccard(sim_func=self.jw_fn,
                                                     threshold=0.8)
        # The invalid-measure tests expect scoring with this instance to
        # raise ValueError.
        self.gen_jac_invalid = GeneralizedJaccard(
            sim_func=NeedlemanWunsch().get_raw_score, threshold=0.8)

    def _score_cases(self):
        """Return ``(measure, tokens1, tokens2, expected)`` rows shared by
        the raw-score and similarity-score tests."""
        abbrev = list(self._AFFILIATION_ABBREV)
        typos = list(self._AFFILIATION_TYPOS)
        target = list(self._AFFILIATION_TARGET)
        return [
            # TODO: the original author flagged this expectation with
            # "need to check this".
            (self.gen_jac, [''], [''], 1.0),
            (self.gen_jac, [''], ['a'], 0.0),
            (self.gen_jac, ['a'], ['a'], 1.0),
            (self.gen_jac, [], ['Nigel'], 0.0),
            (self.gen_jac, ['Niall'], ['Neal'], 0.7833333333333333),
            (self.gen_jac, ['Niall'], ['Njall', 'Neal'],
             0.43333333333333335),
            (self.gen_jac, ['Niall'], ['Neal', 'Njall'],
             0.43333333333333335),
            (self.gen_jac, abbrev, target, 0.6800468975468975),
            (self.gen_jac_with_jw, abbrev, target, 0.7220003607503608),
            (self.gen_jac_with_jw, typos, target, 0.7075277777777778),
            (self.gen_jac_with_jw_08, typos, target, 0.45810185185185187),
        ]

    @staticmethod
    def _non_ascii_cases():
        """Return the same token pair spelled as explicit unicode, plain
        literals and UTF-8 bytes."""
        return [
            ([u'Nóáll'], [u'Neál']),
            (['Nóáll'], ['Neál']),
            ([b'N\xc3\xb3\xc3\xa1ll'], [b'Ne\xc3\xa1l']),
        ]

    # ---- accessors and mutators -----------------------------------------

    def test_get_sim_func(self):
        self.assertEqual(self.gen_jac_with_jw_08.get_sim_func(), self.jw_fn)

    def test_get_threshold(self):
        self.assertEqual(self.gen_jac_with_jw_08.get_threshold(), 0.8)

    def test_set_threshold(self):
        gj = GeneralizedJaccard(threshold=0.8)
        self.assertEqual(gj.get_threshold(), 0.8)
        self.assertAlmostEqual(
            gj.get_raw_score(['Niall'], ['Neal', 'Njall']),
            0.43333333333333335)
        self.assertEqual(gj.set_threshold(0.9), True)
        self.assertEqual(gj.get_threshold(), 0.9)
        self.assertAlmostEqual(
            gj.get_raw_score(['Niall'], ['Neal', 'Njall']), 0.0)

    def test_set_sim_func(self):
        fn1 = JaroWinkler().get_raw_score
        fn2 = Jaro().get_raw_score
        gj = GeneralizedJaccard(sim_func=fn1)
        self.assertEqual(gj.get_sim_func(), fn1)
        self.assertAlmostEqual(
            gj.get_raw_score(['Niall'], ['Neal', 'Njall']), 0.44)
        self.assertEqual(gj.set_sim_func(fn2), True)
        self.assertEqual(gj.get_sim_func(), fn2)
        self.assertAlmostEqual(
            gj.get_raw_score(['Niall'], ['Neal', 'Njall']),
            0.43333333333333335)

    # ---- valid input ----------------------------------------------------

    def test_valid_input_raw_score(self):
        for measure, first, second, expected in self._score_cases():
            self.assertAlmostEqual(
                measure.get_raw_score(first, second), expected,
                msg='raw score of %r vs %r' % (first, second))

    def test_valid_input_sim_score(self):
        for measure, first, second, expected in self._score_cases():
            self.assertAlmostEqual(
                measure.get_sim_score(first, second), expected,
                msg='sim score of %r vs %r' % (first, second))

    def test_valid_input_non_ascii_raw_score(self):
        for first, second in self._non_ascii_cases():
            self.assertAlmostEqual(
                self.gen_jac.get_raw_score(first, second),
                0.7833333333333333)

    def test_valid_input_non_ascii_sim_score(self):
        for first, second in self._non_ascii_cases():
            self.assertAlmostEqual(
                self.gen_jac.get_sim_score(first, second),
                0.7833333333333333)

    # ---- invalid input: get_raw_score -----------------------------------

    def test_invalid_input1_raw_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_raw_score, 1, 1)

    def test_invalid_input2_raw_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_raw_score,
                          None, ['b'])

    def test_invalid_input3_raw_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_raw_score,
                          None, None)

    def test_invalid_input4_raw_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_raw_score,
                          "temp", "temp")

    def test_invalid_input5_raw_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_raw_score,
                          ['temp'], 'temp')

    def test_invalid_input6_raw_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_raw_score,
                          ['a'], None)

    def test_invalid_input7_raw_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_raw_score,
                          'temp', ['temp'])

    def test_invalid_sim_measure(self):
        self.assertRaises(ValueError, self.gen_jac_invalid.get_raw_score,
                          list(self._AFFILIATION_TYPOS),
                          list(self._AFFILIATION_TARGET))

    # ---- invalid input: get_sim_score -----------------------------------

    def test_invalid_input1_sim_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_sim_score, 1, 1)

    def test_invalid_input2_sim_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_sim_score,
                          None, ['b'])

    def test_invalid_input3_sim_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_sim_score,
                          None, None)

    def test_invalid_input4_sim_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_sim_score,
                          "temp", "temp")

    def test_invalid_input5_sim_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_sim_score,
                          ['temp'], 'temp')

    def test_invalid_input6_sim_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_sim_score,
                          ['a'], None)

    def test_invalid_input7_sim_score(self):
        self.assertRaises(TypeError, self.gen_jac.get_sim_score,
                          'temp', ['temp'])

    def test_invalid_sim_measure_sim_score(self):
        self.assertRaises(ValueError, self.gen_jac_invalid.get_sim_score,
                          list(self._AFFILIATION_TYPOS),
                          list(self._AFFILIATION_TARGET))
@raises(TypeError) + def test_invalid_input4_sim_score(self): + self.gen_jac.get_sim_score("temp", "temp") + + @raises(TypeError) + def test_invalid_input5_sim_score(self): + self.gen_jac.get_sim_score(['temp'], 'temp') + + @raises(TypeError) + def test_invalid_input6_sim_score(self): + self.gen_jac.get_sim_score(['a'], None) + + @raises(TypeError) + def test_invalid_input7_sim_score(self): + self.gen_jac.get_sim_score('temp', ['temp']) + + @raises(ValueError) + def test_invalid_sim_measure_sim_score(self): + self.gen_jac_invalid.get_sim_score( + ['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], + ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_HammingDistance.py b/py_stringmatching/tests/test_sim_HammingDistance.py new file mode 100644 index 0000000..e922874 --- /dev/null +++ b/py_stringmatching/tests/test_sim_HammingDistance.py @@ -0,0 +1,130 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.hamming_distance import HammingDistance + +class HammingDistanceTestCases(unittest.TestCase): + def setUp(self): + self.hd = HammingDistance() + + def test_valid_input_raw_score(self): + self.assertEqual(self.hd.get_raw_score('-789', 'john'), 4) + self.assertEqual(self.hd.get_raw_score('a', '*'), 1) + self.assertEqual(self.hd.get_raw_score('b', 'a'), 1) + self.assertEqual(self.hd.get_raw_score('abc', 'p q'), 3) + self.assertEqual(self.hd.get_raw_score('karolin', 'kathrin'), 3) + self.assertEqual(self.hd.get_raw_score('KARI', 'kari'), 4) + self.assertEqual(self.hd.get_raw_score('', ''), 0) + + def test_valid_input_sim_score(self): + self.assertEqual(self.hd.get_sim_score('-789', 'john'), 1.0 - (4.0/4.0)) + self.assertEqual(self.hd.get_sim_score('a', '*'), 1.0 - (1.0/1.0)) + self.assertEqual(self.hd.get_sim_score('b', 
'a'), 1.0 - (1.0/1.0)) + self.assertEqual(self.hd.get_sim_score('abc', 'p q'), 1.0 - (3.0/3.0)) + self.assertEqual(self.hd.get_sim_score('karolin', 'kathrin'), 1.0 - (3.0/7.0)) + self.assertEqual(self.hd.get_sim_score('KARI', 'kari'), 1.0 - (4.0/4.0)) + self.assertEqual(self.hd.get_sim_score('', ''), 1.0) + + def test_valid_input_compatibility_raw_score(self): + self.assertEqual(self.hd.get_raw_score(u'karolin', u'kathrin'), 3) + self.assertEqual(self.hd.get_raw_score(u'', u''), 0) + # str_1 = u'foo'.encode(encoding='UTF-8', errors='strict') + # str_2 = u'bar'.encode(encoding='UTF-8', errors='strict') + # self.assertEqual(self.hd.get_raw_score(str_1, str_2), 3) # check with Ali - python 3 returns type error + # self.assertEqual(self.hd.get_raw_score(str_1, str_1), 0) # check with Ali - python 3 returns type error + + def test_valid_input_compatibility_sim_score(self): + self.assertEqual(self.hd.get_sim_score(u'karolin', u'kathrin'), 1.0 - (3.0/7.0)) + self.assertEqual(self.hd.get_sim_score(u'', u''), 1.0) + + def test_valid_input_non_ascii_raw_score(self): + self.assertEqual(self.hd.get_raw_score(u'ábó', u'áóó'), 1) + self.assertEqual(self.hd.get_raw_score('ábó', 'áóó'), 1) + self.assertEqual(self.hd.get_raw_score(b'\xc3\xa1b\xc3\xb3', + b'\xc3\xa1\xc3\xb3\xc3\xb3'), + 1) + + def test_valid_input_non_ascii_sim_score(self): + self.assertEqual(self.hd.get_sim_score(u'ábó', u'áóó'), 1.0 - (1.0/3.0)) + self.assertEqual(self.hd.get_sim_score('ábó', 'áóó'), 1.0 - (1.0/3.0)) + self.assertEqual(self.hd.get_sim_score(b'\xc3\xa1b\xc3\xb3', + b'\xc3\xa1\xc3\xb3\xc3\xb3'), + 1.0 - (1.0/3.0)) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.hd.get_raw_score('a', None) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.hd.get_raw_score(None, 'b') + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.hd.get_raw_score(None, None) + + @raises(ValueError) + def test_invalid_input4_raw_score(self): + 
self.hd.get_raw_score('a', '') + + @raises(ValueError) + def test_invalid_input5_raw_score(self): + self.hd.get_raw_score('', 'This is a long string') + + @raises(ValueError) + def test_invalid_input6_raw_score(self): + self.hd.get_raw_score('ali', 'alex') + + @raises(TypeError) + def test_invalid_input7_raw_score(self): + self.hd.get_raw_score('MA', 12) + + @raises(TypeError) + def test_invalid_input8_raw_score(self): + self.hd.get_raw_score(12, 'MA') + + @raises(TypeError) + def test_invalid_input9_raw_score(self): + self.hd.get_raw_score(12, 12) + + @raises(TypeError) + def test_invalid_input1_sim_score(self): + self.hd.get_sim_score('a', None) + + @raises(TypeError) + def test_invalid_input2_sim_score(self): + self.hd.get_sim_score(None, 'b') + + @raises(TypeError) + def test_invalid_input3_sim_score(self): + self.hd.get_sim_score(None, None) + + @raises(ValueError) + def test_invalid_input4_sim_score(self): + self.hd.get_sim_score('a', '') + + @raises(ValueError) + def test_invalid_input5_sim_score(self): + self.hd.get_sim_score('', 'This is a long string') + + @raises(ValueError) + def test_invalid_input6_sim_score(self): + self.hd.get_sim_score('ali', 'alex') + + @raises(TypeError) + def test_invalid_input7_sim_score(self): + self.hd.get_sim_score('MA', 12) + + @raises(TypeError) + def test_invalid_input8_sim_score(self): + self.hd.get_sim_score(12, 'MA') + + @raises(TypeError) + def test_invalid_input9_sim_score(self): + self.hd.get_sim_score(12, 12) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_Jaccard.py b/py_stringmatching/tests/test_sim_Jaccard.py new file mode 100644 index 0000000..c853e60 --- /dev/null +++ b/py_stringmatching/tests/test_sim_Jaccard.py @@ -0,0 +1,100 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.jaccard import Jaccard + +class JaccardTestCases(unittest.TestCase): + def setUp(self): + 
self.jac = Jaccard() + + def test_valid_input_raw_score(self): + self.assertEqual(self.jac.get_raw_score(['data', 'science'], ['data']), + 1.0 / 2.0) + self.assertEqual(self.jac.get_raw_score(['data', 'science'], + ['science', 'good']), 1.0 / 3.0) + self.assertEqual(self.jac.get_raw_score([], ['data']), 0) + self.assertEqual(self.jac.get_raw_score(['data', 'data', 'science'], + ['data', 'management']), 1.0 / 3.0) + self.assertEqual(self.jac.get_raw_score(['data', 'management'], + ['data', 'data', 'science']), 1.0 / 3.0) + self.assertEqual(self.jac.get_raw_score([], []), 1.0) + self.assertEqual(self.jac.get_raw_score(set([]), set([])), 1.0) + self.assertEqual(self.jac.get_raw_score({1, 1, 2, 3, 4}, + {2, 3, 4, 5, 6, 7, 7, 8}), 3.0 / 8.0) + + def test_valid_input_sim_score(self): + self.assertEqual(self.jac.get_sim_score(['data', 'science'], ['data']), + 1.0 / 2.0) + self.assertEqual(self.jac.get_sim_score(['data', 'science'], + ['science', 'good']), 1.0 / 3.0) + self.assertEqual(self.jac.get_sim_score([], ['data']), 0) + self.assertEqual(self.jac.get_sim_score(['data', 'data', 'science'], + ['data', 'management']), 1.0 / 3.0) + self.assertEqual(self.jac.get_sim_score(['data', 'management'], + ['data', 'data', 'science']), 1.0 / 3.0) + self.assertEqual(self.jac.get_sim_score([], []), 1.0) + self.assertEqual(self.jac.get_sim_score(set([]), set([])), 1.0) + self.assertEqual(self.jac.get_sim_score({1, 1, 2, 3, 4}, + {2, 3, 4, 5, 6, 7, 7, 8}), 3.0 / 8.0) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.jac.get_raw_score(1, 1) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.jac.get_raw_score(['a'], None) + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.jac.get_raw_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.jac.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.jac.get_raw_score(['MARHTA'], 'MARTHA') + + 
@raises(TypeError) + def test_invalid_input6_raw_score(self): + self.jac.get_raw_score('MARHTA', ['MARTHA']) + + @raises(TypeError) + def test_invalid_input7_raw_score(self): + self.jac.get_raw_score('MARTHA', 'MARTHA') + + @raises(TypeError) + def test_invalid_input1_sim_score(self): + self.jac.get_sim_score(1, 1) + + @raises(TypeError) + def test_invalid_input2_sim_score(self): + self.jac.get_sim_score(['a'], None) + + @raises(TypeError) + def test_invalid_input3_sim_score(self): + self.jac.get_sim_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input4_sim_score(self): + self.jac.get_sim_score(None, None) + + @raises(TypeError) + def test_invalid_input5_sim_score(self): + self.jac.get_sim_score(['MARHTA'], 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_sim_score(self): + self.jac.get_sim_score('MARHTA', ['MARTHA']) + + @raises(TypeError) + def test_invalid_input7_sim_score(self): + self.jac.get_sim_score('MARTHA', 'MARTHA') \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_Jaro.py b/py_stringmatching/tests/test_sim_Jaro.py new file mode 100644 index 0000000..21fa650 --- /dev/null +++ b/py_stringmatching/tests/test_sim_Jaro.py @@ -0,0 +1,103 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.jaro import Jaro + +class JaroTestCases(unittest.TestCase): + def setUp(self): + self.jaro = Jaro() + + def test_valid_input_raw_score(self): + # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance + self.assertAlmostEqual(self.jaro.get_raw_score('MARTHA', 'MARHTA'), + 0.9444444444444445) + self.assertAlmostEqual(self.jaro.get_raw_score('DWAYNE', 'DUANE'), + 0.8222222222222223) + self.assertAlmostEqual(self.jaro.get_raw_score('DIXON', 'DICKSONX'), + 0.7666666666666666) + self.assertEqual(self.jaro.get_raw_score('', 'deeva'), 0) + + def test_valid_input_sim_score(self): + 
self.assertAlmostEqual(self.jaro.get_sim_score('MARTHA', 'MARHTA'), + 0.9444444444444445) + self.assertAlmostEqual(self.jaro.get_sim_score('DWAYNE', 'DUANE'), + 0.8222222222222223) + self.assertAlmostEqual(self.jaro.get_sim_score('DIXON', 'DICKSONX'), + 0.7666666666666666) + self.assertEqual(self.jaro.get_sim_score('', 'deeva'), 0) + + def test_non_ascii_input_raw_score(self): + self.assertAlmostEqual(self.jaro.get_raw_score(u'MARTHA', u'MARHTA'), + 0.9444444444444445) + self.assertAlmostEqual(self.jaro.get_raw_score(u'László', u'Lsáló'), + 0.8777777777777779) + self.assertAlmostEqual(self.jaro.get_raw_score('László', 'Lsáló'), + 0.8777777777777779) + self.assertAlmostEqual(self.jaro.get_raw_score(b'L\xc3\xa1szl\xc3\xb3', + b'Ls\xc3\xa1l\xc3\xb3'), + 0.8777777777777779) + + def test_non_ascii_input_sim_score(self): + self.assertAlmostEqual(self.jaro.get_sim_score(u'MARTHA', u'MARHTA'), + 0.9444444444444445) + self.assertAlmostEqual(self.jaro.get_sim_score(u'László', u'Lsáló'), + 0.8777777777777779) + self.assertAlmostEqual(self.jaro.get_sim_score('László', 'Lsáló'), + 0.8777777777777779) + self.assertAlmostEqual(self.jaro.get_sim_score(b'L\xc3\xa1szl\xc3\xb3', + b'Ls\xc3\xa1l\xc3\xb3'), + 0.8777777777777779) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.jaro.get_raw_score(None, 'MARHTA') + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.jaro.get_raw_score('MARHTA', None) + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.jaro.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.jaro.get_raw_score('MARHTA', 12.90) + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.jaro.get_raw_score(12.90, 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.jaro.get_raw_score(12.90, 12.90) + + @raises(TypeError) + def test_invalid_input1_sim_score(self): + self.jaro.get_sim_score(None, 'MARHTA') + + 
@raises(TypeError) + def test_invalid_input2_sim_score(self): + self.jaro.get_sim_score('MARHTA', None) + + @raises(TypeError) + def test_invalid_input3_sim_score(self): + self.jaro.get_sim_score(None, None) + + @raises(TypeError) + def test_invalid_input4_sim_score(self): + self.jaro.get_sim_score('MARHTA', 12.90) + + @raises(TypeError) + def test_invalid_input5_sim_score(self): + self.jaro.get_sim_score(12.90, 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_sim_score(self): + self.jaro.get_sim_score(12.90, 12.90) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_JaroWinkler.py b/py_stringmatching/tests/test_sim_JaroWinkler.py new file mode 100644 index 0000000..a613d91 --- /dev/null +++ b/py_stringmatching/tests/test_sim_JaroWinkler.py @@ -0,0 +1,110 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler + +class JaroWinklerTestCases(unittest.TestCase): + def setUp(self): + self.jw = JaroWinkler() + + def test_get_prefix_weight(self): + self.assertEqual(self.jw.get_prefix_weight(), 0.1) + + def test_set_prefix_weight(self): + jw = JaroWinkler(prefix_weight=0.15) + self.assertEqual(jw.get_prefix_weight(), 0.15) + self.assertAlmostEqual(jw.get_raw_score('MARTHA', 'MARHTA'), 0.9694444444444444) + self.assertEqual(jw.set_prefix_weight(0.25), True) + self.assertEqual(jw.get_prefix_weight(), 0.25) + self.assertAlmostEqual(jw.get_raw_score('MARTHA', 'MARHTA'), 0.9861111111111112) + + def test_valid_input_raw_score(self): + # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance + self.assertAlmostEqual(self.jw.get_raw_score('MARTHA', 'MARHTA'), + 0.9611111111111111) + self.assertAlmostEqual(self.jw.get_raw_score('DWAYNE', 'DUANE'), 0.84) + self.assertAlmostEqual(self.jw.get_raw_score('DIXON', 'DICKSONX'), + 0.8133333333333332) + + def test_valid_input_sim_score(self): + 
self.assertAlmostEqual(self.jw.get_sim_score('MARTHA', 'MARHTA'), + 0.9611111111111111) + self.assertAlmostEqual(self.jw.get_sim_score('DWAYNE', 'DUANE'), 0.84) + self.assertAlmostEqual(self.jw.get_sim_score('DIXON', 'DICKSONX'), + 0.8133333333333332) + + def test_non_ascii_input_raw_score(self): + self.assertAlmostEqual(self.jw.get_raw_score(u'MARTHA', u'MARHTA'), + 0.9611111111111111) + self.assertAlmostEqual(self.jw.get_raw_score(u'László', u'Lsáló'), + 0.8900000000000001) + self.assertAlmostEqual(self.jw.get_raw_score('László', 'Lsáló'), + 0.8900000000000001) + self.assertAlmostEqual(self.jw.get_raw_score(b'L\xc3\xa1szl\xc3\xb3', + b'Ls\xc3\xa1l\xc3\xb3'), + 0.8900000000000001) + + def test_non_ascii_input_sim_score(self): + self.assertAlmostEqual(self.jw.get_sim_score(u'MARTHA', u'MARHTA'), + 0.9611111111111111) + self.assertAlmostEqual(self.jw.get_sim_score(u'László', u'Lsáló'), + 0.8900000000000001) + self.assertAlmostEqual(self.jw.get_sim_score('László', 'Lsáló'), + 0.8900000000000001) + self.assertAlmostEqual(self.jw.get_sim_score(b'L\xc3\xa1szl\xc3\xb3', + b'Ls\xc3\xa1l\xc3\xb3'), + 0.8900000000000001) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.jw.get_raw_score(None, 'MARHTA') + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.jw.get_raw_score('MARHTA', None) + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.jw.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.jw.get_raw_score('MARHTA', 12.90) + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.jw.get_raw_score(12.90, 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.jw.get_raw_score(12.90, 12.90) + + @raises(TypeError) + def test_invalid_input1_sim_score(self): + self.jw.get_sim_score(None, 'MARHTA') + + @raises(TypeError) + def test_invalid_input2_sim_score(self): + self.jw.get_sim_score('MARHTA', None) + + @raises(TypeError) + 
def test_invalid_input3_sim_score(self): + self.jw.get_sim_score(None, None) + + @raises(TypeError) + def test_invalid_input4_sim_score(self): + self.jw.get_sim_score('MARHTA', 12.90) + + @raises(TypeError) + def test_invalid_input5_sim_score(self): + self.jw.get_sim_score(12.90, 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_sim_score(self): + self.jw.get_sim_score(12.90, 12.90) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_Levenshtein.py b/py_stringmatching/tests/test_sim_Levenshtein.py new file mode 100644 index 0000000..029f72d --- /dev/null +++ b/py_stringmatching/tests/test_sim_Levenshtein.py @@ -0,0 +1,127 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.levenshtein import Levenshtein + +class LevenshteinTestCases(unittest.TestCase): + def setUp(self): + self.lev = Levenshtein() + + def test_valid_input_raw_score(self): + # http://oldfashionedsoftware.com/tag/levenshtein-distance/ + self.assertEqual(self.lev.get_raw_score('a', ''), 1) + self.assertEqual(self.lev.get_raw_score('', 'a'), 1) + self.assertEqual(self.lev.get_raw_score('abc', ''), 3) + self.assertEqual(self.lev.get_raw_score('', 'abc'), 3) + self.assertEqual(self.lev.get_raw_score('', ''), 0) + self.assertEqual(self.lev.get_raw_score('a', 'a'), 0) + self.assertEqual(self.lev.get_raw_score('abc', 'abc'), 0) + self.assertEqual(self.lev.get_raw_score('a', 'ab'), 1) + self.assertEqual(self.lev.get_raw_score('b', 'ab'), 1) + self.assertEqual(self.lev.get_raw_score('ac', 'abc'), 1) + self.assertEqual(self.lev.get_raw_score('abcdefg', 'xabxcdxxefxgx'), 6) + self.assertEqual(self.lev.get_raw_score('ab', 'a'), 1) + self.assertEqual(self.lev.get_raw_score('ab', 'b'), 1) + self.assertEqual(self.lev.get_raw_score('abc', 'ac'), 1) + self.assertEqual(self.lev.get_raw_score('xabxcdxxefxgx', 'abcdefg'), 6) + self.assertEqual(self.lev.get_raw_score('a', 'b'), 
1) + self.assertEqual(self.lev.get_raw_score('ab', 'ac'), 1) + self.assertEqual(self.lev.get_raw_score('ac', 'bc'), 1) + self.assertEqual(self.lev.get_raw_score('abc', 'axc'), 1) + self.assertEqual(self.lev.get_raw_score('xabxcdxxefxgx', '1ab2cd34ef5g6'), 6) + self.assertEqual(self.lev.get_raw_score('example', 'samples'), 3) + self.assertEqual(self.lev.get_raw_score('sturgeon', 'urgently'), 6) + self.assertEqual(self.lev.get_raw_score('levenshtein', 'frankenstein'), 6) + self.assertEqual(self.lev.get_raw_score('distance', 'difference'), 5) + self.assertEqual(self.lev.get_raw_score('java was neat', 'scala is great'), 7) + + def test_valid_input_sim_score(self): + self.assertEqual(self.lev.get_sim_score('a', ''), 1.0 - (1.0/1.0)) + self.assertEqual(self.lev.get_sim_score('', 'a'), 1.0 - (1.0/1.0)) + self.assertEqual(self.lev.get_sim_score('abc', ''), 1.0 - (3.0/3.0)) + self.assertEqual(self.lev.get_sim_score('', 'abc'), 1.0 - (3.0/3.0)) + self.assertEqual(self.lev.get_sim_score('', ''), 1.0) + self.assertEqual(self.lev.get_sim_score('a', 'a'), 1.0) + self.assertEqual(self.lev.get_sim_score('abc', 'abc'), 1.0) + self.assertEqual(self.lev.get_sim_score('a', 'ab'), 1.0 - (1.0/2.0)) + self.assertEqual(self.lev.get_sim_score('b', 'ab'), 1.0 - (1.0/2.0)) + self.assertEqual(self.lev.get_sim_score('ac', 'abc'), 1.0 - (1.0/3.0)) + self.assertEqual(self.lev.get_sim_score('abcdefg', 'xabxcdxxefxgx'), 1.0 - (6.0/13.0)) + self.assertEqual(self.lev.get_sim_score('ab', 'a'), 1.0 - (1.0/2.0)) + self.assertEqual(self.lev.get_sim_score('ab', 'b'), 1.0 - (1.0/2.0)) + self.assertEqual(self.lev.get_sim_score('abc', 'ac'), 1.0 - (1.0/3.0)) + self.assertEqual(self.lev.get_sim_score('xabxcdxxefxgx', 'abcdefg'), 1.0 - (6.0/13.0)) + self.assertEqual(self.lev.get_sim_score('a', 'b'), 1.0 - (1.0/1.0)) + self.assertEqual(self.lev.get_sim_score('ab', 'ac'), 1.0 - (1.0/2.0)) + self.assertEqual(self.lev.get_sim_score('ac', 'bc'), 1.0 - (1.0/2.0)) + self.assertEqual(self.lev.get_sim_score('abc', 
'axc'), 1.0 - (1.0/3.0)) + self.assertEqual(self.lev.get_sim_score('xabxcdxxefxgx', '1ab2cd34ef5g6'), 1.0 - (6.0/13.0)) + self.assertEqual(self.lev.get_sim_score('example', 'samples'), 1.0 - (3.0/7.0)) + self.assertEqual(self.lev.get_sim_score('sturgeon', 'urgently'), 1.0 - (6.0/8.0)) + self.assertEqual(self.lev.get_sim_score('levenshtein', 'frankenstein'), 1.0 - (6.0/12.0)) + self.assertEqual(self.lev.get_sim_score('distance', 'difference'), 1.0 - (5.0/10.0)) + self.assertEqual(self.lev.get_sim_score('java was neat', 'scala is great'), 1.0 - (7.0/14.0)) + + def test_valid_input_non_ascii_raw_score(self): + self.assertEqual(self.lev.get_raw_score('ác', 'áóc'), 1) + self.assertEqual(self.lev.get_raw_score(u'ác', u'áóc'), 1) + self.assertEqual(self.lev.get_raw_score(b'\xc3\xa1c', b'\xc3\xa1\xc3\xb3c'), 1) + + def test_valid_input_non_ascii_sim_score(self): + self.assertEqual(self.lev.get_sim_score('ác', 'áóc'), 1.0 - (1.0/3.0)) + self.assertEqual(self.lev.get_sim_score(u'ác', u'áóc'), 1.0 - (1.0/3.0)) + self.assertEqual(self.lev.get_sim_score(b'\xc3\xa1c', b'\xc3\xa1\xc3\xb3c'), 1.0 - (1.0/3.0)) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.lev.get_raw_score('a', None) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.lev.get_raw_score(None, 'b') + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.lev.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.lev.get_raw_score('MARHTA', 12.90) + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.lev.get_raw_score(12.90, 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.lev.get_raw_score(12.90, 12.90) + + @raises(TypeError) + def test_invalid_input1_sim_score(self): + self.lev.get_sim_score('a', None) + + @raises(TypeError) + def test_invalid_input2_sim_score(self): + self.lev.get_sim_score(None, 'b') + + @raises(TypeError) + def 
test_invalid_input3_sim_score(self): + self.lev.get_sim_score(None, None) + + @raises(TypeError) + def test_invalid_input4_sim_score(self): + self.lev.get_sim_score('MARHTA', 12.90) + + @raises(TypeError) + def test_invalid_input5_sim_score(self): + self.lev.get_sim_score(12.90, 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_sim_score(self): + self.lev.get_sim_score(12.90, 12.90) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_MongeElkan.py b/py_stringmatching/tests/test_sim_MongeElkan.py new file mode 100644 index 0000000..db45e03 --- /dev/null +++ b/py_stringmatching/tests/test_sim_MongeElkan.py @@ -0,0 +1,97 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.affine import Affine +from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler +from py_stringmatching.similarity_measure.monge_elkan import MongeElkan +from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch + +class MongeElkanTestCases(unittest.TestCase): + def setUp(self): + self.me = MongeElkan() + self.me_with_nw = MongeElkan(NeedlemanWunsch().get_raw_score) + self.affine_fn = Affine().get_raw_score + self.me_with_affine = MongeElkan(self.affine_fn) + + def test_get_sim_func(self): + self.assertEqual(self.me_with_affine.get_sim_func(), self.affine_fn) + + def test_set_sim_func(self): + fn1 = JaroWinkler().get_raw_score + fn2 = NeedlemanWunsch().get_raw_score + me = MongeElkan(sim_func=fn1) + self.assertEqual(me.get_sim_func(), fn1) + self.assertAlmostEqual(me.get_raw_score( + ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], + ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), + 0.8364448051948052) + self.assertEqual(me.set_sim_func(fn2), True) + self.assertEqual(me.get_sim_func(), fn2) + self.assertAlmostEqual(me.get_raw_score( + 
['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], + ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), + 2.0) + + def test_valid_input(self): + self.assertEqual(self.me.get_raw_score([''], ['']), 1.0) # need to check this + + self.assertEqual(self.me.get_raw_score([''], ['a']), 0.0) + self.assertEqual(self.me.get_raw_score(['a'], ['a']), 1.0) + + self.assertEqual(self.me.get_raw_score(['Niall'], ['Neal']), 0.8049999999999999) + self.assertEqual(self.me.get_raw_score(['Niall'], ['Njall']), 0.88) + self.assertEqual(self.me.get_raw_score( + ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], + ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), + 0.8364448051948052) + self.assertEqual(self.me_with_nw.get_raw_score( + ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], + ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), + 2.0) + self.assertEqual(self.me_with_affine.get_raw_score( + ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], + ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), + 2.25) + self.assertEqual(self.me.get_raw_score(['Niall'], ['Niel']), 0.8266666666666667) + self.assertEqual(self.me.get_raw_score(['Niall'], ['Nigel']), 0.7866666666666667) + self.assertEqual(self.me.get_raw_score([], ['Nigel']), 0.0) + + def test_valid_input_non_ascii(self): + self.assertEqual(self.me.get_raw_score([u'Nóáll'], [u'Neál']), 0.8049999999999999) + self.assertEqual(self.me.get_raw_score(['Nóáll'], ['Neál']), 0.8049999999999999) + self.assertEqual(self.me.get_raw_score([b'N\xc3\xb3\xc3\xa1ll'], [b'Ne\xc3\xa1l']), + 0.8049999999999999) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.me.get_raw_score(1, 1) + + @raises(TypeError) + def 
test_invalid_input2_raw_score(self): + self.me.get_raw_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.me.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.me.get_raw_score("temp", "temp") + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.me.get_raw_score(['temp'], 'temp') + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.me.get_raw_score(['a'], None) + + @raises(TypeError) + def test_invalid_input7_raw_score(self): + self.me.get_raw_score('temp', ['temp']) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_NeedlemanWunsch.py b/py_stringmatching/tests/test_sim_NeedlemanWunsch.py new file mode 100644 index 0000000..8bee9dc --- /dev/null +++ b/py_stringmatching/tests/test_sim_NeedlemanWunsch.py @@ -0,0 +1,80 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch + +class NeedlemanWunschTestCases(unittest.TestCase): + def setUp(self): + self.nw = NeedlemanWunsch() + self.nw_with_params1 = NeedlemanWunsch(0.0) + self.nw_with_params2 = NeedlemanWunsch(1.0, + sim_func=lambda s1, s2: (2 if s1 == s2 else -1)) + self.sim_func=lambda s1, s2: (1 if s1 == s2 else -1) + self.nw_with_params3 = NeedlemanWunsch(gap_cost=0.5, + sim_func=self.sim_func) + + def test_get_gap_cost(self): + self.assertEqual(self.nw_with_params3.get_gap_cost(), 0.5) + + def test_get_sim_func(self): + self.assertEqual(self.nw_with_params3.get_sim_func(), self.sim_func) + + def test_set_gap_cost(self): + nw = NeedlemanWunsch(gap_cost=0.5) + self.assertEqual(nw.get_gap_cost(), 0.5) + self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 2.0) + self.assertEqual(nw.set_gap_cost(0.7), True) + self.assertEqual(nw.get_gap_cost(), 0.7) + self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 
1.6000000000000001) + + def test_set_sim_func(self): + fn1 = lambda s1, s2: (int(1 if s1 == s2 else 0)) + fn2 = lambda s1, s2: (int(2 if s1 == s2 else -1)) + nw = NeedlemanWunsch(sim_func=fn1) + self.assertEqual(nw.get_sim_func(), fn1) + self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 1.0) + self.assertEqual(nw.set_sim_func(fn2), True) + self.assertEqual(nw.get_sim_func(), fn2) + self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 4.0) + + def test_valid_input(self): + self.assertEqual(self.nw.get_raw_score('dva', 'deeva'), 1.0) + self.assertEqual(self.nw_with_params1.get_raw_score('dva', 'deeve'), 2.0) + self.assertEqual(self.nw_with_params2.get_raw_score('dva', 'deeve'), 1.0) + self.assertEqual(self.nw_with_params3.get_raw_score('GCATGCUA', 'GATTACA'), + 2.5) + + def test_valid_input_non_ascii(self): + self.assertEqual(self.nw.get_raw_score(u'dva', u'dáóva'), 1.0) + self.assertEqual(self.nw.get_raw_score('dva', 'dáóva'), 1.0) + self.assertEqual(self.nw.get_raw_score('dva', b'd\xc3\xa1\xc3\xb3va'), 1.0) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.nw.get_raw_score('a', None) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.nw.get_raw_score(None, 'b') + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.nw.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.nw.get_raw_score(['a'], 'b') + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.nw.get_raw_score('a', ['b']) + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.nw.get_raw_score(['a'], ['b']) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_OverlapCoefficient.py b/py_stringmatching/tests/test_sim_OverlapCoefficient.py new file mode 100644 index 0000000..7a0c986 --- /dev/null +++ b/py_stringmatching/tests/test_sim_OverlapCoefficient.py @@ -0,0 +1,86 @@ +# coding=utf-8 + +from __future__ import unicode_literals + 
+import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient + +class OverlapCoefficientTestCases(unittest.TestCase): + def setUp(self): + self.oc = OverlapCoefficient() + + def test_valid_input_raw_score(self): + self.assertEqual(self.oc.get_raw_score([], []), 1.0) + self.assertEqual(self.oc.get_raw_score(['data', 'science'], ['data']), + 1.0 / min(2.0, 1.0)) + self.assertEqual(self.oc.get_raw_score(['data', 'science'], + ['science', 'good']), 1.0 / min(2.0, 3.0)) + self.assertEqual(self.oc.get_raw_score([], ['data']), 0) + self.assertEqual(self.oc.get_raw_score(['data', 'data', 'science'], + ['data', 'management']), 1.0 / min(3.0, 2.0)) + + def test_valid_input_raw_score_set_inp(self): + self.assertEqual(self.oc.get_raw_score(set(['data', 'science']), set(['data'])), + 1.0 / min(2.0, 1.0)) + + def test_valid_input_sim_score(self): + self.assertEqual(self.oc.get_sim_score([], []), 1.0) + self.assertEqual(self.oc.get_sim_score(['data', 'science'], ['data']), + 1.0 / min(2.0, 1.0)) + self.assertEqual(self.oc.get_sim_score(['data', 'science'], + ['science', 'good']), 1.0 / min(2.0, 3.0)) + self.assertEqual(self.oc.get_sim_score([], ['data']), 0) + self.assertEqual(self.oc.get_sim_score(['data', 'data', 'science'], + ['data', 'management']), 1.0 / min(3.0, 2.0)) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.oc.get_raw_score(['a'], None) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.oc.get_raw_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.oc.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.oc.get_raw_score(['MARHTA'], 'MARTHA') + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.oc.get_raw_score('MARHTA', ['MARTHA']) + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.oc.get_raw_score('MARTHA', 
'MARTHA') + + @raises(TypeError) + def test_invalid_input1_sim_score(self): + self.oc.get_sim_score(['a'], None) + + @raises(TypeError) + def test_invalid_input2_sim_score(self): + self.oc.get_sim_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input3_sim_score(self): + self.oc.get_sim_score(None, None) + + @raises(TypeError) + def test_invalid_input4_sim_score(self): + self.oc.get_sim_score(['MARHTA'], 'MARTHA') + + @raises(TypeError) + def test_invalid_input5_sim_score(self): + self.oc.get_sim_score('MARHTA', ['MARTHA']) + + @raises(TypeError) + def test_invalid_input6_sim_score(self): + self.oc.get_sim_score('MARTHA', 'MARTHA') \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_SmithWaterman.py b/py_stringmatching/tests/test_sim_SmithWaterman.py new file mode 100644 index 0000000..27981ba --- /dev/null +++ b/py_stringmatching/tests/test_sim_SmithWaterman.py @@ -0,0 +1,85 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman + +class SmithWatermanTestCases(unittest.TestCase): + def setUp(self): + self.sw = SmithWaterman() + self.sw_with_params1 = SmithWaterman(2.2) + self.sw_with_params2 = SmithWaterman(1, + sim_func=lambda s1, s2: (2 if s1 == s2 else -1)) + self.sw_with_params3 = SmithWaterman(gap_cost=1, + sim_func=lambda s1, s2: (int(1 if s1 == s2 else -1))) + self.sim_func=lambda s1, s2: (1.5 if s1 == s2 else 0.5) + self.sw_with_params4 = SmithWaterman(gap_cost=1.4, + sim_func=self.sim_func) + + def test_get_gap_cost(self): + self.assertEqual(self.sw_with_params4.get_gap_cost(), 1.4) + + def test_get_sim_func(self): + self.assertEqual(self.sw_with_params4.get_sim_func(), self.sim_func) + + def test_set_gap_cost(self): + sw = SmithWaterman(gap_cost=0.3) + self.assertEqual(sw.get_gap_cost(), 0.3) + self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.3999999999999999) + 
self.assertEqual(sw.set_gap_cost(0.7), True) + self.assertEqual(sw.get_gap_cost(), 0.7) + self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.0) + + def test_set_sim_func(self): + fn1 = lambda s1, s2: (int(1 if s1 == s2 else 0)) + fn2 = lambda s1, s2: (int(2 if s1 == s2 else -1)) + sw = SmithWaterman(sim_func=fn1) + self.assertEqual(sw.get_sim_func(), fn1) + self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.0) + self.assertEqual(sw.set_sim_func(fn2), True) + self.assertEqual(sw.get_sim_func(), fn2) + self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 4.0) + + def test_valid_input(self): + self.assertEqual(self.sw.get_raw_score('cat', 'hat'), 2.0) + self.assertEqual(self.sw_with_params1.get_raw_score('dva', 'deeve'), 1.0) + self.assertEqual(self.sw_with_params2.get_raw_score('dva', 'deeve'), 2.0) + self.assertEqual(self.sw_with_params3.get_raw_score('GCATGCU', 'GATTACA'), + 2.0) + self.assertEqual(self.sw_with_params4.get_raw_score('GCATAGCU', 'GATTACA'), + 6.5) + + def test_valid_input_non_ascii(self): + self.assertEqual(self.sw.get_raw_score(u'óát', u'cát'), 2.0) + self.assertEqual(self.sw.get_raw_score('óát', 'cát'), 2.0) + self.assertEqual(self.sw.get_raw_score(b'\xc3\xb3\xc3\xa1t', b'c\xc3\xa1t'), + 2.0) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.sw.get_raw_score('a', None) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.sw.get_raw_score(None, 'b') + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.sw.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.sw.get_raw_score('MARHTA', 12) + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.sw.get_raw_score(12, 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.sw.get_raw_score(12, 12) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_Soft_Tfidf.py b/py_stringmatching/tests/test_sim_Soft_Tfidf.py 
new file mode 100644 index 0000000..d93b41c --- /dev/null +++ b/py_stringmatching/tests/test_sim_Soft_Tfidf.py @@ -0,0 +1,117 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler +from py_stringmatching.similarity_measure.affine import Affine +from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf +from py_stringmatching.similarity_measure.jaro import Jaro + +class Soft_TfidfTestCases(unittest.TestCase): + def setUp(self): + self.soft_tfidf = SoftTfIdf() + self.corpus = [['a', 'b', 'a'], ['a', 'c'], ['a']] + self.non_ascii_corpus = [['á', 'b', 'á'], ['á', 'c'], ['á']] + self.soft_tfidf_with_params1 = SoftTfIdf(self.corpus, + sim_func=Jaro().get_raw_score, + threshold=0.8) + self.soft_tfidf_with_params2 = SoftTfIdf(self.corpus, + threshold=0.9) + self.soft_tfidf_with_params3 = SoftTfIdf([['x', 'y'], ['w'], ['q']]) + self.affine_fn = Affine().get_raw_score + self.soft_tfidf_with_params4 = SoftTfIdf(sim_func=self.affine_fn, threshold=0.6) + self.soft_tfidf_non_ascii = SoftTfIdf(self.non_ascii_corpus, + sim_func=Jaro().get_raw_score, + threshold=0.8) + + def test_get_corpus_list(self): + self.assertEqual(self.soft_tfidf_with_params1.get_corpus_list(), self.corpus) + + def test_get_sim_func(self): + self.assertEqual(self.soft_tfidf_with_params4.get_sim_func(), self.affine_fn) + + def test_get_threshold(self): + self.assertEqual(self.soft_tfidf_with_params4.get_threshold(), 0.6) + + def test_set_corpus_list(self): + corpus1 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']] + corpus2 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b'], ['c', 'a', 'b']] + soft_tfidf = SoftTfIdf(corpus_list=corpus1) + self.assertEqual(soft_tfidf.get_corpus_list(), corpus1) + self.assertAlmostEqual(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']), + 0.7999999999999999) + self.assertEqual(soft_tfidf.set_corpus_list(corpus2), True) + 
self.assertEqual(soft_tfidf.get_corpus_list(), corpus2) + self.assertAlmostEqual(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']), + 0.8320502943378437) + + def test_set_threshold(self): + soft_tfidf = SoftTfIdf(threshold=0.5) + self.assertEqual(soft_tfidf.get_threshold(), 0.5) + self.assertAlmostEqual(soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']), 0.8179128813519699) + self.assertEqual(soft_tfidf.set_threshold(0.7), True) + self.assertEqual(soft_tfidf.get_threshold(), 0.7) + self.assertAlmostEqual(soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']), 0.4811252243246882) + + def test_set_sim_func(self): + fn1 = JaroWinkler().get_raw_score + fn2 = Jaro().get_raw_score + soft_tfidf = SoftTfIdf(sim_func=fn1) + self.assertEqual(soft_tfidf.get_sim_func(), fn1) + self.assertAlmostEqual(soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']), 0.8612141515411919) + self.assertEqual(soft_tfidf.set_sim_func(fn2), True) + self.assertEqual(soft_tfidf.get_sim_func(), fn2) + self.assertAlmostEqual(soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']), 0.8179128813519699) + + def test_valid_input_raw_score(self): + self.assertEqual(self.soft_tfidf_with_params1.get_raw_score( + ['a', 'b', 'a'], ['a', 'c']), 0.17541160386140586) + self.assertEqual(self.soft_tfidf_with_params2.get_raw_score( + ['a', 'b', 'a'], ['a']), 0.5547001962252291) + self.assertEqual(self.soft_tfidf_with_params3.get_raw_score( + ['a', 'b', 'a'], ['a']), 0.0) + self.assertEqual(self.soft_tfidf_with_params4.get_raw_score( + ['aa', 'bb', 'a'], ['ab', 'ba']), + 0.81649658092772592) + self.assertEqual(self.soft_tfidf.get_raw_score( + ['a', 'b', 'a'], ['a', 'b', 'a']), 1.0) + self.assertEqual(self.soft_tfidf.get_raw_score([], ['a', 'b', 'a']), 0.0) + + def test_valid_input_non_ascii_raw_score(self): + self.assertEqual(self.soft_tfidf_non_ascii.get_raw_score( + [u'á', u'b', u'á'], [u'á', u'c']), 0.17541160386140586) + self.assertEqual(self.soft_tfidf_non_ascii.get_raw_score( + ['á', 'b', 'á'], 
['á', 'c']), 0.17541160386140586) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.soft_tfidf.get_raw_score(1, 1) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.soft_tfidf.get_raw_score(['a'], None) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.soft_tfidf.get_raw_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.soft_tfidf.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.soft_tfidf.get_raw_score(['MARHTA'], 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.soft_tfidf.get_raw_score('MARHTA', ['MARTHA']) + + @raises(TypeError) + def test_invalid_input7_raw_score(self): + self.soft_tfidf.get_raw_score('MARTHA', 'MARTHA') \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_Soundex.py b/py_stringmatching/tests/test_sim_Soundex.py new file mode 100644 index 0000000..3b56d89 --- /dev/null +++ b/py_stringmatching/tests/test_sim_Soundex.py @@ -0,0 +1,82 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.soundex import Soundex + +class SoundexTestCases(unittest.TestCase): + def setUp(self): + self.sdx = Soundex() + + def test_valid_input_raw_score(self): + self.assertEqual(self.sdx.get_raw_score('Robert', 'Rupert'), 1) + self.assertEqual(self.sdx.get_raw_score('Sue', 'S'), 1) + self.assertEqual(self.sdx.get_raw_score('robert', 'rupert'), 1) + self.assertEqual(self.sdx.get_raw_score('Gough', 'goff'), 0) + self.assertEqual(self.sdx.get_raw_score('gough', 'Goff'), 0) + self.assertEqual(self.sdx.get_raw_score('ali', 'a,,,li'), 1) + self.assertEqual(self.sdx.get_raw_score('Jawornicki', 'Yavornitzky'), 0) + self.assertEqual(self.sdx.get_raw_score('Robert', 'Robert'), 1) + + def test_valid_input_sim_score(self): + 
self.assertEqual(self.sdx.get_sim_score('Robert', 'Rupert'), 1) + self.assertEqual(self.sdx.get_sim_score('Sue', 'S'), 1) + self.assertEqual(self.sdx.get_sim_score('robert', 'rupert'), 1) + self.assertEqual(self.sdx.get_sim_score('Gough', 'goff'), 0) + self.assertEqual(self.sdx.get_sim_score('gough', 'Goff'), 0) + self.assertEqual(self.sdx.get_sim_score('ali', 'a,,,li'), 1) + self.assertEqual(self.sdx.get_sim_score('Jawornicki', 'Yavornitzky'), 0) + self.assertEqual(self.sdx.get_sim_score('Robert', 'Robert'), 1) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.sdx.get_raw_score('a', None) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.sdx.get_raw_score(None, 'b') + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.sdx.get_raw_score(None, None) + + @raises(ValueError) + def test_invalid_input4_raw_score(self): + self.sdx.get_raw_score('a', '') + + @raises(ValueError) + def test_invalid_input5_raw_score(self): + self.sdx.get_raw_score('', 'This is a long string') + + @raises(TypeError) + def test_invalid_input7_raw_score(self): + self.sdx.get_raw_score('xyz', ['']) + + @raises(TypeError) + def test_invalid_input1_sim_score(self): + self.sdx.get_sim_score('a', None) + + @raises(TypeError) + def test_invalid_input2_sim_score(self): + self.sdx.get_sim_score(None, 'b') + + @raises(TypeError) + def test_invalid_input3_sim_score(self): + self.sdx.get_sim_score(None, None) + + @raises(ValueError) + def test_invalid_input4_sim_score(self): + self.sdx.get_sim_score('a', '') + + @raises(ValueError) + def test_invalid_input5_sim_score(self): + self.sdx.get_sim_score('', 'This is a long string') + + @raises(TypeError) + def test_invalid_input7_sim_score(self): + self.sdx.get_sim_score('xyz', ['']) \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_Tfidf.py b/py_stringmatching/tests/test_sim_Tfidf.py new file mode 100644 index 0000000..7ff2af8 --- /dev/null +++ 
b/py_stringmatching/tests/test_sim_Tfidf.py @@ -0,0 +1,124 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.tfidf import TfIdf + +class TfidfTestCases(unittest.TestCase): + def setUp(self): + self.tfidf = TfIdf() + self.corpus = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']] + self.tfidf_with_params1 = TfIdf(self.corpus, True) + self.tfidf_with_params2 = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']]) + self.tfidf_with_params3 = TfIdf([['x', 'y'], ['w'], ['q']]) + + def test_get_corpus_list(self): + self.assertEqual(self.tfidf_with_params1.get_corpus_list(), self.corpus) + + def test_get_dampen(self): + self.assertEqual(self.tfidf_with_params1.get_dampen(), True) + + def test_set_corpus_list(self): + corpus1 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']] + corpus2 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b'], ['c', 'a', 'b']] + tfidf = TfIdf(corpus_list=corpus1) + self.assertEqual(tfidf.get_corpus_list(), corpus1) + self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.5495722661728765) + self.assertEqual(tfidf.set_corpus_list(corpus2), True) + self.assertEqual(tfidf.get_corpus_list(), corpus2) + self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.5692378887901467) + + def test_set_dampen(self): + tfidf = TfIdf(self.corpus, dampen=False) + self.assertEqual(tfidf.get_dampen(), False) + self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.7999999999999999) + self.assertEqual(tfidf.set_dampen(True), True) + self.assertEqual(tfidf.get_dampen(), True) + self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.5495722661728765) + + def test_valid_input_raw_score(self): + self.assertEqual(self.tfidf_with_params1.get_raw_score(['a', 'b', 'a'], ['a', 'c']), + 0.11166746710505392) + self.assertEqual(self.tfidf_with_params2.get_raw_score(['a', 'b', 'a'], ['a', 'c']), + 0.0) + 
self.assertEqual(self.tfidf_with_params2.get_raw_score(['a', 'b', 'a'], ['a']), + 0.0) + self.assertEqual(self.tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.0) + self.assertEqual(self.tfidf_with_params3.get_raw_score(['a', 'b', 'a'], ['a']), 0.0) + self.assertEqual(self.tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.0) + self.assertEqual(self.tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'b', 'a']), 1.0) + self.assertEqual(self.tfidf.get_raw_score([], ['a', 'b', 'a']), 0.0) + + def test_valid_input_sim_score(self): + self.assertEqual(self.tfidf_with_params1.get_sim_score(['a', 'b', 'a'], ['a', 'c']), + 0.11166746710505392) + self.assertEqual(self.tfidf_with_params2.get_sim_score(['a', 'b', 'a'], ['a', 'c']), + 0.0) + self.assertEqual(self.tfidf_with_params2.get_sim_score(['a', 'b', 'a'], ['a']), + 0.0) + self.assertEqual(self.tfidf.get_sim_score(['a', 'b', 'a'], ['a']), 0.0) + self.assertEqual(self.tfidf_with_params3.get_sim_score(['a', 'b', 'a'], ['a']), 0.0) + self.assertEqual(self.tfidf.get_sim_score(['a', 'b', 'a'], ['a']), 0.0) + self.assertEqual(self.tfidf.get_sim_score(['a', 'b', 'a'], ['a', 'b', 'a']), 1.0) + self.assertEqual(self.tfidf.get_sim_score([], ['a', 'b', 'a']), 0.0) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.tfidf.get_raw_score(1, 1) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.tfidf.get_raw_score(['a'], None) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.tfidf.get_raw_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.tfidf.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.tfidf.get_raw_score(['MARHTA'], 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.tfidf.get_raw_score('MARHTA', ['MARTHA']) + + @raises(TypeError) + def test_invalid_input7_raw_score(self): + self.tfidf.get_raw_score('MARTHA', 'MARTHA') + + @raises(TypeError) + def 
test_invalid_input1_sim_score(self): + self.tfidf.get_sim_score(1, 1) + + @raises(TypeError) + def test_invalid_input4_sim_score(self): + self.tfidf.get_sim_score(['a'], None) + + @raises(TypeError) + def test_invalid_input2_sim_score(self): + self.tfidf.get_sim_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input3_sim_score(self): + self.tfidf.get_sim_score(None, None) + + @raises(TypeError) + def test_invalid_input5_sim_score(self): + self.tfidf.get_sim_score(['MARHTA'], 'MARTHA') + + @raises(TypeError) + def test_invalid_input6_sim_score(self): + self.tfidf.get_sim_score('MARHTA', ['MARTHA']) + + @raises(TypeError) + def test_invalid_input7_sim_score(self): + self.tfidf.get_sim_score('MARTHA', 'MARTHA') \ No newline at end of file diff --git a/py_stringmatching/tests/test_sim_TverskyIndex.py b/py_stringmatching/tests/test_sim_TverskyIndex.py new file mode 100644 index 0000000..2f7b8ca --- /dev/null +++ b/py_stringmatching/tests/test_sim_TverskyIndex.py @@ -0,0 +1,151 @@ +# coding=utf-8 + +from __future__ import unicode_literals + +import math +import unittest + +from nose.tools import * + +from py_stringmatching.similarity_measure.tversky_index import TverskyIndex + +class TverskyIndexTestCases(unittest.TestCase): + def setUp(self): + self.tvi = TverskyIndex() + self.tvi_with_params1 = TverskyIndex(0.5, 0.5) + self.tvi_with_params2 = TverskyIndex(0.7, 0.8) + self.tvi_with_params3 = TverskyIndex(0.2, 0.4) + self.tvi_with_params4 = TverskyIndex(0.9, 0.8) + self.tvi_with_params5 = TverskyIndex(0.45, 0.85) + + def test_get_alpha(self): + self.assertEqual(self.tvi_with_params5.get_alpha(), 0.45) + + def test_get_beta(self): + self.assertEqual(self.tvi_with_params5.get_beta(), 0.85) + + def test_set_alpha(self): + tvi = TverskyIndex(alpha=0.3) + self.assertEqual(tvi.get_alpha(), 0.3) + self.assertAlmostEqual(tvi.get_raw_score(['data', 'science'], ['data']), + 0.7692307692307692) + self.assertEqual(tvi.set_alpha(0.7), True) + 
self.assertEqual(tvi.get_alpha(), 0.7) + self.assertAlmostEqual(tvi.get_raw_score(['data', 'science'], ['data']), + 0.5882352941176471) + + def test_set_beta(self): + tvi = TverskyIndex(beta=0.3) + self.assertEqual(tvi.get_beta(), 0.3) + self.assertAlmostEqual(tvi.get_raw_score(['data', 'science'], ['science', 'good']), + 0.5555555555555556) + self.assertEqual(tvi.set_beta(0.7), True) + self.assertEqual(tvi.get_beta(), 0.7) + self.assertAlmostEqual(tvi.get_raw_score(['data', 'science'], ['science', 'good']), + 0.45454545454545453) + + def test_valid_input_raw_score(self): + self.assertEqual(self.tvi_with_params1.get_raw_score(['data', 'science'], ['data']), + 1.0 / (1.0 + 0.5*1 + 0.5*0)) + self.assertEqual(self.tvi.get_raw_score(['data', 'science'], ['science', 'good']), + 1.0 / (1.0 + 0.5*1 + 0.5*1)) + self.assertEqual(self.tvi.get_raw_score([], ['data']), 0) + self.assertEqual(self.tvi_with_params2.get_raw_score(['data', 'data', 'science'], + ['data', 'management']), + 1.0 / (1.0 + 0.7*1 + 0.8*1)) + self.assertEqual(self.tvi_with_params3.get_raw_score(['data', 'management', 'science'], + ['data', 'data', 'science']), + 2.0 / (2.0 + 0.2*1 + 0)) + self.assertEqual(self.tvi.get_raw_score([], []), 1.0) + self.assertEqual(self.tvi_with_params4.get_raw_score(['a', 'b'], ['b', 'a']), 1.0) + self.assertEqual(self.tvi.get_raw_score(['a', 'b'], ['b', 'a']), 1.0) + self.assertEqual(self.tvi.get_raw_score(set([]), set([])), 1.0) + self.assertEqual(self.tvi_with_params5.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), + 3.0 / (3.0 + 0.45*1 + 0.85*4)) + + def test_valid_input_sim_score(self): + self.assertEqual(self.tvi_with_params1.get_sim_score(['data', 'science'], ['data']), + 1.0 / (1.0 + 0.5*1 + 0.5*0)) + self.assertEqual(self.tvi.get_sim_score(['data', 'science'], ['science', 'good']), + 1.0 / (1.0 + 0.5*1 + 0.5*1)) + self.assertEqual(self.tvi.get_sim_score([], ['data']), 0) + self.assertEqual(self.tvi_with_params2.get_sim_score(['data', 'data', 'science'], + 
['data', 'management']), + 1.0 / (1.0 + 0.7*1 + 0.8*1)) + self.assertEqual(self.tvi_with_params3.get_sim_score(['data', 'management', 'science'], + ['data', 'data', 'science']), + 2.0 / (2.0 + 0.2*1 + 0)) + self.assertEqual(self.tvi.get_sim_score([], []), 1.0) + self.assertEqual(self.tvi_with_params4.get_sim_score(['a', 'b'], ['b', 'a']), 1.0) + self.assertEqual(self.tvi.get_sim_score(['a', 'b'], ['b', 'a']), 1.0) + self.assertEqual(self.tvi.get_sim_score(set([]), set([])), 1.0) + self.assertEqual(self.tvi_with_params5.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), + 3.0 / (3.0 + 0.45*1 + 0.85*4)) + + @raises(TypeError) + def test_invalid_input1_raw_score(self): + self.tvi.get_raw_score(1, 1) + + @raises(TypeError) + def test_invalid_input2_raw_score(self): + self.tvi.get_raw_score(['a'], None) + + @raises(TypeError) + def test_invalid_input3_raw_score(self): + self.tvi.get_raw_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input4_raw_score(self): + self.tvi.get_raw_score(None, None) + + @raises(TypeError) + def test_invalid_input5_raw_score(self): + self.tvi.get_raw_score(None, 'MARHTA') + + @raises(TypeError) + def test_invalid_input6_raw_score(self): + self.tvi.get_raw_score('MARHTA', None) + + @raises(TypeError) + def test_invalid_input7_raw_score(self): + self.tvi.get_raw_score('MARHTA', 'MARTHA') + + @raises(TypeError) + def test_invalid_input1_sim_score(self): + self.tvi.get_sim_score(1, 1) + + @raises(TypeError) + def test_invalid_input2_sim_score(self): + self.tvi.get_sim_score(['a'], None) + + @raises(TypeError) + def test_invalid_input3_sim_score(self): + self.tvi.get_sim_score(None, ['b']) + + @raises(TypeError) + def test_invalid_input4_sim_score(self): + self.tvi.get_sim_score(None, None) + + @raises(TypeError) + def test_invalid_input5_sim_score(self): + self.tvi.get_sim_score(None, 'MARHTA') + + @raises(TypeError) + def test_invalid_input6_sim_score(self): + self.tvi.get_sim_score('MARHTA', None) + + @raises(TypeError) + def 
test_invalid_input7_sim_score(self): + self.tvi.get_sim_score('MARHTA', 'MARTHA') + + @raises(ValueError) + def test_invalid_input8(self): + tvi_invalid = TverskyIndex(0.5, -0.9) + + @raises(ValueError) + def test_invalid_input9(self): + tvi_invalid = TverskyIndex(-0.5, 0.9) + + @raises(ValueError) + def test_invalid_input10(self): + tvi_invalid = TverskyIndex(-0.5, -0.9) \ No newline at end of file diff --git a/py_stringmatching/tests/test_simfunctions.py b/py_stringmatching/tests/test_simfunctions.py deleted file mode 100644 index de58154..0000000 --- a/py_stringmatching/tests/test_simfunctions.py +++ /dev/null @@ -1,2011 +0,0 @@ -# coding=utf-8 - -from __future__ import unicode_literals - -import math -import unittest - -from nose.tools import * - - -# sequence based similarity measures -from py_stringmatching.similarity_measure.affine import Affine -from py_stringmatching.similarity_measure.bag_distance import BagDistance -from py_stringmatching.similarity_measure.editex import Editex -from py_stringmatching.similarity_measure.hamming_distance import HammingDistance -from py_stringmatching.similarity_measure.jaro import Jaro -from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler -from py_stringmatching.similarity_measure.levenshtein import Levenshtein -from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch -from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman -# token based similarity measures -from py_stringmatching.similarity_measure.cosine import Cosine -from py_stringmatching.similarity_measure.dice import Dice -from py_stringmatching.similarity_measure.jaccard import Jaccard -from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient -from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf -from py_stringmatching.similarity_measure.tfidf import TfIdf -from py_stringmatching.similarity_measure.tversky_index import TverskyIndex -# hybrid 
similarity measures -from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard -from py_stringmatching.similarity_measure.monge_elkan import MongeElkan -#phonetic similarity measures -from py_stringmatching.similarity_measure.soundex import Soundex - - -# ---------------------- sequence based similarity measures ---------------------- - - -class AffineTestCases(unittest.TestCase): - def setUp(self): - self.affine = Affine() - self.affine_with_params1 = Affine(gap_start=2, gap_continuation=0.5) - self.sim_func = lambda s1, s2: (int(1 if s1 == s2 else 0)) - self.affine_with_params2 = Affine(gap_continuation=0.2, sim_func=self.sim_func) - - def test_valid_input(self): - self.assertAlmostEqual(self.affine.get_raw_score('dva', 'deeva'), 1.5) - self.assertAlmostEqual(self.affine_with_params1.get_raw_score('dva', 'deeve'), -0.5) - self.assertAlmostEqual(self.affine_with_params2.get_raw_score('AAAGAATTCA', 'AAATCA'), - 4.4) - self.assertAlmostEqual(self.affine_with_params2.get_raw_score(' ', ' '), 1) - self.assertEqual(self.affine.get_raw_score('', 'deeva'), 0) - - def test_valid_input_non_ascii(self): - self.assertAlmostEqual(self.affine.get_raw_score(u'dva', u'dáóva'), 1.5) - self.assertAlmostEqual(self.affine.get_raw_score('dva', 'dáóva'), 1.5) - self.assertAlmostEqual(self.affine.get_raw_score('dva', b'd\xc3\xa1\xc3\xb3va'), 1.5) - - def test_get_gap_start(self): - self.assertEqual(self.affine_with_params1.get_gap_start(), 2) - - def test_get_gap_continuation(self): - self.assertEqual(self.affine_with_params2.get_gap_continuation(), 0.2) - - def test_get_sim_func(self): - self.assertEqual(self.affine_with_params2.get_sim_func(), self.sim_func) - - def test_set_gap_start(self): - af = Affine(gap_start=1) - self.assertEqual(af.get_gap_start(), 1) - self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.5) - self.assertEqual(af.set_gap_start(2), True) - self.assertEqual(af.get_gap_start(), 2) - 
self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 0.5) - - def test_set_gap_continuation(self): - af = Affine(gap_continuation=0.3) - self.assertEqual(af.get_gap_continuation(), 0.3) - self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.7) - self.assertEqual(af.set_gap_continuation(0.7), True) - self.assertEqual(af.get_gap_continuation(), 0.7) - self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.3) - - def test_set_sim_func(self): - fn1 = lambda s1, s2: (int(1 if s1 == s2 else 0)) - fn2 = lambda s1, s2: (int(2 if s1 == s2 else -1)) - af = Affine(sim_func=fn1) - self.assertEqual(af.get_sim_func(), fn1) - self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 1.5) - self.assertEqual(af.set_sim_func(fn2), True) - self.assertEqual(af.get_sim_func(), fn2) - self.assertAlmostEqual(af.get_raw_score('dva', 'deeva'), 4.5) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.affine.get_raw_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.affine.get_raw_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.affine.get_raw_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.affine.get_raw_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.affine.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.affine.get_raw_score(12.90, 12.90) - - -class BagDistanceTestCases(unittest.TestCase): - def setUp(self): - self.bd = BagDistance() - - def test_valid_input_raw_score(self): - self.assertEqual(self.bd.get_raw_score('a', ''), 1) - self.assertEqual(self.bd.get_raw_score('', 'a'), 1) - self.assertEqual(self.bd.get_raw_score('abc', ''), 3) - self.assertEqual(self.bd.get_raw_score('', 'abc'), 3) - self.assertEqual(self.bd.get_raw_score('', ''), 0) - self.assertEqual(self.bd.get_raw_score('a', 'a'), 0) - 
self.assertEqual(self.bd.get_raw_score('abc', 'abc'), 0) - self.assertEqual(self.bd.get_raw_score('a', 'ab'), 1) - self.assertEqual(self.bd.get_raw_score('b', 'ab'), 1) - self.assertEqual(self.bd.get_raw_score('ac', 'abc'), 1) - self.assertEqual(self.bd.get_raw_score('abcdefg', 'xabxcdxxefxgx'), 6) - self.assertEqual(self.bd.get_raw_score('ab', 'a'), 1) - self.assertEqual(self.bd.get_raw_score('ab', 'b'), 1) - self.assertEqual(self.bd.get_raw_score('abc', 'ac'), 1) - self.assertEqual(self.bd.get_raw_score('xabxcdxxefxgx', 'abcdefg'), 6) - self.assertEqual(self.bd.get_raw_score('a', 'b'), 1) - self.assertEqual(self.bd.get_raw_score('ab', 'ac'), 1) - self.assertEqual(self.bd.get_raw_score('ac', 'bc'), 1) - self.assertEqual(self.bd.get_raw_score('abc', 'axc'), 1) - self.assertEqual(self.bd.get_raw_score('xabxcdxxefxgx', '1ab2cd34ef5g6'), 6) - self.assertEqual(self.bd.get_raw_score('example', 'samples'), 2) - self.assertEqual(self.bd.get_raw_score('sturgeon', 'urgently'), 2) - self.assertEqual(self.bd.get_raw_score('bag_distance', 'frankenstein'), 6) - self.assertEqual(self.bd.get_raw_score('distance', 'difference'), 5) - self.assertEqual(self.bd.get_raw_score('java was neat', 'scala is great'), 6) - - def test_valid_input_sim_score(self): - self.assertEqual(self.bd.get_sim_score('a', ''), 0.0) - self.assertEqual(self.bd.get_sim_score('', 'a'), 0.0) - self.assertEqual(self.bd.get_sim_score('abc', ''), 0.0) - self.assertEqual(self.bd.get_sim_score('', 'abc'), 0.0) - self.assertEqual(self.bd.get_sim_score('', ''), 1.0) - self.assertEqual(self.bd.get_sim_score('a', 'a'), 1.0) - self.assertEqual(self.bd.get_sim_score('abc', 'abc'), 1.0) - self.assertEqual(self.bd.get_sim_score('a', 'ab'), 1.0 - (1.0/2.0)) - self.assertEqual(self.bd.get_sim_score('b', 'ab'), 1.0 - (1.0/2.0)) - self.assertEqual(self.bd.get_sim_score('ac', 'abc'), 1.0 - (1.0/3.0)) - self.assertEqual(self.bd.get_sim_score('abcdefg', 'xabxcdxxefxgx'), 1.0 - (6.0/13.0)) - 
self.assertEqual(self.bd.get_sim_score('ab', 'a'), 1.0 - (1.0/2.0)) - self.assertEqual(self.bd.get_sim_score('ab', 'b'), 1.0 - (1.0/2.0)) - self.assertEqual(self.bd.get_sim_score('abc', 'ac'), 1.0 - (1.0/3.0)) - self.assertEqual(self.bd.get_sim_score('xabxcdxxefxgx', 'abcdefg'), 1.0 - (6.0/13.0)) - self.assertEqual(self.bd.get_sim_score('a', 'b'), 0.0) - self.assertEqual(self.bd.get_sim_score('ab', 'ac'), 1.0 - (1.0/2.0)) - self.assertEqual(self.bd.get_sim_score('ac', 'bc'), 1.0 - (1.0/2.0)) - self.assertEqual(self.bd.get_sim_score('abc', 'axc'), 1.0 - (1.0/3.0)) - self.assertEqual(self.bd.get_sim_score('xabxcdxxefxgx', '1ab2cd34ef5g6'), 1.0 - (6.0/13.0)) - self.assertEqual(self.bd.get_sim_score('example', 'samples'), 1.0 - (2.0/7.0)) - self.assertEqual(self.bd.get_sim_score('sturgeon', 'urgently'), 1.0 - (2.0/8.0)) - self.assertEqual(self.bd.get_sim_score('bag_distance', 'frankenstein'), 1.0 - (6.0/12.0)) - self.assertEqual(self.bd.get_sim_score('distance', 'difference'), 1.0 - (5.0/10.0)) - self.assertEqual(self.bd.get_sim_score('java was neat', 'scala is great'), 1.0 - (6.0/14.0)) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.bd.get_raw_score('a', None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.bd.get_raw_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.bd.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.bd.get_raw_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.bd.get_raw_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.bd.get_raw_score(12.90, 12.90) - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.bd.get_sim_score('a', None) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.bd.get_sim_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - 
self.bd.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.bd.get_sim_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.bd.get_sim_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.bd.get_sim_score(12.90, 12.90) - - -class EditexTestCases(unittest.TestCase): - def setUp(self): - self.ed = Editex() - self.ed_with_params1 = Editex(match_cost=2) - self.ed_with_params2 = Editex(mismatch_cost=2) - self.ed_with_params3 = Editex(mismatch_cost=1) - self.ed_with_params4 = Editex(mismatch_cost=3, group_cost=2) - self.ed_with_params5 = Editex(mismatch_cost=3, group_cost=2, local=True) - self.ed_with_params6 = Editex(local=True) - - def test_get_match_cost(self): - self.assertEqual(self.ed_with_params1.get_match_cost(), 2) - - def test_get_group_cost(self): - self.assertEqual(self.ed_with_params4.get_group_cost(), 2) - - def test_get_mismatch_cost(self): - self.assertEqual(self.ed_with_params4.get_mismatch_cost(), 3) - - def test_get_local(self): - self.assertEqual(self.ed_with_params5.get_local(), True) - - def test_set_match_cost(self): - ed = Editex(match_cost=2) - self.assertEqual(ed.get_match_cost(), 2) - self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 12) - self.assertEqual(ed.set_match_cost(4), True) - self.assertEqual(ed.get_match_cost(), 4) - self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 14) - - def test_set_group_cost(self): - ed = Editex(group_cost=1) - self.assertEqual(ed.get_group_cost(), 1) - self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 3) - self.assertEqual(ed.set_group_cost(2), True) - self.assertEqual(ed.get_group_cost(), 2) - self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 4) - - def test_set_mismatch_cost(self): - ed = Editex(mismatch_cost=2) - self.assertEqual(ed.get_mismatch_cost(), 2) - self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 3) - 
self.assertEqual(ed.set_mismatch_cost(4), True) - self.assertEqual(ed.get_mismatch_cost(), 4) - self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 5) - - def test_set_local(self): - ed = Editex(local=False) - self.assertEqual(ed.get_local(), False) - self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 3) - self.assertEqual(ed.set_local(True), True) - self.assertEqual(ed.get_local(), True) - self.assertAlmostEqual(ed.get_raw_score('MARTHA', 'MARHTA'), 3) - - def test_valid_input_raw_score(self): - self.assertEqual(self.ed.get_raw_score('MARTHA', 'MARTHA'), 0) - self.assertEqual(self.ed.get_raw_score('MARTHA', 'MARHTA'), 3) - self.assertEqual(self.ed.get_raw_score('ALIE', 'ALI'), 1) - self.assertEqual(self.ed_with_params1.get_raw_score('ALIE', 'ALI'), 7) - self.assertEqual(self.ed_with_params2.get_raw_score('ALIE', 'ALIF'), 2) - self.assertEqual(self.ed_with_params3.get_raw_score('ALIE', 'ALIF'), 1) - self.assertEqual(self.ed_with_params4.get_raw_score('ALIP', 'ALIF'), 2) - self.assertEqual(self.ed_with_params4.get_raw_score('ALIe', 'ALIF'), 3) - self.assertEqual(self.ed_with_params5.get_raw_score('WALIW', 'HALIH'), 6) - self.assertEqual(self.ed_with_params6.get_raw_score('niall', 'nihal'), 2) - self.assertEqual(self.ed_with_params6.get_raw_score('nihal', 'niall'), 2) - self.assertEqual(self.ed_with_params6.get_raw_score('neal', 'nihl'), 3) - self.assertEqual(self.ed_with_params6.get_raw_score('nihl', 'neal'), 3) - self.assertEqual(self.ed.get_raw_score('', ''), 0) - self.assertEqual(self.ed.get_raw_score('', 'MARTHA'), 12) - self.assertEqual(self.ed.get_raw_score('MARTHA', ''), 12) - - def test_valid_input_sim_score(self): - self.assertEqual(self.ed.get_sim_score('MARTHA', 'MARTHA'), 1.0) - self.assertEqual(self.ed.get_sim_score('MARTHA', 'MARHTA'), 1.0 - (3.0/12.0)) - self.assertEqual(self.ed.get_sim_score('ALIE', 'ALI'), 1.0 - (1.0/8.0)) - self.assertEqual(self.ed_with_params1.get_sim_score('ALIE', 'ALI'), 1.0 - (7.0/8.0)) - 
self.assertEqual(self.ed_with_params2.get_sim_score('ALIE', 'ALIF'), 1.0 - (2.0/8.0)) - self.assertEqual(self.ed_with_params3.get_sim_score('ALIE', 'ALIF'), 1.0 - (1.0/4.0)) - self.assertEqual(self.ed_with_params4.get_sim_score('ALIP', 'ALIF'), 1.0 - (2.0/12.0)) - self.assertEqual(self.ed_with_params4.get_sim_score('ALIe', 'ALIF'), 1.0 - (3.0/12.0)) - self.assertEqual(self.ed_with_params5.get_sim_score('WALIW', 'HALIH'), 1.0 - (6.0/15.0)) - self.assertEqual(self.ed_with_params6.get_sim_score('niall', 'nihal'), 1.0 - (2.0/10.0)) - self.assertEqual(self.ed_with_params6.get_sim_score('nihal', 'niall'), 1.0 - (2.0/10.0)) - self.assertEqual(self.ed_with_params6.get_sim_score('neal', 'nihl'), 1.0 - (3.0/8.0)) - self.assertEqual(self.ed_with_params6.get_sim_score('nihl', 'neal'), 1.0 - (3.0/8.0)) - self.assertEqual(self.ed.get_sim_score('', ''), 1.0) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.ed.get_raw_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.ed.get_raw_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.ed.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.ed.get_raw_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.ed.get_raw_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.ed.get_raw_score(12.90, 12.90) - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.ed.get_sim_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.ed.get_sim_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.ed.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.ed.get_sim_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - 
self.ed.get_sim_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.ed.get_sim_score(12.90, 12.90) - - -class JaroTestCases(unittest.TestCase): - def setUp(self): - self.jaro = Jaro() - - def test_valid_input_raw_score(self): - # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance - self.assertAlmostEqual(self.jaro.get_raw_score('MARTHA', 'MARHTA'), - 0.9444444444444445) - self.assertAlmostEqual(self.jaro.get_raw_score('DWAYNE', 'DUANE'), - 0.8222222222222223) - self.assertAlmostEqual(self.jaro.get_raw_score('DIXON', 'DICKSONX'), - 0.7666666666666666) - self.assertEqual(self.jaro.get_raw_score('', 'deeva'), 0) - - def test_valid_input_sim_score(self): - self.assertAlmostEqual(self.jaro.get_sim_score('MARTHA', 'MARHTA'), - 0.9444444444444445) - self.assertAlmostEqual(self.jaro.get_sim_score('DWAYNE', 'DUANE'), - 0.8222222222222223) - self.assertAlmostEqual(self.jaro.get_sim_score('DIXON', 'DICKSONX'), - 0.7666666666666666) - self.assertEqual(self.jaro.get_sim_score('', 'deeva'), 0) - - def test_non_ascii_input_raw_score(self): - self.assertAlmostEqual(self.jaro.get_raw_score(u'MARTHA', u'MARHTA'), - 0.9444444444444445) - self.assertAlmostEqual(self.jaro.get_raw_score(u'László', u'Lsáló'), - 0.8777777777777779) - self.assertAlmostEqual(self.jaro.get_raw_score('László', 'Lsáló'), - 0.8777777777777779) - self.assertAlmostEqual(self.jaro.get_raw_score(b'L\xc3\xa1szl\xc3\xb3', - b'Ls\xc3\xa1l\xc3\xb3'), - 0.8777777777777779) - - def test_non_ascii_input_sim_score(self): - self.assertAlmostEqual(self.jaro.get_sim_score(u'MARTHA', u'MARHTA'), - 0.9444444444444445) - self.assertAlmostEqual(self.jaro.get_sim_score(u'László', u'Lsáló'), - 0.8777777777777779) - self.assertAlmostEqual(self.jaro.get_sim_score('László', 'Lsáló'), - 0.8777777777777779) - self.assertAlmostEqual(self.jaro.get_sim_score(b'L\xc3\xa1szl\xc3\xb3', - b'Ls\xc3\xa1l\xc3\xb3'), - 0.8777777777777779) - - @raises(TypeError) - def 
test_invalid_input1_raw_score(self): - self.jaro.get_raw_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.jaro.get_raw_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.jaro.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.jaro.get_raw_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.jaro.get_raw_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.jaro.get_raw_score(12.90, 12.90) - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.jaro.get_sim_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.jaro.get_sim_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.jaro.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.jaro.get_sim_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.jaro.get_sim_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.jaro.get_sim_score(12.90, 12.90) - - -class JaroWinklerTestCases(unittest.TestCase): - def setUp(self): - self.jw = JaroWinkler() - - def test_get_prefix_weight(self): - self.assertEqual(self.jw.get_prefix_weight(), 0.1) - - def test_set_prefix_weight(self): - jw = JaroWinkler(prefix_weight=0.15) - self.assertEqual(jw.get_prefix_weight(), 0.15) - self.assertAlmostEqual(jw.get_raw_score('MARTHA', 'MARHTA'), 0.9694444444444444) - self.assertEqual(jw.set_prefix_weight(0.25), True) - self.assertEqual(jw.get_prefix_weight(), 0.25) - self.assertAlmostEqual(jw.get_raw_score('MARTHA', 'MARHTA'), 0.9861111111111112) - - def test_valid_input_raw_score(self): - # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance - 
self.assertAlmostEqual(self.jw.get_raw_score('MARTHA', 'MARHTA'), - 0.9611111111111111) - self.assertAlmostEqual(self.jw.get_raw_score('DWAYNE', 'DUANE'), 0.84) - self.assertAlmostEqual(self.jw.get_raw_score('DIXON', 'DICKSONX'), - 0.8133333333333332) - - def test_valid_input_sim_score(self): - self.assertAlmostEqual(self.jw.get_sim_score('MARTHA', 'MARHTA'), - 0.9611111111111111) - self.assertAlmostEqual(self.jw.get_sim_score('DWAYNE', 'DUANE'), 0.84) - self.assertAlmostEqual(self.jw.get_sim_score('DIXON', 'DICKSONX'), - 0.8133333333333332) - - def test_non_ascii_input_raw_score(self): - self.assertAlmostEqual(self.jw.get_raw_score(u'MARTHA', u'MARHTA'), - 0.9611111111111111) - self.assertAlmostEqual(self.jw.get_raw_score(u'László', u'Lsáló'), - 0.8900000000000001) - self.assertAlmostEqual(self.jw.get_raw_score('László', 'Lsáló'), - 0.8900000000000001) - self.assertAlmostEqual(self.jw.get_raw_score(b'L\xc3\xa1szl\xc3\xb3', - b'Ls\xc3\xa1l\xc3\xb3'), - 0.8900000000000001) - - def test_non_ascii_input_sim_score(self): - self.assertAlmostEqual(self.jw.get_sim_score(u'MARTHA', u'MARHTA'), - 0.9611111111111111) - self.assertAlmostEqual(self.jw.get_sim_score(u'László', u'Lsáló'), - 0.8900000000000001) - self.assertAlmostEqual(self.jw.get_sim_score('László', 'Lsáló'), - 0.8900000000000001) - self.assertAlmostEqual(self.jw.get_sim_score(b'L\xc3\xa1szl\xc3\xb3', - b'Ls\xc3\xa1l\xc3\xb3'), - 0.8900000000000001) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.jw.get_raw_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.jw.get_raw_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.jw.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.jw.get_raw_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.jw.get_raw_score(12.90, 'MARTHA') - - @raises(TypeError) - def 
test_invalid_input6_raw_score(self): - self.jw.get_raw_score(12.90, 12.90) - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.jw.get_sim_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.jw.get_sim_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.jw.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.jw.get_sim_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.jw.get_sim_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.jw.get_sim_score(12.90, 12.90) - - -class LevenshteinTestCases(unittest.TestCase): - def setUp(self): - self.lev = Levenshtein() - - def test_valid_input_raw_score(self): - # http://oldfashionedsoftware.com/tag/levenshtein-distance/ - self.assertEqual(self.lev.get_raw_score('a', ''), 1) - self.assertEqual(self.lev.get_raw_score('', 'a'), 1) - self.assertEqual(self.lev.get_raw_score('abc', ''), 3) - self.assertEqual(self.lev.get_raw_score('', 'abc'), 3) - self.assertEqual(self.lev.get_raw_score('', ''), 0) - self.assertEqual(self.lev.get_raw_score('a', 'a'), 0) - self.assertEqual(self.lev.get_raw_score('abc', 'abc'), 0) - self.assertEqual(self.lev.get_raw_score('a', 'ab'), 1) - self.assertEqual(self.lev.get_raw_score('b', 'ab'), 1) - self.assertEqual(self.lev.get_raw_score('ac', 'abc'), 1) - self.assertEqual(self.lev.get_raw_score('abcdefg', 'xabxcdxxefxgx'), 6) - self.assertEqual(self.lev.get_raw_score('ab', 'a'), 1) - self.assertEqual(self.lev.get_raw_score('ab', 'b'), 1) - self.assertEqual(self.lev.get_raw_score('abc', 'ac'), 1) - self.assertEqual(self.lev.get_raw_score('xabxcdxxefxgx', 'abcdefg'), 6) - self.assertEqual(self.lev.get_raw_score('a', 'b'), 1) - self.assertEqual(self.lev.get_raw_score('ab', 'ac'), 1) - self.assertEqual(self.lev.get_raw_score('ac', 'bc'), 1) - 
self.assertEqual(self.lev.get_raw_score('abc', 'axc'), 1) - self.assertEqual(self.lev.get_raw_score('xabxcdxxefxgx', '1ab2cd34ef5g6'), 6) - self.assertEqual(self.lev.get_raw_score('example', 'samples'), 3) - self.assertEqual(self.lev.get_raw_score('sturgeon', 'urgently'), 6) - self.assertEqual(self.lev.get_raw_score('levenshtein', 'frankenstein'), 6) - self.assertEqual(self.lev.get_raw_score('distance', 'difference'), 5) - self.assertEqual(self.lev.get_raw_score('java was neat', 'scala is great'), 7) - - def test_valid_input_sim_score(self): - self.assertEqual(self.lev.get_sim_score('a', ''), 1.0 - (1.0/1.0)) - self.assertEqual(self.lev.get_sim_score('', 'a'), 1.0 - (1.0/1.0)) - self.assertEqual(self.lev.get_sim_score('abc', ''), 1.0 - (3.0/3.0)) - self.assertEqual(self.lev.get_sim_score('', 'abc'), 1.0 - (3.0/3.0)) - self.assertEqual(self.lev.get_sim_score('', ''), 1.0) - self.assertEqual(self.lev.get_sim_score('a', 'a'), 1.0) - self.assertEqual(self.lev.get_sim_score('abc', 'abc'), 1.0) - self.assertEqual(self.lev.get_sim_score('a', 'ab'), 1.0 - (1.0/2.0)) - self.assertEqual(self.lev.get_sim_score('b', 'ab'), 1.0 - (1.0/2.0)) - self.assertEqual(self.lev.get_sim_score('ac', 'abc'), 1.0 - (1.0/3.0)) - self.assertEqual(self.lev.get_sim_score('abcdefg', 'xabxcdxxefxgx'), 1.0 - (6.0/13.0)) - self.assertEqual(self.lev.get_sim_score('ab', 'a'), 1.0 - (1.0/2.0)) - self.assertEqual(self.lev.get_sim_score('ab', 'b'), 1.0 - (1.0/2.0)) - self.assertEqual(self.lev.get_sim_score('abc', 'ac'), 1.0 - (1.0/3.0)) - self.assertEqual(self.lev.get_sim_score('xabxcdxxefxgx', 'abcdefg'), 1.0 - (6.0/13.0)) - self.assertEqual(self.lev.get_sim_score('a', 'b'), 1.0 - (1.0/1.0)) - self.assertEqual(self.lev.get_sim_score('ab', 'ac'), 1.0 - (1.0/2.0)) - self.assertEqual(self.lev.get_sim_score('ac', 'bc'), 1.0 - (1.0/2.0)) - self.assertEqual(self.lev.get_sim_score('abc', 'axc'), 1.0 - (1.0/3.0)) - self.assertEqual(self.lev.get_sim_score('xabxcdxxefxgx', '1ab2cd34ef5g6'), 1.0 - (6.0/13.0)) - 
self.assertEqual(self.lev.get_sim_score('example', 'samples'), 1.0 - (3.0/7.0)) - self.assertEqual(self.lev.get_sim_score('sturgeon', 'urgently'), 1.0 - (6.0/8.0)) - self.assertEqual(self.lev.get_sim_score('levenshtein', 'frankenstein'), 1.0 - (6.0/12.0)) - self.assertEqual(self.lev.get_sim_score('distance', 'difference'), 1.0 - (5.0/10.0)) - self.assertEqual(self.lev.get_sim_score('java was neat', 'scala is great'), 1.0 - (7.0/14.0)) - - def test_valid_input_non_ascii_raw_score(self): - self.assertEqual(self.lev.get_raw_score('ác', 'áóc'), 1) - self.assertEqual(self.lev.get_raw_score(u'ác', u'áóc'), 1) - self.assertEqual(self.lev.get_raw_score(b'\xc3\xa1c', b'\xc3\xa1\xc3\xb3c'), 1) - - def test_valid_input_non_ascii_sim_score(self): - self.assertEqual(self.lev.get_sim_score('ác', 'áóc'), 1.0 - (1.0/3.0)) - self.assertEqual(self.lev.get_sim_score(u'ác', u'áóc'), 1.0 - (1.0/3.0)) - self.assertEqual(self.lev.get_sim_score(b'\xc3\xa1c', b'\xc3\xa1\xc3\xb3c'), 1.0 - (1.0/3.0)) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.lev.get_raw_score('a', None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.lev.get_raw_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.lev.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.lev.get_raw_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.lev.get_raw_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.lev.get_raw_score(12.90, 12.90) - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.lev.get_sim_score('a', None) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.lev.get_sim_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.lev.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - 
self.lev.get_sim_score('MARHTA', 12.90) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.lev.get_sim_score(12.90, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.lev.get_sim_score(12.90, 12.90) - - -class HammingDistanceTestCases(unittest.TestCase): - def setUp(self): - self.hd = HammingDistance() - - def test_valid_input_raw_score(self): - self.assertEqual(self.hd.get_raw_score('-789', 'john'), 4) - self.assertEqual(self.hd.get_raw_score('a', '*'), 1) - self.assertEqual(self.hd.get_raw_score('b', 'a'), 1) - self.assertEqual(self.hd.get_raw_score('abc', 'p q'), 3) - self.assertEqual(self.hd.get_raw_score('karolin', 'kathrin'), 3) - self.assertEqual(self.hd.get_raw_score('KARI', 'kari'), 4) - self.assertEqual(self.hd.get_raw_score('', ''), 0) - - def test_valid_input_sim_score(self): - self.assertEqual(self.hd.get_sim_score('-789', 'john'), 1.0 - (4.0/4.0)) - self.assertEqual(self.hd.get_sim_score('a', '*'), 1.0 - (1.0/1.0)) - self.assertEqual(self.hd.get_sim_score('b', 'a'), 1.0 - (1.0/1.0)) - self.assertEqual(self.hd.get_sim_score('abc', 'p q'), 1.0 - (3.0/3.0)) - self.assertEqual(self.hd.get_sim_score('karolin', 'kathrin'), 1.0 - (3.0/7.0)) - self.assertEqual(self.hd.get_sim_score('KARI', 'kari'), 1.0 - (4.0/4.0)) - self.assertEqual(self.hd.get_sim_score('', ''), 1.0) - - def test_valid_input_compatibility_raw_score(self): - self.assertEqual(self.hd.get_raw_score(u'karolin', u'kathrin'), 3) - self.assertEqual(self.hd.get_raw_score(u'', u''), 0) - # str_1 = u'foo'.encode(encoding='UTF-8', errors='strict') - # str_2 = u'bar'.encode(encoding='UTF-8', errors='strict') - # self.assertEqual(self.hd.get_raw_score(str_1, str_2), 3) # check with Ali - python 3 returns type error - # self.assertEqual(self.hd.get_raw_score(str_1, str_1), 0) # check with Ali - python 3 returns type error - - def test_valid_input_compatibility_sim_score(self): - self.assertEqual(self.hd.get_sim_score(u'karolin', u'kathrin'), 1.0 - 
(3.0/7.0)) - self.assertEqual(self.hd.get_sim_score(u'', u''), 1.0) - - def test_valid_input_non_ascii_raw_score(self): - self.assertEqual(self.hd.get_raw_score(u'ábó', u'áóó'), 1) - self.assertEqual(self.hd.get_raw_score('ábó', 'áóó'), 1) - self.assertEqual(self.hd.get_raw_score(b'\xc3\xa1b\xc3\xb3', - b'\xc3\xa1\xc3\xb3\xc3\xb3'), - 1) - - def test_valid_input_non_ascii_sim_score(self): - self.assertEqual(self.hd.get_sim_score(u'ábó', u'áóó'), 1.0 - (1.0/3.0)) - self.assertEqual(self.hd.get_sim_score('ábó', 'áóó'), 1.0 - (1.0/3.0)) - self.assertEqual(self.hd.get_sim_score(b'\xc3\xa1b\xc3\xb3', - b'\xc3\xa1\xc3\xb3\xc3\xb3'), - 1.0 - (1.0/3.0)) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.hd.get_raw_score('a', None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.hd.get_raw_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.hd.get_raw_score(None, None) - - @raises(ValueError) - def test_invalid_input4_raw_score(self): - self.hd.get_raw_score('a', '') - - @raises(ValueError) - def test_invalid_input5_raw_score(self): - self.hd.get_raw_score('', 'This is a long string') - - @raises(ValueError) - def test_invalid_input6_raw_score(self): - self.hd.get_raw_score('ali', 'alex') - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.hd.get_raw_score('MA', 12) - - @raises(TypeError) - def test_invalid_input8_raw_score(self): - self.hd.get_raw_score(12, 'MA') - - @raises(TypeError) - def test_invalid_input9_raw_score(self): - self.hd.get_raw_score(12, 12) - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.hd.get_sim_score('a', None) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.hd.get_sim_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.hd.get_sim_score(None, None) - - @raises(ValueError) - def test_invalid_input4_sim_score(self): - self.hd.get_sim_score('a', '') - - 
@raises(ValueError) - def test_invalid_input5_sim_score(self): - self.hd.get_sim_score('', 'This is a long string') - - @raises(ValueError) - def test_invalid_input6_sim_score(self): - self.hd.get_sim_score('ali', 'alex') - - @raises(TypeError) - def test_invalid_input7_sim_score(self): - self.hd.get_sim_score('MA', 12) - - @raises(TypeError) - def test_invalid_input8_sim_score(self): - self.hd.get_sim_score(12, 'MA') - - @raises(TypeError) - def test_invalid_input9_sim_score(self): - self.hd.get_sim_score(12, 12) - - -class NeedlemanWunschTestCases(unittest.TestCase): - def setUp(self): - self.nw = NeedlemanWunsch() - self.nw_with_params1 = NeedlemanWunsch(0.0) - self.nw_with_params2 = NeedlemanWunsch(1.0, - sim_func=lambda s1, s2: (2 if s1 == s2 else -1)) - self.sim_func=lambda s1, s2: (1 if s1 == s2 else -1) - self.nw_with_params3 = NeedlemanWunsch(gap_cost=0.5, - sim_func=self.sim_func) - - def test_get_gap_cost(self): - self.assertEqual(self.nw_with_params3.get_gap_cost(), 0.5) - - def test_get_sim_func(self): - self.assertEqual(self.nw_with_params3.get_sim_func(), self.sim_func) - - def test_set_gap_cost(self): - nw = NeedlemanWunsch(gap_cost=0.5) - self.assertEqual(nw.get_gap_cost(), 0.5) - self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 2.0) - self.assertEqual(nw.set_gap_cost(0.7), True) - self.assertEqual(nw.get_gap_cost(), 0.7) - self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 1.6000000000000001) - - def test_set_sim_func(self): - fn1 = lambda s1, s2: (int(1 if s1 == s2 else 0)) - fn2 = lambda s1, s2: (int(2 if s1 == s2 else -1)) - nw = NeedlemanWunsch(sim_func=fn1) - self.assertEqual(nw.get_sim_func(), fn1) - self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 1.0) - self.assertEqual(nw.set_sim_func(fn2), True) - self.assertEqual(nw.get_sim_func(), fn2) - self.assertAlmostEqual(nw.get_raw_score('dva', 'deeva'), 4.0) - - def test_valid_input(self): - self.assertEqual(self.nw.get_raw_score('dva', 'deeva'), 1.0) - 
self.assertEqual(self.nw_with_params1.get_raw_score('dva', 'deeve'), 2.0) - self.assertEqual(self.nw_with_params2.get_raw_score('dva', 'deeve'), 1.0) - self.assertEqual(self.nw_with_params3.get_raw_score('GCATGCUA', 'GATTACA'), - 2.5) - - def test_valid_input_non_ascii(self): - self.assertEqual(self.nw.get_raw_score(u'dva', u'dáóva'), 1.0) - self.assertEqual(self.nw.get_raw_score('dva', 'dáóva'), 1.0) - self.assertEqual(self.nw.get_raw_score('dva', b'd\xc3\xa1\xc3\xb3va'), 1.0) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.nw.get_raw_score('a', None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.nw.get_raw_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.nw.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.nw.get_raw_score(['a'], 'b') - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.nw.get_raw_score('a', ['b']) - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.nw.get_raw_score(['a'], ['b']) - - -class SmithWatermanTestCases(unittest.TestCase): - def setUp(self): - self.sw = SmithWaterman() - self.sw_with_params1 = SmithWaterman(2.2) - self.sw_with_params2 = SmithWaterman(1, - sim_func=lambda s1, s2: (2 if s1 == s2 else -1)) - self.sw_with_params3 = SmithWaterman(gap_cost=1, - sim_func=lambda s1, s2: (int(1 if s1 == s2 else -1))) - self.sim_func=lambda s1, s2: (1.5 if s1 == s2 else 0.5) - self.sw_with_params4 = SmithWaterman(gap_cost=1.4, - sim_func=self.sim_func) - - def test_get_gap_cost(self): - self.assertEqual(self.sw_with_params4.get_gap_cost(), 1.4) - - def test_get_sim_func(self): - self.assertEqual(self.sw_with_params4.get_sim_func(), self.sim_func) - - def test_set_gap_cost(self): - sw = SmithWaterman(gap_cost=0.3) - self.assertEqual(sw.get_gap_cost(), 0.3) - self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.3999999999999999) - 
self.assertEqual(sw.set_gap_cost(0.7), True) - self.assertEqual(sw.get_gap_cost(), 0.7) - self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.0) - - def test_set_sim_func(self): - fn1 = lambda s1, s2: (int(1 if s1 == s2 else 0)) - fn2 = lambda s1, s2: (int(2 if s1 == s2 else -1)) - sw = SmithWaterman(sim_func=fn1) - self.assertEqual(sw.get_sim_func(), fn1) - self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 2.0) - self.assertEqual(sw.set_sim_func(fn2), True) - self.assertEqual(sw.get_sim_func(), fn2) - self.assertAlmostEqual(sw.get_raw_score('dva', 'deeva'), 4.0) - - def test_valid_input(self): - self.assertEqual(self.sw.get_raw_score('cat', 'hat'), 2.0) - self.assertEqual(self.sw_with_params1.get_raw_score('dva', 'deeve'), 1.0) - self.assertEqual(self.sw_with_params2.get_raw_score('dva', 'deeve'), 2.0) - self.assertEqual(self.sw_with_params3.get_raw_score('GCATGCU', 'GATTACA'), - 2.0) - self.assertEqual(self.sw_with_params4.get_raw_score('GCATAGCU', 'GATTACA'), - 6.5) - - def test_valid_input_non_ascii(self): - self.assertEqual(self.sw.get_raw_score(u'óát', u'cát'), 2.0) - self.assertEqual(self.sw.get_raw_score('óát', 'cát'), 2.0) - self.assertEqual(self.sw.get_raw_score(b'\xc3\xb3\xc3\xa1t', b'c\xc3\xa1t'), - 2.0) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.sw.get_raw_score('a', None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.sw.get_raw_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.sw.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.sw.get_raw_score('MARHTA', 12) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.sw.get_raw_score(12, 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.sw.get_raw_score(12, 12) - - -class SoundexTestCases(unittest.TestCase): - def setUp(self): - self.sdx = Soundex() - - def test_valid_input_raw_score(self): - 
self.assertEqual(self.sdx.get_raw_score('Robert', 'Rupert'), 1) - self.assertEqual(self.sdx.get_raw_score('Sue', 'S'), 1) - self.assertEqual(self.sdx.get_raw_score('robert', 'rupert'), 1) - self.assertEqual(self.sdx.get_raw_score('Gough', 'goff'), 0) - self.assertEqual(self.sdx.get_raw_score('gough', 'Goff'), 0) - self.assertEqual(self.sdx.get_raw_score('ali', 'a,,,li'), 1) - self.assertEqual(self.sdx.get_raw_score('Jawornicki', 'Yavornitzky'), 0) - self.assertEqual(self.sdx.get_raw_score('Robert', 'Robert'), 1) - - def test_valid_input_sim_score(self): - self.assertEqual(self.sdx.get_sim_score('Robert', 'Rupert'), 1) - self.assertEqual(self.sdx.get_sim_score('Sue', 'S'), 1) - self.assertEqual(self.sdx.get_sim_score('robert', 'rupert'), 1) - self.assertEqual(self.sdx.get_sim_score('Gough', 'goff'), 0) - self.assertEqual(self.sdx.get_sim_score('gough', 'Goff'), 0) - self.assertEqual(self.sdx.get_sim_score('ali', 'a,,,li'), 1) - self.assertEqual(self.sdx.get_sim_score('Jawornicki', 'Yavornitzky'), 0) - self.assertEqual(self.sdx.get_sim_score('Robert', 'Robert'), 1) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.sdx.get_raw_score('a', None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.sdx.get_raw_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.sdx.get_raw_score(None, None) - - @raises(ValueError) - def test_invalid_input4_raw_score(self): - self.sdx.get_raw_score('a', '') - - @raises(ValueError) - def test_invalid_input5_raw_score(self): - self.sdx.get_raw_score('', 'This is a long string') - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.sdx.get_raw_score('xyz', ['']) - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.sdx.get_sim_score('a', None) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.sdx.get_sim_score(None, 'b') - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - 
self.sdx.get_sim_score(None, None) - - @raises(ValueError) - def test_invalid_input4_sim_score(self): - self.sdx.get_sim_score('a', '') - - @raises(ValueError) - def test_invalid_input5_sim_score(self): - self.sdx.get_sim_score('', 'This is a long string') - - @raises(TypeError) - def test_invalid_input7_sim_score(self): - self.sdx.get_sim_score('xyz', ['']) - - -# ---------------------- token based similarity measures ---------------------- - -# ---------------------- set based similarity measures ---------------------- -class OverlapCoefficientTestCases(unittest.TestCase): - def setUp(self): - self.oc = OverlapCoefficient() - - def test_valid_input_raw_score(self): - self.assertEqual(self.oc.get_raw_score([], []), 1.0) - self.assertEqual(self.oc.get_raw_score(['data', 'science'], ['data']), - 1.0 / min(2.0, 1.0)) - self.assertEqual(self.oc.get_raw_score(['data', 'science'], - ['science', 'good']), 1.0 / min(2.0, 3.0)) - self.assertEqual(self.oc.get_raw_score([], ['data']), 0) - self.assertEqual(self.oc.get_raw_score(['data', 'data', 'science'], - ['data', 'management']), 1.0 / min(3.0, 2.0)) - - def test_valid_input_raw_score_set_inp(self): - self.assertEqual(self.oc.get_raw_score(set(['data', 'science']), set(['data'])), - 1.0 / min(2.0, 1.0)) - - def test_valid_input_sim_score(self): - self.assertEqual(self.oc.get_sim_score([], []), 1.0) - self.assertEqual(self.oc.get_sim_score(['data', 'science'], ['data']), - 1.0 / min(2.0, 1.0)) - self.assertEqual(self.oc.get_sim_score(['data', 'science'], - ['science', 'good']), 1.0 / min(2.0, 3.0)) - self.assertEqual(self.oc.get_sim_score([], ['data']), 0) - self.assertEqual(self.oc.get_sim_score(['data', 'data', 'science'], - ['data', 'management']), 1.0 / min(3.0, 2.0)) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.oc.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.oc.get_raw_score(None, ['b']) - - @raises(TypeError) - def 
test_invalid_input3_raw_score(self): - self.oc.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.oc.get_raw_score(['MARHTA'], 'MARTHA') - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.oc.get_raw_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.oc.get_raw_score('MARTHA', 'MARTHA') - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.oc.get_sim_score(['a'], None) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.oc.get_sim_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.oc.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.oc.get_sim_score(['MARHTA'], 'MARTHA') - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.oc.get_sim_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.oc.get_sim_score('MARTHA', 'MARTHA') - - -class DiceTestCases(unittest.TestCase): - def setUp(self): - self.dice = Dice() - - def test_valid_input_raw_score(self): - self.assertEqual(self.dice.get_raw_score(['data', 'science'], ['data']), - 2 * 1.0 / 3.0) - self.assertEqual(self.dice.get_raw_score(['data', 'science'], ['science', 'good']), - 2 * 1.0 / 4.0) - self.assertEqual(self.dice.get_raw_score([], ['data']), 0) - self.assertEqual(self.dice.get_raw_score(['data', 'data', 'science'], - ['data', 'management']), 2 * 1.0 / 4.0) - self.assertEqual(self.dice.get_raw_score(['data', 'management'], - ['data', 'data', 'science']), 2 * 1.0 / 4.0) - self.assertEqual(self.dice.get_raw_score([], []), 1.0) - self.assertEqual(self.dice.get_raw_score(['a', 'b'], ['b', 'a']), 1.0) - self.assertEqual(self.dice.get_raw_score(set([]), set([])), 1.0) - self.assertEqual(self.dice.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), - 2 * 3.0 / 11.0) - - def 
test_valid_input_sim_score(self): - self.assertEqual(self.dice.get_sim_score(['data', 'science'], ['data']), - 2 * 1.0 / 3.0) - self.assertEqual(self.dice.get_sim_score(['data', 'science'], ['science', 'good']), - 2 * 1.0 / 4.0) - self.assertEqual(self.dice.get_sim_score([], ['data']), 0) - self.assertEqual(self.dice.get_sim_score(['data', 'data', 'science'], - ['data', 'management']), 2 * 1.0 / 4.0) - self.assertEqual(self.dice.get_sim_score(['data', 'management'], - ['data', 'data', 'science']), 2 * 1.0 / 4.0) - self.assertEqual(self.dice.get_sim_score([], []), 1.0) - self.assertEqual(self.dice.get_sim_score(['a', 'b'], ['b', 'a']), 1.0) - self.assertEqual(self.dice.get_sim_score(set([]), set([])), 1.0) - self.assertEqual(self.dice.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), - 2 * 3.0 / 11.0) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.dice.get_raw_score(1, 1) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.dice.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.dice.get_raw_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.dice.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.dice.get_raw_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.dice.get_raw_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.dice.get_raw_score('MARHTA', 'MARTHA') - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.dice.get_sim_score(1, 1) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.dice.get_sim_score(['a'], None) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.dice.get_sim_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.dice.get_sim_score(None, None) - - @raises(TypeError) - 
def test_invalid_input5_sim_score(self): - self.dice.get_sim_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.dice.get_sim_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input7_sim_score(self): - self.dice.get_sim_score('MARHTA', 'MARTHA') - - -class JaccardTestCases(unittest.TestCase): - def setUp(self): - self.jac = Jaccard() - - def test_valid_input_raw_score(self): - self.assertEqual(self.jac.get_raw_score(['data', 'science'], ['data']), - 1.0 / 2.0) - self.assertEqual(self.jac.get_raw_score(['data', 'science'], - ['science', 'good']), 1.0 / 3.0) - self.assertEqual(self.jac.get_raw_score([], ['data']), 0) - self.assertEqual(self.jac.get_raw_score(['data', 'data', 'science'], - ['data', 'management']), 1.0 / 3.0) - self.assertEqual(self.jac.get_raw_score(['data', 'management'], - ['data', 'data', 'science']), 1.0 / 3.0) - self.assertEqual(self.jac.get_raw_score([], []), 1.0) - self.assertEqual(self.jac.get_raw_score(set([]), set([])), 1.0) - self.assertEqual(self.jac.get_raw_score({1, 1, 2, 3, 4}, - {2, 3, 4, 5, 6, 7, 7, 8}), 3.0 / 8.0) - - def test_valid_input_sim_score(self): - self.assertEqual(self.jac.get_sim_score(['data', 'science'], ['data']), - 1.0 / 2.0) - self.assertEqual(self.jac.get_sim_score(['data', 'science'], - ['science', 'good']), 1.0 / 3.0) - self.assertEqual(self.jac.get_sim_score([], ['data']), 0) - self.assertEqual(self.jac.get_sim_score(['data', 'data', 'science'], - ['data', 'management']), 1.0 / 3.0) - self.assertEqual(self.jac.get_sim_score(['data', 'management'], - ['data', 'data', 'science']), 1.0 / 3.0) - self.assertEqual(self.jac.get_sim_score([], []), 1.0) - self.assertEqual(self.jac.get_sim_score(set([]), set([])), 1.0) - self.assertEqual(self.jac.get_sim_score({1, 1, 2, 3, 4}, - {2, 3, 4, 5, 6, 7, 7, 8}), 3.0 / 8.0) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.jac.get_raw_score(1, 1) - - @raises(TypeError) - def 
test_invalid_input2_raw_score(self): - self.jac.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.jac.get_raw_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.jac.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.jac.get_raw_score(['MARHTA'], 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.jac.get_raw_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.jac.get_raw_score('MARTHA', 'MARTHA') - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.jac.get_sim_score(1, 1) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.jac.get_sim_score(['a'], None) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.jac.get_sim_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.jac.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.jac.get_sim_score(['MARHTA'], 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.jac.get_sim_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input7_sim_score(self): - self.jac.get_sim_score('MARTHA', 'MARTHA') - - -class GeneralizedJaccardTestCases(unittest.TestCase): - def setUp(self): - self.gen_jac = GeneralizedJaccard() - self.jw_fn = JaroWinkler().get_raw_score - self.gen_jac_with_jw = GeneralizedJaccard(sim_func=self.jw_fn) - self.gen_jac_with_jw_08 = GeneralizedJaccard(sim_func=self.jw_fn, - threshold=0.8) - self.gen_jac_invalid = GeneralizedJaccard(sim_func=NeedlemanWunsch().get_raw_score, - threshold=0.8) - - def test_get_sim_func(self): - self.assertEqual(self.gen_jac_with_jw_08.get_sim_func(), self.jw_fn) - - def test_get_threshold(self): - self.assertEqual(self.gen_jac_with_jw_08.get_threshold(), 0.8) - - def 
test_set_threshold(self): - gj = GeneralizedJaccard(threshold=0.8) - self.assertEqual(gj.get_threshold(), 0.8) - self.assertAlmostEqual(gj.get_raw_score(['Niall'], ['Neal', 'Njall']), 0.43333333333333335) - self.assertEqual(gj.set_threshold(0.9), True) - self.assertEqual(gj.get_threshold(), 0.9) - self.assertAlmostEqual(gj.get_raw_score(['Niall'], ['Neal', 'Njall']), 0.0) - - def test_set_sim_func(self): - fn1 = JaroWinkler().get_raw_score - fn2 = Jaro().get_raw_score - gj = GeneralizedJaccard(sim_func=fn1) - self.assertEqual(gj.get_sim_func(), fn1) - self.assertAlmostEqual(gj.get_raw_score(['Niall'], ['Neal', 'Njall']), 0.44) - self.assertEqual(gj.set_sim_func(fn2), True) - self.assertEqual(gj.get_sim_func(), fn2) - self.assertAlmostEqual(gj.get_raw_score(['Niall'], ['Neal', 'Njall']), 0.43333333333333335) - - def test_valid_input_raw_score(self): - self.assertEqual(self.gen_jac.get_raw_score([''], ['']), 1.0) # need to check this - - self.assertEqual(self.gen_jac.get_raw_score([''], ['a']), 0.0) - self.assertEqual(self.gen_jac.get_raw_score(['a'], ['a']), 1.0) - - self.assertEqual(self.gen_jac.get_raw_score([], ['Nigel']), 0.0) - self.assertEqual(self.gen_jac.get_raw_score(['Niall'], ['Neal']), 0.7833333333333333) - self.assertEqual(self.gen_jac.get_raw_score(['Niall'], ['Njall', 'Neal']), 0.43333333333333335) - self.assertEqual(self.gen_jac.get_raw_score(['Niall'], ['Neal', 'Njall']), 0.43333333333333335) - self.assertEqual(self.gen_jac.get_raw_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.6800468975468975) - - self.assertEqual(self.gen_jac_with_jw.get_raw_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.7220003607503608) - 
self.assertEqual(self.gen_jac_with_jw.get_raw_score( - ['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.7075277777777778) - - self.assertEqual(self.gen_jac_with_jw_08.get_raw_score( - ['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.45810185185185187) - - def test_valid_input_sim_score(self): - self.assertEqual(self.gen_jac.get_sim_score([''], ['']), 1.0) # need to check this - - self.assertEqual(self.gen_jac.get_sim_score([''], ['a']), 0.0) - self.assertEqual(self.gen_jac.get_sim_score(['a'], ['a']), 1.0) - - self.assertEqual(self.gen_jac.get_sim_score([], ['Nigel']), 0.0) - self.assertEqual(self.gen_jac.get_sim_score(['Niall'], ['Neal']), 0.7833333333333333) - self.assertEqual(self.gen_jac.get_sim_score(['Niall'], ['Njall', 'Neal']), 0.43333333333333335) - self.assertEqual(self.gen_jac.get_sim_score(['Niall'], ['Neal', 'Njall']), 0.43333333333333335) - self.assertEqual(self.gen_jac.get_sim_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.6800468975468975) - - self.assertEqual(self.gen_jac_with_jw.get_sim_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.7220003607503608) - self.assertEqual(self.gen_jac_with_jw.get_sim_score( - ['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.7075277777777778) - - self.assertEqual(self.gen_jac_with_jw_08.get_sim_score( - ['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 
'of', 'Cal,', 'San', 'Deigo'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.45810185185185187) - - def test_valid_input_non_ascii_raw_score(self): - self.assertEqual(self.gen_jac.get_raw_score([u'Nóáll'], [u'Neál']), 0.7833333333333333) - self.assertEqual(self.gen_jac.get_raw_score(['Nóáll'], ['Neál']), 0.7833333333333333) - self.assertEqual(self.gen_jac.get_raw_score([b'N\xc3\xb3\xc3\xa1ll'], [b'Ne\xc3\xa1l']), - 0.7833333333333333) - - def test_valid_input_non_ascii_sim_score(self): - self.assertEqual(self.gen_jac.get_sim_score([u'Nóáll'], [u'Neál']), 0.7833333333333333) - self.assertEqual(self.gen_jac.get_sim_score(['Nóáll'], ['Neál']), 0.7833333333333333) - self.assertEqual(self.gen_jac.get_sim_score([b'N\xc3\xb3\xc3\xa1ll'], [b'Ne\xc3\xa1l']), - 0.7833333333333333) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.gen_jac.get_raw_score(1, 1) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.gen_jac.get_raw_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.gen_jac.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.gen_jac.get_raw_score("temp", "temp") - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.gen_jac.get_raw_score(['temp'], 'temp') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.gen_jac.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.gen_jac.get_raw_score('temp', ['temp']) - - @raises(ValueError) - def test_invalid_sim_measure(self): - self.gen_jac_invalid.get_raw_score( - ['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.gen_jac.get_sim_score(1, 1) - - @raises(TypeError) - def 
test_invalid_input2_sim_score(self): - self.gen_jac.get_sim_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.gen_jac.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.gen_jac.get_sim_score("temp", "temp") - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.gen_jac.get_sim_score(['temp'], 'temp') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.gen_jac.get_sim_score(['a'], None) - - @raises(TypeError) - def test_invalid_input7_sim_score(self): - self.gen_jac.get_sim_score('temp', ['temp']) - - @raises(ValueError) - def test_invalid_sim_measure_sim_score(self): - self.gen_jac_invalid.get_sim_score( - ['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) - - -class CosineTestCases(unittest.TestCase): - def setUp(self): - self.cos = Cosine() - - def test_valid_input_raw_score(self): - self.assertEqual(self.cos.get_raw_score(['data', 'science'], ['data']), 1.0 / (math.sqrt(2) * math.sqrt(1))) - self.assertEqual(self.cos.get_raw_score(['data', 'science'], ['science', 'good']), - 1.0 / (math.sqrt(2) * math.sqrt(2))) - self.assertEqual(self.cos.get_raw_score([], ['data']), 0.0) - self.assertEqual(self.cos.get_raw_score(['data', 'data', 'science'], ['data', 'management']), - 1.0 / (math.sqrt(2) * math.sqrt(2))) - self.assertEqual(self.cos.get_raw_score(['data', 'management'], ['data', 'data', 'science']), - 1.0 / (math.sqrt(2) * math.sqrt(2))) - self.assertEqual(self.cos.get_raw_score([], []), 1.0) - self.assertEqual(self.cos.get_raw_score(set([]), set([])), 1.0) - self.assertEqual(self.cos.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), - 3.0 / (math.sqrt(4) * math.sqrt(7))) - - def test_valid_input_sim_score(self): - self.assertEqual(self.cos.get_sim_score(['data', 'science'], ['data']), 1.0 / (math.sqrt(2) * 
math.sqrt(1))) - self.assertEqual(self.cos.get_sim_score(['data', 'science'], ['science', 'good']), - 1.0 / (math.sqrt(2) * math.sqrt(2))) - self.assertEqual(self.cos.get_sim_score([], ['data']), 0.0) - self.assertEqual(self.cos.get_sim_score(['data', 'data', 'science'], ['data', 'management']), - 1.0 / (math.sqrt(2) * math.sqrt(2))) - self.assertEqual(self.cos.get_sim_score(['data', 'management'], ['data', 'data', 'science']), - 1.0 / (math.sqrt(2) * math.sqrt(2))) - self.assertEqual(self.cos.get_sim_score([], []), 1.0) - self.assertEqual(self.cos.get_sim_score(set([]), set([])), 1.0) - self.assertEqual(self.cos.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), - 3.0 / (math.sqrt(4) * math.sqrt(7))) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.cos.get_raw_score(1, 1) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.cos.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.cos.get_raw_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.cos.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.cos.get_raw_score(['MARHTA'], 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.cos.get_raw_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.cos.get_raw_score('MARTHA', 'MARTHA') - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.cos.get_sim_score(1, 1) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.cos.get_sim_score(['a'], None) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.cos.get_sim_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.cos.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.cos.get_sim_score(['MARHTA'], 'MARTHA') - - 
@raises(TypeError) - def test_invalid_input6_sim_score(self): - self.cos.get_sim_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input7_sim_score(self): - self.cos.get_sim_score('MARTHA', 'MARTHA') - - -class TfidfTestCases(unittest.TestCase): - def setUp(self): - self.tfidf = TfIdf() - self.corpus = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']] - self.tfidf_with_params1 = TfIdf(self.corpus, True) - self.tfidf_with_params2 = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']]) - self.tfidf_with_params3 = TfIdf([['x', 'y'], ['w'], ['q']]) - - def test_get_corpus_list(self): - self.assertEqual(self.tfidf_with_params1.get_corpus_list(), self.corpus) - - def test_get_dampen(self): - self.assertEqual(self.tfidf_with_params1.get_dampen(), True) - - def test_set_corpus_list(self): - corpus1 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']] - corpus2 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b'], ['c', 'a', 'b']] - tfidf = TfIdf(corpus_list=corpus1) - self.assertEqual(tfidf.get_corpus_list(), corpus1) - self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.5495722661728765) - self.assertEqual(tfidf.set_corpus_list(corpus2), True) - self.assertEqual(tfidf.get_corpus_list(), corpus2) - self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.5692378887901467) - - def test_set_dampen(self): - tfidf = TfIdf(self.corpus, dampen=False) - self.assertEqual(tfidf.get_dampen(), False) - self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.7999999999999999) - self.assertEqual(tfidf.set_dampen(True), True) - self.assertEqual(tfidf.get_dampen(), True) - self.assertAlmostEqual(tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.5495722661728765) - - def test_valid_input_raw_score(self): - self.assertEqual(self.tfidf_with_params1.get_raw_score(['a', 'b', 'a'], ['a', 'c']), - 0.11166746710505392) - self.assertEqual(self.tfidf_with_params2.get_raw_score(['a', 'b', 'a'], ['a', 'c']), - 0.0) - 
self.assertEqual(self.tfidf_with_params2.get_raw_score(['a', 'b', 'a'], ['a']), - 0.0) - self.assertEqual(self.tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.0) - self.assertEqual(self.tfidf_with_params3.get_raw_score(['a', 'b', 'a'], ['a']), 0.0) - self.assertEqual(self.tfidf.get_raw_score(['a', 'b', 'a'], ['a']), 0.0) - self.assertEqual(self.tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'b', 'a']), 1.0) - self.assertEqual(self.tfidf.get_raw_score([], ['a', 'b', 'a']), 0.0) - - def test_valid_input_sim_score(self): - self.assertEqual(self.tfidf_with_params1.get_sim_score(['a', 'b', 'a'], ['a', 'c']), - 0.11166746710505392) - self.assertEqual(self.tfidf_with_params2.get_sim_score(['a', 'b', 'a'], ['a', 'c']), - 0.0) - self.assertEqual(self.tfidf_with_params2.get_sim_score(['a', 'b', 'a'], ['a']), - 0.0) - self.assertEqual(self.tfidf.get_sim_score(['a', 'b', 'a'], ['a']), 0.0) - self.assertEqual(self.tfidf_with_params3.get_sim_score(['a', 'b', 'a'], ['a']), 0.0) - self.assertEqual(self.tfidf.get_sim_score(['a', 'b', 'a'], ['a']), 0.0) - self.assertEqual(self.tfidf.get_sim_score(['a', 'b', 'a'], ['a', 'b', 'a']), 1.0) - self.assertEqual(self.tfidf.get_sim_score([], ['a', 'b', 'a']), 0.0) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.tfidf.get_raw_score(1, 1) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.tfidf.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.tfidf.get_raw_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.tfidf.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.tfidf.get_raw_score(['MARHTA'], 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.tfidf.get_raw_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.tfidf.get_raw_score('MARTHA', 'MARTHA') - - @raises(TypeError) - def 
test_invalid_input1_sim_score(self): - self.tfidf.get_sim_score(1, 1) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.tfidf.get_sim_score(['a'], None) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.tfidf.get_sim_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.tfidf.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.tfidf.get_sim_score(['MARHTA'], 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.tfidf.get_sim_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input7_sim_score(self): - self.tfidf.get_sim_score('MARTHA', 'MARTHA') - - -class TverskyIndexTestCases(unittest.TestCase): - def setUp(self): - self.tvi = TverskyIndex() - self.tvi_with_params1 = TverskyIndex(0.5, 0.5) - self.tvi_with_params2 = TverskyIndex(0.7, 0.8) - self.tvi_with_params3 = TverskyIndex(0.2, 0.4) - self.tvi_with_params4 = TverskyIndex(0.9, 0.8) - self.tvi_with_params5 = TverskyIndex(0.45, 0.85) - - def test_get_alpha(self): - self.assertEqual(self.tvi_with_params5.get_alpha(), 0.45) - - def test_get_beta(self): - self.assertEqual(self.tvi_with_params5.get_beta(), 0.85) - - def test_set_alpha(self): - tvi = TverskyIndex(alpha=0.3) - self.assertEqual(tvi.get_alpha(), 0.3) - self.assertAlmostEqual(tvi.get_raw_score(['data', 'science'], ['data']), - 0.7692307692307692) - self.assertEqual(tvi.set_alpha(0.7), True) - self.assertEqual(tvi.get_alpha(), 0.7) - self.assertAlmostEqual(tvi.get_raw_score(['data', 'science'], ['data']), - 0.5882352941176471) - - def test_set_beta(self): - tvi = TverskyIndex(beta=0.3) - self.assertEqual(tvi.get_beta(), 0.3) - self.assertAlmostEqual(tvi.get_raw_score(['data', 'science'], ['science', 'good']), - 0.5555555555555556) - self.assertEqual(tvi.set_beta(0.7), True) - self.assertEqual(tvi.get_beta(), 0.7) - self.assertAlmostEqual(tvi.get_raw_score(['data', 'science'], 
['science', 'good']), - 0.45454545454545453) - - def test_valid_input_raw_score(self): - self.assertEqual(self.tvi_with_params1.get_raw_score(['data', 'science'], ['data']), - 1.0 / (1.0 + 0.5*1 + 0.5*0)) - self.assertEqual(self.tvi.get_raw_score(['data', 'science'], ['science', 'good']), - 1.0 / (1.0 + 0.5*1 + 0.5*1)) - self.assertEqual(self.tvi.get_raw_score([], ['data']), 0) - self.assertEqual(self.tvi_with_params2.get_raw_score(['data', 'data', 'science'], - ['data', 'management']), - 1.0 / (1.0 + 0.7*1 + 0.8*1)) - self.assertEqual(self.tvi_with_params3.get_raw_score(['data', 'management', 'science'], - ['data', 'data', 'science']), - 2.0 / (2.0 + 0.2*1 + 0)) - self.assertEqual(self.tvi.get_raw_score([], []), 1.0) - self.assertEqual(self.tvi_with_params4.get_raw_score(['a', 'b'], ['b', 'a']), 1.0) - self.assertEqual(self.tvi.get_raw_score(['a', 'b'], ['b', 'a']), 1.0) - self.assertEqual(self.tvi.get_raw_score(set([]), set([])), 1.0) - self.assertEqual(self.tvi_with_params5.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), - 3.0 / (3.0 + 0.45*1 + 0.85*4)) - - def test_valid_input_sim_score(self): - self.assertEqual(self.tvi_with_params1.get_sim_score(['data', 'science'], ['data']), - 1.0 / (1.0 + 0.5*1 + 0.5*0)) - self.assertEqual(self.tvi.get_sim_score(['data', 'science'], ['science', 'good']), - 1.0 / (1.0 + 0.5*1 + 0.5*1)) - self.assertEqual(self.tvi.get_sim_score([], ['data']), 0) - self.assertEqual(self.tvi_with_params2.get_sim_score(['data', 'data', 'science'], - ['data', 'management']), - 1.0 / (1.0 + 0.7*1 + 0.8*1)) - self.assertEqual(self.tvi_with_params3.get_sim_score(['data', 'management', 'science'], - ['data', 'data', 'science']), - 2.0 / (2.0 + 0.2*1 + 0)) - self.assertEqual(self.tvi.get_sim_score([], []), 1.0) - self.assertEqual(self.tvi_with_params4.get_sim_score(['a', 'b'], ['b', 'a']), 1.0) - self.assertEqual(self.tvi.get_sim_score(['a', 'b'], ['b', 'a']), 1.0) - self.assertEqual(self.tvi.get_sim_score(set([]), set([])), 1.0) - 
self.assertEqual(self.tvi_with_params5.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), - 3.0 / (3.0 + 0.45*1 + 0.85*4)) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.tvi.get_raw_score(1, 1) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.tvi.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.tvi.get_raw_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.tvi.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.tvi.get_raw_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.tvi.get_raw_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.tvi.get_raw_score('MARHTA', 'MARTHA') - - @raises(TypeError) - def test_invalid_input1_sim_score(self): - self.tvi.get_sim_score(1, 1) - - @raises(TypeError) - def test_invalid_input2_sim_score(self): - self.tvi.get_sim_score(['a'], None) - - @raises(TypeError) - def test_invalid_input3_sim_score(self): - self.tvi.get_sim_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input4_sim_score(self): - self.tvi.get_sim_score(None, None) - - @raises(TypeError) - def test_invalid_input5_sim_score(self): - self.tvi.get_sim_score(None, 'MARHTA') - - @raises(TypeError) - def test_invalid_input6_sim_score(self): - self.tvi.get_sim_score('MARHTA', None) - - @raises(TypeError) - def test_invalid_input7_sim_score(self): - self.tvi.get_sim_score('MARHTA', 'MARTHA') - - @raises(ValueError) - def test_invalid_input8(self): - tvi_invalid = TverskyIndex(0.5, -0.9) - - @raises(ValueError) - def test_invalid_input9(self): - tvi_invalid = TverskyIndex(-0.5, 0.9) - - @raises(ValueError) - def test_invalid_input10(self): - tvi_invalid = TverskyIndex(-0.5, -0.9) - - -# ---------------------- bag based similarity measures ---------------------- -# class 
CosineTestCases(unittest.TestCase): -# def test_valid_input(self): -# NONQ_FROM = 'The quick brown fox jumped over the lazy dog.' -# NONQ_TO = 'That brown dog jumped over the fox.' -# self.assertEqual(cosine([], []), 1) # check-- done. both simmetrics, abydos return 1. -# self.assertEqual(cosine(['the', 'quick'], []), 0) -# self.assertEqual(cosine([], ['the', 'quick']), 0) -# self.assertAlmostEqual(cosine(whitespace(NONQ_TO), whitespace(NONQ_FROM)), -# 4/math.sqrt(9*7)) -# -# @raises(TypeError) -# def test_invalid_input1_raw_score(self): -# cosine(['a'], None) -# @raises(TypeError) -# def test_invalid_input2_raw_score(self): -# cosine(None, ['b']) -# @raises(TypeError) -# def test_invalid_input3_raw_score(self): -# cosine(None, None) - - -# ---------------------- hybrid similarity measure ---------------------- - -class Soft_TfidfTestCases(unittest.TestCase): - def setUp(self): - self.soft_tfidf = SoftTfIdf() - self.corpus = [['a', 'b', 'a'], ['a', 'c'], ['a']] - self.non_ascii_corpus = [['á', 'b', 'á'], ['á', 'c'], ['á']] - self.soft_tfidf_with_params1 = SoftTfIdf(self.corpus, - sim_func=Jaro().get_raw_score, - threshold=0.8) - self.soft_tfidf_with_params2 = SoftTfIdf(self.corpus, - threshold=0.9) - self.soft_tfidf_with_params3 = SoftTfIdf([['x', 'y'], ['w'], ['q']]) - self.affine_fn = Affine().get_raw_score - self.soft_tfidf_with_params4 = SoftTfIdf(sim_func=self.affine_fn, threshold=0.6) - self.soft_tfidf_non_ascii = SoftTfIdf(self.non_ascii_corpus, - sim_func=Jaro().get_raw_score, - threshold=0.8) - - def test_get_corpus_list(self): - self.assertEqual(self.soft_tfidf_with_params1.get_corpus_list(), self.corpus) - - def test_get_sim_func(self): - self.assertEqual(self.soft_tfidf_with_params4.get_sim_func(), self.affine_fn) - - def test_get_threshold(self): - self.assertEqual(self.soft_tfidf_with_params4.get_threshold(), 0.6) - - def test_set_corpus_list(self): - corpus1 = [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']] - corpus2 = [['a', 'b', 'a'], ['a', 'c'], 
['a'], ['b'], ['c', 'a', 'b']] - soft_tfidf = SoftTfIdf(corpus_list=corpus1) - self.assertEqual(soft_tfidf.get_corpus_list(), corpus1) - self.assertAlmostEqual(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']), - 0.7999999999999999) - self.assertEqual(soft_tfidf.set_corpus_list(corpus2), True) - self.assertEqual(soft_tfidf.get_corpus_list(), corpus2) - self.assertAlmostEqual(soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']), - 0.8320502943378437) - - def test_set_threshold(self): - soft_tfidf = SoftTfIdf(threshold=0.5) - self.assertEqual(soft_tfidf.get_threshold(), 0.5) - self.assertAlmostEqual(soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']), 0.8179128813519699) - self.assertEqual(soft_tfidf.set_threshold(0.7), True) - self.assertEqual(soft_tfidf.get_threshold(), 0.7) - self.assertAlmostEqual(soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']), 0.4811252243246882) - - def test_set_sim_func(self): - fn1 = JaroWinkler().get_raw_score - fn2 = Jaro().get_raw_score - soft_tfidf = SoftTfIdf(sim_func=fn1) - self.assertEqual(soft_tfidf.get_sim_func(), fn1) - self.assertAlmostEqual(soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']), 0.8612141515411919) - self.assertEqual(soft_tfidf.set_sim_func(fn2), True) - self.assertEqual(soft_tfidf.get_sim_func(), fn2) - self.assertAlmostEqual(soft_tfidf.get_raw_score(['ar', 'bfff', 'ab'], ['abcd']), 0.8179128813519699) - - def test_valid_input_raw_score(self): - self.assertEqual(self.soft_tfidf_with_params1.get_raw_score( - ['a', 'b', 'a'], ['a', 'c']), 0.17541160386140586) - self.assertEqual(self.soft_tfidf_with_params2.get_raw_score( - ['a', 'b', 'a'], ['a']), 0.5547001962252291) - self.assertEqual(self.soft_tfidf_with_params3.get_raw_score( - ['a', 'b', 'a'], ['a']), 0.0) - self.assertEqual(self.soft_tfidf_with_params4.get_raw_score( - ['aa', 'bb', 'a'], ['ab', 'ba']), - 0.81649658092772592) - self.assertEqual(self.soft_tfidf.get_raw_score( - ['a', 'b', 'a'], ['a', 'b', 'a']), 1.0) - 
self.assertEqual(self.soft_tfidf.get_raw_score([], ['a', 'b', 'a']), 0.0) - - def test_valid_input_non_ascii_raw_score(self): - self.assertEqual(self.soft_tfidf_non_ascii.get_raw_score( - [u'á', u'b', u'á'], [u'á', u'c']), 0.17541160386140586) - self.assertEqual(self.soft_tfidf_non_ascii.get_raw_score( - ['á', 'b', 'á'], ['á', 'c']), 0.17541160386140586) - - @raises(TypeError) - def test_invalid_input1_raw_score(self): - self.soft_tfidf.get_raw_score(1, 1) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.soft_tfidf.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.soft_tfidf.get_raw_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.soft_tfidf.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.soft_tfidf.get_raw_score(['MARHTA'], 'MARTHA') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.soft_tfidf.get_raw_score('MARHTA', ['MARTHA']) - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.soft_tfidf.get_raw_score('MARTHA', 'MARTHA') - - -class MongeElkanTestCases(unittest.TestCase): - def setUp(self): - self.me = MongeElkan() - self.me_with_nw = MongeElkan(NeedlemanWunsch().get_raw_score) - self.affine_fn = Affine().get_raw_score - self.me_with_affine = MongeElkan(self.affine_fn) - - def test_get_sim_func(self): - self.assertEqual(self.me_with_affine.get_sim_func(), self.affine_fn) - - def test_set_sim_func(self): - fn1 = JaroWinkler().get_raw_score - fn2 = NeedlemanWunsch().get_raw_score - me = MongeElkan(sim_func=fn1) - self.assertEqual(me.get_sim_func(), fn1) - self.assertAlmostEqual(me.get_raw_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.8364448051948052) - self.assertEqual(me.set_sim_func(fn2), True) - 
self.assertEqual(me.get_sim_func(), fn2) - self.assertAlmostEqual(me.get_raw_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 2.0) - - def test_valid_input(self): - self.assertEqual(self.me.get_raw_score([''], ['']), 1.0) # need to check this - - self.assertEqual(self.me.get_raw_score([''], ['a']), 0.0) - self.assertEqual(self.me.get_raw_score(['a'], ['a']), 1.0) - - self.assertEqual(self.me.get_raw_score(['Niall'], ['Neal']), 0.8049999999999999) - self.assertEqual(self.me.get_raw_score(['Niall'], ['Njall']), 0.88) - self.assertEqual(self.me.get_raw_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 0.8364448051948052) - self.assertEqual(self.me_with_nw.get_raw_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 2.0) - self.assertEqual(self.me_with_affine.get_raw_score( - ['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], - ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), - 2.25) - self.assertEqual(self.me.get_raw_score(['Niall'], ['Niel']), 0.8266666666666667) - self.assertEqual(self.me.get_raw_score(['Niall'], ['Nigel']), 0.7866666666666667) - self.assertEqual(self.me.get_raw_score([], ['Nigel']), 0.0) - - def test_valid_input_non_ascii(self): - self.assertEqual(self.me.get_raw_score([u'Nóáll'], [u'Neál']), 0.8049999999999999) - self.assertEqual(self.me.get_raw_score(['Nóáll'], ['Neál']), 0.8049999999999999) - self.assertEqual(self.me.get_raw_score([b'N\xc3\xb3\xc3\xa1ll'], [b'Ne\xc3\xa1l']), - 0.8049999999999999) - - @raises(TypeError) - def 
test_invalid_input1_raw_score(self): - self.me.get_raw_score(1, 1) - - @raises(TypeError) - def test_invalid_input2_raw_score(self): - self.me.get_raw_score(None, ['b']) - - @raises(TypeError) - def test_invalid_input3_raw_score(self): - self.me.get_raw_score(None, None) - - @raises(TypeError) - def test_invalid_input4_raw_score(self): - self.me.get_raw_score("temp", "temp") - - @raises(TypeError) - def test_invalid_input5_raw_score(self): - self.me.get_raw_score(['temp'], 'temp') - - @raises(TypeError) - def test_invalid_input6_raw_score(self): - self.me.get_raw_score(['a'], None) - - @raises(TypeError) - def test_invalid_input7_raw_score(self): - self.me.get_raw_score('temp', ['temp']) diff --git a/py_stringmatching/tests/test_tok_AlphabeticTokenizer.py b/py_stringmatching/tests/test_tok_AlphabeticTokenizer.py new file mode 100644 index 0000000..46adf94 --- /dev/null +++ b/py_stringmatching/tests/test_tok_AlphabeticTokenizer.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +import unittest +from nose.tools import * + +from py_stringmatching.tokenizer.alphabetic_tokenizer import AlphabeticTokenizer + +class AlphabeticTokenizerTestCases(unittest.TestCase): + def setUp(self): + self.al_tok = AlphabeticTokenizer() + self.al_tok_return_set = AlphabeticTokenizer(return_set=True) + + def test_alphabetic_tok_valid(self): + self.assertEqual(self.al_tok.tokenize(''), []) + self.assertEqual(self.al_tok.tokenize('99'), []) + self.assertEqual(self.al_tok.tokenize('hello'), ['hello']) + self.assertEqual(self.al_tok.tokenize('ab bc. cd##de ef09 bc fg ab.'), + ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab']) + self.assertEqual( + self.al_tok_return_set.tokenize('ab bc. 
cd##de ef09 bc fg ab.'), + ['ab', 'bc', 'cd', 'de', 'ef', 'fg']) + + def test_get_return_set(self): + self.assertEqual(self.al_tok.get_return_set(), False) + self.assertEqual(self.al_tok_return_set.get_return_set(), True) + + def test_set_return_set(self): + tok = AlphabeticTokenizer() + self.assertEqual(tok.get_return_set(), False) + self.assertEqual(tok.tokenize('ab bc. cd##de ef09 bc fg ab.'), + ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab']) + self.assertEqual(tok.set_return_set(True), True) + self.assertEqual(tok.get_return_set(), True) + self.assertEqual( + tok.tokenize('ab bc. cd##de ef09 bc fg ab.'), + ['ab', 'bc', 'cd', 'de', 'ef', 'fg']) + self.assertEqual(tok.set_return_set(False), True) + self.assertEqual(tok.get_return_set(), False) + self.assertEqual(tok.tokenize('ab bc. cd##de ef09 bc fg ab.'), + ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab']) + + @raises(TypeError) + def test_alphabetic_tok_invalid1(self): + self.al_tok.tokenize(None) + + @raises(TypeError) + def test_alphabetic_tok_invalid2(self): + self.al_tok.tokenize(99) \ No newline at end of file diff --git a/py_stringmatching/tests/test_tok_AlphanumericTokenizer.py b/py_stringmatching/tests/test_tok_AlphanumericTokenizer.py new file mode 100644 index 0000000..32ffa3e --- /dev/null +++ b/py_stringmatching/tests/test_tok_AlphanumericTokenizer.py @@ -0,0 +1,51 @@ +from __future__ import unicode_literals + +import unittest +from nose.tools import * + +from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer + +class AlphanumericTokenizerTestCases(unittest.TestCase): + def setUp(self): + self.alnum_tok = AlphanumericTokenizer() + self.alnum_tok_return_set = AlphanumericTokenizer(return_set=True) + + def test_alphanumeric_tok_valid(self): + self.assertEqual(self.alnum_tok.tokenize(''), []) + self.assertEqual(self.alnum_tok.tokenize('#$'), []) + self.assertEqual(self.alnum_tok.tokenize('hello99'), ['hello99']) + self.assertEqual( + 
self.alnum_tok.tokenize(',data9,(science), data9#.(integration).88!'), + ['data9', 'science', 'data9', 'integration', '88']) + self.assertEqual(self.alnum_tok_return_set.tokenize( + ',data9,(science), data9#.(integration).88!'), + ['data9', 'science', 'integration', '88']) + + def test_get_return_set(self): + self.assertEqual(self.alnum_tok.get_return_set(), False) + self.assertEqual(self.alnum_tok_return_set.get_return_set(), True) + + def test_set_return_set(self): + tok = AlphanumericTokenizer() + self.assertEqual(tok.get_return_set(), False) + self.assertEqual( + tok.tokenize(',data9,(science), data9#.(integration).88!'), + ['data9', 'science', 'data9', 'integration', '88']) + self.assertEqual(tok.set_return_set(True), True) + self.assertEqual(tok.get_return_set(), True) + self.assertEqual( + tok.tokenize(',data9,(science), data9#.(integration).88!'), + ['data9', 'science', 'integration', '88']) + self.assertEqual(tok.set_return_set(False), True) + self.assertEqual(tok.get_return_set(), False) + self.assertEqual( + tok.tokenize(',data9,(science), data9#.(integration).88!'), + ['data9', 'science', 'data9', 'integration', '88']) + + @raises(TypeError) + def test_alphanumeric_tok_invalid1(self): + self.alnum_tok.tokenize(None) + + @raises(TypeError) + def test_alphanumeric_tok_invalid2(self): + self.alnum_tok.tokenize(99) \ No newline at end of file diff --git a/py_stringmatching/tests/test_tok_DelimiterTokenizer.py b/py_stringmatching/tests/test_tok_DelimiterTokenizer.py new file mode 100644 index 0000000..7d5d9d2 --- /dev/null +++ b/py_stringmatching/tests/test_tok_DelimiterTokenizer.py @@ -0,0 +1,85 @@ +from __future__ import unicode_literals + +import unittest +from nose.tools import * + +from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer + +class DelimiterTokenizerTestCases(unittest.TestCase): + def setUp(self): + self.delim_tok1 = DelimiterTokenizer() + self.delim_tok2 = DelimiterTokenizer(set([','])) + self.delim_tok3 = 
DelimiterTokenizer(set(['*', '.'])) + self.delim_tok4 = DelimiterTokenizer(set(['..', 'ab'])) + self.delim_tok4_list = DelimiterTokenizer(['..', 'ab', '..']) + self.delim_tok4_return_set = DelimiterTokenizer(set(['..', 'ab']), + return_set=True) + + def test_delimiter_valid(self): + self.assertEqual(self.delim_tok1.tokenize('data science'), + ['data', 'science']) + self.assertEqual(self.delim_tok2.tokenize('data,science'), + ['data', 'science']) + self.assertEqual(self.delim_tok2.tokenize('data science'), + ['data science']) + self.assertEqual(self.delim_tok3.tokenize('ab cd*ef.*bb. gg.'), + ['ab cd', 'ef', 'bb', ' gg']) + self.assertEqual( + self.delim_tok4.tokenize('ab cd..efabbb....ggab cd..efabgh'), + [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) + self.assertEqual( + self.delim_tok4_list.tokenize('ab cd..efabbb....ggab cd..efabgh'), + [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) + self.assertEqual( + self.delim_tok4_return_set.tokenize( + 'ab cd..efabbb....ggab cd..efabgh'), + [' cd', 'ef', 'bb', 'gg', 'gh']) + + def test_get_return_set(self): + self.assertEqual(self.delim_tok4.get_return_set(), False) + self.assertEqual(self.delim_tok4_return_set.get_return_set(), True) + + def test_get_delim_set(self): + self.assertSetEqual(self.delim_tok1.get_delim_set(), {' '}) + self.assertSetEqual(self.delim_tok3.get_delim_set(), {'*', '.'}) + self.assertSetEqual(self.delim_tok4_list.get_delim_set(), {'..', 'ab'}) + + def test_set_return_set(self): + tok = DelimiterTokenizer(set(['..', 'ab'])) + self.assertEqual(tok.get_return_set(), False) + self.assertEqual( + tok.tokenize('ab cd..efabbb....ggab cd..efabgh'), + [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) + self.assertEqual(tok.set_return_set(True), True) + self.assertEqual(tok.get_return_set(), True) + self.assertEqual( + tok.tokenize('ab cd..efabbb....ggab cd..efabgh'), + [' cd', 'ef', 'bb', 'gg', 'gh']) + self.assertEqual(tok.set_return_set(False), True) + self.assertEqual(tok.get_return_set(), False) + 
self.assertEqual( + tok.tokenize('ab cd..efabbb....ggab cd..efabgh'), + [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) + + def test_set_delim_set(self): + tok = DelimiterTokenizer(['*', '.']) + self.assertSetEqual(tok.get_delim_set(), {'*', '.'}) + self.assertEqual(tok.tokenize('ab cd*ef.*bb. gg.'), + ['ab cd', 'ef', 'bb', ' gg']) + self.assertEqual(tok.set_delim_set({'..', 'ab'}), True) + self.assertSetEqual(tok.get_delim_set(), {'..', 'ab'}) + self.assertEqual( + tok.tokenize('ab cd..efabbb....ggab cd..efabgh'), + [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) + + @raises(TypeError) + def test_delimiter_invalid1(self): + invalid_delim_tok = DelimiterTokenizer(set([',', 10])) + + @raises(TypeError) + def test_delimiter_invalid2(self): + self.delim_tok1.tokenize(None) + + @raises(TypeError) + def test_delimiter_invalid3(self): + self.delim_tok1.tokenize(99) \ No newline at end of file diff --git a/py_stringmatching/tests/test_tok_QgramTokenizer.py b/py_stringmatching/tests/test_tok_QgramTokenizer.py new file mode 100644 index 0000000..634d921 --- /dev/null +++ b/py_stringmatching/tests/test_tok_QgramTokenizer.py @@ -0,0 +1,203 @@ +from __future__ import unicode_literals + +import unittest +from nose.tools import * + +from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer + +class QgramTokenizerTestCases(unittest.TestCase): + def setUp(self): + self.qg1_tok = QgramTokenizer(qval=1, padding=False) + self.qg2_tok = QgramTokenizer(padding=False) + self.qg2_tok_return_set = QgramTokenizer(padding=False,return_set=True) + self.qg3_tok = QgramTokenizer(qval=3, padding=False) + self.qg1_tok_wipad = QgramTokenizer(qval=1) + self.qg2_tok_wipad = QgramTokenizer() + self.qg2_tok_wipad_return_set = QgramTokenizer(return_set=True) + self.qg3_tok_wipad = QgramTokenizer(qval=3) + self.qg3_tok_wipad_diffpad = QgramTokenizer(qval=3,prefix_pad='^', + suffix_pad='!') + + def test_qgrams_valid(self): + self.assertEqual(self.qg2_tok.tokenize(''), []) + 
self.assertEqual(self.qg2_tok.tokenize('a'), []) + self.assertEqual(self.qg2_tok.tokenize('aa'), ['aa']) + self.assertEqual(self.qg2_tok.tokenize('database'), + ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']) + self.assertEqual(self.qg2_tok.tokenize('aabaabcdba'), + ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba']) + self.assertEqual(self.qg2_tok_return_set.tokenize('aabaabcdba'), + ['aa', 'ab', 'ba', 'bc', 'cd', 'db']) + self.assertEqual(self.qg1_tok.tokenize('d'), ['d']) + self.assertEqual(self.qg3_tok.tokenize('database'), + ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']) + + self.assertEqual(self.qg2_tok_wipad.tokenize(''), ['#$']) + self.assertEqual(self.qg2_tok_wipad.tokenize('a'), ['#a', 'a$']) + self.assertEqual(self.qg2_tok_wipad.tokenize('aa'), ['#a', 'aa', 'a$']) + self.assertEqual(self.qg2_tok_wipad.tokenize('database'), + ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) + self.assertEqual(self.qg2_tok_wipad.tokenize('aabaabcdba'), + ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba', 'a$']) + self.assertEqual(self.qg2_tok_wipad_return_set.tokenize('aabaabcdba'), + ['#a', 'aa', 'ab', 'ba', 'bc', 'cd', 'db', 'a$']) + self.assertEqual(self.qg1_tok_wipad.tokenize('d'), ['d']) + self.assertEqual(self.qg3_tok_wipad.tokenize('database'), + ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']) + + self.assertEqual(self.qg3_tok_wipad_diffpad.tokenize('database'), + ['^^d', '^da', 'dat', 'ata', 'tab', 'aba', 'bas', + 'ase', 'se!', 'e!!']) + + def test_get_return_set(self): + self.assertEqual(self.qg2_tok.get_return_set(), False) + self.assertEqual(self.qg2_tok_return_set.get_return_set(), True) + self.assertEqual(self.qg2_tok_wipad.get_return_set(), False) + self.assertEqual(self.qg2_tok_wipad_return_set.get_return_set(), True) + + + def test_get_qval(self): + self.assertEqual(self.qg2_tok.get_qval(), 2) + self.assertEqual(self.qg3_tok.get_qval(), 3) + self.assertEqual(self.qg2_tok_wipad.get_qval(), 2) + 
self.assertEqual(self.qg3_tok_wipad.get_qval(), 3) + + + def test_set_return_set(self): + tok = QgramTokenizer(padding=False) + self.assertEqual(tok.get_return_set(), False) + self.assertEqual(tok.tokenize('aabaabcdba'), + ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba']) + self.assertEqual(tok.set_return_set(True), True) + self.assertEqual(tok.get_return_set(), True) + self.assertEqual(tok.tokenize('aabaabcdba'), + ['aa', 'ab', 'ba', 'bc', 'cd', 'db']) + self.assertEqual(tok.set_return_set(False), True) + self.assertEqual(tok.get_return_set(), False) + self.assertEqual(tok.tokenize('aabaabcdba'), + ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba']) + tok = QgramTokenizer() + self.assertEqual(tok.get_return_set(), False) + self.assertEqual(tok.tokenize('aabaabcdba'), + ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba', 'a$']) + self.assertEqual(tok.set_return_set(True), True) + self.assertEqual(tok.get_return_set(), True) + self.assertEqual(tok.tokenize('aabaabcdba'), + ['#a', 'aa', 'ab', 'ba', 'bc', 'cd', 'db', 'a$']) + self.assertEqual(tok.set_return_set(False), True) + self.assertEqual(tok.get_return_set(), False) + self.assertEqual(tok.tokenize('aabaabcdba'), + ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba', 'a$']) + + + + def test_set_qval(self): + tok = QgramTokenizer(padding=False) + self.assertEqual(tok.get_qval(), 2) + self.assertEqual(tok.tokenize('database'), + ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']) + self.assertEqual(tok.set_qval(3), True) + self.assertEqual(tok.get_qval(), 3) + self.assertEqual(tok.tokenize('database'), + ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']) + + tok = QgramTokenizer() + self.assertEqual(tok.get_qval(), 2) + self.assertEqual(tok.tokenize('database'), + ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) + self.assertEqual(tok.set_qval(3), True) + self.assertEqual(tok.get_qval(), 3) + self.assertEqual(tok.tokenize('database'), + ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 
'se$', 'e$$']) + + def test_set_padding(self): + tok = QgramTokenizer() + self.assertEqual(tok.get_padding(), True) + self.assertEqual(tok.tokenize('database'), + ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) + tok.set_padding(False) + self.assertEqual(tok.get_padding(), False) + self.assertEqual(tok.tokenize('database'), + ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']) + + def test_set_prefix_pad(self): + tok = QgramTokenizer() + self.assertEqual(tok.get_prefix_pad(), '#') + self.assertEqual(tok.tokenize('database'), + ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) + tok.set_prefix_pad('^') + self.assertEqual(tok.get_prefix_pad(), '^') + self.assertEqual(tok.tokenize('database'), + ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) + + def test_set_suffix_pad(self): + tok = QgramTokenizer() + self.assertEqual(tok.get_suffix_pad(), '$') + self.assertEqual(tok.tokenize('database'), + ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) + tok.set_suffix_pad('!') + self.assertEqual(tok.get_suffix_pad(), '!') + self.assertEqual(tok.tokenize('database'), + ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']) + + @raises(TypeError) + def test_qgrams_none(self): + self.qg2_tok.tokenize(None) + + @raises(AssertionError) + def test_qgrams_invalid1(self): + invalid_qg_tok = QgramTokenizer(0) + + @raises(TypeError) + def test_qgrams_invalid2(self): + self.qg2_tok.tokenize(99) + + @raises(AssertionError) + def test_set_qval_invalid(self): + qg_tok = QgramTokenizer() + qg_tok.set_qval(0) + + @raises(AssertionError) + def test_padding_invalid(self): + _ = QgramTokenizer(padding=10) + + @raises(AssertionError) + def test_set_padding_invalid(self): + qg = QgramTokenizer() + qg.set_padding(10) + + @raises(AssertionError) + def test_prefixpad_invalid1(self): + _ = QgramTokenizer(prefix_pad=10) + + @raises(AssertionError) + def test_prefixpad_invalid2(self): + _ = QgramTokenizer(prefix_pad="###") + + @raises(AssertionError) + def 
test_set_prefix_pad_invalid1(self): + qg = QgramTokenizer() + qg.set_prefix_pad(10) + + @raises(AssertionError) + def test_set_prefix_pad_invalid2(self): + qg = QgramTokenizer() + qg.set_prefix_pad('###') + + @raises(AssertionError) + def test_suffixpad_invalid1(self): + _ = QgramTokenizer(suffix_pad=10) + + @raises(AssertionError) + def test_suffixpad_invalid2(self): + _ = QgramTokenizer(suffix_pad="###") + + @raises(AssertionError) + def test_set_suffix_pad_invalid1(self): + qg = QgramTokenizer() + qg.set_suffix_pad(10) + + @raises(AssertionError) + def test_set_suffix_pad_invalid2(self): + qg = QgramTokenizer() + qg.set_suffix_pad('###') \ No newline at end of file diff --git a/py_stringmatching/tests/test_tok_WhitespaceTokenizer.py b/py_stringmatching/tests/test_tok_WhitespaceTokenizer.py new file mode 100644 index 0000000..d4e968f --- /dev/null +++ b/py_stringmatching/tests/test_tok_WhitespaceTokenizer.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import unittest +from nose.tools import * + +from py_stringmatching.tokenizer.whitespace_tokenizer import WhitespaceTokenizer + +class WhitespaceTokenizerTestCases(unittest.TestCase): + def setUp(self): + self.ws_tok = WhitespaceTokenizer() + self.ws_tok_return_set = WhitespaceTokenizer(return_set=True) + + def test_whitespace_tok_valid(self): + self.assertEqual(self.ws_tok.tokenize('data science'), + ['data', 'science']) + self.assertEqual(self.ws_tok.tokenize('data science'), + ['data', 'science']) + self.assertEqual(self.ws_tok.tokenize('data science'), + ['data', 'science']) + self.assertEqual(self.ws_tok.tokenize('data\tscience'), + ['data', 'science']) + self.assertEqual(self.ws_tok.tokenize('data\nscience'), + ['data', 'science']) + self.assertEqual(self.ws_tok.tokenize('ab cd ab bb cd db'), + ['ab', 'cd', 'ab', 'bb', 'cd', 'db']) + self.assertEqual(self.ws_tok_return_set.tokenize('ab cd ab bb cd db'), + ['ab', 'cd', 'bb', 'db']) + + def test_get_return_set(self): + 
self.assertEqual(self.ws_tok.get_return_set(), False) + self.assertEqual(self.ws_tok_return_set.get_return_set(), True) + + def test_set_return_set(self): + tok = WhitespaceTokenizer() + self.assertEqual(tok.get_return_set(), False) + self.assertEqual(tok.tokenize('ab cd ab bb cd db'), + ['ab', 'cd', 'ab', 'bb', 'cd', 'db']) + self.assertEqual(tok.set_return_set(True), True) + self.assertEqual(tok.get_return_set(), True) + self.assertEqual(tok.tokenize('ab cd ab bb cd db'), + ['ab', 'cd', 'bb', 'db']) + self.assertEqual(tok.set_return_set(False), True) + self.assertEqual(tok.get_return_set(), False) + self.assertEqual(tok.tokenize('ab cd ab bb cd db'), + ['ab', 'cd', 'ab', 'bb', 'cd', 'db']) + + def test_get_delim_set(self): + self.assertSetEqual(self.ws_tok.get_delim_set(), {' ', '\t', '\n'}) + + @raises(TypeError) + def test_whitespace_tok_invalid1(self): + self.ws_tok.tokenize(None) + + @raises(TypeError) + def test_whitespace_tok_invalid2(self): + self.ws_tok.tokenize(99) + + @raises(AttributeError) + def test_set_delim_set(self): + self.ws_tok.set_delim_set({'*', '.'}) \ No newline at end of file diff --git a/py_stringmatching/tests/test_tokenizers.py b/py_stringmatching/tests/test_tokenizers.py deleted file mode 100644 index ff10a36..0000000 --- a/py_stringmatching/tests/test_tokenizers.py +++ /dev/null @@ -1,432 +0,0 @@ -from __future__ import unicode_literals - -import unittest -from nose.tools import * - -from py_stringmatching.tokenizer.alphabetic_tokenizer import AlphabeticTokenizer -from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer -from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer -from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer -from py_stringmatching.tokenizer.whitespace_tokenizer import WhitespaceTokenizer - - -class QgramTokenizerTestCases(unittest.TestCase): - def setUp(self): - self.qg1_tok = QgramTokenizer(qval=1, padding=False) - self.qg2_tok = 
QgramTokenizer(padding=False) - self.qg2_tok_return_set = QgramTokenizer(padding=False,return_set=True) - self.qg3_tok = QgramTokenizer(qval=3, padding=False) - self.qg1_tok_wipad = QgramTokenizer(qval=1) - self.qg2_tok_wipad = QgramTokenizer() - self.qg2_tok_wipad_return_set = QgramTokenizer(return_set=True) - self.qg3_tok_wipad = QgramTokenizer(qval=3) - self.qg3_tok_wipad_diffpad = QgramTokenizer(qval=3,prefix_pad='^', - suffix_pad='!') - - def test_qgrams_valid(self): - self.assertEqual(self.qg2_tok.tokenize(''), []) - self.assertEqual(self.qg2_tok.tokenize('a'), []) - self.assertEqual(self.qg2_tok.tokenize('aa'), ['aa']) - self.assertEqual(self.qg2_tok.tokenize('database'), - ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']) - self.assertEqual(self.qg2_tok.tokenize('aabaabcdba'), - ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba']) - self.assertEqual(self.qg2_tok_return_set.tokenize('aabaabcdba'), - ['aa', 'ab', 'ba', 'bc', 'cd', 'db']) - self.assertEqual(self.qg1_tok.tokenize('d'), ['d']) - self.assertEqual(self.qg3_tok.tokenize('database'), - ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']) - - self.assertEqual(self.qg2_tok_wipad.tokenize(''), ['#$']) - self.assertEqual(self.qg2_tok_wipad.tokenize('a'), ['#a', 'a$']) - self.assertEqual(self.qg2_tok_wipad.tokenize('aa'), ['#a', 'aa', 'a$']) - self.assertEqual(self.qg2_tok_wipad.tokenize('database'), - ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) - self.assertEqual(self.qg2_tok_wipad.tokenize('aabaabcdba'), - ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba', 'a$']) - self.assertEqual(self.qg2_tok_wipad_return_set.tokenize('aabaabcdba'), - ['#a', 'aa', 'ab', 'ba', 'bc', 'cd', 'db', 'a$']) - self.assertEqual(self.qg1_tok_wipad.tokenize('d'), ['d']) - self.assertEqual(self.qg3_tok_wipad.tokenize('database'), - ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']) - - self.assertEqual(self.qg3_tok_wipad_diffpad.tokenize('database'), - ['^^d', '^da', 'dat', 'ata', 'tab', 'aba', 
'bas', - 'ase', 'se!', 'e!!']) - - def test_get_return_set(self): - self.assertEqual(self.qg2_tok.get_return_set(), False) - self.assertEqual(self.qg2_tok_return_set.get_return_set(), True) - self.assertEqual(self.qg2_tok_wipad.get_return_set(), False) - self.assertEqual(self.qg2_tok_wipad_return_set.get_return_set(), True) - - - def test_get_qval(self): - self.assertEqual(self.qg2_tok.get_qval(), 2) - self.assertEqual(self.qg3_tok.get_qval(), 3) - self.assertEqual(self.qg2_tok_wipad.get_qval(), 2) - self.assertEqual(self.qg3_tok_wipad.get_qval(), 3) - - - def test_set_return_set(self): - tok = QgramTokenizer(padding=False) - self.assertEqual(tok.get_return_set(), False) - self.assertEqual(tok.tokenize('aabaabcdba'), - ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba']) - self.assertEqual(tok.set_return_set(True), True) - self.assertEqual(tok.get_return_set(), True) - self.assertEqual(tok.tokenize('aabaabcdba'), - ['aa', 'ab', 'ba', 'bc', 'cd', 'db']) - self.assertEqual(tok.set_return_set(False), True) - self.assertEqual(tok.get_return_set(), False) - self.assertEqual(tok.tokenize('aabaabcdba'), - ['aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba']) - tok = QgramTokenizer() - self.assertEqual(tok.get_return_set(), False) - self.assertEqual(tok.tokenize('aabaabcdba'), - ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba', 'a$']) - self.assertEqual(tok.set_return_set(True), True) - self.assertEqual(tok.get_return_set(), True) - self.assertEqual(tok.tokenize('aabaabcdba'), - ['#a', 'aa', 'ab', 'ba', 'bc', 'cd', 'db', 'a$']) - self.assertEqual(tok.set_return_set(False), True) - self.assertEqual(tok.get_return_set(), False) - self.assertEqual(tok.tokenize('aabaabcdba'), - ['#a', 'aa', 'ab', 'ba', 'aa', 'ab', 'bc', 'cd', 'db', 'ba', 'a$']) - - - - def test_set_qval(self): - tok = QgramTokenizer(padding=False) - self.assertEqual(tok.get_qval(), 2) - self.assertEqual(tok.tokenize('database'), - ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']) - 
self.assertEqual(tok.set_qval(3), True) - self.assertEqual(tok.get_qval(), 3) - self.assertEqual(tok.tokenize('database'), - ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']) - - tok = QgramTokenizer() - self.assertEqual(tok.get_qval(), 2) - self.assertEqual(tok.tokenize('database'), - ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) - self.assertEqual(tok.set_qval(3), True) - self.assertEqual(tok.get_qval(), 3) - self.assertEqual(tok.tokenize('database'), - ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']) - - def test_set_padding(self): - tok = QgramTokenizer() - self.assertEqual(tok.get_padding(), True) - self.assertEqual(tok.tokenize('database'), - ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) - tok.set_padding(False) - self.assertEqual(tok.get_padding(), False) - self.assertEqual(tok.tokenize('database'), - ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']) - - def test_set_prefix_pad(self): - tok = QgramTokenizer() - self.assertEqual(tok.get_prefix_pad(), '#') - self.assertEqual(tok.tokenize('database'), - ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) - tok.set_prefix_pad('^') - self.assertEqual(tok.get_prefix_pad(), '^') - self.assertEqual(tok.tokenize('database'), - ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) - - def test_set_suffix_pad(self): - tok = QgramTokenizer() - self.assertEqual(tok.get_suffix_pad(), '$') - self.assertEqual(tok.tokenize('database'), - ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']) - tok.set_suffix_pad('!') - self.assertEqual(tok.get_suffix_pad(), '!') - self.assertEqual(tok.tokenize('database'), - ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']) - - @raises(TypeError) - def test_qgrams_none(self): - self.qg2_tok.tokenize(None) - - @raises(AssertionError) - def test_qgrams_invalid1(self): - invalid_qg_tok = QgramTokenizer(0) - - @raises(TypeError) - def test_qgrams_invalid2(self): - self.qg2_tok.tokenize(99) - - @raises(AssertionError) - def test_set_qval_invalid(self): 
- qg_tok = QgramTokenizer() - qg_tok.set_qval(0) - - @raises(AssertionError) - def test_padding_invalid(self): - _ = QgramTokenizer(padding=10) - - @raises(AssertionError) - def test_set_padding_invalid(self): - qg = QgramTokenizer() - qg.set_padding(10) - - @raises(AssertionError) - def test_prefixpad_invalid1(self): - _ = QgramTokenizer(prefix_pad=10) - - @raises(AssertionError) - def test_prefixpad_invalid2(self): - _ = QgramTokenizer(prefix_pad="###") - - @raises(AssertionError) - def test_set_prefix_pad_invalid1(self): - qg = QgramTokenizer() - qg.set_prefix_pad(10) - - @raises(AssertionError) - def test_set_prefix_pad_invalid2(self): - qg = QgramTokenizer() - qg.set_prefix_pad('###') - - @raises(AssertionError) - def test_suffixpad_invalid1(self): - _ = QgramTokenizer(suffix_pad=10) - - @raises(AssertionError) - def test_suffixpad_invalid2(self): - _ = QgramTokenizer(suffix_pad="###") - - @raises(AssertionError) - def test_set_suffix_pad_invalid1(self): - qg = QgramTokenizer() - qg.set_suffix_pad(10) - - @raises(AssertionError) - def test_set_suffix_pad_invalid2(self): - qg = QgramTokenizer() - qg.set_suffix_pad('###') - - -class DelimiterTokenizerTestCases(unittest.TestCase): - def setUp(self): - self.delim_tok1 = DelimiterTokenizer() - self.delim_tok2 = DelimiterTokenizer(set([','])) - self.delim_tok3 = DelimiterTokenizer(set(['*', '.'])) - self.delim_tok4 = DelimiterTokenizer(set(['..', 'ab'])) - self.delim_tok4_list = DelimiterTokenizer(['..', 'ab', '..']) - self.delim_tok4_return_set = DelimiterTokenizer(set(['..', 'ab']), - return_set=True) - - def test_delimiter_valid(self): - self.assertEqual(self.delim_tok1.tokenize('data science'), - ['data', 'science']) - self.assertEqual(self.delim_tok2.tokenize('data,science'), - ['data', 'science']) - self.assertEqual(self.delim_tok2.tokenize('data science'), - ['data science']) - self.assertEqual(self.delim_tok3.tokenize('ab cd*ef.*bb. 
gg.'), - ['ab cd', 'ef', 'bb', ' gg']) - self.assertEqual( - self.delim_tok4.tokenize('ab cd..efabbb....ggab cd..efabgh'), - [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) - self.assertEqual( - self.delim_tok4_list.tokenize('ab cd..efabbb....ggab cd..efabgh'), - [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) - self.assertEqual( - self.delim_tok4_return_set.tokenize( - 'ab cd..efabbb....ggab cd..efabgh'), - [' cd', 'ef', 'bb', 'gg', 'gh']) - - def test_get_return_set(self): - self.assertEqual(self.delim_tok4.get_return_set(), False) - self.assertEqual(self.delim_tok4_return_set.get_return_set(), True) - - def test_get_delim_set(self): - self.assertSetEqual(self.delim_tok1.get_delim_set(), {' '}) - self.assertSetEqual(self.delim_tok3.get_delim_set(), {'*', '.'}) - self.assertSetEqual(self.delim_tok4_list.get_delim_set(), {'..', 'ab'}) - - def test_set_return_set(self): - tok = DelimiterTokenizer(set(['..', 'ab'])) - self.assertEqual(tok.get_return_set(), False) - self.assertEqual( - tok.tokenize('ab cd..efabbb....ggab cd..efabgh'), - [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) - self.assertEqual(tok.set_return_set(True), True) - self.assertEqual(tok.get_return_set(), True) - self.assertEqual( - tok.tokenize('ab cd..efabbb....ggab cd..efabgh'), - [' cd', 'ef', 'bb', 'gg', 'gh']) - self.assertEqual(tok.set_return_set(False), True) - self.assertEqual(tok.get_return_set(), False) - self.assertEqual( - tok.tokenize('ab cd..efabbb....ggab cd..efabgh'), - [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) - - def test_set_delim_set(self): - tok = DelimiterTokenizer(['*', '.']) - self.assertSetEqual(tok.get_delim_set(), {'*', '.'}) - self.assertEqual(tok.tokenize('ab cd*ef.*bb. 
gg.'), - ['ab cd', 'ef', 'bb', ' gg']) - self.assertEqual(tok.set_delim_set({'..', 'ab'}), True) - self.assertSetEqual(tok.get_delim_set(), {'..', 'ab'}) - self.assertEqual( - tok.tokenize('ab cd..efabbb....ggab cd..efabgh'), - [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh']) - - @raises(TypeError) - def test_delimiter_invalid1(self): - invalid_delim_tok = DelimiterTokenizer(set([',', 10])) - - @raises(TypeError) - def test_delimiter_invalid2(self): - self.delim_tok1.tokenize(None) - - @raises(TypeError) - def test_delimiter_invalid3(self): - self.delim_tok1.tokenize(99) - - -class WhitespaceTokenizerTestCases(unittest.TestCase): - def setUp(self): - self.ws_tok = WhitespaceTokenizer() - self.ws_tok_return_set = WhitespaceTokenizer(return_set=True) - - def test_whitespace_tok_valid(self): - self.assertEqual(self.ws_tok.tokenize('data science'), - ['data', 'science']) - self.assertEqual(self.ws_tok.tokenize('data science'), - ['data', 'science']) - self.assertEqual(self.ws_tok.tokenize('data science'), - ['data', 'science']) - self.assertEqual(self.ws_tok.tokenize('data\tscience'), - ['data', 'science']) - self.assertEqual(self.ws_tok.tokenize('data\nscience'), - ['data', 'science']) - self.assertEqual(self.ws_tok.tokenize('ab cd ab bb cd db'), - ['ab', 'cd', 'ab', 'bb', 'cd', 'db']) - self.assertEqual(self.ws_tok_return_set.tokenize('ab cd ab bb cd db'), - ['ab', 'cd', 'bb', 'db']) - - def test_get_return_set(self): - self.assertEqual(self.ws_tok.get_return_set(), False) - self.assertEqual(self.ws_tok_return_set.get_return_set(), True) - - def test_set_return_set(self): - tok = WhitespaceTokenizer() - self.assertEqual(tok.get_return_set(), False) - self.assertEqual(tok.tokenize('ab cd ab bb cd db'), - ['ab', 'cd', 'ab', 'bb', 'cd', 'db']) - self.assertEqual(tok.set_return_set(True), True) - self.assertEqual(tok.get_return_set(), True) - self.assertEqual(tok.tokenize('ab cd ab bb cd db'), - ['ab', 'cd', 'bb', 'db']) - self.assertEqual(tok.set_return_set(False), True) - 
self.assertEqual(tok.get_return_set(), False) - self.assertEqual(tok.tokenize('ab cd ab bb cd db'), - ['ab', 'cd', 'ab', 'bb', 'cd', 'db']) - - def test_get_delim_set(self): - self.assertSetEqual(self.ws_tok.get_delim_set(), {' ', '\t', '\n'}) - - @raises(TypeError) - def test_whitespace_tok_invalid1(self): - self.ws_tok.tokenize(None) - - @raises(TypeError) - def test_whitespace_tok_invalid2(self): - self.ws_tok.tokenize(99) - - @raises(AttributeError) - def test_set_delim_set(self): - self.ws_tok.set_delim_set({'*', '.'}) - - -class AlphabeticTokenizerTestCases(unittest.TestCase): - def setUp(self): - self.al_tok = AlphabeticTokenizer() - self.al_tok_return_set = AlphabeticTokenizer(return_set=True) - - def test_alphabetic_tok_valid(self): - self.assertEqual(self.al_tok.tokenize(''), []) - self.assertEqual(self.al_tok.tokenize('99'), []) - self.assertEqual(self.al_tok.tokenize('hello'), ['hello']) - self.assertEqual(self.al_tok.tokenize('ab bc. cd##de ef09 bc fg ab.'), - ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab']) - self.assertEqual( - self.al_tok_return_set.tokenize('ab bc. cd##de ef09 bc fg ab.'), - ['ab', 'bc', 'cd', 'de', 'ef', 'fg']) - - def test_get_return_set(self): - self.assertEqual(self.al_tok.get_return_set(), False) - self.assertEqual(self.al_tok_return_set.get_return_set(), True) - - def test_set_return_set(self): - tok = AlphabeticTokenizer() - self.assertEqual(tok.get_return_set(), False) - self.assertEqual(tok.tokenize('ab bc. cd##de ef09 bc fg ab.'), - ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab']) - self.assertEqual(tok.set_return_set(True), True) - self.assertEqual(tok.get_return_set(), True) - self.assertEqual( - tok.tokenize('ab bc. cd##de ef09 bc fg ab.'), - ['ab', 'bc', 'cd', 'de', 'ef', 'fg']) - self.assertEqual(tok.set_return_set(False), True) - self.assertEqual(tok.get_return_set(), False) - self.assertEqual(tok.tokenize('ab bc. 
cd##de ef09 bc fg ab.'), - ['ab', 'bc', 'cd', 'de', 'ef', 'bc', 'fg', 'ab']) - - @raises(TypeError) - def test_alphabetic_tok_invalid1(self): - self.al_tok.tokenize(None) - - @raises(TypeError) - def test_alphabetic_tok_invalid2(self): - self.al_tok.tokenize(99) - - -class AlphanumericTokenizerTestCases(unittest.TestCase): - def setUp(self): - self.alnum_tok = AlphanumericTokenizer() - self.alnum_tok_return_set = AlphanumericTokenizer(return_set=True) - - def test_alphanumeric_tok_valid(self): - self.assertEqual(self.alnum_tok.tokenize(''), []) - self.assertEqual(self.alnum_tok.tokenize('#$'), []) - self.assertEqual(self.alnum_tok.tokenize('hello99'), ['hello99']) - self.assertEqual( - self.alnum_tok.tokenize(',data9,(science), data9#.(integration).88!'), - ['data9', 'science', 'data9', 'integration', '88']) - self.assertEqual(self.alnum_tok_return_set.tokenize( - ',data9,(science), data9#.(integration).88!'), - ['data9', 'science', 'integration', '88']) - - def test_get_return_set(self): - self.assertEqual(self.alnum_tok.get_return_set(), False) - self.assertEqual(self.alnum_tok_return_set.get_return_set(), True) - - def test_set_return_set(self): - tok = AlphanumericTokenizer() - self.assertEqual(tok.get_return_set(), False) - self.assertEqual( - tok.tokenize(',data9,(science), data9#.(integration).88!'), - ['data9', 'science', 'data9', 'integration', '88']) - self.assertEqual(tok.set_return_set(True), True) - self.assertEqual(tok.get_return_set(), True) - self.assertEqual( - tok.tokenize(',data9,(science), data9#.(integration).88!'), - ['data9', 'science', 'integration', '88']) - self.assertEqual(tok.set_return_set(False), True) - self.assertEqual(tok.get_return_set(), False) - self.assertEqual( - tok.tokenize(',data9,(science), data9#.(integration).88!'), - ['data9', 'science', 'data9', 'integration', '88']) - - @raises(TypeError) - def test_alphanumeric_tok_invalid1(self): - self.alnum_tok.tokenize(None) - - @raises(TypeError) - def 
test_alphanumeric_tok_invalid2(self): - self.alnum_tok.tokenize(99)