From 2ccbfb9dafff680af704af218d8e41072a4e554a Mon Sep 17 00:00:00 2001
From: srujithpoondla
Date: Thu, 15 Jun 2017 16:32:02 -0500
Subject: [PATCH 1/5] Added code to do the benchmark with an option to choose
 the measure and the size of dataset

---
 benchmarks/benchmark.py         |  54 ++++++++++++++
 benchmarks/bm_select_measure.py | 113 ++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+)
 create mode 100644 benchmarks/benchmark.py
 create mode 100644 benchmarks/bm_select_measure.py

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 0000000..101eedd
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+import timeit
+
+
+class Benchmark:
+
+    def __init__(self, bm_pairs=1):
+        """Default value for bm_pairs is 1."""
+        self.bm_pairs = bm_pairs
+
+    def get_benchmark(self, measure, set1, set2):
+        result = []
+        bm_result = []
+        start = timeit.default_timer()
+
+        if len(set1) > len(set2):
+            max_len = len(set2)
+        else:
+            max_len = len(set1)
+
+        if self.bm_pairs != 1:
+            min_len = min(self.bm_pairs, len(set2))
+            for m in range(max_len):
+                for n in range(min_len):
+                    score = measure.get_raw_score(str(set1[m]), str(set2[n]))
+                    result.append(score)
+            stop = timeit.default_timer()
+            bm = stop - start
+            bm_result.append(bm)
+            return bm_result
+        else:
+            for m in range(max_len):
+                score = measure.get_raw_score(str(set1[m]), str(set2[m]))
+                result.append(score)
+            stop = timeit.default_timer()
+            bm = stop - start
+            bm_result.append(bm)
+            return bm_result
+
+    def set_bm_pairs(self, bm_pairs):
+        """Set benchmark pairs.
+        If bm_pairs = 1, the benchmark is run over string-string pairs;
+        if bm_pairs > 1, the benchmark is run over the cartesian product between
+        each string in set A and bm_pairs strings in set B.
+
+        Args:
+            bm_pairs (int): number of strings in set B to pair with each string in set A.
+        """
+        self.bm_pairs = bm_pairs
+        return True

diff --git a/benchmarks/bm_select_measure.py b/benchmarks/bm_select_measure.py
new file mode 100644
index 0000000..301b79d
--- /dev/null
+++ b/benchmarks/bm_select_measure.py
@@ -0,0 +1,113 @@
+import os.path
+from math import floor
+
+import pandas as pd
+from openpyxl import load_workbook
+
+import benchmarks.benchmark as bm
+from py_stringmatching.similarity_measure.affine import Affine
+from py_stringmatching.similarity_measure.bag_distance import BagDistance
+from py_stringmatching.similarity_measure.editex import Editex
+from py_stringmatching.similarity_measure.hamming_distance import HammingDistance
+from py_stringmatching.similarity_measure.jaro import Jaro
+from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
+from py_stringmatching.similarity_measure.levenshtein import Levenshtein
+from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch
+from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
+# token based similarity measures
+from py_stringmatching.similarity_measure.cosine import Cosine
+from py_stringmatching.similarity_measure.dice import Dice
+from py_stringmatching.similarity_measure.jaccard import Jaccard
+from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient
+from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
+from py_stringmatching.similarity_measure.tfidf import TfIdf
+from py_stringmatching.similarity_measure.tversky_index import TverskyIndex
+# hybrid similarity measures
+from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard
+from py_stringmatching.similarity_measure.monge_elkan import MongeElkan
+# phonetic similarity measures
+from py_stringmatching.similarity_measure.soundex import Soundex
+
+df1 = pd.read_csv(str(input('Enter the short data set path: ')))
+df2 = pd.read_csv(str(input('Enter the medium data set path: ')))
+df3 = pd.read_csv(str(input('Enter the long data set path: ')))
+
+
+short_setA = df1['SET A']
+short_setB = df1['SET B']
+medium_setA = df2['SET A']
+medium_setB = df2['SET B']
+long_setA = df3['SET A']
+long_setB = df3['SET B']
+
+present_measures = {
+    # sequence based similarity measures
+    'Affine': Affine,
+    'BagDistance': BagDistance,
+    'Editex': Editex,
+    'HammingDistance': HammingDistance,
+    'Jaro': Jaro,
+    'JaroWinkler': JaroWinkler,
+    'Levenshtein': Levenshtein,
+    'NeedlemanWunsch': NeedlemanWunsch,
+    'SmithWaterman': SmithWaterman,
+    # token based similarity measures
+    'Cosine': Cosine,
+    'Dice': Dice,
+    'Jaccard': Jaccard,
+    'OverlapCoefficient': OverlapCoefficient,
+    'SoftTfIdf': SoftTfIdf,
+    'TfIdf': TfIdf,
+    'TverskyIndex': TverskyIndex,
+    # hybrid similarity measures
+    'GeneralizedJaccard': GeneralizedJaccard,
+    'MongeElkan': MongeElkan,
+    # phonetic similarity measures
+    'Soundex': Soundex
+}
+
+bench_measure = input('Choose one measure to benchmark among the following available measures: Affine, BagDistance,\n'
+                      'Editex, HammingDistance, Jaro, JaroWinkler, Levenshtein,\n'
+                      'NeedlemanWunsch, SmithWaterman, Cosine, Dice, Jaccard, OverlapCoefficient, SoftTfIdf, TfIdf,\n'
+                      'TverskyIndex, GeneralizedJaccard, MongeElkan, Soundex: ')
+
+if bench_measure in present_measures:
+    bench_measure = present_measures.get(bench_measure)
+    bm_size = int(input('Enter the size of the dataset to be benchmarked: '))
+
+    new_index = ['short_short', 'short_medium', 'short_long', 'medium_medium', 'medium_long', 'long_long']
+    writer = pd.ExcelWriter('benchmark.xlsx')
+    if os.path.isfile('benchmark.xlsx'):
+        book = load_workbook('benchmark.xlsx')
+        writer.book = book
+
+    df = pd.DataFrame()
+    bm_list = []
+    bench_object = bm.Benchmark()
+    size = floor(bm_size / 11000)
+    bench_object.set_bm_pairs(size)
+    # Benchmark for short-short
+    bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, short_setB)[0])
+
+    # Benchmark for short-medium
+    bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, medium_setA)[0])
+
+    # Benchmark for short-long
+    bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, long_setA)[0])
+
+    # Benchmark for medium-medium
+    bm_list.append(bench_object.get_benchmark(bench_measure(), medium_setA, medium_setB)[0])
+
+    # Benchmark for medium-long
+    bm_list.append(bench_object.get_benchmark(bench_measure(), medium_setA, long_setA)[0])
+
+    # Benchmark for long-long
+    bm_list.append(bench_object.get_benchmark(bench_measure(), long_setA, long_setB)[0])
+
+    temp_df = pd.DataFrame({str(size): bm_list}, index=new_index)
+    df = pd.concat([df, temp_df], axis=1)
+
+    df.to_excel(writer, bench_measure.__name__)
+    writer.save()
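For reviewers, a minimal sketch of how the Benchmark class added above can be exercised directly; the word lists and the choice of Levenshtein are illustrative and not part of the patch:

    import benchmarks.benchmark as bm
    from py_stringmatching.similarity_measure.levenshtein import Levenshtein

    set_a = ['smith', 'johnson', 'williams']   # made-up sample strings
    set_b = ['smyth', 'jonson', 'willams']

    bench = bm.Benchmark()                     # bm_pairs defaults to 1: string-string pairs
    print(bench.get_benchmark(Levenshtein(), set_a, set_b)[0])

    bench.set_bm_pairs(2)                      # each string in set_a vs 2 strings in set_b
    print(bench.get_benchmark(Levenshtein(), set_a, set_b)[0])

Each call returns a one-element list holding the total runtime in seconds, which is why the driver script indexes the result with [0].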
+ """ + self.bm_pairs = bm_pairs + return True + + diff --git a/benchmarks/bm_select_measure.py b/benchmarks/bm_select_measure.py new file mode 100644 index 0000000..301b79d --- /dev/null +++ b/benchmarks/bm_select_measure.py @@ -0,0 +1,113 @@ +import os.path +from math import floor + +import pandas as pd +from openpyxl import load_workbook + +import benchmarks.benchmark as bm +from py_stringmatching.similarity_measure.affine import Affine +from py_stringmatching.similarity_measure.bag_distance import BagDistance +from py_stringmatching.similarity_measure.editex import Editex +from py_stringmatching.similarity_measure.hamming_distance import HammingDistance +from py_stringmatching.similarity_measure.jaro import Jaro +from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler +from py_stringmatching.similarity_measure.levenshtein import Levenshtein +from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch +from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman +# token based similarity measures +from py_stringmatching.similarity_measure.cosine import Cosine +from py_stringmatching.similarity_measure.dice import Dice +from py_stringmatching.similarity_measure.jaccard import Jaccard +from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient +from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf +from py_stringmatching.similarity_measure.tfidf import TfIdf +from py_stringmatching.similarity_measure.tversky_index import TverskyIndex +# hybrid similarity measures +from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard +from py_stringmatching.similarity_measure.monge_elkan import MongeElkan +# phonetic similarity measures +from py_stringmatching.similarity_measure.soundex import Soundex + +df1 = pd.read_csv(str(input('Enter the short data set path: '))) +df2 = pd.read_csv(str(input('Enter the medium data set path: '))) +df3 = pd.read_csv(str(input('Enter the long data set path: '))) + + +short_setA = df1['SET A'] +short_setB = df1['SET B'] +medium_setA = df2['SET A'] +medium_setB = df2['SET B'] +long_setA = df3['SET A'] +long_setB = df3['SET B'] + +present_measures = { + # sequence based similarity measures + 'Affine': Affine, + 'BagDistance': BagDistance, + 'Editex': Editex, + 'HammingDistance': HammingDistance, + 'Jaro': Jaro, + 'JaroWinkler': JaroWinkler, + 'Levenshtein': Levenshtein, + 'NeedlemanWunsch': NeedlemanWunsch, + 'SmithWaterman': SmithWaterman, + # token based similarity measures + 'Cosine': Cosine, + 'Dice': Dice, + 'Jaccard': Jaccard, + 'OverlapCoefficient': OverlapCoefficient, + 'SoftTfIdf': SoftTfIdf, + 'TfIdf': TfIdf, + 'TverskyIndex': TverskyIndex, + # hybrid similarity measures + 'GeneralizedJaccard': GeneralizedJaccard, + 'MongeElkan': MongeElkan, + # phonetic similarity measures + 'Soundex': Soundex + +} + +bench_measure = input('Choose one measure to benchmark among the following available measures: Affine, BagDistance,\n' \ + ' Editex, HammingDistance, Jaro, JaroWinkler,Levenshtein,\n\ + NeedlemanWunsch, SmithWaterman, Cosine, Dice, Jaccard, OverlapCoefficient, SoftTfIdf, TfIdf,\n \ + TverskyIndex, GeneralizedJaccard, MongeElkan, Soundex ') + +if present_measures.keys().__contains__(bench_measure): + bench_measure = present_measures.get(bench_measure) + bm_size =input('Enter the size of the dataset to be benchmarked: ') + + new_index = ['short_short', 'short_medium', 'short_long', 'medium_medium', 'medium_long', 'long_long'] + 
From b48babad5d3ffac0836783d88af4bc5355c0620c Mon Sep 17 00:00:00 2001
From: srujithpoondla
Date: Tue, 20 Jun 2017 11:58:27 -0500
Subject: [PATCH 3/5] Added docstrings and comments to the benchmark code

---
 benchmarks/benchmark.py         | 8 ++++++++
 benchmarks/bm_select_measure.py | 8 +++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index 101eedd..5345083 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -11,6 +11,14 @@
         self.bm_pairs = bm_pairs
 
     def get_benchmark(self, measure, set1, set2):
+        """Get the total runtime of the string matching operation on the given sets of strings.
+
+        Args:
+            measure (object): similarity measure instance to be benchmarked.
+            set1, set2 (Series): input sets of strings to compare and score.
+        Returns:
+            Total runtime of the string matching operation, in seconds.
+        """
         result = []
         bm_result = []
         start = timeit.default_timer()

diff --git a/benchmarks/bm_select_measure.py b/benchmarks/bm_select_measure.py
index baec490..6c307bc 100644
--- a/benchmarks/bm_select_measure.py
+++ b/benchmarks/bm_select_measure.py
@@ -28,9 +28,11 @@
 # phonetic similarity measures
 from py_stringmatching.similarity_measure.soundex import Soundex
 
+
+# User input for dataset path and reading the csv file
 df1 = pd.read_csv(str(input('Enter the short strings data set path: ')))
 df2 = pd.read_csv(str(input('Enter the medium strings data set path: ')))
 df3 = pd.read_csv(str(input('Enter the long strings data set path: ')))
 
-
+# Converting each column in dataset to separate sets
 short_setA = df1['SET A']
 short_setB = df1['SET B']
 medium_setA = df2['SET A']
@@ -40,6 +42,7 @@
 long_setA = df3['SET A']
 long_setB = df3['SET B']
 
+# Dict of Current Measures
 present_measures = {
     # sequence based similarity measures
     'Affine': Affine,
@@ -67,15 +70,18 @@
 }
 
+# User input to select the measure to be benchmarked
 bench_measure = input('Choose one measure to benchmark among the following available measures: Affine, BagDistance,\n'
                       'Editex, HammingDistance, Jaro, JaroWinkler, Levenshtein,\n'
                       'NeedlemanWunsch, SmithWaterman, Cosine, Dice, Jaccard, OverlapCoefficient, SoftTfIdf, TfIdf,\n'
                       'TverskyIndex, GeneralizedJaccard, MongeElkan, Soundex: ')
 
+# User input for the size of dataset to be benchmarked
 if bench_measure in present_measures:
     bench_measure = present_measures.get(bench_measure)
     bm_size = int(input('Enter the size of the dataset to be benchmarked: '))
 
+    # Changing the index in the dataframe
     new_index = ['short_short', 'short_medium', 'short_long', 'medium_medium', 'medium_long', 'long_long']
     writer = pd.ExcelWriter('benchmark.xlsx')
     if os.path.isfile('benchmark.xlsx'):
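The load_workbook/writer.book sequence near the new '# Changing the index' comment is the pandas 0.20-era idiom for adding sheets to an existing workbook (newer pandas versions replaced it with ExcelWriter(..., mode='a')). A standalone sketch of that pattern; the sheet name and timing value are made up:

    import os.path
    import pandas as pd
    from openpyxl import load_workbook

    writer = pd.ExcelWriter('benchmark.xlsx')
    if os.path.isfile('benchmark.xlsx'):
        # Reuse the existing workbook so earlier sheets survive the save.
        writer.book = load_workbook('benchmark.xlsx')

    pd.DataFrame({'1': [0.12]}, index=['short_short']).to_excel(writer, 'Levenshtein')
    writer.save()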
From 229dc0167b2670b470943c6482f3d9d5de0538d6 Mon Sep 17 00:00:00 2001
From: srujithpoondla
Date: Tue, 20 Jun 2017 14:40:27 -0500
Subject: [PATCH 4/5] Moved the benchmarks code to new directory
 custom_benchmarks

---
 benchmarks/custom_benchmarks/__init__.py        |  0
 benchmarks/{ => custom_benchmarks}/benchmark.py |  0
 .../bm_select_measure.py                        | 23 +++++++++----------
 3 files changed, 11 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/custom_benchmarks/__init__.py
 rename benchmarks/{ => custom_benchmarks}/benchmark.py (100%)
 rename benchmarks/{ => custom_benchmarks}/bm_select_measure.py (98%)

diff --git a/benchmarks/custom_benchmarks/__init__.py b/benchmarks/custom_benchmarks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/benchmark.py b/benchmarks/custom_benchmarks/benchmark.py
similarity index 100%
rename from benchmarks/benchmark.py
rename to benchmarks/custom_benchmarks/benchmark.py
diff --git a/benchmarks/bm_select_measure.py b/benchmarks/custom_benchmarks/bm_select_measure.py
similarity index 98%
rename from benchmarks/bm_select_measure.py
rename to benchmarks/custom_benchmarks/bm_select_measure.py
index 6c307bc..d6012f9 100644
--- a/benchmarks/bm_select_measure.py
+++ b/benchmarks/custom_benchmarks/bm_select_measure.py
@@ -4,30 +4,29 @@
 import pandas as pd
 from openpyxl import load_workbook
 
-import benchmarks.benchmark as bm
+import benchmarks.custom_benchmarks.benchmark as bm
 from py_stringmatching.similarity_measure.affine import Affine
 from py_stringmatching.similarity_measure.bag_distance import BagDistance
+# token based similarity measures
+from py_stringmatching.similarity_measure.cosine import Cosine
+from py_stringmatching.similarity_measure.dice import Dice
 from py_stringmatching.similarity_measure.editex import Editex
+# hybrid similarity measures
+from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard
 from py_stringmatching.similarity_measure.hamming_distance import HammingDistance
+from py_stringmatching.similarity_measure.jaccard import Jaccard
 from py_stringmatching.similarity_measure.jaro import Jaro
 from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
 from py_stringmatching.similarity_measure.levenshtein import Levenshtein
+from py_stringmatching.similarity_measure.monge_elkan import MongeElkan
 from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch
-from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
-# token based similarity measures
-from py_stringmatching.similarity_measure.cosine import Cosine
-from py_stringmatching.similarity_measure.dice import Dice
-from py_stringmatching.similarity_measure.jaccard import Jaccard
 from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient
+from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
 from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
-from py_stringmatching.similarity_measure.tfidf import TfIdf
-from py_stringmatching.similarity_measure.tversky_index import TverskyIndex
-# hybrid similarity measures
-from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard
-from py_stringmatching.similarity_measure.monge_elkan import MongeElkan
 # phonetic similarity measures
 from py_stringmatching.similarity_measure.soundex import Soundex
-
+from py_stringmatching.similarity_measure.tfidf import TfIdf
+from py_stringmatching.similarity_measure.tversky_index import TverskyIndex
 
 # User input for dataset path and reading the csv file
 df1 = pd.read_csv(str(input('Enter the short strings data set path: ')))
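With the empty __init__.py making custom_benchmarks a package, the benchmark code is now imported by its new dotted path, as the rewritten import shows. A sketch of the adjusted usage, assuming the repository root is on PYTHONPATH:

    # The interactive script can be launched as a module from the repository root:
    #     python -m benchmarks.custom_benchmarks.bm_select_measure
    import benchmarks.custom_benchmarks.benchmark as bm

    bench = bm.Benchmark()   # same API as before; only the import path changed
    bench.set_bm_pairs(1)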
From 0f05df4ea5c4e2dfa4697c45a340ff19b30b14eb Mon Sep 17 00:00:00 2001
From: srujithpoondla
Date: Mon, 17 Jul 2017 08:50:44 -0700
Subject: [PATCH 5/5] Added Readme

---
 benchmarks/custom_benchmarks/Readme.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 benchmarks/custom_benchmarks/Readme.md

diff --git a/benchmarks/custom_benchmarks/Readme.md b/benchmarks/custom_benchmarks/Readme.md
new file mode 100644
index 0000000..0b3dc7a
--- /dev/null
+++ b/benchmarks/custom_benchmarks/Readme.md
@@ -0,0 +1,12 @@
+Benchmarks Documentation:
+
+1. Run the bm_select_measure Python file to get the benchmarks for an individual measure.
+2. Enter the path to each dataset (datasets can be downloaded from the website).
+3. Enter the measure to be benchmarked (example: HammingDistance).
+4. Enter the size of the dataset to be benchmarked (example: 9000).
+
+Results:
+
+1. After the benchmarking completes, the results are stored in the benchmarks directory
+   in an Excel file (benchmark.xlsx), with one sheet per benchmarked measure.
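A sketch of reading the results back for analysis, assuming a recent pandas (older versions spelled the argument sheetname) and the HammingDistance example from step 3; the sheet name follows the measure that was benchmarked:

    import pandas as pd

    results = pd.read_excel('benchmark.xlsx', sheet_name='HammingDistance', index_col=0)
    print(results)   # rows short_short .. long_long, one column per benchmarked size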