diff --git a/benchmarks/custom_benchmarks/Readme.md b/benchmarks/custom_benchmarks/Readme.md
new file mode 100644
index 0000000..0b3dc7a
--- /dev/null
+++ b/benchmarks/custom_benchmarks/Readme.md
@@ -0,0 +1,12 @@
+Benchmarks Documentation:
+
+1. Run the bm_select_measure python file to get the benchmarks for an individual measure.
+2. Enter the path to each data set (the data sets can be downloaded from the website).
+3. Enter the measure to be benchmarked (Example: HammingDistance).
+4. Enter the size of the data set to be benchmarked (Example: 9000).
+
+Results:
+
+1. After the benchmarking completes, the results are stored in an Excel file
+   in the benchmarks directory.
+
diff --git a/benchmarks/custom_benchmarks/__init__.py b/benchmarks/custom_benchmarks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/custom_benchmarks/benchmark.py b/benchmarks/custom_benchmarks/benchmark.py
new file mode 100644
index 0000000..5345083
--- /dev/null
+++ b/benchmarks/custom_benchmarks/benchmark.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+import timeit
+
+
+class Benchmark:
+
+    def __init__(self, bm_pairs=1):
+        """Default value for bm_pairs is 1 (string-string pairs)."""
+        self.bm_pairs = bm_pairs
+
+    def get_benchmark(self, measure, set1, set2):
+        """Get the total runtime of the string matching operation on the given sets of strings.
+
+        Args:
+            measure (object): similarity measure instance to be benchmarked.
+            set1, set2: sequences (e.g. pandas Series) of strings to compare and score.
+
+        Returns:
+            Total runtime of the string matching operation, in seconds, as a single-element list.
+        """
+        result = []
+        bm_result = []
+        start = timeit.default_timer()
+
+        # Use the smaller of the two sets so indexing stays in range.
+        num_strings = min(len(set1), len(set2))
+
+        if self.bm_pairs != 1:
+            # Compare each string in set1 with the first bm_pairs strings in set2.
+            for m in range(num_strings):
+                for n in range(self.bm_pairs):
+                    score = measure.get_raw_score(str(set1[m]), str(set2[n]))
+                    result.append(score)
+        else:
+            # Compare the two sets element-wise (string-string pairs).
+            for m in range(num_strings):
+                score = measure.get_raw_score(str(set1[m]), str(set2[m]))
+                result.append(score)
+
+        stop = timeit.default_timer()
+        bm_result.append(stop - start)
+        return bm_result
+
+    def set_bm_pairs(self, bm_pairs):
+        """Set the number of benchmark pairs.
+
+        If bm_pairs = 1, the benchmark is for string-string pairs;
+        if bm_pairs > 1, the benchmark pairs each string in set A with
+        the first bm_pairs strings in set B.
+
+        Args:
+            bm_pairs (int): number of pairs to form with each string.
+ """ + self.bm_pairs = bm_pairs + return True + + diff --git a/benchmarks/custom_benchmarks/bm_select_measure.py b/benchmarks/custom_benchmarks/bm_select_measure.py new file mode 100644 index 0000000..d6012f9 --- /dev/null +++ b/benchmarks/custom_benchmarks/bm_select_measure.py @@ -0,0 +1,118 @@ +import os.path +from math import floor + +import pandas as pd +from openpyxl import load_workbook + +import benchmarks.custom_benchmarks.benchmark as bm +from py_stringmatching.similarity_measure.affine import Affine +from py_stringmatching.similarity_measure.bag_distance import BagDistance +# token based similarity measures +from py_stringmatching.similarity_measure.cosine import Cosine +from py_stringmatching.similarity_measure.dice import Dice +from py_stringmatching.similarity_measure.editex import Editex +# hybrid similarity measures +from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard +from py_stringmatching.similarity_measure.hamming_distance import HammingDistance +from py_stringmatching.similarity_measure.jaccard import Jaccard +from py_stringmatching.similarity_measure.jaro import Jaro +from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler +from py_stringmatching.similarity_measure.levenshtein import Levenshtein +from py_stringmatching.similarity_measure.monge_elkan import MongeElkan +from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch +from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient +from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman +from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf +# phonetic similarity measures +from py_stringmatching.similarity_measure.soundex import Soundex +from py_stringmatching.similarity_measure.tfidf import TfIdf +from py_stringmatching.similarity_measure.tversky_index import TverskyIndex + +# User input for dataset path and reading the csv file +df1 = pd.read_csv(str(input('Enter the short strings data set path: '))) +df2 = pd.read_csv(str(input('Enter the medium strings data set path: '))) +df3 = pd.read_csv(str(input('Enter the long strings data set path: '))) + +# Converting each column in dataset to separate sets +short_setA = df1['SET A'] +short_setB = df1['SET B'] +medium_setA = df2['SET A'] +medium_setB = df2['SET B'] +long_setA = df3['SET A'] +long_setB = df3['SET B'] + +# Dict of Current Measures +present_measures = { + # sequence based similarity measures + 'Affine': Affine, + 'BagDistance': BagDistance, + 'Editex': Editex, + 'HammingDistance': HammingDistance, + 'Jaro': Jaro, + 'JaroWinkler': JaroWinkler, + 'Levenshtein': Levenshtein, + 'NeedlemanWunsch': NeedlemanWunsch, + 'SmithWaterman': SmithWaterman, + # token based similarity measures + 'Cosine': Cosine, + 'Dice': Dice, + 'Jaccard': Jaccard, + 'OverlapCoefficient': OverlapCoefficient, + 'SoftTfIdf': SoftTfIdf, + 'TfIdf': TfIdf, + 'TverskyIndex': TverskyIndex, + # hybrid similarity measures + 'GeneralizedJaccard': GeneralizedJaccard, + 'MongeElkan': MongeElkan, + # phonetic similarity measures + 'Soundex': Soundex + +} + +# User input to select the measure to be benchmarked +bench_measure = input('Choose one measure to benchmark among the following available measures: Affine, BagDistance,\n' \ + ' Editex, HammingDistance, Jaro, JaroWinkler,Levenshtein,\n\ + NeedlemanWunsch, SmithWaterman, Cosine, Dice, Jaccard, OverlapCoefficient, SoftTfIdf, TfIdf,\n \ + TverskyIndex, GeneralizedJaccard, MongeElkan, Soundex ') + +# User 
+# User input for the size of the data set to be benchmarked
+if bench_measure in present_measures:
+    bench_measure = present_measures[bench_measure]
+    bm_size = int(input('Enter the size of the dataset to be benchmarked: '))
+
+    # Row index for the result dataframe
+    new_index = ['short_short', 'short_medium', 'short_long', 'medium_medium', 'medium_long', 'long_long']
+    writer = pd.ExcelWriter('benchmark.xlsx')
+    if os.path.isfile('benchmark.xlsx'):
+        # Append new sheets to an existing workbook instead of overwriting it
+        book = load_workbook('benchmark.xlsx')
+        writer.book = book
+
+    df = pd.DataFrame()
+    bm_list = []
+    bench_object = bm.Benchmark()
+    # Number of benchmark pairs per string, assuming each data set column holds 11000 strings
+    size = floor(bm_size / 11000)
+    bench_object.set_bm_pairs(size)
+
+    # Benchmark for short-short
+    bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, short_setB)[0])
+
+    # Benchmark for short-medium
+    bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, medium_setA)[0])
+
+    # Benchmark for short-long
+    bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, long_setA)[0])
+
+    # Benchmark for medium-medium
+    bm_list.append(bench_object.get_benchmark(bench_measure(), medium_setA, medium_setB)[0])
+
+    # Benchmark for medium-long
+    bm_list.append(bench_object.get_benchmark(bench_measure(), medium_setA, long_setA)[0])
+
+    # Benchmark for long-long
+    bm_list.append(bench_object.get_benchmark(bench_measure(), long_setA, long_setB)[0])
+
+    temp_df = pd.DataFrame({str(size): bm_list}, index=new_index)
+    df = pd.concat([df, temp_df], axis=1)
+
+    # Write the results to a sheet named after the measure
+    df.to_excel(writer, bench_measure.__name__)
+    writer.save()
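For reference, a minimal sketch of how the Benchmark class added above can be exercised directly, without going through the interactive bm_select_measure prompts. The word lists and the choice of Levenshtein are made-up sample data for illustration, and the import assumes the script is run from the repository root so that the benchmarks package is importable:

    # Minimal usage sketch for benchmarks/custom_benchmarks/benchmark.py (sample data is illustrative only).
    from py_stringmatching.similarity_measure.levenshtein import Levenshtein

    from benchmarks.custom_benchmarks.benchmark import Benchmark

    # Hypothetical sample columns; in bm_select_measure.py these come from the csv data sets.
    set_a = ['apple', 'banana', 'cherry', 'grape']
    set_b = ['applet', 'bananas', 'charry', 'grapes']

    bench = Benchmark()

    # bm_pairs = 1 (default): element-wise string-string pairs, set_a[i] vs set_b[i].
    elementwise_runtime = bench.get_benchmark(Levenshtein(), set_a, set_b)[0]

    # bm_pairs = 2: each string in set_a is compared with the first 2 strings of set_b.
    bench.set_bm_pairs(2)
    pairwise_runtime = bench.get_benchmark(Levenshtein(), set_a, set_b)[0]

    print('element-wise runtime (s):', elementwise_runtime)
    print('pairwise runtime (s):', pairwise_runtime)

Each call returns a single-element list holding the elapsed time from timeit.default_timer(), which is why [0] is taken here, mirroring what bm_select_measure.py does for each size combination.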