12 changes: 12 additions & 0 deletions benchmarks/custom_benchmarks/Readme.md
Benchmarks Documentation:

1. Run the bm_select_measure.py file to get the benchmarks for an individual measure.
2. Enter the path for each dataset (the datasets can be downloaded from the website).
3. Enter the measure to be benchmarked (example: HammingDistance).
4. Enter the size of the data set to be benchmarked (example: 9000).

Results:

1. After the benchmarking completes, the results are stored in an Excel file
in the benchmarks directory.
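
An illustrative session (the dataset locations below are hypothetical):

```
$ python bm_select_measure.py
Enter the short strings data set path: data/short_strings.csv
Enter the medium strings data set path: data/medium_strings.csv
Enter the long strings data set path: data/long_strings.csv
Choose one measure to benchmark among the following available measures: [...] HammingDistance
Enter the size of the dataset to be benchmarked: 9000
```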

62 changes: 62 additions & 0 deletions benchmarks/custom_benchmarks/benchmark.py
# -*- coding: utf-8 -*-
import timeit


class Benchmark:

    def __init__(self, bm_pairs=1):
        """Default value for bm_pairs is 1 (element-wise string pairs)."""
        self.bm_pairs = bm_pairs

    def get_benchmark(self, measure, set1, set2):
        """Get the total runtime of the string matching operation on the given sets of strings.

        Args:
            measure (object): similarity measure object to be benchmarked.
            set1, set2 (sequence): input collections of strings to compare and score.

        Returns:
            A single-element list with the total runtime of the string
            matching operation, in seconds.
        """
        result = []
        bm_result = []
        start = timeit.default_timer()

        # Iterate only as far as the shorter of the two inputs allows.
        num_strings = min(len(set1), len(set2))

        if self.bm_pairs != 1:
            # Score each string in set1 against the first bm_pairs strings in set2.
            for m in range(num_strings):
                for n in range(self.bm_pairs):
                    score = measure.get_raw_score(str(set1[m]), str(set2[n]))
                    result.append(score)
        else:
            # Score element-wise string-string pairs.
            for m in range(num_strings):
                score = measure.get_raw_score(str(set1[m]), str(set2[m]))
                result.append(score)

        stop = timeit.default_timer()
        bm_result.append(stop - start)
        return bm_result

    def set_bm_pairs(self, bm_pairs):
        """Set the number of benchmark pairs.

        If bm_pairs == 1, the benchmark runs over element-wise string-string
        pairs; if bm_pairs > 1, each string in set1 is scored against the
        first bm_pairs strings in set2.

        Args:
            bm_pairs (int): number of strings in set2 paired with each string in set1.
        """
        self.bm_pairs = bm_pairs
        return True


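A minimal usage sketch of the class above (hypothetical and not part of the module; it assumes py_stringmatching is installed and substitutes small inline lists for real datasets):

```python
from py_stringmatching.similarity_measure.levenshtein import Levenshtein

from benchmarks.custom_benchmarks.benchmark import Benchmark

set_a = ['kitten', 'sitting', 'saturday']  # stand-in data
set_b = ['mitten', 'fitting', 'sunday']

bench = Benchmark()
bench.set_bm_pairs(2)  # score each string in set_a against the first 2 strings in set_b
runtimes = bench.get_benchmark(Levenshtein(), set_a, set_b)
print(runtimes)  # single-element list of elapsed seconds, e.g. [0.00012]
```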
118 changes: 118 additions & 0 deletions benchmarks/custom_benchmarks/bm_select_measure.py
import os.path
from math import floor

import pandas as pd
from openpyxl import load_workbook

import benchmarks.custom_benchmarks.benchmark as bm
# sequence based similarity measures
from py_stringmatching.similarity_measure.affine import Affine
from py_stringmatching.similarity_measure.bag_distance import BagDistance
from py_stringmatching.similarity_measure.editex import Editex
from py_stringmatching.similarity_measure.hamming_distance import HammingDistance
from py_stringmatching.similarity_measure.jaro import Jaro
from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
from py_stringmatching.similarity_measure.levenshtein import Levenshtein
from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch
from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman
# token based similarity measures
from py_stringmatching.similarity_measure.cosine import Cosine
from py_stringmatching.similarity_measure.dice import Dice
from py_stringmatching.similarity_measure.jaccard import Jaccard
from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient
from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf
from py_stringmatching.similarity_measure.tfidf import TfIdf
from py_stringmatching.similarity_measure.tversky_index import TverskyIndex
# hybrid similarity measures
from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard
from py_stringmatching.similarity_measure.monge_elkan import MongeElkan
# phonetic similarity measures
from py_stringmatching.similarity_measure.soundex import Soundex

# Read each dataset from a user-supplied CSV path
df1 = pd.read_csv(input('Enter the short strings data set path: '))
df2 = pd.read_csv(input('Enter the medium strings data set path: '))
df3 = pd.read_csv(input('Enter the long strings data set path: '))

# Split each dataset into its two string columns (pandas Series)
short_setA = df1['SET A']
short_setB = df1['SET B']
medium_setA = df2['SET A']
medium_setB = df2['SET B']
long_setA = df3['SET A']
long_setB = df3['SET B']
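
# Each CSV is expected to contain two string columns named 'SET A' and
# 'SET B'; illustrative (hypothetical) contents:
#
#   SET A,SET B
#   apple,aple
#   banana,bananna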

# Dictionary of the currently supported measures
present_measures = {
    # sequence based similarity measures
    'Affine': Affine,
    'BagDistance': BagDistance,
    'Editex': Editex,
    'HammingDistance': HammingDistance,
    'Jaro': Jaro,
    'JaroWinkler': JaroWinkler,
    'Levenshtein': Levenshtein,
    'NeedlemanWunsch': NeedlemanWunsch,
    'SmithWaterman': SmithWaterman,
    # token based similarity measures
    'Cosine': Cosine,
    'Dice': Dice,
    'Jaccard': Jaccard,
    'OverlapCoefficient': OverlapCoefficient,
    'SoftTfIdf': SoftTfIdf,
    'TfIdf': TfIdf,
    'TverskyIndex': TverskyIndex,
    # hybrid similarity measures
    'GeneralizedJaccard': GeneralizedJaccard,
    'MongeElkan': MongeElkan,
    # phonetic similarity measures
    'Soundex': Soundex
}

# User input to select the measure to be benchmarked
bench_measure = input(
    'Choose one measure to benchmark among the following available measures:\n'
    'Affine, BagDistance, Editex, HammingDistance, Jaro, JaroWinkler, Levenshtein,\n'
    'NeedlemanWunsch, SmithWaterman, Cosine, Dice, Jaccard, OverlapCoefficient,\n'
    'SoftTfIdf, TfIdf, TverskyIndex, GeneralizedJaccard, MongeElkan, Soundex: ')

# Look up the selected measure; exit early on an unknown name
if bench_measure not in present_measures:
    raise SystemExit('Unknown measure: ' + bench_measure)
bench_measure = present_measures[bench_measure]

# User input for the size of the dataset to be benchmarked
bm_size = int(input('Enter the size of the dataset to be benchmarked: '))

# Row labels: one per combination of input set sizes
new_index = ['short_short', 'short_medium', 'short_long', 'medium_medium', 'medium_long', 'long_long']
writer = pd.ExcelWriter('benchmark.xlsx', engine='openpyxl')
if os.path.isfile('benchmark.xlsx'):
    # Append to the existing workbook instead of overwriting it
    writer.book = load_workbook('benchmark.xlsx')

df = pd.DataFrame()
bm_list = []
bench_object = bm.Benchmark()
# Number of strings from set B paired with each string in set A,
# derived from the requested dataset size
size = floor(bm_size / 11000)
bench_object.set_bm_pairs(size)
# Benchmark for short-short
bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, short_setB)[0])

# Benchmark for short-medium
bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, medium_setA)[0])

# Benchmark for short-long
bm_list.append(bench_object.get_benchmark(bench_measure(), short_setA, long_setA)[0])

# Benchmark for medium-medium
bm_list.append(bench_object.get_benchmark(bench_measure(), medium_setA, medium_setB)[0])

# Benchmark for medium-long
bm_list.append(bench_object.get_benchmark(bench_measure(), medium_setA, long_setA)[0])

# Benchmark for long-long
bm_list.append(bench_object.get_benchmark(bench_measure(), long_setA, long_setB)[0])

temp_df = pd.DataFrame({str(size): bm_list}, index=new_index)
df = pd.concat([df, temp_df], axis=1)

# Write one sheet per measure, named after the measure class
df.to_excel(writer, sheet_name=bench_measure.__name__)
writer.save()
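
Once the script finishes, the workbook can be inspected programmatically; a minimal sketch (the sheet name assumes HammingDistance was the measure chosen above):

```python
import pandas as pd

# Load the runtimes written by bm_select_measure.py back into a dataframe.
results = pd.read_excel('benchmark.xlsx', sheet_name='HammingDistance', index_col=0)
print(results)  # rows: short_short ... long_long; one column per benchmarked size
```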