From cf17a0b6020a43aea074d6b763d80272aa22fd09 Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sun, 26 May 2024 14:29:43 +0200 Subject: [PATCH 01/20] replace arrays with spans --- FuzzySharp.Test/FuzzySharp.Test.csproj | 2 +- FuzzySharp/FuzzySharp.csproj | 7 +--- FuzzySharp/Levenshtein.cs | 42 +++++++------------ .../Strategy/Generic/PartialRatioStrategyT.cs | 4 +- 4 files changed, 21 insertions(+), 34 deletions(-) diff --git a/FuzzySharp.Test/FuzzySharp.Test.csproj b/FuzzySharp.Test/FuzzySharp.Test.csproj index 6ea573a..876916e 100644 --- a/FuzzySharp.Test/FuzzySharp.Test.csproj +++ b/FuzzySharp.Test/FuzzySharp.Test.csproj @@ -1,7 +1,7 @@ - netcoreapp3.1 + NET8.0 false diff --git a/FuzzySharp/FuzzySharp.csproj b/FuzzySharp/FuzzySharp.csproj index d2cee32..74ee9b4 100644 --- a/FuzzySharp/FuzzySharp.csproj +++ b/FuzzySharp/FuzzySharp.csproj @@ -1,7 +1,7 @@  - netcoreapp2.0;netcoreapp2.1;netstandard1.6;netstandard2.0;netstandard2.1;net45;net46;net461 + netstandard2.1 true Jacob Bayer Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek @@ -24,14 +24,11 @@ - + - - System - diff --git a/FuzzySharp/Levenshtein.cs b/FuzzySharp/Levenshtein.cs index 5b620af..80abb80 100644 --- a/FuzzySharp/Levenshtein.cs +++ b/FuzzySharp/Levenshtein.cs @@ -1,7 +1,5 @@ using System; -using System.Collections.Generic; using System.Diagnostics; -using System.Linq; using FuzzySharp.Edits; namespace FuzzySharp @@ -10,20 +8,20 @@ public static class Levenshtein { private static EditOp[] GetEditOps(T[] arr1, T[] arr2) where T : IEquatable { - return GetEditOps(arr1.Length, arr1, arr2.Length, arr2); + return GetEditOps(arr1.Length, (ReadOnlySpan)arr1, arr2.Length, (ReadOnlySpan)arr2); } // Special Case private static EditOp[] GetEditOps(string s1, string s2) { - return GetEditOps(s1.Length, s1.ToCharArray(), s2.Length, s2.ToCharArray()); + return GetEditOps(s1.Length, s1.AsSpan(), s2.Length, s2.AsSpan()); } - private static EditOp[] GetEditOps(int len1, T[] c1, int len2, T[] c2) where T : IEquatable + private static EditOp[] GetEditOps(int len1, ReadOnlySpan c1, int len2, ReadOnlySpan c2) where T : IEquatable { int i; - int[] matrix; + Span matrix; int p1 = 0; int p2 = 0; @@ -103,9 +101,9 @@ private static EditOp[] GetEditOps(int len1, T[] c1, int len2, T[] c2) where } - private static EditOp[] EditOpsFromCostMatrix(int len1, T[] c1, int p1, int o1, - int len2, T[] c2, int p2, int o2, - int[] matrix) + private static EditOp[] EditOpsFromCostMatrix(int len1, ReadOnlySpan c1, int p1, int o1, + int len2, ReadOnlySpan c2, int p2, int o2, + Span matrix) where T: IEquatable { @@ -665,13 +663,7 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) return opCodes; } - // Special Case - public static int EditDistance(string s1, string s2, int xcost = 0) - { - return EditDistance(s1.ToCharArray(), s2.ToCharArray(), xcost); - } - - public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEquatable + public static int EditDistance(ReadOnlySpan c1, ReadOnlySpan c2, int xcost = 0) where T: IEquatable { int i; @@ -720,7 +712,7 @@ public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEqua str1 = str2; str2 = temp; - T[] t = c2; + ReadOnlySpan t = c2; c2 = c1; c1 = t; @@ -873,7 +865,7 @@ public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEqua } - private static int Memchr(T[] haystack, int offset, T needle, int num) where T : IEquatable + private static int Memchr(ReadOnlySpan haystack, int offset, T needle, int num) where T : IEquatable { if (num != 0) @@ -899,20 +891,18 @@ public static double GetRatio(T[] input1, T[] input2) where T : IEquatable int len2 = input2.Length; int lensum = len1 + len2; - int editDistance = EditDistance(input1, input2, 1); + int editDistance = EditDistance(input1.AsSpan(), input2.AsSpan(), 1); return editDistance == 0 ? 1 : (lensum - editDistance) / (double)lensum; } - public static double GetRatio(IEnumerable input1, IEnumerable input2) where T : IEquatable + public static double GetRatio(ReadOnlySpan input1, ReadOnlySpan input2) where T : IEquatable { - var s1 = input1.ToArray(); - var s2 = input2.ToArray(); - int len1 = s1.Length; - int len2 = s2.Length; + int len1 = input1.Length; + int len2 = input2.Length; int lensum = len1 + len2; - int editDistance = EditDistance(s1, s2, 1); + int editDistance = EditDistance(input1, input2, 1); return editDistance == 0 ? 1 : (lensum - editDistance) / (double)lensum; } @@ -920,7 +910,7 @@ public static double GetRatio(IEnumerable input1, IEnumerable input2) w // Special Case public static double GetRatio(string s1, string s2) { - return GetRatio(s1.ToCharArray(), s2.ToCharArray()); + return GetRatio(s1.AsSpan(), s2.AsSpan()); } } } diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs index a536da4..8983b4d 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs @@ -41,9 +41,9 @@ public static int Calculate(T[] input1, T[] input2) if (longEnd > longer.Length) longEnd = longer.Length; - var longSubstr = longer.Skip(longStart).Take(longEnd - longStart); + var longSubstr = longer.AsSpan().Slice(longStart, longEnd - longStart); - double ratio = Levenshtein.GetRatio(shorter, longSubstr); + double ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); if (ratio > .995) { From d4e0613ac30f7986f1067703de0a61733d7782c3 Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Wed, 31 Jul 2024 17:14:46 +0200 Subject: [PATCH 02/20] further optimizations --- .../ScorerTests/TokenSetScorerBaseTest.cs | 3 - FuzzySharp/Extensions/StringExtensions.cs | 63 ++++++++++++++ FuzzySharp/Levenshtein.cs | 84 +++++++++---------- .../PreProcess/StringPreprocessorFactory.cs | 21 +++-- .../TokenAbbreviationScorerBase.cs | 22 ++--- .../TokenDifferenceScorerBase.cs | 7 +- .../TokenInitialismScorerBase.cs | 4 +- .../TokenSet/TokenSetScorerBase.cs | 15 ++-- .../TokenSort/TokenSortAlgorithm.cs | 8 +- .../Strategy/Generic/PartialRatioStrategyT.cs | 15 ++-- .../Strategy/PartialRatioStrategy.cs | 15 ++-- FuzzySharp/Utils/Heap.cs | 40 ++++----- FuzzySharp/Utils/Permutation.cs | 4 +- 13 files changed, 178 insertions(+), 123 deletions(-) create mode 100644 FuzzySharp/Extensions/StringExtensions.cs diff --git a/FuzzySharp.Test/FuzzyTests/ScorerTests/TokenSetScorerBaseTest.cs b/FuzzySharp.Test/FuzzyTests/ScorerTests/TokenSetScorerBaseTest.cs index 4cf7d6f..481b1f2 100644 --- a/FuzzySharp.Test/FuzzyTests/ScorerTests/TokenSetScorerBaseTest.cs +++ b/FuzzySharp.Test/FuzzyTests/ScorerTests/TokenSetScorerBaseTest.cs @@ -2,9 +2,6 @@ using FuzzySharp.SimilarityRatio.Scorer; using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; using NUnit.Framework; -using System; -using System.Collections.Generic; -using System.Text; namespace FuzzySharp.Test.FuzzyTests.ScorerTests { diff --git a/FuzzySharp/Extensions/StringExtensions.cs b/FuzzySharp/Extensions/StringExtensions.cs new file mode 100644 index 0000000..1ce26ad --- /dev/null +++ b/FuzzySharp/Extensions/StringExtensions.cs @@ -0,0 +1,63 @@ +using System; +using System.Collections.Generic; + +namespace FuzzySharp.Extensions +{ + internal static class StringExtensions + { + public static List ExtractLetterOnlyWords(this string input) + { + var result = new List(); + + if (string.IsNullOrEmpty(input)) + return result; + + var span = input.AsSpan(); + + int start = 0; + for (var i = 0; i < span.Length; i++) + { + if (!char.IsLetter(span[i])) + { + if (i - start > 0) + { + result.Add(span.Slice(start, i - start).ToString()); + } + + start = i+1; + } + } + + if (span.Length - start > 0) + result.Add(span.Slice(start, span.Length - start).ToString()); + + return result; + } + + public static string[] SplitByAnySpace(this string input) + { + if (string.IsNullOrWhiteSpace(input)) + return Array.Empty(); + + var words = input.Split(Array.Empty(), StringSplitOptions.RemoveEmptyEntries); + + return words; + } + + public static string[] GetSortedWords(this string input) + { + var words = SplitByAnySpace(input); + + Array.Sort(words); + + return words; + } + + public static string NormalizeSpacesAndSort(this string input) + { + var words = GetSortedWords(input); + + return string.Join(' ', words); + } + } +} diff --git a/FuzzySharp/Levenshtein.cs b/FuzzySharp/Levenshtein.cs index 80abb80..63d89a9 100644 --- a/FuzzySharp/Levenshtein.cs +++ b/FuzzySharp/Levenshtein.cs @@ -21,8 +21,6 @@ private static EditOp[] GetEditOps(int len1, ReadOnlySpan c1, int len2, Re { int i; - Span matrix; - int p1 = 0; int p2 = 0; @@ -51,7 +49,7 @@ private static EditOp[] GetEditOps(int len1, ReadOnlySpan c1, int len2, Re len1++; len2++; - matrix = new int[len2 * len1]; + Span matrix = new int[len2 * len1]; for (i = 0; i < len2; i++) matrix[i] = i; @@ -96,7 +94,6 @@ private static EditOp[] GetEditOps(int len1, ReadOnlySpan c1, int len2, Re } - return EditOpsFromCostMatrix(len1, c1, p1, len1o, len2, c2, p2, len2o, matrix); } @@ -248,7 +245,7 @@ public static MatchingBlock[] GetMatchingBlocks(int len1, int len2, OpCode[] ops noOfMB = 0; - for (i = n; i-- != 0; o++) + for (i = n; i != 0; i--, o++) { if (ops[o].EditType == EditType.KEEP) { @@ -298,7 +295,7 @@ public static MatchingBlock[] GetMatchingBlocks(int len1, int len2, OpCode[] ops Debug.Assert(mb != noOfMB); - MatchingBlock finalBlock = new MatchingBlock + var finalBlock = new MatchingBlock { SourcePos = len1, DestPos = len2, @@ -326,7 +323,9 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op EditType type; - for (i = n; i != 0;) + i = n; + + while (i > 0) { @@ -381,9 +380,6 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op } while (i != 0 && ops[o].EditType == type && SourcePos == ops[o].SourcePos && DestPos == ops[o].DestPos); break; - - default: - break; } } @@ -398,8 +394,9 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op SourcePos = DestPos = 0; int mbIndex = 0; + i = n; - for (i = n; i != 0;) + while (i > 0) { while (ops[o].EditType == EditType.KEEP && --i != 0) @@ -410,11 +407,13 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op if (SourcePos < ops[o].SourcePos || DestPos < ops[o].DestPos) { - MatchingBlock mb = new MatchingBlock(); + var mb = new MatchingBlock + { + SourcePos = SourcePos, + DestPos = DestPos, + Length = ops[o].SourcePos - SourcePos + }; - mb.SourcePos = SourcePos; - mb.DestPos = DestPos; - mb.Length = ops[o].SourcePos - SourcePos; SourcePos = ops[o].SourcePos; DestPos = ops[o].DestPos; @@ -456,9 +455,6 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op } while (i != 0 && ops[o].EditType == type && SourcePos == ops[o].SourcePos && DestPos == ops[o].DestPos); break; - - default: - break; } } @@ -466,20 +462,24 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op { Debug.Assert(len1 -SourcePos == len2 - DestPos); - MatchingBlock mb = new MatchingBlock(); - mb.SourcePos = SourcePos; - mb.DestPos = DestPos; - mb.Length = len1 - SourcePos; + var mb = new MatchingBlock + { + SourcePos = SourcePos, + DestPos = DestPos, + Length = len1 - SourcePos + }; matchingBlocks[mbIndex++] = mb; } Debug.Assert(numberOfMatchingBlocks == mbIndex); - MatchingBlock finalBlock = new MatchingBlock(); - finalBlock.SourcePos = len1; - finalBlock.DestPos = len2; - finalBlock.Length = 0; + var finalBlock = new MatchingBlock + { + SourcePos = len1, + DestPos = len2, + Length = 0 + }; matchingBlocks[mbIndex] = finalBlock; @@ -497,7 +497,9 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) noOfBlocks = 0; SourcePos = DestPos = 0; - for (i = n; i != 0;) + i = n; + + while (i > 0) { while (ops[o].EditType == EditType.KEEP && --i != 0) @@ -553,9 +555,6 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) } while (i != 0 && ops[o].EditType == type && SourcePos == ops[o].SourcePos && DestPos == ops[o].DestPos); break; - - default: - break; } } @@ -568,7 +567,9 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) SourcePos = DestPos = 0; int oIndex = 0; - for (i = n; i != 0;) + i = n; + + while (i > 0) { while (ops[o].EditType == EditType.KEEP && --i != 0) @@ -646,16 +647,15 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) { Debug.Assert(len1 - SourcePos == len2 - DestPos); - if (opCodes[oIndex] == null) - opCodes[oIndex] = new OpCode(); - opCodes[oIndex].EditType = EditType.KEEP; - opCodes[oIndex].SourceBegin = SourcePos; - opCodes[oIndex].DestBegin = DestPos; - opCodes[oIndex].SourceEnd = len1; - opCodes[oIndex].DestEnd = len2; - oIndex++; + var opcode = opCodes[oIndex] ?? (opCodes[oIndex] = new OpCode()); + opcode.EditType = EditType.KEEP; + opcode.SourceBegin = SourcePos; + opcode.DestBegin = DestPos; + opcode.SourceEnd = len1; + opcode.DestEnd = len2; + oIndex++; } Debug.Assert(oIndex == noOfBlocks); @@ -765,7 +765,7 @@ public static int EditDistance(ReadOnlySpan c1, ReadOnlySpan c2, int xc if (ch1.Equals(c2[c2p++])) { - x = --D; + x = D-1; } else { @@ -831,7 +831,7 @@ public static int EditDistance(ReadOnlySpan c1, ReadOnlySpan c2, int xc /* main */ while (p <= end) { - int c3 = --D + (!ch1.Equals(c2[c2p++]) ? 1 : 0); + int c3 = D-1 + (!ch1.Equals(c2[c2p++]) ? 1 : 0); x++; if (x > c3) { @@ -848,7 +848,7 @@ public static int EditDistance(ReadOnlySpan c1, ReadOnlySpan c2, int xc /* lower triangle sentinel */ if (i <= half) { - int c3 = --D + (!ch1.Equals(c2[c2p]) ? 1 : 0); + int c3 = D - 1 + (!ch1.Equals(c2[c2p]) ? 1 : 0); x++; if (x > c3) { diff --git a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs index 0cc5647..eb67157 100644 --- a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs +++ b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs @@ -1,18 +1,25 @@ using System; -using System.Text.RegularExpressions; namespace FuzzySharp.PreProcess { - internal class StringPreprocessorFactory + internal static class StringPreprocessorFactory { - private static string pattern = "[^ a-zA-Z0-9]"; - private static string Default(string input) { - input = Regex.Replace(input, pattern, " "); - input = input.ToLower(); + if (string.IsNullOrWhiteSpace(input)) + { + return string.Empty; + } + + var result = new char[input.Length].AsSpan(); + + for (var i = 0; i < input.Length; i++) + { + var c = input[i]; + result[i] = char.IsLetterOrDigit(c) ? char.ToLower(c) : ' '; + } - return input.Trim(); + return result.ToString().Trim(); } public static Func GetPreprocessor(PreprocessMode mode) diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs index 98c95ce..8b4739a 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs @@ -1,6 +1,6 @@ using System.Collections.Generic; using System.Linq; -using System.Text.RegularExpressions; +using FuzzySharp.Extensions; using FuzzySharp.Utils; namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive @@ -23,25 +23,25 @@ public override int Score(string input1, string input2) longer = input1; } - double lenRatio = ((double)longer.Length) / shorter.Length; + double lenRatio = (double)longer.Length / shorter.Length; // if longer isn't at least 1.5 times longer than the other, then its probably not an abbreviation if (lenRatio < 1.5) return 0; // numbers can't be abbreviations for other numbers, though that would be hilarious. "Yes, 4 - as in 4,238" - var tokensLonger = Regex.Matches(longer, @"[a-zA-Z]+").Cast().Select(m => m.Value).ToArray(); - var tokensShorter = Regex.Matches(shorter, @"[a-zA-Z]+").Cast().Select(m => m.Value).ToArray(); + var tokensLonger = longer.ExtractLetterOnlyWords(); + var tokensShorter = shorter.ExtractLetterOnlyWords(); // more than 4 tokens and it's probably not an abbreviation (and could get costly) - if (tokensShorter.Length > 4) + if (tokensShorter.Count > 4) { return 0; } - string[] moreTokens; - string[] fewerTokens; + List moreTokens; + List fewerTokens; - if (tokensLonger.Length > tokensShorter.Length) + if (tokensLonger.Count > tokensShorter.Count) { moreTokens = tokensLonger; fewerTokens = tokensShorter; @@ -52,13 +52,13 @@ public override int Score(string input1, string input2) fewerTokens = tokensLonger; } - var allPermutations = moreTokens.PermutationsOfSize(fewerTokens.Length); + var allPermutations = moreTokens.PermutationsOfSize(fewerTokens.Count); List allScores = new List(); foreach (var permutation in allPermutations) { double sum = 0; - for (int i = 0; i < fewerTokens.Length; i++) + for (int i = 0; i < fewerTokens.Count; i++) { var i1 = permutation[i]; var i2 = fewerTokens[i]; @@ -68,7 +68,7 @@ public override int Score(string input1, string input2) sum += score; } } - allScores.Add((int) (sum / fewerTokens.Length)); + allScores.Add((int) (sum / fewerTokens.Count)); } return allScores.Count==0?0:allScores.Max(); diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorerBase.cs index 11036af..0e02491 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorerBase.cs @@ -1,5 +1,4 @@ -using System.Linq; -using System.Text.RegularExpressions; +using FuzzySharp.Extensions; using FuzzySharp.PreProcess; using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive.Generic; @@ -14,8 +13,8 @@ public override int Score(string[] input1, string[] input2) public int Score(string input1, string input2) { - var tokens1 = Regex.Split(input1, @"\s+").Where(s => s.Any()).OrderBy(s => s).ToArray(); - var tokens2 = Regex.Split(input2, @"\s+").Where(s => s.Any()).OrderBy(s => s).ToArray(); + var tokens1 = input1.GetSortedWords(); + var tokens2 = input2.GetSortedWords(); return Score(tokens1, tokens2); } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs index 10aa1af..bbf6bb9 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs @@ -1,5 +1,5 @@ using System.Linq; -using System.Text.RegularExpressions; +using FuzzySharp.Extensions; namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { @@ -26,7 +26,7 @@ public override int Score(string input1, string input2) // if longer isn't at least 3 times longer than the other, then it's probably not an initialism if (lenRatio < 3) return 0; - var initials = Regex.Split(longer, @"\s+").Where(s => s.Any()).Select(s => s[0]); + var initials = longer.SplitByAnySpace().Select(s => s[0]); return Scorer(string.Join("", initials), shorter); } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs index 785de55..63c29ae 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs @@ -1,7 +1,6 @@ -using System; -using System.Collections.Generic; +using System.Collections.Generic; using System.Linq; -using System.Text.RegularExpressions; +using FuzzySharp.Extensions; namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { @@ -9,12 +8,12 @@ public abstract class TokenSetScorerBase : StrategySensitiveScorerBase { public override int Score(string input1, string input2) { - var tokens1 = new HashSet(Regex.Split(input1, @"\s+").Where(s => s.Any())); - var tokens2 = new HashSet(Regex.Split(input2, @"\s+").Where(s => s.Any())); + var tokens1 = new HashSet(input1.SplitByAnySpace()); + var tokens2 = new HashSet(input2.SplitByAnySpace()); - var sortedIntersection = String.Join(" ", tokens1.Intersect(tokens2).OrderBy(s => s)).Trim(); - var sortedDiff1To2 = (sortedIntersection + " " + String.Join(" ", tokens1.Except(tokens2).OrderBy(s => s))).Trim(); - var sortedDiff2To1 = (sortedIntersection + " " + String.Join(" ", tokens2.Except(tokens1).OrderBy(s => s))).Trim(); + var sortedIntersection = string.Join(" ", tokens1.Intersect(tokens2).OrderBy(s => s)).Trim(); + var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.Except(tokens2).OrderBy(s => s))).Trim(); + var sortedDiff2To1 = (sortedIntersection + " " + string.Join(" ", tokens2.Except(tokens1).OrderBy(s => s))).Trim(); return new[] { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortAlgorithm.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortAlgorithm.cs index dbfa10a..032b779 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortAlgorithm.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortAlgorithm.cs @@ -1,6 +1,4 @@ -using System; -using System.Linq; -using System.Text.RegularExpressions; +using FuzzySharp.Extensions; namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { @@ -8,8 +6,8 @@ public abstract class TokenSortScorerBase : StrategySensitiveScorerBase { public override int Score(string input1, string input2) { - var sorted1 = String.Join(" ", Regex.Split(input1, @"\s+").Where(s => s.Any()).OrderBy(s => s)).Trim(); - var sorted2 = String.Join(" ", Regex.Split(input2, @"\s+").Where(s => s.Any()).OrderBy(s => s)).Trim(); + var sorted1 = input1.NormalizeSpacesAndSort(); + var sorted2 = input2.NormalizeSpacesAndSort(); return Scorer(sorted1, sorted2); } diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs index 8983b4d..937518c 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Linq; -using FuzzySharp.Edits; namespace FuzzySharp.SimilarityRatio.Strategy.Generic { @@ -28,22 +27,22 @@ public static int Calculate(T[] input1, T[] input2) longer = input1; } - MatchingBlock[] matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); + var matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); - List scores = new List(); + var scores = new List(); foreach (var matchingBlock in matchingBlocks) { - int dist = matchingBlock.DestPos - matchingBlock.SourcePos; + var dist = matchingBlock.DestPos - matchingBlock.SourcePos; - int longStart = dist > 0 ? dist : 0; - int longEnd = longStart + shorter.Length; + var longStart = dist > 0 ? dist : 0; + var longEnd = longStart + shorter.Length; if (longEnd > longer.Length) longEnd = longer.Length; - var longSubstr = longer.AsSpan().Slice(longStart, longEnd - longStart); + var longSubstr = longer.AsSpan()[longStart..longEnd]; - double ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); + var ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); if (ratio > .995) { diff --git a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs index 1d25991..442ac3f 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Linq; -using FuzzySharp.Edits; namespace FuzzySharp.SimilarityRatio.Strategy { @@ -28,22 +27,22 @@ public static int Calculate(string input1, string input2) longer = input1; } - MatchingBlock[] matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); + var matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); - List scores = new List(); + var scores = new List(); foreach (var matchingBlock in matchingBlocks) { - int dist = matchingBlock.DestPos - matchingBlock.SourcePos; + var dist = matchingBlock.DestPos - matchingBlock.SourcePos; - int longStart = dist > 0 ? dist : 0; - int longEnd = longStart + shorter.Length; + var longStart = dist > 0 ? dist : 0; + var longEnd = longStart + shorter.Length; if (longEnd > longer.Length) longEnd = longer.Length; - string longSubstr = longer.Substring(longStart, longEnd - longStart); + var longSubstr = longer.AsSpan()[longStart..longEnd]; - double ratio = Levenshtein.GetRatio(shorter, longSubstr); + var ratio = Levenshtein.GetRatio(shorter, longSubstr); if (ratio > .995) { diff --git a/FuzzySharp/Utils/Heap.cs b/FuzzySharp/Utils/Heap.cs index b890982..7b1d1eb 100644 --- a/FuzzySharp/Utils/Heap.cs +++ b/FuzzySharp/Utils/Heap.cs @@ -11,13 +11,11 @@ public abstract class Heap : IEnumerable private const int GrowFactor = 2; private const int MinGrow = 1; - private int _capacity = InitialCapacity; private T[] _heap = new T[InitialCapacity]; - private int _tail = 0; - public int Count => _tail; + public int Count { get; private set; } - public int Capacity => _capacity; + public int Capacity { get; private set; } = InitialCapacity; protected Comparer Comparer { get; } protected abstract bool Dominates(T x, T y); @@ -46,10 +44,10 @@ protected Heap(IEnumerable collection, Comparer comparer) if (Count == Capacity) Grow(); - _heap[_tail++] = item; + _heap[Count++] = item; } - for (int i = Parent(_tail - 1); i >= 0; i--) + for (var i = Parent(Count - 1); i >= 0; i--) BubbleDown(i); } @@ -58,8 +56,8 @@ public void Add(T item) if (Count == Capacity) Grow(); - _heap[_tail++] = item; - BubbleUp(_tail - 1); + _heap[Count++] = item; + BubbleUp(Count - 1); } private void BubbleUp(int i) @@ -82,9 +80,9 @@ public T GetMin() public T ExtractDominating() { if (Count == 0) throw new InvalidOperationException("Heap is empty"); - T ret = _heap[0]; - _tail--; - Swap(_tail, 0); + var ret = _heap[0]; + Count--; + Swap(Count, 0); BubbleDown(0); return ret; } @@ -93,7 +91,7 @@ private void BubbleDown(int i) { while (true) { - int dominatingNode = Dominating(i); + var dominatingNode = Dominating(i); if (dominatingNode == i) return; Swap(i, dominatingNode); i = dominatingNode; @@ -102,7 +100,7 @@ private void BubbleDown(int i) private int Dominating(int i) { - int dominatingNode = i; + var dominatingNode = i; dominatingNode = GetDominating(YoungChild(i), dominatingNode); dominatingNode = GetDominating(OldChild(i), dominatingNode); @@ -111,17 +109,15 @@ private int Dominating(int i) private int GetDominating(int newNode, int dominatingNode) { - if (newNode < _tail && !Dominates(_heap[dominatingNode], _heap[newNode])) + if (newNode < Count && !Dominates(_heap[dominatingNode], _heap[newNode])) return newNode; - else - return dominatingNode; + + return dominatingNode; } private void Swap(int i, int j) { - T tmp = _heap[i]; - _heap[i] = _heap[j]; - _heap[j] = tmp; + (_heap[i], _heap[j]) = (_heap[j], _heap[i]); } private static int Parent(int i) @@ -141,11 +137,11 @@ private static int OldChild(int i) private void Grow() { - int newCapacity = _capacity * GrowFactor + MinGrow; + var newCapacity = Capacity * GrowFactor + MinGrow; var newHeap = new T[newCapacity]; - Array.Copy(_heap, newHeap, _capacity); + Array.Copy(_heap, newHeap, Capacity); _heap = newHeap; - _capacity = newCapacity; + Capacity = newCapacity; } public IEnumerator GetEnumerator() diff --git a/FuzzySharp/Utils/Permutation.cs b/FuzzySharp/Utils/Permutation.cs index e6c0976..d09e8e6 100644 --- a/FuzzySharp/Utils/Permutation.cs +++ b/FuzzySharp/Utils/Permutation.cs @@ -122,9 +122,7 @@ private static IEnumerable> Permute(List set, int start, int end) private static void Swap(List set, int a, int b) { - var temp = set[a]; - set[a] = set[b]; - set[b] = temp; + (set[a], set[b]) = (set[b], set[a]); } public static IEnumerable> Cycles(IEnumerable seed) From 33daf545b30af5bdf1953ce0c78d5b8a14ea6e12 Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Wed, 31 Jul 2024 17:28:01 +0200 Subject: [PATCH 03/20] code cleanup --- FuzzySharp/Extensions/StringExtensions.cs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/FuzzySharp/Extensions/StringExtensions.cs b/FuzzySharp/Extensions/StringExtensions.cs index 1ce26ad..4cf4463 100644 --- a/FuzzySharp/Extensions/StringExtensions.cs +++ b/FuzzySharp/Extensions/StringExtensions.cs @@ -14,22 +14,21 @@ public static List ExtractLetterOnlyWords(this string input) var span = input.AsSpan(); - int start = 0; + var start = 0; for (var i = 0; i < span.Length; i++) { - if (!char.IsLetter(span[i])) - { - if (i - start > 0) - { - result.Add(span.Slice(start, i - start).ToString()); - } + if (char.IsLetter(span[i])) continue; - start = i+1; + if (i - start > 0) + { + result.Add(span[start..i].ToString()); } + + start = i+1; } if (span.Length - start > 0) - result.Add(span.Slice(start, span.Length - start).ToString()); + result.Add(span[start..].ToString()); return result; } From f5073bd6335d50f00406127a53b88d0eec3aa477 Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Fri, 2 Aug 2024 13:20:35 +0200 Subject: [PATCH 04/20] further optimizations, code cleanup --- FuzzySharp.Test/FuzzySharp.Test.csproj | 2 +- FuzzySharp/Edits/MatchingBlock.cs | 7 ++----- FuzzySharp/Extensions/StringExtensions.cs | 2 +- .../TokenAbbreviation/TokenAbbreviationScorerBase.cs | 9 +++++---- .../TokenInitialism/TokenInitialismScorerBase.cs | 4 ++-- .../StrategySensitive/TokenSet/TokenSetScorerBase.cs | 6 +++--- 6 files changed, 14 insertions(+), 16 deletions(-) diff --git a/FuzzySharp.Test/FuzzySharp.Test.csproj b/FuzzySharp.Test/FuzzySharp.Test.csproj index 876916e..5f90e92 100644 --- a/FuzzySharp.Test/FuzzySharp.Test.csproj +++ b/FuzzySharp.Test/FuzzySharp.Test.csproj @@ -13,7 +13,7 @@ all runtime; build; native; contentfiles; analyzers; buildtransitive - + diff --git a/FuzzySharp/Edits/MatchingBlock.cs b/FuzzySharp/Edits/MatchingBlock.cs index 585b6ad..9f0e399 100644 --- a/FuzzySharp/Edits/MatchingBlock.cs +++ b/FuzzySharp/Edits/MatchingBlock.cs @@ -1,14 +1,11 @@ namespace FuzzySharp.Edits { - public class MatchingBlock + public sealed class MatchingBlock { public int SourcePos { get; set; } public int DestPos { get; set; } public int Length { get; set; } - public override string ToString() - { - return $"({SourcePos},{DestPos},{Length})"; - } + public override string ToString() => $"({SourcePos},{DestPos},{Length})"; } } diff --git a/FuzzySharp/Extensions/StringExtensions.cs b/FuzzySharp/Extensions/StringExtensions.cs index 4cf4463..b3ef27d 100644 --- a/FuzzySharp/Extensions/StringExtensions.cs +++ b/FuzzySharp/Extensions/StringExtensions.cs @@ -5,7 +5,7 @@ namespace FuzzySharp.Extensions { internal static class StringExtensions { - public static List ExtractLetterOnlyWords(this string input) + public static List ExtractTokens(this string input) { var result = new List(); diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs index 8b4739a..848bb13 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using System.Linq; using FuzzySharp.Extensions; using FuzzySharp.Utils; @@ -29,8 +30,8 @@ public override int Score(string input1, string input2) if (lenRatio < 1.5) return 0; // numbers can't be abbreviations for other numbers, though that would be hilarious. "Yes, 4 - as in 4,238" - var tokensLonger = longer.ExtractLetterOnlyWords(); - var tokensShorter = shorter.ExtractLetterOnlyWords(); + var tokensLonger = longer.ExtractTokens(); + var tokensShorter = shorter.ExtractTokens(); // more than 4 tokens and it's probably not an abbreviation (and could get costly) if (tokensShorter.Count > 4) @@ -80,7 +81,7 @@ public override int Score(string input1, string input2) /// /// /// - private bool StringContainsInOrder(string s1, string s2) + private static bool StringContainsInOrder(ReadOnlySpan s1, ReadOnlySpan s2) { if (s1.Length < s2.Length) return false; int s2_idx = 0; diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs index bbf6bb9..4dd3a6b 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs @@ -26,9 +26,9 @@ public override int Score(string input1, string input2) // if longer isn't at least 3 times longer than the other, then it's probably not an initialism if (lenRatio < 3) return 0; - var initials = longer.SplitByAnySpace().Select(s => s[0]); + var initials = longer.SplitByAnySpace().Select(s => s[0]).ToArray(); - return Scorer(string.Join("", initials), shorter); + return Scorer(new string(initials), shorter); } } } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs index 63c29ae..091d3bd 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs @@ -11,9 +11,9 @@ public override int Score(string input1, string input2) var tokens1 = new HashSet(input1.SplitByAnySpace()); var tokens2 = new HashSet(input2.SplitByAnySpace()); - var sortedIntersection = string.Join(" ", tokens1.Intersect(tokens2).OrderBy(s => s)).Trim(); - var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.Except(tokens2).OrderBy(s => s))).Trim(); - var sortedDiff2To1 = (sortedIntersection + " " + string.Join(" ", tokens2.Except(tokens1).OrderBy(s => s))).Trim(); + var sortedIntersection = string.Join(' ', tokens1.Intersect(tokens2).OrderBy(s => s)).Trim(); + var sortedDiff1To2 = (sortedIntersection + " " + string.Join(' ', tokens1.Except(tokens2).OrderBy(s => s))).Trim(); + var sortedDiff2To1 = (sortedIntersection + " " + string.Join(' ', tokens2.Except(tokens1).OrderBy(s => s))).Trim(); return new[] { From 48b4b45a15c16b6fbf43070ac2404c5df1b9586f Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Tue, 6 Aug 2024 10:24:38 +0200 Subject: [PATCH 05/20] Refactor and optimize codebase; update dependencies Updated .gitignore to include .vshistory/. Upgraded NUnit3TestAdapter to 4.6.0. Refactored TestScoringEmptyString method in RegressionTests.cs. Simplified string formatting in ExtractedResult.cs. Updated target framework to NET8.0 in FuzzySharp.csproj. Simplified conditional logic in Levenshtein.cs. Refactored StringPreprocessorFactory.cs for better string trimming and switch expression. Used null-coalescing assignment in Process.cs methods. Made several scorer classes sealed. Removed unnecessary using directive in TokenAbbreviationScorerBase.cs. Optimized scoring logic in TokenAbbreviationScorerBase.cs. Changed several strategy classes to static. Optimized score calculation in PartialRatioStrategy.cs. Simplified Heap constructor and added null checks. Removed redundant ToList calls and optimized permutation logic in Permutation.cs. Simplified Cycles method in Permutation.cs. --- .gitignore | 1 + FuzzySharp.Test/FuzzySharp.Test.csproj | 2 +- FuzzySharp.Test/FuzzyTests/RegressionTests.cs | 21 ++++------ FuzzySharp/Extractor/ExtractedResult.cs | 2 +- FuzzySharp/FuzzySharp.csproj | 2 +- FuzzySharp/Levenshtein.cs | 6 +-- .../PreProcess/StringPreprocessorFactory.cs | 15 +++---- FuzzySharp/Process.cs | 24 +++++------ .../Scorer/Composite/WeightedRatioScorer.cs | 2 +- .../Simple/DefaultRatioScorer.cs | 2 +- .../Simple/PartialRatioScorer.cs | 2 +- .../TokenAbbreviationScorerBase.cs | 12 ++++-- .../PartialTokenDifferenceScorer.cs | 2 +- .../TokenDifference/TokenDifferenceScorer.cs | 2 +- .../Strategy/DefaultRatioStrategy.cs | 2 +- .../Strategy/Generic/DefaultRatioStrategyT.cs | 2 +- .../Strategy/Generic/PartialRatioStrategyT.cs | 2 +- .../Strategy/PartialRatioStrategy.cs | 14 +++---- FuzzySharp/Utils/Heap.cs | 7 ++-- FuzzySharp/Utils/Permutation.cs | 41 +++++++++---------- 20 files changed, 78 insertions(+), 85 deletions(-) diff --git a/.gitignore b/.gitignore index 940794e..861efa5 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ bld/ # Visual Studio 2015 cache/options directory .vs/ +.vshistory/ # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ diff --git a/FuzzySharp.Test/FuzzySharp.Test.csproj b/FuzzySharp.Test/FuzzySharp.Test.csproj index 5f90e92..48959cd 100644 --- a/FuzzySharp.Test/FuzzySharp.Test.csproj +++ b/FuzzySharp.Test/FuzzySharp.Test.csproj @@ -9,7 +9,7 @@ - + all runtime; build; native; contentfiles; analyzers; buildtransitive diff --git a/FuzzySharp.Test/FuzzyTests/RegressionTests.cs b/FuzzySharp.Test/FuzzyTests/RegressionTests.cs index 134d2ee..9fea3f8 100644 --- a/FuzzySharp.Test/FuzzyTests/RegressionTests.cs +++ b/FuzzySharp.Test/FuzzyTests/RegressionTests.cs @@ -12,43 +12,37 @@ namespace FuzzySharp.Test.FuzzyTests public class RegressionTests { - /// /// Test to ensure that all IRatioScorer implementations handle scoring empty strings & whitespace strings /// [Test] public void TestScoringEmptyString() { - var scorerType = typeof(IRatioScorer); var assemblies = AppDomain.CurrentDomain.GetAssemblies().ToList(); var types = assemblies.SelectMany(s => { - Type[] types = new Type[] { }; ; try { - types = s.GetTypes(); + return s.GetTypes(); } catch {} - return types; + return []; }).ToList(); var scorerTypes = types.Where(t => scorerType.IsAssignableFrom(t) && !t.IsAbstract && t.IsClass).ToList(); - //var scorerTypes = AppDomain.CurrentDomain.GetAssemblies().SelectMany(s => s.GetTypes()).Where(p => scorerType.IsAssignableFrom(p) && p.IsClass && !p.IsAbstract); - - - MethodInfo getScorerCacheMethodInfo = typeof(ScorerCache).GetMethod("Get"); - + string nullString = null; //Null doesnt seem to be handled by any scorer string emptyString = ""; string whitespaceString = " "; - string[] nullOrWhitespaceStrings = { emptyString, whitespaceString }; + string[] nullOrWhitespaceStrings = [emptyString, whitespaceString]; + MethodInfo getScorerCacheMethodInfo = typeof(ScorerCache).GetMethod("Get"); - foreach (Type t in scorerTypes) + foreach (var t in scorerTypes) { System.Diagnostics.Debug.WriteLine($"Testing {t.Name}"); MethodInfo m = getScorerCacheMethodInfo.MakeGenericMethod(t); - IRatioScorer scorer = m.Invoke(this, new object[] { }) as IRatioScorer; + IRatioScorer scorer = m.Invoke(this, []) as IRatioScorer; foreach(string s in nullOrWhitespaceStrings) { @@ -79,7 +73,6 @@ public void TestScoringEmptyString() } - } } diff --git a/FuzzySharp/Extractor/ExtractedResult.cs b/FuzzySharp/Extractor/ExtractedResult.cs index 43f41e2..30aad9c 100644 --- a/FuzzySharp/Extractor/ExtractedResult.cs +++ b/FuzzySharp/Extractor/ExtractedResult.cs @@ -34,7 +34,7 @@ public override string ToString() { return $"(string: {Value}, score: {Score}, index: {Index})"; } - return $"(value: {Value.ToString()}, score: {Score}, index: {Index})"; + return $"(value: {Value}, score: {Score}, index: {Index})"; } } } diff --git a/FuzzySharp/FuzzySharp.csproj b/FuzzySharp/FuzzySharp.csproj index 74ee9b4..975af4b 100644 --- a/FuzzySharp/FuzzySharp.csproj +++ b/FuzzySharp/FuzzySharp.csproj @@ -1,7 +1,7 @@  - netstandard2.1 + NET8.0 true Jacob Bayer Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek diff --git a/FuzzySharp/Levenshtein.cs b/FuzzySharp/Levenshtein.cs index 63d89a9..a815be1 100644 --- a/FuzzySharp/Levenshtein.cs +++ b/FuzzySharp/Levenshtein.cs @@ -725,10 +725,8 @@ public static int EditDistance(ReadOnlySpan c1, ReadOnlySpan c2, int xc { return len2 + 1 - 2 * Memchr(c2, str2, c1[str1], len2); } - else - { - return len2 - Memchr(c2, str2, c1[str1], len2); - } + + return len2 - Memchr(c2, str2, c1[str1], len2); } len1++; diff --git a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs index eb67157..a454e5d 100644 --- a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs +++ b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs @@ -19,20 +19,17 @@ private static string Default(string input) result[i] = char.IsLetterOrDigit(c) ? char.ToLower(c) : ' '; } - return result.ToString().Trim(); + return result.Trim().ToString(); } public static Func GetPreprocessor(PreprocessMode mode) { - switch (mode) + return mode switch { - case PreprocessMode.Full: - return Default; - case PreprocessMode.None: - return s => s; - default: - throw new InvalidOperationException($"Invalid string preprocessor mode: {mode}"); - } + PreprocessMode.Full => Default, + PreprocessMode.None => s => s, + _ => throw new InvalidOperationException($"Invalid string preprocessor mode: {mode}") + }; } } } diff --git a/FuzzySharp/Process.cs b/FuzzySharp/Process.cs index dbc5caf..87717fe 100644 --- a/FuzzySharp/Process.cs +++ b/FuzzySharp/Process.cs @@ -31,8 +31,8 @@ public static IEnumerable> ExtractAll( IRatioScorer scorer = null, int cutoff = 0) { - if (processor == null) processor = s_defaultStringProcessor; - if (scorer == null) scorer = s_defaultScorer; + processor ??= s_defaultStringProcessor; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractWithoutOrder(query, choices, processor, scorer, cutoff); } @@ -53,7 +53,7 @@ public static IEnumerable> ExtractAll( IRatioScorer scorer = null, int cutoff = 0) { - if (scorer == null) scorer = s_defaultScorer; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractWithoutOrder(query, choices, processor, scorer, cutoff); } #endregion @@ -78,8 +78,8 @@ public static IEnumerable> ExtractTop( int limit = 5, int cutoff = 0) { - if (processor == null) processor = s_defaultStringProcessor; - if (scorer == null) scorer = s_defaultScorer; + processor ??= s_defaultStringProcessor; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractTop(query, choices, processor, scorer, limit, cutoff); } @@ -103,7 +103,7 @@ public static IEnumerable> ExtractTop( int limit = 5, int cutoff = 0) { - if (scorer == null) scorer = s_defaultScorer; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractTop(query, choices, processor, scorer, limit, cutoff); } #endregion @@ -125,8 +125,8 @@ public static IEnumerable> ExtractSorted( IRatioScorer scorer = null, int cutoff = 0) { - if (processor == null) processor = s_defaultStringProcessor; - if (scorer == null) scorer = s_defaultScorer; + processor ??= s_defaultStringProcessor; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractSorted(query, choices, processor, scorer, cutoff); } @@ -146,7 +146,7 @@ public static IEnumerable> ExtractSorted( IRatioScorer scorer = null, int cutoff = 0) { - if (scorer == null) scorer = s_defaultScorer; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractSorted(query, choices, processor, scorer, cutoff); } #endregion @@ -168,8 +168,8 @@ public static ExtractedResult ExtractOne( IRatioScorer scorer = null, int cutoff = 0) { - if (processor == null) processor = s_defaultStringProcessor; - if (scorer == null) scorer = s_defaultScorer; + processor ??= s_defaultStringProcessor; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractOne(query, choices, processor, scorer, cutoff); } @@ -189,7 +189,7 @@ public static ExtractedResult ExtractOne( IRatioScorer scorer = null, int cutoff = 0) { - if (scorer == null) scorer = s_defaultScorer; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractOne(query, choices, processor, scorer, cutoff); } diff --git a/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs index b3744ec..490decb 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.Composite { - public class WeightedRatioScorer : ScorerBase + public sealed class WeightedRatioScorer : ScorerBase { private static double UNBASE_SCALE = .95; private static double PARTIAL_SCALE = .90; diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs index 12ef6d1..e33dee9 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { - public class DefaultRatioScorer : SimpleRatioScorerBase + public sealed class DefaultRatioScorer : SimpleRatioScorerBase { protected override Func Scorer => DefaultRatioStrategy.Calculate; } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs index 049d8af..0065965 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { - public class PartialRatioScorer : SimpleRatioScorerBase + public sealed class PartialRatioScorer : SimpleRatioScorerBase { protected override Func Scorer => PartialRatioStrategy.Calculate; } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs index 848bb13..501e4d4 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; using FuzzySharp.Extensions; using FuzzySharp.Utils; @@ -55,7 +54,8 @@ public override int Score(string input1, string input2) var allPermutations = moreTokens.PermutationsOfSize(fewerTokens.Count); - List allScores = new List(); + int maxScore = 0; + foreach (var permutation in allPermutations) { double sum = 0; @@ -69,10 +69,14 @@ public override int Score(string input1, string input2) sum += score; } } - allScores.Add((int) (sum / fewerTokens.Count)); + var avgScore = (int) (sum / fewerTokens.Count); + if(avgScore > maxScore) + { + maxScore = avgScore; + } } - return allScores.Count==0?0:allScores.Max(); + return maxScore; } /// diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs index a216197..22cbed8 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { - public class PartialTokenDifferenceScorer : TokenDifferenceScorerBase + public sealed class PartialTokenDifferenceScorer : TokenDifferenceScorerBase { protected override Func Scorer => PartialRatioStrategy.Calculate; } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs index fc2bfb9..a9f59fd 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { - public class TokenDifferenceScorer : TokenDifferenceScorerBase + public sealed class TokenDifferenceScorer : TokenDifferenceScorerBase { protected override Func Scorer => DefaultRatioStrategy.Calculate; } diff --git a/FuzzySharp/SimilarityRatio/Strategy/DefaultRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/DefaultRatioStrategy.cs index 8e8fac2..72d6cd3 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/DefaultRatioStrategy.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/DefaultRatioStrategy.cs @@ -2,7 +2,7 @@ namespace FuzzySharp.SimilarityRatio.Strategy { - internal class DefaultRatioStrategy + internal static class DefaultRatioStrategy { public static int Calculate(string input1, string input2) { diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/DefaultRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/DefaultRatioStrategyT.cs index 2fdfb08..f6efd79 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/DefaultRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/DefaultRatioStrategyT.cs @@ -2,7 +2,7 @@ namespace FuzzySharp.SimilarityRatio.Strategy.Generic { - internal class DefaultRatioStrategy where T : IEquatable + internal static class DefaultRatioStrategy where T : IEquatable { public static int Calculate(T[] input1, T[] input2) { diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs index 937518c..24f0dd6 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs @@ -4,7 +4,7 @@ namespace FuzzySharp.SimilarityRatio.Strategy.Generic { - internal class PartialRatioStrategy where T : IEquatable + internal static class PartialRatioStrategy where T : IEquatable { public static int Calculate(T[] input1, T[] input2) { diff --git a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs index 442ac3f..4c7152f 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs @@ -1,10 +1,8 @@ using System; -using System.Collections.Generic; -using System.Linq; namespace FuzzySharp.SimilarityRatio.Strategy { - internal class PartialRatioStrategy + internal static class PartialRatioStrategy { public static int Calculate(string input1, string input2) { @@ -29,7 +27,7 @@ public static int Calculate(string input1, string input2) var matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); - var scores = new List(); + double maxScore = 0; foreach (var matchingBlock in matchingBlocks) { @@ -49,11 +47,13 @@ public static int Calculate(string input1, string input2) return 100; } - scores.Add(ratio); - + if (ratio > maxScore) + { + maxScore = ratio; + } } - return (int)Math.Round(100 * scores.Max()); + return (int)Math.Round(100 * maxScore); } } } diff --git a/FuzzySharp/Utils/Heap.cs b/FuzzySharp/Utils/Heap.cs index 7b1d1eb..88c4f06 100644 --- a/FuzzySharp/Utils/Heap.cs +++ b/FuzzySharp/Utils/Heap.cs @@ -24,7 +24,7 @@ protected Heap() : this(Comparer.Default) { } - protected Heap(Comparer comparer) : this(Enumerable.Empty(), comparer) + protected Heap(Comparer comparer) : this([], comparer) { } @@ -35,9 +35,10 @@ protected Heap(IEnumerable collection) protected Heap(IEnumerable collection, Comparer comparer) { - if (collection == null) throw new ArgumentNullException(nameof(collection)); + ArgumentNullException.ThrowIfNull(collection); + ArgumentNullException.ThrowIfNull(comparer); - Comparer = comparer ?? throw new ArgumentNullException(nameof(comparer)); + Comparer = comparer; foreach (var item in collection) { diff --git a/FuzzySharp/Utils/Permutation.cs b/FuzzySharp/Utils/Permutation.cs index d09e8e6..d28a748 100644 --- a/FuzzySharp/Utils/Permutation.cs +++ b/FuzzySharp/Utils/Permutation.cs @@ -15,7 +15,7 @@ public Permutor(IEnumerable set) public List PermutationAt(long i) { - var set = new List(_set.OrderBy(e => e).ToList()); + var set = new List(_set.OrderBy(e => e)); for (long j = 0; j < i - 1; j++) { NextPermutation(set); @@ -62,22 +62,22 @@ public bool NextPermutation(List set) public static class Permutation { - public static List> AllPermutations(this IEnumerable seed) + private static IEnumerable> AllPermutations(this IEnumerable seed) { var set = new List(seed); - return Permute(set, 0, set.Count - 1).ToList(); + return Permute(set, 0, set.Count - 1); } - public static List> PermutationsOfSize(this IEnumerable seed, int size) + public static IEnumerable> PermutationsOfSize(this List seed, int size) { - if (seed.Count() < size) - { - return new List>(); - } - return seed.PermutationsOfSize(new List(), size).ToList(); + var result = seed.Count < size + ? [] + : seed.PermutationsOfSize([], size); + + return result; } - private static IEnumerable> PermutationsOfSize(this IEnumerable seed, List set, int size) + private static IEnumerable> PermutationsOfSize(this List seed, List set, int size) { if (size == 0) { @@ -85,17 +85,16 @@ private static IEnumerable> PermutationsOfSize(this IEnumerable se { yield return permutation; } + + yield break; } - else + + for (int i = 0; i < seed.Count; i++) { - var seedAsList = seed.ToList(); - for (int i = 0; i < seedAsList.Count; i++) + var newSet = new List(set) { seed[i] }; + foreach (var permutation in seed.Skip(i + 1).ToList().PermutationsOfSize(newSet, size - 1)) { - var newSet = new List(set) { seedAsList[i] }; - foreach (var permutation in seedAsList.Skip(i + 1).PermutationsOfSize(newSet, size - 1)) - { - yield return permutation; - } + yield return permutation; } } } @@ -104,7 +103,7 @@ private static IEnumerable> Permute(List set, int start, int end) { if (start == end) { - yield return new List(set); + yield return [..set]; } else { @@ -130,8 +129,8 @@ public static IEnumerable> Cycles(IEnumerable seed) var set = new LinkedList(seed); for (int i = 0; i < set.Count; i++) { - yield return new List(set); - var top = set.First(); + yield return [..set]; + var top = set.First!; set.RemoveFirst(); set.AddLast(top); } From 1b39fb9e1175e280fbed4d86ae732a3676ef0836 Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Tue, 6 Aug 2024 13:59:29 +0200 Subject: [PATCH 06/20] target lib to netstandard2.0-2.1 target tests to netcore3.1,net8.0,netframework4.7.2 --- FuzzySharp.Test/FuzzySharp.Test.csproj | 4 ++-- FuzzySharp/Extensions/StringExtensions.cs | 4 ++-- FuzzySharp/FuzzySharp.csproj | 5 ++++- FuzzySharp/PreProcess/StringPreprocessorFactory.cs | 2 +- .../TokenAbbreviation/TokenAbbreviationScorerBase.cs | 2 +- .../StrategySensitive/TokenSet/TokenSetScorerBase.cs | 6 +++--- .../SimilarityRatio/Strategy/PartialRatioStrategy.cs | 2 +- FuzzySharp/Utils/Heap.cs | 7 ++----- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/FuzzySharp.Test/FuzzySharp.Test.csproj b/FuzzySharp.Test/FuzzySharp.Test.csproj index 48959cd..4d977ff 100644 --- a/FuzzySharp.Test/FuzzySharp.Test.csproj +++ b/FuzzySharp.Test/FuzzySharp.Test.csproj @@ -1,9 +1,9 @@ - NET8.0 - + NET8.0;netcoreapp3.1;netframework4.7.2 false + 12.0 diff --git a/FuzzySharp/Extensions/StringExtensions.cs b/FuzzySharp/Extensions/StringExtensions.cs index b3ef27d..19e0b5c 100644 --- a/FuzzySharp/Extensions/StringExtensions.cs +++ b/FuzzySharp/Extensions/StringExtensions.cs @@ -36,7 +36,7 @@ public static List ExtractTokens(this string input) public static string[] SplitByAnySpace(this string input) { if (string.IsNullOrWhiteSpace(input)) - return Array.Empty(); + return []; var words = input.Split(Array.Empty(), StringSplitOptions.RemoveEmptyEntries); @@ -56,7 +56,7 @@ public static string NormalizeSpacesAndSort(this string input) { var words = GetSortedWords(input); - return string.Join(' ', words); + return string.Join(" ", words); } } } diff --git a/FuzzySharp/FuzzySharp.csproj b/FuzzySharp/FuzzySharp.csproj index 975af4b..2265806 100644 --- a/FuzzySharp/FuzzySharp.csproj +++ b/FuzzySharp/FuzzySharp.csproj @@ -1,7 +1,7 @@  - NET8.0 + netstandard2.0;netstandard2.1 true Jacob Bayer Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek @@ -21,9 +21,12 @@ true true snupkg + 12.0 + + diff --git a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs index a454e5d..3560e0e 100644 --- a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs +++ b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs @@ -19,7 +19,7 @@ private static string Default(string input) result[i] = char.IsLetterOrDigit(c) ? char.ToLower(c) : ' '; } - return result.Trim().ToString(); + return ((ReadOnlySpan)result).Trim().ToString(); } public static Func GetPreprocessor(PreprocessMode mode) diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs index 501e4d4..8e5a9b1 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs @@ -63,7 +63,7 @@ public override int Score(string input1, string input2) { var i1 = permutation[i]; var i2 = fewerTokens[i]; - if (StringContainsInOrder(i1, i2)) // must be at least twice as long + if (StringContainsInOrder(i1.AsSpan(), i2.AsSpan())) // must be at least twice as long { var score = Scorer(i1, i2); sum += score; diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs index 091d3bd..63c29ae 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs @@ -11,9 +11,9 @@ public override int Score(string input1, string input2) var tokens1 = new HashSet(input1.SplitByAnySpace()); var tokens2 = new HashSet(input2.SplitByAnySpace()); - var sortedIntersection = string.Join(' ', tokens1.Intersect(tokens2).OrderBy(s => s)).Trim(); - var sortedDiff1To2 = (sortedIntersection + " " + string.Join(' ', tokens1.Except(tokens2).OrderBy(s => s))).Trim(); - var sortedDiff2To1 = (sortedIntersection + " " + string.Join(' ', tokens2.Except(tokens1).OrderBy(s => s))).Trim(); + var sortedIntersection = string.Join(" ", tokens1.Intersect(tokens2).OrderBy(s => s)).Trim(); + var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.Except(tokens2).OrderBy(s => s))).Trim(); + var sortedDiff2To1 = (sortedIntersection + " " + string.Join(" ", tokens2.Except(tokens1).OrderBy(s => s))).Trim(); return new[] { diff --git a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs index 4c7152f..ce3b02d 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs @@ -40,7 +40,7 @@ public static int Calculate(string input1, string input2) var longSubstr = longer.AsSpan()[longStart..longEnd]; - var ratio = Levenshtein.GetRatio(shorter, longSubstr); + var ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); if (ratio > .995) { diff --git a/FuzzySharp/Utils/Heap.cs b/FuzzySharp/Utils/Heap.cs index 88c4f06..bcb937a 100644 --- a/FuzzySharp/Utils/Heap.cs +++ b/FuzzySharp/Utils/Heap.cs @@ -35,12 +35,9 @@ protected Heap(IEnumerable collection) protected Heap(IEnumerable collection, Comparer comparer) { - ArgumentNullException.ThrowIfNull(collection); - ArgumentNullException.ThrowIfNull(comparer); + Comparer = comparer ?? throw new ArgumentNullException(nameof(comparer)); - Comparer = comparer; - - foreach (var item in collection) + foreach (var item in collection ?? throw new ArgumentNullException(nameof(collection))) { if (Count == Capacity) Grow(); From 0b4f383d464692da693100f53c66f7bb36557b61 Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Wed, 7 Aug 2024 08:57:27 +0200 Subject: [PATCH 07/20] simplify test --- .../EvaluationTests/EvaluationTests.cs | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/FuzzySharp.Test/EvaluationTests/EvaluationTests.cs b/FuzzySharp.Test/EvaluationTests/EvaluationTests.cs index eb22945..1ff2b5f 100644 --- a/FuzzySharp.Test/EvaluationTests/EvaluationTests.cs +++ b/FuzzySharp.Test/EvaluationTests/EvaluationTests.cs @@ -36,20 +36,20 @@ public void Evaluate() - var h1 = Process.ExtractOne("cowboys", new[] { "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys" }); - var h2 = string.Join(", ", Process.ExtractTop("goolge", new[] { "google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" }, limit: 3)); - var h3 = string.Join(", ", Process.ExtractAll("goolge", new [] {"google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" })); - var h4 = string.Join(", ", Process.ExtractAll("goolge", new[] { "google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" }, cutoff: 40)); - var h5 = string.Join(", ", Process.ExtractSorted("goolge", new [] {"google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" })); - - var i1 = Process.ExtractOne("cowboys", new[] { "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys" }, s => s, ScorerCache.Get()); - - var events = new[] - { - new[] { "chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm" }, - new[] { "new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm" }, - new[] { "atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm" }, - }; + var h1 = Process.ExtractOne("cowboys", ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]); + var h2 = string.Join(", ", Process.ExtractTop("goolge", ["google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl"], limit: 3)); + var h3 = string.Join(", ", Process.ExtractAll("goolge", ["google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl"])); + var h4 = string.Join(", ", Process.ExtractAll("goolge", ["google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl"], cutoff: 40)); + var h5 = string.Join(", ", Process.ExtractSorted("goolge", ["google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl"])); + + var i1 = Process.ExtractOne("cowboys", ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"], s => s, ScorerCache.Get()); + + string[][] events = + [ + ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"], + ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"], + ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"] + ]; var query = new[] { "new york mets vs chicago cubs", "CitiField", "2017-03-19", "8pm" }; var best = Process.ExtractOne(query, events, strings => strings[0]); From bedb9b1f4c10136d970eb706addaaf64dd7c79d1 Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Thu, 8 Aug 2024 17:16:51 +0200 Subject: [PATCH 08/20] more optimization + benchmark --- FuzzySharp.Benchmarks/BenchmarkAll.cs | 92 +++++++++++++++++++ .../FuzzySharp.Benchmarks.csproj | 18 ++++ FuzzySharp.Benchmarks/Program.cs | 6 ++ FuzzySharp.sln | 12 ++- .../Scorer/Composite/WeightedRatioScorer.cs | 13 +-- .../TokenSet/TokenSetScorerBase.cs | 25 +++-- 6 files changed, 144 insertions(+), 22 deletions(-) create mode 100644 FuzzySharp.Benchmarks/BenchmarkAll.cs create mode 100644 FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj create mode 100644 FuzzySharp.Benchmarks/Program.cs diff --git a/FuzzySharp.Benchmarks/BenchmarkAll.cs b/FuzzySharp.Benchmarks/BenchmarkAll.cs new file mode 100644 index 0000000..45320fe --- /dev/null +++ b/FuzzySharp.Benchmarks/BenchmarkAll.cs @@ -0,0 +1,92 @@ +using BenchmarkDotNet.Attributes; +using FuzzySharp.PreProcess; + +namespace FuzzySharp.Benchmarks; + +[MemoryDiagnoser] +public class BenchmarkAll +{ + [Benchmark] + public int Ratio1() + { + return Fuzz.Ratio("mysmilarstring", "myawfullysimilarstirng"); + } + + [Benchmark] + public int Ratio2() + { + return Fuzz.Ratio("mysmilarstring", "mysimilarstring"); + } + + [Benchmark] + public int PartialRatio() + { + return Fuzz.PartialRatio("similar", "somewhresimlrbetweenthisstring"); + } + + [Benchmark] + public int TokenSortRatio() + { + return Fuzz.TokenSortRatio("order words out of", " words out of order"); + } + + [Benchmark] + public int PartialTokenSortRatio() + { + return Fuzz.PartialTokenSortRatio("order words out of", " words out of order"); + } + + [Benchmark] + public int TokenSetRatio() + { + return Fuzz.TokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"); + } + + [Benchmark] + public int PartialTokenSetRatio() + { + return Fuzz.PartialTokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"); + } + + [Benchmark] + public int WeightedRatio() + { + return Fuzz.WeightedRatio("The quick brown fox jimps ofver the small lazy dog", "the quick brown fox jumps over the small lazy dog"); + } + + [Benchmark] + public int TokenInitialismRatio1() + { + return Fuzz.TokenInitialismRatio("NASA", "National Aeronautics and Space Administration"); + } + + [Benchmark] + public int TokenInitialismRatio2() + { + return Fuzz.TokenInitialismRatio("NASA", "National Aeronautics Space Administration"); + } + + [Benchmark] + public int TokenInitialismRatio3() + { + return Fuzz.TokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape Canaveral, Florida 32899"); + } + + [Benchmark] + public int PartialTokenInitialismRatio() + { + return Fuzz.PartialTokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape Canaveral, Florida 32899"); + } + + [Benchmark] + public int TokenAbbreviationRatio() + { + return Fuzz.TokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full); + } + + [Benchmark] + public int PartialTokenAbbreviationRatio() + { + return Fuzz.PartialTokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full); + } +} \ No newline at end of file diff --git a/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj b/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj new file mode 100644 index 0000000..763a3ce --- /dev/null +++ b/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj @@ -0,0 +1,18 @@ + + + + Exe + net8.0 + enable + enable + + + + + + + + + + + diff --git a/FuzzySharp.Benchmarks/Program.cs b/FuzzySharp.Benchmarks/Program.cs new file mode 100644 index 0000000..6f75f6a --- /dev/null +++ b/FuzzySharp.Benchmarks/Program.cs @@ -0,0 +1,6 @@ +using BenchmarkDotNet.Running; +using FuzzySharp; + +BenchmarkRunner.Run(typeof(Program).Assembly); + +//Console.WriteLine(Fuzz.WeightedRatio("The quick brown fox jimps ofver the small lazy dog", "the quick brown fox jumps over the small lazy dog")); \ No newline at end of file diff --git a/FuzzySharp.sln b/FuzzySharp.sln index 78eed94..168d5a0 100644 --- a/FuzzySharp.sln +++ b/FuzzySharp.sln @@ -1,11 +1,13 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.29806.167 +# Visual Studio Version 17 +VisualStudioVersion = 17.10.35122.118 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FuzzySharp", "FuzzySharp\FuzzySharp.csproj", "{348B90DA-DA44-45AD-B857-D3A69D05AE46}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FuzzySharp.Test", "FuzzySharp.Test\FuzzySharp.Test.csproj", "{48F4C7CB-E669-410C-A455-DE3330347807}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FuzzySharp.Test", "FuzzySharp.Test\FuzzySharp.Test.csproj", "{48F4C7CB-E669-410C-A455-DE3330347807}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FuzzySharp.Benchmarks", "FuzzySharp.Benchmarks\FuzzySharp.Benchmarks.csproj", "{480CAE39-ACA7-411A-BF6B-72E61ED6E129}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -21,6 +23,10 @@ Global {48F4C7CB-E669-410C-A455-DE3330347807}.Debug|Any CPU.Build.0 = Debug|Any CPU {48F4C7CB-E669-410C-A455-DE3330347807}.Release|Any CPU.ActiveCfg = Release|Any CPU {48F4C7CB-E669-410C-A455-DE3330347807}.Release|Any CPU.Build.0 = Release|Any CPU + {480CAE39-ACA7-411A-BF6B-72E61ED6E129}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {480CAE39-ACA7-411A-BF6B-72E61ED6E129}.Debug|Any CPU.Build.0 = Debug|Any CPU + {480CAE39-ACA7-411A-BF6B-72E61ED6E129}.Release|Any CPU.ActiveCfg = Release|Any CPU + {480CAE39-ACA7-411A-BF6B-72E61ED6E129}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs index 490decb..e423422 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs @@ -1,5 +1,4 @@ using System; -using System.Linq; namespace FuzzySharp.SimilarityRatio.Scorer.Composite { @@ -38,16 +37,12 @@ public override int Score(string input1, string input2) double partialSor = Fuzz.TokenSortRatio(input1, input2) * unbaseScale * partialScale; double partialSet = Fuzz.TokenSetRatio(input1, input2) * unbaseScale * partialScale; - return (int) Math.Round(new[] { baseRatio, partial, partialSor, partialSet }.Max()); + return (int) Math.Round(Math.Max(baseRatio, Math.Max(partial, Math.Max(partialSor, partialSet)))); } - else - { - double tokenSort = Fuzz.TokenSortRatio(input1, input2) * unbaseScale; - double tokenSet = Fuzz.TokenSetRatio(input1, input2) * unbaseScale; - return (int) Math.Round(new[] { baseRatio, tokenSort, tokenSet }.Max()); - } + double tokenSort = Fuzz.TokenSortRatio(input1, input2) * unbaseScale; + double tokenSet = Fuzz.TokenSetRatio(input1, input2) * unbaseScale; + return (int) Math.Round(Math.Max(baseRatio, Math.Max(tokenSort, tokenSet))); } - } } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs index 63c29ae..7be4f76 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using System.Linq; using FuzzySharp.Extensions; @@ -9,18 +10,22 @@ public abstract class TokenSetScorerBase : StrategySensitiveScorerBase public override int Score(string input1, string input2) { var tokens1 = new HashSet(input1.SplitByAnySpace()); + var tokens1Copy = new HashSet(tokens1); var tokens2 = new HashSet(input2.SplitByAnySpace()); - var sortedIntersection = string.Join(" ", tokens1.Intersect(tokens2).OrderBy(s => s)).Trim(); - var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.Except(tokens2).OrderBy(s => s))).Trim(); - var sortedDiff2To1 = (sortedIntersection + " " + string.Join(" ", tokens2.Except(tokens1).OrderBy(s => s))).Trim(); + tokens1Copy.IntersectWith(tokens2); + tokens1.ExceptWith(tokens1Copy); + tokens2.ExceptWith(tokens1Copy); - return new[] - { - Scorer(sortedIntersection, sortedDiff1To2), - Scorer(sortedIntersection, sortedDiff2To1), - Scorer(sortedDiff1To2, sortedDiff2To1) - }.Max(); + var sortedIntersection = string.Join(" ", tokens1Copy.OrderBy(s => s)); + var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.OrderBy(s => s))); + var sortedDiff2To1 = (sortedIntersection + " " + string.Join(" ", tokens2.OrderBy(s => s))); + + var score1 = Scorer(sortedIntersection, sortedDiff1To2); + var score2 = Scorer(sortedIntersection, sortedDiff2To1); + var score3 = Scorer(sortedDiff1To2, sortedDiff2To1); + + return Math.Max(score1, Math.Max(score2, score3)); } } } From 605d700b03026f3e1bfc2e61e9c139e45660bc81 Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Fri, 9 Aug 2024 11:43:54 +0200 Subject: [PATCH 09/20] formatting --- .../TokenSet/TokenSetScorerBase.cs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs index 7be4f76..8b659c3 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs @@ -10,16 +10,17 @@ public abstract class TokenSetScorerBase : StrategySensitiveScorerBase public override int Score(string input1, string input2) { var tokens1 = new HashSet(input1.SplitByAnySpace()); - var tokens1Copy = new HashSet(tokens1); var tokens2 = new HashSet(input2.SplitByAnySpace()); - tokens1Copy.IntersectWith(tokens2); - tokens1.ExceptWith(tokens1Copy); - tokens2.ExceptWith(tokens1Copy); + var intersection = new HashSet(tokens1); + intersection.IntersectWith(tokens2); - var sortedIntersection = string.Join(" ", tokens1Copy.OrderBy(s => s)); - var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.OrderBy(s => s))); - var sortedDiff2To1 = (sortedIntersection + " " + string.Join(" ", tokens2.OrderBy(s => s))); + tokens1.ExceptWith(intersection); + tokens2.ExceptWith(intersection); + + var sortedIntersection = string.Join(" ", intersection.OrderBy(s => s)); + var sortedDiff1To2 = sortedIntersection + " " + string.Join(" ", tokens1.OrderBy(s => s)); + var sortedDiff2To1 = sortedIntersection + " " + string.Join(" ", tokens2.OrderBy(s => s)); var score1 = Scorer(sortedIntersection, sortedDiff1To2); var score2 = Scorer(sortedIntersection, sortedDiff2To1); From accca0ac4ecd7af08198e2c07c90c8c25da4911a Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sat, 10 Aug 2024 18:18:12 +0200 Subject: [PATCH 10/20] revert some frameworks, fix unit tests --- FuzzySharp/Extensions/StringExtensions.cs | 7 ++++- FuzzySharp/FuzzySharp.csproj | 27 ++++++++++++------- .../TokenSet/TokenSetScorerBase.cs | 4 +-- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/FuzzySharp/Extensions/StringExtensions.cs b/FuzzySharp/Extensions/StringExtensions.cs index 19e0b5c..f05209a 100644 --- a/FuzzySharp/Extensions/StringExtensions.cs +++ b/FuzzySharp/Extensions/StringExtensions.cs @@ -38,7 +38,7 @@ public static string[] SplitByAnySpace(this string input) if (string.IsNullOrWhiteSpace(input)) return []; - var words = input.Split(Array.Empty(), StringSplitOptions.RemoveEmptyEntries); + var words = input.Split(EmptyArray(), StringSplitOptions.RemoveEmptyEntries); return words; } @@ -58,5 +58,10 @@ public static string NormalizeSpacesAndSort(this string input) return string.Join(" ", words); } + + private static T[] EmptyArray() + { + return []; + } } } diff --git a/FuzzySharp/FuzzySharp.csproj b/FuzzySharp/FuzzySharp.csproj index 2265806..d4378df 100644 --- a/FuzzySharp/FuzzySharp.csproj +++ b/FuzzySharp/FuzzySharp.csproj @@ -1,7 +1,7 @@  - netstandard2.0;netstandard2.1 + netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net46;net461;net472;net48;net6;net8 true Jacob Bayer Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek @@ -11,12 +11,12 @@ MIT git - 2.0.2 + 2.0.3 Include source link true https://github.com/JakeBayer/FuzzySharp - 1.0.4.0 - 1.0.4.0 + 1.0.5.0 + 1.0.5.0 true true @@ -24,15 +24,22 @@ 12.0 + + + + - - - - - - + + + diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs index 8b659c3..744bcc8 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs @@ -19,8 +19,8 @@ public override int Score(string input1, string input2) tokens2.ExceptWith(intersection); var sortedIntersection = string.Join(" ", intersection.OrderBy(s => s)); - var sortedDiff1To2 = sortedIntersection + " " + string.Join(" ", tokens1.OrderBy(s => s)); - var sortedDiff2To1 = sortedIntersection + " " + string.Join(" ", tokens2.OrderBy(s => s)); + var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.OrderBy(s => s))).Trim(); + var sortedDiff2To1 = (sortedIntersection + " " + string.Join(" ", tokens2.OrderBy(s => s))).Trim(); var score1 = Scorer(sortedIntersection, sortedDiff1To2); var score2 = Scorer(sortedIntersection, sortedDiff2To1); From e299031bfad528249c34dd00d511afb2e413a72a Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sat, 10 Aug 2024 21:29:18 +0200 Subject: [PATCH 11/20] little speedup --- FuzzySharp/FuzzySharp.csproj | 2 +- .../TokenSet/TokenSetScorerBase.cs | 22 ++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/FuzzySharp/FuzzySharp.csproj b/FuzzySharp/FuzzySharp.csproj index d4378df..b4fd89a 100644 --- a/FuzzySharp/FuzzySharp.csproj +++ b/FuzzySharp/FuzzySharp.csproj @@ -1,7 +1,7 @@  - netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net46;net461;net472;net48;net6;net8 + netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net46;net461;net472;net48;NET60;NET80 true Jacob Bayer Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs index 744bcc8..af61e86 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs @@ -12,11 +12,7 @@ public override int Score(string input1, string input2) var tokens1 = new HashSet(input1.SplitByAnySpace()); var tokens2 = new HashSet(input2.SplitByAnySpace()); - var intersection = new HashSet(tokens1); - intersection.IntersectWith(tokens2); - - tokens1.ExceptWith(intersection); - tokens2.ExceptWith(intersection); + var intersection = GetIntersectionAndExcept(tokens1, tokens2); var sortedIntersection = string.Join(" ", intersection.OrderBy(s => s)); var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.OrderBy(s => s))).Trim(); @@ -28,5 +24,21 @@ public override int Score(string input1, string input2) return Math.Max(score1, Math.Max(score2, score3)); } + + private static List GetIntersectionAndExcept(HashSet first, HashSet second) + { + List intersection = []; + + foreach (var item in first.ToArray()) + { + if (second.Remove(item)) + { + first.Remove(item); + intersection.Add(item); + } + } + + return intersection; + } } } From 4f0722812a27e65714748809eadedaef611c53b2 Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sat, 10 Aug 2024 22:01:00 +0200 Subject: [PATCH 12/20] remove linq.max --- .../Strategy/Generic/PartialRatioStrategyT.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs index 24f0dd6..09c05e8 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs @@ -1,6 +1,4 @@ using System; -using System.Collections.Generic; -using System.Linq; namespace FuzzySharp.SimilarityRatio.Strategy.Generic { @@ -29,7 +27,7 @@ public static int Calculate(T[] input1, T[] input2) var matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); - var scores = new List(); + double maxScore = 0; foreach (var matchingBlock in matchingBlocks) { @@ -49,11 +47,13 @@ public static int Calculate(T[] input1, T[] input2) return 100; } - scores.Add(ratio); - + if (ratio > maxScore) + { + maxScore = ratio; + } } - return (int)Math.Round(100 * scores.Max()); + return (int)Math.Round(100 * maxScore); } } } From 51e27ce830b373b33b749ff5ee54a8c881441753 Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sat, 10 Aug 2024 22:06:54 +0200 Subject: [PATCH 13/20] revert sealed classes back --- FuzzySharp/Edits/MatchingBlock.cs | 2 +- .../SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs | 2 +- .../Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs | 2 +- .../Scorer/StrategySensitive/Simple/PartialRatioScorer.cs | 2 +- .../TokenDifference/PartialTokenDifferenceScorer.cs | 2 +- .../StrategySensitive/TokenDifference/TokenDifferenceScorer.cs | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/FuzzySharp/Edits/MatchingBlock.cs b/FuzzySharp/Edits/MatchingBlock.cs index 9f0e399..16ea018 100644 --- a/FuzzySharp/Edits/MatchingBlock.cs +++ b/FuzzySharp/Edits/MatchingBlock.cs @@ -1,6 +1,6 @@ namespace FuzzySharp.Edits { - public sealed class MatchingBlock + public class MatchingBlock { public int SourcePos { get; set; } public int DestPos { get; set; } diff --git a/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs index e423422..2700f15 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs @@ -2,7 +2,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.Composite { - public sealed class WeightedRatioScorer : ScorerBase + public class WeightedRatioScorer : ScorerBase { private static double UNBASE_SCALE = .95; private static double PARTIAL_SCALE = .90; diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs index e33dee9..12ef6d1 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { - public sealed class DefaultRatioScorer : SimpleRatioScorerBase + public class DefaultRatioScorer : SimpleRatioScorerBase { protected override Func Scorer => DefaultRatioStrategy.Calculate; } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs index 0065965..049d8af 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { - public sealed class PartialRatioScorer : SimpleRatioScorerBase + public class PartialRatioScorer : SimpleRatioScorerBase { protected override Func Scorer => PartialRatioStrategy.Calculate; } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs index 22cbed8..a216197 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { - public sealed class PartialTokenDifferenceScorer : TokenDifferenceScorerBase + public class PartialTokenDifferenceScorer : TokenDifferenceScorerBase { protected override Func Scorer => PartialRatioStrategy.Calculate; } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs index a9f59fd..fc2bfb9 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs @@ -3,7 +3,7 @@ namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { - public sealed class TokenDifferenceScorer : TokenDifferenceScorerBase + public class TokenDifferenceScorer : TokenDifferenceScorerBase { protected override Func Scorer => DefaultRatioStrategy.Calculate; } From b813c58c5e302eb11c315ca1353405dd44945a58 Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sat, 10 Aug 2024 22:20:27 +0200 Subject: [PATCH 14/20] remove unnecessary changes --- .../Strategy/Generic/PartialRatioStrategyT.cs | 11 ++++++----- .../SimilarityRatio/Strategy/PartialRatioStrategy.cs | 11 ++++++----- FuzzySharp/Utils/Heap.cs | 11 ++++++----- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs index 09c05e8..2f35fce 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs @@ -1,4 +1,5 @@ using System; +using FuzzySharp.Edits; namespace FuzzySharp.SimilarityRatio.Strategy.Generic { @@ -25,22 +26,22 @@ public static int Calculate(T[] input1, T[] input2) longer = input1; } - var matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); + MatchingBlock[] matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); double maxScore = 0; foreach (var matchingBlock in matchingBlocks) { - var dist = matchingBlock.DestPos - matchingBlock.SourcePos; + int dist = matchingBlock.DestPos - matchingBlock.SourcePos; - var longStart = dist > 0 ? dist : 0; - var longEnd = longStart + shorter.Length; + int longStart = dist > 0 ? dist : 0; + int longEnd = longStart + shorter.Length; if (longEnd > longer.Length) longEnd = longer.Length; var longSubstr = longer.AsSpan()[longStart..longEnd]; - var ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); + double ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); if (ratio > .995) { diff --git a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs index ce3b02d..fd95800 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs @@ -1,4 +1,5 @@ using System; +using FuzzySharp.Edits; namespace FuzzySharp.SimilarityRatio.Strategy { @@ -25,22 +26,22 @@ public static int Calculate(string input1, string input2) longer = input1; } - var matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); + MatchingBlock[] matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); double maxScore = 0; foreach (var matchingBlock in matchingBlocks) { - var dist = matchingBlock.DestPos - matchingBlock.SourcePos; + int dist = matchingBlock.DestPos - matchingBlock.SourcePos; - var longStart = dist > 0 ? dist : 0; - var longEnd = longStart + shorter.Length; + int longStart = dist > 0 ? dist : 0; + int longEnd = longStart + shorter.Length; if (longEnd > longer.Length) longEnd = longer.Length; var longSubstr = longer.AsSpan()[longStart..longEnd]; - var ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); + double ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); if (ratio > .995) { diff --git a/FuzzySharp/Utils/Heap.cs b/FuzzySharp/Utils/Heap.cs index bcb937a..a732edb 100644 --- a/FuzzySharp/Utils/Heap.cs +++ b/FuzzySharp/Utils/Heap.cs @@ -36,8 +36,9 @@ protected Heap(IEnumerable collection) protected Heap(IEnumerable collection, Comparer comparer) { Comparer = comparer ?? throw new ArgumentNullException(nameof(comparer)); + _ = collection ?? throw new ArgumentNullException(nameof(collection)); - foreach (var item in collection ?? throw new ArgumentNullException(nameof(collection))) + foreach (var item in collection) { if (Count == Capacity) Grow(); @@ -45,7 +46,7 @@ protected Heap(IEnumerable collection, Comparer comparer) _heap[Count++] = item; } - for (var i = Parent(Count - 1); i >= 0; i--) + for (int i = Parent(Count - 1); i >= 0; i--) BubbleDown(i); } @@ -78,7 +79,7 @@ public T GetMin() public T ExtractDominating() { if (Count == 0) throw new InvalidOperationException("Heap is empty"); - var ret = _heap[0]; + T ret = _heap[0]; Count--; Swap(Count, 0); BubbleDown(0); @@ -98,7 +99,7 @@ private void BubbleDown(int i) private int Dominating(int i) { - var dominatingNode = i; + int dominatingNode = i; dominatingNode = GetDominating(YoungChild(i), dominatingNode); dominatingNode = GetDominating(OldChild(i), dominatingNode); @@ -135,7 +136,7 @@ private static int OldChild(int i) private void Grow() { - var newCapacity = Capacity * GrowFactor + MinGrow; + int newCapacity = Capacity * GrowFactor + MinGrow; var newHeap = new T[newCapacity]; Array.Copy(_heap, newHeap, Capacity); _heap = newHeap; From a83f39abce3fba1b195b7ba7c79ebc7ec266d6d0 Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sat, 10 Aug 2024 22:43:38 +0200 Subject: [PATCH 15/20] remove unnecessary vshistory from gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 861efa5..940794e 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,6 @@ bld/ # Visual Studio 2015 cache/options directory .vs/ -.vshistory/ # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ From 4fb074a0e27b423f8b14b6874f03bcc454e23b53 Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sat, 10 Aug 2024 23:26:14 +0200 Subject: [PATCH 16/20] remove unnecessary conversion to span --- FuzzySharp/Levenshtein.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FuzzySharp/Levenshtein.cs b/FuzzySharp/Levenshtein.cs index a815be1..9315b6f 100644 --- a/FuzzySharp/Levenshtein.cs +++ b/FuzzySharp/Levenshtein.cs @@ -49,7 +49,7 @@ private static EditOp[] GetEditOps(int len1, ReadOnlySpan c1, int len2, Re len1++; len2++; - Span matrix = new int[len2 * len1]; + int[] matrix = new int[len2 * len1]; for (i = 0; i < len2; i++) matrix[i] = i; @@ -100,7 +100,7 @@ private static EditOp[] GetEditOps(int len1, ReadOnlySpan c1, int len2, Re private static EditOp[] EditOpsFromCostMatrix(int len1, ReadOnlySpan c1, int p1, int o1, int len2, ReadOnlySpan c2, int p2, int o2, - Span matrix) + int[] matrix) where T: IEquatable { From f43f4ee7f369b20be0c5ec4620baed6e91a20d97 Mon Sep 17 00:00:00 2001 From: Yevhen Cherkes Date: Sat, 10 Aug 2024 23:52:24 +0200 Subject: [PATCH 17/20] replace concurrentdictionary with generic instance creation replace instance lambda with static copied from https://github.com/JakeBayer/FuzzySharp/pull/42 --- FuzzySharp/PreProcess/StringPreprocessorFactory.cs | 2 +- FuzzySharp/SimilarityRatio/ScorerCache.cs | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs index 3560e0e..e0aaa59 100644 --- a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs +++ b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs @@ -27,7 +27,7 @@ public static Func GetPreprocessor(PreprocessMode mode) return mode switch { PreprocessMode.Full => Default, - PreprocessMode.None => s => s, + PreprocessMode.None => static s => s, _ => throw new InvalidOperationException($"Invalid string preprocessor mode: {mode}") }; } diff --git a/FuzzySharp/SimilarityRatio/ScorerCache.cs b/FuzzySharp/SimilarityRatio/ScorerCache.cs index 34b405e..15229bb 100644 --- a/FuzzySharp/SimilarityRatio/ScorerCache.cs +++ b/FuzzySharp/SimilarityRatio/ScorerCache.cs @@ -1,15 +1,17 @@ -using System; -using System.Collections.Concurrent; +using System.Runtime.CompilerServices; using FuzzySharp.SimilarityRatio.Scorer; namespace FuzzySharp.SimilarityRatio { public static class ScorerCache { - private static readonly ConcurrentDictionary s_scorerCache = new ConcurrentDictionary(); - public static IRatioScorer Get() where T : IRatioScorer, new() + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static IRatioScorer Get() where T : IRatioScorer, new() => GenericCache.Instance; + + private static class GenericCache + where T : IRatioScorer, new() { - return s_scorerCache.GetOrAdd(typeof(T), new T()); + public static readonly T Instance = new T(); } } -} +} \ No newline at end of file From 81bf39f2b9081fecbbbaf36d65e80c2b2975ebc0 Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Mon, 12 Aug 2024 09:35:47 +0200 Subject: [PATCH 18/20] remove duplicate, format --- FuzzySharp/FuzzySharp.csproj | 42 +++++++++++++++++------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/FuzzySharp/FuzzySharp.csproj b/FuzzySharp/FuzzySharp.csproj index b4fd89a..529ab21 100644 --- a/FuzzySharp/FuzzySharp.csproj +++ b/FuzzySharp/FuzzySharp.csproj @@ -1,27 +1,25 @@  - - netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net46;net461;net472;net48;NET60;NET80 - true - Jacob Bayer - Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek - Fuzzy String Matching Comparison FuzzyWuzzy FuzzySharp - false - https://github.com/JakeBayer/FuzzySharp - MIT - git - - 2.0.3 - Include source link - true - https://github.com/JakeBayer/FuzzySharp - 1.0.5.0 - 1.0.5.0 - - true - true - snupkg - 12.0 + + 1.0.5.0 + Jacob Bayer + + Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek + 1.0.5.0 + true + true + 12.0 + MIT + https://github.com/JakeBayer/FuzzySharp + Include source link + false + Fuzzy String Matching Comparison FuzzyWuzzy FuzzySharp + true + git + https://github.com/JakeBayer/FuzzySharp + snupkg + netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net46;net461;net472;net48;NET60;NET80 + 2.0.3 Date: Mon, 12 Aug 2024 12:38:37 +0200 Subject: [PATCH 19/20] remove unnecessary calls asspan in the cycle --- FuzzySharp/Levenshtein.cs | 6 +++--- .../Strategy/PartialRatioStrategy.cs | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/FuzzySharp/Levenshtein.cs b/FuzzySharp/Levenshtein.cs index 9315b6f..1660705 100644 --- a/FuzzySharp/Levenshtein.cs +++ b/FuzzySharp/Levenshtein.cs @@ -12,9 +12,9 @@ private static EditOp[] GetEditOps(T[] arr1, T[] arr2) where T : IEquatable s1, ReadOnlySpan s2) { - return GetEditOps(s1.Length, s1.AsSpan(), s2.Length, s2.AsSpan()); + return GetEditOps(s1.Length, s1, s2.Length, s2); } private static EditOp[] GetEditOps(int len1, ReadOnlySpan c1, int len2, ReadOnlySpan c2) where T : IEquatable @@ -227,7 +227,7 @@ public static MatchingBlock[] GetMatchingBlocks(T[] s1, T[] s2) where T : IEq } // Special Case - public static MatchingBlock[] GetMatchingBlocks(string s1, string s2) + public static MatchingBlock[] GetMatchingBlocks(ReadOnlySpan s1, ReadOnlySpan s2) { return GetMatchingBlocks(s1.Length, s2.Length, GetEditOps(s1, s2)); diff --git a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs index fd95800..20d9b26 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs @@ -7,23 +7,23 @@ internal static class PartialRatioStrategy { public static int Calculate(string input1, string input2) { - string shorter; - string longer; - if (input1.Length == 0 || input2.Length == 0) { return 0; } + ReadOnlySpan shorter; + ReadOnlySpan longer; + if (input1.Length < input2.Length) { - shorter = input1; - longer = input2; + shorter = input1.AsSpan(); + longer = input2.AsSpan(); } else { - shorter = input2; - longer = input1; + shorter = input2.AsSpan(); + longer = input1.AsSpan(); } MatchingBlock[] matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); @@ -39,9 +39,9 @@ public static int Calculate(string input1, string input2) if (longEnd > longer.Length) longEnd = longer.Length; - var longSubstr = longer.AsSpan()[longStart..longEnd]; + var longSubstr = longer[longStart..longEnd]; - double ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); + double ratio = Levenshtein.GetRatio(shorter, longSubstr); if (ratio > .995) { From 011155d078b67752ac2b3adb6a3ffc1a70565ccb Mon Sep 17 00:00:00 2001 From: "yevhen.cherkes" Date: Tue, 20 Aug 2024 08:48:30 +0200 Subject: [PATCH 20/20] Synchronize PR with [fork](https://github.com/Raffinert/FuzzySharp). - Implemented new Process.ExtractAll method, see [Issue!46](https://github.com/JakeBayer/FuzzySharp/issues/46). - Added fastenstein to benchmarks --- FuzzySharp.Benchmarks/BenchmarkAll.cs | 28 +++++++++++++++++++ .../FuzzySharp.Benchmarks.csproj | 1 + FuzzySharp/Extractor/ResultExtractor.cs | 11 ++++++-- FuzzySharp/Process.cs | 21 ++++++++++++++ 4 files changed, 58 insertions(+), 3 deletions(-) diff --git a/FuzzySharp.Benchmarks/BenchmarkAll.cs b/FuzzySharp.Benchmarks/BenchmarkAll.cs index 45320fe..2da09d3 100644 --- a/FuzzySharp.Benchmarks/BenchmarkAll.cs +++ b/FuzzySharp.Benchmarks/BenchmarkAll.cs @@ -1,4 +1,5 @@ using BenchmarkDotNet.Attributes; +using FuzzySharp.Extractor; using FuzzySharp.PreProcess; namespace FuzzySharp.Benchmarks; @@ -89,4 +90,31 @@ public int PartialTokenAbbreviationRatio() { return Fuzz.PartialTokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full); } + + private static readonly string[][] Events = + [ + ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"], + ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"], + ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"] + ]; + + private static readonly string[] Query = ["new york mets vs chicago cubs", "CitiField", "2017-03-19", "8pm"]; + + [Benchmark] + public ExtractedResult ExtractOne() + { + return Process.ExtractOne(Query, Events, static strings => strings[0]); + } + + [Benchmark] + public int LevenshteinDistance() + { + return Levenshtein.EditDistance("chicago cubs vs new york mets".AsSpan(), "new york mets vs chicago cubs".AsSpan()); + } + + [Benchmark] + public int FastenshteinDistance() + { + return Fastenshtein.Levenshtein.Distance("chicago cubs vs new york mets", "new york mets vs chicago cubs"); + } } \ No newline at end of file diff --git a/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj b/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj index 763a3ce..a6139a8 100644 --- a/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj +++ b/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj @@ -9,6 +9,7 @@ + diff --git a/FuzzySharp/Extractor/ResultExtractor.cs b/FuzzySharp/Extractor/ResultExtractor.cs index 66b7168..148e5ac 100644 --- a/FuzzySharp/Extractor/ResultExtractor.cs +++ b/FuzzySharp/Extractor/ResultExtractor.cs @@ -8,13 +8,12 @@ namespace FuzzySharp.Extractor { public static class ResultExtractor { - public static IEnumerable> ExtractWithoutOrder(T query, IEnumerable choices, Func processor, IRatioScorer scorer, int cutoff = 0) + public static IEnumerable> ExtractWithoutOrder(string query, IEnumerable choices, Func processor, IRatioScorer scorer, int cutoff = 0) { int index = 0; - var processedQuery = processor(query); foreach (var choice in choices) { - int score = scorer.Score(processedQuery, processor(choice)); + int score = scorer.Score(query, processor(choice)); if (score >= cutoff) { yield return new ExtractedResult(choice, score, index); @@ -23,6 +22,12 @@ public static IEnumerable> ExtractWithoutOrder(T query, IE } } + public static IEnumerable> ExtractWithoutOrder(T query, IEnumerable choices, Func processor, IRatioScorer scorer, int cutoff = 0) + { + var processedQuery = processor(query); + return ExtractWithoutOrder(processedQuery, choices, processor, scorer, cutoff); + } + public static ExtractedResult ExtractOne(T query, IEnumerable choices, Func processor, IRatioScorer calculator, int cutoff = 0) { return ExtractWithoutOrder(query, choices, processor, calculator, cutoff).Max(); diff --git a/FuzzySharp/Process.cs b/FuzzySharp/Process.cs index 87717fe..9e33700 100644 --- a/FuzzySharp/Process.cs +++ b/FuzzySharp/Process.cs @@ -56,6 +56,27 @@ public static IEnumerable> ExtractAll( scorer ??= s_defaultScorer; return ResultExtractor.ExtractWithoutOrder(query, choices, processor, scorer, cutoff); } + + /// + /// Creates a list of ExtractedResult which contain all the choices with + /// their corresponding score where higher is more similar + /// + /// + /// + /// + /// + /// + /// + public static IEnumerable> ExtractAll( + string query, + IEnumerable choices, + Func processor, + IRatioScorer scorer = null, + int cutoff = 0) + { + scorer ??= s_defaultScorer; + return ResultExtractor.ExtractWithoutOrder(query, choices, processor, scorer, cutoff); + } #endregion #region ExtractTop