diff --git a/FuzzySharp.Benchmarks/BenchmarkAll.cs b/FuzzySharp.Benchmarks/BenchmarkAll.cs new file mode 100644 index 0000000..e247b33 --- /dev/null +++ b/FuzzySharp.Benchmarks/BenchmarkAll.cs @@ -0,0 +1,211 @@ +using BenchmarkDotNet.Attributes; +using Raffinert.FuzzySharp.Extractor; +using Raffinert.FuzzySharp.PreProcess; +using Classic = FuzzySharp; + +namespace Raffinert.FuzzySharp.Benchmarks; + +[MemoryDiagnoser] +public class BenchmarkAll +{ + [Benchmark] + public int Ratio1() + { + return Fuzz.Ratio("mysmilarstring", "myawfullysimilarstirng"); + } + + [Benchmark] + public int Ratio2() + { + return Fuzz.Ratio("mysmilarstring", "mysimilarstring"); + } + + [Benchmark] + public int PartialRatio() + { + return Fuzz.PartialRatio("similar", "somewhresimlrbetweenthisstring"); + } + + [Benchmark] + public int TokenSortRatio() + { + return Fuzz.TokenSortRatio("order words out of", " words out of order"); + } + + [Benchmark] + public int PartialTokenSortRatio() + { + return Fuzz.PartialTokenSortRatio("order words out of", " words out of order"); + } + + [Benchmark] + public int TokenSetRatio() + { + return Fuzz.TokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"); + } + + [Benchmark] + public int PartialTokenSetRatio() + { + return Fuzz.PartialTokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"); + } + + [Benchmark] + public int WeightedRatio() + { + return Fuzz.WeightedRatio("The quick brown fox jimps ofver the small lazy dog", "the quick brown fox jumps over the small lazy dog"); + } + + [Benchmark] + public int TokenInitialismRatio1() + { + return Fuzz.TokenInitialismRatio("NASA", "National Aeronautics and Space Administration"); + } + + [Benchmark] + public int TokenInitialismRatio2() + { + return Fuzz.TokenInitialismRatio("NASA", "National Aeronautics Space Administration"); + } + + [Benchmark] + public int TokenInitialismRatio3() + { + return Fuzz.TokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape 
Canaveral, Florida 32899"); + } + + [Benchmark] + public int PartialTokenInitialismRatio() + { + return Fuzz.PartialTokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape Canaveral, Florida 32899"); + } + + [Benchmark] + public int TokenAbbreviationRatio() + { + return Fuzz.TokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full); + } + + [Benchmark] + public int PartialTokenAbbreviationRatio() + { + return Fuzz.PartialTokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full); + } + + [Benchmark] + public int Ratio1Classic() + { + return Classic.Fuzz.Ratio("mysmilarstring", "myawfullysimilarstirng"); + } + + [Benchmark] + public int Ratio2Classic() + { + return Classic.Fuzz.Ratio("mysmilarstring", "mysimilarstring"); + } + + [Benchmark] + public int PartialRatioClassic() + { + return Classic.Fuzz.PartialRatio("similar", "somewhresimlrbetweenthisstring"); + } + + [Benchmark] + public int TokenSortRatioClassic() + { + return Classic.Fuzz.TokenSortRatio("order words out of", " words out of order"); + } + + [Benchmark] + public int PartialTokenSortRatioClassic() + { + return Classic.Fuzz.PartialTokenSortRatio("order words out of", " words out of order"); + } + + [Benchmark] + public int TokenSetRatioClassic() + { + return Classic.Fuzz.TokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"); + } + + [Benchmark] + public int PartialTokenSetRatioClassic() + { + return Classic.Fuzz.PartialTokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear"); + } + + [Benchmark] + public int WeightedRatioClassic() + { + return Classic.Fuzz.WeightedRatio("The quick brown fox jimps ofver the small lazy dog", "the quick brown fox jumps over the small lazy dog"); + } + + [Benchmark] + public int TokenInitialismRatio1Classic() + { + return Classic.Fuzz.TokenInitialismRatio("NASA", "National Aeronautics and Space Administration"); + } + + [Benchmark] + public int TokenInitialismRatio2Classic() + { + 
return Classic.Fuzz.TokenInitialismRatio("NASA", "National Aeronautics Space Administration"); + } + + [Benchmark] + public int TokenInitialismRatio3Classic() + { + return Classic.Fuzz.TokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape Canaveral, Florida 32899"); + } + + [Benchmark] + public int PartialTokenInitialismRatioClassic() + { + return Classic.Fuzz.PartialTokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape Canaveral, Florida 32899"); + } + + [Benchmark] + public int TokenAbbreviationRatioClassic() + { + return Classic.Fuzz.TokenAbbreviationRatio("bl 420", "Baseline section 420", Classic.PreProcess.PreprocessMode.Full); + } + + [Benchmark] + public int PartialTokenAbbreviationRatioClassic() + { + return Classic.Fuzz.PartialTokenAbbreviationRatio("bl 420", "Baseline section 420", Classic.PreProcess.PreprocessMode.Full); + } + + private static readonly string[][] Events = + [ + ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"], + ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"], + ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"] + ]; + + private static readonly string[] Query = ["new york mets vs chicago cubs", "CitiField", "2017-03-19", "8pm"]; + + [Benchmark] + public ExtractedResult ExtractOne() + { + return Process.ExtractOne(Query, Events, static strings => strings[0]); + } + + [Benchmark] + public Classic.Extractor.ExtractedResult ExtractOneClassic() + { + return Classic.Process.ExtractOne(Query, Events, static strings => strings[0]); + } + + [Benchmark] + public int LevenshteinDistance() + { + return Levenshtein.EditDistance("chicago cubs vs new york mets".AsSpan(), "new york mets vs chicago cubs".AsSpan()); + } + + [Benchmark] + public int FastenshteinDistance() + { + return Fastenshtein.Levenshtein.Distance("chicago cubs vs new york mets", "new york mets vs chicago cubs"); + } +} \ No 
newline at end of file diff --git a/FuzzySharp.Benchmarks/BenchmarkDotNet.Artifacts/results/Raffinert.FuzzySharp.Benchmarks.BenchmarkAll-report-github.md b/FuzzySharp.Benchmarks/BenchmarkDotNet.Artifacts/results/Raffinert.FuzzySharp.Benchmarks.BenchmarkAll-report-github.md new file mode 100644 index 0000000..eab6cc3 --- /dev/null +++ b/FuzzySharp.Benchmarks/BenchmarkDotNet.Artifacts/results/Raffinert.FuzzySharp.Benchmarks.BenchmarkAll-report-github.md @@ -0,0 +1,44 @@ +``` + +BenchmarkDotNet v0.14.0, Windows 11 (10.0.22631.4037/23H2/2023Update/SunValley3) +12th Gen Intel Core i7-1255U, 1 CPU, 12 logical and 10 physical cores +.NET SDK 8.0.400 + [Host] : .NET 8.0.8 (8.0.824.36612), X64 RyuJIT AVX2 + DefaultJob : .NET 8.0.8 (8.0.824.36612), X64 RyuJIT AVX2 + + +``` +| Method | Mean | Error | StdDev | Median | Gen0 | Gen1 | Allocated | +|------------------------------------- |-------------:|-------------:|-------------:|-------------:|-------:|-------:|----------:| +| Ratio1 | 206.81 ns | 2.409 ns | 2.136 ns | 207.29 ns | 0.0165 | - | 104 B | +| Ratio2 | 13.76 ns | 0.319 ns | 0.899 ns | 13.29 ns | - | - | - | +| PartialRatio | 723.24 ns | 22.148 ns | 59.498 ns | 692.31 ns | 0.3786 | 0.0010 | 2376 B | +| TokenSortRatio | 801.65 ns | 54.000 ns | 159.219 ns | 882.82 ns | 0.0896 | - | 568 B | +| PartialTokenSortRatio | 899.87 ns | 30.597 ns | 89.254 ns | 921.35 ns | 0.1154 | - | 728 B | +| TokenSetRatio | 1,093.68 ns | 28.071 ns | 80.993 ns | 1,096.79 ns | 0.3500 | - | 2200 B | +| PartialTokenSetRatio | 1,380.95 ns | 52.967 ns | 154.507 ns | 1,392.58 ns | 0.5112 | - | 3208 B | +| WeightedRatio | 12,561.44 ns | 767.193 ns | 2,225.766 ns | 13,232.62 ns | 0.7935 | - | 5072 B | +| TokenInitialismRatio1 | 294.56 ns | 6.757 ns | 18.946 ns | 297.41 ns | 0.0625 | - | 392 B | +| TokenInitialismRatio2 | 275.14 ns | 5.562 ns | 15.503 ns | 272.03 ns | 0.0548 | - | 344 B | +| TokenInitialismRatio3 | 542.62 ns | 10.893 ns | 29.635 ns | 541.23 ns | 0.1106 | - | 696 B | +| 
PartialTokenInitialismRatio | 749.64 ns | 15.039 ns | 32.373 ns | 744.13 ns | 0.1845 | - | 1160 B | +| TokenAbbreviationRatio | 1,270.08 ns | 24.756 ns | 41.361 ns | 1,255.59 ns | 0.2508 | - | 1576 B | +| PartialTokenAbbreviationRatio | 1,536.55 ns | 45.771 ns | 129.097 ns | 1,561.22 ns | 0.3357 | - | 2112 B | +| Ratio1Classic | 677.17 ns | 13.437 ns | 29.212 ns | 681.43 ns | 0.0505 | - | 320 B | +| Ratio2Classic | 104.42 ns | 2.102 ns | 3.626 ns | 105.17 ns | 0.0318 | - | 200 B | +| PartialRatioClassic | 2,249.40 ns | 44.588 ns | 118.242 ns | 2,274.26 ns | 0.5360 | 0.0019 | 3368 B | +| TokenSortRatioClassic | 3,071.78 ns | 92.892 ns | 266.524 ns | 3,143.59 ns | 0.3510 | - | 2216 B | +| PartialTokenSortRatioClassic | 3,317.62 ns | 64.881 ns | 82.054 ns | 3,327.15 ns | 0.4005 | - | 2536 B | +| TokenSetRatioClassic | 4,309.09 ns | 85.081 ns | 184.959 ns | 4,337.85 ns | 0.6905 | - | 4352 B | +| PartialTokenSetRatioClassic | 4,771.35 ns | 92.361 ns | 230.012 ns | 4,849.64 ns | 0.9308 | - | 5840 B | +| WeightedRatioClassic | 24,181.32 ns | 721.231 ns | 2,046.011 ns | 24,472.06 ns | 2.1362 | - | 13482 B | +| TokenInitialismRatio1Classic | 1,041.92 ns | 20.745 ns | 39.470 ns | 1,044.25 ns | 0.1440 | - | 904 B | +| TokenInitialismRatio2Classic | 824.97 ns | 26.765 ns | 75.051 ns | 844.97 ns | 0.1173 | - | 736 B | +| TokenInitialismRatio3Classic | 1,971.98 ns | 39.316 ns | 91.901 ns | 1,989.39 ns | 0.2460 | - | 1552 B | +| PartialTokenInitialismRatioClassic | 2,249.70 ns | 44.057 ns | 65.943 ns | 2,259.86 ns | 0.3414 | - | 2144 B | +| TokenAbbreviationRatioClassic | 2,727.98 ns | 84.791 ns | 241.914 ns | 2,779.33 ns | 0.4730 | - | 2984 B | +| PartialTokenAbbreviationRatioClassic | 3,162.92 ns | 88.249 ns | 247.460 ns | 3,193.32 ns | 0.6180 | - | 3896 B | +| ExtractOne | 33,770.23 ns | 1,260.134 ns | 3,595.234 ns | 34,371.46 ns | 1.8616 | - | 11728 B | +| ExtractOneClassic | 54,594.63 ns | 1,971.629 ns | 5,625.169 ns | 55,347.68 ns | 4.5776 | - | 29011 B | +| 
LevenshteinDistance | 2,096.37 ns | 58.508 ns | 167.872 ns | 2,141.95 ns | 0.0229 | - | 144 B | +| FastenshteinDistance | 1,533.82 ns | 38.323 ns | 108.715 ns | 1,564.52 ns | 0.0229 | - | 144 B | diff --git a/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj b/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj new file mode 100644 index 0000000..44f0796 --- /dev/null +++ b/FuzzySharp.Benchmarks/FuzzySharp.Benchmarks.csproj @@ -0,0 +1,22 @@ + + + + Exe + net8.0 + enable + enable + $(MSBuildProjectName) + Raffinert.$(MSBuildProjectName.Replace(" ", "_")) + + + + + + + + + + + + + diff --git a/FuzzySharp.Benchmarks/Program.cs b/FuzzySharp.Benchmarks/Program.cs new file mode 100644 index 0000000..7c50760 --- /dev/null +++ b/FuzzySharp.Benchmarks/Program.cs @@ -0,0 +1,38 @@ +using BenchmarkDotNet.Running; +using Raffinert.FuzzySharp.Benchmarks; +//using Raffinert.FuzzySharp; +//using Raffinert.FuzzySharp.SimilarityRatio; +//using Raffinert.FuzzySharp.SimilarityRatio.Scorer.Composite; +//using Classic = FuzzySharp; + +BenchmarkRunner.Run(typeof(Program).Assembly); + +//var input1 = "+30.0% Damage to Close Enemies [30.01%"; +//var input2Collection = new[] +//{ +// "+#% Damage", +// "+#% Damage to Crowd Controlled Enemies", +// "+#% Damage to Close Enemies", +// "+#% Damage to Chilled Enemies", +// "+#% Damage to Poisoned Enemies", +// "#% Block Chance#% Blocked Damage Reduction", +// "#% Damage Reduction from Bleeding Enemies", +// "#% Damage Reduction", +// "+#% Cold Damage" +//}; + +//var classicScorer = Classic.SimilarityRatio.ScorerCache.Get(); + +//Func classicScorerFunc = input2 => classicScorer.Score(input1, input2); + +//var classicResult = input2Collection.Select(classicScorerFunc).ToList(); + +//var scorer = ScorerCache.Get(); + +//Func scorerFunc = input2 => scorer.Score(input1, input2); + +//var result = input2Collection.Select(scorerFunc).ToList(); + +//Console.WriteLine(); + +//Console.WriteLine(Fuzz.WeightedRatio("The quick brown fox jimps ofver the small 
lazy dog", "the quick brown fox jumps over the small lazy dog")); \ No newline at end of file diff --git a/FuzzySharp.Test/EvaluationTests/EvaluationTests.cs b/FuzzySharp.Test/EvaluationTests/EvaluationTests.cs index eb22945..726cb7c 100644 --- a/FuzzySharp.Test/EvaluationTests/EvaluationTests.cs +++ b/FuzzySharp.Test/EvaluationTests/EvaluationTests.cs @@ -1,10 +1,10 @@ -using FuzzySharp.PreProcess; -using FuzzySharp.SimilarityRatio; -using FuzzySharp.SimilarityRatio.Scorer.Composite; -using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; -using NUnit.Framework; +using NUnit.Framework; +using Raffinert.FuzzySharp.PreProcess; +using Raffinert.FuzzySharp.SimilarityRatio; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.Composite; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; -namespace FuzzySharp.Test.EvaluationTests +namespace Raffinert.FuzzySharp.Test.EvaluationTests { [TestFixture] public class EvaluationTests @@ -36,20 +36,20 @@ public void Evaluate() - var h1 = Process.ExtractOne("cowboys", new[] { "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys" }); - var h2 = string.Join(", ", Process.ExtractTop("goolge", new[] { "google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" }, limit: 3)); - var h3 = string.Join(", ", Process.ExtractAll("goolge", new [] {"google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" })); - var h4 = string.Join(", ", Process.ExtractAll("goolge", new[] { "google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" }, cutoff: 40)); - var h5 = string.Join(", ", Process.ExtractSorted("goolge", new [] {"google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" })); + var h1 = Process.ExtractOne("cowboys", ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]); + var h2 = string.Join(", ", Process.ExtractTop("goolge", ["google", "bing", "facebook", 
"linkedin", "twitter", "googleplus", "bingnews", "plexoogl"], limit: 3)); + var h3 = string.Join(", ", Process.ExtractAll("goolge", ["google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl"])); + var h4 = string.Join(", ", Process.ExtractAll("goolge", ["google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl"], cutoff: 40)); + var h5 = string.Join(", ", Process.ExtractSorted("goolge", ["google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl"])); - var i1 = Process.ExtractOne("cowboys", new[] { "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys" }, s => s, ScorerCache.Get()); + var i1 = Process.ExtractOne("cowboys", ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"], s => s, ScorerCache.Get()); - var events = new[] - { - new[] { "chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm" }, - new[] { "new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm" }, - new[] { "atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm" }, - }; + string[][] events = + [ + ["chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm"], + ["new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm"], + ["atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm"] + ]; var query = new[] { "new york mets vs chicago cubs", "CitiField", "2017-03-19", "8pm" }; var best = Process.ExtractOne(query, events, strings => strings[0]); diff --git a/FuzzySharp.Test/FuzzySharp.Test.csproj b/FuzzySharp.Test/FuzzySharp.Test.csproj index 6ea573a..6f8fa0a 100644 --- a/FuzzySharp.Test/FuzzySharp.Test.csproj +++ b/FuzzySharp.Test/FuzzySharp.Test.csproj @@ -1,19 +1,21 @@ - netcoreapp3.1 - + NET8.0;netcoreapp3.1;netframework4.7.2 false + 12.0 + Raffinert.$(MSBuildProjectName) + Raffinert.$(MSBuildProjectName.Replace(" ", "_")) - + all runtime; build; native; contentfiles; analyzers; buildtransitive 
- + diff --git a/FuzzySharp.Test/FuzzyTests/ProcessTests.cs b/FuzzySharp.Test/FuzzyTests/ProcessTests.cs index d5cd503..0821252 100644 --- a/FuzzySharp.Test/FuzzyTests/ProcessTests.cs +++ b/FuzzySharp.Test/FuzzyTests/ProcessTests.cs @@ -1,10 +1,10 @@ using System.Collections.Generic; using System.Linq; -using FuzzySharp.SimilarityRatio; -using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; using NUnit.Framework; +using Raffinert.FuzzySharp.SimilarityRatio; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; -namespace FuzzySharp.Test.FuzzyTests +namespace Raffinert.FuzzySharp.Test.FuzzyTests { [TestFixture] public class ProcessTests diff --git a/FuzzySharp.Test/FuzzyTests/RatioTests.cs b/FuzzySharp.Test/FuzzyTests/RatioTests.cs index ec499b0..63375db 100644 --- a/FuzzySharp.Test/FuzzyTests/RatioTests.cs +++ b/FuzzySharp.Test/FuzzyTests/RatioTests.cs @@ -1,7 +1,7 @@ -using FuzzySharp.PreProcess; using NUnit.Framework; +using Raffinert.FuzzySharp.PreProcess; -namespace FuzzySharp.Test.FuzzyTests +namespace Raffinert.FuzzySharp.Test.FuzzyTests { [TestFixture] public class RatioTests diff --git a/FuzzySharp.Test/FuzzyTests/RegressionTests.cs b/FuzzySharp.Test/FuzzyTests/RegressionTests.cs index 134d2ee..01979d1 100644 --- a/FuzzySharp.Test/FuzzyTests/RegressionTests.cs +++ b/FuzzySharp.Test/FuzzyTests/RegressionTests.cs @@ -1,54 +1,47 @@ - -using FuzzySharp.SimilarityRatio; -using FuzzySharp.SimilarityRatio.Scorer; -using NUnit.Framework; -using System; +using System; using System.Linq; using System.Reflection; +using NUnit.Framework; +using Raffinert.FuzzySharp.SimilarityRatio; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer; -namespace FuzzySharp.Test.FuzzyTests +namespace Raffinert.FuzzySharp.Test.FuzzyTests { [TestFixture] public class RegressionTests { - /// /// Test to ensure that all IRatioScorer implementations handle scoring empty strings & whitespace strings /// [Test] public void TestScoringEmptyString() { - var scorerType = 
typeof(IRatioScorer); var assemblies = AppDomain.CurrentDomain.GetAssemblies().ToList(); var types = assemblies.SelectMany(s => { - Type[] types = new Type[] { }; ; try { - types = s.GetTypes(); + return s.GetTypes(); } catch {} - return types; + return []; }).ToList(); var scorerTypes = types.Where(t => scorerType.IsAssignableFrom(t) && !t.IsAbstract && t.IsClass).ToList(); - //var scorerTypes = AppDomain.CurrentDomain.GetAssemblies().SelectMany(s => s.GetTypes()).Where(p => scorerType.IsAssignableFrom(p) && p.IsClass && !p.IsAbstract); - - - MethodInfo getScorerCacheMethodInfo = typeof(ScorerCache).GetMethod("Get"); - - string nullString = null; //Null doesnt seem to be handled by any scorer + + string nullString = null; //Null doesn't seem to be handled by any scorer string emptyString = ""; string whitespaceString = " "; - string[] nullOrWhitespaceStrings = { emptyString, whitespaceString }; + string[] nullOrWhitespaceStrings = [emptyString, whitespaceString]; + MethodInfo getScorerCacheMethodInfo = typeof(ScorerCache).GetMethod("Get"); - foreach (Type t in scorerTypes) + foreach (var t in scorerTypes) { System.Diagnostics.Debug.WriteLine($"Testing {t.Name}"); MethodInfo m = getScorerCacheMethodInfo.MakeGenericMethod(t); - IRatioScorer scorer = m.Invoke(this, new object[] { }) as IRatioScorer; + IRatioScorer scorer = m.Invoke(this, []) as IRatioScorer; foreach(string s in nullOrWhitespaceStrings) { @@ -79,7 +72,6 @@ public void TestScoringEmptyString() } - } } diff --git a/FuzzySharp.Test/FuzzyTests/ScorerTests/TokenSetScorerBaseTest.cs b/FuzzySharp.Test/FuzzyTests/ScorerTests/TokenSetScorerBaseTest.cs index 4cf7d6f..12f822f 100644 --- a/FuzzySharp.Test/FuzzyTests/ScorerTests/TokenSetScorerBaseTest.cs +++ b/FuzzySharp.Test/FuzzyTests/ScorerTests/TokenSetScorerBaseTest.cs @@ -1,12 +1,9 @@ -using FuzzySharp.SimilarityRatio; -using FuzzySharp.SimilarityRatio.Scorer; -using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; -using NUnit.Framework; -using System; 
-using System.Collections.Generic; -using System.Text; +using NUnit.Framework; +using Raffinert.FuzzySharp.SimilarityRatio; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; -namespace FuzzySharp.Test.FuzzyTests.ScorerTests +namespace Raffinert.FuzzySharp.Test.FuzzyTests.ScorerTests { [TestFixture] public class TokenSetScorerBaseTest diff --git a/FuzzySharp.sln b/FuzzySharp.sln index 78eed94..ac51363 100644 --- a/FuzzySharp.sln +++ b/FuzzySharp.sln @@ -1,11 +1,13 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.29806.167 +# Visual Studio Version 17 +VisualStudioVersion = 17.10.35122.118 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FuzzySharp", "FuzzySharp\FuzzySharp.csproj", "{348B90DA-DA44-45AD-B857-D3A69D05AE46}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FuzzySharp.Test", "FuzzySharp.Test\FuzzySharp.Test.csproj", "{48F4C7CB-E669-410C-A455-DE3330347807}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FuzzySharp.Test", "FuzzySharp.Test\FuzzySharp.Test.csproj", "{48F4C7CB-E669-410C-A455-DE3330347807}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FuzzySharp.Benchmarks", "FuzzySharp.Benchmarks\FuzzySharp.Benchmarks.csproj", "{480CAE39-ACA7-411A-BF6B-72E61ED6E129}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -21,6 +23,10 @@ Global {48F4C7CB-E669-410C-A455-DE3330347807}.Debug|Any CPU.Build.0 = Debug|Any CPU {48F4C7CB-E669-410C-A455-DE3330347807}.Release|Any CPU.ActiveCfg = Release|Any CPU {48F4C7CB-E669-410C-A455-DE3330347807}.Release|Any CPU.Build.0 = Release|Any CPU + {480CAE39-ACA7-411A-BF6B-72E61ED6E129}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {480CAE39-ACA7-411A-BF6B-72E61ED6E129}.Debug|Any CPU.Build.0 = Debug|Any CPU + {480CAE39-ACA7-411A-BF6B-72E61ED6E129}.Release|Any CPU.ActiveCfg = 
Release|Any CPU + {480CAE39-ACA7-411A-BF6B-72E61ED6E129}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/FuzzySharp/Edits/EditOp.cs b/FuzzySharp/Edits/EditOp.cs index 674bc13..eeaa1b2 100644 --- a/FuzzySharp/Edits/EditOp.cs +++ b/FuzzySharp/Edits/EditOp.cs @@ -1,4 +1,4 @@ -namespace FuzzySharp.Edits +namespace Raffinert.FuzzySharp.Edits { public enum EditType { diff --git a/FuzzySharp/Edits/MatchingBlock.cs b/FuzzySharp/Edits/MatchingBlock.cs index 585b6ad..66f9629 100644 --- a/FuzzySharp/Edits/MatchingBlock.cs +++ b/FuzzySharp/Edits/MatchingBlock.cs @@ -1,4 +1,4 @@ -namespace FuzzySharp.Edits +namespace Raffinert.FuzzySharp.Edits { public class MatchingBlock { @@ -6,9 +6,6 @@ public class MatchingBlock public int DestPos { get; set; } public int Length { get; set; } - public override string ToString() - { - return $"({SourcePos},{DestPos},{Length})"; - } + public override string ToString() => $"({SourcePos},{DestPos},{Length})"; } } diff --git a/FuzzySharp/Edits/OpCode.cs b/FuzzySharp/Edits/OpCode.cs index f6fa1e3..f43a09e 100644 --- a/FuzzySharp/Edits/OpCode.cs +++ b/FuzzySharp/Edits/OpCode.cs @@ -1,4 +1,4 @@ -namespace FuzzySharp.Edits +namespace Raffinert.FuzzySharp.Edits { public class OpCode { diff --git a/FuzzySharp/Extensions/EnumerableExtensions.cs b/FuzzySharp/Extensions/EnumerableExtensions.cs index ad8af69..ecead6d 100644 --- a/FuzzySharp/Extensions/EnumerableExtensions.cs +++ b/FuzzySharp/Extensions/EnumerableExtensions.cs @@ -1,8 +1,8 @@ using System; using System.Collections.Generic; -using FuzzySharp.Utils; +using Raffinert.FuzzySharp.Utils; -namespace FuzzySharp.Extensions +namespace Raffinert.FuzzySharp.Extensions { public static class EnumerableExtensions { diff --git a/FuzzySharp/Extensions/StringExtensions.cs b/FuzzySharp/Extensions/StringExtensions.cs new file mode 100644 index 0000000..aa6afdf --- /dev/null +++ 
b/FuzzySharp/Extensions/StringExtensions.cs @@ -0,0 +1,67 @@ +using System; +using System.Collections.Generic; + +namespace Raffinert.FuzzySharp.Extensions +{ + internal static class StringExtensions + { + public static List ExtractTokens(this string input) + { + var result = new List(); + + if (string.IsNullOrEmpty(input)) + return result; + + var span = input.AsSpan(); + + var start = 0; + for (var i = 0; i < span.Length; i++) + { + if (char.IsLetter(span[i])) continue; + + if (i - start > 0) + { + result.Add(span[start..i].ToString()); + } + + start = i+1; + } + + if (span.Length - start > 0) + result.Add(span[start..].ToString()); + + return result; + } + + public static string[] SplitByAnySpace(this string input) + { + if (string.IsNullOrWhiteSpace(input)) + return []; + + var words = input.Split(EmptyArray(), StringSplitOptions.RemoveEmptyEntries); + + return words; + } + + public static string[] GetSortedWords(this string input) + { + var words = SplitByAnySpace(input); + + Array.Sort(words); + + return words; + } + + public static string NormalizeSpacesAndSort(this string input) + { + var words = GetSortedWords(input); + + return string.Join(" ", words); + } + + private static T[] EmptyArray() + { + return []; + } + } +} diff --git a/FuzzySharp/Extractor/ExtractedResult.cs b/FuzzySharp/Extractor/ExtractedResult.cs index 43f41e2..920f727 100644 --- a/FuzzySharp/Extractor/ExtractedResult.cs +++ b/FuzzySharp/Extractor/ExtractedResult.cs @@ -1,7 +1,7 @@ using System; using System.Collections.Generic; -namespace FuzzySharp.Extractor +namespace Raffinert.FuzzySharp.Extractor { public class ExtractedResult : IComparable> { @@ -34,7 +34,7 @@ public override string ToString() { return $"(string: {Value}, score: {Score}, index: {Index})"; } - return $"(value: {Value.ToString()}, score: {Score}, index: {Index})"; + return $"(value: {Value}, score: {Score}, index: {Index})"; } } } diff --git a/FuzzySharp/Extractor/ResultExtractor.cs 
b/FuzzySharp/Extractor/ResultExtractor.cs index 66b7168..b173944 100644 --- a/FuzzySharp/Extractor/ResultExtractor.cs +++ b/FuzzySharp/Extractor/ResultExtractor.cs @@ -1,20 +1,19 @@ using System; using System.Collections.Generic; using System.Linq; -using FuzzySharp.Extensions; -using FuzzySharp.SimilarityRatio.Scorer; +using Raffinert.FuzzySharp.Extensions; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer; -namespace FuzzySharp.Extractor +namespace Raffinert.FuzzySharp.Extractor { public static class ResultExtractor { - public static IEnumerable> ExtractWithoutOrder(T query, IEnumerable choices, Func processor, IRatioScorer scorer, int cutoff = 0) + public static IEnumerable> ExtractWithoutOrder(string query, IEnumerable choices, Func processor, IRatioScorer scorer, int cutoff = 0) { int index = 0; - var processedQuery = processor(query); foreach (var choice in choices) { - int score = scorer.Score(processedQuery, processor(choice)); + int score = scorer.Score(query, processor(choice)); if (score >= cutoff) { yield return new ExtractedResult(choice, score, index); @@ -23,6 +22,12 @@ public static IEnumerable> ExtractWithoutOrder(T query, IE } } + public static IEnumerable> ExtractWithoutOrder(T query, IEnumerable choices, Func processor, IRatioScorer scorer, int cutoff = 0) + { + var processedQuery = processor(query); + return ExtractWithoutOrder(processedQuery, choices, processor, scorer, cutoff); + } + public static ExtractedResult ExtractOne(T query, IEnumerable choices, Func processor, IRatioScorer calculator, int cutoff = 0) { return ExtractWithoutOrder(query, choices, processor, calculator, cutoff).Max(); diff --git a/FuzzySharp/Fuzz.cs b/FuzzySharp/Fuzz.cs index 77a4176..3e66f91 100644 --- a/FuzzySharp/Fuzz.cs +++ b/FuzzySharp/Fuzz.cs @@ -1,9 +1,9 @@ -using FuzzySharp.PreProcess; -using FuzzySharp.SimilarityRatio; -using FuzzySharp.SimilarityRatio.Scorer.Composite; -using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; +using 
Raffinert.FuzzySharp.PreProcess; +using Raffinert.FuzzySharp.SimilarityRatio; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.Composite; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; -namespace FuzzySharp +namespace Raffinert.FuzzySharp { public static class Fuzz { diff --git a/FuzzySharp/FuzzySharp.csproj b/FuzzySharp/FuzzySharp.csproj index d2cee32..3217e89 100644 --- a/FuzzySharp/FuzzySharp.csproj +++ b/FuzzySharp/FuzzySharp.csproj @@ -1,38 +1,40 @@  - - netcoreapp2.0;netcoreapp2.1;netstandard1.6;netstandard2.0;netstandard2.1;net45;net46;net461 - true - Jacob Bayer - Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek - Fuzzy String Matching Comparison FuzzyWuzzy FuzzySharp - false - https://github.com/JakeBayer/FuzzySharp - MIT - git - - 2.0.2 - Include source link - true - https://github.com/JakeBayer/FuzzySharp - 1.0.4.0 - 1.0.4.0 - - true + + 2.0.3.0 + Jacob Bayer;Yevhen Cherkes + + Fuzzy string matcher based on FuzzyWuzzy algorithm from SeatGeek + 2.0.3.0 + true true + 12.0 + MIT + https://github.com/Raffinert/FuzzySharp + Performance, allocations + false + Fuzzy String Matching Comparison FuzzyWuzzy FuzzySharp + true + git + https://github.com/Raffinert/FuzzySharp snupkg + netstandard2.0;netstandard2.1;netcoreapp3.1;net45;net46;net461;net472;net48;NET60;NET80 + 2.0.3 + Raffinert.$(MSBuildProjectName) + Raffinert.$(MSBuildProjectName.Replace(" ", "_")) + README.md + + + + - + - - - - System - - - + + + diff --git a/FuzzySharp/Levenshtein.cs b/FuzzySharp/Levenshtein.cs index 5b620af..36867c7 100644 --- a/FuzzySharp/Levenshtein.cs +++ b/FuzzySharp/Levenshtein.cs @@ -1,30 +1,26 @@ using System; -using System.Collections.Generic; using System.Diagnostics; -using System.Linq; -using FuzzySharp.Edits; +using Raffinert.FuzzySharp.Edits; -namespace FuzzySharp +namespace Raffinert.FuzzySharp { public static class Levenshtein { private static EditOp[] GetEditOps(T[] arr1, T[] arr2) where T : IEquatable { - return 
GetEditOps(arr1.Length, arr1, arr2.Length, arr2); + return GetEditOps(arr1.Length, (ReadOnlySpan)arr1, arr2.Length, (ReadOnlySpan)arr2); } // Special Case - private static EditOp[] GetEditOps(string s1, string s2) + private static EditOp[] GetEditOps(ReadOnlySpan s1, ReadOnlySpan s2) { - return GetEditOps(s1.Length, s1.ToCharArray(), s2.Length, s2.ToCharArray()); + return GetEditOps(s1.Length, s1, s2.Length, s2); } - private static EditOp[] GetEditOps(int len1, T[] c1, int len2, T[] c2) where T : IEquatable + private static EditOp[] GetEditOps(int len1, ReadOnlySpan c1, int len2, ReadOnlySpan c2) where T : IEquatable { int i; - int[] matrix; - int p1 = 0; int p2 = 0; @@ -53,7 +49,7 @@ private static EditOp[] GetEditOps(int len1, T[] c1, int len2, T[] c2) where len1++; len2++; - matrix = new int[len2 * len1]; + int[] matrix = new int[len2 * len1]; for (i = 0; i < len2; i++) matrix[i] = i; @@ -98,13 +94,12 @@ private static EditOp[] GetEditOps(int len1, T[] c1, int len2, T[] c2) where } - return EditOpsFromCostMatrix(len1, c1, p1, len1o, len2, c2, p2, len2o, matrix); } - private static EditOp[] EditOpsFromCostMatrix(int len1, T[] c1, int p1, int o1, - int len2, T[] c2, int p2, int o2, + private static EditOp[] EditOpsFromCostMatrix(int len1, ReadOnlySpan c1, int p1, int o1, + int len2, ReadOnlySpan c2, int p2, int o2, int[] matrix) where T: IEquatable { @@ -232,7 +227,7 @@ public static MatchingBlock[] GetMatchingBlocks(T[] s1, T[] s2) where T : IEq } // Special Case - public static MatchingBlock[] GetMatchingBlocks(string s1, string s2) + public static MatchingBlock[] GetMatchingBlocks(ReadOnlySpan s1, ReadOnlySpan s2) { return GetMatchingBlocks(s1.Length, s2.Length, GetEditOps(s1, s2)); @@ -250,7 +245,7 @@ public static MatchingBlock[] GetMatchingBlocks(int len1, int len2, OpCode[] ops noOfMB = 0; - for (i = n; i-- != 0; o++) + for (i = n; i != 0; i--, o++) { if (ops[o].EditType == EditType.KEEP) { @@ -300,7 +295,7 @@ public static MatchingBlock[] 
GetMatchingBlocks(int len1, int len2, OpCode[] ops Debug.Assert(mb != noOfMB); - MatchingBlock finalBlock = new MatchingBlock + var finalBlock = new MatchingBlock { SourcePos = len1, DestPos = len2, @@ -328,7 +323,9 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op EditType type; - for (i = n; i != 0;) + i = n; + + while (i > 0) { @@ -383,9 +380,6 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op } while (i != 0 && ops[o].EditType == type && SourcePos == ops[o].SourcePos && DestPos == ops[o].DestPos); break; - - default: - break; } } @@ -400,8 +394,9 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op SourcePos = DestPos = 0; int mbIndex = 0; + i = n; - for (i = n; i != 0;) + while (i > 0) { while (ops[o].EditType == EditType.KEEP && --i != 0) @@ -412,11 +407,13 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op if (SourcePos < ops[o].SourcePos || DestPos < ops[o].DestPos) { - MatchingBlock mb = new MatchingBlock(); + var mb = new MatchingBlock + { + SourcePos = SourcePos, + DestPos = DestPos, + Length = ops[o].SourcePos - SourcePos + }; - mb.SourcePos = SourcePos; - mb.DestPos = DestPos; - mb.Length = ops[o].SourcePos - SourcePos; SourcePos = ops[o].SourcePos; DestPos = ops[o].DestPos; @@ -458,9 +455,6 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op } while (i != 0 && ops[o].EditType == type && SourcePos == ops[o].SourcePos && DestPos == ops[o].DestPos); break; - - default: - break; } } @@ -468,20 +462,24 @@ private static MatchingBlock[] GetMatchingBlocks(int len1, int len2, EditOp[] op { Debug.Assert(len1 -SourcePos == len2 - DestPos); - MatchingBlock mb = new MatchingBlock(); - mb.SourcePos = SourcePos; - mb.DestPos = DestPos; - mb.Length = len1 - SourcePos; + var mb = new MatchingBlock + { + SourcePos = SourcePos, + DestPos = DestPos, + Length = len1 - SourcePos + }; matchingBlocks[mbIndex++] = 
mb; } Debug.Assert(numberOfMatchingBlocks == mbIndex); - MatchingBlock finalBlock = new MatchingBlock(); - finalBlock.SourcePos = len1; - finalBlock.DestPos = len2; - finalBlock.Length = 0; + var finalBlock = new MatchingBlock + { + SourcePos = len1, + DestPos = len2, + Length = 0 + }; matchingBlocks[mbIndex] = finalBlock; @@ -499,7 +497,9 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) noOfBlocks = 0; SourcePos = DestPos = 0; - for (i = n; i != 0;) + i = n; + + while (i > 0) { while (ops[o].EditType == EditType.KEEP && --i != 0) @@ -555,9 +555,6 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) } while (i != 0 && ops[o].EditType == type && SourcePos == ops[o].SourcePos && DestPos == ops[o].DestPos); break; - - default: - break; } } @@ -570,7 +567,9 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) SourcePos = DestPos = 0; int oIndex = 0; - for (i = n; i != 0;) + i = n; + + while (i > 0) { while (ops[o].EditType == EditType.KEEP && --i != 0) @@ -648,16 +647,15 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) { Debug.Assert(len1 - SourcePos == len2 - DestPos); - if (opCodes[oIndex] == null) - opCodes[oIndex] = new OpCode(); - opCodes[oIndex].EditType = EditType.KEEP; - opCodes[oIndex].SourceBegin = SourcePos; - opCodes[oIndex].DestBegin = DestPos; - opCodes[oIndex].SourceEnd = len1; - opCodes[oIndex].DestEnd = len2; - oIndex++; + var opcode = opCodes[oIndex] ?? 
(opCodes[oIndex] = new OpCode()); + opcode.EditType = EditType.KEEP; + opcode.SourceBegin = SourcePos; + opcode.DestBegin = DestPos; + opcode.SourceEnd = len1; + opcode.DestEnd = len2; + oIndex++; } Debug.Assert(oIndex == noOfBlocks); @@ -665,13 +663,7 @@ private static OpCode[] EditOpsToOpCodes(EditOp[] ops, int len1, int len2) return opCodes; } - // Special Case - public static int EditDistance(string s1, string s2, int xcost = 0) - { - return EditDistance(s1.ToCharArray(), s2.ToCharArray(), xcost); - } - - public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEquatable + public static int EditDistance(ReadOnlySpan c1, ReadOnlySpan c2, int xcost = 0) where T: IEquatable { int i; @@ -720,7 +712,7 @@ public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEqua str1 = str2; str2 = temp; - T[] t = c2; + ReadOnlySpan t = c2; c2 = c1; c1 = t; @@ -733,10 +725,8 @@ public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEqua { return len2 + 1 - 2 * Memchr(c2, str2, c1[str1], len2); } - else - { - return len2 - Memchr(c2, str2, c1[str1], len2); - } + + return len2 - Memchr(c2, str2, c1[str1], len2); } len1++; @@ -773,7 +763,7 @@ public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEqua if (ch1.Equals(c2[c2p++])) { - x = --D; + x = D-1; } else { @@ -839,7 +829,7 @@ public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEqua /* main */ while (p <= end) { - int c3 = --D + (!ch1.Equals(c2[c2p++]) ? 1 : 0); + int c3 = D-1 + (!ch1.Equals(c2[c2p++]) ? 1 : 0); x++; if (x > c3) { @@ -856,7 +846,7 @@ public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEqua /* lower triangle sentinel */ if (i <= half) { - int c3 = --D + (!ch1.Equals(c2[c2p]) ? 1 : 0); + int c3 = D - 1 + (!ch1.Equals(c2[c2p]) ? 
1 : 0); x++; if (x > c3) { @@ -873,7 +863,7 @@ public static int EditDistance(T[] c1, T[] c2, int xcost = 0) where T: IEqua } - private static int Memchr(T[] haystack, int offset, T needle, int num) where T : IEquatable + private static int Memchr(ReadOnlySpan haystack, int offset, T needle, int num) where T : IEquatable { if (num != 0) @@ -899,20 +889,18 @@ public static double GetRatio(T[] input1, T[] input2) where T : IEquatable int len2 = input2.Length; int lensum = len1 + len2; - int editDistance = EditDistance(input1, input2, 1); + int editDistance = EditDistance(input1.AsSpan(), input2.AsSpan(), 1); return editDistance == 0 ? 1 : (lensum - editDistance) / (double)lensum; } - public static double GetRatio(IEnumerable input1, IEnumerable input2) where T : IEquatable + public static double GetRatio(ReadOnlySpan input1, ReadOnlySpan input2) where T : IEquatable { - var s1 = input1.ToArray(); - var s2 = input2.ToArray(); - int len1 = s1.Length; - int len2 = s2.Length; + int len1 = input1.Length; + int len2 = input2.Length; int lensum = len1 + len2; - int editDistance = EditDistance(s1, s2, 1); + int editDistance = EditDistance(input1, input2, 1); return editDistance == 0 ? 
1 : (lensum - editDistance) / (double)lensum; } @@ -920,7 +908,7 @@ public static double GetRatio(IEnumerable input1, IEnumerable input2) w // Special Case public static double GetRatio(string s1, string s2) { - return GetRatio(s1.ToCharArray(), s2.ToCharArray()); + return GetRatio(s1.AsSpan(), s2.AsSpan()); } } } diff --git a/FuzzySharp/PreProcess/PreprocessMode.cs b/FuzzySharp/PreProcess/PreprocessMode.cs index be801e7..b8d1dde 100644 --- a/FuzzySharp/PreProcess/PreprocessMode.cs +++ b/FuzzySharp/PreProcess/PreprocessMode.cs @@ -1,4 +1,4 @@ -namespace FuzzySharp.PreProcess +namespace Raffinert.FuzzySharp.PreProcess { public enum PreprocessMode { diff --git a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs index 0cc5647..fff0d3c 100644 --- a/FuzzySharp/PreProcess/StringPreprocessorFactory.cs +++ b/FuzzySharp/PreProcess/StringPreprocessorFactory.cs @@ -1,31 +1,35 @@ using System; -using System.Text.RegularExpressions; -namespace FuzzySharp.PreProcess +namespace Raffinert.FuzzySharp.PreProcess { - internal class StringPreprocessorFactory + internal static class StringPreprocessorFactory { - private static string pattern = "[^ a-zA-Z0-9]"; - private static string Default(string input) { - input = Regex.Replace(input, pattern, " "); - input = input.ToLower(); + if (string.IsNullOrWhiteSpace(input)) + { + return string.Empty; + } + + var result = new char[input.Length].AsSpan(); - return input.Trim(); + for (var i = 0; i < input.Length; i++) + { + var c = input[i]; + result[i] = char.IsLetterOrDigit(c) ? 
char.ToLower(c) : ' '; + } + + return ((ReadOnlySpan)result).Trim().ToString(); } public static Func GetPreprocessor(PreprocessMode mode) { - switch (mode) + return mode switch { - case PreprocessMode.Full: - return Default; - case PreprocessMode.None: - return s => s; - default: - throw new InvalidOperationException($"Invalid string preprocessor mode: {mode}"); - } + PreprocessMode.Full => Default, + PreprocessMode.None => static s => s, + _ => throw new InvalidOperationException($"Invalid string preprocessor mode: {mode}") + }; } } } diff --git a/FuzzySharp/Process.cs b/FuzzySharp/Process.cs index dbc5caf..b3f5109 100644 --- a/FuzzySharp/Process.cs +++ b/FuzzySharp/Process.cs @@ -1,12 +1,12 @@ using System; using System.Collections.Generic; -using FuzzySharp.Extractor; -using FuzzySharp.PreProcess; -using FuzzySharp.SimilarityRatio; -using FuzzySharp.SimilarityRatio.Scorer; -using FuzzySharp.SimilarityRatio.Scorer.Composite; +using Raffinert.FuzzySharp.Extractor; +using Raffinert.FuzzySharp.PreProcess; +using Raffinert.FuzzySharp.SimilarityRatio; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.Composite; -namespace FuzzySharp +namespace Raffinert.FuzzySharp { public static class Process { @@ -31,8 +31,8 @@ public static IEnumerable> ExtractAll( IRatioScorer scorer = null, int cutoff = 0) { - if (processor == null) processor = s_defaultStringProcessor; - if (scorer == null) scorer = s_defaultScorer; + processor ??= s_defaultStringProcessor; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractWithoutOrder(query, choices, processor, scorer, cutoff); } @@ -53,7 +53,28 @@ public static IEnumerable> ExtractAll( IRatioScorer scorer = null, int cutoff = 0) { - if (scorer == null) scorer = s_defaultScorer; + scorer ??= s_defaultScorer; + return ResultExtractor.ExtractWithoutOrder(query, choices, processor, scorer, cutoff); + } + + /// + /// Creates a list of ExtractedResult which contain all the choices with + 
/// their corresponding score where higher is more similar + /// + /// + /// + /// + /// + /// + /// + public static IEnumerable> ExtractAll( + string query, + IEnumerable choices, + Func processor, + IRatioScorer scorer = null, + int cutoff = 0) + { + scorer ??= s_defaultScorer; return ResultExtractor.ExtractWithoutOrder(query, choices, processor, scorer, cutoff); } #endregion @@ -78,8 +99,8 @@ public static IEnumerable> ExtractTop( int limit = 5, int cutoff = 0) { - if (processor == null) processor = s_defaultStringProcessor; - if (scorer == null) scorer = s_defaultScorer; + processor ??= s_defaultStringProcessor; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractTop(query, choices, processor, scorer, limit, cutoff); } @@ -103,7 +124,7 @@ public static IEnumerable> ExtractTop( int limit = 5, int cutoff = 0) { - if (scorer == null) scorer = s_defaultScorer; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractTop(query, choices, processor, scorer, limit, cutoff); } #endregion @@ -125,8 +146,8 @@ public static IEnumerable> ExtractSorted( IRatioScorer scorer = null, int cutoff = 0) { - if (processor == null) processor = s_defaultStringProcessor; - if (scorer == null) scorer = s_defaultScorer; + processor ??= s_defaultStringProcessor; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractSorted(query, choices, processor, scorer, cutoff); } @@ -146,7 +167,7 @@ public static IEnumerable> ExtractSorted( IRatioScorer scorer = null, int cutoff = 0) { - if (scorer == null) scorer = s_defaultScorer; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractSorted(query, choices, processor, scorer, cutoff); } #endregion @@ -168,8 +189,8 @@ public static ExtractedResult ExtractOne( IRatioScorer scorer = null, int cutoff = 0) { - if (processor == null) processor = s_defaultStringProcessor; - if (scorer == null) scorer = s_defaultScorer; + processor ??= s_defaultStringProcessor; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractOne(query, 
choices, processor, scorer, cutoff); } @@ -189,7 +210,7 @@ public static ExtractedResult ExtractOne( IRatioScorer scorer = null, int cutoff = 0) { - if (scorer == null) scorer = s_defaultScorer; + scorer ??= s_defaultScorer; return ResultExtractor.ExtractOne(query, choices, processor, scorer, cutoff); } diff --git a/FuzzySharp/README.md b/FuzzySharp/README.md deleted file mode 100644 index 14835b5..0000000 --- a/FuzzySharp/README.md +++ /dev/null @@ -1,133 +0,0 @@ -# FuzzySharp -C# .NET fuzzy string matching implementation of Seat Geek's well known python FuzzyWuzzy algorithm. - -## Usage - -Install-Package FuzzySharp -Version 2.0.1 - -## NOTES -As of version 2.0.0, if either test string is an empty string, the scorers will return a score of 0. Previously this was returning 100 for all partial ratios, which was causing severe issues for some fo the compound scorers. - -#### Simple Ratio -```csharp -Fuzz.Ratio("mysmilarstring","myawfullysimilarstirng") -72 -Fuzz.Ratio("mysmilarstring","mysimilarstring") -97 -``` - -#### Partial Ratio -```csharp -Fuzz.PartialRatio("similar", "somewhresimlrbetweenthisstring") -71 -``` - -#### Token Sort Ratio -```csharp -Fuzz.TokenSortRatio("order words out of"," words out of order") -100 -Fuzz.PartialTokenSortRatio("order words out of"," words out of order") -100 -``` - -#### Token Set Ratio -```csharp -Fuzz.TokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear") -100 -Fuzz.PartialTokenSetRatio("fuzzy was a bear", "fuzzy fuzzy fuzzy bear") -100 -``` - -#### Token Initialism Ratio -```csharp -Fuzz.TokenInitialismRatio("NASA", "National Aeronautics and Space Administration"); -89 -Fuzz.TokenInitialismRatio("NASA", "National Aeronautics Space Administration"); -100 - -Fuzz.TokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape Canaveral, Florida 32899"); -53 -Fuzz.PartialTokenInitialismRatio("NASA", "National Aeronautics Space Administration, Kennedy Space Center, Cape Canaveral, Florida 
32899"); -100 -``` - -#### Token Abbreviation Ratio -```csharp -Fuzz.TokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full); -40 -Fuzz.PartialTokenAbbreviationRatio("bl 420", "Baseline section 420", PreprocessMode.Full); -50 -``` - - -#### Weighted Ratio -```csharp -Fuzz.WeightedRatio("The quick brown fox jimps ofver the small lazy dog", "the quick brown fox jumps over the small lazy dog") -95 -``` - -#### Process -```csharp -Process.ExtractOne("cowboys", new[] { "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"}) -(string: Dallas Cowboys, score: 90, index: 3) -``` -```csharp -Process.ExtractTop("goolge", new[] { "google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" }, limit: 3); -[(string: google, score: 83, index: 0), (string: googleplus, score: 75, index: 5), (string: plexoogl, score: 43, index: 7)] -``` -```csharp -Process.ExtractAll("goolge", new [] {"google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" }) -[(string: google, score: 83, index: 0), (string: bing, score: 22, index: 1), (string: facebook, score: 29, index: 2), (string: linkedin, score: 29, index: 3), (string: twitter, score: 15, index: 4), (string: googleplus, score: 75, index: 5), (string: bingnews, score: 29, index: 6), (string: plexoogl, score: 43, index: 7)] -// score cutoff -Process.ExtractAll("goolge", new[] { "google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" }, cutoff: 40) -[(string: google, score: 83, index: 0), (string: googleplus, score: 75, index: 5), (string: plexoogl, score: 43, index: 7)] -``` -```csharp -Process.ExtractSorted("goolge", new [] {"google", "bing", "facebook", "linkedin", "twitter", "googleplus", "bingnews", "plexoogl" }) -[(string: google, score: 83, index: 0), (string: googleplus, score: 75, index: 5), (string: plexoogl, score: 43, index: 7), (string: facebook, score: 29, index: 2), (string: linkedin, score: 29, 
index: 3), (string: bingnews, score: 29, index: 6), (string: bing, score: 22, index: 1), (string: twitter, score: 15, index: 4)] -``` - -Extraction will use `WeightedRatio` and `full process` by default. Override these in the method parameters to use different scorers and processing. -Here we use the Fuzz.Ratio scorer and keep the strings as is, instead of Full Process (which will .ToLowercase() before comparing) -```csharp -Process.ExtractOne("cowboys", new[] { "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys" }, s => s, ScorerCache.Get()); -(string: Dallas Cowboys, score: 57, index: 3) -``` - -Extraction can operate on objects of similar type. Use the "process" parameter to reduce the object to the string which it should be compared on. In the following example, the object is an array that contains the matchup, the arena, the date, and the time. We are matching on the first (0 index) parameter, the matchup. -```csharp -var events = new[] -{ - new[] { "chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm" }, - new[] { "new york yankees vs boston red sox", "Fenway Park", "2011-05-11", "8pm" }, - new[] { "atlanta braves vs pittsburgh pirates", "PNC Park", "2011-05-11", "8pm" }, -}; -var query = new[] { "new york mets vs chicago cubs", "CitiField", "2017-03-19", "8pm" }; -var best = Process.ExtractOne(query, events, strings => strings[0]); - -best: (value: { "chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm" }, score: 95, index: 0) -``` - -### Using Different Scorers -Scoring strategies are stateless, and as such should be static. However, in order to get them to share all the code they have in common via inheritance, making them static was not possible. -Currently one way around having to new up an instance everytime you want to use one is to use the cache. This will ensure only one instance of each scorer ever exists. 
-```csharp -var ratio = ScorerCache.Get(); -var partialRatio = ScorerCache.Get(); -var tokenSet = ScorerCache.Get(); -var partialTokenSet = ScorerCache.Get(); -var tokenSort = ScorerCache.Get(); -var partialTokenSort = ScorerCache.Get(); -var tokenAbbreviation = ScorerCache.Get(); -var partialTokenAbbreviation = ScorerCache.Get(); -var weighted = ScorerCache.Get(); -``` - -## Credits - -- SeatGeek -- Adam Cohen -- David Necas (python-Levenshtein) -- Mikko Ohtamaa (python-Levenshtein) -- Antti Haapala (python-Levenshtein) -- Panayiotis (Java implementation I heavily borrowed from) diff --git a/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs index b3744ec..e8f7b03 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/Composite/WeightedRatioScorer.cs @@ -1,7 +1,6 @@ using System; -using System.Linq; -namespace FuzzySharp.SimilarityRatio.Scorer.Composite +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.Composite { public class WeightedRatioScorer : ScorerBase { @@ -38,16 +37,12 @@ public override int Score(string input1, string input2) double partialSor = Fuzz.TokenSortRatio(input1, input2) * unbaseScale * partialScale; double partialSet = Fuzz.TokenSetRatio(input1, input2) * unbaseScale * partialScale; - return (int) Math.Round(new[] { baseRatio, partial, partialSor, partialSet }.Max()); + return (int) Math.Round(Math.Max(baseRatio, Math.Max(partial, Math.Max(partialSor, partialSet)))); } - else - { - double tokenSort = Fuzz.TokenSortRatio(input1, input2) * unbaseScale; - double tokenSet = Fuzz.TokenSetRatio(input1, input2) * unbaseScale; - return (int) Math.Round(new[] { baseRatio, tokenSort, tokenSet }.Max()); - } + double tokenSort = Fuzz.TokenSortRatio(input1, input2) * unbaseScale; + double tokenSet = Fuzz.TokenSetRatio(input1, input2) * unbaseScale; + return (int) Math.Round(Math.Max(baseRatio, 
Math.Max(tokenSort, tokenSet))); } - } } diff --git a/FuzzySharp/SimilarityRatio/Scorer/Generic/IRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/Generic/IRatioScorer.cs index 4584037..900bf56 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/Generic/IRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/Generic/IRatioScorer.cs @@ -1,6 +1,6 @@ using System; -namespace FuzzySharp.SimilarityRatio.Scorer.Generic +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.Generic { public interface IRatioScorer where T : IEquatable { diff --git a/FuzzySharp/SimilarityRatio/Scorer/Generic/ScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/Generic/ScorerBase.cs index 7898e76..90605a7 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/Generic/ScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/Generic/ScorerBase.cs @@ -1,6 +1,6 @@ using System; -namespace FuzzySharp.SimilarityRatio.Scorer.Generic +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.Generic { public abstract class ScorerBase : IRatioScorer where T : IEquatable { diff --git a/FuzzySharp/SimilarityRatio/Scorer/IRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/IRatioScorer.cs index 1a69926..526e70b 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/IRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/IRatioScorer.cs @@ -1,6 +1,6 @@ -using FuzzySharp.PreProcess; +using Raffinert.FuzzySharp.PreProcess; -namespace FuzzySharp.SimilarityRatio.Scorer +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer { public interface IRatioScorer { diff --git a/FuzzySharp/SimilarityRatio/Scorer/ScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/ScorerBase.cs index f898d90..a44c819 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/ScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/ScorerBase.cs @@ -1,6 +1,6 @@ -using FuzzySharp.PreProcess; +using Raffinert.FuzzySharp.PreProcess; -namespace FuzzySharp.SimilarityRatio.Scorer +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer { public abstract class ScorerBase : IRatioScorer { diff 
--git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Generic/StrategySensitiveScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Generic/StrategySensitiveScorerBase.cs index cec7017..20471d5 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Generic/StrategySensitiveScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Generic/StrategySensitiveScorerBase.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Scorer.Generic; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.Generic; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive.Generic +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive.Generic { public abstract class StrategySensitiveScorerBase : ScorerBase where T : IEquatable { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs index 12ef6d1..393f972 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/DefaultRatioScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class DefaultRatioScorer : SimpleRatioScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs index 049d8af..3127672 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/PartialRatioScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using 
Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class PartialRatioScorer : SimpleRatioScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/SimpleRatioScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/SimpleRatioScorerBase.cs index e2a15d4..8b24c56 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/SimpleRatioScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/Simple/SimpleRatioScorerBase.cs @@ -1,4 +1,4 @@ -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public abstract class SimpleRatioScorerBase : StrategySensitiveScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/StrategySensitiveScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/StrategySensitiveScorerBase.cs index 6f01e30..3414bb3 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/StrategySensitiveScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/StrategySensitiveScorerBase.cs @@ -1,6 +1,6 @@ using System; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public abstract class StrategySensitiveScorerBase : ScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/PartialTokenAbbreviationScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/PartialTokenAbbreviationScorer.cs index 4812645..6e30b05 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/PartialTokenAbbreviationScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/PartialTokenAbbreviationScorer.cs @@ -1,7 +1,7 @@ using System; -using 
FuzzySharp.SimilarityRatio.Strategy; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class PartialTokenAbbreviationScorer : TokenAbbreviationScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorer.cs index 847fec8..9650ce2 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class TokenAbbreviationScorer : TokenAbbreviationScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs index 98c95ce..2dca081 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenAbbreviation/TokenAbbreviationScorerBase.cs @@ -1,9 +1,9 @@ -using System.Collections.Generic; -using System.Linq; -using System.Text.RegularExpressions; -using FuzzySharp.Utils; +using System; +using System.Collections.Generic; +using Raffinert.FuzzySharp.Extensions; +using Raffinert.FuzzySharp.Utils; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public abstract class TokenAbbreviationScorerBase : 
StrategySensitiveScorerBase { @@ -23,25 +23,25 @@ public override int Score(string input1, string input2) longer = input1; } - double lenRatio = ((double)longer.Length) / shorter.Length; + double lenRatio = (double)longer.Length / shorter.Length; // if longer isn't at least 1.5 times longer than the other, then its probably not an abbreviation if (lenRatio < 1.5) return 0; // numbers can't be abbreviations for other numbers, though that would be hilarious. "Yes, 4 - as in 4,238" - var tokensLonger = Regex.Matches(longer, @"[a-zA-Z]+").Cast().Select(m => m.Value).ToArray(); - var tokensShorter = Regex.Matches(shorter, @"[a-zA-Z]+").Cast().Select(m => m.Value).ToArray(); + var tokensLonger = longer.ExtractTokens(); + var tokensShorter = shorter.ExtractTokens(); // more than 4 tokens and it's probably not an abbreviation (and could get costly) - if (tokensShorter.Length > 4) + if (tokensShorter.Count > 4) { return 0; } - string[] moreTokens; - string[] fewerTokens; + List moreTokens; + List fewerTokens; - if (tokensLonger.Length > tokensShorter.Length) + if (tokensLonger.Count > tokensShorter.Count) { moreTokens = tokensLonger; fewerTokens = tokensShorter; @@ -52,26 +52,31 @@ public override int Score(string input1, string input2) fewerTokens = tokensLonger; } - var allPermutations = moreTokens.PermutationsOfSize(fewerTokens.Length); + var allPermutations = moreTokens.PermutationsOfSize(fewerTokens.Count); + + int maxScore = 0; - List allScores = new List(); foreach (var permutation in allPermutations) { double sum = 0; - for (int i = 0; i < fewerTokens.Length; i++) + for (int i = 0; i < fewerTokens.Count; i++) { var i1 = permutation[i]; var i2 = fewerTokens[i]; - if (StringContainsInOrder(i1, i2)) // must be at least twice as long + if (StringContainsInOrder(i1.AsSpan(), i2.AsSpan())) // must be at least twice as long { var score = Scorer(i1, i2); sum += score; } } - allScores.Add((int) (sum / fewerTokens.Length)); + var avgScore = (int) (sum / fewerTokens.Count); + 
if(avgScore > maxScore) + { + maxScore = avgScore; + } } - return allScores.Count==0?0:allScores.Max(); + return maxScore; } /// @@ -80,7 +85,7 @@ public override int Score(string input1, string input2) /// /// /// - private bool StringContainsInOrder(string s1, string s2) + private static bool StringContainsInOrder(ReadOnlySpan s1, ReadOnlySpan s2) { if (s1.Length < s2.Length) return false; int s2_idx = 0; diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs index a216197..6c76275 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/PartialTokenDifferenceScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy.Generic; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy.Generic; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class PartialTokenDifferenceScorer : TokenDifferenceScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs index fc2bfb9..21455a1 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy.Generic; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy.Generic; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class TokenDifferenceScorer : TokenDifferenceScorerBase { 
diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorerBase.cs index 11036af..09734c1 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenDifference/TokenDifferenceScorerBase.cs @@ -1,9 +1,8 @@ -using System.Linq; -using System.Text.RegularExpressions; -using FuzzySharp.PreProcess; -using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive.Generic; +using Raffinert.FuzzySharp.Extensions; +using Raffinert.FuzzySharp.PreProcess; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive.Generic; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public abstract class TokenDifferenceScorerBase : StrategySensitiveScorerBase, IRatioScorer { @@ -14,8 +13,8 @@ public override int Score(string[] input1, string[] input2) public int Score(string input1, string input2) { - var tokens1 = Regex.Split(input1, @"\s+").Where(s => s.Any()).OrderBy(s => s).ToArray(); - var tokens2 = Regex.Split(input2, @"\s+").Where(s => s.Any()).OrderBy(s => s).ToArray(); + var tokens1 = input1.GetSortedWords(); + var tokens2 = input2.GetSortedWords(); return Score(tokens1, tokens2); } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/PartialTokenInitialismScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/PartialTokenInitialismScorer.cs index 1eda5b8..2ebcc55 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/PartialTokenInitialismScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/PartialTokenInitialismScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using 
Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class PartialTokenInitialismScorer : TokenInitialismScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorer.cs index 3ea2293..e6dfa82 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class TokenInitialismScorer : TokenInitialismScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs index 10aa1af..8a8e293 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenInitialism/TokenInitialismScorerBase.cs @@ -1,7 +1,7 @@ using System.Linq; -using System.Text.RegularExpressions; +using Raffinert.FuzzySharp.Extensions; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public abstract class TokenInitialismScorerBase : StrategySensitiveScorerBase { @@ -26,9 +26,9 @@ public override int Score(string input1, string input2) // if longer isn't at least 3 times longer than the other, then it's probably not an initialism if (lenRatio < 3) return 0; - var 
initials = Regex.Split(longer, @"\s+").Where(s => s.Any()).Select(s => s[0]); + var initials = longer.SplitByAnySpace().Select(s => s[0]).ToArray(); - return Scorer(string.Join("", initials), shorter); + return Scorer(new string(initials), shorter); } } } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/PartialTokenSetScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/PartialTokenSetScorer.cs index 1011eed..54f4195 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/PartialTokenSetScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/PartialTokenSetScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class PartialTokenSetScorer : TokenSetScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorer.cs index fd88724..7f254be 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class TokenSetScorer : TokenSetScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs index 785de55..1d3cc31 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs +++ 
b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSet/TokenSetScorerBase.cs @@ -1,27 +1,44 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Text.RegularExpressions; +using Raffinert.FuzzySharp.Extensions; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public abstract class TokenSetScorerBase : StrategySensitiveScorerBase { public override int Score(string input1, string input2) { - var tokens1 = new HashSet(Regex.Split(input1, @"\s+").Where(s => s.Any())); - var tokens2 = new HashSet(Regex.Split(input2, @"\s+").Where(s => s.Any())); + var tokens1 = new HashSet(input1.SplitByAnySpace()); + var tokens2 = new HashSet(input2.SplitByAnySpace()); - var sortedIntersection = String.Join(" ", tokens1.Intersect(tokens2).OrderBy(s => s)).Trim(); - var sortedDiff1To2 = (sortedIntersection + " " + String.Join(" ", tokens1.Except(tokens2).OrderBy(s => s))).Trim(); - var sortedDiff2To1 = (sortedIntersection + " " + String.Join(" ", tokens2.Except(tokens1).OrderBy(s => s))).Trim(); + var intersection = GetIntersectionAndExcept(tokens1, tokens2); - return new[] + var sortedIntersection = string.Join(" ", intersection.OrderBy(s => s)); + var sortedDiff1To2 = (sortedIntersection + " " + string.Join(" ", tokens1.OrderBy(s => s))).Trim(); + var sortedDiff2To1 = (sortedIntersection + " " + string.Join(" ", tokens2.OrderBy(s => s))).Trim(); + + var score1 = Scorer(sortedIntersection, sortedDiff1To2); + var score2 = Scorer(sortedIntersection, sortedDiff2To1); + var score3 = Scorer(sortedDiff1To2, sortedDiff2To1); + + return Math.Max(score1, Math.Max(score2, score3)); + } + + private static List GetIntersectionAndExcept(HashSet first, HashSet second) + { + List intersection = []; + + foreach (var item in first.ToArray()) { - Scorer(sortedIntersection, sortedDiff1To2), - Scorer(sortedIntersection, sortedDiff2To1), - Scorer(sortedDiff1To2, sortedDiff2To1) - 
}.Max(); + if (second.Remove(item)) + { + first.Remove(item); + intersection.Add(item); + } + } + + return intersection; } } } diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/PartialTokenSortScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/PartialTokenSortScorer.cs index 5faa0e1..5aec4ae 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/PartialTokenSortScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/PartialTokenSortScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class PartialTokenSortScorer : TokenSortScorerBase { diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortAlgorithm.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortAlgorithm.cs index dbfa10a..e11e6d7 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortAlgorithm.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortAlgorithm.cs @@ -1,15 +1,13 @@ -using System; -using System.Linq; -using System.Text.RegularExpressions; +using Raffinert.FuzzySharp.Extensions; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public abstract class TokenSortScorerBase : StrategySensitiveScorerBase { public override int Score(string input1, string input2) { - var sorted1 = String.Join(" ", Regex.Split(input1, @"\s+").Where(s => s.Any()).OrderBy(s => s)).Trim(); - var sorted2 = String.Join(" ", Regex.Split(input2, @"\s+").Where(s => s.Any()).OrderBy(s => s)).Trim(); + var sorted1 = input1.NormalizeSpacesAndSort(); + var sorted2 = input2.NormalizeSpacesAndSort(); return Scorer(sorted1, sorted2); 
} diff --git a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortScorer.cs b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortScorer.cs index 8cd6821..4a3e8f5 100644 --- a/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortScorer.cs +++ b/FuzzySharp/SimilarityRatio/Scorer/StrategySensitive/TokenSort/TokenSortScorer.cs @@ -1,7 +1,7 @@ using System; -using FuzzySharp.SimilarityRatio.Strategy; +using Raffinert.FuzzySharp.SimilarityRatio.Strategy; -namespace FuzzySharp.SimilarityRatio.Scorer.StrategySensitive +namespace Raffinert.FuzzySharp.SimilarityRatio.Scorer.StrategySensitive { public class TokenSortScorer : TokenSortScorerBase { diff --git a/FuzzySharp/SimilarityRatio/ScorerCache.cs b/FuzzySharp/SimilarityRatio/ScorerCache.cs index 34b405e..a1f2ca9 100644 --- a/FuzzySharp/SimilarityRatio/ScorerCache.cs +++ b/FuzzySharp/SimilarityRatio/ScorerCache.cs @@ -1,15 +1,17 @@ -using System; -using System.Collections.Concurrent; -using FuzzySharp.SimilarityRatio.Scorer; +using System.Runtime.CompilerServices; +using Raffinert.FuzzySharp.SimilarityRatio.Scorer; -namespace FuzzySharp.SimilarityRatio +namespace Raffinert.FuzzySharp.SimilarityRatio { public static class ScorerCache { - private static readonly ConcurrentDictionary s_scorerCache = new ConcurrentDictionary(); - public static IRatioScorer Get() where T : IRatioScorer, new() + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static IRatioScorer Get() where T : IRatioScorer, new() => GenericCache.Instance; + + private static class GenericCache + where T : IRatioScorer, new() { - return s_scorerCache.GetOrAdd(typeof(T), new T()); + public static readonly T Instance = new T(); } } -} +} \ No newline at end of file diff --git a/FuzzySharp/SimilarityRatio/Strategy/DefaultRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/DefaultRatioStrategy.cs index 8e8fac2..d7676a0 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/DefaultRatioStrategy.cs +++ 
b/FuzzySharp/SimilarityRatio/Strategy/DefaultRatioStrategy.cs @@ -1,8 +1,8 @@ using System; -namespace FuzzySharp.SimilarityRatio.Strategy +namespace Raffinert.FuzzySharp.SimilarityRatio.Strategy { - internal class DefaultRatioStrategy + internal static class DefaultRatioStrategy { public static int Calculate(string input1, string input2) { diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/DefaultRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/DefaultRatioStrategyT.cs index 2fdfb08..857673f 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/DefaultRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/DefaultRatioStrategyT.cs @@ -1,8 +1,8 @@ using System; -namespace FuzzySharp.SimilarityRatio.Strategy.Generic +namespace Raffinert.FuzzySharp.SimilarityRatio.Strategy.Generic { - internal class DefaultRatioStrategy where T : IEquatable + internal static class DefaultRatioStrategy where T : IEquatable { public static int Calculate(T[] input1, T[] input2) { diff --git a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs index a536da4..badd858 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/Generic/PartialRatioStrategyT.cs @@ -1,11 +1,9 @@ using System; -using System.Collections.Generic; -using System.Linq; -using FuzzySharp.Edits; +using Raffinert.FuzzySharp.Edits; -namespace FuzzySharp.SimilarityRatio.Strategy.Generic +namespace Raffinert.FuzzySharp.SimilarityRatio.Strategy.Generic { - internal class PartialRatioStrategy where T : IEquatable + internal static class PartialRatioStrategy where T : IEquatable { public static int Calculate(T[] input1, T[] input2) { @@ -30,7 +28,7 @@ public static int Calculate(T[] input1, T[] input2) MatchingBlock[] matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); - List scores = new List(); + double maxScore = 0; 
foreach (var matchingBlock in matchingBlocks) { @@ -41,20 +39,22 @@ public static int Calculate(T[] input1, T[] input2) if (longEnd > longer.Length) longEnd = longer.Length; - var longSubstr = longer.Skip(longStart).Take(longEnd - longStart); + var longSubstr = longer.AsSpan()[longStart..longEnd]; - double ratio = Levenshtein.GetRatio(shorter, longSubstr); + double ratio = Levenshtein.GetRatio(shorter.AsSpan(), longSubstr); if (ratio > .995) { return 100; } - scores.Add(ratio); - + if (ratio > maxScore) + { + maxScore = ratio; + } } - return (int)Math.Round(100 * scores.Max()); + return (int)Math.Round(100 * maxScore); } } } diff --git a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs index 1d25991..45bbf28 100644 --- a/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs +++ b/FuzzySharp/SimilarityRatio/Strategy/PartialRatioStrategy.cs @@ -1,36 +1,34 @@ using System; -using System.Collections.Generic; -using System.Linq; -using FuzzySharp.Edits; +using Raffinert.FuzzySharp.Edits; -namespace FuzzySharp.SimilarityRatio.Strategy +namespace Raffinert.FuzzySharp.SimilarityRatio.Strategy { - internal class PartialRatioStrategy + internal static class PartialRatioStrategy { public static int Calculate(string input1, string input2) { - string shorter; - string longer; - if (input1.Length == 0 || input2.Length == 0) { return 0; } + ReadOnlySpan shorter; + ReadOnlySpan longer; + if (input1.Length < input2.Length) { - shorter = input1; - longer = input2; + shorter = input1.AsSpan(); + longer = input2.AsSpan(); } else { - shorter = input2; - longer = input1; + shorter = input2.AsSpan(); + longer = input1.AsSpan(); } MatchingBlock[] matchingBlocks = Levenshtein.GetMatchingBlocks(shorter, longer); - List scores = new List(); + double maxScore = 0; foreach (var matchingBlock in matchingBlocks) { @@ -41,7 +39,7 @@ public static int Calculate(string input1, string input2) if (longEnd > longer.Length) 
longEnd = longer.Length; - string longSubstr = longer.Substring(longStart, longEnd - longStart); + var longSubstr = longer[longStart..longEnd]; double ratio = Levenshtein.GetRatio(shorter, longSubstr); @@ -50,11 +48,13 @@ public static int Calculate(string input1, string input2) return 100; } - scores.Add(ratio); - + if (ratio > maxScore) + { + maxScore = ratio; + } } - return (int)Math.Round(100 * scores.Max()); + return (int)Math.Round(100 * maxScore); } } } diff --git a/FuzzySharp/Utils/Heap.cs b/FuzzySharp/Utils/Heap.cs index b890982..9c3e611 100644 --- a/FuzzySharp/Utils/Heap.cs +++ b/FuzzySharp/Utils/Heap.cs @@ -3,7 +3,7 @@ using System.Collections.Generic; using System.Linq; -namespace FuzzySharp.Utils +namespace Raffinert.FuzzySharp.Utils { public abstract class Heap : IEnumerable { @@ -11,13 +11,11 @@ public abstract class Heap : IEnumerable private const int GrowFactor = 2; private const int MinGrow = 1; - private int _capacity = InitialCapacity; private T[] _heap = new T[InitialCapacity]; - private int _tail = 0; - public int Count => _tail; + public int Count { get; private set; } - public int Capacity => _capacity; + public int Capacity { get; private set; } = InitialCapacity; protected Comparer Comparer { get; } protected abstract bool Dominates(T x, T y); @@ -26,7 +24,7 @@ protected Heap() : this(Comparer.Default) { } - protected Heap(Comparer comparer) : this(Enumerable.Empty(), comparer) + protected Heap(Comparer comparer) : this([], comparer) { } @@ -37,19 +35,18 @@ protected Heap(IEnumerable collection) protected Heap(IEnumerable collection, Comparer comparer) { - if (collection == null) throw new ArgumentNullException(nameof(collection)); - Comparer = comparer ?? throw new ArgumentNullException(nameof(comparer)); + _ = collection ?? 
throw new ArgumentNullException(nameof(collection)); foreach (var item in collection) { if (Count == Capacity) Grow(); - _heap[_tail++] = item; + _heap[Count++] = item; } - for (int i = Parent(_tail - 1); i >= 0; i--) + for (int i = Parent(Count - 1); i >= 0; i--) BubbleDown(i); } @@ -58,8 +55,8 @@ public void Add(T item) if (Count == Capacity) Grow(); - _heap[_tail++] = item; - BubbleUp(_tail - 1); + _heap[Count++] = item; + BubbleUp(Count - 1); } private void BubbleUp(int i) @@ -83,8 +80,8 @@ public T ExtractDominating() { if (Count == 0) throw new InvalidOperationException("Heap is empty"); T ret = _heap[0]; - _tail--; - Swap(_tail, 0); + Count--; + Swap(Count, 0); BubbleDown(0); return ret; } @@ -93,7 +90,7 @@ private void BubbleDown(int i) { while (true) { - int dominatingNode = Dominating(i); + var dominatingNode = Dominating(i); if (dominatingNode == i) return; Swap(i, dominatingNode); i = dominatingNode; @@ -111,17 +108,15 @@ private int Dominating(int i) private int GetDominating(int newNode, int dominatingNode) { - if (newNode < _tail && !Dominates(_heap[dominatingNode], _heap[newNode])) + if (newNode < Count && !Dominates(_heap[dominatingNode], _heap[newNode])) return newNode; - else - return dominatingNode; + + return dominatingNode; } private void Swap(int i, int j) { - T tmp = _heap[i]; - _heap[i] = _heap[j]; - _heap[j] = tmp; + (_heap[i], _heap[j]) = (_heap[j], _heap[i]); } private static int Parent(int i) @@ -141,11 +136,11 @@ private static int OldChild(int i) private void Grow() { - int newCapacity = _capacity * GrowFactor + MinGrow; + int newCapacity = Capacity * GrowFactor + MinGrow; var newHeap = new T[newCapacity]; - Array.Copy(_heap, newHeap, _capacity); + Array.Copy(_heap, newHeap, Capacity); _heap = newHeap; - _capacity = newCapacity; + Capacity = newCapacity; } public IEnumerator GetEnumerator() diff --git a/FuzzySharp/Utils/Permutation.cs b/FuzzySharp/Utils/Permutation.cs index e6c0976..aa9fad1 100644 --- a/FuzzySharp/Utils/Permutation.cs 
+++ b/FuzzySharp/Utils/Permutation.cs @@ -2,7 +2,7 @@ using System.Collections.Generic; using System.Linq; -namespace FuzzySharp.Utils +namespace Raffinert.FuzzySharp.Utils { public class Permutor where T : IComparable { @@ -15,7 +15,7 @@ public Permutor(IEnumerable set) public List PermutationAt(long i) { - var set = new List(_set.OrderBy(e => e).ToList()); + var set = new List(_set.OrderBy(e => e)); for (long j = 0; j < i - 1; j++) { NextPermutation(set); @@ -62,22 +62,22 @@ public bool NextPermutation(List set) public static class Permutation { - public static List> AllPermutations(this IEnumerable seed) + private static IEnumerable> AllPermutations(this IEnumerable seed) { var set = new List(seed); - return Permute(set, 0, set.Count - 1).ToList(); + return Permute(set, 0, set.Count - 1); } - public static List> PermutationsOfSize(this IEnumerable seed, int size) + public static IEnumerable> PermutationsOfSize(this List seed, int size) { - if (seed.Count() < size) - { - return new List>(); - } - return seed.PermutationsOfSize(new List(), size).ToList(); + var result = seed.Count < size + ? 
[] + : seed.PermutationsOfSize([], size); + + return result; } - private static IEnumerable> PermutationsOfSize(this IEnumerable seed, List set, int size) + private static IEnumerable> PermutationsOfSize(this List seed, List set, int size) { if (size == 0) { @@ -85,17 +85,16 @@ private static IEnumerable> PermutationsOfSize(this IEnumerable se { yield return permutation; } + + yield break; } - else + + for (int i = 0; i < seed.Count; i++) { - var seedAsList = seed.ToList(); - for (int i = 0; i < seedAsList.Count; i++) + var newSet = new List(set) { seed[i] }; + foreach (var permutation in seed.Skip(i + 1).ToList().PermutationsOfSize(newSet, size - 1)) { - var newSet = new List(set) { seedAsList[i] }; - foreach (var permutation in seedAsList.Skip(i + 1).PermutationsOfSize(newSet, size - 1)) - { - yield return permutation; - } + yield return permutation; } } } @@ -104,7 +103,7 @@ private static IEnumerable> Permute(List set, int start, int end) { if (start == end) { - yield return new List(set); + yield return [..set]; } else { @@ -122,9 +121,7 @@ private static IEnumerable> Permute(List set, int start, int end) private static void Swap(List set, int a, int b) { - var temp = set[a]; - set[a] = set[b]; - set[b] = temp; + (set[a], set[b]) = (set[b], set[a]); } public static IEnumerable> Cycles(IEnumerable seed) @@ -132,8 +129,8 @@ public static IEnumerable> Cycles(IEnumerable seed) var set = new LinkedList(seed); for (int i = 0; i < set.Count; i++) { - yield return new List(set); - var top = set.First(); + yield return [..set]; + var top = set.First!; set.RemoveFirst(); set.AddLast(top); } diff --git a/README.md b/README.md index 77fdf38..0cd0a0c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,18 @@ -# FuzzySharp +# Raffinert.FuzzySharp + C# .NET fuzzy string matching implementation of Seat Geek's well known python FuzzyWuzzy algorithm. +A refined version of original [FuzzySharp](https://github.com/JakeBayer/FuzzySharp). The original one looks abandoned. 
+ # Release Notes: +v.2.0.3 + +Focus on performance and allocations. See [Benchmark](https://github.com/Raffinert/FuzzySharp/blob/dc2b858dc4cc56d8cdf26411904e255a019b0549/FuzzySharp.Benchmarks/BenchmarkDotNet.Artifacts/results/Raffinert.FuzzySharp.Benchmarks.BenchmarkAll-report-github.md). +Supports non-English languages more naturally (removed the "a-zA-Z" regexps). All regexps were replaced with string manipulations (fixes [PR!7](https://github.com/JakeBayer/FuzzySharp/pull/7)). +Extra performance improvements, reusing the approach of [Dmitry Sushchevsky](https://github.com/blowin) - see [PR!42](https://github.com/JakeBayer/FuzzySharp/pull/42). +Implemented a new Process.ExtractAll method, see [Issue!46](https://github.com/JakeBayer/FuzzySharp/issues/46). +Removed support for the outdated/vulnerable platforms netcoreapp2.0, netcoreapp2.1 and netstandard1.6. + v.2.0.0 As of 2.0.0, all empty strings will return a score of 0. Prior, the partial scoring system would return a score of 100, regardless if the other input had correct value or not. This was a result of the partial scoring system returning an empty set for the matching blocks As a result, this led to incorrrect values in the composite scores; several of them (token set, token sort), relied on the prior value of empty strings. @@ -11,7 +22,7 @@ As a result, many 1.X.X unit test may be broken with the 2.X.X upgrade, but it i ## Usage -Install-Package FuzzySharp +Install-Package Raffinert.FuzzySharp #### Simple Ratio ```csharp @@ -113,18 +124,6 @@ var best = Process.ExtractOne(query, events, strings => strings[0]); best: (value: { "chicago cubs vs new york mets", "CitiField", "2011-05-11", "8pm" }, score: 95, index: 0) ``` -### FuzzySharp in Different Languages -FuzzySharp was written with English in mind, and as such the Default string preprocessor only looks at English alphanumeric characters in the input strings, and will strip all others out. 
However, the `Extract` methods in the `Process` class do provide the option to specify your own string preprocessor. If this parameter is omitted, the Default will be used. However if you provide your own, the provided one will be used, so you are free to provide your own criteria for whatever character set you want to admit. For instance, using the parameter `(s) => s` will prevent the string from being altered at all before being run through the similarity algorithms. - -E.g., - -```csharp -var query = "strng"; -var choices = new [] { "stríng", "stráng", "stréng" }; -var results = Process.ExtractAll(query, choices, (s) => s); -``` -The above will run the similarity algorithm on all the choices without stripping out the accented characters. - ### Using Different Scorers Scoring strategies are stateless, and as such should be static. However, in order to get them to share all the code they have in common via inheritance, making them static was not possible. Currently one way around having to new up an instance everytime you want to use one is to use the cache. This will ensure only one instance of each scorer ever exists.