From f2dbc2cbfd8fd91061103681c55902fc03e8d65f Mon Sep 17 00:00:00 2001 From: Peter Chapman Date: Thu, 4 Dec 2025 09:48:47 +1300 Subject: [PATCH] Fix crash in surrogate pair handling of multibyte Unicode characters --- .../PunctuationAnalysis/TextSegment.cs | 12 ++++---- .../QuotationMarkFinderTests.cs | 28 +++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs index e4a6d06f..229aec56 100644 --- a/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs +++ b/src/SIL.Machine/PunctuationAnalysis/TextSegment.cs @@ -187,13 +187,13 @@ public class SurrogatePairString public SurrogatePairString(string stringValue) { _stringValue = stringValue; - IEnumerable<(int SurrogatePairIndex, int StringIndex)> indexPairs = _stringValue - .Select((c, i) => (c, i)) + IEnumerable<(int StringIndex, int SurrogatePairIndex)> indexPairs = _stringValue + .Select((c, stringIndex) => (c, stringIndex)) .Where(tup => !char.IsLowSurrogate(tup.c)) - .Select((tup, i) => (tup.i, i)); + .Select((tup, surrogatePairIndex) => (tup.stringIndex, surrogatePairIndex)); _surrogatePairIndexByStringIndex = new Dictionary(); _stringIndexBySurrogatePairIndex = new Dictionary(); - foreach ((int surrogatePairIndex, int stringIndex) in indexPairs) + foreach ((int stringIndex, int surrogatePairIndex) in indexPairs) { _surrogatePairIndexByStringIndex[stringIndex] = surrogatePairIndex; _stringIndexBySurrogatePairIndex[surrogatePairIndex] = stringIndex; @@ -251,11 +251,11 @@ public string Substring(int startSurrogatePairIndex, int length) public int GetStringIndexForSurrogatePairIndex(int surrogatePairIndex) { - if (surrogatePairIndex == _surrogatePairIndexByStringIndex.Count) + if (surrogatePairIndex == _stringIndexBySurrogatePairIndex.Count) { return _stringValue.Length; } - return _surrogatePairIndexByStringIndex[surrogatePairIndex]; + return 
_stringIndexBySurrogatePairIndex[surrogatePairIndex]; } } } diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs index 0af3f39c..056efa23 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationMarkFinderTests.cs @@ -441,4 +441,32 @@ public void ThatItUsesTheQuoteConventionSet() ) ); } + + [Test] + public void SupportsMultibyteUnicodeCharacters() + { + var quotationMarkFinder = new QuotationMarkFinder(QuoteConventions.Standard); + + // [grinning face], [left double quotation mark][grinning face with big eyes][right double quotation mark] + Assert.That( + quotationMarkFinder + .FindAllPotentialQuotationMarksInTextSegment( + new TextSegment.Builder().SetText("\U0001f600, \u201c\U0001f603\u201d").Build() + ) + .SequenceEqual( + [ + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\U0001f600, \u201c\U0001f603\u201d").Build(), + 3, + 4 + ), + new QuotationMarkStringMatch( + new TextSegment.Builder().SetText("\U0001f600, \u201c\U0001f603\u201d").Build(), + 5, + 6 + ), + ] + ) + ); + } }