diff --git a/src/ManagedCode.GraphRag/Chunking/MarkdownTextChunker.cs b/src/ManagedCode.GraphRag/Chunking/MarkdownTextChunker.cs index 092a566e9..67febeedb 100644 --- a/src/ManagedCode.GraphRag/Chunking/MarkdownTextChunker.cs +++ b/src/ManagedCode.GraphRag/Chunking/MarkdownTextChunker.cs @@ -1,3 +1,4 @@ +using System.Buffers; using System.Text; using GraphRag.Config; using GraphRag.Tokenization; @@ -14,7 +15,7 @@ public IReadOnlyList Chunk(IReadOnlyList slices, Chunking if (slices.Count == 0) { - return Array.Empty(); + return []; } var tokenizer = TokenizerRegistry.GetTokenizer(config.EncodingModel); @@ -31,13 +32,13 @@ public IReadOnlyList Chunk(IReadOnlyList slices, Chunking var fragments = Split(slice.Text, options, tokenizer); foreach (var fragment in fragments) { - var tokens = tokenizer.EncodeToIds(fragment); - if (tokens.Count == 0) + var tokenCount = tokenizer.CountTokens(fragment.AsSpan()); + if (tokenCount == 0) { continue; } - results.Add(new TextChunk(new[] { slice.DocumentId }, fragment, tokens.Count)); + results.Add(new TextChunk([slice.DocumentId], fragment, tokenCount)); } } @@ -51,21 +52,39 @@ private List Split(string text, MarkdownChunkerOptions options, Tokenize var primarySize = options.MaxTokensPerChunk; var secondarySize = Math.Max(MinChunkSize, options.MaxTokensPerChunk - options.Overlap); - var rawChunks = RecursiveSplit(text, primarySize, secondarySize, SeparatorType.ExplicitSeparator, tokenizer, ref firstChunkDone); + var rawChunkRanges = RecursiveSplitRanges( + text, 0..text.Length, + primarySize, secondarySize, + SeparatorType.ExplicitSeparator, tokenizer, ref firstChunkDone); + + List rawChunks; - if (options.Overlap > 0 && rawChunks.Count > 1) + if (options.Overlap > 0 && rawChunkRanges.Count > 1) { - var newChunks = new List { rawChunks[0] }; + rawChunks = new List(rawChunkRanges.Count); - for (var index = 1; index < rawChunks.Count; index++) + var firstChunkText = text[rawChunkRanges[0]]; + rawChunks.Add(firstChunkText); + var 
previousTokens = tokenizer.EncodeToIds(firstChunkText.AsSpan()); + + for (var i = 1; i < rawChunkRanges.Count; i++) { - var previousTokens = tokenizer.EncodeToIds(rawChunks[index - 1]); - var overlapTokens = previousTokens.Skip(Math.Max(0, previousTokens.Count - options.Overlap)).ToArray(); - var overlapText = tokenizer.Decode(overlapTokens); - newChunks.Add(string.Concat(overlapText, rawChunks[index])); - } + var currentChunkText = text[rawChunkRanges[i]]; + var skipCount = Math.Max(0, previousTokens.Count - options.Overlap); + var overlapText = tokenizer.Decode(previousTokens.Skip(skipCount)); - rawChunks = newChunks; + rawChunks.Add(string.Concat(overlapText, currentChunkText)); + previousTokens = tokenizer.EncodeToIds(currentChunkText.AsSpan()); + } + } + else + { + // No overlap - simple range to string conversion + rawChunks = new List(rawChunkRanges.Count); + foreach (var range in rawChunkRanges) + { + rawChunks.Add(text[range]); + } } return MergeImageChunks(rawChunks); @@ -101,11 +120,12 @@ private List RecursiveSplit( _ => throw new ArgumentOutOfRangeException(nameof(separatorType), separatorType, null) }; - return GenerateChunks(fragments, maxChunk1Size, maxChunkNSize, separatorType, tokenizer, ref firstChunkDone); + return GenerateChunks(text, fragments, maxChunk1Size, maxChunkNSize, separatorType, tokenizer, ref firstChunkDone); } private List GenerateChunks( - List fragments, + string text, + List fragments, int maxChunk1Size, int maxChunkNSize, SeparatorType separatorType, @@ -119,10 +139,11 @@ private List GenerateChunks( var chunks = new List(); var builder = new ChunkBuilder(); + var textSpan = text.AsSpan(); foreach (var fragment in fragments) { - builder.NextSentence.Append(fragment.Content); + builder.NextSentence.Append(textSpan[fragment.Range]); if (!fragment.IsSeparator) { @@ -210,11 +231,18 @@ private List GenerateChunks( return chunks; } - private static List SplitToFragments(string text, SeparatorTrie? 
separators) + internal static List SplitToFragments(string text, SeparatorTrie? separators) { if (separators is null) { - return text.Select(ch => new Fragment(ch.ToString(), true)).ToList(); + // Character-level fallback + var charFragments = new List(text.Length); + for (var i = 0; i < text.Length; i++) + { + charFragments.Add(new FragmentRange(i..(i + 1), true)); + } + + return charFragments; } if (text.Length == 0 || separators.Length == 0) @@ -222,41 +250,58 @@ private static List SplitToFragments(string text, SeparatorTrie? separ return []; } - var fragments = new List(); - var fragmentBuilder = new StringBuilder(); + var span = text.AsSpan(); + var fragments = new List(); + var contentStart = 0; var index = 0; - while (index < text.Length) + while (index < span.Length) { - var found = separators.MatchLongest(text, index); + // Use SearchValues for vectorized skip to next potential separator + var remaining = span[index..]; + var nextPotential = remaining.IndexOfAny(separators.FirstChars); + + if (nextPotential < 0) + { + // No more potential separators - rest is content + break; + } + + index += nextPotential; + + // Try to match a separator at this position + var matchLength = separators.MatchLongest(span, index); - if (found is not null) + if (matchLength > 0) { - if (fragmentBuilder.Length > 0) + // Emit content fragment if any + if (index > contentStart) { - fragments.Add(new Fragment(fragmentBuilder.ToString(), false)); - fragmentBuilder.Clear(); + fragments.Add(new FragmentRange(contentStart..index, false)); } - fragments.Add(new Fragment(found, true)); - index += found.Length; + // Emit separator fragment + fragments.Add(new FragmentRange(index..(index + matchLength), true)); + index += matchLength; + contentStart = index; } else { - fragmentBuilder.Append(text[index]); + // Not a real separator, continue index++; } } - if (fragmentBuilder.Length > 0) + // Emit remaining content + if (contentStart < text.Length) { - fragments.Add(new 
Fragment(fragmentBuilder.ToString(), false)); + fragments.Add(new FragmentRange(contentStart..text.Length, false)); } return fragments; } - private static List MergeImageChunks(List chunks) + internal static List MergeImageChunks(List chunks) { if (chunks.Count <= 1) { @@ -302,6 +347,245 @@ private static void AddChunk(List chunks, string chunk, ref bool firstCh } } + internal static string NormalizeNewlines(string input) => input.ReplaceLineEndings("\n"); + + private List RecursiveSplitRanges( + string originalText, + Range workingRange, + int maxChunk1Size, + int maxChunkNSize, + SeparatorType separatorType, + Tokenizer tokenizer, + ref bool firstChunkDone) + { + var offset = workingRange.Start.Value; + var length = workingRange.End.Value - offset; + + if (length == 0 || originalText.AsSpan()[workingRange].IsWhiteSpace()) + { + return []; + } + + var maxChunkSize = firstChunkDone ? maxChunkNSize : maxChunk1Size; + + // CountTokens with Span - ZERO allocation! + if (tokenizer.CountTokens(originalText.AsSpan()[workingRange]) <= maxChunkSize) + { + return [workingRange]; + } + + // Get fragments for the working range - this still needs a substring for SplitToFragments + // but the fragments returned use ranges relative to this substring + var workingText = originalText[workingRange]; + var fragments = separatorType switch + { + SeparatorType.ExplicitSeparator => SplitToFragments(workingText, ExplicitSeparators), + SeparatorType.PotentialSeparator => SplitToFragments(workingText, PotentialSeparators), + SeparatorType.WeakSeparator1 => SplitToFragments(workingText, WeakSeparators1), + SeparatorType.WeakSeparator2 => SplitToFragments(workingText, WeakSeparators2), + SeparatorType.WeakSeparator3 => SplitToFragments(workingText, WeakSeparators3), + SeparatorType.NotASeparator => SplitToFragments(workingText, null), + _ => throw new ArgumentOutOfRangeException(nameof(separatorType), separatorType, null) + }; + + return GenerateChunksRanges(originalText, offset, 
workingText, fragments, maxChunk1Size, maxChunkNSize, separatorType, tokenizer, ref firstChunkDone); + } + + + private List GenerateChunksRanges( + string originalText, + int textOffset, + string workingText, + List fragments, + int maxChunk1Size, + int maxChunkNSize, + SeparatorType separatorType, + Tokenizer tokenizer, + ref bool firstChunkDone) + { + if (fragments.Count == 0) + { + return []; + } + + var chunks = new List(); + var workingSpan = workingText.AsSpan(); + + // Track positions as indices (in original text coordinates) + var chunkStart = textOffset; + var chunkEnd = textOffset; + var sentenceStart = textOffset; + + foreach (var fragment in fragments) + { + var fragLocalEnd = fragment.Range.End.Value; + var fragGlobalEnd = textOffset + fragLocalEnd; + + if (!fragment.IsSeparator) + { + continue; + } + + // We have accumulated a sentence from sentenceStart to fragGlobalEnd + var sentenceLocalStart = sentenceStart - textOffset; + var sentenceLocalEnd = fragLocalEnd; + + var sentenceSpan = workingSpan[sentenceLocalStart..sentenceLocalEnd]; + var sentenceTokens = tokenizer.CountTokens(sentenceSpan); + + var maxChunkSize = firstChunkDone ? 
maxChunkNSize : maxChunk1Size; + var chunkEmpty = chunkEnd <= chunkStart; + var sentenceTooLong = sentenceTokens > maxChunkSize; + + if (chunkEmpty && !sentenceTooLong) + { + // First sentence in chunk, it fits + chunkEnd = fragGlobalEnd; + sentenceStart = fragGlobalEnd; + continue; + } + + if (chunkEmpty && sentenceTooLong) + { + // Sentence alone is too long - recursively split it + var sentenceRange = sentenceStart..fragGlobalEnd; + var moreRanges = RecursiveSplitRanges( + originalText, sentenceRange, + maxChunk1Size, maxChunkNSize, + NextSeparatorType(separatorType), tokenizer, ref firstChunkDone); + + if (moreRanges.Count > 0) + { + // Add all but last as finalized chunks + for (var i = 0; i < moreRanges.Count - 1; i++) + { + chunks.Add(moreRanges[i]); + } + + // Keep last range as the new chunk start + var lastRange = moreRanges[^1]; + chunkStart = lastRange.Start.Value; + chunkEnd = lastRange.End.Value; + } + + sentenceStart = fragGlobalEnd; + continue; + } + + // Check if chunk + sentence fits together + var chunkLocalStart = chunkStart - textOffset; + var combinedSpan = workingSpan[chunkLocalStart..sentenceLocalEnd]; + if (!sentenceTooLong && tokenizer.CountTokens(combinedSpan) <= maxChunkSize) + { + // Combined fits - extend chunk + chunkEnd = fragGlobalEnd; + sentenceStart = fragGlobalEnd; + continue; + } + + // Combined doesn't fit - finalize current chunk + if (chunkEnd > chunkStart) + { + chunks.Add(chunkStart..chunkEnd); + firstChunkDone = true; + } + + if (sentenceTooLong) + { + // Recursively split the sentence + var sentenceRange = sentenceStart..fragGlobalEnd; + var moreRanges = RecursiveSplitRanges( + originalText, sentenceRange, + maxChunk1Size, maxChunkNSize, + NextSeparatorType(separatorType), tokenizer, ref firstChunkDone); + + if (moreRanges.Count > 0) + { + for (var i = 0; i < moreRanges.Count - 1; i++) + { + chunks.Add(moreRanges[i]); + } + + var lastRange = moreRanges[^1]; + chunkStart = lastRange.Start.Value; + chunkEnd = 
lastRange.End.Value; + } + else + { + chunkStart = fragGlobalEnd; + chunkEnd = fragGlobalEnd; + } + } + else + { + // Start new chunk with this sentence + chunkStart = sentenceStart; + chunkEnd = fragGlobalEnd; + } + + sentenceStart = fragGlobalEnd; + } + + // Handle remaining content + var lastFragEnd = textOffset + fragments[^1].Range.End.Value; + + if (chunkEnd > chunkStart || sentenceStart < lastFragEnd) + { + // Combine any remaining chunk content with leftover sentence + var remainingStart = Math.Min(chunkStart, sentenceStart); + var remainingEnd = Math.Max(chunkEnd, lastFragEnd); + + if (remainingEnd > remainingStart) + { + var remainingLocalStart = remainingStart - textOffset; + var remainingLocalEnd = remainingEnd - textOffset; + var remainingSpan = workingSpan[remainingLocalStart..remainingLocalEnd]; + var remainingMax = firstChunkDone ? maxChunkNSize : maxChunk1Size; + + if (tokenizer.CountTokens(remainingSpan) <= remainingMax) + { + if (!remainingSpan.IsWhiteSpace()) + { + chunks.Add(remainingStart..remainingEnd); + firstChunkDone = true; + } + } + else + { + // Need to split remaining content + if (chunkEnd > chunkStart && !workingSpan[(chunkStart - textOffset)..(chunkEnd - textOffset)].IsWhiteSpace()) + { + chunks.Add(chunkStart..chunkEnd); + firstChunkDone = true; + } + + if (sentenceStart < lastFragEnd) + { + var leftoverSpan = workingSpan[(sentenceStart - textOffset)..(lastFragEnd - textOffset)]; + if (!leftoverSpan.IsWhiteSpace()) + { + if (tokenizer.CountTokens(leftoverSpan) <= remainingMax) + { + chunks.Add(sentenceStart..lastFragEnd); + firstChunkDone = true; + } + else + { + var moreRanges = RecursiveSplitRanges( + originalText, sentenceStart..lastFragEnd, + maxChunk1Size, maxChunkNSize, + NextSeparatorType(separatorType), tokenizer, ref firstChunkDone); + chunks.AddRange(moreRanges); + } + } + } + } + } + } + + return chunks; + } + private static SeparatorType NextSeparatorType(SeparatorType separatorType) => separatorType switch { 
SeparatorType.ExplicitSeparator => SeparatorType.PotentialSeparator, @@ -312,11 +596,9 @@ private static void AddChunk(List chunks, string chunk, ref bool firstCh _ => SeparatorType.NotASeparator }; - private static string NormalizeNewlines(string input) => input.Replace("\r\n", "\n", StringComparison.Ordinal).Replace('\r', '\n'); - private const int MinChunkSize = 5; - private static readonly SeparatorTrie ExplicitSeparators = new([ + internal static readonly SeparatorTrie ExplicitSeparators = new([ ".\n\n", "!\n\n", "!!\n\n", @@ -333,7 +615,7 @@ private static void AddChunk(List chunks, string chunk, ref bool firstCh "\n---" ]); - private static readonly SeparatorTrie PotentialSeparators = new([ + internal static readonly SeparatorTrie PotentialSeparators = new([ "\n> ", "\n>- ", "\n>* ", @@ -350,7 +632,7 @@ private static void AddChunk(List chunks, string chunk, ref bool firstCh "\n```" ]); - private static readonly SeparatorTrie WeakSeparators1 = new([ + internal static readonly SeparatorTrie WeakSeparators1 = new([ "![", "[", "| ", @@ -359,7 +641,7 @@ private static void AddChunk(List chunks, string chunk, ref bool firstCh "\n: " ]); - private static readonly SeparatorTrie WeakSeparators2 = new([ + internal static readonly SeparatorTrie WeakSeparators2 = new([ ". ", ".\t", ".\n", "? ", "?\t", "?\n", "! 
", "!\t", "!\n", @@ -371,7 +653,7 @@ private static void AddChunk(List chunks, string chunk, ref bool firstCh ".", "?", "!", "⁉", "⁈", "⁇", "…" ]); - private static readonly SeparatorTrie WeakSeparators3 = new([ + internal static readonly SeparatorTrie WeakSeparators3 = new([ "; ", ";\t", ";\n", ";", "} ", "}\t", "}\n", "}", ") ", ")\t", ")\n", @@ -392,7 +674,7 @@ private enum SeparatorType NotASeparator } - private sealed record Fragment(string Content, bool IsSeparator); + internal readonly record struct FragmentRange(Range Range, bool IsSeparator); private sealed class ChunkBuilder { @@ -406,12 +688,15 @@ private sealed class MarkdownChunkerOptions public int Overlap { get; init; } } - private sealed class SeparatorTrie + internal sealed class SeparatorTrie { private readonly Dictionary> _lookup = new(); + private readonly SearchValues _firstChars; public int Length { get; } + public SearchValues FirstChars => _firstChars; + public SeparatorTrie(IEnumerable separators) { var list = separators.Where(static s => !string.IsNullOrEmpty(s)).ToList(); @@ -433,18 +718,24 @@ public SeparatorTrie(IEnumerable separators) { bucket.Sort((a, b) => b.Length.CompareTo(a.Length)); } + + // Create SearchValues from first chars for vectorized lookup + _firstChars = SearchValues.Create([.. _lookup.Keys]); } - public string? MatchLongest(string text, int index) + /// + /// Returns the length of the longest matching separator at the given index, or 0 if no match. 
+ /// + public int MatchLongest(ReadOnlySpan text, int index) { if (index >= text.Length) { - return null; + return 0; } if (!_lookup.TryGetValue(text[index], out var candidates)) { - return null; + return 0; } foreach (var candidate in candidates) @@ -454,13 +745,13 @@ public SeparatorTrie(IEnumerable separators) continue; } - if (text.AsSpan(index, candidate.Length).SequenceEqual(candidate)) + if (text.Slice(index, candidate.Length).SequenceEqual(candidate)) { - return candidate; + return candidate.Length; } } - return null; + return 0; } } } diff --git a/tests/ManagedCode.GraphRag.Tests/Chunking/MarkdownTextChunkerTests.cs b/tests/ManagedCode.GraphRag.Tests/Chunking/MarkdownTextChunkerTests.cs index 750562bb1..3b0c1ff73 100644 --- a/tests/ManagedCode.GraphRag.Tests/Chunking/MarkdownTextChunkerTests.cs +++ b/tests/ManagedCode.GraphRag.Tests/Chunking/MarkdownTextChunkerTests.cs @@ -9,6 +9,8 @@ public sealed class MarkdownTextChunkerTests { private readonly MarkdownTextChunker _chunker = new(); + #region Chunk Tests (Original) + [Fact] public void Chunk_SplitsMarkdownBlocks() { @@ -79,4 +81,802 @@ public void Chunk_RespectsOverlapBetweenChunks() var secondText = chunks[1].Text.TrimStart(); Assert.StartsWith(overlapText, secondText, StringComparison.Ordinal); } + + #endregion + + #region SplitToFragments Tests + + [Fact] + public void SplitToFragments_EmptyString_ReturnsEmpty() + { + var result = MarkdownTextChunker.SplitToFragments("", MarkdownTextChunker.ExplicitSeparators); + Assert.Empty(result); + } + + [Fact] + public void SplitToFragments_NullSeparators_ReturnsCharacterLevelFragments() + { + var text = "abc"; + var result = MarkdownTextChunker.SplitToFragments(text, null); + + Assert.Equal(3, result.Count); + Assert.All(result, f => Assert.True(f.IsSeparator)); + Assert.Equal("a", text[result[0].Range]); + Assert.Equal("b", text[result[1].Range]); + Assert.Equal("c", text[result[2].Range]); + } + + [Fact] + public void 
SplitToFragments_NoSeparatorsInText_ReturnsSingleContentFragment() + { + var text = "hello world"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.ExplicitSeparators); + + Assert.Single(result); + Assert.False(result[0].IsSeparator); + Assert.Equal("hello world", text[result[0].Range]); + } + + [Fact] + public void SplitToFragments_SeparatorAtStart_FirstFragmentIsSeparator() + { + var text = "\n\nhello"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.ExplicitSeparators); + + Assert.Equal(2, result.Count); + Assert.True(result[0].IsSeparator); + Assert.Equal("\n\n", text[result[0].Range]); + Assert.False(result[1].IsSeparator); + Assert.Equal("hello", text[result[1].Range]); + } + + [Fact] + public void SplitToFragments_SeparatorAtEnd_LastFragmentIsSeparator() + { + var text = "hello.\n\n"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.ExplicitSeparators); + + Assert.Equal(2, result.Count); + Assert.False(result[0].IsSeparator); + Assert.Equal("hello", text[result[0].Range]); + Assert.True(result[1].IsSeparator); + Assert.Equal(".\n\n", text[result[1].Range]); + } + + [Fact] + public void SplitToFragments_AdjacentSeparators_CreatesSeparateFragments() + { + var text = "\n\n\n\n"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.ExplicitSeparators); + + Assert.Equal(2, result.Count); + Assert.All(result, f => Assert.True(f.IsSeparator)); + Assert.Equal("\n\n", text[result[0].Range]); + Assert.Equal("\n\n", text[result[1].Range]); + } + + [Fact] + public void SplitToFragments_LongestMatchPrecedence_MatchesDotNewlineNewlineOverDot() + { + // Using WeakSeparators2 which has both "." 
and ".\n\n" isn't there, but ExplicitSeparators has ".\n\n" + var text = "hello.\n\nworld"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.ExplicitSeparators); + + Assert.Equal(3, result.Count); + Assert.Equal("hello", text[result[0].Range]); + Assert.Equal(".\n\n", text[result[1].Range]); + Assert.True(result[1].IsSeparator); + Assert.Equal("world", text[result[2].Range]); + } + + [Fact] + public void SplitToFragments_LongestMatchPrecedence_MatchesTripleQuestionOverDouble() + { + var text = "what???really"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.WeakSeparators2); + + // Should match "???" not "??" + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "???"); + } + + [Fact] + public void SplitToFragments_UnicodeSeparators_HandlesInterrobangCorrectly() + { + var text = "what⁉ really"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.WeakSeparators2); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "⁉ "); + } + + [Fact] + public void SplitToFragments_UnicodeSeparators_HandlesEllipsisCorrectly() + { + var text = "wait… more"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.WeakSeparators2); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "… "); + } + + #endregion + + #region ExplicitSeparators Additional Tests + + [Fact] + public void SplitToFragments_HeaderSeparators_MatchesNewlineHash() + { + var text = "content\n# Header1\n## Header2"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.ExplicitSeparators); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n#"); + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n##"); + } + + [Fact] + public void SplitToFragments_HeaderSeparators_MatchesAllLevels() + { + var text = "a\n#b\n##c\n###d\n####e\n#####f"; + var result = MarkdownTextChunker.SplitToFragments(text, 
MarkdownTextChunker.ExplicitSeparators); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n#"); + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n##"); + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n###"); + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n####"); + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n#####"); + } + + [Fact] + public void SplitToFragments_HorizontalRule_MatchesNewlineDashes() + { + var text = "above\n---below"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.ExplicitSeparators); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n---"); + } + + [Fact] + public void SplitToFragments_ExclamationNewlines_MatchesAllVariants() + { + var text1 = "wow!\n\nmore"; + var text2 = "wow!!\n\nmore"; + var text3 = "wow!!!\n\nmore"; + var result1 = MarkdownTextChunker.SplitToFragments(text1, MarkdownTextChunker.ExplicitSeparators); + var result2 = MarkdownTextChunker.SplitToFragments(text2, MarkdownTextChunker.ExplicitSeparators); + var result3 = MarkdownTextChunker.SplitToFragments(text3, MarkdownTextChunker.ExplicitSeparators); + + Assert.Contains(result1, f => f.IsSeparator && text1[f.Range] == "!\n\n"); + Assert.Contains(result2, f => f.IsSeparator && text2[f.Range] == "!!\n\n"); + Assert.Contains(result3, f => f.IsSeparator && text3[f.Range] == "!!!\n\n"); + } + + [Fact] + public void SplitToFragments_QuestionNewlines_MatchesAllVariants() + { + var text1 = "what?\n\nmore"; + var text2 = "what??\n\nmore"; + var text3 = "what???\n\nmore"; + var result1 = MarkdownTextChunker.SplitToFragments(text1, MarkdownTextChunker.ExplicitSeparators); + var result2 = MarkdownTextChunker.SplitToFragments(text2, MarkdownTextChunker.ExplicitSeparators); + var result3 = MarkdownTextChunker.SplitToFragments(text3, MarkdownTextChunker.ExplicitSeparators); + + Assert.Contains(result1, f => f.IsSeparator && text1[f.Range] == 
"?\n\n"); + Assert.Contains(result2, f => f.IsSeparator && text2[f.Range] == "??\n\n"); + Assert.Contains(result3, f => f.IsSeparator && text3[f.Range] == "???\n\n"); + } + + #endregion + + #region PotentialSeparators Tests + + [Fact] + public void SplitToFragments_Blockquote_MatchesNewlineGreaterThan() + { + var text = "text\n> quoted"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.PotentialSeparators); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n> "); + } + + [Fact] + public void SplitToFragments_BlockquoteList_MatchesVariants() + { + var text1 = "text\n>- item"; + var text2 = "text\n>* item"; + var result1 = MarkdownTextChunker.SplitToFragments(text1, MarkdownTextChunker.PotentialSeparators); + var result2 = MarkdownTextChunker.SplitToFragments(text2, MarkdownTextChunker.PotentialSeparators); + + Assert.Contains(result1, f => f.IsSeparator && text1[f.Range] == "\n>- "); + Assert.Contains(result2, f => f.IsSeparator && text2[f.Range] == "\n>* "); + } + + [Fact] + public void SplitToFragments_NumberedList_MatchesDigitDotSpace() + { + var text = "intro\n1. first\n2. second\n10. tenth"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.PotentialSeparators); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n1. "); + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n2. "); + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n10. 
"); + } + + [Fact] + public void SplitToFragments_CodeFence_MatchesTripleBacktick() + { + var text = "text\n```code"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.PotentialSeparators); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n```"); + } + + #endregion + + #region WeakSeparators1 Tests + + [Fact] + public void SplitToFragments_TablePipe_MatchesPipeVariants() + { + var text1 = "col1| col2"; + var text2 = "data |\nmore"; + var text3 = "---|-|\ndata"; + var result1 = MarkdownTextChunker.SplitToFragments(text1, MarkdownTextChunker.WeakSeparators1); + var result2 = MarkdownTextChunker.SplitToFragments(text2, MarkdownTextChunker.WeakSeparators1); + var result3 = MarkdownTextChunker.SplitToFragments(text3, MarkdownTextChunker.WeakSeparators1); + + Assert.Contains(result1, f => f.IsSeparator && text1[f.Range] == "| "); + Assert.Contains(result2, f => f.IsSeparator && text2[f.Range] == " |\n"); + Assert.Contains(result3, f => f.IsSeparator && text3[f.Range] == "-|\n"); + } + + [Fact] + public void SplitToFragments_LinkBracket_MatchesOpenBracket() + { + var text = "click [here](url)"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.WeakSeparators1); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "["); + } + + [Fact] + public void SplitToFragments_ImageBracket_MatchesExclamationBracket() + { + var text = "see ![alt](img.png)"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.WeakSeparators1); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "!["); + } + + [Fact] + public void SplitToFragments_DefinitionList_MatchesNewlineColon() + { + var text = "term\n: definition"; + var result = MarkdownTextChunker.SplitToFragments(text, MarkdownTextChunker.WeakSeparators1); + + Assert.Contains(result, f => f.IsSeparator && text[f.Range] == "\n: "); + } + + #endregion + + #region WeakSeparators2 Additional Tests + + [Fact] + public 
void SplitToFragments_TabSeparators_MatchesPunctuationTab()
+    {
+        var text1 = "end.\tnext";
+        var text2 = "what?\tnext";
+        var text3 = "wow!\tnext";
+        var result1 = MarkdownTextChunker.SplitToFragments(text1, MarkdownTextChunker.WeakSeparators2);
+        var result2 = MarkdownTextChunker.SplitToFragments(text2, MarkdownTextChunker.WeakSeparators2);
+        var result3 = MarkdownTextChunker.SplitToFragments(text3, MarkdownTextChunker.WeakSeparators2);
+
+        Assert.Contains(result1, f => f.IsSeparator && text1[f.Range] == ".\t");
+        Assert.Contains(result2, f => f.IsSeparator && text2[f.Range] == "?\t");
+        Assert.Contains(result3, f => f.IsSeparator && text3[f.Range] == "!\t");
+    }
+
+    // Expects '.', '?' and '!' followed by a newline to be reported as one two-character separator.
+    [Fact]
+    public void SplitToFragments_NewlineSeparators_MatchesPunctuationNewline()
+    {
+        const string periodText = "end.\nnext";
+        const string questionText = "what?\nnext";
+        const string exclamationText = "wow!\nnext";
+        var periodFragments = MarkdownTextChunker.SplitToFragments(periodText, MarkdownTextChunker.WeakSeparators2);
+        var questionFragments = MarkdownTextChunker.SplitToFragments(questionText, MarkdownTextChunker.WeakSeparators2);
+        var exclamationFragments = MarkdownTextChunker.SplitToFragments(exclamationText, MarkdownTextChunker.WeakSeparators2);
+
+        Assert.Contains(periodFragments, fragment => fragment.IsSeparator && periodText[fragment.Range] == ".\n");
+        Assert.Contains(questionFragments, fragment => fragment.IsSeparator && questionText[fragment.Range] == "?\n");
+        Assert.Contains(exclamationFragments, fragment => fragment.IsSeparator && exclamationText[fragment.Range] == "!\n");
+    }
+
+    // Expects a run of four identical marks to come back as a single four-character separator.
+    [Fact]
+    public void SplitToFragments_QuadPunctuation_MatchesFourChars()
+    {
+        const string exclamationText = "what!!!!really";
+        const string questionText = "what????really";
+        var exclamationFragments = MarkdownTextChunker.SplitToFragments(exclamationText, MarkdownTextChunker.WeakSeparators2);
+        var questionFragments = MarkdownTextChunker.SplitToFragments(questionText, MarkdownTextChunker.WeakSeparators2);
+
+        Assert.Contains(exclamationFragments, fragment => fragment.IsSeparator && exclamationText[fragment.Range] == "!!!!");
+        Assert.Contains(questionFragments, fragment => fragment.IsSeparator && questionText[fragment.Range] == "????");
+    }
+
+    // Expects alternating '?' / '!' sequences of length two and three to be matched whole.
+    [Fact]
+    public void SplitToFragments_MixedPunctuation_MatchesInterrobangVariants()
+    {
+        const string questionFirstTriple = "what?!?really";
+        const string bangFirstTriple = "what!?!really";
+        const string bangFirstPair = "what!?really";
+        const string questionFirstPair = "what?!really";
+        var questionFirstTripleFragments = MarkdownTextChunker.SplitToFragments(questionFirstTriple, MarkdownTextChunker.WeakSeparators2);
+        var bangFirstTripleFragments = MarkdownTextChunker.SplitToFragments(bangFirstTriple, MarkdownTextChunker.WeakSeparators2);
+        var bangFirstPairFragments = MarkdownTextChunker.SplitToFragments(bangFirstPair, MarkdownTextChunker.WeakSeparators2);
+        var questionFirstPairFragments = MarkdownTextChunker.SplitToFragments(questionFirstPair, MarkdownTextChunker.WeakSeparators2);
+
+        Assert.Contains(questionFirstTripleFragments, fragment => fragment.IsSeparator && questionFirstTriple[fragment.Range] == "?!?");
+        Assert.Contains(bangFirstTripleFragments, fragment => fragment.IsSeparator && bangFirstTriple[fragment.Range] == "!?!");
+        Assert.Contains(bangFirstPairFragments, fragment => fragment.IsSeparator && bangFirstPair[fragment.Range] == "!?");
+        Assert.Contains(questionFirstPairFragments, fragment => fragment.IsSeparator && questionFirstPair[fragment.Range] == "?!");
+    }
+
+    // Expects runs of two, three and four dots to each be matched as one separator of that length.
+    [Fact]
+    public void SplitToFragments_Ellipsis_MatchesDotVariants()
+    {
+        const string fourDots = "wait....more";
+        const string threeDots = "wait...more";
+        const string twoDots = "wait..more";
+        var fourDotFragments = MarkdownTextChunker.SplitToFragments(fourDots, MarkdownTextChunker.WeakSeparators2);
+        var threeDotFragments = MarkdownTextChunker.SplitToFragments(threeDots, MarkdownTextChunker.WeakSeparators2);
+        var twoDotFragments = MarkdownTextChunker.SplitToFragments(twoDots, MarkdownTextChunker.WeakSeparators2);
+
+        Assert.Contains(fourDotFragments, fragment => fragment.IsSeparator && fourDots[fragment.Range] == "....");
+        Assert.Contains(threeDotFragments, fragment => fragment.IsSeparator && threeDots[fragment.Range] == "...");
+        Assert.Contains(twoDotFragments, fragment => fragment.IsSeparator && twoDots[fragment.Range] == "..");
+    }
+
+    // Expects a lone '.', '?' or '!' to be matched as a separator on its own.
+    [Fact]
+    public void SplitToFragments_SinglePunctuation_MatchesWithoutSpace()
+    {
+        // Each input ends on the punctuation mark, so no whitespace follows it.
+        const string periodText = "end.";
+        const string questionText = "end?";
+        const string exclamationText = "end!";
+        var periodFragments = MarkdownTextChunker.SplitToFragments(periodText, MarkdownTextChunker.WeakSeparators2);
+        var questionFragments = MarkdownTextChunker.SplitToFragments(questionText, MarkdownTextChunker.WeakSeparators2);
+        var exclamationFragments = MarkdownTextChunker.SplitToFragments(exclamationText, MarkdownTextChunker.WeakSeparators2);
+
+        Assert.Contains(periodFragments, fragment => fragment.IsSeparator && periodText[fragment.Range] == ".");
+        Assert.Contains(questionFragments, fragment => fragment.IsSeparator && questionText[fragment.Range] == "?");
+        Assert.Contains(exclamationFragments, fragment => fragment.IsSeparator && exclamationText[fragment.Range] == "!");
+    }
+
+    // Expects exactly two question marks to be reported as the separator "??".
+    [Fact]
+    public void SplitToFragments_DoubleQuestion_MatchesBeforeTriple()
+    {
+        const string input = "what??next";
+        var fragments = MarkdownTextChunker.SplitToFragments(input, MarkdownTextChunker.WeakSeparators2);
+
+        Assert.Contains(fragments, fragment => fragment.IsSeparator && input[fragment.Range] == "??");
+    }
+
+    // Expects exactly two exclamation marks to be reported as the separator "!!".
+    [Fact]
+    public void SplitToFragments_DoubleExclamation_MatchesBeforeTriple()
+    {
+        const string input = "wow!!next";
+        var fragments = MarkdownTextChunker.SplitToFragments(input, MarkdownTextChunker.WeakSeparators2);
+
+        Assert.Contains(fragments, fragment => fragment.IsSeparator && input[fragment.Range] == "!!");
+    }
+
+    #endregion
+
+    #region WeakSeparators3 Tests
+
+    // Expects ';' to match alone and together with a trailing space, tab or newline.
+    [Fact]
+    public void SplitToFragments_Semicolon_MatchesAllVariants()
+    {
+        const string spaceText = "a; b";
+        const string tabText = "a;\tb";
+        const string newlineText = "a;\nb";
+        const string bareText = "a;b";
+        var spaceFragments = MarkdownTextChunker.SplitToFragments(spaceText, MarkdownTextChunker.WeakSeparators3);
+        var tabFragments = MarkdownTextChunker.SplitToFragments(tabText, MarkdownTextChunker.WeakSeparators3);
+        var newlineFragments = MarkdownTextChunker.SplitToFragments(newlineText, MarkdownTextChunker.WeakSeparators3);
+        var bareFragments = MarkdownTextChunker.SplitToFragments(bareText, MarkdownTextChunker.WeakSeparators3);
+
+        Assert.Contains(spaceFragments, fragment => fragment.IsSeparator && spaceText[fragment.Range] == "; ");
+        Assert.Contains(tabFragments, fragment => fragment.IsSeparator && tabText[fragment.Range] == ";\t");
+        Assert.Contains(newlineFragments, fragment => fragment.IsSeparator && newlineText[fragment.Range] == ";\n");
+        Assert.Contains(bareFragments, fragment => fragment.IsSeparator && bareText[fragment.Range] == ";");
+    }
+
+    // Expects '}' to match alone and together with a trailing space, tab or newline.
+    [Fact]
+    public void SplitToFragments_CloseBrace_MatchesAllVariants()
+    {
+        const string spaceText = "a} b";
+        const string tabText = "a}\tb";
+        const string newlineText = "a}\nb";
+        const string bareText = "a}b";
+        var spaceFragments = MarkdownTextChunker.SplitToFragments(spaceText, MarkdownTextChunker.WeakSeparators3);
+        var tabFragments = MarkdownTextChunker.SplitToFragments(tabText, MarkdownTextChunker.WeakSeparators3);
+        var newlineFragments = MarkdownTextChunker.SplitToFragments(newlineText, MarkdownTextChunker.WeakSeparators3);
+        var bareFragments = MarkdownTextChunker.SplitToFragments(bareText, MarkdownTextChunker.WeakSeparators3);
+
+        Assert.Contains(spaceFragments, fragment => fragment.IsSeparator && spaceText[fragment.Range] == "} ");
+        Assert.Contains(tabFragments, fragment => fragment.IsSeparator && tabText[fragment.Range] == "}\t");
+        Assert.Contains(newlineFragments, fragment => fragment.IsSeparator && newlineText[fragment.Range] == "}\n");
+        Assert.Contains(bareFragments, fragment => fragment.IsSeparator && bareText[fragment.Range] == "}");
+    }
+
+    // Expects ')' to match alone and together with a trailing space, tab or newline.
+    [Fact]
+    public void SplitToFragments_CloseParen_MatchesAllVariants()
+    {
+        const string spaceText = "(a) b";
+        const string tabText = "(a)\tb";
+        const string newlineText = "(a)\nb";
+        const string bareText = "(a)b";
+        var spaceFragments = MarkdownTextChunker.SplitToFragments(spaceText, MarkdownTextChunker.WeakSeparators3);
+        var tabFragments = MarkdownTextChunker.SplitToFragments(tabText, MarkdownTextChunker.WeakSeparators3);
+        var newlineFragments = MarkdownTextChunker.SplitToFragments(newlineText, MarkdownTextChunker.WeakSeparators3);
+        var bareFragments = MarkdownTextChunker.SplitToFragments(bareText, MarkdownTextChunker.WeakSeparators3);
+
+        Assert.Contains(spaceFragments, fragment => fragment.IsSeparator && spaceText[fragment.Range] == ") ");
+        Assert.Contains(tabFragments, fragment => fragment.IsSeparator && tabText[fragment.Range] == ")\t");
+        Assert.Contains(newlineFragments, fragment => fragment.IsSeparator && newlineText[fragment.Range] == ")\n");
+        Assert.Contains(bareFragments, fragment => fragment.IsSeparator && bareText[fragment.Range] == ")");
+    }
+
+    // Expects ']' to match alone and together with a trailing space, tab or newline.
+    [Fact]
+    public void SplitToFragments_CloseBracket_MatchesAllVariants()
+    {
+        const string spaceText = "[a] b";
+        const string tabText = "[a]\tb";
+        const string newlineText = "[a]\nb";
+        const string bareText = "[a]b";
+        var spaceFragments = MarkdownTextChunker.SplitToFragments(spaceText, MarkdownTextChunker.WeakSeparators3);
+        var tabFragments = MarkdownTextChunker.SplitToFragments(tabText, MarkdownTextChunker.WeakSeparators3);
+        var newlineFragments = MarkdownTextChunker.SplitToFragments(newlineText, MarkdownTextChunker.WeakSeparators3);
+        var bareFragments = MarkdownTextChunker.SplitToFragments(bareText, MarkdownTextChunker.WeakSeparators3);
+
+        Assert.Contains(spaceFragments, fragment => fragment.IsSeparator && spaceText[fragment.Range] == "] ");
+        Assert.Contains(tabFragments, fragment => fragment.IsSeparator && tabText[fragment.Range] == "]\t");
+        Assert.Contains(newlineFragments, fragment => fragment.IsSeparator && newlineText[fragment.Range] == "]\n");
+        Assert.Contains(bareFragments, fragment => fragment.IsSeparator && bareText[fragment.Range] == "]");
+    }
+
+    // Expects ':' to match both with and without a trailing space.
+    [Fact]
+    public void SplitToFragments_Colon_MatchesAllVariants()
+    {
+        const string spaceText = "key: value";
+        const string bareText = "key:value";
+        var spaceFragments = MarkdownTextChunker.SplitToFragments(spaceText, MarkdownTextChunker.WeakSeparators3);
+        var bareFragments = MarkdownTextChunker.SplitToFragments(bareText, MarkdownTextChunker.WeakSeparators3);
+
+        Assert.Contains(spaceFragments, fragment => fragment.IsSeparator && spaceText[fragment.Range] == ": ");
+        Assert.Contains(bareFragments, fragment => fragment.IsSeparator && bareText[fragment.Range] == ":");
+    }
+
+    // Expects ',' to match both with and without a trailing space.
+    [Fact]
+    public void SplitToFragments_Comma_MatchesAllVariants()
+    {
+        const string spaceText = "a, b";
+        const string bareText = "a,b";
+        var spaceFragments = MarkdownTextChunker.SplitToFragments(spaceText, MarkdownTextChunker.WeakSeparators3);
+        var bareFragments = MarkdownTextChunker.SplitToFragments(bareText, MarkdownTextChunker.WeakSeparators3);
+
+        Assert.Contains(spaceFragments, fragment => fragment.IsSeparator && spaceText[fragment.Range] == ", ");
+        Assert.Contains(bareFragments, fragment => fragment.IsSeparator && bareText[fragment.Range] == ",");
+    }
+
+    // Expects a lone newline to count as a separator when splitting with WeakSeparators3.
+    [Fact]
+    public void SplitToFragments_SingleNewline_MatchesInWeakSeparators3()
+    {
+        const string input = "line1\nline2";
+        var fragments = MarkdownTextChunker.SplitToFragments(input, MarkdownTextChunker.WeakSeparators3);
+
+        Assert.Contains(fragments, fragment => fragment.IsSeparator && input[fragment.Range] == "\n");
+    }
+
+    #endregion
+
+    #region Edge Cases and Optimized Equivalence Tests
+
+    // Expects content / separator / content, with the period and both newlines forming one separator.
+    [Fact]
+    public void SplitToFragments_MultipleSeparatorTypes_ProcessesInOrder()
+    {
+        // The input combines a period with a blank-line break.
+        const string input = "hello.\n\nworld";
+        var fragments = MarkdownTextChunker.SplitToFragments(input, MarkdownTextChunker.ExplicitSeparators);
+
+        Assert.Equal(3, fragments.Count);
+
+        var leading = fragments[0];
+        Assert.Equal("hello", input[leading.Range]);
+        Assert.False(leading.IsSeparator);
+
+        var separator = fragments[1];
+        Assert.Equal(".\n\n", input[separator.Range]);
+        Assert.True(separator.IsSeparator);
+
+        var trailing = fragments[2];
+        Assert.Equal("world", input[trailing.Range]);
+        Assert.False(trailing.IsSeparator);
+    }
+
+    // Expects an input made only of newlines to yield nothing but "\n\n" separator fragments.
+    [Fact]
+    public void SplitToFragments_SixConsecutiveNewlines_CreatesSeparateFragments()
+    {
+        const string input = "\n\n\n\n\n\n";
+        var fragments = MarkdownTextChunker.SplitToFragments(input, MarkdownTextChunker.ExplicitSeparators);
+
+        // Six newlines are expected to pair up into three "\n\n" separators.
+        Assert.Equal(3, fragments.Count);
+        Assert.All(fragments, fragment => Assert.True(fragment.IsSeparator));
+        Assert.All(fragments, fragment => Assert.Equal("\n\n", input[fragment.Range]));
+    }
+
+    // Expects an input that is exactly one separator to produce that single separator fragment.
+    [Fact]
+    public void SplitToFragments_SeparatorOnly_ReturnsOnlySeparators()
+    {
+        const string input = ".\n\n";
+        var fragments = MarkdownTextChunker.SplitToFragments(input, MarkdownTextChunker.ExplicitSeparators);
+
+        var onlyFragment = Assert.Single(fragments);
+        Assert.True(onlyFragment.IsSeparator);
+        Assert.Equal(".\n\n", input[onlyFragment.Range]);
+    }
+
+    #endregion
+
+    #region NormalizeNewlines Tests
+
+    // Expects a Windows line ending to collapse to a single "\n".
+    [Fact]
+    public void NormalizeNewlines_CRLF_ConvertsToLF()
+    {
+        Assert.Equal("hello\nworld", MarkdownTextChunker.NormalizeNewlines("hello\r\nworld"));
+    }
+
+    // Expects a bare carriage return to be replaced by "\n".
+    [Fact]
+    public void NormalizeNewlines_CROnly_ConvertsToLF()
+    {
+        Assert.Equal("hello\nworld", MarkdownTextChunker.NormalizeNewlines("hello\rworld"));
+    }
+
+    // Expects "\r\n", "\r" and "\n" in the same input to all end up as "\n".
+    [Fact]
+    public void NormalizeNewlines_MixedLineEndings_AllConvertToLF()
+    {
+        Assert.Equal("a\nb\nc\nd", MarkdownTextChunker.NormalizeNewlines("a\r\nb\rc\nd"));
+    }
+
+    // Expects text that already uses "\n" to come back unchanged.
+    [Fact]
+    public void NormalizeNewlines_AlreadyNormalized_Unchanged()
+    {
+        Assert.Equal("hello\nworld", MarkdownTextChunker.NormalizeNewlines("hello\nworld"));
+    }
+
+    // Expects text without any line ending to come back unchanged.
+    [Fact]
+    public void NormalizeNewlines_NoLineEndings_Unchanged()
+    {
+        Assert.Equal("hello world", MarkdownTextChunker.NormalizeNewlines("hello world"));
+    }
+
+    #endregion
+
+    #region MergeImageChunks Tests
+
+    [Fact]
+    public void MergeImageChunks_NoImages_Unchanged()
+    {
+        // Expects a list without image chunks to come back with the same items in the same order.
+        // The expectation is a separate list so the check still means something if
+        // MergeImageChunks works in place and returns the list it was given.
+        var expected = new List<string> { "first", "second", "third" };
+        var chunks = new List<string> { "first", "second", "third" };
+        var result = MarkdownTextChunker.MergeImageChunks(chunks);
+
+        Assert.Equal(3, result.Count);
+        Assert.Equal(expected, result);
+    }
+
+    // Expects a leading image chunk to stay a chunk of its own.
+    [Fact]
+    public void MergeImageChunks_ImageAtStart_NotMerged()
+    {
+        var chunks = new List<string> { "![image](path)", "second" };
+        var result = MarkdownTextChunker.MergeImageChunks(chunks);
+
+        Assert.Equal(2, result.Count);
+        Assert.Equal("![image](path)", result[0]);
+    }
+
+    // Expects an image chunk that follows text to be folded into that text chunk.
+    [Fact]
+    public void MergeImageChunks_ImageAfterContent_MergedWithPrevious()
+    {
+        var chunks = new List<string> { "some text", "![image](path)" };
+        var result = MarkdownTextChunker.MergeImageChunks(chunks);
+
+        var merged = Assert.Single(result);
+        Assert.Contains("some text", merged);
+        Assert.Contains("![image](path)", merged);
+    }
+
+    // Expects several image chunks in a row to all end up in the text chunk before them.
+    [Fact]
+    public void MergeImageChunks_ConsecutiveImages_AllMergedIntoPreceding()
+    {
+        var chunks = new List<string> { "content", "![img1](p1)", "![img2](p2)" };
+        var result = MarkdownTextChunker.MergeImageChunks(chunks);
+
+        var merged = Assert.Single(result);
+        Assert.Contains("content", merged);
+        Assert.Contains("![img1](p1)", merged);
+        Assert.Contains("![img2](p2)", merged);
+    }
+
+    // Expects a one-element list to come back with that element untouched.
+    [Fact]
+    public void MergeImageChunks_SingleChunk_Unchanged()
+    {
+        var chunks = new List<string> { "single chunk" };
+        var result = MarkdownTextChunker.MergeImageChunks(chunks);
+
+        var only = Assert.Single(result);
+        Assert.Equal("single chunk", only);
+    }
+
+    #endregion
+
+    #region Overlap Handling Tests
+
+    // Chunks a repetitive text with Overlap = 0 and checks that several non-empty chunks come back.
+    [Fact]
+    public void Chunk_ZeroOverlap_NoOverlapProcessing()
+    {
+        var text = string.Join(' ', Enumerable.Repeat("This sentence repeats for testing purposes.", 20));
+        var slices = new[] { new ChunkSlice("doc-1", text) };
+
+        var config = new ChunkingConfig
+        {
+            Size = 50,
+            Overlap = 0,
+            EncodingModel = TokenizerDefaults.DefaultEncoding
+        };
+
+        var chunks = _chunker.Chunk(slices, config);
+
+        Assert.True(chunks.Count > 1);
+
+        // NOTE(review): nothing below proves that the chunks share no text; it only
+        // shows that the first two chunks tokenize to at least one token each. The
+        // input repeats one sentence, so neighbouring chunks can legitimately start
+        // and end with the same tokens. A real "no overlap" assertion needs a
+        // non-repeating input or access to the chunk ranges - confirm what is intended.
+        var tokenizer = TokenizerRegistry.GetTokenizer(config.EncodingModel);
+        var firstTokens = tokenizer.EncodeToIds(chunks[0].Text);
+        var secondTokens = tokenizer.EncodeToIds(chunks[1].Text);
+
+        Assert.NotEmpty(firstTokens);
+        Assert.NotEmpty(secondTokens);
+    }
+
+    // Expects the second chunk to begin with the text decoded from the last Overlap tokens of the first.
+    [Fact]
+    public void Chunk_OverlapSmallerThanChunk_AddsOverlapPrefix()
+    {
+        var text = string.Join(' ', Enumerable.Repeat("Word", 100));
+        var slices = new[] { new ChunkSlice("doc-1", text) };
+
+        var config = new ChunkingConfig
+        {
+            Size = 30,
+            Overlap = 10,
+            EncodingModel = TokenizerDefaults.DefaultEncoding
+        };
+
+        var chunks = _chunker.Chunk(slices, config);
+
+        Assert.True(chunks.Count > 1);
+
+        // Rebuild the expected prefix from the tail of the first chunk's tokens.
+        var tokenizer = TokenizerRegistry.GetTokenizer(config.EncodingModel);
+        var firstTokens = tokenizer.EncodeToIds(chunks[0].Text);
+        var overlapTokens = firstTokens.Skip(Math.Max(0, firstTokens.Count - config.Overlap)).ToArray();
+        var overlapText = tokenizer.Decode(overlapTokens);
+
+        // Both sides are trimmed so surrounding whitespace does not decide the outcome.
+        Assert.StartsWith(overlapText.Trim(), chunks[1].Text.Trim(), StringComparison.Ordinal);
+    }
+
+    // Expects a text far below the size limit to come back as one unmodified chunk, even with Overlap set.
+    [Fact]
+    public void Chunk_SingleChunk_NoOverlapNeeded()
+    {
+        var text = "Short text";
+        var slices = new[] { new ChunkSlice("doc-1", text) };
+
+        var config = new ChunkingConfig
+        {
+            Size = 100,
+            Overlap = 20,
+            EncodingModel = TokenizerDefaults.DefaultEncoding
+        };
+
+        var chunks = _chunker.Chunk(slices, config);
+
+        var only = Assert.Single(chunks);
+        Assert.Equal("Short text", only.Text);
+    }
+
+    #endregion
+
+    #region GenerateChunks Token Boundary Tests
+
+    // Expects two short sentences to fit in a single chunk when Size is 100.
+    [Fact]
+    public void Chunk_SmallDocument_FitsInSingleChunk()
+    {
+        var text = "Hello world. This is a test.";
+        var slices = new[] { new ChunkSlice("doc-1", text) };
+
+        var config = new ChunkingConfig
+        {
+            Size = 100,
+            Overlap = 0,
+            EncodingModel = TokenizerDefaults.DefaultEncoding
+        };
+
+        var chunks = _chunker.Chunk(slices, config);
+
+        Assert.Single(chunks);
+    }
+
+    // Expects twenty paragraphs to be split into several chunks that stay near the configured size.
+    [Fact]
+    public void Chunk_LargeDocument_SplitsIntoMultipleChunks()
+    {
+        var text = string.Join("\n\n", Enumerable.Repeat("This is a paragraph with enough content to exceed token limits when repeated multiple times.", 20));
+        var slices = new[] { new ChunkSlice("doc-1", text) };
+
+        var config = new ChunkingConfig
+        {
+            Size = 50,
+            Overlap = 0,
+            EncodingModel = TokenizerDefaults.DefaultEncoding
+        };
+
+        var chunks = _chunker.Chunk(slices, config);
+
+        Assert.True(chunks.Count > 1);
+
+        // Overlap is 0 here, so the only slack needed is for boundary handling;
+        // the limit is deliberately loose at 1.5x the configured size.
+        var maxTokens = config.Size * 1.5;
+        var tokenizer = TokenizerRegistry.GetTokenizer(config.EncodingModel);
+        foreach (var chunk in chunks)
+        {
+            var tokenCount = tokenizer.CountTokens(chunk.Text);
+            Assert.True(tokenCount <= maxTokens, $"Chunk has {tokenCount} tokens, expected <= {maxTokens}");
+        }
+    }
+
+    // Chunks a document with three heading levels and checks that heading markers survive.
+    [Fact]
+    public void Chunk_DocumentWithHeaders_SplitsAtHeaderBoundaries()
+    {
+        var text = "# Header 1\n\nContent for header 1.\n\n## Header 2\n\nContent for header 2.\n\n### Header 3\n\nContent for header 3.";
+        var slices = new[] { new ChunkSlice("doc-1", text) };
+
+        var config = new ChunkingConfig
+        {
+            Size = 20,
+            Overlap = 0,
+            EncodingModel = TokenizerDefaults.DefaultEncoding
+        };
+
+        var chunks = _chunker.Chunk(slices, config);
+
+        // NOTE(review): this only proves that some chunk still contains a '#'.
+        // It does not check where the splits fall, despite the test name.
+        Assert.NotEmpty(chunks);
+        Assert.Contains(chunks, c => c.Text.Contains('#'));
+    }
+
+    // Expects the last paragraph to appear somewhere in the produced chunks.
+    [Fact]
+    public void Chunk_TrailingContent_Captured()
+    {
+        var text = "First paragraph.\n\nSecond paragraph.\n\nTrailing content.";
+        var slices = new[] { new ChunkSlice("doc-1", text) };
+
+        var config = new ChunkingConfig
+        {
+            Size = 200,
+            Overlap = 0,
+            EncodingModel = TokenizerDefaults.DefaultEncoding
+        };
+
+        var chunks = _chunker.Chunk(slices, config);
+
+        // Search across all chunks so the result does not depend on how the text was split.
+        var allText = string.Concat(chunks.Select(c => c.Text));
+        Assert.Contains("Trailing content", allText);
+    }
+
+    #endregion
 }