-
Notifications
You must be signed in to change notification settings - Fork 479
fix: Optimize chunk strategy #1038
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev-20260209-v2.0.6
Are you sure you want to change the base?
Changes from all commits
7a1e121
861ce14
b2ce876
bc5647e
7f723c3
1e80f0c
edeb180
db143a6
e695eb2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -216,3 +216,5 @@ cython_debug/ | |
| outputs | ||
|
|
||
| evaluation/data/temporal_locomo | ||
| test_add_pipeline.py | ||
| test_file_pipeline.py | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| from abc import ABC, abstractmethod | ||
|
|
||
| from memos.configs.chunker import BaseChunkerConfig | ||
| import re | ||
|
|
||
|
|
||
| class Chunk: | ||
|
|
@@ -22,3 +23,42 @@ def __init__(self, config: BaseChunkerConfig): | |
| @abstractmethod | ||
| def chunk(self, text: str) -> list[Chunk]: | ||
| """Chunk the given text into smaller chunks.""" | ||
|
|
||
| def protect_urls(self, text: str) -> tuple[str, dict[str, str]]: | ||
| """ | ||
| Protect URLs in text from being split during chunking. | ||
|
|
||
| Args: | ||
| text: Text to process | ||
|
|
||
| Returns: | ||
| tuple: (Text with URLs replaced by placeholders, URL mapping dictionary) | ||
| """ | ||
| url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+' | ||
| url_map = {} | ||
|
|
||
| def replace_url(match): | ||
| url = match.group(0) | ||
| placeholder = f"__URL_{len(url_map)}__" | ||
| url_map[placeholder] = url | ||
| return placeholder | ||
|
|
||
| protected_text = re.sub(url_pattern, replace_url, text) | ||
| return protected_text, url_map | ||
|
|
||
| def restore_urls(self, text: str, url_map: dict[str, str]) -> str: | ||
| """ | ||
| Restore protected URLs in text back to their original form. | ||
|
|
||
| Args: | ||
| text: Text with URL placeholders | ||
| url_map: URL mapping dictionary from protect_urls | ||
|
|
||
| Returns: | ||
| str: Text with URLs restored | ||
| """ | ||
| restored_text = text | ||
| for placeholder, url in url_map.items(): | ||
| restored_text = restored_text.replace(placeholder, url) | ||
|
|
||
| return restored_text | ||
|
Comment on lines
+27
to
+64
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -2,6 +2,8 @@ | |||||||||||||
| from memos.dependency import require_python_package | ||||||||||||||
| from memos.log import get_logger | ||||||||||||||
|
|
||||||||||||||
| import re | ||||||||||||||
|
||||||||||||||
|
|
||||||||||||||
| from .base import BaseChunker, Chunk | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
|
|
@@ -22,13 +24,15 @@ def __init__( | |||||||||||||
| chunk_size: int = 1000, | ||||||||||||||
| chunk_overlap: int = 200, | ||||||||||||||
| recursive: bool = False, | ||||||||||||||
| auto_fix_headers: bool = True, | ||||||||||||||
|
||||||||||||||
| auto_fix_headers: bool = True, | |
| auto_fix_headers: bool = True, |
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trailing whitespace after the comment. Remove the trailing space after # Extract all valid markdown header lines.
| # Extract all valid markdown header lines | |
| # Extract all valid markdown header lines |
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trailing whitespace after the comment. Remove the trailing space after pattern = re.compile(r'^#{1,6}\s+.+').
| # Extract all valid markdown header lines | |
| header_levels = [] | |
| pattern = re.compile(r'^#{1,6}\s+.+') | |
| # Extract all valid markdown header lines | |
| header_levels = [] | |
| pattern = re.compile(r'^#{1,6}\s+.+') |
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Test is always true, because of this condition.
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trailing whitespace after the comment. Remove the trailing space after first_valid_header = False.
| first_valid_header = False | |
| first_valid_header = False |
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The header hierarchy fix strategy may produce unexpected results. When all headers are level 1 (e.g., # A, # B, # C), incrementing all subsequent headers creates # A, ## B, ## C, making B and C appear as children of A rather than siblings. A more appropriate fix might be to keep all headers at level 1, or to analyze the context to determine proper hierarchy. Consider documenting this behavior more clearly or revising the fix strategy to better handle flat header structures.
Copilot
AI
Feb 6, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trailing whitespace after the else statement. Remove the trailing spaces after else:.
| else: | |
| else: |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -43,11 +43,13 @@ def __init__(self, config: SentenceChunkerConfig): | |||||
|
|
||||||
| def chunk(self, text: str) -> list[str] | list[Chunk]: | ||||||
| """Chunk the given text into smaller chunks based on sentences.""" | ||||||
| chonkie_chunks = self.chunker.chunk(text) | ||||||
| protected_text, url_map = self.protect_urls(text) | ||||||
| chonkie_chunks = self.chunker.chunk(protected_text) | ||||||
|
|
||||||
| chunks = [] | ||||||
| for c in chonkie_chunks: | ||||||
| chunk = Chunk(text=c.text, token_count=c.token_count, sentences=c.sentences) | ||||||
| chunk = self.restore_urls(chunk.text, url_map) | ||||||
|
||||||
| chunk = self.restore_urls(chunk.text, url_map) | |
| chunk.text = self.restore_urls(chunk.text, url_map) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,12 +20,15 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) -> | |
| Returns: | ||
| List of text chunks | ||
| """ | ||
| if not text or len(text) <= chunk_size: | ||
| return [text] if text.strip() else [] | ||
| protected_text, url_map = self.protect_urls(text) | ||
|
||
|
|
||
| if not protected_text or len(protected_text) <= chunk_size: | ||
| chunks = [protected_text] if protected_text.strip() else [] | ||
| return [self.restore_urls(chunk, url_map) for chunk in chunks] | ||
|
|
||
| chunks = [] | ||
| start = 0 | ||
| text_len = len(text) | ||
| text_len = len(protected_text) | ||
|
|
||
| while start < text_len: | ||
| # Calculate end position | ||
|
|
@@ -35,16 +38,16 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) -> | |
| if end < text_len: | ||
| # Try to break at newline, sentence end, or space | ||
| for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]: | ||
| last_sep = text.rfind(separator, start, end) | ||
| last_sep = protected_text.rfind(separator, start, end) | ||
| if last_sep != -1: | ||
| end = last_sep + len(separator) | ||
| break | ||
|
|
||
| chunk = text[start:end].strip() | ||
| chunk = protected_text[start:end].strip() | ||
| if chunk: | ||
| chunks.append(chunk) | ||
|
|
||
| # Move start position with overlap | ||
| start = max(start + 1, end - chunk_overlap) | ||
|
|
||
| return chunks | ||
| return [self.restore_urls(chunk, url_map) for chunk in chunks] | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Standard library imports (like
import re) should be placed before third-party imports according to PEP 8. Moveimport reto the top of the import section, before thefrom abc import ABC, abstractmethodline.