Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,5 @@ cython_debug/
outputs

evaluation/data/temporal_locomo
test_add_pipeline.py
test_file_pipeline.py
3 changes: 3 additions & 0 deletions src/memos/api/handlers/component_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
)
from memos.memories.textual.simple_preference import SimplePreferenceTextMemory
from memos.memories.textual.simple_tree import SimpleTreeTextMemory
from memos.memories.textual.tree_text_memory.organize.history_manager import MemoryHistoryManager
from memos.memories.textual.tree_text_memory.organize.manager import MemoryManager
from memos.memories.textual.tree_text_memory.retrieve.retrieve_utils import FastTokenizer

Expand Down Expand Up @@ -190,6 +191,7 @@ def init_server() -> dict[str, Any]:
)
embedder = EmbedderFactory.from_config(embedder_config)
nli_client = NLIClient(base_url=nli_client_config["base_url"])
memory_history_manager = MemoryHistoryManager(nli_client=nli_client, graph_db=graph_db)
# Pass graph_db to mem_reader for recall operations (deduplication, conflict detection)
mem_reader = MemReaderFactory.from_config(mem_reader_config, graph_db=graph_db)
reranker = RerankerFactory.from_config(reranker_config)
Expand Down Expand Up @@ -393,4 +395,5 @@ def init_server() -> dict[str, Any]:
"redis_client": redis_client,
"deepsearch_agent": deepsearch_agent,
"nli_client": nli_client,
"memory_history_manager": memory_history_manager,
}
40 changes: 40 additions & 0 deletions src/memos/chunkers/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from abc import ABC, abstractmethod

from memos.configs.chunker import BaseChunkerConfig
import re
Comment on lines 1 to +4
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Standard library imports (like import re) should be placed before third-party imports according to PEP 8. Move import re to the top of the import section, before the from abc import ABC, abstractmethod line.

Suggested change
from abc import ABC, abstractmethod
from memos.configs.chunker import BaseChunkerConfig
import re
import re
from abc import ABC, abstractmethod
from memos.configs.chunker import BaseChunkerConfig

Copilot uses AI. Check for mistakes.


class Chunk:
Expand All @@ -22,3 +23,42 @@ def __init__(self, config: BaseChunkerConfig):
@abstractmethod
def chunk(self, text: str) -> list[Chunk]:
"""Chunk the given text into smaller chunks."""

def protect_urls(self, text: str) -> tuple[str, dict[str, str]]:
    """
    Shield URLs in text from being split apart during chunking.

    Each URL is swapped for a sequential ``__URL_N__`` placeholder; the
    mapping is returned so the caller can undo the swap after chunking.

    Args:
        text: Text to process

    Returns:
        tuple: (Text with URLs replaced by placeholders, URL mapping dictionary)
    """
    url_regex = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
    mapping: dict[str, str] = {}

    def _swap(match: "re.Match") -> str:
        # Placeholder index grows with the map, so tokens stay unique.
        token = f"__URL_{len(mapping)}__"
        mapping[token] = match.group(0)
        return token

    return url_regex.sub(_swap, text), mapping

def restore_urls(self, text: str, url_map: dict[str, str]) -> str:
    """
    Swap URL placeholders back to the original URLs.

    Args:
        text: Text with URL placeholders
        url_map: URL mapping dictionary from protect_urls

    Returns:
        str: Text with URLs restored
    """
    # Plain sequential replacement; placeholder tokens are unique by
    # construction, so order does not matter.
    for token, original_url in url_map.items():
        text = text.replace(token, original_url)
    return text
Comment on lines +27 to +64
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new URL protection/restoration functionality in the chunkers lacks test coverage. Consider adding tests to verify that URLs are properly protected during chunking and correctly restored afterwards, especially for edge cases like URLs at chunk boundaries or multiple URLs in a single chunk.

Copilot uses AI. Check for mistakes.
4 changes: 3 additions & 1 deletion src/memos/chunkers/charactertext_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def __init__(

def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
    """Chunk the given text into smaller character-based chunks.

    URLs are replaced with placeholders before splitting so the splitter
    cannot cut one in half, then restored in every resulting chunk.
    """
    protected_text, url_map = self.protect_urls(text)
    chunks = self.chunker.split_text(protected_text)
    chunks = [self.restore_urls(chunk, url_map) for chunk in chunks]
    logger.debug(f"Generated {len(chunks)} chunks from input text")
    return chunks
96 changes: 94 additions & 2 deletions src/memos/chunkers/markdown_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from memos.dependency import require_python_package
from memos.log import get_logger

import re
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Standard library imports (like import re) should be placed before third-party imports according to PEP 8. Move import re to the top of the import section, before the from memos.configs.chunker import MarkdownChunkerConfig line.

Copilot uses AI. Check for mistakes.

from .base import BaseChunker, Chunk


Expand All @@ -22,13 +24,15 @@ def __init__(
chunk_size: int = 1000,
chunk_overlap: int = 200,
recursive: bool = False,
auto_fix_headers: bool = True,
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trailing whitespace after the parameter definition. Remove the trailing space after True,.

Suggested change
auto_fix_headers: bool = True,
auto_fix_headers: bool = True,

Copilot uses AI. Check for mistakes.
):
from langchain_text_splitters import (
MarkdownHeaderTextSplitter,
RecursiveCharacterTextSplitter,
)

self.config = config
self.auto_fix_headers = auto_fix_headers
self.chunker = MarkdownHeaderTextSplitter(
headers_to_split_on=config.headers_to_split_on
if config
Expand All @@ -46,17 +50,105 @@ def __init__(

def chunk(self, text: str, **kwargs) -> list[str] | list[Chunk]:
    """Split markdown text into chunks along header boundaries.

    URLs are replaced with placeholders before splitting (so they cannot
    be broken across chunks) and restored afterwards. When
    ``auto_fix_headers`` is enabled, a malformed (mostly level-1) header
    hierarchy is repaired before splitting.
    """
    # Protect URLs first
    protected_text, url_map = self.protect_urls(text)

    # Auto-detect and fix malformed header hierarchy if enabled
    if self.auto_fix_headers and self._detect_malformed_headers(protected_text):
        logger.info("detected malformed header hierarchy, attempting to fix...")
        protected_text = self._fix_header_hierarchy(protected_text)
        logger.info("Header hierarchy fix completed")

    md_header_splits = self.chunker.split_text(protected_text)
    if self.chunker_recursive:
        md_header_splits = self.chunker_recursive.split_documents(md_header_splits)

    chunks = []
    for doc in md_header_splits:
        try:
            # Prepend the header metadata so each chunk keeps its context.
            chunk = " ".join(list(doc.metadata.values())) + "\n" + doc.page_content
            chunks.append(self.restore_urls(chunk, url_map))
        except Exception as e:
            # Best-effort fallback: keep the bare content, URLs still restored.
            logger.warning(f"warning chunking document: {e}")
            chunks.append(self.restore_urls(doc.page_content, url_map))
    # Debug level only -- chunk contents can be large and noisy at info level.
    logger.debug(f"Generated chunks: {chunks[:5]}")
    logger.debug(f"Generated {len(chunks)} chunks from input text")
    return chunks

def _detect_malformed_headers(self, text: str) -> bool:
    """Detect if markdown has improper header hierarchy usage.

    Heuristic: the hierarchy is considered malformed when level-1 headers
    dominate -- more than 90% of headers when more than 5 exist, or every
    header when there are 5 or fewer.

    Args:
        text: Markdown text to inspect.

    Returns:
        bool: True when the header hierarchy looks malformed.
    """
    # Extract the level (count of leading '#') of every valid header line.
    header_levels = []
    pattern = re.compile(r'^#{1,6}\s+.+')
    for line in text.split('\n'):
        stripped_line = line.strip()
        if pattern.match(stripped_line):
            hash_match = re.match(r'^(#+)', stripped_line)
            if hash_match:
                header_levels.append(len(hash_match.group(1)))

    total_headers = len(header_levels)
    if total_headers == 0:
        logger.debug("No valid headers detected, skipping check")
        return False

    # Calculate level-1 header ratio
    level1_count = sum(1 for level in header_levels if level == 1)

    # Determine if malformed: >90% are level-1 when total > 5,
    # OR all headers are level-1 when total <= 5.
    if total_headers > 5:
        level1_ratio = level1_count / total_headers
        if level1_ratio > 0.9:
            logger.warning(
                f"Detected header hierarchy issue: {level1_count}/{total_headers} "
                f"({level1_ratio:.1%}) of headers are level 1"
            )
            return True
    elif level1_count == total_headers:
        # total_headers <= 5 is already implied by the failed `if` above;
        # the previous explicit re-check was always true and has been removed.
        logger.warning(
            f"Detected header hierarchy issue: all {total_headers} headers are level 1"
        )
        return True
    return False

def _fix_header_hierarchy(self, text: str) -> str:
    """
    Fix markdown header hierarchy by adjusting levels.

    Strategy:
    1. Keep the first header unchanged as level-1 parent
    2. Increment all subsequent headers by 1 level (max level 6)

    NOTE(review): for a fully flat document (# A, # B, # C) this makes B
    and C children of A rather than siblings -- confirm that grouping is
    the intended behavior.

    Args:
        text: Markdown text whose headers should be re-leveled.

    Returns:
        str: Text with adjusted header lines; non-header lines untouched.
    """
    header_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
    fixed_lines = []
    seen_first_header = False

    for line in text.split('\n'):
        # Only well-formed header lines are rewritten; everything else
        # (including invalid '#' lines) passes through unchanged.
        match = header_pattern.match(line.strip())
        if not match:
            fixed_lines.append(line)
            continue

        hashes, title = match.groups()
        level = len(hashes)
        if not seen_first_header:
            # First valid header: keep original level unchanged.
            seen_first_header = True
            fixed_lines.append(f"{hashes} {title}")
            logger.debug(f"Keep first header at level {level}: {title[:50]}...")
        else:
            # Subsequent headers: demote by one level, capped at 6.
            new_level = min(level + 1, 6)
            fixed_lines.append(f"{'#' * new_level} {title}")
            logger.debug(f"Adjust header level: {level} -> {new_level}: {title[:50]}...")

    # Join with newlines to preserve original formatting
    return '\n'.join(fixed_lines)
4 changes: 3 additions & 1 deletion src/memos/chunkers/sentence_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,13 @@ def __init__(self, config: SentenceChunkerConfig):

def chunk(self, text: str) -> list[str] | list[Chunk]:
"""Chunk the given text into smaller chunks based on sentences."""
chonkie_chunks = self.chunker.chunk(text)
protected_text, url_map = self.protect_urls(text)
chonkie_chunks = self.chunker.chunk(protected_text)

chunks = []
for c in chonkie_chunks:
chunk = Chunk(text=c.text, token_count=c.token_count, sentences=c.sentences)
chunk = self.restore_urls(chunk.text, url_map)
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The restored URL text is being assigned back to the chunk variable (which is a Chunk object), but then the Chunk object itself is appended to the chunks list. This will cause the chunk to be a string instead of a Chunk object. The restored text should be used to create a new Chunk object or the Chunk object's text attribute should be updated. Consider: chunk.text = self.restore_urls(chunk.text, url_map) or chunks.append(self.restore_urls(chunk.text, url_map)).

Suggested change
chunk = self.restore_urls(chunk.text, url_map)
chunk.text = self.restore_urls(chunk.text, url_map)

Copilot uses AI. Check for mistakes.
chunks.append(chunk)

logger.debug(f"Generated {len(chunks)} chunks from input text")
Expand Down
15 changes: 9 additions & 6 deletions src/memos/chunkers/simple_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) ->
Returns:
List of text chunks
"""
if not text or len(text) <= chunk_size:
return [text] if text.strip() else []
protected_text, url_map = self.protect_urls(text)
Copy link

Copilot AI Feb 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The SimpleTextSplitter class uses methods protect_urls and restore_urls but doesn't inherit from BaseChunker. This will cause an AttributeError when these methods are called. The class should inherit from BaseChunker or implement these methods directly.

Copilot uses AI. Check for mistakes.

if not protected_text or len(protected_text) <= chunk_size:
chunks = [protected_text] if protected_text.strip() else []
return [self.restore_urls(chunk, url_map) for chunk in chunks]

chunks = []
start = 0
text_len = len(text)
text_len = len(protected_text)

while start < text_len:
# Calculate end position
Expand All @@ -35,16 +38,16 @@ def _simple_split_text(self, text: str, chunk_size: int, chunk_overlap: int) ->
if end < text_len:
# Try to break at newline, sentence end, or space
for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]:
last_sep = text.rfind(separator, start, end)
last_sep = protected_text.rfind(separator, start, end)
if last_sep != -1:
end = last_sep + len(separator)
break

chunk = text[start:end].strip()
chunk = protected_text[start:end].strip()
if chunk:
chunks.append(chunk)

# Move start position with overlap
start = max(start + 1, end - chunk_overlap)

return chunks
return [self.restore_urls(chunk, url_map) for chunk in chunks]
Loading
Loading