From 375b2b50322ee4d5dc391905ab53c6aa553ad8ef Mon Sep 17 00:00:00 2001 From: Matthew Zhou Date: Mon, 7 Jul 2025 13:22:50 -0700 Subject: [PATCH] feat: Further split structured documents (#3199) --- .../chunker/llama_index_chunker.py | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/letta/services/file_processor/chunker/llama_index_chunker.py b/letta/services/file_processor/chunker/llama_index_chunker.py index d2c3604a..ab6ea4a6 100644 --- a/letta/services/file_processor/chunker/llama_index_chunker.py +++ b/letta/services/file_processor/chunker/llama_index_chunker.py @@ -12,6 +12,10 @@ logger = get_logger(__name__) class LlamaIndexChunker: """LlamaIndex-based text chunking with automatic splitter selection""" + # Conservative default chunk sizes for fallback scenarios + DEFAULT_CONSERVATIVE_CHUNK_SIZE = 384 + DEFAULT_CONSERVATIVE_CHUNK_OVERLAP = 25 + def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50, file_type: Optional[str] = None): self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap @@ -99,10 +103,24 @@ class LlamaIndexChunker: elif hasattr(self.parser, "get_nodes_from_documents"): # Some parsers need Document objects from llama_index.core import Document + from llama_index.core.node_parser import SentenceSplitter document = Document(text=text_content) nodes = self.parser.get_nodes_from_documents([document]) - return [node.text for node in nodes] + + # Further split nodes that exceed chunk_size using SentenceSplitter + final_chunks = [] + sentence_splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap) + + for node in nodes: + if len(node.text) > self.chunk_size: + # Split oversized nodes with sentence splitter + sub_chunks = sentence_splitter.split_text(node.text) + final_chunks.extend(sub_chunks) + else: + final_chunks.append(node.text) + + return final_chunks else: # Fallback - try to call the parser directly return self.parser(text_content) @@ -128,12 +146,14 @@ class LlamaIndexChunker: raise e # Raise the original error @trace_method - def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = 384, chunk_overlap: int = 25) -> List[str]: + def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = None, chunk_overlap: int = None) -> List[str]: """Chunk text using default SentenceSplitter regardless of file type with conservative defaults""" try: from llama_index.core.node_parser import SentenceSplitter # Use provided defaults or fallback to conservative values + chunk_size = chunk_size if chunk_size is not None else self.DEFAULT_CONSERVATIVE_CHUNK_SIZE + chunk_overlap = chunk_overlap if chunk_overlap is not None else self.DEFAULT_CONSERVATIVE_CHUNK_OVERLAP default_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) # Handle different input types