feat: Further split structured documents (#3199)

2025-07-07 13:22:50 -07:00
parent 9cb286e170
commit 375b2b5032
1 changed files with 22 additions and 2 deletions
--- a/letta/services/file_processor/chunker/llama_index_chunker.py
+++ b/letta/services/file_processor/chunker/llama_index_chunker.py
@@ -12,6 +12,10 @@ logger = get_logger(__name__)
 class LlamaIndexChunker:
    """LlamaIndex-based text chunking with automatic splitter selection"""

+    # Conservative default chunk sizes for fallback scenarios
+    DEFAULT_CONSERVATIVE_CHUNK_SIZE = 384
+    DEFAULT_CONSERVATIVE_CHUNK_OVERLAP = 25
+
    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50, file_type: Optional[str] = None):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
@@ -99,10 +103,24 @@ class LlamaIndexChunker:
            elif hasattr(self.parser, "get_nodes_from_documents"):
                # Some parsers need Document objects
                from llama_index.core import Document
+                from llama_index.core.node_parser import SentenceSplitter

                document = Document(text=text_content)
                nodes = self.parser.get_nodes_from_documents([document])
-                return [node.text for node in nodes]
+
+                # Further split nodes that exceed chunk_size using SentenceSplitter
+                final_chunks = []
+                sentence_splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+
+                for node in nodes:
+                    if len(node.text) > self.chunk_size:
+                        # Split oversized nodes with sentence splitter
+                        sub_chunks = sentence_splitter.split_text(node.text)
+                        final_chunks.extend(sub_chunks)
+                    else:
+                        final_chunks.append(node.text)
+
+                return final_chunks
            else:
                # Fallback - try to call the parser directly
                return self.parser(text_content)
@@ -128,12 +146,14 @@ class LlamaIndexChunker:
                raise e  # Raise the original error

    @trace_method
-    def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = 384, chunk_overlap: int = 25) -> List[str]:
+    def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = None, chunk_overlap: int = None) -> List[str]:
        """Chunk text using default SentenceSplitter regardless of file type with conservative defaults"""
        try:
            from llama_index.core.node_parser import SentenceSplitter

            # Use provided defaults or fallback to conservative values
+            chunk_size = chunk_size if chunk_size is not None else self.DEFAULT_CONSERVATIVE_CHUNK_SIZE
+            chunk_overlap = chunk_overlap if chunk_overlap is not None else self.DEFAULT_CONSERVATIVE_CHUNK_OVERLAP
            default_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

            # Handle different input types