From 375b2b50322ee4d5dc391905ab53c6aa553ad8ef Mon Sep 17 00:00:00 2001
From: Matthew Zhou <mattzh1314@gmail.com>
Date: Mon, 7 Jul 2025 13:22:50 -0700
Subject: [PATCH] feat: Further split structured documents (#3199)

---
 .../chunker/llama_index_chunker.py            | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/letta/services/file_processor/chunker/llama_index_chunker.py b/letta/services/file_processor/chunker/llama_index_chunker.py
index d2c3604a..ab6ea4a6 100644
--- a/letta/services/file_processor/chunker/llama_index_chunker.py
+++ b/letta/services/file_processor/chunker/llama_index_chunker.py
@@ -12,6 +12,10 @@ logger = get_logger(__name__)
 class LlamaIndexChunker:
     """LlamaIndex-based text chunking with automatic splitter selection"""
 
+    # Conservative chunk size and overlap that default_chunk_text falls back to when the caller passes None
+    DEFAULT_CONSERVATIVE_CHUNK_SIZE = 384
+    DEFAULT_CONSERVATIVE_CHUNK_OVERLAP = 25
+
     def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50, file_type: Optional[str] = None):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
@@ -99,10 +103,24 @@ class LlamaIndexChunker:
             elif hasattr(self.parser, "get_nodes_from_documents"):
                 # Some parsers need Document objects
                 from llama_index.core import Document
+                from llama_index.core.node_parser import SentenceSplitter
 
                 document = Document(text=text_content)
                 nodes = self.parser.get_nodes_from_documents([document])
-                return [node.text for node in nodes]
+
+                # Further split nodes whose text is longer than chunk_size characters (len-based check) using SentenceSplitter
+                final_chunks = []
+                sentence_splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+
+                for node in nodes:
+                    if len(node.text) > self.chunk_size:
+                        # Split oversized nodes with sentence splitter
+                        sub_chunks = sentence_splitter.split_text(node.text)
+                        final_chunks.extend(sub_chunks)
+                    else:
+                        final_chunks.append(node.text)
+
+                return final_chunks
             else:
                 # Fallback - try to call the parser directly
                 return self.parser(text_content)
@@ -128,12 +146,14 @@ class LlamaIndexChunker:
                 raise e  # Raise the original error
 
     @trace_method
-    def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = 384, chunk_overlap: int = 25) -> List[str]:
+    def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = None, chunk_overlap: int = None) -> List[str]:
         """Chunk text using default SentenceSplitter regardless of file type with conservative defaults"""
         try:
             from llama_index.core.node_parser import SentenceSplitter
 
             # Use provided defaults or fallback to conservative values
+            chunk_size = chunk_size if chunk_size is not None else self.DEFAULT_CONSERVATIVE_CHUNK_SIZE
+            chunk_overlap = chunk_overlap if chunk_overlap is not None else self.DEFAULT_CONSERVATIVE_CHUNK_OVERLAP
             default_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 
             # Handle different input types