feat: Further split structured documents (#3199)

This commit is contained in:
Matthew Zhou
2025-07-07 13:22:50 -07:00
committed by GitHub
parent 9cb286e170
commit 375b2b5032

View File

@@ -12,6 +12,10 @@ logger = get_logger(__name__)
class LlamaIndexChunker:
"""LlamaIndex-based text chunking with automatic splitter selection"""
# Conservative default chunk sizes for fallback scenarios
DEFAULT_CONSERVATIVE_CHUNK_SIZE = 384
DEFAULT_CONSERVATIVE_CHUNK_OVERLAP = 25
def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50, file_type: Optional[str] = None):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
@@ -99,10 +103,24 @@ class LlamaIndexChunker:
elif hasattr(self.parser, "get_nodes_from_documents"):
# Some parsers need Document objects
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
document = Document(text=text_content)
nodes = self.parser.get_nodes_from_documents([document])
return [node.text for node in nodes]
# Further split nodes that exceed chunk_size using SentenceSplitter
final_chunks = []
sentence_splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
for node in nodes:
if len(node.text) > self.chunk_size:
# Split oversized nodes with sentence splitter
sub_chunks = sentence_splitter.split_text(node.text)
final_chunks.extend(sub_chunks)
else:
final_chunks.append(node.text)
return final_chunks
else:
# Fallback - try to call the parser directly
return self.parser(text_content)
@@ -128,12 +146,14 @@ class LlamaIndexChunker:
raise e # Raise the original error
@trace_method
def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = 384, chunk_overlap: int = 25) -> List[str]:
def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = None, chunk_overlap: int = None) -> List[str]:
"""Chunk text using default SentenceSplitter regardless of file type with conservative defaults"""
try:
from llama_index.core.node_parser import SentenceSplitter
# Use provided defaults or fallback to conservative values
chunk_size = chunk_size if chunk_size is not None else self.DEFAULT_CONSERVATIVE_CHUNK_SIZE
chunk_overlap = chunk_overlap if chunk_overlap is not None else self.DEFAULT_CONSERVATIVE_CHUNK_OVERLAP
default_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# Handle different input types