feat: Further split structured documents (#3199)
This commit is contained in:
@@ -12,6 +12,10 @@ logger = get_logger(__name__)
|
||||
class LlamaIndexChunker:
|
||||
"""LlamaIndex-based text chunking with automatic splitter selection"""
|
||||
|
||||
# Conservative default chunk sizes for fallback scenarios
|
||||
DEFAULT_CONSERVATIVE_CHUNK_SIZE = 384
|
||||
DEFAULT_CONSERVATIVE_CHUNK_OVERLAP = 25
|
||||
|
||||
def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50, file_type: Optional[str] = None):
|
||||
self.chunk_size = chunk_size
|
||||
self.chunk_overlap = chunk_overlap
|
||||
@@ -99,10 +103,24 @@ class LlamaIndexChunker:
|
||||
elif hasattr(self.parser, "get_nodes_from_documents"):
|
||||
# Some parsers need Document objects
|
||||
from llama_index.core import Document
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
document = Document(text=text_content)
|
||||
nodes = self.parser.get_nodes_from_documents([document])
|
||||
return [node.text for node in nodes]
|
||||
|
||||
# Further split nodes that exceed chunk_size using SentenceSplitter
|
||||
final_chunks = []
|
||||
sentence_splitter = SentenceSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
|
||||
|
||||
for node in nodes:
|
||||
if len(node.text) > self.chunk_size:
|
||||
# Split oversized nodes with sentence splitter
|
||||
sub_chunks = sentence_splitter.split_text(node.text)
|
||||
final_chunks.extend(sub_chunks)
|
||||
else:
|
||||
final_chunks.append(node.text)
|
||||
|
||||
return final_chunks
|
||||
else:
|
||||
# Fallback - try to call the parser directly
|
||||
return self.parser(text_content)
|
||||
@@ -128,12 +146,14 @@ class LlamaIndexChunker:
|
||||
raise e # Raise the original error
|
||||
|
||||
@trace_method
|
||||
def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = 384, chunk_overlap: int = 25) -> List[str]:
|
||||
def default_chunk_text(self, content: Union[OCRPageObject, str], chunk_size: int = None, chunk_overlap: int = None) -> List[str]:
|
||||
"""Chunk text using default SentenceSplitter regardless of file type with conservative defaults"""
|
||||
try:
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
# Use provided defaults or fallback to conservative values
|
||||
chunk_size = chunk_size if chunk_size is not None else self.DEFAULT_CONSERVATIVE_CHUNK_SIZE
|
||||
chunk_overlap = chunk_overlap if chunk_overlap is not None else self.DEFAULT_CONSERVATIVE_CHUNK_OVERLAP
|
||||
default_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||
|
||||
# Handle different input types
|
||||
|
||||
Reference in New Issue
Block a user