feat: Add content aware line chunking (#2707)

This commit is contained in:
Matthew Zhou
2025-06-09 13:03:25 -07:00
committed by GitHub
parent 71fcbbc863
commit 951773d0ec
7 changed files with 333 additions and 172 deletions

View File

@@ -1,34 +1,139 @@
import re
from typing import List, Optional
from letta.log import get_logger
from letta.schemas.file import FileMetadata
from letta.services.file_processor.file_types import ChunkingStrategy, file_type_registry
logger = get_logger(__name__)
class LineChunker:
"""Newline chunker"""
"""Content-aware line chunker that adapts chunking strategy based on file type"""
def __init__(self):
pass
self.file_type_registry = file_type_registry
def chunk_text(self, text: str, start: Optional[int] = None, end: Optional[int] = None, add_metadata: bool = True) -> List[str]:
"""Split lines"""
content_lines = [line.strip() for line in text.splitlines() if line.strip()]
total_lines = len(content_lines)
def _determine_chunking_strategy(self, file_metadata: FileMetadata) -> ChunkingStrategy:
    """Determine the best chunking strategy based on file metadata.

    The MIME type (``file_metadata.file_type``) is consulted first. The
    registry's MIME lookup answers ``LINE_BASED`` for MIME types it does not
    know instead of raising, so a ``LINE_BASED`` answer is treated as "no
    specific match" and the filename extension is tried before giving up.

    Args:
        file_metadata: Metadata of the file being chunked; only ``file_type``
            and ``file_name`` are read.

    Returns:
        The strategy found via MIME type or filename extension, or
        ``ChunkingStrategy.LINE_BASED`` when neither lookup yields one.
    """
    # Kept as a function-local import, as in the original implementation.
    import os

    # Try to get strategy from MIME type first
    if file_metadata.file_type:
        try:
            strategy = self.file_type_registry.get_chunking_strategy_by_mime_type(file_metadata.file_type)
        except Exception as exc:
            # Strategy detection is best-effort: never fail chunking because of it.
            logger.debug("MIME-based chunking strategy lookup failed: %s", exc)
            strategy = ChunkingStrategy.LINE_BASED
        # LINE_BASED is also the registry's "not found" answer, so only a more
        # specific strategy short-circuits the extension fallback.
        if strategy != ChunkingStrategy.LINE_BASED:
            return strategy

    # Fallback to filename extension
    if file_metadata.file_name:
        try:
            _, ext = os.path.splitext(file_metadata.file_name)
            if ext:
                return self.file_type_registry.get_chunking_strategy_by_extension(ext)
        except Exception as exc:
            # The extension lookup raises for unregistered extensions; fall through.
            logger.debug("Extension-based chunking strategy lookup failed: %s", exc)

    # Default fallback
    return ChunkingStrategy.LINE_BASED
def _chunk_by_lines(self, text: str, preserve_indentation: bool = False) -> List[str]:
"""Traditional line-based chunking for code and structured data"""
lines = []
for line in text.splitlines():
if preserve_indentation:
# For code: preserve leading whitespace (indentation), remove trailing whitespace
line = line.rstrip()
# Only skip completely empty lines
if line:
lines.append(line)
else:
# For structured data: strip all whitespace
line = line.strip()
if line:
lines.append(line)
return lines
def _chunk_by_sentences(self, text: str) -> List[str]:
"""Sentence-based chunking for documentation and markup"""
# Simple sentence splitting on periods, exclamation marks, and question marks
# followed by whitespace or end of string
sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
# Split text into sentences
sentences = re.split(sentence_pattern, text.strip())
# Clean up sentences - remove extra whitespace and empty sentences
cleaned_sentences = []
for sentence in sentences:
sentence = re.sub(r"\s+", " ", sentence.strip()) # Normalize whitespace
if sentence:
cleaned_sentences.append(sentence)
return cleaned_sentences
def _chunk_by_characters(self, text: str, target_line_length: int = 100) -> List[str]:
"""Character-based wrapping for prose text"""
words = text.split()
lines = []
current_line = []
current_length = 0
for word in words:
# Check if adding this word would exceed the target length
word_length = len(word)
if current_length + word_length + len(current_line) > target_line_length and current_line:
# Start a new line
lines.append(" ".join(current_line))
current_line = [word]
current_length = word_length
else:
current_line.append(word)
current_length += word_length
# Add the last line if there's content
if current_line:
lines.append(" ".join(current_line))
return [line for line in lines if line.strip()]
def chunk_text(
    self, text: str, file_metadata: FileMetadata, start: Optional[int] = None, end: Optional[int] = None, add_metadata: bool = True
) -> List[str]:
    """Chunk ``text`` with a strategy chosen from the file's metadata and number the chunks.

    Args:
        text: Raw file content.
        file_metadata: Metadata used to pick the chunking strategy.
        start: First chunk index to include; only honoured when ``end`` is also given.
        end: Exclusive end chunk index; only honoured when ``start`` is also given.
        add_metadata: When True, a ``[Viewing ...]`` header line is placed first.

    Returns:
        Chunks rendered as ``"<index>: <chunk>"`` (indices count from ``start``
        when a range is given, otherwise from 0), optionally preceded by the header.
    """
    strategy = self._determine_chunking_strategy(file_metadata)

    # Choose the splitter and the unit name used in the header together.
    if strategy == ChunkingStrategy.DOCUMENTATION:
        pieces = self._chunk_by_sentences(text)
        chunk_type = "sentences"
    elif strategy == ChunkingStrategy.PROSE:
        pieces = self._chunk_by_characters(text)
        chunk_type = "chunks"
    else:
        # CODE keeps indentation; STRUCTURED_DATA and LINE_BASED strip it.
        pieces = self._chunk_by_lines(text, preserve_indentation=(strategy == ChunkingStrategy.CODE))
        chunk_type = "lines"

    # Count before slicing so the header reports the size of the whole file.
    total_chunks = len(pieces)

    has_range = start is not None and end is not None
    if has_range:
        pieces = pieces[start:end]
    first_number = start if has_range else 0

    numbered = [f"{position + first_number}: {piece}" for position, piece in enumerate(pieces)]

    if not add_metadata:
        return numbered

    if has_range:
        header = f"[Viewing {chunk_type} {start} to {end-1} (out of {total_chunks} {chunk_type})]"
    else:
        header = f"[Viewing file start (out of {total_chunks} {chunk_type})]"
    return [header, *numbered]

View File

@@ -82,7 +82,7 @@ class FileProcessor:
# Insert to agent context window
# TODO: Rethink this line chunking mechanism
content_lines = self.line_chunker.chunk_text(text=raw_markdown_text)
content_lines = self.line_chunker.chunk_text(text=raw_markdown_text, file_metadata=file_metadata)
visible_content = "\n".join(content_lines)
await server.insert_file_into_context_windows(

View File

@@ -7,9 +7,20 @@ mime types, and file processing capabilities across the Letta codebase.
import mimetypes
from dataclasses import dataclass
from enum import Enum
from typing import Dict, Set
class ChunkingStrategy(str, Enum):
    """Enum for different file chunking strategies.

    Each registered file type carries one of these values; the line chunker
    uses it to decide how file content is split into numbered chunks.
    """

    CODE = "code"  # Line-based chunking for code files (leading indentation is preserved)
    STRUCTURED_DATA = "structured_data"  # Line-based chunking for JSON, XML, etc.
    DOCUMENTATION = "documentation"  # Sentence-based chunking; registered for Markdown files
    PROSE = "prose"  # Character-based wrapping for plain text
    LINE_BASED = "line_based"  # Default line-based chunking
@dataclass
class FileTypeInfo:
"""Information about a supported file type."""
@@ -18,6 +29,7 @@ class FileTypeInfo:
mime_type: str
is_simple_text: bool
description: str
chunking_strategy: ChunkingStrategy = ChunkingStrategy.LINE_BASED
class FileTypeRegistry:
@@ -31,63 +43,70 @@ class FileTypeRegistry:
def _register_default_types(self) -> None:
    """Register all default supported file types.

    Each row is ``(extension, mime_type, is_simple_text, description,
    chunking_strategy)`` and is passed positionally to ``self.register`` in
    the order listed.
    """
    code = ChunkingStrategy.CODE
    structured = ChunkingStrategy.STRUCTURED_DATA
    documentation = ChunkingStrategy.DOCUMENTATION

    default_types = (
        # Document formats
        (".pdf", "application/pdf", False, "PDF document", ChunkingStrategy.LINE_BASED),
        (".txt", "text/plain", True, "Plain text file", ChunkingStrategy.PROSE),
        (".md", "text/markdown", True, "Markdown document", documentation),
        (".markdown", "text/markdown", True, "Markdown document", documentation),
        (".json", "application/json", True, "JSON data file", structured),
        (".jsonl", "application/jsonl", True, "JSON Lines file", structured),
        # Programming languages
        (".py", "text/x-python", True, "Python source code", code),
        (".js", "text/javascript", True, "JavaScript source code", code),
        (".ts", "text/x-typescript", True, "TypeScript source code", code),
        (".java", "text/x-java-source", True, "Java source code", code),
        (".cpp", "text/x-c++", True, "C++ source code", code),
        (".cxx", "text/x-c++", True, "C++ source code", code),
        (".c", "text/x-c", True, "C source code", code),
        (".h", "text/x-c", True, "C/C++ header file", code),
        (".cs", "text/x-csharp", True, "C# source code", code),
        (".php", "text/x-php", True, "PHP source code", code),
        (".rb", "text/x-ruby", True, "Ruby source code", code),
        (".go", "text/x-go", True, "Go source code", code),
        (".rs", "text/x-rust", True, "Rust source code", code),
        (".swift", "text/x-swift", True, "Swift source code", code),
        (".kt", "text/x-kotlin", True, "Kotlin source code", code),
        (".scala", "text/x-scala", True, "Scala source code", code),
        (".r", "text/x-r", True, "R source code", code),
        (".m", "text/x-objective-c", True, "Objective-C source code", code),
        # Web technologies
        (".html", "text/html", True, "HTML document", code),
        (".htm", "text/html", True, "HTML document", code),
        (".css", "text/css", True, "CSS stylesheet", structured),
        (".scss", "text/x-scss", True, "SCSS stylesheet", structured),
        (".sass", "text/x-sass", True, "Sass stylesheet", structured),
        (".less", "text/x-less", True, "Less stylesheet", structured),
        (".vue", "text/x-vue", True, "Vue.js component", code),
        (".jsx", "text/x-jsx", True, "JSX source code", code),
        (".tsx", "text/x-tsx", True, "TSX source code", code),
        # Configuration and data formats
        (".xml", "application/xml", True, "XML document", structured),
        (".yaml", "text/x-yaml", True, "YAML configuration", structured),
        (".yml", "text/x-yaml", True, "YAML configuration", structured),
        (".toml", "application/toml", True, "TOML configuration", structured),
        (".ini", "text/x-ini", True, "INI configuration", structured),
        (".cfg", "text/x-conf", True, "Configuration file", structured),
        (".conf", "text/x-conf", True, "Configuration file", structured),
        # Scripts and SQL
        (".sh", "text/x-shellscript", True, "Shell script", code),
        (".bash", "text/x-shellscript", True, "Bash script", code),
        (".ps1", "text/x-powershell", True, "PowerShell script", code),
        (".bat", "text/x-batch", True, "Batch script", code),
        (".cmd", "text/x-batch", True, "Command script", code),
        (".dockerfile", "text/x-dockerfile", True, "Dockerfile", code),
        (".sql", "text/x-sql", True, "SQL script", code),
    )

    # Order is preserved: the MIME lookup returns the first registered entry
    # whose MIME type matches, and several extensions share a MIME type.
    for extension, mime_type, is_simple_text, description, strategy in default_types:
        self.register(extension, mime_type, is_simple_text, description, strategy)
def register(self, extension: str, mime_type: str, is_simple_text: bool, description: str) -> None:
def register(
self,
extension: str,
mime_type: str,
is_simple_text: bool,
description: str,
chunking_strategy: ChunkingStrategy = ChunkingStrategy.LINE_BASED,
) -> None:
"""
Register a new file type.
@@ -96,12 +115,17 @@ class FileTypeRegistry:
mime_type: MIME type for the file
is_simple_text: Whether this is a simple text file that can be read directly
description: Human-readable description of the file type
chunking_strategy: Strategy for chunking this file type
"""
if not extension.startswith("."):
extension = f".{extension}"
self._file_types[extension] = FileTypeInfo(
extension=extension, mime_type=mime_type, is_simple_text=is_simple_text, description=description
extension=extension,
mime_type=mime_type,
is_simple_text=is_simple_text,
description=description,
chunking_strategy=chunking_strategy,
)
def register_mime_types(self) -> None:
@@ -217,6 +241,37 @@ class FileTypeRegistry:
extension = f".{extension}"
return self._file_types[extension]
def get_chunking_strategy_by_extension(self, extension: str) -> ChunkingStrategy:
    """
    Look up the chunking strategy registered for a file extension.

    Args:
        extension: File extension (with or without leading dot)

    Returns:
        The ``chunking_strategy`` of the file type info found for the extension

    Raises:
        KeyError: If the extension is not supported (propagated from
            ``get_file_type_info``)
    """
    # Delegate the lookup, and any error it raises, to get_file_type_info.
    return self.get_file_type_info(extension).chunking_strategy
def get_chunking_strategy_by_mime_type(self, mime_type: str) -> ChunkingStrategy:
    """
    Look up the chunking strategy for a MIME type.

    Registered file types are scanned in registration order and the first one
    whose ``mime_type`` equals the argument wins.

    Args:
        mime_type: MIME type of the file

    Returns:
        ChunkingStrategy of the first matching file type, or LINE_BASED if
        no registered type has this MIME type
    """
    first_match = next(
        (info for info in self._file_types.values() if info.mime_type == mime_type),
        None,
    )
    if first_match is not None:
        return first_match.chunking_strategy
    return ChunkingStrategy.LINE_BASED
# Global registry instance
file_type_registry = FileTypeRegistry()

View File

@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple
from letta.log import get_logger
from letta.schemas.agent import AgentState
from letta.schemas.file import FileMetadata
from letta.schemas.sandbox_config import SandboxConfig
from letta.schemas.tool import Tool
from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -119,7 +120,7 @@ class LettaFileToolExecutor(ToolExecutor):
# TODO: Inefficient, maybe we can pre-compute this
# TODO: This is also not the best way to split things - would be cool to have "content aware" splitting
# TODO: Split code differently from large text blurbs
content_lines = LineChunker().chunk_text(text=file.content, start=start, end=end)
content_lines = LineChunker().chunk_text(text=file.content, file_metadata=file, start=start, end=end)
visible_content = "\n".join(content_lines)
await self.files_agents_manager.update_file_agent_by_id(
@@ -146,14 +147,14 @@ class LettaFileToolExecutor(ToolExecutor):
except re.error as e:
raise ValueError(f"Invalid regex pattern: {e}")
def _get_context_lines(self, text: str, match_line_idx: int, total_lines: int) -> List[str]:
def _get_context_lines(self, text: str, file_metadata: FileMetadata, match_line_idx: int, total_lines: int) -> List[str]:
"""Get context lines around a match using LineChunker."""
start_idx = max(0, match_line_idx - self.MAX_CONTEXT_LINES)
end_idx = min(total_lines, match_line_idx + self.MAX_CONTEXT_LINES + 1)
# Use LineChunker to get formatted lines with numbers
chunker = LineChunker()
context_lines = chunker.chunk_text(text, start=start_idx, end=end_idx, add_metadata=False)
context_lines = chunker.chunk_text(text, file_metadata=file_metadata, start=start_idx, end=end_idx, add_metadata=False)
# Add match indicator
formatted_lines = []
@@ -268,7 +269,7 @@ class LettaFileToolExecutor(ToolExecutor):
# Use LineChunker to get all lines with proper formatting
chunker = LineChunker()
formatted_lines = chunker.chunk_text(file.content)
formatted_lines = chunker.chunk_text(file.content, file_metadata=file)
# Remove metadata header
if formatted_lines and formatted_lines[0].startswith("[Viewing"):
@@ -295,7 +296,7 @@ class LettaFileToolExecutor(ToolExecutor):
if pattern_regex.search(line_content):
# Get context around the match (convert back to 0-based indexing)
context_lines = self._get_context_lines(file.content, line_num - 1, len(file.content.splitlines()))
context_lines = self._get_context_lines(file.content, file, line_num - 1, len(file.content.splitlines()))
# Format the match result
match_header = f"\n=== {file.file_name}:{line_num} ==="

100
tests/data/0_to_99.py Normal file
View File

@@ -0,0 +1,100 @@
x0 = 0
x1 = 1
x2 = 2
x3 = 3
x4 = 4
x5 = 5
x6 = 6
x7 = 7
x8 = 8
x9 = 9
x10 = 10
x11 = 11
x12 = 12
x13 = 13
x14 = 14
x15 = 15
x16 = 16
x17 = 17
x18 = 18
x19 = 19
x20 = 20
x21 = 21
x22 = 22
x23 = 23
x24 = 24
x25 = 25
x26 = 26
x27 = 27
x28 = 28
x29 = 29
x30 = 30
x31 = 31
x32 = 32
x33 = 33
x34 = 34
x35 = 35
x36 = 36
x37 = 37
x38 = 38
x39 = 39
x40 = 40
x41 = 41
x42 = 42
x43 = 43
x44 = 44
x45 = 45
x46 = 46
x47 = 47
x48 = 48
x49 = 49
x50 = 50
x51 = 51
x52 = 52
x53 = 53
x54 = 54
x55 = 55
x56 = 56
x57 = 57
x58 = 58
x59 = 59
x60 = 60
x61 = 61
x62 = 62
x63 = 63
x64 = 64
x65 = 65
x66 = 66
x67 = 67
x68 = 68
x69 = 69
x70 = 70
x71 = 71
x72 = 72
x73 = 73
x74 = 74
x75 = 75
x76 = 76
x77 = 77
x78 = 78
x79 = 79
x80 = 80
x81 = 81
x82 = 82
x83 = 83
x84 = 84
x85 = 85
x86 = 86
x87 = 87
x88 = 88
x89 = 89
x90 = 90
x91 = 91
x92 = 92
x93 = 93
x94 = 94
x95 = 95
x96 = 96
x97 = 97
x98 = 98
x99 = 99

View File

@@ -1,100 +0,0 @@
Line 1
Line 2
Line 3
Line 4
Line 5
Line 6
Line 7
Line 8
Line 9
Line 10
Line 11
Line 12
Line 13
Line 14
Line 15
Line 16
Line 17
Line 18
Line 19
Line 20
Line 21
Line 22
Line 23
Line 24
Line 25
Line 26
Line 27
Line 28
Line 29
Line 30
Line 31
Line 32
Line 33
Line 34
Line 35
Line 36
Line 37
Line 38
Line 39
Line 40
Line 41
Line 42
Line 43
Line 44
Line 45
Line 46
Line 47
Line 48
Line 49
Line 50
Line 51
Line 52
Line 53
Line 54
Line 55
Line 56
Line 57
Line 58
Line 59
Line 60
Line 61
Line 62
Line 63
Line 64
Line 65
Line 66
Line 67
Line 68
Line 69
Line 70
Line 71
Line 72
Line 73
Line 74
Line 75
Line 76
Line 77
Line 78
Line 79
Line 80
Line 81
Line 82
Line 83
Line 84
Line 85
Line 86
Line 87
Line 88
Line 89
Line 90
Line 91
Line 92
Line 93
Line 94
Line 95
Line 96
Line 97
Line 98
Line 99
Line 100

View File

@@ -499,7 +499,7 @@ def test_view_ranges_have_metadata(client: LettaSDKClient, agent_state: AgentSta
client.agents.sources.attach(source_id=source.id, agent_id=agent_state.id)
# Load files into the source
file_path = "tests/data/lines_1_to_100.txt"
file_path = "tests/data/0_to_99.py"
# Upload the files
with open(file_path, "rb") as f:
@@ -548,10 +548,10 @@ def test_view_ranges_have_metadata(client: LettaSDKClient, agent_state: AgentSta
block.value
== """
[Viewing lines 50 to 54 (out of 100 lines)]
50: Line 51
51: Line 52
52: Line 53
53: Line 54
54: Line 55
50: x50 = 50
51: x51 = 51
52: x52 = 52
53: x53 = 53
54: x54 = 54
""".strip()
)