feat: Add content aware line chunking (#2707)
This commit is contained in:
@@ -1,34 +1,139 @@
|
|||||||
|
import re
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from letta.log import get_logger
|
from letta.log import get_logger
|
||||||
|
from letta.schemas.file import FileMetadata
|
||||||
|
from letta.services.file_processor.file_types import ChunkingStrategy, file_type_registry
|
||||||
|
|
||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class LineChunker:
|
class LineChunker:
|
||||||
"""Newline chunker"""
|
"""Content-aware line chunker that adapts chunking strategy based on file type"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
self.file_type_registry = file_type_registry
|
||||||
|
|
||||||
def chunk_text(self, text: str, start: Optional[int] = None, end: Optional[int] = None, add_metadata: bool = True) -> List[str]:
|
def _determine_chunking_strategy(self, file_metadata: FileMetadata) -> ChunkingStrategy:
    """Pick the chunking strategy for a file from its metadata.

    Resolution order:
      1. The MIME type in ``file_metadata.file_type``.
      2. The extension of ``file_metadata.file_name``.
      3. ``ChunkingStrategy.LINE_BASED`` as the final default.

    The registry's MIME lookup returns ``LINE_BASED`` for MIME types it does
    not know instead of raising, so a ``LINE_BASED`` answer from step 1 is
    treated as inconclusive and the extension is still consulted. Without
    that, a file with a generic or unregistered MIME type would never reach
    the extension fallback.

    Args:
        file_metadata: Metadata of the file being chunked; only ``file_type``
            and ``file_name`` are read.

    Returns:
        The strategy to use. Lookup failures never propagate; they fall
        through to the next step.
    """
    import os

    # 1) MIME type, when it maps to something more specific than the default.
    if file_metadata.file_type:
        try:
            strategy = self.file_type_registry.get_chunking_strategy_by_mime_type(file_metadata.file_type)
        except Exception:
            # Best-effort: keep going, but leave a trace instead of hiding the failure.
            logger.debug("MIME-type strategy lookup failed for %r", file_metadata.file_type, exc_info=True)
        else:
            if strategy != ChunkingStrategy.LINE_BASED:
                return strategy

    # 2) Filename extension.
    if file_metadata.file_name:
        _, ext = os.path.splitext(file_metadata.file_name)
        if ext:
            try:
                return self.file_type_registry.get_chunking_strategy_by_extension(ext)
            except Exception:
                # Unregistered extension (the registry lookup raises): use the default.
                logger.debug("Extension strategy lookup failed for %r", ext, exc_info=True)

    # 3) Default.
    return ChunkingStrategy.LINE_BASED
|
||||||
|
|
||||||
|
def _chunk_by_lines(self, text: str, preserve_indentation: bool = False) -> List[str]:
|
||||||
|
"""Traditional line-based chunking for code and structured data"""
|
||||||
|
lines = []
|
||||||
|
for line in text.splitlines():
|
||||||
|
if preserve_indentation:
|
||||||
|
# For code: preserve leading whitespace (indentation), remove trailing whitespace
|
||||||
|
line = line.rstrip()
|
||||||
|
# Only skip completely empty lines
|
||||||
|
if line:
|
||||||
|
lines.append(line)
|
||||||
|
else:
|
||||||
|
# For structured data: strip all whitespace
|
||||||
|
line = line.strip()
|
||||||
|
if line:
|
||||||
|
lines.append(line)
|
||||||
|
return lines
|
||||||
|
|
||||||
|
def _chunk_by_sentences(self, text: str) -> List[str]:
|
||||||
|
"""Sentence-based chunking for documentation and markup"""
|
||||||
|
# Simple sentence splitting on periods, exclamation marks, and question marks
|
||||||
|
# followed by whitespace or end of string
|
||||||
|
sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
|
||||||
|
|
||||||
|
# Split text into sentences
|
||||||
|
sentences = re.split(sentence_pattern, text.strip())
|
||||||
|
|
||||||
|
# Clean up sentences - remove extra whitespace and empty sentences
|
||||||
|
cleaned_sentences = []
|
||||||
|
for sentence in sentences:
|
||||||
|
sentence = re.sub(r"\s+", " ", sentence.strip()) # Normalize whitespace
|
||||||
|
if sentence:
|
||||||
|
cleaned_sentences.append(sentence)
|
||||||
|
|
||||||
|
return cleaned_sentences
|
||||||
|
|
||||||
|
def _chunk_by_characters(self, text: str, target_line_length: int = 100) -> List[str]:
|
||||||
|
"""Character-based wrapping for prose text"""
|
||||||
|
words = text.split()
|
||||||
|
lines = []
|
||||||
|
current_line = []
|
||||||
|
current_length = 0
|
||||||
|
|
||||||
|
for word in words:
|
||||||
|
# Check if adding this word would exceed the target length
|
||||||
|
word_length = len(word)
|
||||||
|
if current_length + word_length + len(current_line) > target_line_length and current_line:
|
||||||
|
# Start a new line
|
||||||
|
lines.append(" ".join(current_line))
|
||||||
|
current_line = [word]
|
||||||
|
current_length = word_length
|
||||||
|
else:
|
||||||
|
current_line.append(word)
|
||||||
|
current_length += word_length
|
||||||
|
|
||||||
|
# Add the last line if there's content
|
||||||
|
if current_line:
|
||||||
|
lines.append(" ".join(current_line))
|
||||||
|
|
||||||
|
return [line for line in lines if line.strip()]
|
||||||
|
|
||||||
|
def chunk_text(
    self, text: str, file_metadata: FileMetadata, start: Optional[int] = None, end: Optional[int] = None, add_metadata: bool = True
) -> List[str]:
    """Chunk ``text`` according to its file type and number every chunk.

    Args:
        text: Raw file content.
        file_metadata: Metadata used to choose the chunking strategy.
        start: First chunk index (0-based, inclusive) of the window to return.
        end: Chunk index (exclusive) at which the window stops. The window is
            applied only when both ``start`` and ``end`` are given.
        add_metadata: When True, a ``[Viewing ...]`` header is inserted as
            the first element.

    Returns:
        ``"<index>: <chunk>"`` strings with 0-based indices relative to the
        whole file, optionally preceded by the header line.
    """
    strategy = self._determine_chunking_strategy(file_metadata)

    # Pick the splitter and the unit name used in the header together.
    if strategy == ChunkingStrategy.DOCUMENTATION:
        content_lines = self._chunk_by_sentences(text)
        chunk_type = "sentences"
    elif strategy == ChunkingStrategy.PROSE:
        content_lines = self._chunk_by_characters(text)
        chunk_type = "chunks"
    elif strategy == ChunkingStrategy.CODE:
        content_lines = self._chunk_by_lines(text, preserve_indentation=True)
        chunk_type = "lines"
    else:  # STRUCTURED_DATA or LINE_BASED
        content_lines = self._chunk_by_lines(text, preserve_indentation=False)
        chunk_type = "lines"

    total_chunks = len(content_lines)

    # A window is applied only when both bounds are supplied (0 is a valid start).
    windowed = start is not None and end is not None
    if windowed:
        content_lines = content_lines[start:end]
        line_offset = start
    else:
        line_offset = 0

    # Number chunks by their position in the full file, not in the window.
    content_lines = [f"{i}: {line}" for i, line in enumerate(content_lines, line_offset)]

    if add_metadata:
        if windowed:
            # Clamp the displayed upper bound so the header never advertises
            # chunks beyond the end of the file.
            last_shown = min(end, total_chunks) - 1
            header = f"[Viewing {chunk_type} {start} to {last_shown} (out of {total_chunks} {chunk_type})]"
        else:
            header = f"[Viewing file start (out of {total_chunks} {chunk_type})]"
        content_lines.insert(0, header)

    return content_lines
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ class FileProcessor:
|
|||||||
|
|
||||||
# Insert to agent context window
|
# Insert to agent context window
|
||||||
# TODO: Rethink this line chunking mechanism
|
# TODO: Rethink this line chunking mechanism
|
||||||
content_lines = self.line_chunker.chunk_text(text=raw_markdown_text)
|
content_lines = self.line_chunker.chunk_text(text=raw_markdown_text, file_metadata=file_metadata)
|
||||||
visible_content = "\n".join(content_lines)
|
visible_content = "\n".join(content_lines)
|
||||||
|
|
||||||
await server.insert_file_into_context_windows(
|
await server.insert_file_into_context_windows(
|
||||||
|
|||||||
@@ -7,9 +7,20 @@ mime types, and file processing capabilities across the Letta codebase.
|
|||||||
|
|
||||||
import mimetypes
|
import mimetypes
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from enum import Enum
|
||||||
from typing import Dict, Set
|
from typing import Dict, Set
|
||||||
|
|
||||||
|
|
||||||
|
class ChunkingStrategy(str, Enum):
    """Enum for different file chunking strategies.

    Each registered file type carries one of these values; the line chunker
    uses it to decide how file content is split into numbered chunks.
    """

    CODE = "code"  # Line-based chunking for code files; the chunker keeps leading indentation
    STRUCTURED_DATA = "structured_data"  # Line-based chunking for JSON, XML, etc.; lines are fully stripped
    DOCUMENTATION = "documentation"  # Sentence-based chunking for documentation such as Markdown
    PROSE = "prose"  # Character-based wrapping for plain text
    LINE_BASED = "line_based"  # Default line-based chunking
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class FileTypeInfo:
|
class FileTypeInfo:
|
||||||
"""Information about a supported file type."""
|
"""Information about a supported file type."""
|
||||||
@@ -18,6 +29,7 @@ class FileTypeInfo:
|
|||||||
mime_type: str
|
mime_type: str
|
||||||
is_simple_text: bool
|
is_simple_text: bool
|
||||||
description: str
|
description: str
|
||||||
|
chunking_strategy: ChunkingStrategy = ChunkingStrategy.LINE_BASED
|
||||||
|
|
||||||
|
|
||||||
class FileTypeRegistry:
|
class FileTypeRegistry:
|
||||||
@@ -31,63 +43,70 @@ class FileTypeRegistry:
|
|||||||
def _register_default_types(self) -> None:
|
def _register_default_types(self) -> None:
    """Register all default supported file types.

    Each entry is ``(extension, MIME type, is_simple_text, description,
    chunking strategy)``. Several extensions share one MIME type (for
    example ``.md``/``.markdown`` and ``.c``/``.h``); entries sharing a MIME
    type are given the same strategy here.
    """
    # Document formats
    self.register(".pdf", "application/pdf", False, "PDF document", ChunkingStrategy.LINE_BASED)
    self.register(".txt", "text/plain", True, "Plain text file", ChunkingStrategy.PROSE)
    self.register(".md", "text/markdown", True, "Markdown document", ChunkingStrategy.DOCUMENTATION)
    self.register(".markdown", "text/markdown", True, "Markdown document", ChunkingStrategy.DOCUMENTATION)
    self.register(".json", "application/json", True, "JSON data file", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".jsonl", "application/jsonl", True, "JSON Lines file", ChunkingStrategy.STRUCTURED_DATA)

    # Programming languages
    self.register(".py", "text/x-python", True, "Python source code", ChunkingStrategy.CODE)
    self.register(".js", "text/javascript", True, "JavaScript source code", ChunkingStrategy.CODE)
    self.register(".ts", "text/x-typescript", True, "TypeScript source code", ChunkingStrategy.CODE)
    self.register(".java", "text/x-java-source", True, "Java source code", ChunkingStrategy.CODE)
    self.register(".cpp", "text/x-c++", True, "C++ source code", ChunkingStrategy.CODE)
    self.register(".cxx", "text/x-c++", True, "C++ source code", ChunkingStrategy.CODE)
    self.register(".c", "text/x-c", True, "C source code", ChunkingStrategy.CODE)
    self.register(".h", "text/x-c", True, "C/C++ header file", ChunkingStrategy.CODE)
    self.register(".cs", "text/x-csharp", True, "C# source code", ChunkingStrategy.CODE)
    self.register(".php", "text/x-php", True, "PHP source code", ChunkingStrategy.CODE)
    self.register(".rb", "text/x-ruby", True, "Ruby source code", ChunkingStrategy.CODE)
    self.register(".go", "text/x-go", True, "Go source code", ChunkingStrategy.CODE)
    self.register(".rs", "text/x-rust", True, "Rust source code", ChunkingStrategy.CODE)
    self.register(".swift", "text/x-swift", True, "Swift source code", ChunkingStrategy.CODE)
    self.register(".kt", "text/x-kotlin", True, "Kotlin source code", ChunkingStrategy.CODE)
    self.register(".scala", "text/x-scala", True, "Scala source code", ChunkingStrategy.CODE)
    self.register(".r", "text/x-r", True, "R source code", ChunkingStrategy.CODE)
    self.register(".m", "text/x-objective-c", True, "Objective-C source code", ChunkingStrategy.CODE)

    # Web technologies
    # HTML is treated as code (line-based), not as documentation.
    self.register(".html", "text/html", True, "HTML document", ChunkingStrategy.CODE)
    self.register(".htm", "text/html", True, "HTML document", ChunkingStrategy.CODE)
    self.register(".css", "text/css", True, "CSS stylesheet", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".scss", "text/x-scss", True, "SCSS stylesheet", ChunkingStrategy.STRUCTURED_DATA)
    # NOTE(review): indented Sass is whitespace-sensitive, and the line chunker strips
    # leading whitespace for STRUCTURED_DATA — confirm losing the nesting is acceptable.
    self.register(".sass", "text/x-sass", True, "Sass stylesheet", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".less", "text/x-less", True, "Less stylesheet", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".vue", "text/x-vue", True, "Vue.js component", ChunkingStrategy.CODE)
    self.register(".jsx", "text/x-jsx", True, "JSX source code", ChunkingStrategy.CODE)
    self.register(".tsx", "text/x-tsx", True, "TSX source code", ChunkingStrategy.CODE)

    # Configuration and data formats
    self.register(".xml", "application/xml", True, "XML document", ChunkingStrategy.STRUCTURED_DATA)
    # NOTE(review): YAML nesting is expressed through indentation, which the line chunker
    # strips for STRUCTURED_DATA — confirm this is intended for .yaml/.yml.
    self.register(".yaml", "text/x-yaml", True, "YAML configuration", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".yml", "text/x-yaml", True, "YAML configuration", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".toml", "application/toml", True, "TOML configuration", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".ini", "text/x-ini", True, "INI configuration", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".cfg", "text/x-conf", True, "Configuration file", ChunkingStrategy.STRUCTURED_DATA)
    self.register(".conf", "text/x-conf", True, "Configuration file", ChunkingStrategy.STRUCTURED_DATA)

    # Scripts and SQL
    self.register(".sh", "text/x-shellscript", True, "Shell script", ChunkingStrategy.CODE)
    self.register(".bash", "text/x-shellscript", True, "Bash script", ChunkingStrategy.CODE)
    self.register(".ps1", "text/x-powershell", True, "PowerShell script", ChunkingStrategy.CODE)
    self.register(".bat", "text/x-batch", True, "Batch script", ChunkingStrategy.CODE)
    self.register(".cmd", "text/x-batch", True, "Command script", ChunkingStrategy.CODE)
    self.register(".dockerfile", "text/x-dockerfile", True, "Dockerfile", ChunkingStrategy.CODE)
    self.register(".sql", "text/x-sql", True, "SQL script", ChunkingStrategy.CODE)
|
||||||
|
|
||||||
def register(self, extension: str, mime_type: str, is_simple_text: bool, description: str) -> None:
|
def register(
|
||||||
|
self,
|
||||||
|
extension: str,
|
||||||
|
mime_type: str,
|
||||||
|
is_simple_text: bool,
|
||||||
|
description: str,
|
||||||
|
chunking_strategy: ChunkingStrategy = ChunkingStrategy.LINE_BASED,
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Register a new file type.
|
Register a new file type.
|
||||||
|
|
||||||
@@ -96,12 +115,17 @@ class FileTypeRegistry:
|
|||||||
mime_type: MIME type for the file
|
mime_type: MIME type for the file
|
||||||
is_simple_text: Whether this is a simple text file that can be read directly
|
is_simple_text: Whether this is a simple text file that can be read directly
|
||||||
description: Human-readable description of the file type
|
description: Human-readable description of the file type
|
||||||
|
chunking_strategy: Strategy for chunking this file type
|
||||||
"""
|
"""
|
||||||
if not extension.startswith("."):
|
if not extension.startswith("."):
|
||||||
extension = f".{extension}"
|
extension = f".{extension}"
|
||||||
|
|
||||||
self._file_types[extension] = FileTypeInfo(
|
self._file_types[extension] = FileTypeInfo(
|
||||||
extension=extension, mime_type=mime_type, is_simple_text=is_simple_text, description=description
|
extension=extension,
|
||||||
|
mime_type=mime_type,
|
||||||
|
is_simple_text=is_simple_text,
|
||||||
|
description=description,
|
||||||
|
chunking_strategy=chunking_strategy,
|
||||||
)
|
)
|
||||||
|
|
||||||
def register_mime_types(self) -> None:
|
def register_mime_types(self) -> None:
|
||||||
@@ -217,6 +241,37 @@ class FileTypeRegistry:
|
|||||||
extension = f".{extension}"
|
extension = f".{extension}"
|
||||||
return self._file_types[extension]
|
return self._file_types[extension]
|
||||||
|
|
||||||
|
def get_chunking_strategy_by_extension(self, extension: str) -> ChunkingStrategy:
    """
    Look up the chunking strategy registered for a file extension.

    Args:
        extension: File extension (with or without leading dot)

    Returns:
        The ChunkingStrategy stored on the matching file type entry

    Raises:
        KeyError: If the extension is not supported
    """
    # Delegate resolution of the extension to the file-type lookup and read the strategy off it.
    return self.get_file_type_info(extension).chunking_strategy
|
||||||
|
|
||||||
|
def get_chunking_strategy_by_mime_type(self, mime_type: str) -> ChunkingStrategy:
    """
    Look up the chunking strategy registered for a MIME type.

    Registered file types are scanned in registration order and the first
    entry whose MIME type equals ``mime_type`` wins.

    Args:
        mime_type: MIME type of the file

    Returns:
        ChunkingStrategy of the first matching file type, or LINE_BASED if
        no registered type has this MIME type
    """
    matching = [info.chunking_strategy for info in self._file_types.values() if info.mime_type == mime_type]
    return matching[0] if matching else ChunkingStrategy.LINE_BASED
|
||||||
|
|
||||||
|
|
||||||
# Global registry instance
|
# Global registry instance
|
||||||
file_type_registry = FileTypeRegistry()
|
file_type_registry = FileTypeRegistry()
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|||||||
|
|
||||||
from letta.log import get_logger
|
from letta.log import get_logger
|
||||||
from letta.schemas.agent import AgentState
|
from letta.schemas.agent import AgentState
|
||||||
|
from letta.schemas.file import FileMetadata
|
||||||
from letta.schemas.sandbox_config import SandboxConfig
|
from letta.schemas.sandbox_config import SandboxConfig
|
||||||
from letta.schemas.tool import Tool
|
from letta.schemas.tool import Tool
|
||||||
from letta.schemas.tool_execution_result import ToolExecutionResult
|
from letta.schemas.tool_execution_result import ToolExecutionResult
|
||||||
@@ -119,7 +120,7 @@ class LettaFileToolExecutor(ToolExecutor):
|
|||||||
# TODO: Inefficient, maybe we can pre-compute this
|
# TODO: Inefficient, maybe we can pre-compute this
|
||||||
# TODO: This is also not the best way to split things - would be cool to have "content aware" splitting
|
# TODO: This is also not the best way to split things - would be cool to have "content aware" splitting
|
||||||
# TODO: Split code differently from large text blurbs
|
# TODO: Split code differently from large text blurbs
|
||||||
content_lines = LineChunker().chunk_text(text=file.content, start=start, end=end)
|
content_lines = LineChunker().chunk_text(text=file.content, file_metadata=file, start=start, end=end)
|
||||||
visible_content = "\n".join(content_lines)
|
visible_content = "\n".join(content_lines)
|
||||||
|
|
||||||
await self.files_agents_manager.update_file_agent_by_id(
|
await self.files_agents_manager.update_file_agent_by_id(
|
||||||
@@ -146,14 +147,14 @@ class LettaFileToolExecutor(ToolExecutor):
|
|||||||
except re.error as e:
|
except re.error as e:
|
||||||
raise ValueError(f"Invalid regex pattern: {e}")
|
raise ValueError(f"Invalid regex pattern: {e}")
|
||||||
|
|
||||||
def _get_context_lines(self, text: str, match_line_idx: int, total_lines: int) -> List[str]:
|
def _get_context_lines(self, text: str, file_metadata: FileMetadata, match_line_idx: int, total_lines: int) -> List[str]:
|
||||||
"""Get context lines around a match using LineChunker."""
|
"""Get context lines around a match using LineChunker."""
|
||||||
start_idx = max(0, match_line_idx - self.MAX_CONTEXT_LINES)
|
start_idx = max(0, match_line_idx - self.MAX_CONTEXT_LINES)
|
||||||
end_idx = min(total_lines, match_line_idx + self.MAX_CONTEXT_LINES + 1)
|
end_idx = min(total_lines, match_line_idx + self.MAX_CONTEXT_LINES + 1)
|
||||||
|
|
||||||
# Use LineChunker to get formatted lines with numbers
|
# Use LineChunker to get formatted lines with numbers
|
||||||
chunker = LineChunker()
|
chunker = LineChunker()
|
||||||
context_lines = chunker.chunk_text(text, start=start_idx, end=end_idx, add_metadata=False)
|
context_lines = chunker.chunk_text(text, file_metadata=file_metadata, start=start_idx, end=end_idx, add_metadata=False)
|
||||||
|
|
||||||
# Add match indicator
|
# Add match indicator
|
||||||
formatted_lines = []
|
formatted_lines = []
|
||||||
@@ -268,7 +269,7 @@ class LettaFileToolExecutor(ToolExecutor):
|
|||||||
|
|
||||||
# Use LineChunker to get all lines with proper formatting
|
# Use LineChunker to get all lines with proper formatting
|
||||||
chunker = LineChunker()
|
chunker = LineChunker()
|
||||||
formatted_lines = chunker.chunk_text(file.content)
|
formatted_lines = chunker.chunk_text(file.content, file_metadata=file)
|
||||||
|
|
||||||
# Remove metadata header
|
# Remove metadata header
|
||||||
if formatted_lines and formatted_lines[0].startswith("[Viewing"):
|
if formatted_lines and formatted_lines[0].startswith("[Viewing"):
|
||||||
@@ -295,7 +296,7 @@ class LettaFileToolExecutor(ToolExecutor):
|
|||||||
|
|
||||||
if pattern_regex.search(line_content):
|
if pattern_regex.search(line_content):
|
||||||
# Get context around the match (convert back to 0-based indexing)
|
# Get context around the match (convert back to 0-based indexing)
|
||||||
context_lines = self._get_context_lines(file.content, line_num - 1, len(file.content.splitlines()))
|
context_lines = self._get_context_lines(file.content, file, line_num - 1, len(file.content.splitlines()))
|
||||||
|
|
||||||
# Format the match result
|
# Format the match result
|
||||||
match_header = f"\n=== {file.file_name}:{line_num} ==="
|
match_header = f"\n=== {file.file_name}:{line_num} ==="
|
||||||
|
|||||||
100
tests/data/0_to_99.py
Normal file
100
tests/data/0_to_99.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
x0 = 0
|
||||||
|
x1 = 1
|
||||||
|
x2 = 2
|
||||||
|
x3 = 3
|
||||||
|
x4 = 4
|
||||||
|
x5 = 5
|
||||||
|
x6 = 6
|
||||||
|
x7 = 7
|
||||||
|
x8 = 8
|
||||||
|
x9 = 9
|
||||||
|
x10 = 10
|
||||||
|
x11 = 11
|
||||||
|
x12 = 12
|
||||||
|
x13 = 13
|
||||||
|
x14 = 14
|
||||||
|
x15 = 15
|
||||||
|
x16 = 16
|
||||||
|
x17 = 17
|
||||||
|
x18 = 18
|
||||||
|
x19 = 19
|
||||||
|
x20 = 20
|
||||||
|
x21 = 21
|
||||||
|
x22 = 22
|
||||||
|
x23 = 23
|
||||||
|
x24 = 24
|
||||||
|
x25 = 25
|
||||||
|
x26 = 26
|
||||||
|
x27 = 27
|
||||||
|
x28 = 28
|
||||||
|
x29 = 29
|
||||||
|
x30 = 30
|
||||||
|
x31 = 31
|
||||||
|
x32 = 32
|
||||||
|
x33 = 33
|
||||||
|
x34 = 34
|
||||||
|
x35 = 35
|
||||||
|
x36 = 36
|
||||||
|
x37 = 37
|
||||||
|
x38 = 38
|
||||||
|
x39 = 39
|
||||||
|
x40 = 40
|
||||||
|
x41 = 41
|
||||||
|
x42 = 42
|
||||||
|
x43 = 43
|
||||||
|
x44 = 44
|
||||||
|
x45 = 45
|
||||||
|
x46 = 46
|
||||||
|
x47 = 47
|
||||||
|
x48 = 48
|
||||||
|
x49 = 49
|
||||||
|
x50 = 50
|
||||||
|
x51 = 51
|
||||||
|
x52 = 52
|
||||||
|
x53 = 53
|
||||||
|
x54 = 54
|
||||||
|
x55 = 55
|
||||||
|
x56 = 56
|
||||||
|
x57 = 57
|
||||||
|
x58 = 58
|
||||||
|
x59 = 59
|
||||||
|
x60 = 60
|
||||||
|
x61 = 61
|
||||||
|
x62 = 62
|
||||||
|
x63 = 63
|
||||||
|
x64 = 64
|
||||||
|
x65 = 65
|
||||||
|
x66 = 66
|
||||||
|
x67 = 67
|
||||||
|
x68 = 68
|
||||||
|
x69 = 69
|
||||||
|
x70 = 70
|
||||||
|
x71 = 71
|
||||||
|
x72 = 72
|
||||||
|
x73 = 73
|
||||||
|
x74 = 74
|
||||||
|
x75 = 75
|
||||||
|
x76 = 76
|
||||||
|
x77 = 77
|
||||||
|
x78 = 78
|
||||||
|
x79 = 79
|
||||||
|
x80 = 80
|
||||||
|
x81 = 81
|
||||||
|
x82 = 82
|
||||||
|
x83 = 83
|
||||||
|
x84 = 84
|
||||||
|
x85 = 85
|
||||||
|
x86 = 86
|
||||||
|
x87 = 87
|
||||||
|
x88 = 88
|
||||||
|
x89 = 89
|
||||||
|
x90 = 90
|
||||||
|
x91 = 91
|
||||||
|
x92 = 92
|
||||||
|
x93 = 93
|
||||||
|
x94 = 94
|
||||||
|
x95 = 95
|
||||||
|
x96 = 96
|
||||||
|
x97 = 97
|
||||||
|
x98 = 98
|
||||||
|
x99 = 99
|
||||||
@@ -1,100 +0,0 @@
|
|||||||
Line 1
|
|
||||||
Line 2
|
|
||||||
Line 3
|
|
||||||
Line 4
|
|
||||||
Line 5
|
|
||||||
Line 6
|
|
||||||
Line 7
|
|
||||||
Line 8
|
|
||||||
Line 9
|
|
||||||
Line 10
|
|
||||||
Line 11
|
|
||||||
Line 12
|
|
||||||
Line 13
|
|
||||||
Line 14
|
|
||||||
Line 15
|
|
||||||
Line 16
|
|
||||||
Line 17
|
|
||||||
Line 18
|
|
||||||
Line 19
|
|
||||||
Line 20
|
|
||||||
Line 21
|
|
||||||
Line 22
|
|
||||||
Line 23
|
|
||||||
Line 24
|
|
||||||
Line 25
|
|
||||||
Line 26
|
|
||||||
Line 27
|
|
||||||
Line 28
|
|
||||||
Line 29
|
|
||||||
Line 30
|
|
||||||
Line 31
|
|
||||||
Line 32
|
|
||||||
Line 33
|
|
||||||
Line 34
|
|
||||||
Line 35
|
|
||||||
Line 36
|
|
||||||
Line 37
|
|
||||||
Line 38
|
|
||||||
Line 39
|
|
||||||
Line 40
|
|
||||||
Line 41
|
|
||||||
Line 42
|
|
||||||
Line 43
|
|
||||||
Line 44
|
|
||||||
Line 45
|
|
||||||
Line 46
|
|
||||||
Line 47
|
|
||||||
Line 48
|
|
||||||
Line 49
|
|
||||||
Line 50
|
|
||||||
Line 51
|
|
||||||
Line 52
|
|
||||||
Line 53
|
|
||||||
Line 54
|
|
||||||
Line 55
|
|
||||||
Line 56
|
|
||||||
Line 57
|
|
||||||
Line 58
|
|
||||||
Line 59
|
|
||||||
Line 60
|
|
||||||
Line 61
|
|
||||||
Line 62
|
|
||||||
Line 63
|
|
||||||
Line 64
|
|
||||||
Line 65
|
|
||||||
Line 66
|
|
||||||
Line 67
|
|
||||||
Line 68
|
|
||||||
Line 69
|
|
||||||
Line 70
|
|
||||||
Line 71
|
|
||||||
Line 72
|
|
||||||
Line 73
|
|
||||||
Line 74
|
|
||||||
Line 75
|
|
||||||
Line 76
|
|
||||||
Line 77
|
|
||||||
Line 78
|
|
||||||
Line 79
|
|
||||||
Line 80
|
|
||||||
Line 81
|
|
||||||
Line 82
|
|
||||||
Line 83
|
|
||||||
Line 84
|
|
||||||
Line 85
|
|
||||||
Line 86
|
|
||||||
Line 87
|
|
||||||
Line 88
|
|
||||||
Line 89
|
|
||||||
Line 90
|
|
||||||
Line 91
|
|
||||||
Line 92
|
|
||||||
Line 93
|
|
||||||
Line 94
|
|
||||||
Line 95
|
|
||||||
Line 96
|
|
||||||
Line 97
|
|
||||||
Line 98
|
|
||||||
Line 99
|
|
||||||
Line 100
|
|
||||||
@@ -499,7 +499,7 @@ def test_view_ranges_have_metadata(client: LettaSDKClient, agent_state: AgentSta
|
|||||||
client.agents.sources.attach(source_id=source.id, agent_id=agent_state.id)
|
client.agents.sources.attach(source_id=source.id, agent_id=agent_state.id)
|
||||||
|
|
||||||
# Load files into the source
|
# Load files into the source
|
||||||
file_path = "tests/data/lines_1_to_100.txt"
|
file_path = "tests/data/0_to_99.py"
|
||||||
|
|
||||||
# Upload the files
|
# Upload the files
|
||||||
with open(file_path, "rb") as f:
|
with open(file_path, "rb") as f:
|
||||||
@@ -548,10 +548,10 @@ def test_view_ranges_have_metadata(client: LettaSDKClient, agent_state: AgentSta
|
|||||||
block.value
|
block.value
|
||||||
== """
|
== """
|
||||||
[Viewing lines 50 to 54 (out of 100 lines)]
|
[Viewing lines 50 to 54 (out of 100 lines)]
|
||||||
50: Line 51
|
50: x50 = 50
|
||||||
51: Line 52
|
51: x51 = 51
|
||||||
52: Line 53
|
52: x52 = 52
|
||||||
53: Line 54
|
53: x53 = 53
|
||||||
54: Line 55
|
54: x54 = 54
|
||||||
""".strip()
|
""".strip()
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user