feat: Add content aware line chunking (#2707)

2025-06-09 13:03:25 -07:00
parent 71fcbbc863
commit 951773d0ec
7 changed files with 333 additions and 172 deletions
--- a/letta/services/file_processor/chunker/line_chunker.py
+++ b/letta/services/file_processor/chunker/line_chunker.py
@@ -1,34 +1,139 @@
+import re
 from typing import List, Optional

 from letta.log import get_logger
+from letta.schemas.file import FileMetadata
+from letta.services.file_processor.file_types import ChunkingStrategy, file_type_registry

 logger = get_logger(__name__)


 class LineChunker:
-    """Newline chunker"""
+    """Content-aware line chunker that adapts chunking strategy based on file type"""

    def __init__(self):
-        pass
+        self.file_type_registry = file_type_registry

-    def chunk_text(self, text: str, start: Optional[int] = None, end: Optional[int] = None, add_metadata: bool = True) -> List[str]:
-        """Split lines"""
-        content_lines = [line.strip() for line in text.splitlines() if line.strip()]
-        total_lines = len(content_lines)
+    def _determine_chunking_strategy(self, file_metadata: FileMetadata) -> ChunkingStrategy:
+        """Determine the best chunking strategy based on file metadata"""
+        # Try to get strategy from MIME type first
+        if file_metadata.file_type:
+            try:
+                return self.file_type_registry.get_chunking_strategy_by_mime_type(file_metadata.file_type)
+            except Exception:
+                pass

-        if start and end:
+        # Fallback to filename extension (reached only when file_type is unset or the MIME lookup raises)
+        if file_metadata.file_name:
+            try:
+                # Extract extension from filename
+                import os
+
+                _, ext = os.path.splitext(file_metadata.file_name)
+                if ext:
+                    return self.file_type_registry.get_chunking_strategy_by_extension(ext)
+            except Exception:
+                pass
+
+        # Default fallback
+        return ChunkingStrategy.LINE_BASED
+
+    def _chunk_by_lines(self, text: str, preserve_indentation: bool = False) -> List[str]:
+        """Traditional line-based chunking for code and structured data"""
+        lines = []
+        for line in text.splitlines():
+            if preserve_indentation:
+                # For code: preserve leading whitespace (indentation), remove trailing whitespace
+                line = line.rstrip()
+                # Skip lines that are empty or whitespace-only after the rstrip
+                if line:
+                    lines.append(line)
+            else:
+                # For structured data: strip leading and trailing whitespace
+                line = line.strip()
+                if line:
+                    lines.append(line)
+        return lines
+
+    def _chunk_by_sentences(self, text: str) -> List[str]:
+        """Sentence-based chunking for documentation and markup"""
+        # Simple sentence splitting: break on whitespace that follows a period, exclamation
+        # mark, or question mark and is immediately followed by an uppercase ASCII letter
+        sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
+
+        # Split text into sentences
+        sentences = re.split(sentence_pattern, text.strip())
+
+        # Clean up sentences - remove extra whitespace and empty sentences
+        cleaned_sentences = []
+        for sentence in sentences:
+            sentence = re.sub(r"\s+", " ", sentence.strip())  # Normalize whitespace
+            if sentence:
+                cleaned_sentences.append(sentence)
+
+        return cleaned_sentences
+
+    def _chunk_by_characters(self, text: str, target_line_length: int = 100) -> List[str]:
+        """Character-based wrapping for prose text"""
+        words = text.split()
+        lines = []
+        current_line = []
+        current_length = 0
+
+        for word in words:
+            # Check if adding this word would exceed the target length
+            word_length = len(word)
+            if current_length + word_length + len(current_line) > target_line_length and current_line:
+                # Start a new line
+                lines.append(" ".join(current_line))
+                current_line = [word]
+                current_length = word_length
+            else:
+                current_line.append(word)
+                current_length += word_length
+
+        # Add the last line if there's content
+        if current_line:
+            lines.append(" ".join(current_line))
+
+        return [line for line in lines if line.strip()]
+
+    def chunk_text(
+        self, text: str, file_metadata: FileMetadata, start: Optional[int] = None, end: Optional[int] = None, add_metadata: bool = True
+    ) -> List[str]:
+        """Content-aware text chunking based on file type"""
+        strategy = self._determine_chunking_strategy(file_metadata)
+
+        # Apply the appropriate chunking strategy
+        if strategy == ChunkingStrategy.DOCUMENTATION:
+            content_lines = self._chunk_by_sentences(text)
+        elif strategy == ChunkingStrategy.PROSE:
+            content_lines = self._chunk_by_characters(text)
+        elif strategy == ChunkingStrategy.CODE:
+            content_lines = self._chunk_by_lines(text, preserve_indentation=True)
+        else:  # STRUCTURED_DATA or LINE_BASED
+            content_lines = self._chunk_by_lines(text, preserve_indentation=False)
+
+        total_chunks = len(content_lines)
+
+        # Handle start/end slicing
+        if start is not None and end is not None:
            content_lines = content_lines[start:end]
            line_offset = start
        else:
            line_offset = 0

+        # Add line numbers for all strategies
        content_lines = [f"{i + line_offset}: {line}" for i, line in enumerate(content_lines)]

-        # Add metadata about total lines
+        # Add metadata about total chunks
        if add_metadata:
-            if start and end:
-                content_lines.insert(0, f"[Viewing lines {start} to {end-1} (out of {total_lines} lines)]")
+            chunk_type = (
+                "sentences" if strategy == ChunkingStrategy.DOCUMENTATION else "chunks" if strategy == ChunkingStrategy.PROSE else "lines"
+            )
+            if start is not None and end is not None:
+                content_lines.insert(0, f"[Viewing {chunk_type} {start} to {end-1} (out of {total_chunks} {chunk_type})]")
            else:
-                content_lines.insert(0, f"[Viewing file start (out of {total_lines} lines)]")
+                content_lines.insert(0, f"[Viewing file start (out of {total_chunks} {chunk_type})]")

        return content_lines
--- a/letta/services/file_processor/file_processor.py
+++ b/letta/services/file_processor/file_processor.py
@@ -82,7 +82,7 @@ class FileProcessor:

            # Insert to agent context window
            # TODO: Rethink this line chunking mechanism
-            content_lines = self.line_chunker.chunk_text(text=raw_markdown_text)
+            content_lines = self.line_chunker.chunk_text(text=raw_markdown_text, file_metadata=file_metadata)
            visible_content = "\n".join(content_lines)

            await server.insert_file_into_context_windows(
--- a/letta/services/file_processor/file_types.py
+++ b/letta/services/file_processor/file_types.py
@@ -7,9 +7,20 @@ mime types, and file processing capabilities across the Letta codebase.

 import mimetypes
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, Set


+class ChunkingStrategy(str, Enum):
+    """Enum for different file chunking strategies."""
+
+    CODE = "code"  # Line-based chunking for code files
+    STRUCTURED_DATA = "structured_data"  # Line-based chunking for JSON, XML, etc.
+    DOCUMENTATION = "documentation"  # Sentence-based chunking for Markdown documents
+    PROSE = "prose"  # Character-based wrapping for plain text
+    LINE_BASED = "line_based"  # Default line-based chunking
+
+
@dataclass
 class FileTypeInfo:
    """Information about a supported file type."""
@@ -18,6 +29,7 @@ class FileTypeInfo:
    mime_type: str
    is_simple_text: bool
    description: str
+    chunking_strategy: ChunkingStrategy = ChunkingStrategy.LINE_BASED


 class FileTypeRegistry:
@@ -31,63 +43,70 @@ class FileTypeRegistry:
    def _register_default_types(self) -> None:
        """Register all default supported file types."""
        # Document formats
-        self.register(".pdf", "application/pdf", False, "PDF document")
-        self.register(".txt", "text/plain", True, "Plain text file")
-        self.register(".md", "text/markdown", True, "Markdown document")
-        self.register(".markdown", "text/markdown", True, "Markdown document")
-        self.register(".json", "application/json", True, "JSON data file")
-        self.register(".jsonl", "application/jsonl", True, "JSON Lines file")
+        self.register(".pdf", "application/pdf", False, "PDF document", ChunkingStrategy.LINE_BASED)
+        self.register(".txt", "text/plain", True, "Plain text file", ChunkingStrategy.PROSE)
+        self.register(".md", "text/markdown", True, "Markdown document", ChunkingStrategy.DOCUMENTATION)
+        self.register(".markdown", "text/markdown", True, "Markdown document", ChunkingStrategy.DOCUMENTATION)
+        self.register(".json", "application/json", True, "JSON data file", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".jsonl", "application/jsonl", True, "JSON Lines file", ChunkingStrategy.STRUCTURED_DATA)

        # Programming languages
-        self.register(".py", "text/x-python", True, "Python source code")
-        self.register(".js", "text/javascript", True, "JavaScript source code")
-        self.register(".ts", "text/x-typescript", True, "TypeScript source code")
-        self.register(".java", "text/x-java-source", True, "Java source code")
-        self.register(".cpp", "text/x-c++", True, "C++ source code")
-        self.register(".cxx", "text/x-c++", True, "C++ source code")
-        self.register(".c", "text/x-c", True, "C source code")
-        self.register(".h", "text/x-c", True, "C/C++ header file")
-        self.register(".cs", "text/x-csharp", True, "C# source code")
-        self.register(".php", "text/x-php", True, "PHP source code")
-        self.register(".rb", "text/x-ruby", True, "Ruby source code")
-        self.register(".go", "text/x-go", True, "Go source code")
-        self.register(".rs", "text/x-rust", True, "Rust source code")
-        self.register(".swift", "text/x-swift", True, "Swift source code")
-        self.register(".kt", "text/x-kotlin", True, "Kotlin source code")
-        self.register(".scala", "text/x-scala", True, "Scala source code")
-        self.register(".r", "text/x-r", True, "R source code")
-        self.register(".m", "text/x-objective-c", True, "Objective-C source code")
+        self.register(".py", "text/x-python", True, "Python source code", ChunkingStrategy.CODE)
+        self.register(".js", "text/javascript", True, "JavaScript source code", ChunkingStrategy.CODE)
+        self.register(".ts", "text/x-typescript", True, "TypeScript source code", ChunkingStrategy.CODE)
+        self.register(".java", "text/x-java-source", True, "Java source code", ChunkingStrategy.CODE)
+        self.register(".cpp", "text/x-c++", True, "C++ source code", ChunkingStrategy.CODE)
+        self.register(".cxx", "text/x-c++", True, "C++ source code", ChunkingStrategy.CODE)
+        self.register(".c", "text/x-c", True, "C source code", ChunkingStrategy.CODE)
+        self.register(".h", "text/x-c", True, "C/C++ header file", ChunkingStrategy.CODE)
+        self.register(".cs", "text/x-csharp", True, "C# source code", ChunkingStrategy.CODE)
+        self.register(".php", "text/x-php", True, "PHP source code", ChunkingStrategy.CODE)
+        self.register(".rb", "text/x-ruby", True, "Ruby source code", ChunkingStrategy.CODE)
+        self.register(".go", "text/x-go", True, "Go source code", ChunkingStrategy.CODE)
+        self.register(".rs", "text/x-rust", True, "Rust source code", ChunkingStrategy.CODE)
+        self.register(".swift", "text/x-swift", True, "Swift source code", ChunkingStrategy.CODE)
+        self.register(".kt", "text/x-kotlin", True, "Kotlin source code", ChunkingStrategy.CODE)
+        self.register(".scala", "text/x-scala", True, "Scala source code", ChunkingStrategy.CODE)
+        self.register(".r", "text/x-r", True, "R source code", ChunkingStrategy.CODE)
+        self.register(".m", "text/x-objective-c", True, "Objective-C source code", ChunkingStrategy.CODE)

        # Web technologies
-        self.register(".html", "text/html", True, "HTML document")
-        self.register(".htm", "text/html", True, "HTML document")
-        self.register(".css", "text/css", True, "CSS stylesheet")
-        self.register(".scss", "text/x-scss", True, "SCSS stylesheet")
-        self.register(".sass", "text/x-sass", True, "Sass stylesheet")
-        self.register(".less", "text/x-less", True, "Less stylesheet")
-        self.register(".vue", "text/x-vue", True, "Vue.js component")
-        self.register(".jsx", "text/x-jsx", True, "JSX source code")
-        self.register(".tsx", "text/x-tsx", True, "TSX source code")
+        self.register(".html", "text/html", True, "HTML document", ChunkingStrategy.CODE)
+        self.register(".htm", "text/html", True, "HTML document", ChunkingStrategy.CODE)
+        self.register(".css", "text/css", True, "CSS stylesheet", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".scss", "text/x-scss", True, "SCSS stylesheet", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".sass", "text/x-sass", True, "Sass stylesheet", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".less", "text/x-less", True, "Less stylesheet", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".vue", "text/x-vue", True, "Vue.js component", ChunkingStrategy.CODE)
+        self.register(".jsx", "text/x-jsx", True, "JSX source code", ChunkingStrategy.CODE)
+        self.register(".tsx", "text/x-tsx", True, "TSX source code", ChunkingStrategy.CODE)

        # Configuration and data formats
-        self.register(".xml", "application/xml", True, "XML document")
-        self.register(".yaml", "text/x-yaml", True, "YAML configuration")
-        self.register(".yml", "text/x-yaml", True, "YAML configuration")
-        self.register(".toml", "application/toml", True, "TOML configuration")
-        self.register(".ini", "text/x-ini", True, "INI configuration")
-        self.register(".cfg", "text/x-conf", True, "Configuration file")
-        self.register(".conf", "text/x-conf", True, "Configuration file")
+        self.register(".xml", "application/xml", True, "XML document", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".yaml", "text/x-yaml", True, "YAML configuration", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".yml", "text/x-yaml", True, "YAML configuration", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".toml", "application/toml", True, "TOML configuration", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".ini", "text/x-ini", True, "INI configuration", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".cfg", "text/x-conf", True, "Configuration file", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".conf", "text/x-conf", True, "Configuration file", ChunkingStrategy.STRUCTURED_DATA)

        # Scripts and SQL
-        self.register(".sh", "text/x-shellscript", True, "Shell script")
-        self.register(".bash", "text/x-shellscript", True, "Bash script")
-        self.register(".ps1", "text/x-powershell", True, "PowerShell script")
-        self.register(".bat", "text/x-batch", True, "Batch script")
-        self.register(".cmd", "text/x-batch", True, "Command script")
-        self.register(".dockerfile", "text/x-dockerfile", True, "Dockerfile")
-        self.register(".sql", "text/x-sql", True, "SQL script")
+        self.register(".sh", "text/x-shellscript", True, "Shell script", ChunkingStrategy.CODE)
+        self.register(".bash", "text/x-shellscript", True, "Bash script", ChunkingStrategy.CODE)
+        self.register(".ps1", "text/x-powershell", True, "PowerShell script", ChunkingStrategy.CODE)
+        self.register(".bat", "text/x-batch", True, "Batch script", ChunkingStrategy.CODE)
+        self.register(".cmd", "text/x-batch", True, "Command script", ChunkingStrategy.CODE)
+        self.register(".dockerfile", "text/x-dockerfile", True, "Dockerfile", ChunkingStrategy.CODE)
+        self.register(".sql", "text/x-sql", True, "SQL script", ChunkingStrategy.CODE)

-    def register(self, extension: str, mime_type: str, is_simple_text: bool, description: str) -> None:
+    def register(
+        self,
+        extension: str,
+        mime_type: str,
+        is_simple_text: bool,
+        description: str,
+        chunking_strategy: ChunkingStrategy = ChunkingStrategy.LINE_BASED,
+    ) -> None:
        """
        Register a new file type.

@@ -96,12 +115,17 @@ class FileTypeRegistry:
            mime_type: MIME type for the file
            is_simple_text: Whether this is a simple text file that can be read directly
            description: Human-readable description of the file type
+            chunking_strategy: Strategy for chunking this file type
        """
        if not extension.startswith("."):
            extension = f".{extension}"

        self._file_types[extension] = FileTypeInfo(
-            extension=extension, mime_type=mime_type, is_simple_text=is_simple_text, description=description
+            extension=extension,
+            mime_type=mime_type,
+            is_simple_text=is_simple_text,
+            description=description,
+            chunking_strategy=chunking_strategy,
        )

    def register_mime_types(self) -> None:
@@ -217,6 +241,37 @@ class FileTypeRegistry:
            extension = f".{extension}"
        return self._file_types[extension]

+    def get_chunking_strategy_by_extension(self, extension: str) -> ChunkingStrategy:
+        """
+        Get the chunking strategy for a file based on its extension.
+
+        Args:
+            extension: File extension (with or without leading dot)
+
+        Returns:
+            ChunkingStrategy enum value for the file type
+
+        Raises:
+            KeyError: If the extension is not supported
+        """
+        file_type_info = self.get_file_type_info(extension)
+        return file_type_info.chunking_strategy
+
+    def get_chunking_strategy_by_mime_type(self, mime_type: str) -> ChunkingStrategy:
+        """
+        Get the chunking strategy for a file based on its MIME type.
+
+        Args:
+            mime_type: MIME type of the file
+
+        Returns:
+            ChunkingStrategy enum value for the file type, or LINE_BASED if not found
+        """
+        for file_type in self._file_types.values():
+            if file_type.mime_type == mime_type:
+                return file_type.chunking_strategy
+        return ChunkingStrategy.LINE_BASED
+

 # Global registry instance
 file_type_registry = FileTypeRegistry()
--- a/letta/services/tool_executor/files_tool_executor.py
+++ b/letta/services/tool_executor/files_tool_executor.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple

 from letta.log import get_logger
 from letta.schemas.agent import AgentState
+from letta.schemas.file import FileMetadata
 from letta.schemas.sandbox_config import SandboxConfig
 from letta.schemas.tool import Tool
 from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -119,7 +120,7 @@ class LettaFileToolExecutor(ToolExecutor):
        # TODO: Inefficient, maybe we can pre-compute this
        # TODO: This is also not the best way to split things - would be cool to have "content aware" splitting
        # TODO: Split code differently from large text blurbs
-        content_lines = LineChunker().chunk_text(text=file.content, start=start, end=end)
+        content_lines = LineChunker().chunk_text(text=file.content, file_metadata=file, start=start, end=end)
        visible_content = "\n".join(content_lines)

        await self.files_agents_manager.update_file_agent_by_id(
@@ -146,14 +147,14 @@ class LettaFileToolExecutor(ToolExecutor):
        except re.error as e:
            raise ValueError(f"Invalid regex pattern: {e}")

-    def _get_context_lines(self, text: str, match_line_idx: int, total_lines: int) -> List[str]:
+    def _get_context_lines(self, text: str, file_metadata: FileMetadata, match_line_idx: int, total_lines: int) -> List[str]:
        """Get context lines around a match using LineChunker."""
        start_idx = max(0, match_line_idx - self.MAX_CONTEXT_LINES)
        end_idx = min(total_lines, match_line_idx + self.MAX_CONTEXT_LINES + 1)

        # Use LineChunker to get formatted lines with numbers
        chunker = LineChunker()
-        context_lines = chunker.chunk_text(text, start=start_idx, end=end_idx, add_metadata=False)
+        context_lines = chunker.chunk_text(text, file_metadata=file_metadata, start=start_idx, end=end_idx, add_metadata=False)

        # Add match indicator
        formatted_lines = []
@@ -268,7 +269,7 @@ class LettaFileToolExecutor(ToolExecutor):

                # Use LineChunker to get all lines with proper formatting
                chunker = LineChunker()
-                formatted_lines = chunker.chunk_text(file.content)
+                formatted_lines = chunker.chunk_text(file.content, file_metadata=file)

                # Remove metadata header
                if formatted_lines and formatted_lines[0].startswith("[Viewing"):
@@ -295,7 +296,7 @@ class LettaFileToolExecutor(ToolExecutor):

                        if pattern_regex.search(line_content):
                            # Get context around the match (convert back to 0-based indexing)
-                            context_lines = self._get_context_lines(file.content, line_num - 1, len(file.content.splitlines()))
+                            context_lines = self._get_context_lines(file.content, file, line_num - 1, len(file.content.splitlines()))

                            # Format the match result
                            match_header = f"\n=== {file.file_name}:{line_num} ==="
--- a/tests/data/0_to_99.py
+++ b/tests/data/0_to_99.py
@@ -0,0 +1,100 @@
+x0 = 0
+x1 = 1
+x2 = 2
+x3 = 3
+x4 = 4
+x5 = 5
+x6 = 6
+x7 = 7
+x8 = 8
+x9 = 9
+x10 = 10
+x11 = 11
+x12 = 12
+x13 = 13
+x14 = 14
+x15 = 15
+x16 = 16
+x17 = 17
+x18 = 18
+x19 = 19
+x20 = 20
+x21 = 21
+x22 = 22
+x23 = 23
+x24 = 24
+x25 = 25
+x26 = 26
+x27 = 27
+x28 = 28
+x29 = 29
+x30 = 30
+x31 = 31
+x32 = 32
+x33 = 33
+x34 = 34
+x35 = 35
+x36 = 36
+x37 = 37
+x38 = 38
+x39 = 39
+x40 = 40
+x41 = 41
+x42 = 42
+x43 = 43
+x44 = 44
+x45 = 45
+x46 = 46
+x47 = 47
+x48 = 48
+x49 = 49
+x50 = 50
+x51 = 51
+x52 = 52
+x53 = 53
+x54 = 54
+x55 = 55
+x56 = 56
+x57 = 57
+x58 = 58
+x59 = 59
+x60 = 60
+x61 = 61
+x62 = 62
+x63 = 63
+x64 = 64
+x65 = 65
+x66 = 66
+x67 = 67
+x68 = 68
+x69 = 69
+x70 = 70
+x71 = 71
+x72 = 72
+x73 = 73
+x74 = 74
+x75 = 75
+x76 = 76
+x77 = 77
+x78 = 78
+x79 = 79
+x80 = 80
+x81 = 81
+x82 = 82
+x83 = 83
+x84 = 84
+x85 = 85
+x86 = 86
+x87 = 87
+x88 = 88
+x89 = 89
+x90 = 90
+x91 = 91
+x92 = 92
+x93 = 93
+x94 = 94
+x95 = 95
+x96 = 96
+x97 = 97
+x98 = 98
+x99 = 99
--- a/tests/data/lines_1_to_100.txt
+++ b/tests/data/lines_1_to_100.txt
@@ -1,100 +0,0 @@
-Line 1
-Line 2
-Line 3
-Line 4
-Line 5
-Line 6
-Line 7
-Line 8
-Line 9
-Line 10
-Line 11
-Line 12
-Line 13
-Line 14
-Line 15
-Line 16
-Line 17
-Line 18
-Line 19
-Line 20
-Line 21
-Line 22
-Line 23
-Line 24
-Line 25
-Line 26
-Line 27
-Line 28
-Line 29
-Line 30
-Line 31
-Line 32
-Line 33
-Line 34
-Line 35
-Line 36
-Line 37
-Line 38
-Line 39
-Line 40
-Line 41
-Line 42
-Line 43
-Line 44
-Line 45
-Line 46
-Line 47
-Line 48
-Line 49
-Line 50
-Line 51
-Line 52
-Line 53
-Line 54
-Line 55
-Line 56
-Line 57
-Line 58
-Line 59
-Line 60
-Line 61
-Line 62
-Line 63
-Line 64
-Line 65
-Line 66
-Line 67
-Line 68
-Line 69
-Line 70
-Line 71
-Line 72
-Line 73
-Line 74
-Line 75
-Line 76
-Line 77
-Line 78
-Line 79
-Line 80
-Line 81
-Line 82
-Line 83
-Line 84
-Line 85
-Line 86
-Line 87
-Line 88
-Line 89
-Line 90
-Line 91
-Line 92
-Line 93
-Line 94
-Line 95
-Line 96
-Line 97
-Line 98
-Line 99
-Line 100
--- a/tests/test_sources.py
+++ b/tests/test_sources.py
@@ -499,7 +499,7 @@ def test_view_ranges_have_metadata(client: LettaSDKClient, agent_state: AgentSta
    client.agents.sources.attach(source_id=source.id, agent_id=agent_state.id)

    # Load files into the source
-    file_path = "tests/data/lines_1_to_100.txt"
+    file_path = "tests/data/0_to_99.py"

    # Upload the files
    with open(file_path, "rb") as f:
@@ -548,10 +548,10 @@ def test_view_ranges_have_metadata(client: LettaSDKClient, agent_state: AgentSta
        block.value
        == """
    [Viewing lines 50 to 54 (out of 100 lines)]
-50: Line 51
-51: Line 52
-52: Line 53
-53: Line 54
-54: Line 55
+50: x50 = 50
+51: x51 = 51
+52: x52 = 52
+53: x53 = 53
+54: x54 = 54
    """.strip()
    )