From 951773d0ec04e5fb7947f200947b8a1e45f6e2a3 Mon Sep 17 00:00:00 2001
From: Matthew Zhou <mattzh1314@gmail.com>
Date: Mon, 9 Jun 2025 13:03:25 -0700
Subject: [PATCH] feat: Add content aware line chunking (#2707)

---
 .../file_processor/chunker/line_chunker.py    | 127 +++++++++++++--
 .../services/file_processor/file_processor.py |   2 +-
 letta/services/file_processor/file_types.py   | 153 ++++++++++++------
 .../tool_executor/files_tool_executor.py      |  11 +-
 tests/data/0_to_99.py                         | 100 ++++++++++++
 tests/data/lines_1_to_100.txt                 | 100 ------------
 tests/test_sources.py                         |  12 +-
 7 files changed, 333 insertions(+), 172 deletions(-)
 create mode 100644 tests/data/0_to_99.py
 delete mode 100644 tests/data/lines_1_to_100.txt

diff --git a/letta/services/file_processor/chunker/line_chunker.py b/letta/services/file_processor/chunker/line_chunker.py
index 8a5ef1ea..9bf65bea 100644
--- a/letta/services/file_processor/chunker/line_chunker.py
+++ b/letta/services/file_processor/chunker/line_chunker.py
@@ -1,34 +1,139 @@
+import re
 from typing import List, Optional
 
 from letta.log import get_logger
+from letta.schemas.file import FileMetadata
+from letta.services.file_processor.file_types import ChunkingStrategy, file_type_registry
 
 logger = get_logger(__name__)
 
 
 class LineChunker:
-    """Newline chunker"""
+    """Content-aware line chunker that adapts chunking strategy based on file type"""
 
     def __init__(self):
-        pass
+        self.file_type_registry = file_type_registry
 
-    def chunk_text(self, text: str, start: Optional[int] = None, end: Optional[int] = None, add_metadata: bool = True) -> List[str]:
-        """Split lines"""
-        content_lines = [line.strip() for line in text.splitlines() if line.strip()]
-        total_lines = len(content_lines)
+    def _determine_chunking_strategy(self, file_metadata: FileMetadata) -> ChunkingStrategy:
+        """Determine the best chunking strategy based on file metadata"""
+        # Try to get strategy from MIME type first
+        if file_metadata.file_type:
+            try:
+                return self.file_type_registry.get_chunking_strategy_by_mime_type(file_metadata.file_type)
+            except Exception:
+                pass
 
-        if start and end:
+        # Fallback to filename extension. NOTE: reached only when file_type is unset/empty; the MIME lookup never raises on a miss
+        if file_metadata.file_name:
+            try:
+                # Extract extension from filename
+                import os
+
+                _, ext = os.path.splitext(file_metadata.file_name)
+                if ext:
+                    return self.file_type_registry.get_chunking_strategy_by_extension(ext)
+            except Exception:
+                pass
+
+        # Default fallback
+        return ChunkingStrategy.LINE_BASED
+
+    def _chunk_by_lines(self, text: str, preserve_indentation: bool = False) -> List[str]:
+        """Traditional line-based chunking for code and structured data"""
+        lines = []
+        for line in text.splitlines():
+            if preserve_indentation:
+                # For code: preserve leading whitespace (indentation), remove trailing whitespace
+                line = line.rstrip()
+                # Skip lines that are empty after stripping (blank or whitespace-only)
+                if line:
+                    lines.append(line)
+            else:
+                # For structured data: strip leading and trailing whitespace
+                line = line.strip()
+                if line:
+                    lines.append(line)
+        return lines
+
+    def _chunk_by_sentences(self, text: str) -> List[str]:
+        """Sentence-based chunking for documentation and markup"""
+        # Simple sentence splitting: break on whitespace that follows a period,
+        # exclamation mark, or question mark and precedes an uppercase ASCII letter
+        sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
+
+        # Split text into sentences
+        sentences = re.split(sentence_pattern, text.strip())
+
+        # Clean up sentences - remove extra whitespace and empty sentences
+        cleaned_sentences = []
+        for sentence in sentences:
+            sentence = re.sub(r"\s+", " ", sentence.strip())  # Normalize whitespace
+            if sentence:
+                cleaned_sentences.append(sentence)
+
+        return cleaned_sentences
+
+    def _chunk_by_characters(self, text: str, target_line_length: int = 100) -> List[str]:
+        """Word-wrap prose to roughly target_line_length characters per line (original line breaks are discarded)"""
+        words = text.split()
+        lines = []
+        current_line = []
+        current_length = 0
+
+        for word in words:
+            # Check if adding this word would exceed the target length
+            word_length = len(word)
+            if current_length + word_length + len(current_line) > target_line_length and current_line:
+                # Start a new line
+                lines.append(" ".join(current_line))
+                current_line = [word]
+                current_length = word_length
+            else:
+                current_line.append(word)
+                current_length += word_length
+
+        # Add the last line if there's content
+        if current_line:
+            lines.append(" ".join(current_line))
+
+        return [line for line in lines if line.strip()]
+
+    def chunk_text(
+        self, text: str, file_metadata: FileMetadata, start: Optional[int] = None, end: Optional[int] = None, add_metadata: bool = True
+    ) -> List[str]:
+        """Content-aware text chunking based on file type"""
+        strategy = self._determine_chunking_strategy(file_metadata)
+
+        # Apply the appropriate chunking strategy
+        if strategy == ChunkingStrategy.DOCUMENTATION:
+            content_lines = self._chunk_by_sentences(text)
+        elif strategy == ChunkingStrategy.PROSE:
+            content_lines = self._chunk_by_characters(text)
+        elif strategy == ChunkingStrategy.CODE:
+            content_lines = self._chunk_by_lines(text, preserve_indentation=True)
+        else:  # STRUCTURED_DATA or LINE_BASED
+            content_lines = self._chunk_by_lines(text, preserve_indentation=False)
+
+        total_chunks = len(content_lines)
+
+        # Handle start/end slicing
+        if start is not None and end is not None:
             content_lines = content_lines[start:end]
             line_offset = start
         else:
             line_offset = 0
 
+        # Add line numbers for all strategies
         content_lines = [f"{i + line_offset}: {line}" for i, line in enumerate(content_lines)]
 
-        # Add metadata about total lines
+        # Add metadata about total chunks
         if add_metadata:
-            if start and end:
-                content_lines.insert(0, f"[Viewing lines {start} to {end-1} (out of {total_lines} lines)]")
+            chunk_type = (
+                "sentences" if strategy == ChunkingStrategy.DOCUMENTATION else "chunks" if strategy == ChunkingStrategy.PROSE else "lines"
+            )
+            if start is not None and end is not None:
+                content_lines.insert(0, f"[Viewing {chunk_type} {start} to {end-1} (out of {total_chunks} {chunk_type})]")
             else:
-                content_lines.insert(0, f"[Viewing file start (out of {total_lines} lines)]")
+                content_lines.insert(0, f"[Viewing file start (out of {total_chunks} {chunk_type})]")
 
         return content_lines
diff --git a/letta/services/file_processor/file_processor.py b/letta/services/file_processor/file_processor.py
index 20e2bf50..36cb8b7c 100644
--- a/letta/services/file_processor/file_processor.py
+++ b/letta/services/file_processor/file_processor.py
@@ -82,7 +82,7 @@ class FileProcessor:
 
             # Insert to agent context window
             # TODO: Rethink this line chunking mechanism
-            content_lines = self.line_chunker.chunk_text(text=raw_markdown_text)
+            content_lines = self.line_chunker.chunk_text(text=raw_markdown_text, file_metadata=file_metadata)
             visible_content = "\n".join(content_lines)
 
             await server.insert_file_into_context_windows(
diff --git a/letta/services/file_processor/file_types.py b/letta/services/file_processor/file_types.py
index ee24587c..34be23f0 100644
--- a/letta/services/file_processor/file_types.py
+++ b/letta/services/file_processor/file_types.py
@@ -7,9 +7,20 @@ mime types, and file processing capabilities across the Letta codebase.
 
 import mimetypes
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, Set
 
 
+class ChunkingStrategy(str, Enum):
+    """Enum for different file chunking strategies."""
+
+    CODE = "code"  # Line-based chunking for code files
+    STRUCTURED_DATA = "structured_data"  # Line-based chunking for JSON, XML, etc.
+    DOCUMENTATION = "documentation"  # Sentence-based chunking for Markdown
+    PROSE = "prose"  # Character-based wrapping for plain text
+    LINE_BASED = "line_based"  # Default line-based chunking
+
+
 @dataclass
 class FileTypeInfo:
     """Information about a supported file type."""
@@ -18,6 +29,7 @@ class FileTypeInfo:
     mime_type: str
     is_simple_text: bool
     description: str
+    chunking_strategy: ChunkingStrategy = ChunkingStrategy.LINE_BASED
 
 
 class FileTypeRegistry:
@@ -31,63 +43,70 @@ class FileTypeRegistry:
     def _register_default_types(self) -> None:
         """Register all default supported file types."""
         # Document formats
-        self.register(".pdf", "application/pdf", False, "PDF document")
-        self.register(".txt", "text/plain", True, "Plain text file")
-        self.register(".md", "text/markdown", True, "Markdown document")
-        self.register(".markdown", "text/markdown", True, "Markdown document")
-        self.register(".json", "application/json", True, "JSON data file")
-        self.register(".jsonl", "application/jsonl", True, "JSON Lines file")
+        self.register(".pdf", "application/pdf", False, "PDF document", ChunkingStrategy.LINE_BASED)
+        self.register(".txt", "text/plain", True, "Plain text file", ChunkingStrategy.PROSE)
+        self.register(".md", "text/markdown", True, "Markdown document", ChunkingStrategy.DOCUMENTATION)
+        self.register(".markdown", "text/markdown", True, "Markdown document", ChunkingStrategy.DOCUMENTATION)
+        self.register(".json", "application/json", True, "JSON data file", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".jsonl", "application/jsonl", True, "JSON Lines file", ChunkingStrategy.STRUCTURED_DATA)
 
         # Programming languages
-        self.register(".py", "text/x-python", True, "Python source code")
-        self.register(".js", "text/javascript", True, "JavaScript source code")
-        self.register(".ts", "text/x-typescript", True, "TypeScript source code")
-        self.register(".java", "text/x-java-source", True, "Java source code")
-        self.register(".cpp", "text/x-c++", True, "C++ source code")
-        self.register(".cxx", "text/x-c++", True, "C++ source code")
-        self.register(".c", "text/x-c", True, "C source code")
-        self.register(".h", "text/x-c", True, "C/C++ header file")
-        self.register(".cs", "text/x-csharp", True, "C# source code")
-        self.register(".php", "text/x-php", True, "PHP source code")
-        self.register(".rb", "text/x-ruby", True, "Ruby source code")
-        self.register(".go", "text/x-go", True, "Go source code")
-        self.register(".rs", "text/x-rust", True, "Rust source code")
-        self.register(".swift", "text/x-swift", True, "Swift source code")
-        self.register(".kt", "text/x-kotlin", True, "Kotlin source code")
-        self.register(".scala", "text/x-scala", True, "Scala source code")
-        self.register(".r", "text/x-r", True, "R source code")
-        self.register(".m", "text/x-objective-c", True, "Objective-C source code")
+        self.register(".py", "text/x-python", True, "Python source code", ChunkingStrategy.CODE)
+        self.register(".js", "text/javascript", True, "JavaScript source code", ChunkingStrategy.CODE)
+        self.register(".ts", "text/x-typescript", True, "TypeScript source code", ChunkingStrategy.CODE)
+        self.register(".java", "text/x-java-source", True, "Java source code", ChunkingStrategy.CODE)
+        self.register(".cpp", "text/x-c++", True, "C++ source code", ChunkingStrategy.CODE)
+        self.register(".cxx", "text/x-c++", True, "C++ source code", ChunkingStrategy.CODE)
+        self.register(".c", "text/x-c", True, "C source code", ChunkingStrategy.CODE)
+        self.register(".h", "text/x-c", True, "C/C++ header file", ChunkingStrategy.CODE)
+        self.register(".cs", "text/x-csharp", True, "C# source code", ChunkingStrategy.CODE)
+        self.register(".php", "text/x-php", True, "PHP source code", ChunkingStrategy.CODE)
+        self.register(".rb", "text/x-ruby", True, "Ruby source code", ChunkingStrategy.CODE)
+        self.register(".go", "text/x-go", True, "Go source code", ChunkingStrategy.CODE)
+        self.register(".rs", "text/x-rust", True, "Rust source code", ChunkingStrategy.CODE)
+        self.register(".swift", "text/x-swift", True, "Swift source code", ChunkingStrategy.CODE)
+        self.register(".kt", "text/x-kotlin", True, "Kotlin source code", ChunkingStrategy.CODE)
+        self.register(".scala", "text/x-scala", True, "Scala source code", ChunkingStrategy.CODE)
+        self.register(".r", "text/x-r", True, "R source code", ChunkingStrategy.CODE)
+        self.register(".m", "text/x-objective-c", True, "Objective-C source code", ChunkingStrategy.CODE)
 
         # Web technologies
-        self.register(".html", "text/html", True, "HTML document")
-        self.register(".htm", "text/html", True, "HTML document")
-        self.register(".css", "text/css", True, "CSS stylesheet")
-        self.register(".scss", "text/x-scss", True, "SCSS stylesheet")
-        self.register(".sass", "text/x-sass", True, "Sass stylesheet")
-        self.register(".less", "text/x-less", True, "Less stylesheet")
-        self.register(".vue", "text/x-vue", True, "Vue.js component")
-        self.register(".jsx", "text/x-jsx", True, "JSX source code")
-        self.register(".tsx", "text/x-tsx", True, "TSX source code")
+        self.register(".html", "text/html", True, "HTML document", ChunkingStrategy.CODE)
+        self.register(".htm", "text/html", True, "HTML document", ChunkingStrategy.CODE)
+        self.register(".css", "text/css", True, "CSS stylesheet", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".scss", "text/x-scss", True, "SCSS stylesheet", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".sass", "text/x-sass", True, "Sass stylesheet", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".less", "text/x-less", True, "Less stylesheet", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".vue", "text/x-vue", True, "Vue.js component", ChunkingStrategy.CODE)
+        self.register(".jsx", "text/x-jsx", True, "JSX source code", ChunkingStrategy.CODE)
+        self.register(".tsx", "text/x-tsx", True, "TSX source code", ChunkingStrategy.CODE)
 
         # Configuration and data formats
-        self.register(".xml", "application/xml", True, "XML document")
-        self.register(".yaml", "text/x-yaml", True, "YAML configuration")
-        self.register(".yml", "text/x-yaml", True, "YAML configuration")
-        self.register(".toml", "application/toml", True, "TOML configuration")
-        self.register(".ini", "text/x-ini", True, "INI configuration")
-        self.register(".cfg", "text/x-conf", True, "Configuration file")
-        self.register(".conf", "text/x-conf", True, "Configuration file")
+        self.register(".xml", "application/xml", True, "XML document", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".yaml", "text/x-yaml", True, "YAML configuration", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".yml", "text/x-yaml", True, "YAML configuration", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".toml", "application/toml", True, "TOML configuration", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".ini", "text/x-ini", True, "INI configuration", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".cfg", "text/x-conf", True, "Configuration file", ChunkingStrategy.STRUCTURED_DATA)
+        self.register(".conf", "text/x-conf", True, "Configuration file", ChunkingStrategy.STRUCTURED_DATA)
 
         # Scripts and SQL
-        self.register(".sh", "text/x-shellscript", True, "Shell script")
-        self.register(".bash", "text/x-shellscript", True, "Bash script")
-        self.register(".ps1", "text/x-powershell", True, "PowerShell script")
-        self.register(".bat", "text/x-batch", True, "Batch script")
-        self.register(".cmd", "text/x-batch", True, "Command script")
-        self.register(".dockerfile", "text/x-dockerfile", True, "Dockerfile")
-        self.register(".sql", "text/x-sql", True, "SQL script")
+        self.register(".sh", "text/x-shellscript", True, "Shell script", ChunkingStrategy.CODE)
+        self.register(".bash", "text/x-shellscript", True, "Bash script", ChunkingStrategy.CODE)
+        self.register(".ps1", "text/x-powershell", True, "PowerShell script", ChunkingStrategy.CODE)
+        self.register(".bat", "text/x-batch", True, "Batch script", ChunkingStrategy.CODE)
+        self.register(".cmd", "text/x-batch", True, "Command script", ChunkingStrategy.CODE)
+        self.register(".dockerfile", "text/x-dockerfile", True, "Dockerfile", ChunkingStrategy.CODE)
+        self.register(".sql", "text/x-sql", True, "SQL script", ChunkingStrategy.CODE)
 
-    def register(self, extension: str, mime_type: str, is_simple_text: bool, description: str) -> None:
+    def register(
+        self,
+        extension: str,
+        mime_type: str,
+        is_simple_text: bool,
+        description: str,
+        chunking_strategy: ChunkingStrategy = ChunkingStrategy.LINE_BASED,
+    ) -> None:
         """
         Register a new file type.
 
@@ -96,12 +115,17 @@ class FileTypeRegistry:
             mime_type: MIME type for the file
             is_simple_text: Whether this is a simple text file that can be read directly
             description: Human-readable description of the file type
+            chunking_strategy: Strategy for chunking this file type
         """
         if not extension.startswith("."):
             extension = f".{extension}"
 
         self._file_types[extension] = FileTypeInfo(
-            extension=extension, mime_type=mime_type, is_simple_text=is_simple_text, description=description
+            extension=extension,
+            mime_type=mime_type,
+            is_simple_text=is_simple_text,
+            description=description,
+            chunking_strategy=chunking_strategy,
         )
 
     def register_mime_types(self) -> None:
@@ -217,6 +241,37 @@ class FileTypeRegistry:
             extension = f".{extension}"
         return self._file_types[extension]
 
+    def get_chunking_strategy_by_extension(self, extension: str) -> ChunkingStrategy:
+        """
+        Get the chunking strategy for a file based on its extension.
+
+        Args:
+            extension: File extension (with or without leading dot)
+
+        Returns:
+            ChunkingStrategy enum value for the file type
+
+        Raises:
+            KeyError: If the extension is not supported
+        """
+        file_type_info = self.get_file_type_info(extension)
+        return file_type_info.chunking_strategy
+
+    def get_chunking_strategy_by_mime_type(self, mime_type: str) -> ChunkingStrategy:
+        """
+        Get the chunking strategy for a file based on its MIME type.
+
+        Args:
+            mime_type: MIME type of the file
+
+        Returns:
+            ChunkingStrategy enum value for the file type, or LINE_BASED if not found
+        """
+        for file_type in self._file_types.values():
+            if file_type.mime_type == mime_type:
+                return file_type.chunking_strategy
+        return ChunkingStrategy.LINE_BASED
+
 
 # Global registry instance
 file_type_registry = FileTypeRegistry()
diff --git a/letta/services/tool_executor/files_tool_executor.py b/letta/services/tool_executor/files_tool_executor.py
index ceed535f..53979312 100644
--- a/letta/services/tool_executor/files_tool_executor.py
+++ b/letta/services/tool_executor/files_tool_executor.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple
 
 from letta.log import get_logger
 from letta.schemas.agent import AgentState
+from letta.schemas.file import FileMetadata
 from letta.schemas.sandbox_config import SandboxConfig
 from letta.schemas.tool import Tool
 from letta.schemas.tool_execution_result import ToolExecutionResult
@@ -119,7 +120,7 @@ class LettaFileToolExecutor(ToolExecutor):
         # TODO: Inefficient, maybe we can pre-compute this
         # TODO: This is also not the best way to split things - would be cool to have "content aware" splitting
         # TODO: Split code differently from large text blurbs
-        content_lines = LineChunker().chunk_text(text=file.content, start=start, end=end)
+        content_lines = LineChunker().chunk_text(text=file.content, file_metadata=file, start=start, end=end)
         visible_content = "\n".join(content_lines)
 
         await self.files_agents_manager.update_file_agent_by_id(
@@ -146,14 +147,14 @@ class LettaFileToolExecutor(ToolExecutor):
         except re.error as e:
             raise ValueError(f"Invalid regex pattern: {e}")
 
-    def _get_context_lines(self, text: str, match_line_idx: int, total_lines: int) -> List[str]:
+    def _get_context_lines(self, text: str, file_metadata: FileMetadata, match_line_idx: int, total_lines: int) -> List[str]:
         """Get context lines around a match using LineChunker."""
         start_idx = max(0, match_line_idx - self.MAX_CONTEXT_LINES)
         end_idx = min(total_lines, match_line_idx + self.MAX_CONTEXT_LINES + 1)
 
         # Use LineChunker to get formatted lines with numbers
         chunker = LineChunker()
-        context_lines = chunker.chunk_text(text, start=start_idx, end=end_idx, add_metadata=False)
+        context_lines = chunker.chunk_text(text, file_metadata=file_metadata, start=start_idx, end=end_idx, add_metadata=False)
 
         # Add match indicator
         formatted_lines = []
@@ -268,7 +269,7 @@ class LettaFileToolExecutor(ToolExecutor):
 
                 # Use LineChunker to get all lines with proper formatting
                 chunker = LineChunker()
-                formatted_lines = chunker.chunk_text(file.content)
+                formatted_lines = chunker.chunk_text(file.content, file_metadata=file)
 
                 # Remove metadata header
                 if formatted_lines and formatted_lines[0].startswith("[Viewing"):
@@ -295,7 +296,7 @@ class LettaFileToolExecutor(ToolExecutor):
 
                         if pattern_regex.search(line_content):
                             # Get context around the match (convert back to 0-based indexing)
-                            context_lines = self._get_context_lines(file.content, line_num - 1, len(file.content.splitlines()))
+                            context_lines = self._get_context_lines(file.content, file, line_num - 1, len(file.content.splitlines()))
 
                             # Format the match result
                             match_header = f"\n=== {file.file_name}:{line_num} ==="
diff --git a/tests/data/0_to_99.py b/tests/data/0_to_99.py
new file mode 100644
index 00000000..e8819bad
--- /dev/null
+++ b/tests/data/0_to_99.py
@@ -0,0 +1,100 @@
+x0 = 0
+x1 = 1
+x2 = 2
+x3 = 3
+x4 = 4
+x5 = 5
+x6 = 6
+x7 = 7
+x8 = 8
+x9 = 9
+x10 = 10
+x11 = 11
+x12 = 12
+x13 = 13
+x14 = 14
+x15 = 15
+x16 = 16
+x17 = 17
+x18 = 18
+x19 = 19
+x20 = 20
+x21 = 21
+x22 = 22
+x23 = 23
+x24 = 24
+x25 = 25
+x26 = 26
+x27 = 27
+x28 = 28
+x29 = 29
+x30 = 30
+x31 = 31
+x32 = 32
+x33 = 33
+x34 = 34
+x35 = 35
+x36 = 36
+x37 = 37
+x38 = 38
+x39 = 39
+x40 = 40
+x41 = 41
+x42 = 42
+x43 = 43
+x44 = 44
+x45 = 45
+x46 = 46
+x47 = 47
+x48 = 48
+x49 = 49
+x50 = 50
+x51 = 51
+x52 = 52
+x53 = 53
+x54 = 54
+x55 = 55
+x56 = 56
+x57 = 57
+x58 = 58
+x59 = 59
+x60 = 60
+x61 = 61
+x62 = 62
+x63 = 63
+x64 = 64
+x65 = 65
+x66 = 66
+x67 = 67
+x68 = 68
+x69 = 69
+x70 = 70
+x71 = 71
+x72 = 72
+x73 = 73
+x74 = 74
+x75 = 75
+x76 = 76
+x77 = 77
+x78 = 78
+x79 = 79
+x80 = 80
+x81 = 81
+x82 = 82
+x83 = 83
+x84 = 84
+x85 = 85
+x86 = 86
+x87 = 87
+x88 = 88
+x89 = 89
+x90 = 90
+x91 = 91
+x92 = 92
+x93 = 93
+x94 = 94
+x95 = 95
+x96 = 96
+x97 = 97
+x98 = 98
+x99 = 99
diff --git a/tests/data/lines_1_to_100.txt b/tests/data/lines_1_to_100.txt
deleted file mode 100644
index b9ed43de..00000000
--- a/tests/data/lines_1_to_100.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-Line 1
-Line 2
-Line 3
-Line 4
-Line 5
-Line 6
-Line 7
-Line 8
-Line 9
-Line 10
-Line 11
-Line 12
-Line 13
-Line 14
-Line 15
-Line 16
-Line 17
-Line 18
-Line 19
-Line 20
-Line 21
-Line 22
-Line 23
-Line 24
-Line 25
-Line 26
-Line 27
-Line 28
-Line 29
-Line 30
-Line 31
-Line 32
-Line 33
-Line 34
-Line 35
-Line 36
-Line 37
-Line 38
-Line 39
-Line 40
-Line 41
-Line 42
-Line 43
-Line 44
-Line 45
-Line 46
-Line 47
-Line 48
-Line 49
-Line 50
-Line 51
-Line 52
-Line 53
-Line 54
-Line 55
-Line 56
-Line 57
-Line 58
-Line 59
-Line 60
-Line 61
-Line 62
-Line 63
-Line 64
-Line 65
-Line 66
-Line 67
-Line 68
-Line 69
-Line 70
-Line 71
-Line 72
-Line 73
-Line 74
-Line 75
-Line 76
-Line 77
-Line 78
-Line 79
-Line 80
-Line 81
-Line 82
-Line 83
-Line 84
-Line 85
-Line 86
-Line 87
-Line 88
-Line 89
-Line 90
-Line 91
-Line 92
-Line 93
-Line 94
-Line 95
-Line 96
-Line 97
-Line 98
-Line 99
-Line 100
\ No newline at end of file
diff --git a/tests/test_sources.py b/tests/test_sources.py
index 53bf68ac..15580189 100644
--- a/tests/test_sources.py
+++ b/tests/test_sources.py
@@ -499,7 +499,7 @@ def test_view_ranges_have_metadata(client: LettaSDKClient, agent_state: AgentSta
     client.agents.sources.attach(source_id=source.id, agent_id=agent_state.id)
 
     # Load files into the source
-    file_path = "tests/data/lines_1_to_100.txt"
+    file_path = "tests/data/0_to_99.py"
 
     # Upload the files
     with open(file_path, "rb") as f:
@@ -548,10 +548,10 @@ def test_view_ranges_have_metadata(client: LettaSDKClient, agent_state: AgentSta
         block.value
         == """
     [Viewing lines 50 to 54 (out of 100 lines)]
-50: Line 51
-51: Line 52
-52: Line 53
-53: Line 54
-54: Line 55
+50: x50 = 50
+51: x51 = 51
+52: x52 = 52
+53: x53 = 53
+54: x54 = 54
     """.strip()
     )