From 723d44d816082b19ef73b87d72f61b023ad6ab4a Mon Sep 17 00:00:00 2001
From: Matthew Zhou <mattzh1314@gmail.com>
Date: Mon, 14 Jul 2025 10:50:15 -0700
Subject: [PATCH] feat: Fix empty splitlines bug (#3317)

---
 .../file_processor/chunker/line_chunker.py      | 17 +++++++++++++++++
 tests/test_utils.py                             |  5 ++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/letta/services/file_processor/chunker/line_chunker.py b/letta/services/file_processor/chunker/line_chunker.py
index c06f024b..fe5ed031 100644
--- a/letta/services/file_processor/chunker/line_chunker.py
+++ b/letta/services/file_processor/chunker/line_chunker.py
@@ -40,6 +40,10 @@ class LineChunker:
 
     def _chunk_by_lines(self, text: str, preserve_indentation: bool = False) -> List[str]:
         """Traditional line-based chunking for code and structured data"""
+        # early stop, can happen if there's nothing in a specific file
+        if not text:
+            return []
+
         lines = []
         for line in text.splitlines():
             if preserve_indentation:
@@ -57,6 +61,10 @@ class LineChunker:
 
     def _chunk_by_sentences(self, text: str) -> List[str]:
         """Sentence-based chunking for documentation and markup"""
+        # early stop, can happen if there's nothing in a specific file
+        if not text:
+            return []
+
         # Simple sentence splitting on periods, exclamation marks, and question marks
         # followed by whitespace or end of string
         sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])"
@@ -75,6 +83,10 @@ class LineChunker:
 
     def _chunk_by_characters(self, text: str, target_line_length: int = 100) -> List[str]:
         """Character-based wrapping for prose text"""
+        # early stop, can happen if there's nothing in a specific file
+        if not text:
+            return []
+
         words = text.split()
         lines = []
         current_line = []
@@ -110,6 +122,11 @@ class LineChunker:
         strategy = self._determine_chunking_strategy(file_metadata)
         text = file_metadata.content
 
+        # early stop, can happen if there's nothing in a specific file
+        if not text:
+            logger.warning(f"File ({file_metadata}) has no content")
+            return []
+
         # Apply the appropriate chunking strategy
         if strategy == ChunkingStrategy.DOCUMENTATION:
             content_lines = self._chunk_by_sentences(text)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5b0e724c..5da46f61 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -464,9 +464,8 @@ def test_line_chunker_edge_case_empty_file():
     file = FileMetadata(file_name="empty.py", source_id="test_source", content="")
     chunker = LineChunker()
 
-    # Test requesting lines from empty file
-    with pytest.raises(ValueError, match="File empty.py has only 0 lines, but requested offset 1 is out of range"):
-        chunker.chunk_text(file, start=0, end=1, validate_range=True)
+    # Requesting lines from an empty file should not raise an error
+    chunker.chunk_text(file, start=0, end=1, validate_range=True)
 
 
 def test_line_chunker_edge_case_single_line():