From 723d44d816082b19ef73b87d72f61b023ad6ab4a Mon Sep 17 00:00:00 2001 From: Matthew Zhou Date: Mon, 14 Jul 2025 10:50:15 -0700 Subject: [PATCH] feat: Fix empty splitlines bug (#3317) --- .../file_processor/chunker/line_chunker.py | 17 +++++++++++++++++ tests/test_utils.py | 5 ++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/letta/services/file_processor/chunker/line_chunker.py b/letta/services/file_processor/chunker/line_chunker.py index c06f024b..fe5ed031 100644 --- a/letta/services/file_processor/chunker/line_chunker.py +++ b/letta/services/file_processor/chunker/line_chunker.py @@ -40,6 +40,10 @@ class LineChunker: def _chunk_by_lines(self, text: str, preserve_indentation: bool = False) -> List[str]: """Traditional line-based chunking for code and structured data""" + # early stop, can happen if there's nothing in a specific file + if not text: + return [] + lines = [] for line in text.splitlines(): if preserve_indentation: @@ -57,6 +61,10 @@ class LineChunker: def _chunk_by_sentences(self, text: str) -> List[str]: """Sentence-based chunking for documentation and markup""" + # early stop, can happen if there's nothing in a specific file + if not text: + return [] + # Simple sentence splitting on periods, exclamation marks, and question marks # followed by whitespace or end of string sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])" @@ -75,6 +83,10 @@ class LineChunker: def _chunk_by_characters(self, text: str, target_line_length: int = 100) -> List[str]: """Character-based wrapping for prose text""" + # early stop, can happen if there's nothing in a specific file + if not text: + return [] + words = text.split() lines = [] current_line = [] @@ -110,6 +122,11 @@ class LineChunker: strategy = self._determine_chunking_strategy(file_metadata) text = file_metadata.content + # early stop, can happen if there's nothing in a specific file + if not text: + logger.warning(f"File ({file_metadata}) has no content") + return [] + # Apply the
appropriate chunking strategy if strategy == ChunkingStrategy.DOCUMENTATION: content_lines = self._chunk_by_sentences(text) diff --git a/tests/test_utils.py b/tests/test_utils.py index 5b0e724c..5da46f61 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -464,9 +464,8 @@ def test_line_chunker_edge_case_empty_file(): file = FileMetadata(file_name="empty.py", source_id="test_source", content="") chunker = LineChunker() - # Test requesting lines from empty file - with pytest.raises(ValueError, match="File empty.py has only 0 lines, but requested offset 1 is out of range"): - chunker.chunk_text(file, start=0, end=1, validate_range=True) + # no error + chunker.chunk_text(file, start=0, end=1, validate_range=True) def test_line_chunker_edge_case_single_line():