fix: Fix state transitions for file processing (#3541)

This commit is contained in:
Matthew Zhou
2025-07-24 13:48:08 -07:00
committed by GitHub
parent 410cb24ba9
commit be7f4be4d8
6 changed files with 142 additions and 16 deletions

View File

@@ -217,3 +217,77 @@ class TestOpenAIEmbedder:
assert passages[2].embedding[:2] == [0.3, 0.3]
assert passages[3].text == "chunk 4"
assert passages[3].embedding[:2] == [0.4, 0.4]
class TestFileProcessorWithPinecone:
"""Test suite for file processor with Pinecone integration"""
@pytest.mark.asyncio
async def test_file_processor_sets_chunks_embedded_zero_with_pinecone(self):
"""Test that file processor sets total_chunks and chunks_embedded=0 when using Pinecone"""
from letta.schemas.enums import FileProcessingStatus
from letta.schemas.file import FileMetadata
from letta.services.file_processor.embedder.pinecone_embedder import PineconeEmbedder
from letta.services.file_processor.file_processor import FileProcessor
from letta.services.file_processor.parser.markitdown_parser import MarkitdownFileParser
# Mock dependencies
mock_actor = Mock()
mock_actor.organization_id = "test_org"
# Create real parser
file_parser = MarkitdownFileParser()
# Create file metadata with content
mock_file = FileMetadata(
file_name="test.txt",
source_id="source-87654321",
processing_status=FileProcessingStatus.PARSING,
total_chunks=0,
chunks_embedded=0,
content="This is test content that will be chunked.",
)
# Mock only the Pinecone-specific functionality
with patch("letta.services.file_processor.embedder.pinecone_embedder.PINECONE_AVAILABLE", True):
with patch("letta.services.file_processor.embedder.pinecone_embedder.upsert_file_records_to_pinecone_index") as mock_upsert:
# Mock successful Pinecone upsert
mock_upsert.return_value = None
# Create real Pinecone embedder
embedder = PineconeEmbedder()
# Create file processor with Pinecone enabled
file_processor = FileProcessor(file_parser=file_parser, embedder=embedder, actor=mock_actor, using_pinecone=True)
# Track file manager update calls
update_calls = []
async def track_update(*args, **kwargs):
update_calls.append(kwargs)
return mock_file
# Mock managers to track calls
with patch.object(file_processor.file_manager, "update_file_status", new=track_update):
with patch.object(file_processor.passage_manager, "create_many_source_passages_async", new=AsyncMock()):
# Process the imported file (which has content)
await file_processor.process_imported_file(mock_file, mock_file.source_id)
# Find the call that sets total_chunks and chunks_embedded
chunk_update_call = None
for call in update_calls:
if "total_chunks" in call and "chunks_embedded" in call:
chunk_update_call = call
break
# Verify the correct values were set
assert chunk_update_call is not None, "No update_file_status call found with total_chunks and chunks_embedded"
assert chunk_update_call["total_chunks"] > 0, "total_chunks should be greater than 0"
assert chunk_update_call["chunks_embedded"] == 0, "chunks_embedded should be 0 when using Pinecone"
# Verify Pinecone upsert was called
mock_upsert.assert_called_once()
call_args = mock_upsert.call_args
assert call_args.kwargs["file_id"] == mock_file.id
assert call_args.kwargs["source_id"] == mock_file.source_id
assert len(call_args.kwargs["chunks"]) > 0