fix: remove unused embedding generation (#9013)
* remove unused embedding generation
* prevent double embed
* fix embedding dimension comparison and ValueError
This commit is contained in:
@@ -7,6 +7,7 @@ from datetime import datetime, timezone
|
||||
from typing import Any, Callable, List, Optional, Tuple
|
||||
|
||||
from letta.constants import DEFAULT_EMBEDDING_CHUNK_SIZE
|
||||
from letta.errors import LettaInvalidArgumentError
|
||||
from letta.otel.tracing import trace_method
|
||||
from letta.schemas.embedding_config import EmbeddingConfig
|
||||
from letta.schemas.enums import MessageRole, TagMatchMode
|
||||
@@ -321,6 +322,7 @@ class TurbopufferClient:
|
||||
actor: "PydanticUser",
|
||||
tags: Optional[List[str]] = None,
|
||||
created_at: Optional[datetime] = None,
|
||||
embeddings: Optional[List[List[float]]] = None,
|
||||
) -> List[PydanticPassage]:
|
||||
"""Insert passages into Turbopuffer.
|
||||
|
||||
@@ -332,6 +334,7 @@ class TurbopufferClient:
|
||||
actor: User actor for embedding generation
|
||||
tags: Optional list of tags to attach to all passages
|
||||
created_at: Optional timestamp for retroactive entries (defaults to current UTC time)
|
||||
embeddings: Optional pre-computed embeddings (must match 1:1 with text_chunks). If provided, skips embedding generation.
|
||||
|
||||
Returns:
|
||||
List of PydanticPassage objects that were inserted
|
||||
@@ -345,9 +348,30 @@ class TurbopufferClient:
|
||||
logger.warning("All text chunks were empty, skipping insertion")
|
||||
return []
|
||||
|
||||
# generate embeddings using the default config
|
||||
filtered_texts = [text for _, text in filtered_chunks]
|
||||
embeddings = await self._generate_embeddings(filtered_texts, actor)
|
||||
|
||||
# use provided embeddings only if dimensions match TPUF's expected dimension
|
||||
use_provided_embeddings = False
|
||||
if embeddings is not None:
|
||||
if len(embeddings) != len(text_chunks):
|
||||
raise LettaInvalidArgumentError(
|
||||
f"embeddings length ({len(embeddings)}) must match text_chunks length ({len(text_chunks)})",
|
||||
argument_name="embeddings",
|
||||
)
|
||||
# check if first non-empty embedding has correct dimensions
|
||||
filtered_indices = [i for i, _ in filtered_chunks]
|
||||
sample_embedding = embeddings[filtered_indices[0]] if filtered_indices else None
|
||||
if sample_embedding is not None and len(sample_embedding) == self.default_embedding_config.embedding_dim:
|
||||
use_provided_embeddings = True
|
||||
filtered_embeddings = [embeddings[i] for i, _ in filtered_chunks]
|
||||
else:
|
||||
logger.debug(
|
||||
f"Embedding dimension mismatch (got {len(sample_embedding) if sample_embedding else 'None'}, "
|
||||
f"expected {self.default_embedding_config.embedding_dim}), regenerating embeddings"
|
||||
)
|
||||
|
||||
if not use_provided_embeddings:
|
||||
filtered_embeddings = await self._generate_embeddings(filtered_texts, actor)
|
||||
|
||||
namespace_name = await self._get_archive_namespace_name(archive_id)
|
||||
|
||||
@@ -379,7 +403,7 @@ class TurbopufferClient:
|
||||
tags_arrays = [] # Store tags as arrays
|
||||
passages = []
|
||||
|
||||
for (original_idx, text), embedding in zip(filtered_chunks, embeddings):
|
||||
for (original_idx, text), embedding in zip(filtered_chunks, filtered_embeddings):
|
||||
passage_id = passage_ids[original_idx]
|
||||
|
||||
# append to columns
|
||||
|
||||
@@ -2321,15 +2321,6 @@ class AgentManager:
|
||||
# Use Turbopuffer for vector search if archive is configured for TPUF
|
||||
if archive.vector_db_provider == VectorDBProvider.TPUF:
|
||||
from letta.helpers.tpuf_client import TurbopufferClient
|
||||
from letta.llm_api.llm_client import LLMClient
|
||||
|
||||
# Generate embedding for query
|
||||
embedding_client = LLMClient.create(
|
||||
provider_type=embedding_config.embedding_endpoint_type,
|
||||
actor=actor,
|
||||
)
|
||||
embeddings = await embedding_client.request_embeddings([query_text], embedding_config)
|
||||
query_embedding = embeddings[0]
|
||||
|
||||
# Query Turbopuffer - use hybrid search when text is available
|
||||
tpuf_client = TurbopufferClient()
|
||||
|
||||
@@ -345,13 +345,14 @@ class ArchiveManager:
|
||||
|
||||
tpuf_client = TurbopufferClient()
|
||||
|
||||
# Insert to Turbopuffer with the same ID as SQL
|
||||
# Insert to Turbopuffer with the same ID as SQL, reusing existing embedding
|
||||
await tpuf_client.insert_archival_memories(
|
||||
archive_id=archive.id,
|
||||
text_chunks=[created_passage.text],
|
||||
passage_ids=[created_passage.id],
|
||||
organization_id=actor.organization_id,
|
||||
actor=actor,
|
||||
embeddings=[created_passage.embedding],
|
||||
)
|
||||
logger.info(f"Uploaded passage {created_passage.id} to Turbopuffer for archive {archive_id}")
|
||||
except Exception as e:
|
||||
|
||||
@@ -525,20 +525,21 @@ class PassageManager:
|
||||
|
||||
tpuf_client = TurbopufferClient()
|
||||
|
||||
# Extract IDs and texts from the created passages
|
||||
# Extract IDs, texts, and embeddings from the created passages
|
||||
passage_ids = [p.id for p in passages]
|
||||
passage_texts = [p.text for p in passages]
|
||||
passage_embeddings = [p.embedding for p in passages]
|
||||
|
||||
# Insert to Turbopuffer with the same IDs as SQL
|
||||
# TurbopufferClient will generate embeddings internally using default config
|
||||
# Insert to Turbopuffer with the same IDs as SQL, reusing existing embeddings
|
||||
await tpuf_client.insert_archival_memories(
|
||||
archive_id=archive.id,
|
||||
text_chunks=passage_texts,
|
||||
passage_ids=passage_ids, # Use same IDs as SQL
|
||||
passage_ids=passage_ids,
|
||||
organization_id=actor.organization_id,
|
||||
actor=actor,
|
||||
tags=tags,
|
||||
created_at=passages[0].created_at if passages else None,
|
||||
embeddings=passage_embeddings,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to insert passages to Turbopuffer: {e}")
|
||||
|
||||
Reference in New Issue
Block a user