99 lines
4.1 KiB
Python
99 lines
4.1 KiB
Python
import time
|
|
from typing import List, Optional
|
|
|
|
from letta.helpers.tpuf_client import TurbopufferClient
|
|
from letta.log import get_logger
|
|
from letta.otel.tracing import log_event, trace_method
|
|
from letta.schemas.embedding_config import EmbeddingConfig
|
|
from letta.schemas.enums import VectorDBProvider
|
|
from letta.schemas.passage import Passage
|
|
from letta.schemas.user import User
|
|
from letta.services.file_processor.embedder.base_embedder import BaseEmbedder
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class TurbopufferEmbedder(BaseEmbedder):
    """Turbopuffer-based embedding generation and storage.

    Hands file text chunks to a ``TurbopufferClient`` for insertion and wraps
    the call with log lines and trace events (start, completion, failure).
    """

    def __init__(self, embedding_config: Optional[EmbeddingConfig] = None):
        """Set up the embedder and its Turbopuffer client.

        Args:
            embedding_config: Embedding configuration to record on this
                embedder. When omitted (or falsy), falls back to
                ``TurbopufferClient.default_embedding_config``.
        """
        super().__init__()
        # set the vector db type for turbopuffer
        self.vector_db_type = VectorDBProvider.TPUF
        # use the default embedding config from TurbopufferClient if not provided
        self.embedding_config = embedding_config or TurbopufferClient.default_embedding_config
        self.tpuf_client = TurbopufferClient()

    @trace_method
    async def generate_embedded_passages(self, file_id: str, source_id: str, chunks: List[str], actor: User) -> List[Passage]:
        """Generate embeddings and store in Turbopuffer, then return Passage objects.

        Empty and whitespace-only chunks are dropped before insertion. If no
        usable chunk remains, nothing is sent to Turbopuffer.

        Args:
            file_id: Identifier of the file the chunks were extracted from.
            source_id: Identifier of the source the file belongs to.
            chunks: Raw text chunks; may contain empty/whitespace-only entries.
            actor: User performing the operation; ``actor.organization_id`` is
                forwarded to the client together with the actor itself.

        Returns:
            The passages returned by ``TurbopufferClient.insert_file_passages``,
            or an empty list when ``chunks`` is empty or holds no usable text.

        Raises:
            Exception: Any error raised by the insert call is logged, reported
                as a ``turbopuffer_embedder.generation_failed`` event, and
                re-raised unchanged.
        """
        if not chunks:
            return []

        # Filter out empty or whitespace-only chunks
        valid_chunks = [chunk for chunk in chunks if chunk and chunk.strip()]
        total_count = len(chunks)
        valid_count = len(valid_chunks)

        if not valid_chunks:
            logger.warning(f"No valid text chunks found for file {file_id}. PDF may contain only images without text layer.")
            log_event(
                "turbopuffer_embedder.no_valid_chunks",
                {"file_id": file_id, "source_id": source_id, "total_chunks": total_count, "reason": "All chunks empty or whitespace-only"},
            )
            return []

        if valid_count < total_count:
            filtered_count = total_count - valid_count
            logger.info(f"Filtered out {filtered_count} empty chunks from {total_count} total")
            log_event(
                "turbopuffer_embedder.chunks_filtered",
                {
                    "file_id": file_id,
                    "original_chunks": total_count,
                    "valid_chunks": valid_count,
                    "filtered_chunks": filtered_count,
                },
            )

        logger.info(f"Generating embeddings for {valid_count} chunks using Turbopuffer")
        log_event(
            "turbopuffer_embedder.generation_started",
            {
                "total_chunks": valid_count,
                "file_id": file_id,
                "source_id": source_id,
                "embedding_model": self.embedding_config.embedding_model,
            },
        )

        try:
            # insert passages to Turbopuffer - it will handle embedding generation internally
            # perf_counter() is monotonic, so the measured duration cannot go
            # negative or jump if the system wall clock is adjusted mid-call.
            embedding_start = time.perf_counter()
            passages = await self.tpuf_client.insert_file_passages(
                source_id=source_id,
                file_id=file_id,
                text_chunks=valid_chunks,
                organization_id=actor.organization_id,
                actor=actor,
            )
            embedding_duration = time.perf_counter() - embedding_start

            logger.info(f"Successfully generated and stored {len(passages)} passages in Turbopuffer (took {embedding_duration:.2f}s)")
            log_event(
                "turbopuffer_embedder.generation_completed",
                {
                    "passages_created": len(passages),
                    "total_chunks_processed": valid_count,
                    "file_id": file_id,
                    "source_id": source_id,
                    "duration_seconds": embedding_duration,
                },
            )
            return passages

        except Exception as e:
            # Boundary handler: record the failure for observability, then let
            # the caller decide how to react.
            logger.error(f"Failed to generate embeddings with Turbopuffer: {str(e)}")
            log_event(
                "turbopuffer_embedder.generation_failed",
                {"error": str(e), "error_type": type(e).__name__, "file_id": file_id, "source_id": source_id},
            )
            raise
|