diff --git a/letta/adapters/simple_llm_stream_adapter.py b/letta/adapters/simple_llm_stream_adapter.py
index 2313ff2b..440f1994 100644
--- a/letta/adapters/simple_llm_stream_adapter.py
+++ b/letta/adapters/simple_llm_stream_adapter.py
@@ -70,6 +70,9 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
         # Store request data
         self.request_data = request_data
 
+        # Track request start time for latency calculation
+        request_start_ns = get_utc_timestamp_ns()
+
         # Get cancellation event for this run to enable graceful cancellation (before branching)
         cancellation_event = get_cancellation_event_for_run(self.run_id) if self.run_id else None
 
@@ -138,6 +141,16 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
             else:
                 stream = await self.llm_client.stream_async(request_data, self.llm_config)
         except Exception as e:
+            self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
+            latency_ms = int((self.llm_request_finish_timestamp_ns - request_start_ns) / 1_000_000)
+            await self.llm_client.log_provider_trace_async(
+                request_data=request_data,
+                response_json=None,
+                llm_config=self.llm_config,
+                latency_ms=latency_ms,
+                error_msg=str(e),
+                error_type=type(e).__name__,
+            )
             raise self.llm_client.handle_llm_error(e)
 
         # Process the stream and yield chunks immediately for TTFT
@@ -146,7 +159,16 @@ class SimpleLLMStreamAdapter(LettaLLMStreamAdapter):
                 # Yield each chunk immediately as it arrives
                 yield chunk
         except Exception as e:
-            # Map provider-specific errors during streaming to common LLMError types
+            self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
+            latency_ms = int((self.llm_request_finish_timestamp_ns - request_start_ns) / 1_000_000)
+            await self.llm_client.log_provider_trace_async(
+                request_data=request_data,
+                response_json=None,
+                llm_config=self.llm_config,
+                latency_ms=latency_ms,
+                error_msg=str(e),
+                error_type=type(e).__name__,
+            )
             raise self.llm_client.handle_llm_error(e)
 
         # After streaming completes, extract the accumulated data
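
The pattern added in both except blocks above is the same: stamp a finish time, derive latency in milliseconds from the nanosecond delta, log the failed provider call, then re-raise the mapped error. Below is a minimal, self-contained sketch of that flow; the helper names mirror the diff, but the stub bodies for get_utc_timestamp_ns, stream_async, log_provider_trace_async, and handle_llm_error are illustrative assumptions, not the repo's real implementations.

# Sketch of the error-path tracing pattern from the diff above.
# Helper names mirror the diff; bodies are stubs so the example runs standalone.
import asyncio
import time


def get_utc_timestamp_ns() -> int:
    # Stand-in for the repo helper: wall-clock time in nanoseconds.
    return time.time_ns()


class StubLLMClient:
    async def stream_async(self, request_data, llm_config):
        # Simulate a provider failure while opening the stream.
        raise TimeoutError("provider timed out")

    async def log_provider_trace_async(
        self, *, request_data, response_json, llm_config, latency_ms, error_msg, error_type
    ):
        # A real implementation would persist a provider trace record.
        print(f"trace: {error_type}({error_msg!r}) after {latency_ms} ms")

    def handle_llm_error(self, e: Exception) -> Exception:
        # Map a provider-specific exception onto a common error type.
        return RuntimeError(f"LLM request failed: {e}")


async def stream_with_error_trace(client, request_data, llm_config):
    request_start_ns = get_utc_timestamp_ns()
    try:
        await client.stream_async(request_data, llm_config)
    except Exception as e:
        finish_ns = get_utc_timestamp_ns()
        latency_ms = int((finish_ns - request_start_ns) / 1_000_000)
        await client.log_provider_trace_async(
            request_data=request_data,
            response_json=None,  # no response body on the error path
            llm_config=llm_config,
            latency_ms=latency_ms,
            error_msg=str(e),
            error_type=type(e).__name__,
        )
        raise client.handle_llm_error(e)


if __name__ == "__main__":
    try:
        asyncio.run(stream_with_error_trace(StubLLMClient(), {"model": "stub"}, None))
    except RuntimeError as err:
        print(err)  # LLM request failed: provider timed out

Note the ordering in the diff: the trace is logged with the original exception's message and type before handle_llm_error maps it, so the provider trace captures the raw provider failure rather than the normalized LLMError.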