Files
letta-server/letta/adapters/simple_llm_request_adapter.py
Kian Jones a92e868ee6 feat: centralize telemetry logging at LLM client level (#8815)
* feat: centralize telemetry logging at LLM client level

Moves telemetry logging from individual adapters to LLMClientBase:
- Add TelemetryStreamWrapper for streaming telemetry on stream close
- Add request_async_with_telemetry() for non-streaming requests
- Add stream_async_with_telemetry() for streaming requests
- Add set_telemetry_context() to configure agent_id, run_id, step_id

Updates adapters and agents to use new pattern:
- LettaLLMAdapter now accepts agent_id/run_id in constructor
- Adapters call set_telemetry_context() before LLM requests
- Removes duplicate telemetry logging from adapters
- Enriches traces with agent_id, run_id, call_type metadata

🐙 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: accumulate streaming response content for telemetry

TelemetryStreamWrapper now extracts actual response data from chunks:
- Content text (concatenated from deltas)
- Tool calls (id, name, arguments)
- Model name, finish reason, usage stats

🐙 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* refactor: move streaming telemetry to caller (option 3)

- Remove TelemetryStreamWrapper class
- Add log_provider_trace_async() helper to LLMClientBase
- stream_async_with_telemetry() now just returns raw stream
- Callers log telemetry after processing with rich interface data

Updated callers:
- summarizer.py: logs content + usage after stream processing
- letta_agent.py: logs tool_call, reasoning, model, usage

🐙 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: pass agent_id and run_id to parent adapter class

LettaLLMStreamAdapter was not passing agent_id/run_id to parent,
causing "unexpected keyword argument" errors.

🐙 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Letta <noreply@letta.com>
2026-01-19 15:54:43 -08:00

107 lines
4.5 KiB
Python

from typing import AsyncGenerator
from letta.adapters.letta_llm_request_adapter import LettaLLMRequestAdapter
from letta.helpers.datetime_helpers import get_utc_timestamp_ns
from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, TextContent
from letta.schemas.usage import normalize_cache_tokens, normalize_reasoning_tokens
class SimpleLLMRequestAdapter(LettaLLMRequestAdapter):
"""Simplifying assumptions:
- No inner thoughts in kwargs
- No forced tool calls
- Content native as assistant message
"""
async def invoke_llm(
self,
request_data: dict,
messages: list,
tools: list,
use_assistant_message: bool,
requires_approval_tools: list[str] = [],
step_id: str | None = None,
actor: str | None = None,
) -> AsyncGenerator[LettaMessage | None, None]:
"""
Execute a blocking LLM request and yield the response.
This adapter:
1. Makes a blocking request to the LLM
2. Converts the response to chat completion format
3. Extracts reasoning and tool call information
4. Updates all instance variables
5. Yields nothing (blocking mode doesn't stream)
"""
# Store request data
self.request_data = request_data
# Set telemetry context and make the blocking LLM request
self.llm_client.set_telemetry_context(
telemetry_manager=self.telemetry_manager,
step_id=step_id,
agent_id=self.agent_id,
run_id=self.run_id,
call_type="agent_step",
)
try:
self.response_data = await self.llm_client.request_async_with_telemetry(request_data, self.llm_config)
except Exception as e:
raise self.llm_client.handle_llm_error(e)
self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()
# Convert response to chat completion format
self.chat_completions_response = await self.llm_client.convert_response_to_chat_completion(
self.response_data, messages, self.llm_config
)
# Extract reasoning content from the response
if self.chat_completions_response.choices[0].message.reasoning_content:
self.reasoning_content = [
ReasoningContent(
reasoning=self.chat_completions_response.choices[0].message.reasoning_content,
is_native=True,
signature=self.chat_completions_response.choices[0].message.reasoning_content_signature,
)
]
elif self.chat_completions_response.choices[0].message.omitted_reasoning_content:
self.reasoning_content = [OmittedReasoningContent()]
else:
# logger.info("No reasoning content found.")
self.reasoning_content = None
if self.chat_completions_response.choices[0].message.content:
# NOTE: big difference - 'content' goes into 'content'
# Reasoning placed into content for legacy reasons
self.content = [TextContent(text=self.chat_completions_response.choices[0].message.content)]
else:
self.content = None
if self.reasoning_content and len(self.reasoning_content) > 0:
# Temp workaround to consolidate parts to persist reasoning content, this should be integrated better
self.content = self.reasoning_content + (self.content or [])
# Extract tool call
tool_calls = self.chat_completions_response.choices[0].message.tool_calls or []
self.tool_calls = list(tool_calls)
self.tool_call = self.tool_calls[0] if self.tool_calls else None
# Extract usage statistics
self.usage.step_count = 1
self.usage.completion_tokens = self.chat_completions_response.usage.completion_tokens
self.usage.prompt_tokens = self.chat_completions_response.usage.prompt_tokens
self.usage.total_tokens = self.chat_completions_response.usage.total_tokens
# Extract cache and reasoning token details using normalized helpers
usage = self.chat_completions_response.usage
self.usage.cached_input_tokens, self.usage.cache_write_tokens = normalize_cache_tokens(usage.prompt_tokens_details)
self.usage.reasoning_tokens = normalize_reasoning_tokens(usage.completion_tokens_details)
self.log_provider_trace(step_id=step_id, actor=actor)
yield None
return