Files
letta-server/letta/adapters/letta_llm_request_adapter.py
Kevin Lin 23c94ec6d3 feat: add log probabilities from OpenAI-compatible servers and SGLang native endpoint (#9240)
* Add log probabilities support for RL training

This enables Letta server to request and return log probabilities from
OpenAI-compatible providers (including SGLang) for use in RL training.

Changes:
- LLMConfig: Add return_logprobs and top_logprobs fields
- OpenAIClient: Set logprobs in ChatCompletionRequest when enabled
- LettaLLMAdapter: Add logprobs field and extract from response
- LettaResponse: Add logprobs field to return log probs to client
- LettaRequest: Add return_logprobs/top_logprobs for per-request override
- LettaAgentV3: Store and pass logprobs through to response
- agents.py: Handle request-level logprobs override

Usage:
  response = client.agents.messages.create(
      agent_id=agent_id,
      messages=[...],
      return_logprobs=True,
      top_logprobs=5,
  )
  print(response.logprobs)  # Per-token log probabilities

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* Add multi-turn token tracking for RL training via SGLang native endpoint

- Add TurnTokenData schema to track token IDs and logprobs per turn
- Add return_token_ids flag to LettaRequest and LLMConfig
- Create SGLangNativeClient for /generate endpoint (returns output_ids)
- Create SGLangNativeAdapter that uses native endpoint
- Modify LettaAgentV3 to accumulate turns across LLM calls
- Include turns in LettaResponse when return_token_ids=True

* Fix: Add SGLang native adapter to step() method, not just stream()

* Fix: Handle Pydantic Message objects in SGLang native adapter

* Fix: Remove api_key reference from LLMConfig (not present)

* Fix: Add missing 'created' field to ChatCompletionResponse

* Add full tool support to SGLang native adapter

- Format tools into prompt in Qwen-style format
- Parse tool calls from <tool_call> tags in response
- Format tool results as <tool_response> in user messages
- Set finish_reason to 'tool_calls' when tools are called

* Use tokenizer.apply_chat_template for proper tool formatting

- Add tokenizer caching in SGLang native adapter
- Use apply_chat_template when tokenizer available
- Fall back to manual formatting if not
- Convert Letta messages to OpenAI format for tokenizer

* Fix: Use func_response instead of tool_return for ToolReturn content

* Fix: Get output_token_logprobs from meta_info in SGLang response

* Fix: Allow None in output_token_logprobs (SGLang format includes null)

* chore: remove unrelated files from logprobs branch

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* fix: add missing call_type param to adapter constructors in letta_agent_v3

The SGLang refactor dropped call_type=LLMCallType.agent_step when extracting
adapter creation into conditional blocks. Restores it for all 3 spots (SGLang
in step, SimpleLLM in step, SGLang in stream).

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* just stage-api && just publish-api

* fix: update expected LLMConfig fields in schema test for logprobs support

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* chore: remove rllm provider references

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

* just stage-api && just publish-api

🤖 Generated with [Letta Code](https://letta.com)

Co-Authored-By: Letta <noreply@letta.com>

---------

Co-authored-by: Ubuntu <ubuntu@ip-172-31-65-206.ec2.internal>
Co-authored-by: Letta <noreply@letta.com>
2026-02-24 10:52:07 -08:00

147 lines
6.1 KiB
Python

from typing import AsyncGenerator
from letta.adapters.letta_llm_adapter import LettaLLMAdapter
from letta.helpers.datetime_helpers import get_utc_timestamp_ns
from letta.otel.tracing import log_attributes, log_event, safe_json_dumps, trace_method
from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, TextContent
from letta.schemas.provider_trace import ProviderTrace
from letta.schemas.usage import normalize_cache_tokens, normalize_reasoning_tokens
from letta.schemas.user import User
from letta.settings import settings
from letta.utils import safe_create_task
class LettaLLMRequestAdapter(LettaLLMAdapter):
    """
    Adapter for handling blocking (non-streaming) LLM requests.

    The adapter awaits one complete response from the LLM client, converts it
    to chat completion format, and then stores the reasoning content, first
    tool call, log probabilities, and usage statistics on the instance for
    access by the agent.
    """

    async def invoke_llm(
        self,
        request_data: dict,
        messages: list,
        tools: list,
        use_assistant_message: bool,
        requires_approval_tools: list[str] | None = None,
        step_id: str | None = None,
        actor: User | None = None,
    ) -> AsyncGenerator[LettaMessage | None, None]:
        """
        Execute a blocking LLM request and yield once when it has been processed.

        This adapter:
        1. Awaits a single (non-streaming) request to the LLM
        2. Converts the response to chat completion format
        3. Extracts reasoning, tool call, and logprobs information
        4. Updates all instance variables (including usage statistics)
        5. Yields a single ``None`` (blocking mode doesn't stream messages)

        Args:
            request_data: Provider request payload, forwarded to the LLM client as-is
            messages: Messages forwarded to the client's response conversion
            tools: Not read by this implementation
            use_assistant_message: Not read by this implementation
            requires_approval_tools: Not read by this implementation
            step_id: Step ID forwarded to provider trace logging
            actor: User forwarded to provider trace logging

        Yields:
            ``None`` exactly once, after all instance variables have been updated.
        """
        # Store request data so it can be included in the provider trace
        self.request_data = request_data

        # Make the blocking LLM request
        self.response_data = await self.llm_client.request_async(request_data, self.llm_config)
        self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()

        # Convert response to chat completion format
        self.chat_completions_response = await self.llm_client.convert_response_to_chat_completion(
            self.response_data, messages, self.llm_config
        )

        # Only the first choice is consumed by this adapter
        choice = self.chat_completions_response.choices[0]
        message = choice.message

        # Extract reasoning content from the response
        if message.reasoning_content:
            self.reasoning_content = [
                ReasoningContent(
                    reasoning=message.reasoning_content,
                    is_native=True,
                    signature=message.reasoning_content_signature,
                )
            ]
        elif message.omitted_reasoning_content:
            self.reasoning_content = [OmittedReasoningContent()]
        elif message.content:
            # Reasoning placed into content for legacy reasons
            # Carry thought_signature on TextContent when ReasoningContent doesn't exist to hold it
            self.reasoning_content = [
                TextContent(
                    text=message.content,
                    signature=message.reasoning_content_signature,
                )
            ]
        else:
            # No reasoning content found in the response
            self.reasoning_content = None

        # Extract tool call (only the first one is kept)
        self.tool_call = message.tool_calls[0] if message.tool_calls else None

        # Extract logprobs if present
        self.logprobs = choice.logprobs

        # Extract usage statistics
        usage = self.chat_completions_response.usage
        self.usage.step_count = 1
        self.usage.completion_tokens = usage.completion_tokens
        self.usage.prompt_tokens = usage.prompt_tokens
        self.usage.total_tokens = usage.total_tokens

        # Extract cache and reasoning token details using normalized helpers
        self.usage.cached_input_tokens, self.usage.cache_write_tokens = normalize_cache_tokens(usage.prompt_tokens_details)
        self.usage.reasoning_tokens = normalize_reasoning_tokens(usage.completion_tokens_details)

        self.log_provider_trace(step_id=step_id, actor=actor)

        # A single yield keeps this an async generator even though nothing is streamed
        yield None

    @trace_method
    def log_provider_trace(self, step_id: str | None, actor: User | None) -> None:
        """
        Log provider trace data for telemetry purposes.

        Passes the serialized request/response data to ``log_attributes`` and,
        when ``settings.track_provider_trace`` is enabled, hands the provider
        trace creation coroutine to ``safe_create_task`` without awaiting it,
        so the caller is not blocked on it.

        Does nothing when either ``step_id`` or ``actor`` is ``None``.

        Args:
            step_id: The step ID associated with this request for logging purposes
            actor: The user associated with this request for logging purposes
        """
        if step_id is None or actor is None:
            return

        log_attributes(
            {
                "request_data": safe_json_dumps(self.request_data),
                "response_data": safe_json_dumps(self.response_data),
            }
        )

        if not settings.track_provider_trace:
            return

        provider_trace = ProviderTrace(
            request_json=self.request_data,
            response_json=self.response_data,
            step_id=step_id,
            agent_id=self.agent_id,
            agent_tags=self.agent_tags,
            run_id=self.run_id,
            call_type=self.call_type,
            org_id=self.org_id,
            user_id=self.user_id,
            llm_config=self.llm_config.model_dump() if self.llm_config else None,
        )
        safe_create_task(
            self.telemetry_manager.create_provider_trace_async(
                actor=actor,
                provider_trace=provider_trace,
            ),
            label="create_provider_trace",
        )