* Add log probabilities support for RL training
This enables the Letta server to request and return log probabilities from
OpenAI-compatible providers (including SGLang) for use in RL training.
Changes:
- LLMConfig: Add return_logprobs and top_logprobs fields
- OpenAIClient: Set logprobs in ChatCompletionRequest when enabled
- LettaLLMAdapter: Add logprobs field and extract from response
- LettaResponse: Add logprobs field to return log probs to client
- LettaRequest: Add return_logprobs/top_logprobs for per-request override
- LettaAgentV3: Store and pass logprobs through to response
- agents.py: Handle request-level logprobs override
Usage:
response = client.agents.messages.create(
    agent_id=agent_id,
    messages=[...],
    return_logprobs=True,
    top_logprobs=5,
)
print(response.logprobs)  # Per-token log probabilities
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* Add multi-turn token tracking for RL training via SGLang native endpoint
- Add TurnTokenData schema to track token IDs and logprobs per turn
- Add return_token_ids flag to LettaRequest and LLMConfig
- Create SGLangNativeClient for /generate endpoint (returns output_ids)
- Create SGLangNativeAdapter that uses native endpoint
- Modify LettaAgentV3 to accumulate turns across LLM calls
- Include turns in LettaResponse when return_token_ids=True
* Fix: Add SGLang native adapter to step() method, not just stream()
* Fix: Handle Pydantic Message objects in SGLang native adapter
* Fix: Remove api_key reference from LLMConfig (not present)
* Fix: Add missing 'created' field to ChatCompletionResponse
* Add full tool support to SGLang native adapter
- Format tools into prompt in Qwen-style format
- Parse tool calls from <tool_call> tags in response
- Format tool results as <tool_response> in user messages
- Set finish_reason to 'tool_calls' when tools are called
* Use tokenizer.apply_chat_template for proper tool formatting
- Add tokenizer caching in SGLang native adapter
- Use apply_chat_template when tokenizer available
- Fall back to manual formatting if not
- Convert Letta messages to OpenAI format for tokenizer
* Fix: Use func_response instead of tool_return for ToolReturn content
* Fix: Get output_token_logprobs from meta_info in SGLang response
* Fix: Allow None in output_token_logprobs (SGLang format includes null)
* chore: remove unrelated files from logprobs branch
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* fix: add missing call_type param to adapter constructors in letta_agent_v3
The SGLang refactor dropped call_type=LLMCallType.agent_step when extracting
adapter creation into conditional blocks. Restores it for all 3 spots (SGLang
in step, SimpleLLM in step, SGLang in stream).
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* just stage-api && just publish-api
* fix: update expected LLMConfig fields in schema test for logprobs support
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* chore: remove rllm provider references
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* just stage-api && just publish-api
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-65-206.ec2.internal>
Co-authored-by: Letta <noreply@letta.com>
147 lines
6.1 KiB
Python
147 lines
6.1 KiB
Python
from typing import AsyncGenerator
|
|
|
|
from letta.adapters.letta_llm_adapter import LettaLLMAdapter
|
|
from letta.helpers.datetime_helpers import get_utc_timestamp_ns
|
|
from letta.otel.tracing import log_attributes, log_event, safe_json_dumps, trace_method
|
|
from letta.schemas.letta_message import LettaMessage
|
|
from letta.schemas.letta_message_content import OmittedReasoningContent, ReasoningContent, TextContent
|
|
from letta.schemas.provider_trace import ProviderTrace
|
|
from letta.schemas.usage import normalize_cache_tokens, normalize_reasoning_tokens
|
|
from letta.schemas.user import User
|
|
from letta.settings import settings
|
|
from letta.utils import safe_create_task
|
|
|
|
|
|
class LettaLLMRequestAdapter(LettaLLMAdapter):
    """
    Adapter for handling blocking (non-streaming) LLM requests.

    The adapter awaits one complete response from the LLM client, converts it
    to chat completion format, and stores the reasoning content, the first
    tool call, the logprobs, and the usage statistics on the instance for
    access by the agent.
    """

    async def invoke_llm(
        self,
        request_data: dict,
        messages: list,
        tools: list,
        use_assistant_message: bool,
        requires_approval_tools: list[str] | None = None,
        step_id: str | None = None,
        actor: User | None = None,
    ) -> AsyncGenerator[LettaMessage | None, None]:
        """
        Execute a blocking LLM request and record its results on the adapter.

        This adapter:
        1. Awaits a single non-streaming request to the LLM
        2. Converts the response to chat completion format
        3. Extracts reasoning, tool call, logprobs, and usage information
        4. Updates the corresponding instance variables
        5. Yields a single ``None`` (blocking mode has no chunks to stream)

        Args:
            request_data: Provider request payload; stored on the instance and
                passed to the LLM client.
            messages: Passed to the client when converting the raw response
                to chat completion format.
            tools: Not read by this adapter.
            use_assistant_message: Not read by this adapter.
            requires_approval_tools: Not read by this adapter. Defaults to
                ``None`` rather than a shared mutable list.
            step_id: Step ID forwarded to provider-trace logging.
            actor: User forwarded to provider-trace logging.

        Yields:
            ``None`` exactly once, after all instance variables are populated.
        """
        # Store request data
        self.request_data = request_data

        # Make the blocking LLM request
        self.response_data = await self.llm_client.request_async(request_data, self.llm_config)
        self.llm_request_finish_timestamp_ns = get_utc_timestamp_ns()

        # Convert response to chat completion format
        self.chat_completions_response = await self.llm_client.convert_response_to_chat_completion(
            self.response_data, messages, self.llm_config
        )

        # Only the first choice is consumed; bind it once instead of
        # re-indexing the response for every field.
        choice = self.chat_completions_response.choices[0]
        message = choice.message

        # Extract reasoning content from the response
        if message.reasoning_content:
            self.reasoning_content = [
                ReasoningContent(
                    reasoning=message.reasoning_content,
                    is_native=True,
                    signature=message.reasoning_content_signature,
                )
            ]
        elif message.omitted_reasoning_content:
            self.reasoning_content = [OmittedReasoningContent()]
        elif message.content:
            # Reasoning placed into content for legacy reasons
            # Carry thought_signature on TextContent when ReasoningContent doesn't exist to hold it
            self.reasoning_content = [
                TextContent(
                    text=message.content,
                    signature=message.reasoning_content_signature,
                )
            ]
        else:
            self.reasoning_content = None

        # Extract tool call (only the first one is kept)
        self.tool_call = message.tool_calls[0] if message.tool_calls else None

        # Extract logprobs if present
        self.logprobs = choice.logprobs

        # Extract usage statistics
        usage = self.chat_completions_response.usage
        self.usage.step_count = 1
        self.usage.completion_tokens = usage.completion_tokens
        self.usage.prompt_tokens = usage.prompt_tokens
        self.usage.total_tokens = usage.total_tokens

        # Extract cache and reasoning token details using normalized helpers
        self.usage.cached_input_tokens, self.usage.cache_write_tokens = normalize_cache_tokens(usage.prompt_tokens_details)
        self.usage.reasoning_tokens = normalize_reasoning_tokens(usage.completion_tokens_details)

        self.log_provider_trace(step_id=step_id, actor=actor)

        # A single yield keeps this coroutine an async generator, matching
        # the declared return type.
        yield None

    @trace_method
    def log_provider_trace(self, step_id: str | None, actor: User | None) -> None:
        """
        Log provider trace data for telemetry purposes in a fire-and-forget manner.

        Attaches the request/response data as trace attributes and, when
        ``settings.track_provider_trace`` is enabled, hands a provider-trace
        creation coroutine to ``safe_create_task`` instead of awaiting it, so
        the caller is not blocked on persistence.

        Args:
            step_id: The step ID associated with this request for logging purposes
            actor: The user associated with this request for logging purposes
        """
        # Without both identifiers nothing is logged.
        if step_id is None or actor is None:
            return

        log_attributes(
            {
                "request_data": safe_json_dumps(self.request_data),
                "response_data": safe_json_dumps(self.response_data),
            }
        )

        if settings.track_provider_trace:
            safe_create_task(
                self.telemetry_manager.create_provider_trace_async(
                    actor=actor,
                    provider_trace=ProviderTrace(
                        request_json=self.request_data,
                        response_json=self.response_data,
                        step_id=step_id,
                        agent_id=self.agent_id,
                        agent_tags=self.agent_tags,
                        run_id=self.run_id,
                        call_type=self.call_type,
                        org_id=self.org_id,
                        user_id=self.user_id,
                        llm_config=self.llm_config.model_dump() if self.llm_config else None,
                    ),
                ),
                label="create_provider_trace",
            )
|