* Add log probabilities support for RL training
This enables the Letta server to request and return log probabilities from
OpenAI-compatible providers (including SGLang) for use in RL training.
Changes:
- LLMConfig: Add return_logprobs and top_logprobs fields (see sketch after this list)
- OpenAIClient: Set logprobs in ChatCompletionRequest when enabled
- LettaLLMAdapter: Add logprobs field and extract from response
- LettaResponse: Add logprobs field to return log probs to client
- LettaRequest: Add return_logprobs/top_logprobs for per-request override
- LettaAgentV3: Store and pass logprobs through to response
- agents.py: Handle request-level logprobs override
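A rough sketch of the new LLMConfig fields (the Pydantic shape and defaults here are assumptions, not the exact implementation):

from pydantic import BaseModel, Field

class LLMConfig(BaseModel):
    # ... existing fields elided ...
    return_logprobs: bool = Field(default=False, description="Ask the provider for per-token log probabilities")
    top_logprobs: int | None = Field(default=None, description="Number of top alternative tokens to return logprobs for")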
Usage:
response = client.agents.messages.create(
    agent_id=agent_id,
    messages=[...],
    return_logprobs=True,
    top_logprobs=5,
)
print(response.logprobs)  # Per-token log probabilities
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* Add multi-turn token tracking for RL training via SGLang native endpoint
- Add TurnTokenData schema to track token IDs and logprobs per turn (see sketch after this list)
- Add return_token_ids flag to LettaRequest and LLMConfig
- Create SGLangNativeClient for /generate endpoint (returns output_ids)
- Create SGLangNativeAdapter that uses native endpoint
- Modify LettaAgentV3 to accumulate turns across LLM calls
- Include turns in LettaResponse when return_token_ids=True
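A plausible shape for TurnTokenData, inferred from the items above (field names beyond token IDs and logprobs are assumptions):

from pydantic import BaseModel, Field

class TurnTokenData(BaseModel):
    # Hypothetical sketch of the per-turn token record
    output_token_ids: list[int] = Field(default_factory=list)  # token IDs generated this turn
    output_token_logprobs: list[float | None] = Field(default_factory=list)  # logprob per token; SGLang may emit nulls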
* Fix: Add SGLang native adapter to step() method, not just stream()
* Fix: Handle Pydantic Message objects in SGLang native adapter
* Fix: Remove api_key reference from LLMConfig (not present)
* Fix: Add missing 'created' field to ChatCompletionResponse
* Add full tool support to SGLang native adapter
- Format tools into prompt in Qwen-style format
- Parse tool calls from <tool_call> tags in response (see sketch after this list)
- Format tool results as <tool_response> in user messages
- Set finish_reason to 'tool_calls' when tools are called
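Roughly how the <tool_call> parsing could look (regex and helper name are illustrative, not the actual implementation):

import json
import re

TOOL_CALL_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)

def parse_tool_calls(text: str) -> list[dict]:
    # Extract Qwen-style {"name": ..., "arguments": {...}} payloads from the response
    calls = []
    for match in TOOL_CALL_RE.finditer(text):
        try:
            calls.append(json.loads(match.group(1)))
        except json.JSONDecodeError:
            continue  # skip malformed payloads rather than failing the turn
    return calls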
* Use tokenizer.apply_chat_template for proper tool formatting
- Add tokenizer caching in SGLang native adapter
- Use apply_chat_template when tokenizer available (see sketch after this list)
- Fall back to manual formatting if not
- Convert Letta messages to OpenAI format for tokenizer
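A sketch of the template path with its manual fallback (the model name and fallback format are placeholders; the real adapter also caches the tokenizer, per the first item):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # placeholder model

def build_prompt(messages: list[dict], tools: list[dict] | None = None) -> str:
    try:
        # Preferred path: the model's chat template knows how to render tools
        return tokenizer.apply_chat_template(
            messages, tools=tools, add_generation_prompt=True, tokenize=False
        )
    except Exception:
        # Fallback: crude manual formatting when no usable template exists
        return "\n".join(f"{m['role']}: {m['content']}" for m in messages)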
* Fix: Use func_response instead of tool_return for ToolReturn content
* Fix: Get output_token_logprobs from meta_info in SGLang response
* Fix: Allow None in output_token_logprobs (SGLang format includes null)
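Together, the two fixes above amount to something like this (helper name is illustrative; the exact meta_info layout varies by SGLang version):

def extract_output_token_logprobs(response: dict) -> list[float | None]:
    # SGLang's /generate returns logprobs under meta_info, typically as
    # [logprob, token_id, ...] entries where the logprob can be null
    entries = response.get("meta_info", {}).get("output_token_logprobs") or []
    return [entry[0] if entry else None for entry in entries]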
* chore: remove unrelated files from logprobs branch
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* fix: add missing call_type param to adapter constructors in letta_agent_v3
The SGLang refactor dropped call_type=LLMCallType.agent_step when extracting
adapter creation into conditional blocks. This restores it in all three spots
(SGLang in step, SimpleLLM in step, SGLang in stream).
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* just stage-api && just publish-api
* fix: update expected LLMConfig fields in schema test for logprobs support
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* chore: remove rllm provider references
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
* just stage-api && just publish-api
🤖 Generated with [Letta Code](https://letta.com)
Co-Authored-By: Letta <noreply@letta.com>
---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-65-206.ec2.internal>
Co-authored-by: Letta <noreply@letta.com>
117 lines | 4.7 KiB | Python
from abc import ABC, abstractmethod
from typing import AsyncGenerator, Optional

from letta.llm_api.llm_client_base import LLMClientBase
from letta.schemas.enums import LLMCallType
from letta.schemas.letta_message import LettaMessage
from letta.schemas.letta_message_content import ReasoningContent, RedactedReasoningContent, TextContent
from letta.schemas.llm_config import LLMConfig
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, ChoiceLogprobs, ToolCall
from letta.schemas.usage import LettaUsageStatistics
from letta.schemas.user import User
from letta.services.telemetry_manager import TelemetryManager


class LettaLLMAdapter(ABC):
    """
    Base adapter for handling LLM calls in a unified way.

    This abstract class defines the interface for both blocking and streaming
    LLM interactions, allowing the agent to use different execution modes
    through a consistent API.
    """

    def __init__(
        self,
        llm_client: LLMClientBase,
        llm_config: LLMConfig,
        call_type: LLMCallType,
        agent_id: str | None = None,
        agent_tags: list[str] | None = None,
        run_id: str | None = None,
        org_id: str | None = None,
        user_id: str | None = None,
    ) -> None:
        self.llm_client: LLMClientBase = llm_client
        self.llm_config: LLMConfig = llm_config
        self.call_type: LLMCallType = call_type
        self.agent_id: str | None = agent_id
        self.agent_tags: list[str] | None = agent_tags
        self.run_id: str | None = run_id
        self.org_id: str | None = org_id
        self.user_id: str | None = user_id
        self.message_id: str | None = None
        self.request_data: dict | None = None
        self.response_data: dict | None = None
        self.chat_completions_response: ChatCompletionResponse | None = None
        self.reasoning_content: list[TextContent | ReasoningContent | RedactedReasoningContent] | None = None
        self.content: list[TextContent | ReasoningContent | RedactedReasoningContent] | None = None
        self.tool_call: ToolCall | None = None
        self.tool_calls: list[ToolCall] = []
        self.logprobs: ChoiceLogprobs | None = None
        # SGLang native endpoint data (for multi-turn RL training)
        self.output_ids: list[int] | None = None
        self.output_token_logprobs: list[list[float]] | None = None
        self.usage: LettaUsageStatistics = LettaUsageStatistics()
        self.telemetry_manager: TelemetryManager = TelemetryManager()
        self.llm_request_finish_timestamp_ns: int | None = None

    @abstractmethod
    async def invoke_llm(
        self,
        request_data: dict,
        messages: list,
        tools: list,
        use_assistant_message: bool,
        requires_approval_tools: list[str] = [],
        step_id: str | None = None,
        actor: User | None = None,
    ) -> AsyncGenerator[LettaMessage | None, None]:
        """
        Execute the LLM call and yield results as they become available.

        Args:
            request_data: The prepared request data for the LLM API
            messages: The messages in context for the request
            tools: The tools available for the LLM to use
            use_assistant_message: If true, use assistant messages when streaming response
            requires_approval_tools: The subset of tools that require approval before use
            step_id: The step ID associated with this request. If provided, logs request and response data.
            actor: The optional actor associated with this request for logging purposes.

        Yields:
            LettaMessage: Chunks of data for streaming adapters, or None for blocking adapters
        """
        raise NotImplementedError

    @property
    def finish_reason(self) -> str | None:
        """
        Get the finish_reason from the LLM response.

        Returns:
            str | None: The finish_reason if available, None otherwise
        """
        if self.chat_completions_response and self.chat_completions_response.choices:
            return self.chat_completions_response.choices[0].finish_reason
        return None

    def supports_token_streaming(self) -> bool:
        """
        Check if the adapter supports token-level streaming.

        Returns:
            bool: True if the adapter can stream back tokens as they are generated, False otherwise
        """
        return False

    def log_provider_trace(self, step_id: str | None, actor: User | None) -> None:
        """
        Log provider trace data for telemetry purposes.

        Args:
            step_id: The step ID associated with this request for logging purposes
            actor: The user associated with this request for logging purposes
        """
        raise NotImplementedError
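For orientation, a minimal hypothetical subclass showing how a blocking adapter would fill in this interface (the real adapters in the codebase do considerably more):

class BlockingAdapterSketch(LettaLLMAdapter):
    async def invoke_llm(self, request_data, messages, tools, use_assistant_message,
                         requires_approval_tools=[], step_id=None, actor=None):
        self.request_data = request_data
        # A real adapter would call self.llm_client here, then populate
        # self.response_data, self.chat_completions_response, self.usage, etc.
        yield None  # blocking adapters yield None instead of streamed chunks

    def log_provider_trace(self, step_id, actor):
        pass  # telemetry omitted in this sketch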