diff --git a/letta/agent.py b/letta/agent.py
index cfdfd7a8..b40ba27b 100644
--- a/letta/agent.py
+++ b/letta/agent.py
@@ -302,10 +302,8 @@ class Agent(BaseAgent):
         log_telemetry(self.logger, "_get_ai_reply create start")
         # New LLM client flow
         llm_client = LLMClient.create(
-            agent_id=self.agent_state.id,
             llm_config=self.agent_state.llm_config,
             put_inner_thoughts_first=put_inner_thoughts_first,
-            actor_id=self.agent_state.created_by_id,
         )
 
         if llm_client and not stream:
diff --git a/letta/errors.py b/letta/errors.py
index 818daebf..d5d832b0 100644
--- a/letta/errors.py
+++ b/letta/errors.py
@@ -62,6 +62,26 @@ class LLMError(LettaError):
     pass
 
 
+class LLMConnectionError(LLMError):
+    """Error when unable to connect to LLM service"""
+
+
+class LLMRateLimitError(LLMError):
+    """Error when rate limited by LLM service"""
+
+
+class LLMPermissionDeniedError(LLMError):
+    """Error when permission is denied by LLM service"""
+
+
+class LLMNotFoundError(LLMError):
+    """Error when requested resource is not found"""
+
+
+class LLMUnprocessableEntityError(LLMError):
+    """Error when request is well-formed but semantically invalid"""
+
+
 class BedrockPermissionError(LettaError):
     """Exception raised for errors in the Bedrock permission process."""
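The new exception hierarchy above gives callers a provider-agnostic way to branch on failure modes instead of catching SDK-specific exceptions. A minimal sketch of the pattern this enables at a call site (the `llm_client`/`messages` variables and the retry budget are illustrative, not part of this diff):

    import time

    from letta.errors import LLMConnectionError, LLMRateLimitError

    for attempt in range(3):  # hypothetical retry budget
        try:
            response = llm_client.send_llm_request(messages=messages, tools=[], tool_call=None)
            break
        except LLMRateLimitError:
            time.sleep(2**attempt)  # back off and retry on 429s
        except LLMConnectionError as exc:
            raise RuntimeError("LLM service unreachable") from exc
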
diff --git a/letta/llm_api/anthropic_client.py b/letta/llm_api/anthropic_client.py
index 945a23e0..402ba8a6 100644
--- a/letta/llm_api/anthropic_client.py
+++ b/letta/llm_api/anthropic_client.py
@@ -7,12 +7,22 @@ from anthropic.types import Message as AnthropicMessage
 
+from letta.errors import (
+    ErrorCode,
+    LLMAuthenticationError,
+    LLMBadRequestError,
+    LLMConnectionError,
+    LLMNotFoundError,
+    LLMPermissionDeniedError,
+    LLMRateLimitError,
+    LLMServerError,
+    LLMUnprocessableEntityError,
+)
 from letta.helpers.datetime_helpers import get_utc_time
 from letta.llm_api.helpers import add_inner_thoughts_to_functions, unpack_all_inner_thoughts_from_kwargs
-from letta.llm_api.llm_api_tools import cast_message_to_subtype
 from letta.llm_api.llm_client_base import LLMClientBase
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
 from letta.log import get_logger
 from letta.schemas.message import Message as PydanticMessage
-from letta.schemas.openai.chat_completion_request import ChatCompletionRequest, Tool
+from letta.schemas.openai.chat_completion_request import Tool
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall
 from letta.schemas.openai.chat_completion_response import Message as ChoiceMessage
 from letta.schemas.openai.chat_completion_response import ToolCall, UsageStatistics
@@ -26,20 +25,14 @@ logger = get_logger(__name__)
 
 class AnthropicClient(LLMClientBase):
 
     def request(self, request_data: dict) -> dict:
-        try:
-            client = self._get_anthropic_client(async_client=False)
-            response = client.beta.messages.create(**request_data, betas=["tools-2024-04-04"])
-            return response.model_dump()
-        except Exception as e:
-            self._handle_anthropic_error(e)
+        client = self._get_anthropic_client(async_client=False)
+        response = client.beta.messages.create(**request_data, betas=["tools-2024-04-04"])
+        return response.model_dump()
 
     async def request_async(self, request_data: dict) -> dict:
-        try:
-            client = self._get_anthropic_client(async_client=True)
-            response = await client.beta.messages.create(**request_data, betas=["tools-2024-04-04"])
-            return response.model_dump()
-        except Exception as e:
-            self._handle_anthropic_error(e)
+        client = self._get_anthropic_client(async_client=True)
+        response = await client.beta.messages.create(**request_data, betas=["tools-2024-04-04"])
+        return response.model_dump()
 
     def _get_anthropic_client(self, async_client: bool = False) -> Union[anthropic.AsyncAnthropic, anthropic.Anthropic]:
         override_key = ProviderManager().get_anthropic_override_key()
@@ -47,15 +40,6 @@ class AnthropicClient(LLMClientBase):
             return anthropic.AsyncAnthropic(api_key=override_key) if override_key else anthropic.AsyncAnthropic()
         return anthropic.Anthropic(api_key=override_key) if override_key else anthropic.Anthropic()
 
-    def _handle_anthropic_error(self, e: Exception):
-        if isinstance(e, anthropic.APIConnectionError):
-            logger.warning(f"[Anthropic] API connection error: {e.__cause__}")
-        elif isinstance(e, anthropic.RateLimitError):
-            logger.warning("[Anthropic] Rate limited (429). Consider backoff.")
-        elif isinstance(e, anthropic.APIStatusError):
-            logger.warning(f"[Anthropic] API status error: {e.status_code}, {e.response}")
-        raise e
-
     def build_request_data(
         self,
         messages: List[PydanticMessage],
@@ -63,43 +47,155 @@ class AnthropicClient(LLMClientBase):
         tool_call: Optional[str],
         force_tool_call: Optional[str] = None,
     ) -> dict:
+        prefix_fill = True
         if not self.use_tool_naming:
             raise NotImplementedError("Only tool calling supported on Anthropic API requests")
 
-        if tools is None:
-            # Special case for summarization path
-            available_tools = None
-            tool_choice = None
-        elif force_tool_call is not None:
-            assert tools is not None
-            tool_choice = {"type": "tool", "name": force_tool_call}
-            available_tools = [{"type": "function", "function": f} for f in tools if f["name"] == force_tool_call]
+        if not self.llm_config.max_tokens:
+            raise ValueError("Max tokens must be set for Anthropic")
 
-            # need to have this setting to be able to put inner thoughts in kwargs
-            self.llm_config.put_inner_thoughts_in_kwargs = True
-        else:
-            if self.llm_config.put_inner_thoughts_in_kwargs:
-                # tool_choice_type other than "auto" only plays nice if thinking goes inside the tool calls
-                tool_choice = {"type": "any", "disable_parallel_tool_use": True}
-            else:
-                tool_choice = {"type": "auto", "disable_parallel_tool_use": True}
-            available_tools = [{"type": "function", "function": f} for f in tools]
+        data = {
+            "model": self.llm_config.model,
+            "max_tokens": self.llm_config.max_tokens,
+            "temperature": self.llm_config.temperature,
+        }
 
-        chat_completion_request = ChatCompletionRequest(
-            model=self.llm_config.model,
-            messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
-            tools=available_tools,
-            tool_choice=tool_choice,
-            max_tokens=self.llm_config.max_tokens,  # Note: max_tokens is required for Anthropic API
-            temperature=self.llm_config.temperature,
+        # Extended Thinking
+        if self.llm_config.enable_reasoner:
+            assert (
+                self.llm_config.max_reasoning_tokens is not None and self.llm_config.max_reasoning_tokens < self.llm_config.max_tokens
+            ), "max tokens must be greater than thinking budget"
+            assert not self.llm_config.put_inner_thoughts_in_kwargs, "extended thinking not compatible with put_inner_thoughts_in_kwargs"
+
+            data["thinking"] = {
+                "type": "enabled",
+                "budget_tokens": self.llm_config.max_reasoning_tokens,
+            }
+            # `temperature` may only be set to 1 when thinking is enabled; see
+            # https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking
+            data["temperature"] = 1.0
+
+            # Silently disable prefix_fill for now
+            prefix_fill = False
+
+        # Tools
+        tools_for_request = (
+            [Tool(function=f) for f in tools if f["name"] == force_tool_call]
+            if force_tool_call is not None
+            else [Tool(function=f) for f in tools]
         )
+        if force_tool_call is not None:
+            # forcing a tool call requires putting inner thoughts in the tool call kwargs
+            self.llm_config.put_inner_thoughts_in_kwargs = True
 
-        return _prepare_anthropic_request(
-            data=chat_completion_request,
-            put_inner_thoughts_in_kwargs=self.llm_config.put_inner_thoughts_in_kwargs,
-            extended_thinking=self.llm_config.enable_reasoner,
-            max_reasoning_tokens=self.llm_config.max_reasoning_tokens,
-        )
+        # Add inner thoughts kwarg
+        if len(tools_for_request) > 0 and self.llm_config.put_inner_thoughts_in_kwargs:
+            tools_with_inner_thoughts = add_inner_thoughts_to_functions(
+                functions=[t.function for t in tools_for_request],
+                inner_thoughts_key=INNER_THOUGHTS_KWARG,
+                inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION,
+            )
+            tools_for_request = [Tool(function=f) for f in tools_with_inner_thoughts]
+
+        if len(tools_for_request) > 0:
+            # TODO eventually enable parallel tool use
+            data["tools"] = convert_tools_to_anthropic_format(tools_for_request)
+
+        # Messages
+        inner_thoughts_xml_tag = "thinking"
+        data["messages"] = [
+            m.to_anthropic_dict(
+                inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+                put_inner_thoughts_in_kwargs=self.llm_config.put_inner_thoughts_in_kwargs,
+            )
+            for m in messages
+        ]
+
+        # Move 'system' to the top level
+        # assert data["messages"][0]["role"] == "system", f"Expected 'system' role in messages[0]:\n{data['messages'][0]}"
+        data["system"] = data["messages"][0]["content"]
+        data["messages"] = data["messages"][1:]
+
+        # Ensure first message is user
+        if data["messages"][0]["role"] != "user":
+            data["messages"] = [{"role": "user", "content": DUMMY_FIRST_USER_MESSAGE}] + data["messages"]
+
+        # Handle alternating messages
+        data["messages"] = merge_tool_results_into_user_messages(data["messages"])
+
+        # Prefix fill
+        # https://docs.anthropic.com/en/api/messages#body-messages
+        # NOTE: cannot prefill with tools for opus:
+        # Your API request included an `assistant` message in the final position, which would pre-fill the `assistant` response. When using tools with "claude-3-opus-20240229"
+        if prefix_fill and not self.llm_config.put_inner_thoughts_in_kwargs and "opus" not in data["model"]:
+            data["messages"].append(
+                # Start the thinking process for the assistant
+                {"role": "assistant", "content": f"<{inner_thoughts_xml_tag}>"},
+            )
+
+        return data
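For orientation, the dict returned by `build_request_data` maps directly onto the body of Anthropic's Messages API. A sketch of a typical result on the default (non-reasoner) path, with every field value invented for illustration:

    request_data = {
        "model": "claude-3-5-sonnet-20241022",  # hypothetical model name
        "max_tokens": 1024,
        "temperature": 0.7,
        "tools": [
            {
                "name": "send_message",
                "description": "Send a message to the user.",
                "input_schema": {"type": "object", "properties": {}, "required": []},
            }
        ],
        "system": "You are a helpful agent.",
        "messages": [{"role": "user", "content": "Hello!"}],
    }
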
Consider backoff.") + return LLMRateLimitError( + message=f"Rate limited by Anthropic: {str(e)}", + code=ErrorCode.RATE_LIMIT_EXCEEDED, + ) + + if isinstance(e, anthropic.BadRequestError): + logger.warning(f"[Anthropic] Bad request: {str(e)}") + return LLMBadRequestError( + message=f"Bad request to Anthropic: {str(e)}", + code=ErrorCode.INTERNAL_SERVER_ERROR, + ) + + if isinstance(e, anthropic.AuthenticationError): + logger.warning(f"[Anthropic] Authentication error: {str(e)}") + return LLMAuthenticationError( + message=f"Authentication failed with Anthropic: {str(e)}", + code=ErrorCode.INTERNAL_SERVER_ERROR, + ) + + if isinstance(e, anthropic.PermissionDeniedError): + logger.warning(f"[Anthropic] Permission denied: {str(e)}") + return LLMPermissionDeniedError( + message=f"Permission denied by Anthropic: {str(e)}", + code=ErrorCode.INTERNAL_SERVER_ERROR, + ) + + if isinstance(e, anthropic.NotFoundError): + logger.warning(f"[Anthropic] Resource not found: {str(e)}") + return LLMNotFoundError( + message=f"Resource not found in Anthropic: {str(e)}", + code=ErrorCode.INTERNAL_SERVER_ERROR, + ) + + if isinstance(e, anthropic.UnprocessableEntityError): + logger.warning(f"[Anthropic] Unprocessable entity: {str(e)}") + return LLMUnprocessableEntityError( + message=f"Invalid request content for Anthropic: {str(e)}", + code=ErrorCode.INTERNAL_SERVER_ERROR, + ) + + if isinstance(e, anthropic.APIStatusError): + logger.warning(f"[Anthropic] API status error: {str(e)}") + return LLMServerError( + message=f"Anthropic API error: {str(e)}", + code=ErrorCode.INTERNAL_SERVER_ERROR, + details={ + "status_code": e.status_code if hasattr(e, "status_code") else None, + "response": str(e.response) if hasattr(e, "response") else None, + }, + ) + + return super().handle_llm_error(e) def convert_response_to_chat_completion( self, @@ -208,118 +304,6 @@ class AnthropicClient(LLMClientBase): return chat_completion_response -def _prepare_anthropic_request( - data: ChatCompletionRequest, - inner_thoughts_xml_tag: Optional[str] = "thinking", - # if true, prefix fill the generation with the thinking tag - prefix_fill: bool = True, - # if true, put COT inside the tool calls instead of inside the content - put_inner_thoughts_in_kwargs: bool = False, - bedrock: bool = False, - # extended thinking related fields - # https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking - extended_thinking: bool = False, - max_reasoning_tokens: Optional[int] = None, -) -> dict: - """Prepare the request data for Anthropic API format.""" - if extended_thinking: - assert ( - max_reasoning_tokens is not None and max_reasoning_tokens < data.max_tokens - ), "max tokens must be greater than thinking budget" - assert not put_inner_thoughts_in_kwargs, "extended thinking not compatible with put_inner_thoughts_in_kwargs" - # assert not prefix_fill, "extended thinking not compatible with prefix_fill" - # Silently disable prefix_fill for now - prefix_fill = False - - # if needed, put inner thoughts as a kwarg for all tools - if data.tools and put_inner_thoughts_in_kwargs: - functions = add_inner_thoughts_to_functions( - functions=[t.function.model_dump() for t in data.tools], - inner_thoughts_key=INNER_THOUGHTS_KWARG, - inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION, - ) - data.tools = [Tool(function=f) for f in functions] - - # convert the tools to Anthropic's payload format - anthropic_tools = None if data.tools is None else convert_tools_to_anthropic_format(data.tools) - - # pydantic -> dict - data = 
data.model_dump(exclude_none=True) - - if extended_thinking: - data["thinking"] = { - "type": "enabled", - "budget_tokens": max_reasoning_tokens, - } - # `temperature` may only be set to 1 when thinking is enabled. Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking' - data["temperature"] = 1.0 - - if "functions" in data: - raise ValueError(f"'functions' unexpected in Anthropic API payload") - - # Handle tools - if "tools" in data and data["tools"] is None: - data.pop("tools") - data.pop("tool_choice", None) - elif anthropic_tools is not None: - # TODO eventually enable parallel tool use - data["tools"] = anthropic_tools - - # Move 'system' to the top level - assert data["messages"][0]["role"] == "system", f"Expected 'system' role in messages[0]:\n{data['messages'][0]}" - data["system"] = data["messages"][0]["content"] - data["messages"] = data["messages"][1:] - - # Process messages - for message in data["messages"]: - if "content" not in message: - message["content"] = None - - # Convert to Anthropic format - msg_objs = [ - PydanticMessage.dict_to_message( - user_id=None, - agent_id=None, - openai_message_dict=m, - ) - for m in data["messages"] - ] - data["messages"] = [ - m.to_anthropic_dict( - inner_thoughts_xml_tag=inner_thoughts_xml_tag, - put_inner_thoughts_in_kwargs=put_inner_thoughts_in_kwargs, - ) - for m in msg_objs - ] - - # Ensure first message is user - if data["messages"][0]["role"] != "user": - data["messages"] = [{"role": "user", "content": DUMMY_FIRST_USER_MESSAGE}] + data["messages"] - - # Handle alternating messages - data["messages"] = merge_tool_results_into_user_messages(data["messages"]) - - # Handle prefix fill (not compatible with inner-thouguhts-in-kwargs) - # https://docs.anthropic.com/en/api/messages#body-messages - # NOTE: cannot prefill with tools for opus: - # Your API request included an `assistant` message in the final position, which would pre-fill the `assistant` response. When using tools with "claude-3-opus-20240229" - if prefix_fill and not put_inner_thoughts_in_kwargs and "opus" not in data["model"]: - if not bedrock: # not support for bedrock - data["messages"].append( - # Start the thinking process for the assistant - {"role": "assistant", "content": f"<{inner_thoughts_xml_tag}>"}, - ) - - # Validate max_tokens - assert "max_tokens" in data, data - - # Remove OpenAI-specific fields - for field in ["frequency_penalty", "logprobs", "n", "top_p", "presence_penalty", "user", "stream"]: - data.pop(field, None) - - return data - - def convert_tools_to_anthropic_format(tools: List[Tool]) -> List[dict]: """See: https://docs.anthropic.com/claude/docs/tool-use diff --git a/letta/llm_api/llm_client.py b/letta/llm_api/llm_client.py index 41f813d6..15c8c6a1 100644 --- a/letta/llm_api/llm_client.py +++ b/letta/llm_api/llm_client.py @@ -9,21 +9,17 @@ class LLMClient: @staticmethod def create( - agent_id: str, llm_config: LLMConfig, put_inner_thoughts_first: bool = True, - actor_id: Optional[str] = None, ) -> Optional[LLMClientBase]: """ Create an LLM client based on the model endpoint type. 
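The `convert_tools_to_anthropic_format` helper that `build_request_data` relies on maps OpenAI-style function schemas to Anthropic's tool shape (name, description, input_schema). Roughly, and assuming plain dicts rather than the `Tool` objects the real helper receives, the mapping looks like this sketch:

    def to_anthropic_tool(function_schema: dict) -> dict:
        # OpenAI-style {"name", "description", "parameters"} -> Anthropic tool entry
        return {
            "name": function_schema["name"],
            "description": function_schema.get("description", ""),
            "input_schema": function_schema.get("parameters", {"type": "object", "properties": {}}),
        }
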
diff --git a/letta/llm_api/llm_client.py b/letta/llm_api/llm_client.py
index 41f813d6..15c8c6a1 100644
--- a/letta/llm_api/llm_client.py
+++ b/letta/llm_api/llm_client.py
@@ -9,21 +9,17 @@ class LLMClient:
 
     @staticmethod
     def create(
-        agent_id: str,
         llm_config: LLMConfig,
         put_inner_thoughts_first: bool = True,
-        actor_id: Optional[str] = None,
     ) -> Optional[LLMClientBase]:
         """
         Create an LLM client based on the model endpoint type.
 
         Args:
-            agent_id: Unique identifier for the agent
             llm_config: Configuration for the LLM model
             put_inner_thoughts_first: Whether to put inner thoughts first in the response
             use_structured_output: Whether to use structured output
             use_tool_naming: Whether to use tool naming
-            actor_id: Optional actor identifier
 
         Returns:
             An instance of LLMClientBase subclass
@@ -36,19 +32,22 @@ class LLMClient:
                 from letta.llm_api.google_ai_client import GoogleAIClient
 
                 return GoogleAIClient(
-                    agent_id=agent_id, llm_config=llm_config, put_inner_thoughts_first=put_inner_thoughts_first, actor_id=actor_id
+                    llm_config=llm_config,
+                    put_inner_thoughts_first=put_inner_thoughts_first,
                 )
             case "google_vertex":
                 from letta.llm_api.google_vertex_client import GoogleVertexClient
 
                 return GoogleVertexClient(
-                    agent_id=agent_id, llm_config=llm_config, put_inner_thoughts_first=put_inner_thoughts_first, actor_id=actor_id
+                    llm_config=llm_config,
+                    put_inner_thoughts_first=put_inner_thoughts_first,
                 )
             case "anthropic":
                 from letta.llm_api.anthropic_client import AnthropicClient
 
                 return AnthropicClient(
-                    agent_id=agent_id, llm_config=llm_config, put_inner_thoughts_first=put_inner_thoughts_first, actor_id=actor_id
+                    llm_config=llm_config,
+                    put_inner_thoughts_first=put_inner_thoughts_first,
                 )
             case _:
                 return None
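With `agent_id` and `actor_id` dropped, the factory needs nothing beyond the model configuration, which is exactly what the `agent.py` call site above now passes. A minimal usage sketch (the config field values are illustrative only):

    from letta.llm_api.llm_client import LLMClient
    from letta.schemas.llm_config import LLMConfig

    config = LLMConfig(
        model="claude-3-5-sonnet-20241022",  # hypothetical values
        model_endpoint_type="anthropic",
        model_endpoint="https://api.anthropic.com/v1",
        context_window=200000,
        max_tokens=1024,
    )

    llm_client = LLMClient.create(llm_config=config, put_inner_thoughts_first=True)
    if llm_client is None:
        raise ValueError("Unsupported model endpoint type")  # the factory returns None for other providers
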
""" - request_data = self.build_request_data(messages, tools, tool_call) - log_event(name="llm_request_sent", attributes=request_data) - if stream: - return await self.stream_async(request_data) - else: - response_data = await self.request_async(request_data) + request_data = self.build_request_data(messages, tools, tool_call, force_tool_call) + response_data = {} + + try: + log_event(name="llm_request_sent", attributes=request_data) + if stream: + return await self.stream_async(request_data) + else: + response_data = await self.request_async(request_data) log_event(name="llm_response_received", attributes=response_data) - return self.convert_response_to_chat_completion(response_data, messages) + except Exception as e: + raise self.handle_llm_error(e) + + return self.convert_response_to_chat_completion(response_data, messages) @abstractmethod def build_request_data( @@ -129,3 +137,17 @@ class LLMClientBase: Performs underlying streaming request to llm and returns raw response. """ raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}") + + @abstractmethod + def handle_llm_error(self, e: Exception) -> Exception: + """ + Maps provider-specific errors to common LLMError types. + Each LLM provider should implement this to translate their specific errors. + + Args: + e: The original provider-specific exception + + Returns: + An LLMError subclass that represents the error in a provider-agnostic way + """ + return LLMError(f"Unhandled LLM error: {str(e)}") diff --git a/tests/helpers/endpoints_helper.py b/tests/helpers/endpoints_helper.py index c487d07a..ebe7c2b8 100644 --- a/tests/helpers/endpoints_helper.py +++ b/tests/helpers/endpoints_helper.py @@ -104,11 +104,7 @@ def check_first_response_is_valid_for_llm_endpoint(filename: str, validate_inner messages = client.server.agent_manager.get_in_context_messages(agent_id=full_agent_state.id, actor=client.user) agent = Agent(agent_state=full_agent_state, interface=None, user=client.user) - llm_client = LLMClient.create( - agent_id=agent_state.id, - llm_config=agent_state.llm_config, - actor_id=str(uuid.UUID(int=1)), - ) + llm_client = LLMClient.create(llm_config=agent_state.llm_config) if llm_client: response = llm_client.send_llm_request( messages=messages,