diff --git a/letta/agent.py b/letta/agent.py index 5655e04c..b8c67cc2 100644 --- a/letta/agent.py +++ b/letta/agent.py @@ -29,6 +29,7 @@ from letta.helpers.json_helpers import json_dumps, json_loads from letta.interface import AgentInterface from letta.llm_api.helpers import calculate_summarizer_cutoff, get_token_counts_for_messages, is_context_overflow_error from letta.llm_api.llm_api_tools import create +from letta.llm_api.llm_client import LLMClient from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages from letta.log import get_logger from letta.memory import summarize_messages @@ -356,19 +357,38 @@ class Agent(BaseAgent): for attempt in range(1, empty_response_retry_limit + 1): try: log_telemetry(self.logger, "_get_ai_reply create start") - response = create( + # New LLM client flow + llm_client = LLMClient.create( + agent_id=self.agent_state.id, llm_config=self.agent_state.llm_config, - messages=message_sequence, - user_id=self.agent_state.created_by_id, - functions=allowed_functions, - # functions_python=self.functions_python, do we need this? - function_call=function_call, - first_message=first_message, - force_tool_call=force_tool_call, - stream=stream, - stream_interface=self.interface, put_inner_thoughts_first=put_inner_thoughts_first, + actor_id=self.agent_state.created_by_id, ) + + if llm_client and not stream: + response = llm_client.send_llm_request( + messages=message_sequence, + tools=allowed_functions, + tool_call=function_call, + stream=stream, + first_message=first_message, + force_tool_call=force_tool_call, + ) + else: + # Fallback to existing flow + response = create( + llm_config=self.agent_state.llm_config, + messages=message_sequence, + user_id=self.agent_state.created_by_id, + functions=allowed_functions, + # functions_python=self.functions_python, do we need this? 
+ function_call=function_call, + first_message=first_message, + force_tool_call=force_tool_call, + stream=stream, + stream_interface=self.interface, + put_inner_thoughts_first=put_inner_thoughts_first, + ) log_telemetry(self.logger, "_get_ai_reply create finish") # These bottom two are retryable @@ -632,7 +652,7 @@ class Agent(BaseAgent): function_args, function_response, messages, - [tool_return] if tool_return else None, + [tool_return], include_function_failed_message=True, ) return messages, False, True # force a heartbeat to allow agent to handle error @@ -659,7 +679,7 @@ class Agent(BaseAgent): "content": function_response, "tool_call_id": tool_call_id, }, - tool_returns=[tool_return] if tool_return else None, + tool_returns=[tool_return] if sandbox_run_result else None, ) ) # extend conversation with function response self.interface.function_message(f"Ran {function_name}({function_args})", msg_obj=messages[-1]) diff --git a/letta/llm_api/google_ai_client.py b/letta/llm_api/google_ai_client.py new file mode 100644 index 00000000..c75deefd --- /dev/null +++ b/letta/llm_api/google_ai_client.py @@ -0,0 +1,332 @@ +import uuid +from typing import List, Optional, Tuple + +from letta.constants import NON_USER_MSG_PREFIX +from letta.helpers.datetime_helpers import get_utc_time +from letta.helpers.json_helpers import json_dumps +from letta.llm_api.helpers import make_post_request +from letta.llm_api.llm_client_base import LLMClientBase +from letta.local_llm.json_parser import clean_json_string_extra_backslash +from letta.local_llm.utils import count_tokens +from letta.schemas.message import Message as PydanticMessage +from letta.schemas.openai.chat_completion_request import Tool +from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall, Message, ToolCall, UsageStatistics +from letta.settings import model_settings +from letta.utils import get_tool_call_id + + +class GoogleAIClient(LLMClientBase): + + def request(self, 
def build_request_data(
    self,
    messages: List[PydanticMessage],
    tools: List[dict],
    tool_call: Optional[str],
) -> dict:
    """
    Assemble the request payload sent to the Google AI endpoint.

    Args:
        messages: Conversation messages; each one is serialized with ``to_google_ai_dict()``.
        tools: OpenAI-style function schemas. A falsy value (``None`` or empty) is passed
            through to the payload unchanged.
        tool_call: Accepted for interface compatibility; not used by this implementation.

    Returns:
        A dict with the keys ``contents``, ``tools`` and ``generation_config``.
    """
    google_tools = tools
    if tools:
        # Wrap each bare function schema in the OpenAI "tool" envelope before converting it.
        openai_tools = [Tool(**{"type": "function", "function": schema}) for schema in tools]
        google_tools = self.convert_tools_to_google_ai_format(openai_tools)

    serialized_messages = [message.to_google_ai_dict() for message in messages]
    contents = self.add_dummy_model_messages(serialized_messages)

    generation_config = {
        "temperature": self.llm_config.temperature,
        "max_output_tokens": self.llm_config.max_tokens,
    }
    return {
        "contents": contents,
        "tools": google_tools,
        "generation_config": generation_config,
    }
+ } + ] + } + } + ], + "usageMetadata": { + "promptTokenCount": 9, + "candidatesTokenCount": 27, + "totalTokenCount": 36 + } + } + """ + try: + choices = [] + index = 0 + for candidate in response_data["candidates"]: + content = candidate["content"] + + role = content["role"] + assert role == "model", f"Unknown role in response: {role}" + + parts = content["parts"] + # TODO support parts / multimodal + # TODO support parallel tool calling natively + # TODO Alternative here is to throw away everything else except for the first part + for response_message in parts: + # Convert the actual message style to OpenAI style + if "functionCall" in response_message and response_message["functionCall"] is not None: + function_call = response_message["functionCall"] + assert isinstance(function_call, dict), function_call + function_name = function_call["name"] + assert isinstance(function_name, str), function_name + function_args = function_call["args"] + assert isinstance(function_args, dict), function_args + + # NOTE: this also involves stripping the inner monologue out of the function + if self.llm_config.put_inner_thoughts_in_kwargs: + from letta.local_llm.constants import INNER_THOUGHTS_KWARG + + assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}" + inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG) + assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}" + else: + inner_thoughts = None + + # Google AI API doesn't generate tool call IDs + openai_response_message = Message( + role="assistant", # NOTE: "model" -> "assistant" + content=inner_thoughts, + tool_calls=[ + ToolCall( + id=get_tool_call_id(), + type="function", + function=FunctionCall( + name=function_name, + arguments=clean_json_string_extra_backslash(json_dumps(function_args)), + ), + ) + ], + ) + + else: + + # Inner thoughts are the content by default + inner_thoughts = response_message["text"] + + # 
Google AI API doesn't generate tool call IDs + openai_response_message = Message( + role="assistant", # NOTE: "model" -> "assistant" + content=inner_thoughts, + ) + + # Google AI API uses different finish reason strings than OpenAI + # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null + # see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api + # Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER + # see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason + finish_reason = candidate["finishReason"] + if finish_reason == "STOP": + openai_finish_reason = ( + "function_call" + if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0 + else "stop" + ) + elif finish_reason == "MAX_TOKENS": + openai_finish_reason = "length" + elif finish_reason == "SAFETY": + openai_finish_reason = "content_filter" + elif finish_reason == "RECITATION": + openai_finish_reason = "content_filter" + else: + raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}") + + choices.append( + Choice( + finish_reason=openai_finish_reason, + index=index, + message=openai_response_message, + ) + ) + index += 1 + + # if len(choices) > 1: + # raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})") + + # NOTE: some of the Google AI APIs show UsageMetadata in the response, but it seems to not exist? 
+ # "usageMetadata": { + # "promptTokenCount": 9, + # "candidatesTokenCount": 27, + # "totalTokenCount": 36 + # } + if "usageMetadata" in response_data: + usage = UsageStatistics( + prompt_tokens=response_data["usageMetadata"]["promptTokenCount"], + completion_tokens=response_data["usageMetadata"]["candidatesTokenCount"], + total_tokens=response_data["usageMetadata"]["totalTokenCount"], + ) + else: + # Count it ourselves + assert input_messages is not None, f"Didn't get UsageMetadata from the API response, so input_messages is required" + prompt_tokens = count_tokens(json_dumps(input_messages)) # NOTE: this is a very rough approximation + completion_tokens = count_tokens(json_dumps(openai_response_message.model_dump())) # NOTE: this is also approximate + total_tokens = prompt_tokens + completion_tokens + usage = UsageStatistics( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) + + response_id = str(uuid.uuid4()) + return ChatCompletionResponse( + id=response_id, + choices=choices, + model=self.llm_config.model, # NOTE: Google API doesn't pass back model in the response + created=get_utc_time(), + usage=usage, + ) + except KeyError as e: + raise e + + def get_gemini_endpoint_and_headers( + self, + key_in_header: bool = True, + generate_content: bool = False, + ) -> Tuple[str, dict]: + """ + Dynamically generate the model endpoint and headers. 
def convert_tools_to_google_ai_format(self, tools: List[Tool]) -> List[dict]:
    """
    Convert OpenAI-style tool definitions into Google AI ``functionDeclarations``.

    OpenAI style (input, one entry per tool):
        {
            "type": "function",
            "function": {
                "name": "find_movies",
                "description": "find ....",
                "parameters": {
                    "type": "object",
                    "properties": {
                        PARAM: {"type": PARAM_TYPE, "description": PARAM_DESCRIPTION},
                        ...
                    },
                    "required": List[str],
                },
            },
        }

    Google AI style (output):
        [{
            "functionDeclarations": [{
                "name": "find_movies",
                "description": "find ....",
                "parameters": {
                    "type": "OBJECT",
                    "properties": {
                        "location": {"type": "STRING", "description": "..."},
                        ...
                    },
                    "required": ["description"],
                },
            }, ...]
        }]

    Args:
        tools: Tool objects exposing ``function.name``, ``function.description`` and
            ``function.parameters`` (a JSON-schema dict with a ``properties`` mapping).

    Returns:
        A single-element list holding one ``functionDeclarations`` entry per tool.

    Raises:
        KeyError: If a parameter schema has no ``properties`` mapping, or one of its
            properties has no ``type`` entry.
    """
    # Function-scope import: `copy` is not among the imports visible at the top of this module.
    import copy

    add_inner_thoughts = self.llm_config.put_inner_thoughts_in_kwargs
    if add_inner_thoughts:
        from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION

    function_list = []
    for t in tools:
        # Work on a deep copy: the casing fix-ups and the injected inner-thoughts argument
        # must not leak back into the schema owned by the caller's Tool object, otherwise
        # every call would re-append the inner-thoughts name to the same `required` list.
        parameters = copy.deepcopy(t.function.parameters)  # TODO need to unpack

        # Correct casing: Google AI expects upper-case type names
        parameters["type"] = "OBJECT"
        for param_fields in parameters["properties"].values():
            param_fields["type"] = param_fields["type"].upper()

        # Add inner thoughts as a required string argument
        if add_inner_thoughts:
            parameters["properties"][INNER_THOUGHTS_KWARG] = {
                "type": "STRING",
                "description": INNER_THOUGHTS_KWARG_DESCRIPTION,
            }
            # Schemas without an explicit `required` list get one instead of raising KeyError.
            parameters.setdefault("required", []).append(INNER_THOUGHTS_KWARG)

        function_list.append(
            dict(
                name=t.function.name,
                description=t.function.description,
                parameters=parameters,
            )
        )

    return [{"functionDeclarations": function_list}]
+ """ + dummy_yield_message = { + "role": "model", + "parts": [{"text": f"{NON_USER_MSG_PREFIX}Function call returned, waiting for user response."}], + } + messages_with_padding = [] + for i, message in enumerate(messages): + messages_with_padding.append(message) + # Check if the current message role is 'tool' and the next message role is 'user' + if message["role"] in ["tool", "function"] and (i + 1 < len(messages) and messages[i + 1]["role"] == "user"): + messages_with_padding.append(dummy_yield_message) + + return messages_with_padding diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py new file mode 100644 index 00000000..1c703249 --- /dev/null +++ b/letta/llm_api/google_vertex_client.py @@ -0,0 +1,214 @@ +import uuid +from typing import List, Optional + +from google import genai +from google.genai.types import FunctionCallingConfig, FunctionCallingConfigMode, GenerateContentResponse, ToolConfig + +from letta.helpers.datetime_helpers import get_utc_time +from letta.helpers.json_helpers import json_dumps +from letta.llm_api.google_ai_client import GoogleAIClient +from letta.local_llm.json_parser import clean_json_string_extra_backslash +from letta.local_llm.utils import count_tokens +from letta.schemas.message import Message as PydanticMessage +from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall, Message, ToolCall, UsageStatistics +from letta.settings import model_settings +from letta.utils import get_tool_call_id + + +class GoogleVertexClient(GoogleAIClient): + + def request(self, request_data: dict) -> dict: + """ + Performs underlying request to llm and returns raw response. 
def build_request_data(
    self,
    messages: List[PydanticMessage],
    tools: List[dict],
    tool_call: Optional[str],
) -> dict:
    """
    Build the Vertex request payload on top of the parent client's payload.

    The parent's ``generation_config`` dict becomes ``config``, the parent's ``tools``
    entry is moved inside that dict, and a ``tool_config`` entry is added to it.

    Args:
        messages: Conversation messages, forwarded to the parent implementation.
        tools: Function schemas, forwarded to the parent implementation.
        tool_call: Forwarded to the parent implementation.

    Returns:
        The parent's payload with ``generation_config`` and ``tools`` replaced by a
        single ``config`` dict.
    """
    payload = super().build_request_data(messages, tools, tool_call)

    config = payload.pop("generation_config")
    config["tools"] = payload.pop("tools")

    # ANY mode forces the model to predict only function calls
    function_calling = FunctionCallingConfig(mode=FunctionCallingConfigMode.ANY)
    config["tool_config"] = ToolConfig(function_calling_config=function_calling).model_dump()

    payload["config"] = config
    return payload
+ } + ] + } + } + ], + "usageMetadata": { + "promptTokenCount": 9, + "candidatesTokenCount": 27, + "totalTokenCount": 36 + } + } + """ + response = GenerateContentResponse(**response_data) + try: + choices = [] + index = 0 + for candidate in response.candidates: + content = candidate.content + + role = content.role + assert role == "model", f"Unknown role in response: {role}" + + parts = content.parts + # TODO support parts / multimodal + # TODO support parallel tool calling natively + # TODO Alternative here is to throw away everything else except for the first part + for response_message in parts: + # Convert the actual message style to OpenAI style + if response_message.function_call: + function_call = response_message.function_call + function_name = function_call.name + function_args = function_call.args + assert isinstance(function_args, dict), function_args + + # NOTE: this also involves stripping the inner monologue out of the function + if self.llm_config.put_inner_thoughts_in_kwargs: + from letta.local_llm.constants import INNER_THOUGHTS_KWARG + + assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}" + inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG) + assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}" + else: + inner_thoughts = None + + # Google AI API doesn't generate tool call IDs + openai_response_message = Message( + role="assistant", # NOTE: "model" -> "assistant" + content=inner_thoughts, + tool_calls=[ + ToolCall( + id=get_tool_call_id(), + type="function", + function=FunctionCall( + name=function_name, + arguments=clean_json_string_extra_backslash(json_dumps(function_args)), + ), + ) + ], + ) + + else: + + # Inner thoughts are the content by default + inner_thoughts = response_message.text + + # Google AI API doesn't generate tool call IDs + openai_response_message = Message( + role="assistant", # NOTE: "model" -> "assistant" + 
content=inner_thoughts, + ) + + # Google AI API uses different finish reason strings than OpenAI + # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null + # see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api + # Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER + # see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason + finish_reason = candidate.finish_reason.value + if finish_reason == "STOP": + openai_finish_reason = ( + "function_call" + if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0 + else "stop" + ) + elif finish_reason == "MAX_TOKENS": + openai_finish_reason = "length" + elif finish_reason == "SAFETY": + openai_finish_reason = "content_filter" + elif finish_reason == "RECITATION": + openai_finish_reason = "content_filter" + else: + raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}") + + choices.append( + Choice( + finish_reason=openai_finish_reason, + index=index, + message=openai_response_message, + ) + ) + index += 1 + + # if len(choices) > 1: + # raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})") + + # NOTE: some of the Google AI APIs show UsageMetadata in the response, but it seems to not exist? 
+ # "usageMetadata": { + # "promptTokenCount": 9, + # "candidatesTokenCount": 27, + # "totalTokenCount": 36 + # } + if response.usage_metadata: + usage = UsageStatistics( + prompt_tokens=response.usage_metadata.prompt_token_count, + completion_tokens=response.usage_metadata.candidates_token_count, + total_tokens=response.usage_metadata.total_token_count, + ) + else: + # Count it ourselves + assert input_messages is not None, f"Didn't get UsageMetadata from the API response, so input_messages is required" + prompt_tokens = count_tokens(json_dumps(input_messages)) # NOTE: this is a very rough approximation + completion_tokens = count_tokens(json_dumps(openai_response_message.model_dump())) # NOTE: this is also approximate + total_tokens = prompt_tokens + completion_tokens + usage = UsageStatistics( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) + + response_id = str(uuid.uuid4()) + return ChatCompletionResponse( + id=response_id, + choices=choices, + model=self.llm_config.model, # NOTE: Google API doesn't pass back model in the response + created=get_utc_time(), + usage=usage, + ) + except KeyError as e: + raise e diff --git a/letta/llm_api/llm_client.py b/letta/llm_api/llm_client.py new file mode 100644 index 00000000..1769cb4d --- /dev/null +++ b/letta/llm_api/llm_client.py @@ -0,0 +1,48 @@ +from typing import Optional + +from letta.llm_api.llm_client_base import LLMClientBase +from letta.schemas.llm_config import LLMConfig + + +class LLMClient: + """Factory class for creating LLM clients based on the model endpoint type.""" + + @staticmethod + def create( + agent_id: str, + llm_config: LLMConfig, + put_inner_thoughts_first: bool = True, + actor_id: Optional[str] = None, + ) -> Optional[LLMClientBase]: + """ + Create an LLM client based on the model endpoint type. 
class LLMClientBase:
    """
    Base class for LLM clients, formatting the request objects,
    handling the downstream request and parsing into chat completions response format.

    NOTE: the hook methods are decorated with ``@abstractmethod``, but this class does not
    use ``ABCMeta``, so the decorator is not enforced when a subclass is instantiated.
    A hook that a subclass leaves out fails with ``NotImplementedError`` only when called.
    """

    def __init__(
        self,
        agent_id: str,
        llm_config: LLMConfig,
        put_inner_thoughts_first: Optional[bool] = True,
        use_structured_output: Optional[bool] = True,
        use_tool_naming: bool = True,
        actor_id: Optional[str] = None,
    ):
        """
        Store the per-agent settings shared by all client implementations.

        Args:
            agent_id: Identifier of the agent this client issues requests for.
            llm_config: Model configuration read by the request/response hooks.
            put_inner_thoughts_first: Stored as-is for subclasses to read.
            use_structured_output: Stored as-is for subclasses to read.
            use_tool_naming: Stored as-is for subclasses to read.
            actor_id: Optional identifier of the acting user.
        """
        self.agent_id = agent_id
        self.llm_config = llm_config
        self.put_inner_thoughts_first = put_inner_thoughts_first
        # These two flags used to be accepted and then dropped; keep them so that a
        # subclass can actually honour what the caller asked for.
        self.use_structured_output = use_structured_output
        self.use_tool_naming = use_tool_naming
        self.actor_id = actor_id

    def send_llm_request(
        self,
        messages: List[Message],
        tools: Optional[List[dict]] = None,  # TODO: change to Tool object
        tool_call: Optional[str] = None,
        stream: bool = False,
        first_message: bool = False,
        force_tool_call: Optional[str] = None,
    ) -> Union[ChatCompletionResponse, Stream[ChatCompletionChunk]]:
        """
        Issues a request to the downstream model endpoint and parses response.
        If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over.
        Otherwise returns a ChatCompletionResponse.

        ``first_message`` and ``force_tool_call`` are part of the signature but are not
        used by this base implementation.
        """
        request_data = self.build_request_data(messages, tools, tool_call)
        log_event(name="llm_request_sent", attributes=request_data)
        if stream:
            return self.stream(request_data)
        else:
            response_data = self.request(request_data)
            log_event(name="llm_response_received", attributes=response_data)
            return self.convert_response_to_chat_completion(response_data, messages)

    async def send_llm_request_async(
        self,
        messages: List[Message],
        tools: Optional[List[dict]] = None,  # TODO: change to Tool object
        tool_call: Optional[str] = None,
        stream: bool = False,
        first_message: bool = False,
        force_tool_call: Optional[str] = None,
    ) -> Union[ChatCompletionResponse, AsyncStream[ChatCompletionChunk]]:
        """
        Issues a request to the downstream model endpoint.
        If stream=True, returns an AsyncStream[ChatCompletionChunk] that can be async iterated over.
        Otherwise returns a ChatCompletionResponse.

        ``first_message`` and ``force_tool_call`` are part of the signature but are not
        used by this base implementation.
        """
        request_data = self.build_request_data(messages, tools, tool_call)
        log_event(name="llm_request_sent", attributes=request_data)
        if stream:
            return await self.stream_async(request_data)
        else:
            response_data = await self.request_async(request_data)
            log_event(name="llm_response_received", attributes=response_data)
            return self.convert_response_to_chat_completion(response_data, messages)

    @abstractmethod
    def build_request_data(
        self,
        messages: List[Message],
        tools: List[dict],
        tool_call: Optional[str],
    ) -> dict:
        """
        Constructs a request object in the expected data format for this client.
        """
        raise NotImplementedError

    @abstractmethod
    def request(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        raise NotImplementedError

    @abstractmethod
    async def request_async(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        raise NotImplementedError

    @abstractmethod
    def convert_response_to_chat_completion(
        self,
        response_data: dict,
        input_messages: List[Message],
    ) -> ChatCompletionResponse:
        """
        Converts custom response format from llm client into an OpenAI
        ChatCompletionsResponse object.
        """
        raise NotImplementedError

    @abstractmethod
    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
        """
        Performs underlying streaming request to llm and returns raw response.
        """
        raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}")

    @abstractmethod
    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
        """
        Performs underlying streaming request to llm and returns raw response.
        """
        raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}")