feat: add new llm client framework and migrate google apis (#1209)
This commit is contained in:
@@ -29,6 +29,7 @@ from letta.helpers.json_helpers import json_dumps, json_loads
|
||||
from letta.interface import AgentInterface
|
||||
from letta.llm_api.helpers import calculate_summarizer_cutoff, get_token_counts_for_messages, is_context_overflow_error
|
||||
from letta.llm_api.llm_api_tools import create
|
||||
from letta.llm_api.llm_client import LLMClient
|
||||
from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
|
||||
from letta.log import get_logger
|
||||
from letta.memory import summarize_messages
|
||||
@@ -356,19 +357,38 @@ class Agent(BaseAgent):
|
||||
for attempt in range(1, empty_response_retry_limit + 1):
|
||||
try:
|
||||
log_telemetry(self.logger, "_get_ai_reply create start")
|
||||
response = create(
|
||||
# New LLM client flow
|
||||
llm_client = LLMClient.create(
|
||||
agent_id=self.agent_state.id,
|
||||
llm_config=self.agent_state.llm_config,
|
||||
messages=message_sequence,
|
||||
user_id=self.agent_state.created_by_id,
|
||||
functions=allowed_functions,
|
||||
# functions_python=self.functions_python, do we need this?
|
||||
function_call=function_call,
|
||||
first_message=first_message,
|
||||
force_tool_call=force_tool_call,
|
||||
stream=stream,
|
||||
stream_interface=self.interface,
|
||||
put_inner_thoughts_first=put_inner_thoughts_first,
|
||||
actor_id=self.agent_state.created_by_id,
|
||||
)
|
||||
|
||||
if llm_client and not stream:
|
||||
response = llm_client.send_llm_request(
|
||||
messages=message_sequence,
|
||||
tools=allowed_functions,
|
||||
tool_call=function_call,
|
||||
stream=stream,
|
||||
first_message=first_message,
|
||||
force_tool_call=force_tool_call,
|
||||
)
|
||||
else:
|
||||
# Fallback to existing flow
|
||||
response = create(
|
||||
llm_config=self.agent_state.llm_config,
|
||||
messages=message_sequence,
|
||||
user_id=self.agent_state.created_by_id,
|
||||
functions=allowed_functions,
|
||||
# functions_python=self.functions_python, do we need this?
|
||||
function_call=function_call,
|
||||
first_message=first_message,
|
||||
force_tool_call=force_tool_call,
|
||||
stream=stream,
|
||||
stream_interface=self.interface,
|
||||
put_inner_thoughts_first=put_inner_thoughts_first,
|
||||
)
|
||||
log_telemetry(self.logger, "_get_ai_reply create finish")
|
||||
|
||||
# These bottom two are retryable
|
||||
@@ -632,7 +652,7 @@ class Agent(BaseAgent):
|
||||
function_args,
|
||||
function_response,
|
||||
messages,
|
||||
[tool_return] if tool_return else None,
|
||||
[tool_return],
|
||||
include_function_failed_message=True,
|
||||
)
|
||||
return messages, False, True # force a heartbeat to allow agent to handle error
|
||||
@@ -659,7 +679,7 @@ class Agent(BaseAgent):
|
||||
"content": function_response,
|
||||
"tool_call_id": tool_call_id,
|
||||
},
|
||||
tool_returns=[tool_return] if tool_return else None,
|
||||
tool_returns=[tool_return] if sandbox_run_result else None,
|
||||
)
|
||||
) # extend conversation with function response
|
||||
self.interface.function_message(f"Ran {function_name}({function_args})", msg_obj=messages[-1])
|
||||
|
||||
332
letta/llm_api/google_ai_client.py
Normal file
332
letta/llm_api/google_ai_client.py
Normal file
@@ -0,0 +1,332 @@
|
||||
import uuid
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from letta.constants import NON_USER_MSG_PREFIX
|
||||
from letta.helpers.datetime_helpers import get_utc_time
|
||||
from letta.helpers.json_helpers import json_dumps
|
||||
from letta.llm_api.helpers import make_post_request
|
||||
from letta.llm_api.llm_client_base import LLMClientBase
|
||||
from letta.local_llm.json_parser import clean_json_string_extra_backslash
|
||||
from letta.local_llm.utils import count_tokens
|
||||
from letta.schemas.message import Message as PydanticMessage
|
||||
from letta.schemas.openai.chat_completion_request import Tool
|
||||
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall, Message, ToolCall, UsageStatistics
|
||||
from letta.settings import model_settings
|
||||
from letta.utils import get_tool_call_id
|
||||
|
||||
|
||||
class GoogleAIClient(LLMClientBase):
    """LLM client for the Google AI (Gemini) REST API.

    Builds Gemini ``generateContent`` requests from Letta's OpenAI-style
    messages/tools, issues them via plain HTTP POST, and converts the raw
    Gemini response back into an OpenAI-style ChatCompletionResponse.
    """

    def request(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        url, headers = self.get_gemini_endpoint_and_headers(generate_content=True)
        return make_post_request(url, headers, request_data)

    def build_request_data(
        self,
        messages: List[PydanticMessage],
        tools: List[dict],
        tool_call: Optional[str],
    ) -> dict:
        """
        Constructs a request object in the expected data format for this client.

        Args:
            messages: Conversation history to send.
            tools: Raw function schemas (OpenAI "function" dicts), or falsy for none.
            tool_call: Unused by this client (Gemini has no direct equivalent of
                OpenAI's ``function_call`` selector in this request path).

        Returns:
            Dict with Gemini-format ``contents``, ``tools`` and ``generation_config``.
        """
        if tools:
            # Wrap each raw function schema in an OpenAI-style tool envelope,
            # then convert the whole list to Gemini's functionDeclarations format.
            tools = [{"type": "function", "function": f} for f in tools]
            tools = self.convert_tools_to_google_ai_format(
                [Tool(**t) for t in tools],
            )
        contents = self.add_dummy_model_messages(
            [m.to_google_ai_dict() for m in messages],
        )

        return {
            "contents": contents,
            "tools": tools,
            "generation_config": {
                "temperature": self.llm_config.temperature,
                "max_output_tokens": self.llm_config.max_tokens,
            },
        }

    def convert_response_to_chat_completion(
        self,
        response_data: dict,
        input_messages: List[PydanticMessage],
    ) -> ChatCompletionResponse:
        """
        Converts custom response format from llm client into an OpenAI
        ChatCompletionsResponse object.

        Example Input:
            {
              "candidates": [
                {
                  "content": {
                    "parts": [
                      {
                        "text": " OK. Barbie is showing in two theaters in Mountain View, CA: AMC Mountain View 16 and Regal Edwards 14."
                      }
                    ]
                  }
                }
              ],
              "usageMetadata": {
                "promptTokenCount": 9,
                "candidatesTokenCount": 27,
                "totalTokenCount": 36
              }
            }
        """
        choices = []
        index = 0
        for candidate in response_data["candidates"]:
            content = candidate["content"]

            role = content["role"]
            assert role == "model", f"Unknown role in response: {role}"

            parts = content["parts"]
            # TODO support parts / multimodal
            # TODO support parallel tool calling natively
            # TODO Alternative here is to throw away everything else except for the first part
            # NOTE(review): if "parts" is empty, openai_response_message is never
            # bound and the finish-reason handling below raises NameError — confirm
            # whether Gemini can return an empty parts list.
            for response_message in parts:
                # Convert the actual message style to OpenAI style
                if "functionCall" in response_message and response_message["functionCall"] is not None:
                    function_call = response_message["functionCall"]
                    assert isinstance(function_call, dict), function_call
                    function_name = function_call["name"]
                    assert isinstance(function_name, str), function_name
                    function_args = function_call["args"]
                    assert isinstance(function_args, dict), function_args

                    # NOTE: this also involves stripping the inner monologue out of the function
                    if self.llm_config.put_inner_thoughts_in_kwargs:
                        from letta.local_llm.constants import INNER_THOUGHTS_KWARG

                        assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}"
                        inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG)
                        assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
                    else:
                        inner_thoughts = None

                    # Google AI API doesn't generate tool call IDs
                    openai_response_message = Message(
                        role="assistant",  # NOTE: "model" -> "assistant"
                        content=inner_thoughts,
                        tool_calls=[
                            ToolCall(
                                id=get_tool_call_id(),
                                type="function",
                                function=FunctionCall(
                                    name=function_name,
                                    arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
                                ),
                            )
                        ],
                    )

                else:
                    # Inner thoughts are the content by default
                    inner_thoughts = response_message["text"]

                    # Google AI API doesn't generate tool call IDs
                    openai_response_message = Message(
                        role="assistant",  # NOTE: "model" -> "assistant"
                        content=inner_thoughts,
                    )

            # Google AI API uses different finish reason strings than OpenAI
            # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
            # see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api
            # Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER
            # see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason
            finish_reason = candidate["finishReason"]
            if finish_reason == "STOP":
                openai_finish_reason = (
                    "function_call"
                    if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0
                    else "stop"
                )
            elif finish_reason == "MAX_TOKENS":
                openai_finish_reason = "length"
            elif finish_reason in ("SAFETY", "RECITATION"):
                # Both safety blocks and recitation blocks map to OpenAI's content filter
                openai_finish_reason = "content_filter"
            else:
                raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}")

            choices.append(
                Choice(
                    finish_reason=openai_finish_reason,
                    index=index,
                    message=openai_response_message,
                )
            )
            index += 1

        # if len(choices) > 1:
        #     raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})")

        # NOTE: some of the Google AI APIs show UsageMetadata in the response, but it seems to not exist?
        # "usageMetadata": {
        #     "promptTokenCount": 9,
        #     "candidatesTokenCount": 27,
        #     "totalTokenCount": 36
        # }
        if "usageMetadata" in response_data:
            usage = UsageStatistics(
                prompt_tokens=response_data["usageMetadata"]["promptTokenCount"],
                completion_tokens=response_data["usageMetadata"]["candidatesTokenCount"],
                total_tokens=response_data["usageMetadata"]["totalTokenCount"],
            )
        else:
            # Count it ourselves (very rough approximation based on JSON-serialized text)
            assert input_messages is not None, "Didn't get UsageMetadata from the API response, so input_messages is required"
            prompt_tokens = count_tokens(json_dumps(input_messages))  # NOTE: this is a very rough approximation
            completion_tokens = count_tokens(json_dumps(openai_response_message.model_dump()))  # NOTE: this is also approximate
            total_tokens = prompt_tokens + completion_tokens
            usage = UsageStatistics(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
            )

        response_id = str(uuid.uuid4())
        return ChatCompletionResponse(
            id=response_id,
            choices=choices,
            model=self.llm_config.model,  # NOTE: Google API doesn't pass back model in the response
            created=get_utc_time(),
            usage=usage,
        )

    def get_gemini_endpoint_and_headers(
        self,
        key_in_header: bool = True,
        generate_content: bool = False,
    ) -> Tuple[str, dict]:
        """
        Dynamically generate the model endpoint and headers.

        Args:
            key_in_header: If True, pass the API key via the ``x-goog-api-key``
                header; otherwise append it as a ``?key=`` query parameter.
            generate_content: If True, target the ``:generateContent`` action.

        Returns:
            Tuple of (url, headers).
        """
        # Base endpoint plus the model path segment
        url = f"{self.llm_config.model_endpoint}/v1beta/models/{self.llm_config.model}"

        # Add extension for generating content if we're hitting the LM
        if generate_content:
            url += ":generateContent"

        # Decide if api key should be in header or not
        # Two ways to pass the key: https://ai.google.dev/tutorials/setup
        if key_in_header:
            headers = {"Content-Type": "application/json", "x-goog-api-key": model_settings.gemini_api_key}
        else:
            url += f"?key={model_settings.gemini_api_key}"
            headers = {"Content-Type": "application/json"}

        return url, headers

    def convert_tools_to_google_ai_format(self, tools: List[Tool]) -> List[dict]:
        """
        Convert OpenAI-style tool definitions to the Google AI format.

        OpenAI style:
          "tools": [{
            "type": "function",
            "function": {
                "name": "find_movies",
                "description": "find ....",
                "parameters": {
                  "type": "object",
                  "properties": {
                     PARAM: {
                       "type": PARAM_TYPE,  # eg "string"
                       "description": PARAM_DESCRIPTION,
                     },
                     ...
                  },
                  "required": List[str],
                }
            }
          }]

        Google AI style:
          "tools": [{
            "functionDeclarations": [{
              "name": "find_movies",
              "description": "find movie titles currently playing in theaters based on any description, genre, title words, etc.",
              "parameters": {
                "type": "OBJECT",
                "properties": {
                  "location": {
                    "type": "STRING",
                    "description": "The city and state, e.g. San Francisco, CA or a zip code e.g. 95616"
                  },
                  "description": {
                    "type": "STRING",
                    "description": "Any kind of description including category or genre, title words, attributes, etc."
                  }
                },
                "required": ["description"]
              }
            }, {
              "name": "find_theaters",
              ...
        """
        function_list = [
            dict(
                name=t.function.name,
                description=t.function.description,
                parameters=t.function.parameters,  # TODO need to unpack
            )
            for t in tools
        ]

        # Correct casing + add inner thoughts if needed
        for func in function_list:
            func["parameters"]["type"] = "OBJECT"
            for param_name, param_fields in func["parameters"]["properties"].items():
                # Gemini expects upper-cased type names (STRING, OBJECT, ...)
                param_fields["type"] = param_fields["type"].upper()
            # Add inner thoughts
            if self.llm_config.put_inner_thoughts_in_kwargs:
                from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION

                func["parameters"]["properties"][INNER_THOUGHTS_KWARG] = {
                    "type": "STRING",
                    "description": INNER_THOUGHTS_KWARG_DESCRIPTION,
                }
                # Schemas without a "required" list previously raised KeyError here
                func["parameters"].setdefault("required", []).append(INNER_THOUGHTS_KWARG)

        return [{"functionDeclarations": function_list}]

    def add_dummy_model_messages(self, messages: List[dict]) -> List[dict]:
        """Google AI API requires all function call returns are immediately followed by a 'model' role message.

        In Letta, the 'model' will often call a function (e.g. send_message) that itself yields to the user,
        so there is no natural follow-up 'model' role message.

        To satisfy the Google AI API restrictions, we can add a dummy 'yield' message
        with role == 'model' that is placed in between a function output
        (role == 'tool') and user message (role == 'user').
        """
        dummy_yield_message = {
            "role": "model",
            "parts": [{"text": f"{NON_USER_MSG_PREFIX}Function call returned, waiting for user response."}],
        }
        messages_with_padding = []
        for i, message in enumerate(messages):
            messages_with_padding.append(message)
            # Check if the current message role is 'tool' and the next message role is 'user'
            if message["role"] in ["tool", "function"] and (i + 1 < len(messages) and messages[i + 1]["role"] == "user"):
                messages_with_padding.append(dummy_yield_message)

        return messages_with_padding
|
||||
214
letta/llm_api/google_vertex_client.py
Normal file
214
letta/llm_api/google_vertex_client.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from google import genai
|
||||
from google.genai.types import FunctionCallingConfig, FunctionCallingConfigMode, GenerateContentResponse, ToolConfig
|
||||
|
||||
from letta.helpers.datetime_helpers import get_utc_time
|
||||
from letta.helpers.json_helpers import json_dumps
|
||||
from letta.llm_api.google_ai_client import GoogleAIClient
|
||||
from letta.local_llm.json_parser import clean_json_string_extra_backslash
|
||||
from letta.local_llm.utils import count_tokens
|
||||
from letta.schemas.message import Message as PydanticMessage
|
||||
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall, Message, ToolCall, UsageStatistics
|
||||
from letta.settings import model_settings
|
||||
from letta.utils import get_tool_call_id
|
||||
|
||||
|
||||
class GoogleVertexClient(GoogleAIClient):
    """LLM client for Gemini models served through Google Vertex AI.

    Reuses GoogleAIClient's request construction, but issues requests through
    the ``google-genai`` SDK (typed objects) instead of raw REST calls, and
    parses the SDK's GenerateContentResponse accordingly.
    """

    def request(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        # NOTE(review): a new client is constructed per request; consider caching
        # it on the instance if construction proves expensive.
        client = genai.Client(
            vertexai=True,
            project=model_settings.google_cloud_project,
            location=model_settings.google_cloud_location,
            http_options={"api_version": "v1"},
        )
        response = client.models.generate_content(
            model=self.llm_config.model,
            contents=request_data["contents"],
            config=request_data["config"],
        )
        return response.model_dump()

    def build_request_data(
        self,
        messages: List[PydanticMessage],
        tools: List[dict],
        tool_call: Optional[str],
    ) -> dict:
        """
        Constructs a request object in the expected data format for this client.

        Starts from the parent's REST-shaped request, then reshapes it for the
        google-genai SDK, which nests generation settings and tools under "config".
        """
        request_data = super().build_request_data(messages, tools, tool_call)
        request_data["config"] = request_data.pop("generation_config")
        request_data["config"]["tools"] = request_data.pop("tools")

        tool_config = ToolConfig(
            function_calling_config=FunctionCallingConfig(
                # ANY mode forces the model to predict only function calls
                mode=FunctionCallingConfigMode.ANY,
            )
        )
        request_data["config"]["tool_config"] = tool_config.model_dump()

        return request_data

    def convert_response_to_chat_completion(
        self,
        response_data: dict,
        input_messages: List[PydanticMessage],
    ) -> ChatCompletionResponse:
        """
        Converts custom response format from llm client into an OpenAI
        ChatCompletionsResponse object.

        Example:
            {
              "candidates": [
                {
                  "content": {
                    "parts": [
                      {
                        "text": " OK. Barbie is showing in two theaters in Mountain View, CA: AMC Mountain View 16 and Regal Edwards 14."
                      }
                    ]
                  }
                }
              ],
              "usageMetadata": {
                "promptTokenCount": 9,
                "candidatesTokenCount": 27,
                "totalTokenCount": 36
              }
            }
        """
        # Re-hydrate the dumped dict into the SDK's typed response object
        response = GenerateContentResponse(**response_data)
        choices = []
        index = 0
        for candidate in response.candidates:
            content = candidate.content

            role = content.role
            assert role == "model", f"Unknown role in response: {role}"

            parts = content.parts
            # TODO support parts / multimodal
            # TODO support parallel tool calling natively
            # TODO Alternative here is to throw away everything else except for the first part
            # NOTE(review): if parts is empty, openai_response_message is never
            # bound and the finish-reason handling below raises NameError.
            for response_message in parts:
                # Convert the actual message style to OpenAI style
                if response_message.function_call:
                    function_call = response_message.function_call
                    function_name = function_call.name
                    function_args = function_call.args
                    assert isinstance(function_args, dict), function_args

                    # NOTE: this also involves stripping the inner monologue out of the function
                    if self.llm_config.put_inner_thoughts_in_kwargs:
                        from letta.local_llm.constants import INNER_THOUGHTS_KWARG

                        assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}"
                        inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG)
                        assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
                    else:
                        inner_thoughts = None

                    # Google AI API doesn't generate tool call IDs
                    openai_response_message = Message(
                        role="assistant",  # NOTE: "model" -> "assistant"
                        content=inner_thoughts,
                        tool_calls=[
                            ToolCall(
                                id=get_tool_call_id(),
                                type="function",
                                function=FunctionCall(
                                    name=function_name,
                                    arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
                                ),
                            )
                        ],
                    )

                else:
                    # Inner thoughts are the content by default
                    inner_thoughts = response_message.text

                    # Google AI API doesn't generate tool call IDs
                    openai_response_message = Message(
                        role="assistant",  # NOTE: "model" -> "assistant"
                        content=inner_thoughts,
                    )

            # Google AI API uses different finish reason strings than OpenAI
            # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
            # see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api
            # Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER
            # see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason
            # NOTE(review): assumes candidate.finish_reason is always populated — confirm.
            finish_reason = candidate.finish_reason.value
            if finish_reason == "STOP":
                openai_finish_reason = (
                    "function_call"
                    if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0
                    else "stop"
                )
            elif finish_reason == "MAX_TOKENS":
                openai_finish_reason = "length"
            elif finish_reason in ("SAFETY", "RECITATION"):
                # Both safety blocks and recitation blocks map to OpenAI's content filter
                openai_finish_reason = "content_filter"
            else:
                raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}")

            choices.append(
                Choice(
                    finish_reason=openai_finish_reason,
                    index=index,
                    message=openai_response_message,
                )
            )
            index += 1

        # if len(choices) > 1:
        #     raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})")

        # NOTE: some of the Google AI APIs show UsageMetadata in the response, but it seems to not exist?
        # "usageMetadata": {
        #     "promptTokenCount": 9,
        #     "candidatesTokenCount": 27,
        #     "totalTokenCount": 36
        # }
        if response.usage_metadata:
            usage = UsageStatistics(
                prompt_tokens=response.usage_metadata.prompt_token_count,
                completion_tokens=response.usage_metadata.candidates_token_count,
                total_tokens=response.usage_metadata.total_token_count,
            )
        else:
            # Count it ourselves (very rough approximation based on JSON-serialized text)
            assert input_messages is not None, "Didn't get UsageMetadata from the API response, so input_messages is required"
            prompt_tokens = count_tokens(json_dumps(input_messages))  # NOTE: this is a very rough approximation
            completion_tokens = count_tokens(json_dumps(openai_response_message.model_dump()))  # NOTE: this is also approximate
            total_tokens = prompt_tokens + completion_tokens
            usage = UsageStatistics(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
            )

        response_id = str(uuid.uuid4())
        return ChatCompletionResponse(
            id=response_id,
            choices=choices,
            model=self.llm_config.model,  # NOTE: Google API doesn't pass back model in the response
            created=get_utc_time(),
            usage=usage,
        )
|
||||
48
letta/llm_api/llm_client.py
Normal file
48
letta/llm_api/llm_client.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from typing import Optional
|
||||
|
||||
from letta.llm_api.llm_client_base import LLMClientBase
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
|
||||
|
||||
class LLMClient:
|
||||
"""Factory class for creating LLM clients based on the model endpoint type."""
|
||||
|
||||
@staticmethod
|
||||
def create(
|
||||
agent_id: str,
|
||||
llm_config: LLMConfig,
|
||||
put_inner_thoughts_first: bool = True,
|
||||
actor_id: Optional[str] = None,
|
||||
) -> Optional[LLMClientBase]:
|
||||
"""
|
||||
Create an LLM client based on the model endpoint type.
|
||||
|
||||
Args:
|
||||
agent_id: Unique identifier for the agent
|
||||
llm_config: Configuration for the LLM model
|
||||
put_inner_thoughts_first: Whether to put inner thoughts first in the response
|
||||
use_structured_output: Whether to use structured output
|
||||
use_tool_naming: Whether to use tool naming
|
||||
actor_id: Optional actor identifier
|
||||
|
||||
Returns:
|
||||
An instance of LLMClientBase subclass
|
||||
|
||||
Raises:
|
||||
ValueError: If the model endpoint type is not supported
|
||||
"""
|
||||
match llm_config.model_endpoint_type:
|
||||
case "google_ai":
|
||||
from letta.llm_api.google_ai_client import GoogleAIClient
|
||||
|
||||
return GoogleAIClient(
|
||||
agent_id=agent_id, llm_config=llm_config, put_inner_thoughts_first=put_inner_thoughts_first, actor_id=actor_id
|
||||
)
|
||||
case "google_vertex":
|
||||
from letta.llm_api.google_vertex_client import GoogleVertexClient
|
||||
|
||||
return GoogleVertexClient(
|
||||
agent_id=agent_id, llm_config=llm_config, put_inner_thoughts_first=put_inner_thoughts_first, actor_id=actor_id
|
||||
)
|
||||
case _:
|
||||
return None
|
||||
129
letta/llm_api/llm_client_base.py
Normal file
129
letta/llm_api/llm_client_base.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from abc import abstractmethod
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from openai import AsyncStream, Stream
|
||||
from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
|
||||
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.message import Message
|
||||
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
|
||||
from letta.tracing import log_event
|
||||
|
||||
|
||||
class LLMClientBase:
    """
    Abstract base class for LLM clients, formatting the request objects,
    handling the downstream request and parsing into chat completions response format
    """

    def __init__(
        self,
        agent_id: str,
        llm_config: LLMConfig,
        put_inner_thoughts_first: Optional[bool] = True,
        use_structured_output: Optional[bool] = True,
        use_tool_naming: bool = True,
        actor_id: Optional[str] = None,
    ):
        """
        Args:
            agent_id: Unique identifier for the agent this client serves.
            llm_config: Model/provider configuration used to build requests.
            put_inner_thoughts_first: Whether inner thoughts go first in the response.
            use_structured_output: Whether to request structured output (stored for subclasses).
            use_tool_naming: Whether to use tool naming (stored for subclasses).
            actor_id: Optional actor identifier.
        """
        self.agent_id = agent_id
        self.llm_config = llm_config
        self.put_inner_thoughts_first = put_inner_thoughts_first
        # Previously accepted but silently discarded; keep them so subclasses can use them.
        self.use_structured_output = use_structured_output
        self.use_tool_naming = use_tool_naming
        self.actor_id = actor_id

    def send_llm_request(
        self,
        messages: List[Message],
        tools: Optional[List[dict]] = None,  # TODO: change to Tool object
        tool_call: Optional[str] = None,
        stream: bool = False,
        first_message: bool = False,
        force_tool_call: Optional[str] = None,
    ) -> Union[ChatCompletionResponse, Stream[ChatCompletionChunk]]:
        """
        Issues a request to the downstream model endpoint and parses response.
        If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over.
        Otherwise returns a ChatCompletionResponse.

        NOTE: first_message and force_tool_call are accepted for interface
        compatibility but are not used in this base implementation.
        """
        request_data = self.build_request_data(messages, tools, tool_call)
        log_event(name="llm_request_sent", attributes=request_data)
        if stream:
            return self.stream(request_data)
        response_data = self.request(request_data)
        log_event(name="llm_response_received", attributes=response_data)
        return self.convert_response_to_chat_completion(response_data, messages)

    async def send_llm_request_async(
        self,
        messages: List[Message],
        tools: Optional[List[dict]] = None,  # TODO: change to Tool object
        tool_call: Optional[str] = None,
        stream: bool = False,
        first_message: bool = False,
        force_tool_call: Optional[str] = None,
    ) -> Union[ChatCompletionResponse, AsyncStream[ChatCompletionChunk]]:
        """
        Issues a request to the downstream model endpoint.
        If stream=True, returns an AsyncStream[ChatCompletionChunk] that can be async iterated over.
        Otherwise returns a ChatCompletionResponse.

        NOTE: first_message and force_tool_call are accepted for interface
        compatibility but are not used in this base implementation.
        """
        request_data = self.build_request_data(messages, tools, tool_call)
        log_event(name="llm_request_sent", attributes=request_data)
        if stream:
            return await self.stream_async(request_data)
        response_data = await self.request_async(request_data)
        log_event(name="llm_response_received", attributes=response_data)
        return self.convert_response_to_chat_completion(response_data, messages)

    @abstractmethod
    def build_request_data(
        self,
        messages: List[Message],
        tools: List[dict],
        tool_call: Optional[str],
    ) -> dict:
        """
        Constructs a request object in the expected data format for this client.
        """
        raise NotImplementedError

    @abstractmethod
    def request(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        raise NotImplementedError

    @abstractmethod
    async def request_async(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        raise NotImplementedError

    @abstractmethod
    def convert_response_to_chat_completion(
        self,
        response_data: dict,
        input_messages: List[Message],
    ) -> ChatCompletionResponse:
        """
        Converts custom response format from llm client into an OpenAI
        ChatCompletionsResponse object.
        """
        raise NotImplementedError

    @abstractmethod
    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
        """
        Performs underlying streaming request to llm and returns raw response.
        """
        raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}")

    @abstractmethod
    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
        """
        Performs underlying streaming request to llm and returns raw response.
        """
        raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}")
|
||||
@@ -17,6 +17,7 @@ from letta.embeddings import embedding_model
|
||||
from letta.errors import InvalidInnerMonologueError, InvalidToolCallError, MissingInnerMonologueError, MissingToolCallError
|
||||
from letta.helpers.json_helpers import json_dumps
|
||||
from letta.llm_api.llm_api_tools import create
|
||||
from letta.llm_api.llm_client import LLMClient
|
||||
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
|
||||
from letta.schemas.agent import AgentState
|
||||
from letta.schemas.embedding_config import EmbeddingConfig
|
||||
@@ -103,12 +104,23 @@ def check_first_response_is_valid_for_llm_endpoint(filename: str, validate_inner
|
||||
messages = client.server.agent_manager.get_in_context_messages(agent_id=full_agent_state.id, actor=client.user)
|
||||
agent = Agent(agent_state=full_agent_state, interface=None, user=client.user)
|
||||
|
||||
response = create(
|
||||
llm_client = LLMClient.create(
|
||||
agent_id=agent_state.id,
|
||||
llm_config=agent_state.llm_config,
|
||||
user_id=str(uuid.UUID(int=1)), # dummy user_id
|
||||
messages=messages,
|
||||
functions=[t.json_schema for t in agent.agent_state.tools],
|
||||
actor_id=str(uuid.UUID(int=1)),
|
||||
)
|
||||
if llm_client:
|
||||
response = llm_client.send_llm_request(
|
||||
messages=messages,
|
||||
tools=[t.json_schema for t in agent.agent_state.tools],
|
||||
)
|
||||
else:
|
||||
response = create(
|
||||
llm_config=agent_state.llm_config,
|
||||
user_id=str(uuid.UUID(int=1)), # dummy user_id
|
||||
messages=messages,
|
||||
functions=[t.json_schema for t in agent.agent_state.tools],
|
||||
)
|
||||
|
||||
# Basic check
|
||||
assert response is not None, response
|
||||
|
||||
Reference in New Issue
Block a user