fix: google clients thinking config (#2414)

Co-authored-by: Sarah Wooders <sarahwooders@gmail.com>
This commit is contained in:
cthomas
2025-05-24 09:42:34 -07:00
committed by GitHub
parent 6c9fef84e9
commit eaeac54798
2 changed files with 6 additions and 20 deletions

View File

@@ -7,10 +7,7 @@ from letta.errors import ErrorCode, LLMAuthenticationError, LLMError
from letta.llm_api.google_constants import GOOGLE_MODEL_FOR_API_KEY_CHECK
from letta.llm_api.google_vertex_client import GoogleVertexClient
from letta.log import get_logger
from letta.schemas.llm_config import LLMConfig
from letta.schemas.message import Message as PydanticMessage
from letta.settings import model_settings
from letta.tracing import trace_method
logger = get_logger(__name__)
@@ -20,18 +17,6 @@ class GoogleAIClient(GoogleVertexClient):
def _get_client(self):
    """Return a Gemini client authenticated with the configured API key."""
    api_key = model_settings.gemini_api_key
    return genai.Client(api_key=api_key)
@trace_method
def build_request_data(
    self,
    messages: List[PydanticMessage],
    llm_config: LLMConfig,
    tools: List[dict],
    force_tool_call: Optional[str] = None,
) -> dict:
    """Build the Gemini request payload, without any thinking config.

    Delegates to the Vertex base client to construct the request, then
    strips ``thinking_config`` from the generation config, since this
    client does not pass it through.

    Args:
        messages: Conversation history to send.
        llm_config: Model configuration for the request.
        tools: Tool/function definitions available to the model.
        force_tool_call: Optional tool name the model must call.

    Returns:
        The request payload dict produced by the base client, with
        ``config.thinking_config`` removed if present.
    """
    request = super().build_request_data(messages, llm_config, tools, force_tool_call)
    # Use pop() with a default instead of del: the base client only sets
    # thinking_config for some models (e.g. when "flash" is in the model
    # name), so an unconditional del would raise KeyError when absent.
    request["config"].pop("thinking_config", None)
    return request
def get_gemini_endpoint_and_headers(
base_url: str, model: Optional[str], api_key: str, key_in_header: bool = True, generate_content: bool = False

View File

@@ -241,13 +241,14 @@ class GoogleVertexClient(LLMClientBase):
)
request_data["config"]["tool_config"] = tool_config.model_dump()
# Add thinking_config
# Add thinking_config for flash
# If enable_reasoner is False, set thinking_budget to 0
# Otherwise, use the value from max_reasoning_tokens
thinking_config = ThinkingConfig(
thinking_budget=llm_config.max_reasoning_tokens if llm_config.enable_reasoner else 0,
)
request_data["config"]["thinking_config"] = thinking_config.model_dump()
if "flash" in llm_config.model:
thinking_config = ThinkingConfig(
thinking_budget=llm_config.max_reasoning_tokens if llm_config.enable_reasoner else 0,
)
request_data["config"]["thinking_config"] = thinking_config.model_dump()
return request_data