feat: fix streaming put_inner_thoughts_in_kwargs (#1913)

2024-10-21 17:07:20 -07:00
parent e940511a6f
commit 1a93b85bfd
6 changed files with 677 additions and 126 deletions
--- a/letta/llm_api/openai.py
+++ b/letta/llm_api/openai.py
@@ -9,7 +9,11 @@ from httpx_sse._exceptions import SSEError

 from letta.constants import OPENAI_CONTEXT_WINDOW_ERROR_SUBSTRING
 from letta.errors import LLMError
-from letta.llm_api.helpers import add_inner_thoughts_to_functions, make_post_request
+from letta.llm_api.helpers import (
+    add_inner_thoughts_to_functions,
+    convert_to_structured_output,
+    make_post_request,
+)
 from letta.local_llm.constants import (
    INNER_THOUGHTS_KWARG,
    INNER_THOUGHTS_KWARG_DESCRIPTION,
@@ -112,7 +116,7 @@ def build_openai_chat_completions_request(
    use_tool_naming: bool,
    max_tokens: Optional[int],
 ) -> ChatCompletionRequest:
-    if llm_config.put_inner_thoughts_in_kwargs:
+    if functions and llm_config.put_inner_thoughts_in_kwargs:
        functions = add_inner_thoughts_to_functions(
            functions=functions,
            inner_thoughts_key=INNER_THOUGHTS_KWARG,
@@ -154,8 +158,8 @@ def build_openai_chat_completions_request(
        )
        # https://platform.openai.com/docs/guides/text-generation/json-mode
        # only supported by gpt-4o, gpt-4-turbo, or gpt-3.5-turbo
-        if "gpt-4o" in llm_config.model or "gpt-4-turbo" in llm_config.model or "gpt-3.5-turbo" in llm_config.model:
-            data.response_format = {"type": "json_object"}
+        # if "gpt-4o" in llm_config.model or "gpt-4-turbo" in llm_config.model or "gpt-3.5-turbo" in llm_config.model:
+        # data.response_format = {"type": "json_object"}

    if "inference.memgpt.ai" in llm_config.model_endpoint:
        # override user id for inference.memgpt.ai
@@ -362,6 +366,8 @@ def openai_chat_completions_process_stream(
    chat_completion_response.usage.completion_tokens = n_chunks
    chat_completion_response.usage.total_tokens = prompt_tokens + n_chunks

+    assert len(chat_completion_response.choices) > 0, chat_completion_response
+
    # printd(chat_completion_response)
    return chat_completion_response

@@ -461,6 +467,13 @@ def openai_chat_completions_request_stream(
        data.pop("tools")
        data.pop("tool_choice", None)  # extra safe,  should exist always (default="auto")

+    if "tools" in data:
+        for tool in data["tools"]:
+            # tool["strict"] = True
+            tool["function"] = convert_to_structured_output(tool["function"])
+
+    # print(f"\n\n\n\nData[tools]: {json.dumps(data['tools'], indent=2)}")
+
    printd(f"Sending request to {url}")
    try:
        return _sse_post(url=url, data=data, headers=headers)