fix: patches to the API for non-streaming OAI proxy backends (#1653)

2024-08-16 09:48:28 -07:00
parent 1dfadcb21f
commit 1b64ccbbc1
4 changed files with 14 additions and 4 deletions
--- a/memgpt/llm_api/llm_api_tools.py
+++ b/memgpt/llm_api/llm_api_tools.py
@@ -272,7 +272,9 @@ def create(
        else:
            inner_thoughts_in_kwargs = True if inner_thoughts_in_kwargs == OptionState.YES else False

-        assert isinstance(inner_thoughts_in_kwargs, bool), type(inner_thoughts_in_kwargs)
+        if not isinstance(inner_thoughts_in_kwargs, bool):
+            warnings.warn(f"Bad type detected: {type(inner_thoughts_in_kwargs)}")
+            inner_thoughts_in_kwargs = bool(inner_thoughts_in_kwargs)
        if inner_thoughts_in_kwargs:
            functions = add_inner_thoughts_to_functions(
                functions=functions,
--- a/memgpt/local_llm/utils.py
+++ b/memgpt/local_llm/utils.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 from typing import List

 import requests
@@ -191,9 +192,13 @@ def num_tokens_from_messages(messages: List[dict], model: str = "gpt-4") -> int:
        # print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
-        raise NotImplementedError(
+        warnings.warn(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
+        return num_tokens_from_messages(messages, model="gpt-4-0613")
+        # raise NotImplementedError(
+        # f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
+        # )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
--- a/memgpt/server/rest_api/agents/message.py
+++ b/memgpt/server/rest_api/agents/message.py
@@ -95,7 +95,10 @@ async def send_message_to_agent(
 ) -> Union[StreamingResponse, UserMessageResponse]:
    """Split off into a separate function so that it can be imported in the /chat/completion proxy."""

-    include_final_message = True
+    # TODO: temporary hack - token streaming is forced off for any endpoint other than the official OpenAI API; remove once streaming is configured in the model config
+    if server.server_llm_config.model_endpoint != "https://api.openai.com/v1":
+        stream_tokens = False
+
    # handle the legacy mode streaming
    if stream_legacy:
        # NOTE: override
--- a/memgpt/server/rest_api/interface.py
+++ b/memgpt/server/rest_api/interface.py
@@ -500,7 +500,7 @@ class StreamingServerInterface(AgentChunkStreamingInterface):

                    processed_chunk = {
                        "function_call": {
-                            # "id": function_call.id,
+                            "id": function_call.id,
                            "name": function_call.function["name"],
                            "arguments": function_call.function["arguments"],
                        },