feat: support togetherAI via /completions (#2045)

Charles Packer
2024-11-18 15:15:05 -08:00
committed by GitHub
parent cada5976da
commit f57dc28552
14 changed files with 364 additions and 6 deletions
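In practical terms, this adds a new "together" model_endpoint_type that routes requests through Together's /completions endpoint. A minimal sketch of the corresponding LLMConfig, using only fields that appear in this diff (the module path letta.schemas.llm_config is assumed):

from letta.schemas.llm_config import LLMConfig

# Mirrors the Llama 3.1 405B JSON config added further down
together_config = LLMConfig(
    model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
    model_endpoint_type="together",               # new endpoint type added in this commit
    model_endpoint="https://api.together.ai/v1",
    model_wrapper="chatml",                       # prompt formatter applied before hitting /completions
    context_window=16000,
)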

.github/workflows/test_together.yml

@@ -0,0 +1,104 @@
name: Together Llama 3.1 70b Capabilities Test
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
test:
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout
uses: actions/checkout@v4
- name: "Setup Python, Poetry and Dependencies"
uses: packetcoders/action-setup-cache-python-poetry@main
with:
python-version: "3.12"
poetry-version: "1.8.2"
install-args: "-E dev -E external-tools"
- name: Test first message contains expected function call and inner monologue
id: test_first_message
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_returns_valid_first_message
echo "TEST_FIRST_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model sends message with keyword
id: test_keyword_message
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_returns_keyword
echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model uses external tool correctly
id: test_external_tool
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_uses_external_tool
echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model recalls chat memory
id: test_chat_memory
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_recall_chat_memory
echo "TEST_CHAT_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model uses 'archival_memory_search' to find secret
id: test_archival_memory
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_archival_memory_retrieval
echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Test model can edit core memories
id: test_core_memory
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
run: |
poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_edit_core_memory
echo "TEST_CORE_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV
continue-on-error: true
- name: Summarize test results
if: always()
run: |
echo "Test Results Summary:"
# If the exit code is empty, treat it as a failure (❌)
echo "Test first message: $([[ -z $TEST_FIRST_MESSAGE_EXIT_CODE || $TEST_FIRST_MESSAGE_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)"
echo "Test model sends message with keyword: $([[ -z $TEST_KEYWORD_MESSAGE_EXIT_CODE || $TEST_KEYWORD_MESSAGE_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)"
echo "Test model uses external tool: $([[ -z $TEST_EXTERNAL_TOOL_EXIT_CODE || $TEST_EXTERNAL_TOOL_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)"
echo "Test model recalls chat memory: $([[ -z $TEST_CHAT_MEMORY_EXIT_CODE || $TEST_CHAT_MEMORY_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)"
echo "Test model uses 'archival_memory_search' to find secret: $([[ -z $TEST_ARCHIVAL_MEMORY_EXIT_CODE || $TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)"
echo "Test model can edit core memories: $([[ -z $TEST_CORE_MEMORY_EXIT_CODE || $TEST_CORE_MEMORY_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)"
# Check if any test failed (either non-zero or unset exit code)
if [[ -z $TEST_FIRST_MESSAGE_EXIT_CODE || $TEST_FIRST_MESSAGE_EXIT_CODE -ne 0 || \
-z $TEST_KEYWORD_MESSAGE_EXIT_CODE || $TEST_KEYWORD_MESSAGE_EXIT_CODE -ne 0 || \
-z $TEST_EXTERNAL_TOOL_EXIT_CODE || $TEST_EXTERNAL_TOOL_EXIT_CODE -ne 0 || \
-z $TEST_CHAT_MEMORY_EXIT_CODE || $TEST_CHAT_MEMORY_EXIT_CODE -ne 0 || \
-z $TEST_ARCHIVAL_MEMORY_EXIT_CODE || $TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 || \
-z $TEST_CORE_MEMORY_EXIT_CODE || $TEST_CORE_MEMORY_EXIT_CODE -ne 0 ]]; then
echo "Some tests failed."
exit 78
fi
continue-on-error: true


@@ -19,7 +19,7 @@ IN_CONTEXT_MEMORY_KEYWORD = "CORE_MEMORY"
TOOL_CALL_ID_MAX_LEN = 29
# minimum context window size
MIN_CONTEXT_WINDOW = 4000
MIN_CONTEXT_WINDOW = 4096
# embeddings
MAX_EMBEDDING_DIM = 4096 # maximum supported embedding size - do NOT change or else DBs will need to be reset


@@ -33,6 +33,7 @@ from letta.schemas.openai.chat_completion_request import (
cast_message_to_subtype,
)
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
from letta.settings import ModelSettings
from letta.streaming_interface import (
AgentChunkStreamingInterface,
AgentRefreshStreamingInterface,
@@ -126,6 +127,7 @@ def create(
from letta.settings import model_settings
model_settings = model_settings
assert isinstance(model_settings, ModelSettings)
printd(f"Using model {llm_config.model_endpoint_type}, endpoint: {llm_config.model_endpoint}")
@@ -326,6 +328,33 @@ def create(
return response
elif llm_config.model_endpoint_type == "together":
"""TogetherAI endpoint that goes via /completions instead of /chat/completions"""
if stream:
raise NotImplementedError(f"Streaming not yet implemented for TogetherAI (via the /completions endpoint).")
if model_settings.together_api_key is None and llm_config.model_endpoint == "https://api.together.ai/v1/completions":
raise ValueError(f"TogetherAI key is missing from letta config file")
return get_chat_completion(
model=llm_config.model,
messages=messages,
functions=functions,
functions_python=functions_python,
function_call=function_call,
context_window=llm_config.context_window,
endpoint=llm_config.model_endpoint,
endpoint_type="vllm", # NOTE: use the vLLM path through /completions
wrapper=llm_config.model_wrapper,
user=str(user_id),
# hint
first_message=first_message,
# auth-related
auth_type="bearer_token", # NOTE: Together expects bearer token auth
auth_key=model_settings.together_api_key,
)
# local model
else:
if stream:
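Since the together branch above reuses the vLLM path through /completions with bearer-token auth, the eventual HTTP call is essentially an OpenAI-style completions request against Together's API. A rough standalone sketch of that request shape (illustrative only, not Letta's code; the ChatML prompt is what the model_wrapper produces):

import os
import requests

response = requests.post(
    "https://api.together.ai/v1/completions",
    headers={"Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"},
    json={
        "model": "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
        # the chatml model_wrapper renders the chat messages into a raw prompt like this
        "prompt": "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n",
        "max_tokens": 128,
    },
    timeout=60,
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])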


@@ -536,7 +536,6 @@ def openai_chat_completions_request(
tool["function"] = convert_to_structured_output(tool["function"])
response_json = make_post_request(url, headers, data)
return ChatCompletionResponse(**response_json)


@@ -184,6 +184,7 @@ def num_tokens_from_messages(messages: List[dict], model: str = "gpt-4") -> int:
https://community.openai.com/t/how-to-calculate-the-tokens-when-using-function-call/266573/11
"""
try:
# Attempt to search for the encoding based on the model string
encoding = tiktoken.encoding_for_model(model)
except KeyError:
# print("Warning: model not found. Using cl100k_base encoding.")


@@ -2,7 +2,7 @@ from typing import List, Optional
from pydantic import BaseModel, Field, model_validator
from letta.constants import LLM_MAX_TOKENS
from letta.constants import LLM_MAX_TOKENS, MIN_CONTEXT_WINDOW
from letta.llm_api.azure_openai import (
get_azure_chat_completions_endpoint,
get_azure_embeddings_endpoint,
@@ -67,10 +67,15 @@ class OpenAIProvider(Provider):
extra_params = {"supported_parameters": "tools"} if "openrouter.ai" in self.base_url else None
response = openai_get_model_list(self.base_url, api_key=self.api_key, extra_params=extra_params)
assert "data" in response, f"OpenAI model query response missing 'data' field: {response}"
# TogetherAI's response is missing the 'data' field
# assert "data" in response, f"OpenAI model query response missing 'data' field: {response}"
if "data" in response:
data = response["data"]
else:
data = response
configs = []
for model in response["data"]:
for model in data:
assert "id" in model, f"OpenAI model missing 'id' field: {model}"
model_name = model["id"]
@@ -82,6 +87,32 @@ class OpenAIProvider(Provider):
if not context_window_size:
continue
# TogetherAI includes the type, which we can use to filter out embedding models
if self.base_url == "https://api.together.ai/v1":
if "type" in model and model["type"] != "chat":
continue
# for TogetherAI, we need to skip the models that don't support JSON mode / function calling
# requests.exceptions.HTTPError: HTTP error occurred: 400 Client Error: Bad Request for url: https://api.together.ai/v1/chat/completions | Status code: 400, Message: {
# "error": {
# "message": "mistralai/Mixtral-8x7B-v0.1 is not supported for JSON mode/function calling",
# "type": "invalid_request_error",
# "param": null,
# "code": "constraints_model"
# }
# }
if "config" not in model:
continue
if "chat_template" not in model["config"]:
continue
if model["config"]["chat_template"] is None:
continue
if "tools" not in model["config"]["chat_template"]:
continue
# if "config" in data and "chat_template" in data["config"] and "tools" not in data["config"]["chat_template"]:
# continue
configs.append(
LLMConfig(model=model_name, model_endpoint_type="openai", model_endpoint=self.base_url, context_window=context_window_size)
)
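To make the Together-specific filtering above concrete, here is a hypothetical entry from the /models listing and the keep/skip decision it would receive (field names are the ones checked above; values are illustrative):

# Only applied when base_url is https://api.together.ai/v1
candidate = {
    "id": "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
    "type": "chat",
    "context_length": 8192,
    "config": {"chat_template": "{% ... tools ... %}"},
}

keep = (
    candidate.get("type", "chat") == "chat"                                  # non-chat models are skipped
    and "tools" in (candidate.get("config", {}).get("chat_template") or "")  # no "tools" in the template -> no function calling
)
# keep is True here; mistralai/Mixtral-8x7B-v0.1, whose template lacks "tools", would be skipped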
@@ -325,6 +356,113 @@ class GroqProvider(OpenAIProvider):
raise NotImplementedError
class TogetherProvider(OpenAIProvider):
"""TogetherAI provider that uses the /completions API
TogetherAI can also be used via the /chat/completions API
by setting OPENAI_API_KEY and OPENAI_API_BASE to the TogetherAI API key
and API URL; however, /completions is preferred because TogetherAI's /chat/completions
function calling support is limited.
"""
name: str = "together"
base_url: str = "https://api.together.ai/v1"
api_key: str = Field(..., description="API key for the TogetherAI API.")
default_prompt_formatter: str = Field(..., description="Default prompt formatter (aka model wrapper) to use on vLLM /completions API.")
def list_llm_models(self) -> List[LLMConfig]:
from letta.llm_api.openai import openai_get_model_list
response = openai_get_model_list(self.base_url, api_key=self.api_key)
# TogetherAI's response is missing the 'data' field
# assert "data" in response, f"OpenAI model query response missing 'data' field: {response}"
if "data" in response:
data = response["data"]
else:
data = response
configs = []
for model in data:
assert "id" in model, f"TogetherAI model missing 'id' field: {model}"
model_name = model["id"]
if "context_length" in model:
# Context length is returned by TogetherAI as "context_length"
context_window_size = model["context_length"]
else:
context_window_size = self.get_model_context_window_size(model_name)
# We need the context length for embeddings too
if not context_window_size:
continue
# Skip models that are too small for Letta
if context_window_size <= MIN_CONTEXT_WINDOW:
continue
# TogetherAI includes the type, which we can use to filter out embedding models
if "type" in model and model["type"] not in ["chat", "language"]:
continue
configs.append(
LLMConfig(
model=model_name,
model_endpoint_type="together",
model_endpoint=self.base_url,
model_wrapper=self.default_prompt_formatter,
context_window=context_window_size,
)
)
return configs
def list_embedding_models(self) -> List[EmbeddingConfig]:
# TODO re-enable once we figure out how to pass API keys through properly
return []
# from letta.llm_api.openai import openai_get_model_list
# response = openai_get_model_list(self.base_url, api_key=self.api_key)
# # TogetherAI's response is missing the 'data' field
# # assert "data" in response, f"OpenAI model query response missing 'data' field: {response}"
# if "data" in response:
# data = response["data"]
# else:
# data = response
# configs = []
# for model in data:
# assert "id" in model, f"TogetherAI model missing 'id' field: {model}"
# model_name = model["id"]
# if "context_length" in model:
# # Context length is returned in OpenRouter as "context_length"
# context_window_size = model["context_length"]
# else:
# context_window_size = self.get_model_context_window_size(model_name)
# if not context_window_size:
# continue
# # TogetherAI includes the type, which we can use to filter out embedding models
# if "type" in model and model["type"] not in ["embedding"]:
# continue
# configs.append(
# EmbeddingConfig(
# embedding_model=model_name,
# embedding_endpoint_type="openai",
# embedding_endpoint=self.base_url,
# embedding_dim=context_window_size,
# embedding_chunk_size=300, # TODO: change?
# )
# )
# return configs
class GoogleAIProvider(Provider):
# gemini
api_key: str = Field(..., description="API key for the Google AI API.")


@@ -35,6 +35,7 @@ class LLMConfig(BaseModel):
"vllm",
"hugging-face",
"mistral",
"together", # completions endpoint
] = Field(..., description="The endpoint type for the model.")
model_endpoint: Optional[str] = Field(None, description="The endpoint for the model.")
model_wrapper: Optional[str] = Field(None, description="The wrapper for the model.")


@@ -46,6 +46,7 @@ class Choice(BaseModel):
index: int
message: Message
logprobs: Optional[Dict[str, Union[List[MessageContentLogProb], None]]] = None
seed: Optional[int] = None # found in TogetherAI
class UsageStatistics(BaseModel):


@@ -49,6 +49,7 @@ from letta.providers import (
OllamaProvider,
OpenAIProvider,
Provider,
TogetherProvider,
VLLMChatCompletionsProvider,
VLLMCompletionsProvider,
)
@@ -303,7 +304,18 @@ class SyncServer(Server):
)
)
if model_settings.groq_api_key:
self._enabled_providers.append(GroqProvider(api_key=model_settings.groq_api_key))
self._enabled_providers.append(
GroqProvider(
api_key=model_settings.groq_api_key,
)
)
if model_settings.together_api_key:
self._enabled_providers.append(
TogetherProvider(
api_key=model_settings.together_api_key,
default_prompt_formatter=model_settings.default_prompt_formatter,
)
)
if model_settings.vllm_api_base:
# vLLM exposes both a /chat/completions and a /completions endpoint
self._enabled_providers.append(


@@ -43,6 +43,9 @@ class ModelSettings(BaseSettings):
# google ai
gemini_api_key: Optional[str] = None
# together
together_api_key: Optional[str] = None
# vLLM
vllm_api_base: Optional[str] = None
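ModelSettings is a pydantic BaseSettings class, so the new together_api_key field should be picked up from the TOGETHER_API_KEY environment variable with no extra wiring. A sketch of the assumed behavior (the variable must be set before letta.settings is first imported, since model_settings is constructed at import time):

import os
os.environ["TOGETHER_API_KEY"] = "tok-example"  # placeholder value, set before importing letta.settings

from letta.settings import model_settings

# server.py above checks this field to decide whether to enable TogetherProvider
print(model_settings.together_api_key)  # -> tok-example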


@@ -0,0 +1,7 @@
{
"context_window": 16000,
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
"model_endpoint_type": "together",
"model_endpoint": "https://api.together.ai/v1",
"model_wrapper": "chatml"
}


@@ -0,0 +1,7 @@
{
"context_window": 8192,
"model": "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
"model_endpoint_type": "together",
"model_endpoint": "https://api.together.ai/v1",
"model_wrapper": "chatml"
}
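The capability tests below reference these JSON files by name; presumably they are parsed straight into an LLMConfig, roughly like this (the path is illustrative; the tests build it from llm_config_dir):

import json
from letta.schemas.llm_config import LLMConfig

# Illustrative path; the tests join llm_config_dir with the filename
with open("tests/configs/llm_model_configs/together-llama-3-70b.json") as f:
    llm_config = LLMConfig(**json.load(f))

assert llm_config.model_endpoint_type == "together"
assert llm_config.context_window == 8192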


@@ -340,3 +340,49 @@ def test_gemini_pro_15_edit_core_memory():
response = check_agent_edit_core_memory(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
# ======================================================================================================================
# TOGETHER TESTS
# ======================================================================================================================
def test_together_llama_3_70b_returns_valid_first_message():
filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
response = check_first_response_is_valid_for_llm_endpoint(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_returns_keyword():
keyword = "banana"
filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
response = check_response_contains_keyword(filename, keyword=keyword)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_uses_external_tool():
filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
response = check_agent_uses_external_tool(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_recall_chat_memory():
filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
response = check_agent_recall_chat_memory(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_archival_memory_retrieval():
filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
response = check_agent_archival_memory_retrieval(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")
def test_together_llama_3_70b_edit_core_memory():
filename = os.path.join(llm_config_dir, "together-llama-3-70b.json")
response = check_agent_edit_core_memory(filename)
# Log out successful response
print(f"Got successful response from client: \n\n{response}")


@@ -8,6 +8,7 @@ from letta.providers import (
MistralProvider,
OllamaProvider,
OpenAIProvider,
TogetherProvider,
)
from letta.settings import model_settings
@@ -70,6 +71,15 @@ def test_mistral():
print([m.model for m in models])
def test_together():
provider = TogetherProvider(api_key=os.getenv("TOGETHER_API_KEY"), default_prompt_formatter="chatml")
models = provider.list_llm_models()
print([m.model for m in models])
embedding_models = provider.list_embedding_models()
print([m.embedding_model for m in embedding_models])
# def test_vllm():
# provider = VLLMProvider(base_url=os.getenv("VLLM_API_BASE"))
# models = provider.list_llm_models()