From f57dc28552a020eea5e943bdfc89d476bc84cb99 Mon Sep 17 00:00:00 2001 From: Charles Packer Date: Mon, 18 Nov 2024 15:15:05 -0800 Subject: [PATCH] feat: support togetherAI via `/completions` (#2045) --- .github/workflows/test_together.yml | 104 +++++++++++++ letta/constants.py | 2 +- letta/llm_api/llm_api_tools.py | 29 ++++ letta/llm_api/openai.py | 1 - letta/local_llm/utils.py | 1 + letta/providers.py | 144 +++++++++++++++++- letta/schemas/llm_config.py | 1 + .../openai/chat_completion_response.py | 1 + letta/server/server.py | 14 +- letta/settings.py | 3 + .../together-llama-3-1-405b.json | 7 + .../together-llama-3-70b.json | 7 + tests/test_model_letta_perfomance.py | 46 ++++++ tests/test_providers.py | 10 ++ 14 files changed, 364 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/test_together.yml create mode 100644 tests/configs/llm_model_configs/together-llama-3-1-405b.json create mode 100644 tests/configs/llm_model_configs/together-llama-3-70b.json diff --git a/.github/workflows/test_together.yml b/.github/workflows/test_together.yml new file mode 100644 index 00000000..7fd399d2 --- /dev/null +++ b/.github/workflows/test_together.yml @@ -0,0 +1,104 @@ +name: Together Llama 3.1 70b Capabilities Test + +env: + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: "Setup Python, Poetry and Dependencies" + uses: packetcoders/action-setup-cache-python-poetry@main + with: + python-version: "3.12" + poetry-version: "1.8.2" + install-args: "-E dev -E external-tools" + + - name: Test first message contains expected function call and inner monologue + id: test_first_message + env: + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + run: | + poetry run pytest -s -vv 
tests/test_model_letta_perfomance.py::test_together_llama_3_70b_returns_valid_first_message + echo "TEST_FIRST_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model sends message with keyword + id: test_keyword_message + env: + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_returns_keyword + echo "TEST_KEYWORD_MESSAGE_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model uses external tool correctly + id: test_external_tool + env: + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_uses_external_tool + echo "TEST_EXTERNAL_TOOL_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model recalls chat memory + id: test_chat_memory + env: + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_recall_chat_memory + echo "TEST_CHAT_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model uses 'archival_memory_search' to find secret + id: test_archival_memory + env: + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_archival_memory_retrieval + echo "TEST_ARCHIVAL_MEMORY_EXIT_CODE=$?" >> $GITHUB_ENV + continue-on-error: true + + - name: Test model can edit core memories + id: test_core_memory + env: + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + run: | + poetry run pytest -s -vv tests/test_model_letta_perfomance.py::test_together_llama_3_70b_edit_core_memory + echo "TEST_CORE_MEMORY_EXIT_CODE=$?" 
>> $GITHUB_ENV + continue-on-error: true + + - name: Summarize test results + if: always() + run: | + echo "Test Results Summary:" + + # If the exit code is empty, treat it as a failure (❌) + echo "Test first message: $([[ -z $TEST_FIRST_MESSAGE_EXIT_CODE || $TEST_FIRST_MESSAGE_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)" + echo "Test model sends message with keyword: $([[ -z $TEST_KEYWORD_MESSAGE_EXIT_CODE || $TEST_KEYWORD_MESSAGE_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)" + echo "Test model uses external tool: $([[ -z $TEST_EXTERNAL_TOOL_EXIT_CODE || $TEST_EXTERNAL_TOOL_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)" + echo "Test model recalls chat memory: $([[ -z $TEST_CHAT_MEMORY_EXIT_CODE || $TEST_CHAT_MEMORY_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)" + echo "Test model uses 'archival_memory_search' to find secret: $([[ -z $TEST_ARCHIVAL_MEMORY_EXIT_CODE || $TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)" + echo "Test model can edit core memories: $([[ -z $TEST_CORE_MEMORY_EXIT_CODE || $TEST_CORE_MEMORY_EXIT_CODE -ne 0 ]] && echo ❌ || echo ✅)" + + # Check if any test failed (either non-zero or unset exit code) + if [[ -z $TEST_FIRST_MESSAGE_EXIT_CODE || $TEST_FIRST_MESSAGE_EXIT_CODE -ne 0 || \ + -z $TEST_KEYWORD_MESSAGE_EXIT_CODE || $TEST_KEYWORD_MESSAGE_EXIT_CODE -ne 0 || \ + -z $TEST_EXTERNAL_TOOL_EXIT_CODE || $TEST_EXTERNAL_TOOL_EXIT_CODE -ne 0 || \ + -z $TEST_CHAT_MEMORY_EXIT_CODE || $TEST_CHAT_MEMORY_EXIT_CODE -ne 0 || \ + -z $TEST_ARCHIVAL_MEMORY_EXIT_CODE || $TEST_ARCHIVAL_MEMORY_EXIT_CODE -ne 0 || \ + -z $TEST_CORE_MEMORY_EXIT_CODE || $TEST_CORE_MEMORY_EXIT_CODE -ne 0 ]]; then + echo "Some tests failed." 
+ exit 78 + fi + continue-on-error: true diff --git a/letta/constants.py b/letta/constants.py index eddb82c8..0539477a 100644 --- a/letta/constants.py +++ b/letta/constants.py @@ -19,7 +19,7 @@ IN_CONTEXT_MEMORY_KEYWORD = "CORE_MEMORY" TOOL_CALL_ID_MAX_LEN = 29 # minimum context window size -MIN_CONTEXT_WINDOW = 4000 +MIN_CONTEXT_WINDOW = 4096 # embeddings MAX_EMBEDDING_DIM = 4096 # maximum supported embeding size - do NOT change or else DBs will need to be reset diff --git a/letta/llm_api/llm_api_tools.py b/letta/llm_api/llm_api_tools.py index 95f0e5ac..3484f720 100644 --- a/letta/llm_api/llm_api_tools.py +++ b/letta/llm_api/llm_api_tools.py @@ -33,6 +33,7 @@ from letta.schemas.openai.chat_completion_request import ( cast_message_to_subtype, ) from letta.schemas.openai.chat_completion_response import ChatCompletionResponse +from letta.settings import ModelSettings from letta.streaming_interface import ( AgentChunkStreamingInterface, AgentRefreshStreamingInterface, @@ -126,6 +127,7 @@ def create( from letta.settings import model_settings model_settings = model_settings + assert isinstance(model_settings, ModelSettings) printd(f"Using model {llm_config.model_endpoint_type}, endpoint: {llm_config.model_endpoint}") @@ -326,6 +328,33 @@ def create( return response + elif llm_config.model_endpoint_type == "together": + """TogetherAI endpoint that goes via /completions instead of /chat/completions""" + + if stream: + raise NotImplementedError(f"Streaming not yet implemented for TogetherAI (via the /completions endpoint).") + + if model_settings.together_api_key is None and llm_config.model_endpoint == "https://api.together.ai/v1/completions": + raise ValueError(f"TogetherAI key is missing from letta config file") + + return get_chat_completion( + model=llm_config.model, + messages=messages, + functions=functions, + functions_python=functions_python, + function_call=function_call, + context_window=llm_config.context_window, + endpoint=llm_config.model_endpoint, + 
endpoint_type="vllm", # NOTE: use the vLLM path through /completions + wrapper=llm_config.model_wrapper, + user=str(user_id), + # hint + first_message=first_message, + # auth-related + auth_type="bearer_token", # NOTE: Together expects bearer token auth + auth_key=model_settings.together_api_key, + ) + # local model else: if stream: diff --git a/letta/llm_api/openai.py b/letta/llm_api/openai.py index 8277239a..f63a66fe 100644 --- a/letta/llm_api/openai.py +++ b/letta/llm_api/openai.py @@ -536,7 +536,6 @@ def openai_chat_completions_request( tool["function"] = convert_to_structured_output(tool["function"]) response_json = make_post_request(url, headers, data) - return ChatCompletionResponse(**response_json) diff --git a/letta/local_llm/utils.py b/letta/local_llm/utils.py index a5381159..cc3f0bc1 100644 --- a/letta/local_llm/utils.py +++ b/letta/local_llm/utils.py @@ -184,6 +184,7 @@ def num_tokens_from_messages(messages: List[dict], model: str = "gpt-4") -> int: https://community.openai.com/t/how-to-calculate-the-tokens-when-using-function-call/266573/11 """ try: + # Attempt to search for the encoding based on the model string encoding = tiktoken.encoding_for_model(model) except KeyError: # print("Warning: model not found. 
Using cl100k_base encoding.") diff --git a/letta/providers.py b/letta/providers.py index 63bbe475..b28baee7 100644 --- a/letta/providers.py +++ b/letta/providers.py @@ -2,7 +2,7 @@ from typing import List, Optional from pydantic import BaseModel, Field, model_validator -from letta.constants import LLM_MAX_TOKENS +from letta.constants import LLM_MAX_TOKENS, MIN_CONTEXT_WINDOW from letta.llm_api.azure_openai import ( get_azure_chat_completions_endpoint, get_azure_embeddings_endpoint, @@ -67,10 +67,15 @@ class OpenAIProvider(Provider): extra_params = {"supported_parameters": "tools"} if "openrouter.ai" in self.base_url else None response = openai_get_model_list(self.base_url, api_key=self.api_key, extra_params=extra_params) - assert "data" in response, f"OpenAI model query response missing 'data' field: {response}" + # TogetherAI's response is missing the 'data' field + # assert "data" in response, f"OpenAI model query response missing 'data' field: {response}" + if "data" in response: + data = response["data"] + else: + data = response configs = [] - for model in response["data"]: + for model in data: assert "id" in model, f"OpenAI model missing 'id' field: {model}" model_name = model["id"] @@ -82,6 +87,32 @@ class OpenAIProvider(Provider): if not context_window_size: continue + + # TogetherAI includes the type, which we can use to filter out embedding models + if self.base_url == "https://api.together.ai/v1": + if "type" in model and model["type"] != "chat": + continue + + # for TogetherAI, we need to skip the models that don't support JSON mode / function calling + # requests.exceptions.HTTPError: HTTP error occurred: 400 Client Error: Bad Request for url: https://api.together.ai/v1/chat/completions | Status code: 400, Message: { + # "error": { + # "message": "mistralai/Mixtral-8x7B-v0.1 is not supported for JSON mode/function calling", + # "type": "invalid_request_error", + # "param": null, + # "code": "constraints_model" + # } + # } + if "config" not in model: + 
continue + if "chat_template" not in model["config"]: + continue + if model["config"]["chat_template"] is None: + continue + if "tools" not in model["config"]["chat_template"]: + continue + # if "config" in data and "chat_template" in data["config"] and "tools" not in data["config"]["chat_template"]: + # continue + configs.append( LLMConfig(model=model_name, model_endpoint_type="openai", model_endpoint=self.base_url, context_window=context_window_size) ) @@ -325,6 +356,113 @@ class GroqProvider(OpenAIProvider): raise NotImplementedError +class TogetherProvider(OpenAIProvider): + """TogetherAI provider that uses the /completions API + + TogetherAI can also be used via the /chat/completions API + by setting OPENAI_API_KEY and OPENAI_API_BASE to the TogetherAI API key + and API URL, however /completions is preferred because their /chat/completions + function calling support is limited. + """ + + name: str = "together" + base_url: str = "https://api.together.ai/v1" + api_key: str = Field(..., description="API key for the TogetherAI API.") + default_prompt_formatter: str = Field(..., description="Default prompt formatter (aka model wrapper) to use on vLLM /completions API.") + + def list_llm_models(self) -> List[LLMConfig]: + from letta.llm_api.openai import openai_get_model_list + + response = openai_get_model_list(self.base_url, api_key=self.api_key) + + # TogetherAI's response is missing the 'data' field + # assert "data" in response, f"OpenAI model query response missing 'data' field: {response}" + if "data" in response: + data = response["data"] + else: + data = response + + configs = [] + for model in data: + assert "id" in model, f"TogetherAI model missing 'id' field: {model}" + model_name = model["id"] + + if "context_length" in model: + # Prefer the "context_length" field when the model entry provides one (as OpenRouter does) + context_window_size = model["context_length"] + else: + context_window_size = self.get_model_context_window_size(model_name) + + # We need the context length for 
embeddings too + if not context_window_size: + continue + + # Skip models whose context window is at or below MIN_CONTEXT_WINDOW (too small for Letta) + if context_window_size <= MIN_CONTEXT_WINDOW: + continue + + # TogetherAI includes the type, which we can use to keep only chat / language models (e.g. skipping embedding models) + if "type" in model and model["type"] not in ["chat", "language"]: + continue + + configs.append( + LLMConfig( + model=model_name, + model_endpoint_type="together", + model_endpoint=self.base_url, + model_wrapper=self.default_prompt_formatter, + context_window=context_window_size, + ) + ) + + return configs + + def list_embedding_models(self) -> List[EmbeddingConfig]: + # TODO re-enable once we figure out how to pass API keys through properly + return [] + + # from letta.llm_api.openai import openai_get_model_list + + # response = openai_get_model_list(self.base_url, api_key=self.api_key) + + # # TogetherAI's response is missing the 'data' field + # # assert "data" in response, f"OpenAI model query response missing 'data' field: {response}" + # if "data" in response: + # data = response["data"] + # else: + # data = response + + # configs = [] + # for model in data: + # assert "id" in model, f"TogetherAI model missing 'id' field: {model}" + # model_name = model["id"] + + # if "context_length" in model: + # # Context length is returned in OpenRouter as "context_length" + # context_window_size = model["context_length"] + # else: + # context_window_size = self.get_model_context_window_size(model_name) + + # if not context_window_size: + # continue + + # # TogetherAI includes the type, which we can use to keep only embedding models + # if "type" in model and model["type"] not in ["embedding"]: + # continue + + # configs.append( + # EmbeddingConfig( + # embedding_model=model_name, + # embedding_endpoint_type="openai", + # embedding_endpoint=self.base_url, + # embedding_dim=context_window_size, + # embedding_chunk_size=300, # TODO: change? 
+ # ) + # ) + + # return configs + + class GoogleAIProvider(Provider): # gemini api_key: str = Field(..., description="API key for the Google AI API.") diff --git a/letta/schemas/llm_config.py b/letta/schemas/llm_config.py index 6695cecf..ed63e766 100644 --- a/letta/schemas/llm_config.py +++ b/letta/schemas/llm_config.py @@ -35,6 +35,7 @@ class LLMConfig(BaseModel): "vllm", "hugging-face", "mistral", + "together", # completions endpoint ] = Field(..., description="The endpoint type for the model.") model_endpoint: Optional[str] = Field(None, description="The endpoint for the model.") model_wrapper: Optional[str] = Field(None, description="The wrapper for the model.") diff --git a/letta/schemas/openai/chat_completion_response.py b/letta/schemas/openai/chat_completion_response.py index ea37ec45..07a11703 100644 --- a/letta/schemas/openai/chat_completion_response.py +++ b/letta/schemas/openai/chat_completion_response.py @@ -46,6 +46,7 @@ class Choice(BaseModel): index: int message: Message logprobs: Optional[Dict[str, Union[List[MessageContentLogProb], None]]] = None + seed: Optional[int] = None # found in TogetherAI class UsageStatistics(BaseModel): diff --git a/letta/server/server.py b/letta/server/server.py index a23d0795..47df6c2e 100644 --- a/letta/server/server.py +++ b/letta/server/server.py @@ -49,6 +49,7 @@ from letta.providers import ( OllamaProvider, OpenAIProvider, Provider, + TogetherProvider, VLLMChatCompletionsProvider, VLLMCompletionsProvider, ) @@ -303,7 +304,18 @@ class SyncServer(Server): ) ) if model_settings.groq_api_key: - self._enabled_providers.append(GroqProvider(api_key=model_settings.groq_api_key)) + self._enabled_providers.append( + GroqProvider( + api_key=model_settings.groq_api_key, + ) + ) + if model_settings.together_api_key: + self._enabled_providers.append( + TogetherProvider( + api_key=model_settings.together_api_key, + default_prompt_formatter=model_settings.default_prompt_formatter, + ) + ) if model_settings.vllm_api_base: # vLLM 
exposes both a /chat/completions and a /completions endpoint self._enabled_providers.append( diff --git a/letta/settings.py b/letta/settings.py index c5e7ee3b..2f7e82f9 100644 --- a/letta/settings.py +++ b/letta/settings.py @@ -43,6 +43,9 @@ class ModelSettings(BaseSettings): # google ai gemini_api_key: Optional[str] = None + # together + together_api_key: Optional[str] = None + # vLLM vllm_api_base: Optional[str] = None diff --git a/tests/configs/llm_model_configs/together-llama-3-1-405b.json b/tests/configs/llm_model_configs/together-llama-3-1-405b.json new file mode 100644 index 00000000..0d3c4b16 --- /dev/null +++ b/tests/configs/llm_model_configs/together-llama-3-1-405b.json @@ -0,0 +1,7 @@ +{ + "context_window": 16000, + "model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", + "model_endpoint_type": "together", + "model_endpoint": "https://api.together.ai/v1", + "model_wrapper": "chatml" +} diff --git a/tests/configs/llm_model_configs/together-llama-3-70b.json b/tests/configs/llm_model_configs/together-llama-3-70b.json new file mode 100644 index 00000000..9cd9738e --- /dev/null +++ b/tests/configs/llm_model_configs/together-llama-3-70b.json @@ -0,0 +1,7 @@ +{ + "context_window": 8192, + "model": "meta-llama/Meta-Llama-3-70B-Instruct-Turbo", + "model_endpoint_type": "together", + "model_endpoint": "https://api.together.ai/v1", + "model_wrapper": "chatml" +} diff --git a/tests/test_model_letta_perfomance.py b/tests/test_model_letta_perfomance.py index c5778772..e473d5bb 100644 --- a/tests/test_model_letta_perfomance.py +++ b/tests/test_model_letta_perfomance.py @@ -340,3 +340,49 @@ def test_gemini_pro_15_edit_core_memory(): response = check_agent_edit_core_memory(filename) # Log out successful response print(f"Got successful response from client: \n\n{response}") + + +# ====================================================================================================================== +# TOGETHER TESTS +# 
====================================================================================================================== +def test_together_llama_3_70b_returns_valid_first_message(): + filename = os.path.join(llm_config_dir, "together-llama-3-70b.json") + response = check_first_response_is_valid_for_llm_endpoint(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_together_llama_3_70b_returns_keyword(): + keyword = "banana" + filename = os.path.join(llm_config_dir, "together-llama-3-70b.json") + response = check_response_contains_keyword(filename, keyword=keyword) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_together_llama_3_70b_uses_external_tool(): + filename = os.path.join(llm_config_dir, "together-llama-3-70b.json") + response = check_agent_uses_external_tool(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_together_llama_3_70b_recall_chat_memory(): + filename = os.path.join(llm_config_dir, "together-llama-3-70b.json") + response = check_agent_recall_chat_memory(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_together_llama_3_70b_archival_memory_retrieval(): + filename = os.path.join(llm_config_dir, "together-llama-3-70b.json") + response = check_agent_archival_memory_retrieval(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") + + +def test_together_llama_3_70b_edit_core_memory(): + filename = os.path.join(llm_config_dir, "together-llama-3-70b.json") + response = check_agent_edit_core_memory(filename) + # Log out successful response + print(f"Got successful response from client: \n\n{response}") diff --git a/tests/test_providers.py b/tests/test_providers.py index 21f0c9ff..228e3352 100644 --- a/tests/test_providers.py +++ 
b/tests/test_providers.py @@ -8,6 +8,7 @@ from letta.providers import ( MistralProvider, OllamaProvider, OpenAIProvider, + TogetherProvider, ) from letta.settings import model_settings @@ -70,6 +71,15 @@ def test_mistral(): print([m.model for m in models]) +def test_together(): + provider = TogetherProvider(api_key=os.getenv("TOGETHER_API_KEY"), default_prompt_formatter="chatml") + models = provider.list_llm_models() + print([m.model for m in models]) + + embedding_models = provider.list_embedding_models() + print([m.embedding_model for m in embedding_models]) + + # def test_vllm(): # provider = VLLMProvider(base_url=os.getenv("VLLM_API_BASE")) # models = provider.list_llm_models()