fix: Disable default OpenAI retry behavior (#856)

This commit is contained in:
Matthew Zhou
2025-01-30 09:37:47 -10:00
committed by GitHub
parent 22c4c499a6
commit b981b617e1
3 changed files with 31 additions and 62 deletions

View File

@@ -1,7 +1,8 @@
import json
import re
import time
from typing import Generator, List, Optional, Tuple, Union
import warnings
from typing import Generator, List, Optional, Union
import anthropic
from anthropic import PermissionDeniedError
@@ -36,7 +37,7 @@ from letta.schemas.openai.chat_completion_response import MessageDelta, ToolCall
from letta.services.provider_manager import ProviderManager
from letta.settings import model_settings
from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
from letta.utils import get_utc_time, smart_urljoin
from letta.utils import get_utc_time
BASE_URL = "https://api.anthropic.com/v1"
@@ -567,30 +568,6 @@ def _prepare_anthropic_request(
return data
def get_anthropic_endpoint_and_headers(
base_url: str,
api_key: str,
version: str = "2023-06-01",
beta: Optional[str] = "tools-2024-04-04",
) -> Tuple[str, dict]:
"""
Dynamically generate the Anthropic endpoint and headers.
"""
url = smart_urljoin(base_url, "messages")
headers = {
"Content-Type": "application/json",
"x-api-key": api_key,
"anthropic-version": version,
}
# Add beta header if specified
if beta:
headers["anthropic-beta"] = beta
return url, headers
def anthropic_chat_completions_request(
data: ChatCompletionRequest,
inner_thoughts_xml_tag: Optional[str] = "thinking",

View File

@@ -29,7 +29,6 @@ from letta.schemas.openai.chat_completion_request import ChatCompletionRequest,
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
from letta.settings import ModelSettings
from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
from letta.utils import run_async_task
LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "cohere", "local", "groq"]
@@ -57,7 +56,9 @@ def retry_with_exponential_backoff(
while True:
try:
return func(*args, **kwargs)
except KeyboardInterrupt:
# Stop retrying if user hits Ctrl-C
raise KeyboardInterrupt("User intentionally stopped thread. Stopping...")
except requests.exceptions.HTTPError as http_err:
if not hasattr(http_err, "response") or not http_err.response:
@@ -162,25 +163,21 @@ def create(
assert isinstance(stream_interface, AgentChunkStreamingInterface) or isinstance(
stream_interface, AgentRefreshStreamingInterface
), type(stream_interface)
response = run_async_task(
openai_chat_completions_process_stream(
url=llm_config.model_endpoint,
api_key=api_key,
chat_completion_request=data,
stream_interface=stream_interface,
)
response = openai_chat_completions_process_stream(
url=llm_config.model_endpoint,
api_key=api_key,
chat_completion_request=data,
stream_interface=stream_interface,
)
else: # Client did not request token streaming (expect a blocking backend response)
data.stream = False
if isinstance(stream_interface, AgentChunkStreamingInterface):
stream_interface.stream_start()
try:
response = run_async_task(
openai_chat_completions_request(
url=llm_config.model_endpoint,
api_key=api_key,
chat_completion_request=data,
)
response = openai_chat_completions_request(
url=llm_config.model_endpoint,
api_key=api_key,
chat_completion_request=data,
)
finally:
if isinstance(stream_interface, AgentChunkStreamingInterface):
@@ -354,12 +351,10 @@ def create(
stream_interface.stream_start()
try:
# groq uses the openai chat completions API, so this component should be reusable
response = run_async_task(
openai_chat_completions_request(
url=llm_config.model_endpoint,
api_key=model_settings.groq_api_key,
chat_completion_request=data,
)
response = openai_chat_completions_request(
url=llm_config.model_endpoint,
api_key=model_settings.groq_api_key,
chat_completion_request=data,
)
finally:
if isinstance(stream_interface, AgentChunkStreamingInterface):

View File

@@ -1,8 +1,8 @@
import warnings
from typing import AsyncGenerator, List, Optional, Union
from typing import Generator, List, Optional, Union
import requests
from openai import AsyncOpenAI
from openai import OpenAI
from letta.llm_api.helpers import add_inner_thoughts_to_functions, convert_to_structured_output, make_post_request
from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST
@@ -158,7 +158,7 @@ def build_openai_chat_completions_request(
return data
async def openai_chat_completions_process_stream(
def openai_chat_completions_process_stream(
url: str,
api_key: str,
chat_completion_request: ChatCompletionRequest,
@@ -231,7 +231,7 @@ async def openai_chat_completions_process_stream(
n_chunks = 0 # approx == n_tokens
chunk_idx = 0
try:
async for chat_completion_chunk in openai_chat_completions_request_stream(
for chat_completion_chunk in openai_chat_completions_request_stream(
url=url, api_key=api_key, chat_completion_request=chat_completion_request
):
assert isinstance(chat_completion_chunk, ChatCompletionChunkResponse), type(chat_completion_chunk)
@@ -382,24 +382,21 @@ async def openai_chat_completions_process_stream(
return chat_completion_response
async def openai_chat_completions_request_stream(
def openai_chat_completions_request_stream(
url: str,
api_key: str,
chat_completion_request: ChatCompletionRequest,
) -> AsyncGenerator[ChatCompletionChunkResponse, None]:
) -> Generator[ChatCompletionChunkResponse, None, None]:
data = prepare_openai_payload(chat_completion_request)
data["stream"] = True
client = AsyncOpenAI(
api_key=api_key,
base_url=url,
)
stream = await client.chat.completions.create(**data)
async for chunk in stream:
client = OpenAI(api_key=api_key, base_url=url, max_retries=0)
stream = client.chat.completions.create(**data)
for chunk in stream:
# TODO: Use the native OpenAI objects here?
yield ChatCompletionChunkResponse(**chunk.model_dump(exclude_none=True))
async def openai_chat_completions_request(
def openai_chat_completions_request(
url: str,
api_key: str,
chat_completion_request: ChatCompletionRequest,
@@ -412,8 +409,8 @@ async def openai_chat_completions_request(
https://platform.openai.com/docs/guides/text-generation?lang=curl
"""
data = prepare_openai_payload(chat_completion_request)
client = AsyncOpenAI(api_key=api_key, base_url=url)
chat_completion = await client.chat.completions.create(**data)
client = OpenAI(api_key=api_key, base_url=url, max_retries=0)
chat_completion = client.chat.completions.create(**data)
return ChatCompletionResponse(**chat_completion.model_dump())