feat: add new llm client framework and migrate google apis (#1209)
This commit is contained in:
@@ -29,6 +29,7 @@ from letta.helpers.json_helpers import json_dumps, json_loads
|
||||
from letta.interface import AgentInterface
|
||||
from letta.llm_api.helpers import calculate_summarizer_cutoff, get_token_counts_for_messages, is_context_overflow_error
|
||||
from letta.llm_api.llm_api_tools import create
|
||||
from letta.llm_api.llm_client import LLMClient
|
||||
from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
|
||||
from letta.log import get_logger
|
||||
from letta.memory import summarize_messages
|
||||
@@ -356,19 +357,38 @@ class Agent(BaseAgent):
|
||||
for attempt in range(1, empty_response_retry_limit + 1):
|
||||
try:
|
||||
log_telemetry(self.logger, "_get_ai_reply create start")
|
||||
response = create(
|
||||
# New LLM client flow
|
||||
llm_client = LLMClient.create(
|
||||
agent_id=self.agent_state.id,
|
||||
llm_config=self.agent_state.llm_config,
|
||||
messages=message_sequence,
|
||||
user_id=self.agent_state.created_by_id,
|
||||
functions=allowed_functions,
|
||||
# functions_python=self.functions_python, do we need this?
|
||||
function_call=function_call,
|
||||
first_message=first_message,
|
||||
force_tool_call=force_tool_call,
|
||||
stream=stream,
|
||||
stream_interface=self.interface,
|
||||
put_inner_thoughts_first=put_inner_thoughts_first,
|
||||
actor_id=self.agent_state.created_by_id,
|
||||
)
|
||||
|
||||
if llm_client and not stream:
|
||||
response = llm_client.send_llm_request(
|
||||
messages=message_sequence,
|
||||
tools=allowed_functions,
|
||||
tool_call=function_call,
|
||||
stream=stream,
|
||||
first_message=first_message,
|
||||
force_tool_call=force_tool_call,
|
||||
)
|
||||
else:
|
||||
# Fallback to existing flow
|
||||
response = create(
|
||||
llm_config=self.agent_state.llm_config,
|
||||
messages=message_sequence,
|
||||
user_id=self.agent_state.created_by_id,
|
||||
functions=allowed_functions,
|
||||
# functions_python=self.functions_python, do we need this?
|
||||
function_call=function_call,
|
||||
first_message=first_message,
|
||||
force_tool_call=force_tool_call,
|
||||
stream=stream,
|
||||
stream_interface=self.interface,
|
||||
put_inner_thoughts_first=put_inner_thoughts_first,
|
||||
)
|
||||
log_telemetry(self.logger, "_get_ai_reply create finish")
|
||||
|
||||
# These bottom two are retryable
|
||||
@@ -632,7 +652,7 @@ class Agent(BaseAgent):
|
||||
function_args,
|
||||
function_response,
|
||||
messages,
|
||||
[tool_return] if tool_return else None,
|
||||
[tool_return],
|
||||
include_function_failed_message=True,
|
||||
)
|
||||
return messages, False, True # force a heartbeat to allow agent to handle error
|
||||
@@ -659,7 +679,7 @@ class Agent(BaseAgent):
|
||||
"content": function_response,
|
||||
"tool_call_id": tool_call_id,
|
||||
},
|
||||
tool_returns=[tool_return] if tool_return else None,
|
||||
tool_returns=[tool_return] if sandbox_run_result else None,
|
||||
)
|
||||
) # extend conversation with function response
|
||||
self.interface.function_message(f"Ran {function_name}({function_args})", msg_obj=messages[-1])
|
||||
|
||||
332
letta/llm_api/google_ai_client.py
Normal file
332
letta/llm_api/google_ai_client.py
Normal file
@@ -0,0 +1,332 @@
|
||||
import uuid
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from letta.constants import NON_USER_MSG_PREFIX
|
||||
from letta.helpers.datetime_helpers import get_utc_time
|
||||
from letta.helpers.json_helpers import json_dumps
|
||||
from letta.llm_api.helpers import make_post_request
|
||||
from letta.llm_api.llm_client_base import LLMClientBase
|
||||
from letta.local_llm.json_parser import clean_json_string_extra_backslash
|
||||
from letta.local_llm.utils import count_tokens
|
||||
from letta.schemas.message import Message as PydanticMessage
|
||||
from letta.schemas.openai.chat_completion_request import Tool
|
||||
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall, Message, ToolCall, UsageStatistics
|
||||
from letta.settings import model_settings
|
||||
from letta.utils import get_tool_call_id
|
||||
|
||||
|
||||
class GoogleAIClient(LLMClientBase):
    """LLM client for the Google AI (Gemini) REST API.

    Builds Gemini ``generateContent`` requests from Letta's OpenAI-style
    messages/tools, issues them via plain HTTP POST, and converts the raw
    Gemini response back into an OpenAI-style ChatCompletionResponse.
    """

    def request(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        url, headers = self.get_gemini_endpoint_and_headers(generate_content=True)
        return make_post_request(url, headers, request_data)

    def build_request_data(
        self,
        messages: List[PydanticMessage],
        tools: List[dict],
        tool_call: Optional[str],
    ) -> dict:
        """
        Constructs a request object in the expected data format for this client.

        Args:
            messages: Conversation history to send.
            tools: Raw function schemas (OpenAI "function" dicts), or falsy for none.
            tool_call: Unused by this client (Gemini has no direct equivalent of
                OpenAI's ``function_call`` selector in this request path).

        Returns:
            Dict with Gemini-format ``contents``, ``tools`` and ``generation_config``.
        """
        if tools:
            # Wrap each raw function schema in an OpenAI-style tool envelope,
            # then convert the whole list to Gemini's functionDeclarations format.
            tools = [{"type": "function", "function": f} for f in tools]
            tools = self.convert_tools_to_google_ai_format(
                [Tool(**t) for t in tools],
            )
        contents = self.add_dummy_model_messages(
            [m.to_google_ai_dict() for m in messages],
        )

        return {
            "contents": contents,
            "tools": tools,
            "generation_config": {
                "temperature": self.llm_config.temperature,
                "max_output_tokens": self.llm_config.max_tokens,
            },
        }

    def convert_response_to_chat_completion(
        self,
        response_data: dict,
        input_messages: List[PydanticMessage],
    ) -> ChatCompletionResponse:
        """
        Converts custom response format from llm client into an OpenAI
        ChatCompletionsResponse object.

        Example Input:
            {
              "candidates": [
                {
                  "content": {
                    "parts": [
                      {
                        "text": " OK. Barbie is showing in two theaters in Mountain View, CA: AMC Mountain View 16 and Regal Edwards 14."
                      }
                    ]
                  }
                }
              ],
              "usageMetadata": {
                "promptTokenCount": 9,
                "candidatesTokenCount": 27,
                "totalTokenCount": 36
              }
            }
        """
        choices = []
        index = 0
        for candidate in response_data["candidates"]:
            content = candidate["content"]

            role = content["role"]
            assert role == "model", f"Unknown role in response: {role}"

            parts = content["parts"]
            # TODO support parts / multimodal
            # TODO support parallel tool calling natively
            # TODO Alternative here is to throw away everything else except for the first part
            # NOTE(review): if "parts" is empty, openai_response_message is never
            # bound and the finish-reason handling below raises NameError — confirm
            # whether Gemini can return an empty parts list.
            for response_message in parts:
                # Convert the actual message style to OpenAI style
                if "functionCall" in response_message and response_message["functionCall"] is not None:
                    function_call = response_message["functionCall"]
                    assert isinstance(function_call, dict), function_call
                    function_name = function_call["name"]
                    assert isinstance(function_name, str), function_name
                    function_args = function_call["args"]
                    assert isinstance(function_args, dict), function_args

                    # NOTE: this also involves stripping the inner monologue out of the function
                    if self.llm_config.put_inner_thoughts_in_kwargs:
                        from letta.local_llm.constants import INNER_THOUGHTS_KWARG

                        assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}"
                        inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG)
                        assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
                    else:
                        inner_thoughts = None

                    # Google AI API doesn't generate tool call IDs
                    openai_response_message = Message(
                        role="assistant",  # NOTE: "model" -> "assistant"
                        content=inner_thoughts,
                        tool_calls=[
                            ToolCall(
                                id=get_tool_call_id(),
                                type="function",
                                function=FunctionCall(
                                    name=function_name,
                                    arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
                                ),
                            )
                        ],
                    )

                else:
                    # Inner thoughts are the content by default
                    inner_thoughts = response_message["text"]

                    # Google AI API doesn't generate tool call IDs
                    openai_response_message = Message(
                        role="assistant",  # NOTE: "model" -> "assistant"
                        content=inner_thoughts,
                    )

            # Google AI API uses different finish reason strings than OpenAI
            # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
            # see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api
            # Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER
            # see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason
            finish_reason = candidate["finishReason"]
            if finish_reason == "STOP":
                openai_finish_reason = (
                    "function_call"
                    if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0
                    else "stop"
                )
            elif finish_reason == "MAX_TOKENS":
                openai_finish_reason = "length"
            elif finish_reason in ("SAFETY", "RECITATION"):
                # Both safety blocks and recitation blocks map to OpenAI's content filter
                openai_finish_reason = "content_filter"
            else:
                raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}")

            choices.append(
                Choice(
                    finish_reason=openai_finish_reason,
                    index=index,
                    message=openai_response_message,
                )
            )
            index += 1

        # if len(choices) > 1:
        #     raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})")

        # NOTE: some of the Google AI APIs show UsageMetadata in the response, but it seems to not exist?
        # "usageMetadata": {
        #     "promptTokenCount": 9,
        #     "candidatesTokenCount": 27,
        #     "totalTokenCount": 36
        # }
        if "usageMetadata" in response_data:
            usage = UsageStatistics(
                prompt_tokens=response_data["usageMetadata"]["promptTokenCount"],
                completion_tokens=response_data["usageMetadata"]["candidatesTokenCount"],
                total_tokens=response_data["usageMetadata"]["totalTokenCount"],
            )
        else:
            # Count it ourselves (very rough approximation based on JSON-serialized text)
            assert input_messages is not None, "Didn't get UsageMetadata from the API response, so input_messages is required"
            prompt_tokens = count_tokens(json_dumps(input_messages))  # NOTE: this is a very rough approximation
            completion_tokens = count_tokens(json_dumps(openai_response_message.model_dump()))  # NOTE: this is also approximate
            total_tokens = prompt_tokens + completion_tokens
            usage = UsageStatistics(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
            )

        response_id = str(uuid.uuid4())
        return ChatCompletionResponse(
            id=response_id,
            choices=choices,
            model=self.llm_config.model,  # NOTE: Google API doesn't pass back model in the response
            created=get_utc_time(),
            usage=usage,
        )

    def get_gemini_endpoint_and_headers(
        self,
        key_in_header: bool = True,
        generate_content: bool = False,
    ) -> Tuple[str, dict]:
        """
        Dynamically generate the model endpoint and headers.

        Args:
            key_in_header: If True, pass the API key via the ``x-goog-api-key``
                header; otherwise append it as a ``?key=`` query parameter.
            generate_content: If True, target the ``:generateContent`` action.

        Returns:
            Tuple of (url, headers).
        """
        # Base endpoint plus the model path segment
        url = f"{self.llm_config.model_endpoint}/v1beta/models/{self.llm_config.model}"

        # Add extension for generating content if we're hitting the LM
        if generate_content:
            url += ":generateContent"

        # Decide if api key should be in header or not
        # Two ways to pass the key: https://ai.google.dev/tutorials/setup
        if key_in_header:
            headers = {"Content-Type": "application/json", "x-goog-api-key": model_settings.gemini_api_key}
        else:
            url += f"?key={model_settings.gemini_api_key}"
            headers = {"Content-Type": "application/json"}

        return url, headers

    def convert_tools_to_google_ai_format(self, tools: List[Tool]) -> List[dict]:
        """
        Convert OpenAI-style tool definitions to the Google AI format.

        OpenAI style:
          "tools": [{
            "type": "function",
            "function": {
                "name": "find_movies",
                "description": "find ....",
                "parameters": {
                  "type": "object",
                  "properties": {
                     PARAM: {
                       "type": PARAM_TYPE,  # eg "string"
                       "description": PARAM_DESCRIPTION,
                     },
                     ...
                  },
                  "required": List[str],
                }
            }
          }]

        Google AI style:
          "tools": [{
            "functionDeclarations": [{
              "name": "find_movies",
              "description": "find movie titles currently playing in theaters based on any description, genre, title words, etc.",
              "parameters": {
                "type": "OBJECT",
                "properties": {
                  "location": {
                    "type": "STRING",
                    "description": "The city and state, e.g. San Francisco, CA or a zip code e.g. 95616"
                  },
                  "description": {
                    "type": "STRING",
                    "description": "Any kind of description including category or genre, title words, attributes, etc."
                  }
                },
                "required": ["description"]
              }
            }, {
              "name": "find_theaters",
              ...
        """
        function_list = [
            dict(
                name=t.function.name,
                description=t.function.description,
                parameters=t.function.parameters,  # TODO need to unpack
            )
            for t in tools
        ]

        # Correct casing + add inner thoughts if needed
        for func in function_list:
            func["parameters"]["type"] = "OBJECT"
            for param_name, param_fields in func["parameters"]["properties"].items():
                # Gemini expects upper-cased type names (STRING, OBJECT, ...)
                param_fields["type"] = param_fields["type"].upper()
            # Add inner thoughts
            if self.llm_config.put_inner_thoughts_in_kwargs:
                from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION

                func["parameters"]["properties"][INNER_THOUGHTS_KWARG] = {
                    "type": "STRING",
                    "description": INNER_THOUGHTS_KWARG_DESCRIPTION,
                }
                # Schemas without a "required" list previously raised KeyError here
                func["parameters"].setdefault("required", []).append(INNER_THOUGHTS_KWARG)

        return [{"functionDeclarations": function_list}]

    def add_dummy_model_messages(self, messages: List[dict]) -> List[dict]:
        """Google AI API requires all function call returns are immediately followed by a 'model' role message.

        In Letta, the 'model' will often call a function (e.g. send_message) that itself yields to the user,
        so there is no natural follow-up 'model' role message.

        To satisfy the Google AI API restrictions, we can add a dummy 'yield' message
        with role == 'model' that is placed in between a function output
        (role == 'tool') and user message (role == 'user').
        """
        dummy_yield_message = {
            "role": "model",
            "parts": [{"text": f"{NON_USER_MSG_PREFIX}Function call returned, waiting for user response."}],
        }
        messages_with_padding = []
        for i, message in enumerate(messages):
            messages_with_padding.append(message)
            # Check if the current message role is 'tool' and the next message role is 'user'
            if message["role"] in ["tool", "function"] and (i + 1 < len(messages) and messages[i + 1]["role"] == "user"):
                messages_with_padding.append(dummy_yield_message)

        return messages_with_padding
|
||||
214
letta/llm_api/google_vertex_client.py
Normal file
214
letta/llm_api/google_vertex_client.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from google import genai
|
||||
from google.genai.types import FunctionCallingConfig, FunctionCallingConfigMode, GenerateContentResponse, ToolConfig
|
||||
|
||||
from letta.helpers.datetime_helpers import get_utc_time
|
||||
from letta.helpers.json_helpers import json_dumps
|
||||
from letta.llm_api.google_ai_client import GoogleAIClient
|
||||
from letta.local_llm.json_parser import clean_json_string_extra_backslash
|
||||
from letta.local_llm.utils import count_tokens
|
||||
from letta.schemas.message import Message as PydanticMessage
|
||||
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall, Message, ToolCall, UsageStatistics
|
||||
from letta.settings import model_settings
|
||||
from letta.utils import get_tool_call_id
|
||||
|
||||
|
||||
class GoogleVertexClient(GoogleAIClient):
    """LLM client for Gemini models served through Google Vertex AI.

    Reuses GoogleAIClient's request construction, but issues requests through
    the ``google-genai`` SDK (typed objects) instead of raw REST calls, and
    parses the SDK's GenerateContentResponse accordingly.
    """

    def request(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        # NOTE(review): a new client is constructed per request; consider caching
        # it on the instance if construction proves expensive.
        client = genai.Client(
            vertexai=True,
            project=model_settings.google_cloud_project,
            location=model_settings.google_cloud_location,
            http_options={"api_version": "v1"},
        )
        response = client.models.generate_content(
            model=self.llm_config.model,
            contents=request_data["contents"],
            config=request_data["config"],
        )
        return response.model_dump()

    def build_request_data(
        self,
        messages: List[PydanticMessage],
        tools: List[dict],
        tool_call: Optional[str],
    ) -> dict:
        """
        Constructs a request object in the expected data format for this client.

        Starts from the parent's REST-shaped request, then reshapes it for the
        google-genai SDK, which nests generation settings and tools under "config".
        """
        request_data = super().build_request_data(messages, tools, tool_call)
        request_data["config"] = request_data.pop("generation_config")
        request_data["config"]["tools"] = request_data.pop("tools")

        tool_config = ToolConfig(
            function_calling_config=FunctionCallingConfig(
                # ANY mode forces the model to predict only function calls
                mode=FunctionCallingConfigMode.ANY,
            )
        )
        request_data["config"]["tool_config"] = tool_config.model_dump()

        return request_data

    def convert_response_to_chat_completion(
        self,
        response_data: dict,
        input_messages: List[PydanticMessage],
    ) -> ChatCompletionResponse:
        """
        Converts custom response format from llm client into an OpenAI
        ChatCompletionsResponse object.

        Example:
            {
              "candidates": [
                {
                  "content": {
                    "parts": [
                      {
                        "text": " OK. Barbie is showing in two theaters in Mountain View, CA: AMC Mountain View 16 and Regal Edwards 14."
                      }
                    ]
                  }
                }
              ],
              "usageMetadata": {
                "promptTokenCount": 9,
                "candidatesTokenCount": 27,
                "totalTokenCount": 36
              }
            }
        """
        # Re-hydrate the dumped dict into the SDK's typed response object
        response = GenerateContentResponse(**response_data)
        choices = []
        index = 0
        for candidate in response.candidates:
            content = candidate.content

            role = content.role
            assert role == "model", f"Unknown role in response: {role}"

            parts = content.parts
            # TODO support parts / multimodal
            # TODO support parallel tool calling natively
            # TODO Alternative here is to throw away everything else except for the first part
            # NOTE(review): if parts is empty, openai_response_message is never
            # bound and the finish-reason handling below raises NameError.
            for response_message in parts:
                # Convert the actual message style to OpenAI style
                if response_message.function_call:
                    function_call = response_message.function_call
                    function_name = function_call.name
                    function_args = function_call.args
                    assert isinstance(function_args, dict), function_args

                    # NOTE: this also involves stripping the inner monologue out of the function
                    if self.llm_config.put_inner_thoughts_in_kwargs:
                        from letta.local_llm.constants import INNER_THOUGHTS_KWARG

                        assert INNER_THOUGHTS_KWARG in function_args, f"Couldn't find inner thoughts in function args:\n{function_call}"
                        inner_thoughts = function_args.pop(INNER_THOUGHTS_KWARG)
                        assert inner_thoughts is not None, f"Expected non-null inner thoughts function arg:\n{function_call}"
                    else:
                        inner_thoughts = None

                    # Google AI API doesn't generate tool call IDs
                    openai_response_message = Message(
                        role="assistant",  # NOTE: "model" -> "assistant"
                        content=inner_thoughts,
                        tool_calls=[
                            ToolCall(
                                id=get_tool_call_id(),
                                type="function",
                                function=FunctionCall(
                                    name=function_name,
                                    arguments=clean_json_string_extra_backslash(json_dumps(function_args)),
                                ),
                            )
                        ],
                    )

                else:
                    # Inner thoughts are the content by default
                    inner_thoughts = response_message.text

                    # Google AI API doesn't generate tool call IDs
                    openai_response_message = Message(
                        role="assistant",  # NOTE: "model" -> "assistant"
                        content=inner_thoughts,
                    )

            # Google AI API uses different finish reason strings than OpenAI
            # OpenAI: 'stop', 'length', 'function_call', 'content_filter', null
            # see: https://platform.openai.com/docs/guides/text-generation/chat-completions-api
            # Google AI API: FINISH_REASON_UNSPECIFIED, STOP, MAX_TOKENS, SAFETY, RECITATION, OTHER
            # see: https://ai.google.dev/api/python/google/ai/generativelanguage/Candidate/FinishReason
            # NOTE(review): assumes candidate.finish_reason is always populated — confirm.
            finish_reason = candidate.finish_reason.value
            if finish_reason == "STOP":
                openai_finish_reason = (
                    "function_call"
                    if openai_response_message.tool_calls is not None and len(openai_response_message.tool_calls) > 0
                    else "stop"
                )
            elif finish_reason == "MAX_TOKENS":
                openai_finish_reason = "length"
            elif finish_reason in ("SAFETY", "RECITATION"):
                # Both safety blocks and recitation blocks map to OpenAI's content filter
                openai_finish_reason = "content_filter"
            else:
                raise ValueError(f"Unrecognized finish reason in Google AI response: {finish_reason}")

            choices.append(
                Choice(
                    finish_reason=openai_finish_reason,
                    index=index,
                    message=openai_response_message,
                )
            )
            index += 1

        # if len(choices) > 1:
        #     raise UserWarning(f"Unexpected number of candidates in response (expected 1, got {len(choices)})")

        # NOTE: some of the Google AI APIs show UsageMetadata in the response, but it seems to not exist?
        # "usageMetadata": {
        #     "promptTokenCount": 9,
        #     "candidatesTokenCount": 27,
        #     "totalTokenCount": 36
        # }
        if response.usage_metadata:
            usage = UsageStatistics(
                prompt_tokens=response.usage_metadata.prompt_token_count,
                completion_tokens=response.usage_metadata.candidates_token_count,
                total_tokens=response.usage_metadata.total_token_count,
            )
        else:
            # Count it ourselves (very rough approximation based on JSON-serialized text)
            assert input_messages is not None, "Didn't get UsageMetadata from the API response, so input_messages is required"
            prompt_tokens = count_tokens(json_dumps(input_messages))  # NOTE: this is a very rough approximation
            completion_tokens = count_tokens(json_dumps(openai_response_message.model_dump()))  # NOTE: this is also approximate
            total_tokens = prompt_tokens + completion_tokens
            usage = UsageStatistics(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
            )

        response_id = str(uuid.uuid4())
        return ChatCompletionResponse(
            id=response_id,
            choices=choices,
            model=self.llm_config.model,  # NOTE: Google API doesn't pass back model in the response
            created=get_utc_time(),
            usage=usage,
        )
|
||||
48
letta/llm_api/llm_client.py
Normal file
48
letta/llm_api/llm_client.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from typing import Optional
|
||||
|
||||
from letta.llm_api.llm_client_base import LLMClientBase
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
|
||||
|
||||
class LLMClient:
|
||||
"""Factory class for creating LLM clients based on the model endpoint type."""
|
||||
|
||||
@staticmethod
|
||||
def create(
|
||||
agent_id: str,
|
||||
llm_config: LLMConfig,
|
||||
put_inner_thoughts_first: bool = True,
|
||||
actor_id: Optional[str] = None,
|
||||
) -> Optional[LLMClientBase]:
|
||||
"""
|
||||
Create an LLM client based on the model endpoint type.
|
||||
|
||||
Args:
|
||||
agent_id: Unique identifier for the agent
|
||||
llm_config: Configuration for the LLM model
|
||||
put_inner_thoughts_first: Whether to put inner thoughts first in the response
|
||||
use_structured_output: Whether to use structured output
|
||||
use_tool_naming: Whether to use tool naming
|
||||
actor_id: Optional actor identifier
|
||||
|
||||
Returns:
|
||||
An instance of LLMClientBase subclass
|
||||
|
||||
Raises:
|
||||
ValueError: If the model endpoint type is not supported
|
||||
"""
|
||||
match llm_config.model_endpoint_type:
|
||||
case "google_ai":
|
||||
from letta.llm_api.google_ai_client import GoogleAIClient
|
||||
|
||||
return GoogleAIClient(
|
||||
agent_id=agent_id, llm_config=llm_config, put_inner_thoughts_first=put_inner_thoughts_first, actor_id=actor_id
|
||||
)
|
||||
case "google_vertex":
|
||||
from letta.llm_api.google_vertex_client import GoogleVertexClient
|
||||
|
||||
return GoogleVertexClient(
|
||||
agent_id=agent_id, llm_config=llm_config, put_inner_thoughts_first=put_inner_thoughts_first, actor_id=actor_id
|
||||
)
|
||||
case _:
|
||||
return None
|
||||
129
letta/llm_api/llm_client_base.py
Normal file
129
letta/llm_api/llm_client_base.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from abc import abstractmethod
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from openai import AsyncStream, Stream
|
||||
from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
|
||||
|
||||
from letta.schemas.llm_config import LLMConfig
|
||||
from letta.schemas.message import Message
|
||||
from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
|
||||
from letta.tracing import log_event
|
||||
|
||||
|
||||
class LLMClientBase:
    """
    Abstract base class for LLM clients, formatting the request objects,
    handling the downstream request and parsing into chat completions response format
    """

    def __init__(
        self,
        agent_id: str,
        llm_config: LLMConfig,
        put_inner_thoughts_first: Optional[bool] = True,
        use_structured_output: Optional[bool] = True,
        use_tool_naming: bool = True,
        actor_id: Optional[str] = None,
    ):
        """
        Args:
            agent_id: Unique identifier for the agent this client serves.
            llm_config: Model/provider configuration used to build requests.
            put_inner_thoughts_first: Whether inner thoughts go first in the response.
            use_structured_output: Whether to request structured output (stored for subclasses).
            use_tool_naming: Whether to use tool naming (stored for subclasses).
            actor_id: Optional actor identifier.
        """
        self.agent_id = agent_id
        self.llm_config = llm_config
        self.put_inner_thoughts_first = put_inner_thoughts_first
        # Previously accepted but silently discarded; keep them so subclasses can use them.
        self.use_structured_output = use_structured_output
        self.use_tool_naming = use_tool_naming
        self.actor_id = actor_id

    def send_llm_request(
        self,
        messages: List[Message],
        tools: Optional[List[dict]] = None,  # TODO: change to Tool object
        tool_call: Optional[str] = None,
        stream: bool = False,
        first_message: bool = False,
        force_tool_call: Optional[str] = None,
    ) -> Union[ChatCompletionResponse, Stream[ChatCompletionChunk]]:
        """
        Issues a request to the downstream model endpoint and parses response.
        If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over.
        Otherwise returns a ChatCompletionResponse.

        NOTE: first_message and force_tool_call are accepted for interface
        compatibility but are not used in this base implementation.
        """
        request_data = self.build_request_data(messages, tools, tool_call)
        log_event(name="llm_request_sent", attributes=request_data)
        if stream:
            return self.stream(request_data)
        response_data = self.request(request_data)
        log_event(name="llm_response_received", attributes=response_data)
        return self.convert_response_to_chat_completion(response_data, messages)

    async def send_llm_request_async(
        self,
        messages: List[Message],
        tools: Optional[List[dict]] = None,  # TODO: change to Tool object
        tool_call: Optional[str] = None,
        stream: bool = False,
        first_message: bool = False,
        force_tool_call: Optional[str] = None,
    ) -> Union[ChatCompletionResponse, AsyncStream[ChatCompletionChunk]]:
        """
        Issues a request to the downstream model endpoint.
        If stream=True, returns an AsyncStream[ChatCompletionChunk] that can be async iterated over.
        Otherwise returns a ChatCompletionResponse.

        NOTE: first_message and force_tool_call are accepted for interface
        compatibility but are not used in this base implementation.
        """
        request_data = self.build_request_data(messages, tools, tool_call)
        log_event(name="llm_request_sent", attributes=request_data)
        if stream:
            return await self.stream_async(request_data)
        response_data = await self.request_async(request_data)
        log_event(name="llm_response_received", attributes=response_data)
        return self.convert_response_to_chat_completion(response_data, messages)

    @abstractmethod
    def build_request_data(
        self,
        messages: List[Message],
        tools: List[dict],
        tool_call: Optional[str],
    ) -> dict:
        """
        Constructs a request object in the expected data format for this client.
        """
        raise NotImplementedError

    @abstractmethod
    def request(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        raise NotImplementedError

    @abstractmethod
    async def request_async(self, request_data: dict) -> dict:
        """
        Performs underlying request to llm and returns raw response.
        """
        raise NotImplementedError

    @abstractmethod
    def convert_response_to_chat_completion(
        self,
        response_data: dict,
        input_messages: List[Message],
    ) -> ChatCompletionResponse:
        """
        Converts custom response format from llm client into an OpenAI
        ChatCompletionsResponse object.
        """
        raise NotImplementedError

    @abstractmethod
    def stream(self, request_data: dict) -> Stream[ChatCompletionChunk]:
        """
        Performs underlying streaming request to llm and returns raw response.
        """
        raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}")

    @abstractmethod
    async def stream_async(self, request_data: dict) -> AsyncStream[ChatCompletionChunk]:
        """
        Performs underlying streaming request to llm and returns raw response.
        """
        raise NotImplementedError(f"Streaming is not supported for {self.llm_config.model_endpoint_type}")
|
||||
@@ -17,6 +17,7 @@ from letta.embeddings import embedding_model
|
||||
from letta.errors import InvalidInnerMonologueError, InvalidToolCallError, MissingInnerMonologueError, MissingToolCallError
|
||||
from letta.helpers.json_helpers import json_dumps
|
||||
from letta.llm_api.llm_api_tools import create
|
||||
from letta.llm_api.llm_client import LLMClient
|
||||
from letta.local_llm.constants import INNER_THOUGHTS_KWARG
|
||||
from letta.schemas.agent import AgentState
|
||||
from letta.schemas.embedding_config import EmbeddingConfig
|
||||
@@ -103,12 +104,23 @@ def check_first_response_is_valid_for_llm_endpoint(filename: str, validate_inner
|
||||
messages = client.server.agent_manager.get_in_context_messages(agent_id=full_agent_state.id, actor=client.user)
|
||||
agent = Agent(agent_state=full_agent_state, interface=None, user=client.user)
|
||||
|
||||
response = create(
|
||||
llm_client = LLMClient.create(
|
||||
agent_id=agent_state.id,
|
||||
llm_config=agent_state.llm_config,
|
||||
user_id=str(uuid.UUID(int=1)), # dummy user_id
|
||||
messages=messages,
|
||||
functions=[t.json_schema for t in agent.agent_state.tools],
|
||||
actor_id=str(uuid.UUID(int=1)),
|
||||
)
|
||||
if llm_client:
|
||||
response = llm_client.send_llm_request(
|
||||
messages=messages,
|
||||
tools=[t.json_schema for t in agent.agent_state.tools],
|
||||
)
|
||||
else:
|
||||
response = create(
|
||||
llm_config=agent_state.llm_config,
|
||||
user_id=str(uuid.UUID(int=1)), # dummy user_id
|
||||
messages=messages,
|
||||
functions=[t.json_schema for t in agent.agent_state.tools],
|
||||
)
|
||||
|
||||
# Basic check
|
||||
assert response is not None, response
|
||||
|
||||
Reference in New Issue
Block a user