letta-server/memgpt/local_llm/chat_completion_proxy.py

"""Key idea: create drop-in replacement for agent's ChatCompletion call that runs on an OpenLLM backend"""

import os
import requests
import json

from .webui.api import get_webui_completion
from .lmstudio.api import get_lmstudio_completion
from .llamacpp.api import get_llamacpp_completion
from .koboldcpp.api import get_koboldcpp_completion
from .ollama.api import get_ollama_completion
from .llm_chat_completion_wrappers import airoboros, dolphin, zephyr, simple_summary_wrapper
from .constants import DEFAULT_WRAPPER
from .utils import DotDict, get_available_wrappers
from ..prompts.gpt_summarize import SYSTEM as SUMMARIZE_SYSTEM_MESSAGE
from ..errors import LocalLLMConnectionError, LocalLLMError

endpoint = os.getenv("OPENAI_API_BASE")
endpoint_type = os.getenv("BACKEND_TYPE")  # default None == ChatCompletion
DEBUG = False
# DEBUG = True

has_shown_warning = False


def get_chat_completion(
    model,  # no model required (except for Ollama), since the model is fixed to whatever you set in your own backend
    messages,
    functions=None,
    function_call="auto",
    context_window=None,
    # required
    wrapper=None,
    endpoint=None,
    endpoint_type=None,
):
    assert context_window is not None, "Local LLM calls need the context length to be explicitly set"
    assert endpoint is not None, "Local LLM calls need the endpoint (eg http://localendpoint:1234) to be explicitly set"
    assert endpoint_type is not None, "Local LLM calls need the endpoint type (eg webui) to be explicitly set"
    global has_shown_warning
    grammar_name = None

    if function_call != "auto":
        raise ValueError(f"function_call == {function_call} not supported (auto only)")

    available_wrappers = get_available_wrappers()
    if messages[0]["role"] == "system" and messages[0]["content"].strip() == SUMMARIZE_SYSTEM_MESSAGE.strip():
        # Special case for if the call we're making is coming from the summarizer
        llm_wrapper = simple_summary_wrapper.SimpleSummaryWrapper()
    elif wrapper is None:
        # Warn the user that we're using the fallback
        if not has_shown_warning:
            print(
                f"Warning: no wrapper specified for local LLM, using the default wrapper (you can remove this warning by specifying the wrapper with --wrapper)"
            )
            has_shown_warning = True
        if endpoint_type in ["koboldcpp", "llamacpp", "webui"]:
            # make the default to use grammar
            llm_wrapper = DEFAULT_WRAPPER(include_opening_brace_in_prefix=False)
            # grammar_name = "json"
            grammar_name = "json_func_calls_with_inner_thoughts"
        else:
            llm_wrapper = DEFAULT_WRAPPER()
    elif wrapper not in available_wrappers:
        raise ValueError(f"Could not find requested wrapper '{wrapper} in available wrappers list:\n{available_wrappers}")
    else:
        llm_wrapper = available_wrappers[wrapper]
        if "grammar" in wrapper:
            grammar_name = "json_func_calls_with_inner_thoughts"

    if grammar_name is not None and endpoint_type not in ["koboldcpp", "llamacpp", "webui"]:
        print(f"Warning: grammars are currently only supported when using llama.cpp as the MemGPT local LLM backend")

    # First step: turn the message sequence into a prompt that the model expects
    try:
        prompt = llm_wrapper.chat_completion_to_prompt(messages, functions)
        if DEBUG:
            print(prompt)
    except Exception as e:
        raise LocalLLMError(
            f"Failed to convert ChatCompletion messages into prompt string with wrapper {str(llm_wrapper)} - error: {str(e)}"
        )

    try:
        if endpoint_type == "webui":
            result = get_webui_completion(endpoint, prompt, context_window, grammar=grammar_name)
        elif endpoint_type == "lmstudio":
            result = get_lmstudio_completion(endpoint, prompt, context_window)
        elif endpoint_type == "llamacpp":
            result = get_llamacpp_completion(endpoint, prompt, context_window, grammar=grammar_name)
        elif endpoint_type == "koboldcpp":
            result = get_koboldcpp_completion(endpoint, prompt, context_window, grammar=grammar_name)
        elif endpoint_type == "ollama":
            result = get_ollama_completion(endpoint, model, prompt, context_window)
        else:
            raise LocalLLMError(
                f"BACKEND_TYPE is not set, please set variable depending on your backend (webui, lmstudio, llamacpp, koboldcpp)"
            )
    except requests.exceptions.ConnectionError as e:
        raise LocalLLMConnectionError(f"Unable to connect to endpoint {endpoint}")

    if result is None or result == "":
        raise LocalLLMError(f"Got back an empty response string from {endpoint}")
    if DEBUG:
        print(f"Raw LLM output:\n{result}")

    try:
        chat_completion_result = llm_wrapper.output_to_chat_completion_response(result)
        if DEBUG:
            print(json.dumps(chat_completion_result, indent=2))
    except Exception as e:
        raise LocalLLMError(f"Failed to parse JSON from local LLM response - error: {str(e)}")

    # unpack with response.choices[0].message.content
    response = DotDict(
        {
            "model": model,
            "choices": [
                DotDict(
                    {
                        "message": DotDict(chat_completion_result),
                        "finish_reason": "stop",  # TODO vary based on backend response
                    }
                )
            ],
            "usage": DotDict(
                {
                    # TODO fix, actually use real info
                    "prompt_tokens": 0,
                    "completion_tokens": 0,
                    "total_tokens": 0,
                }
            ),
        }
    )
    return response