letta-server/paper_experiments/doc_qa_task/llm_judge_doc_qa.py

import argparse
import json
import re

from openai import OpenAI
from tqdm import tqdm

from letta.credentials import LettaCredentials

# Note: did not end up using since no cases of cheating were observed
# CHEATING_PROMPT = \
# """
# You are evaluating an LLM trying to do a task.
# The main point of failure is that the LLM may cheat and hallucinate the document it is meant to provide as evidence.
# For example, even though the LLM has been instructed to provide the document that it's using as evidence to answer the question, in this case the LLM hallucinates a document:
#
# The format of the LLM response will be:
# ANSWER: <answer>, DOCUMENT: <document_text>
#
# We also provide the list of documents that the LLM should have used to answer the question.
# This is provided with the format:
# ALL DOCUMENTS: <list of documents>
#
# Your task is to ensure the LLM did not cheat and hallucinate the document.
# You can ensure this by checking if the document provided by the LLM is in the list of documents provided.
#
# Write a short reason for your answer before stating the final answer (format <your reason> - CHEAT/CORRECT).
# """

EVAL_PROMPT = """
Your task is to evaluate whether an LLM correctly answered a question.
The LLM response should be in the format 'ANSWER: <answer>, DOCUMENT: <document_text>' or say 'INSUFFICIENT INFORMATION'.
The true answer is provided in the format 'TRUE ANSWER: <list of possible answers>'.
The question is provided in the format 'QUESTION: <question>'.
If the LLM response contains both the correct answer and corresponding document text, the response is correct.
Even if the LLM's answer and the true answer are slightly different in wording, the response is still correct.
For example, if the answer is more specific than the true answer or uses a different phrasing that is still correct, the response is correct.
If the LLM response is 'INSUFFICIENT INFORMATION', or the 'DOCUMENT' field is missing, the response is incorrect.
Respond with a single token: 'CORRECT' or 'INCORRECT'.
"""

EVAL_MODEL = "gpt-4-0613"


def evaluate_response(output: str):
    credentials = LettaCredentials().load()
    assert credentials.openai_key is not None, credentials.openai_key
    client = OpenAI(api_key=credentials.openai_key)
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": "\n".join([EVAL_PROMPT, "\n", output, "\n"]),
            },
        ],
        model=EVAL_MODEL,
    )
    response = chat_completion.choices[0].message.content
    print("llm judge", response)
    # Check "INCORRECT" first, since "CORRECT" is a substring of it
    if "INCORRECT" in response:
        return False
    elif "CORRECT" in response:
        return True
    else:
        print("INVALID RESPONSE", response)
        return False
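
# Minimal usage sketch (hypothetical strings; assumes a valid OpenAI key is configured
# in LettaCredentials):
#
#   ok = evaluate_response(
#       "QUESTION: Who wrote Hamlet?\n"
#       "TRUE ANSWER: ['William Shakespeare']\n"
#       "ANSWER: William Shakespeare, DOCUMENT: Hamlet is a tragedy by William Shakespeare."
#   )
#   # ok is True when the judge replies CORRECT, False otherwise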


# Grab the last thing Letta generated and treat it as the reply
def extract_final_letta_response(letta_responses: list) -> str:
    final_index = -1
    # Skip a trailing function_return message if present
    if "function_return" in letta_responses[final_index]:
        final_index = -2
    # Take the last field of the selected message dict as the reply text
    final_letta_response = [v for k, v in letta_responses[final_index].items()]
    final_letta_response = final_letta_response[-1]
    return final_letta_response
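
# The parser above assumes a response structure roughly like the following (an
# assumption for illustration; the exact message schema is determined by the Letta
# run that produced the results file):
#
#   [
#       {"internal_monologue": "...", "assistant_message": "ANSWER: ..., DOCUMENT: ..."},
#       {"function_return": "...", "status": "success"},
#   ]
#
# i.e. the reply text is the last value of the final message dict that is not a
# function_return.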


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="LLM-judge evaluation of doc QA results")
    parser.add_argument("--file", type=str, help="File data to evaluate")
    parser.add_argument("--baseline", action="store_true", help="Whether to use the baseline model")
    args = parser.parse_args()

    # load data
    with open(args.file) as f:
        data = json.load(f)

    # counters
    correct = 0
    total = 0

    # Make an initial pass to determine how many responses contained the correct answer
    results = []  # store all results
    eval_results = []  # store results that need LLM judge

    if args.baseline:
        # baseline experiment
        match = re.search(r"model_([^_]+)_num_docs_([^\.]+)\.json", args.file)
        model = match.group(1)
        num_docs = int(match.group(2))
        baseline = "baseline"
    else:
        # model = re.search(r"model_([^\.]+)\.json", args.file).group(1)
        model = re.search(r"model_([-\w.]+)(?:_num_docs_([-\d]+))?\.json", args.file).group(1)
        num_docs = None
        baseline = "letta"

    # evaluate data
    for d in tqdm(data):
        answer = d["true_answers"]
        question = d["question"]
        response = d["letta_responses"]
        if not args.baseline:
            # need to parse response for letta
            response = extract_final_letta_response(response)
        else:
            response = response["response"]

        # first check whether any true answer appears verbatim in the response
        found = False
        for a in answer:
            if a in response:
                found = True

        if not found and "INSUFFICIENT INFORMATION" not in response:
            # inconclusive: pass to llm judge
            print(question)
            print(answer)
            print(response)
            print(args.baseline)
            doc = "QUESTION: " + question + "\n" + "TRUE ANSWER: " + str(answer) + "\n" + response
            judge = "llm"
            judge_result = evaluate_response(doc)
            print("JUDGEMENT", judge_result)
            if judge_result:
                correct += 1
                found = True
        elif found:
            # answer found in text
            correct += 1
            judge = "text"
        else:
            judge = "text"

        results.append({"question": question, "true_answers": answer, "response": response, "correct": found, "judge": judge})
        total += 1

    # Dump aggregated results
    with open(f"results_{model}_{num_docs}_{baseline}.json", "w") as f:
        json.dump(
            {"accuracy": correct / total, "total": total, "results": results},
            f,
            indent=4,
        )

    print(correct / total)
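
# Example invocations (hypothetical result-file names; the regexes above expect
# files named model_<model>.json or model_<model>_num_docs_<n>.json):
#
#   python llm_judge_doc_qa.py --file results/model_gpt-4_num_docs_10.json --baseline
#   python llm_judge_doc_qa.py --file results/model_memgpt.json
#
# Aggregated judgements are written to results_<model>_<num_docs>_<baseline|letta>.json
# in the current working directory.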