# Raw control characters that JSON forbids inside a string literal, mapped to
# the escape sequence that represents each of them.
_CONTROL_CHAR_ESCAPES = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}

# Characters that may legally follow a backslash inside a JSON string.
_JSON_ESCAPABLE = '"\\/bfnrtu'

# Key whose value `repair_even_worse_json` consolidates into one string.
_MESSAGE_KEY = '"message":'

# Punctuation kept in a message even when it appears outside of any quotes.
_MESSAGE_PUNCTUATION = ".',;:!"


def repair_json_string(json_string):
    """Escape raw control characters found inside JSON string literals.

    LLM output frequently contains hard line breaks (and occasionally tabs or
    carriage returns) inside a quoted value, which strict JSON parsing rejects.
    Each such character is replaced by its escape sequence (``\\n``, ``\\r``,
    ``\\t``). Characters outside of string literals are left untouched, so
    ordinary pretty-printing whitespace survives.

    Args:
        json_string: The possibly malformed JSON text.

    Returns:
        A copy of ``json_string`` with in-string control characters escaped.
    """
    repaired = []
    in_string = False
    escape = False
    for char in json_string:
        # Only an unescaped quote opens or closes a string literal.
        if char == '"' and not escape:
            in_string = not in_string
        # A backslash escapes the next character only; an escaped backslash
        # does not start a new escape.
        escape = char == "\\" and not escape
        if in_string and char in _CONTROL_CHAR_ESCAPES:
            repaired.append(_CONTROL_CHAR_ESCAPES[char])
        else:
            repaired.append(char)

    return "".join(repaired)


def _quote_message(message_content):
    """Join collected message fragments into one quoted JSON string literal."""
    # Whitespace at either end comes from the JSON layout around the value
    # (the space after the colon, the indentation before the closing brace).
    return '"{}"'.format("".join(message_content).strip())


def repair_even_worse_json(json_string):
    """Consolidate a broken-up ``"message"`` value into one string literal.

    Handles output where the message text is split across several quoted
    fragments, is missing quotes, or is otherwise not a single valid string.
    Everything between ``"message":`` and the next ``}`` is merged into one
    quoted value; text outside of a message is copied through unchanged.

    This is a lossy heuristic:
      * the message ends at the first ``}``, even one meant as message text;
      * outside of quotes only alphanumerics, whitespace and ``.',;:!`` are
        kept, other characters are dropped;
      * every whitespace character in the message becomes a single space.

    Args:
        json_string: The malformed JSON text.

    Returns:
        The repaired text. It is not guaranteed to be valid JSON; callers are
        expected to attempt parsing and fall back if that still fails.
    """
    in_message = False
    in_string = False
    escape = False
    message_content = []
    new_json_parts = []

    for char in json_string:
        if char == '"' and not escape:
            in_string = not in_string
            # Quotes inside the message are dropped: the whole message is
            # re-quoted once when it is closed.
            if not in_message:
                new_json_parts.append(char)
            continue

        if char == "\\" and not escape:
            escape = True
            # Inside a message the backslash is held back and emitted together
            # with the character it escapes, so the pair stays in order.
            if not in_message:
                new_json_parts.append(char)
            continue

        was_escaped = escape
        escape = False

        if not in_message:
            new_json_parts.append(char)
            # Parts are single characters here, so the tail of the output is
            # exactly the last few characters written.
            tail = "".join(new_json_parts[-len(_MESSAGE_KEY):])
            if tail.endswith(_MESSAGE_KEY):
                in_message = True
                message_content = []
            continue

        if char == "}":
            new_json_parts.append(_quote_message(message_content))
            new_json_parts.append(char)
            in_message = False
            # The message may have had unbalanced quotes; start clean.
            in_string = False
        elif was_escaped and char in _JSON_ESCAPABLE:
            message_content.append("\\" + char)
        elif char.isspace():
            # Raw control whitespace is illegal inside a JSON string.
            message_content.append(" ")
        elif in_string or char.isalnum() or char in _MESSAGE_PUNCTUATION:
            message_content.append(char)

    if in_message:
        # Input ended before the message was closed: keep what was collected
        # rather than silently discarding it.
        new_json_parts.append(_quote_message(message_content))

    return "".join(new_json_parts)


def clean_json(raw_llm_output, messages=None, functions=None):
    """Try a bunch of hacks to parse the data coming out of the LLM.

    Strategies are tried in order and the first one that parses wins:
      1. the raw text as-is;
      2. the raw text with ``}``, ``}}`` or ``"}}`` appended;
      3. the text after ``repair_json_string``;
      4. the text after ``repair_even_worse_json``;
      5. ``extract_first_json`` on the raw text with ``}}`` appended.

    Args:
        raw_llm_output: The text produced by the LLM.
        messages: Unused; kept for interface compatibility.
        functions: Unused; kept for interface compatibility.

    Returns:
        The parsed JSON data.

    Raises:
        Whatever ``extract_first_json`` raises when every strategy fails.
    """
    from memgpt.utils import printd

    try:
        return json.loads(raw_llm_output)
    except json.JSONDecodeError:
        pass

    # Truncated output: the model stopped before closing the structure.
    for suffix in ("}", "}}", '"}}'):
        printd(f"trying adding {suffix}")
        try:
            return json.loads(raw_llm_output + suffix)
        except json.JSONDecodeError:
            continue

    # Structurally damaged output: rewrite it, then parse.
    for repair in (repair_json_string, repair_even_worse_json):
        repaired = repair(raw_llm_output)
        printd(f"trying {repair.__name__}:", repaired)
        try:
            return json.loads(repaired)
        except json.JSONDecodeError:
            continue

    # Last resort; any failure here propagates to the caller.
    printd("trying first_json")
    return extract_first_json(raw_llm_output + "}}")