From 12f9bf29fdc7f2dc793eceefc7bc9af261e00d17 Mon Sep 17 00:00:00 2001
From: Hans Raaf <hara@oderwat.de>
Date: Fri, 10 Nov 2023 02:05:42 +0100
Subject: [PATCH] I added some json repairs that helped me with malformed
 messages (#341)

* I added some json repairs that helped me with malformed messages

There are two of them: The first replaces hard line feeds that appear
in the message part (because the model emitted those instead of escaped
line feeds) with escaped ones. This happens a lot in my experiments, and
the repair actually fixes those messages.

The second one is less tested and should handle the case that the model
answers with multiple blocks of strings in quotes or even uses unescaped
quotes. It should grab everything between the message: " and the ending
curly braces, escape it, and turn it into proper JSON that way.

Disclaimer: Both functions were written with the help of ChatGPT-4 (I
can't write much Python). I think the first one is quite solid but doubt
that the second one is fully working. Maybe somebody with more Python
skills than me (or with more time) has a better idea for this type of
malformed replies.

* Moved the repair output behind the debug flag and removed the "clean" one

* Added even more fixes (out of what I just encountered while testing)

It seems that cut-off JSON can be corrected, and sometimes the model is
too lazy to add not just one curly brace but two. I think it does not
"cost" a lot to try them all out. But the exceptions get massive that
way :)

* black

* for the final hail mary with extract_first_json, might as well add a double end bracket instead of single

---------

Co-authored-by: cpacker <packercharles@gmail.com>
---
 memgpt/local_llm/json_parser.py | 108 ++++++++++++++++++++++++++++++--
 tests/test_json_parsers.py      |  64 +++++++++++++++++++
 2 files changed, 166 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_json_parsers.py

diff --git a/memgpt/local_llm/json_parser.py b/memgpt/local_llm/json_parser.py
index 3f03225c..1ebf0773 100644
--- a/memgpt/local_llm/json_parser.py
+++ b/memgpt/local_llm/json_parser.py
@@ -46,18 +46,114 @@ def add_missing_heartbeat(llm_json):
     raise NotImplementedError
 
 
-def clean_json(raw_llm_output, messages=None, functions=None):
-    """Try a bunch of hacks to parse the data coming out of the LLM"""
+def repair_json_string(json_string):
+    """
+    This function repairs a JSON string where line feeds were accidentally added
+    within string literals. The line feeds are replaced with the escaped line
+    feed sequence '\\n'.
+    """
+    new_string = ""
+    in_string = False
+    escape = False
 
+    for char in json_string:
+        if char == '"' and not escape:
+            in_string = not in_string
+        if char == "\\" and not escape:
+            escape = True
+        else:
+            escape = False
+        if char == "\n" and in_string:
+            new_string += "\\n"
+        else:
+            new_string += char
+
+    return new_string
+
+
+def repair_even_worse_json(json_string):
+    """
+    This function repairs a malformed JSON string where string literals are broken up and
+    not properly enclosed in quotes. It aims to consolidate everything between 'message': and
+    the two ending curly braces into one string for the 'message' field.
+    """
+    # State flags
+    in_message = False
+    in_string = False
+    escape = False
+    message_content = []
+
+    # Storage for the new JSON
+    new_json_parts = []
+
+    # Iterating through each character
+    for char in json_string:
+        if char == '"' and not escape:
+            in_string = not in_string
+            if not in_message:
+                # If we encounter a quote and are not in message, append normally
+                new_json_parts.append(char)
+        elif char == "\\" and not escape:
+            escape = True
+            new_json_parts.append(char)
+        else:
+            if escape:
+                escape = False
+            if in_message:
+                if char == "}":
+                    # Append the consolidated message and the closing characters then reset the flag
+                    new_json_parts.append('"{}"'.format("".join(message_content).replace("\n", " ")))
+                    new_json_parts.append(char)
+                    in_message = False
+                elif in_string or char.isalnum() or char.isspace() or char in ".',;:!":
+                    # Collect the message content, excluding structural characters
+                    message_content.append(char)
+            else:
+                # If we're not in message mode, append character to the output as is
+                new_json_parts.append(char)
+                if '"message":' in "".join(new_json_parts[-10:]):
+                    # If we detect "message": pattern, switch to message mode
+                    in_message = True
+                    message_content = []
+
+    # Joining everything to form the new JSON
+    repaired_json = "".join(new_json_parts)
+    return repaired_json
+
+
+def clean_json(raw_llm_output, messages=None, functions=None):
+    from memgpt.utils import printd
+
+    """Try a bunch of hacks to parse the data coming out of the LLM"""
     try:
+        # printd("clean json runs:", raw_llm_output)
         data = json.loads(raw_llm_output)
     except json.JSONDecodeError:
         try:
+            printd("trying adding }")
             data = json.loads(raw_llm_output + "}")
         except json.JSONDecodeError:
             try:
-                data = extract_first_json(raw_llm_output + "}")
-            except:
-                raise
-
+                printd("trying adding }}")
+                data = json.loads(raw_llm_output + "}}")
+            except json.JSONDecodeError:
+                try:
+                    printd('trying adding "}}')
+                    data = json.loads(raw_llm_output + '"}}')
+                except json.JSONDecodeError:
+                    try:
+                        repaired = repair_json_string(raw_llm_output)
+                        printd("trying repair_json_string:", repaired)
+                        data = json.loads(repaired)
+                    except json.JSONDecodeError:
+                        try:
+                            repaired = repair_even_worse_json(raw_llm_output)
+                            printd("trying repair_even_worse_json:", repaired)
+                            data = json.loads(repaired)
+                        except json.JSONDecodeError:
+                            try:
+                                printd("trying first_json")
+                                data = extract_first_json(raw_llm_output + "}}")
+                            except:
+                                raise
     return data
diff --git a/tests/test_json_parsers.py b/tests/test_json_parsers.py
new file mode 100644
index 00000000..72ed7f3f
--- /dev/null
+++ b/tests/test_json_parsers.py
@@ -0,0 +1,64 @@
+import json
+
+import memgpt.local_llm.json_parser as json_parser
+
+
+EXAMPLE_MISSING_CLOSING_BRACE = """{
+  "function": "send_message",
+  "params": {
+    "inner_thoughts": "Oops, I got their name wrong! I should apologize and correct myself.",
+    "message": "Sorry about that! I assumed you were Chad. Welcome, Brad! "
+  }
+"""
+
+EXAMPLE_BAD_TOKEN_END = """{
+  "function": "send_message",
+  "params": {
+    "inner_thoughts": "Oops, I got their name wrong! I should apologize and correct myself.",
+    "message": "Sorry about that! I assumed you were Chad. Welcome, Brad! "
+  }
+}<|>"""
+
+EXAMPLE_DOUBLE_JSON = """{
+  "function": "core_memory_append",
+  "params": {
+    "name": "human",
+    "content": "Brad, 42 years old, from Germany."
+  }
+}
+{
+  "function": "send_message",
+  "params": {
+    "message": "Got it! Your age and nationality are now saved in my memory."
+  }
+}
+"""
+
+EXAMPLE_HARD_LINE_FEEDS = """{
+  "function": "send_message",
+  "params": {
+    "message": "Let's create a list:
+- First, we can do X
+- Then, we can do Y!
+- Lastly, we can do Z :)"
+  }
+}
+"""
+
+
+def test_json_parsers():
+    """Try various broken JSON and check that the parsers can fix it"""
+
+    test_strings = [EXAMPLE_MISSING_CLOSING_BRACE, EXAMPLE_BAD_TOKEN_END, EXAMPLE_DOUBLE_JSON, EXAMPLE_HARD_LINE_FEEDS]
+
+    for string in test_strings:
+        try:
+            json.loads(string)
+            assert False, f"Test JSON string should have failed basic JSON parsing:\n{string}"
+        except:
+            print("String failed (expectedly)")
+            try:
+                json_parser.clean_json(string)
+            except:
+                f"Failed to repair test JSON string:\n{string}"
+                raise