diff --git a/letta/services/tool_executor/core_tool_executor.py b/letta/services/tool_executor/core_tool_executor.py
index 6918daaa..e9e1ccf6 100644
--- a/letta/services/tool_executor/core_tool_executor.py
+++ b/letta/services/tool_executor/core_tool_executor.py
@@ -9,6 +9,7 @@ from letta.constants import (
     RETRIEVAL_QUERY_DEFAULT_PAGE_SIZE,
 )
 from letta.helpers.json_helpers import json_dumps
+from letta.helpers.tpuf_client import should_use_tpuf_for_messages
 from letta.log import get_logger
 from letta.schemas.agent import AgentState
 from letta.schemas.block import BlockUpdate
@@ -87,7 +88,7 @@ class LettaCoreToolExecutor(ToolExecutor):
         limit: Optional[int] = None,
         start_date: Optional[str] = None,
         end_date: Optional[str] = None,
-    ) -> Optional[str]:
+    ) -> Optional[dict]:
         try:
             # Parse datetime parameters if provided
             start_datetime = None
@@ -148,10 +149,32 @@ class LettaCoreToolExecutor(ToolExecutor):
                 end_date=end_datetime,
             )
 
-            if len(message_results) == 0:
-                results_str = "No results found."
+            # Filtering of tool messages is only necessary if we aren't using turbopuffer.
+            if should_use_tpuf_for_messages():
+                filtered_results = message_results
+            else:
+                # Filter out tool messages to prevent recursive results and exponential escaping
+                from letta.constants import CONVERSATION_SEARCH_TOOL_NAME
+                from letta.schemas.enums import MessageRole
+
+                filtered_results = []
+                for message, metadata in message_results:
+                    # Skip ALL tool messages - they contain tool execution results
+                    # which can cause recursive nesting and exponential escaping
+                    if message.role == MessageRole.tool:
+                        continue
+
+                    # Also skip assistant messages that call conversation_search
+                    # These can contain the search query which may lead to confusing results
+                    if message.role == MessageRole.assistant and message.tool_calls:
+                        if CONVERSATION_SEARCH_TOOL_NAME in [tool_call.function.name for tool_call in message.tool_calls]:
+                            continue
+
+                    filtered_results.append((message, metadata))
+
+            if len(filtered_results) == 0:
+                return {"message": "No results found.", "results": []}
             else:
-                results_pref = f"Showing {len(message_results)} results:"
                 results_formatted = []
                 # get current time in UTC, then convert to agent timezone for consistent comparison
                 from datetime import timezone
@@ -166,7 +189,7 @@ class LettaCoreToolExecutor(ToolExecutor):
                 else:
                     now = now_utc
 
-                for message, metadata in message_results:
+                for message, metadata in filtered_results:
                     # Format timestamp in agent's timezone if available
                     timestamp = message.created_at
                     time_delta_str = ""
@@ -249,10 +272,11 @@ class LettaCoreToolExecutor(ToolExecutor):
 
                     results_formatted.append(result_dict)
 
-                # Don't double-encode - results_formatted already has the parsed content
-                results_str = f"{results_pref} {json_dumps(results_formatted)}"
-
-            return results_str
+                # Return a structured dict (not a JSON string) to avoid double-encoding; the count reflects the list actually returned
+                return {
+                    "message": f"Showing {len(results_formatted)} results:",
+                    "results": results_formatted,
+                }
 
         except Exception as e:
             raise e
diff --git a/letta/services/tool_executor/tool_execution_manager.py b/letta/services/tool_executor/tool_execution_manager.py
index 9a190d5d..bffce487 100644
--- a/letta/services/tool_executor/tool_execution_manager.py
+++ b/letta/services/tool_executor/tool_execution_manager.py
@@ -1,4 +1,5 @@
 import asyncio
+import json
 import traceback
 from typing import Any, Dict, Optional, Type
 
@@ -122,9 +123,9 @@ class ToolExecutionManager:
             status = result.status
 
             # trim result
-            return_str = str(result.func_return)
+            # Serialize dicts as JSON to measure length; default=str keeps non-JSON values from raising here (plain str() never did)
+            return_str = json.dumps(result.func_return, default=str) if isinstance(result.func_return, dict) else str(result.func_return)
             if len(return_str) > tool.return_char_limit:
-                # TODO: okay that this become a string?
                 result.func_return = FUNCTION_RETURN_VALUE_TRUNCATED(return_str, len(return_str), tool.return_char_limit)
             return result
 
diff --git a/letta/system.py b/letta/system.py
index dfbf5b28..c545fbbb 100644
--- a/letta/system.py
+++ b/letta/system.py
@@ -1,5 +1,5 @@
 import json
-from typing import Optional
+from typing import Any, Optional
 
 from letta.log import get_logger
 
@@ -147,11 +147,21 @@ def package_user_message(
     return json_dumps(packaged_message)
 
 
-def package_function_response(was_success: bool, response_string: str, timezone: str | None) -> str:
+def package_function_response(was_success: bool, response_string: Any, timezone: str | None) -> str:
+    """Package a function response with status and timestamp.
+
+    Args:
+        was_success: Whether the function execution succeeded
+        response_string: The function response - can be a string or dict. Dicts are NOT pre-encoded to avoid double JSON encoding.
+        timezone: The timezone to use for the timestamp
+
+    Returns:
+        JSON string with status, message, and time
+    """
     formatted_time = get_local_time(timezone=timezone)
     packaged_message = {
         "status": "OK" if was_success else "Failed",
-        "message": response_string,
+        "message": response_string,  # Can be str or dict - json_dumps handles both
         "time": formatted_time,
     }
 
diff --git a/letta/utils.py b/letta/utils.py
index 5f02d429..ae2f6d2c 100644
--- a/letta/utils.py
+++ b/letta/utils.py
@@ -854,11 +854,14 @@ def parse_json(string) -> dict:
         raise e
 
 
-def validate_function_response(function_response: Any, return_char_limit: int, strict: bool = False, truncate: bool = True) -> str:
+def validate_function_response(function_response: Any, return_char_limit: int, strict: bool = False, truncate: bool = True) -> Any:
     """Check to make sure that a function used by Letta returned a valid response. Truncates to return_char_limit if necessary.
 
-    This makes sure that we can coerce the function_response into a string that meets our criteria. We handle some soft coercion.
+    This makes sure that we can coerce the function_response into a string or dict that meets our criteria. We handle some soft coercion.
     If strict is True, we raise a ValueError if function_response is not a string or None.
+
+    Returns:
+        str or dict: Validated response. Dicts are returned as-is to avoid double JSON encoding by package_function_response.
     """
     if isinstance(function_response, str):
         function_response_string = function_response
@@ -870,9 +873,17 @@ def validate_function_response(function_response: Any, return_char_limit: int, s
         raise ValueError(f"Strict mode violation. Function returned type: {type(function_response).__name__}")
 
     elif isinstance(function_response, dict):
-        # As functions can return arbitrary data, if there's already nesting somewhere in the response, it's difficult
-        # for us to not result in double escapes.
-        function_response_string = json_dumps(function_response)
+        # For dicts, check if truncation is needed
+        if truncate and return_char_limit:
+            # Convert to JSON string to check size
+            json_str = json_dumps(function_response)
+            if len(json_str) > return_char_limit:
+                # If truncation is needed, return truncated string
+                logger.warning(f"function return was over limit ({len(json_str)} > {return_char_limit}) and was truncated")
+                return f"{json_str[:return_char_limit]}... [NOTE: function output was truncated since it exceeded the character limit ({len(json_str)} > {return_char_limit})]"
+        # Otherwise return dict as-is to avoid double JSON encoding
+        # package_function_response will handle the final JSON serialization
+        return function_response
     else:
         logger.debug(f"Function returned type {type(function_response).__name__}. Coercing to string.")
         function_response_string = str(function_response)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index bf89ed0b..3d16039e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -569,14 +569,12 @@ def test_validate_function_response_none_input():
 
 
 def test_validate_function_response_dict_input():
-    """Test that dict inputs are JSON serialized"""
+    """Test that dict inputs are returned as-is (not pre-serialized) to avoid double JSON encoding"""
     test_dict = {"key": "value", "number": 42}
     response = validate_function_response(test_dict, return_char_limit=100)
-    # Response should be valid JSON string
-    import json
-
-    parsed = json.loads(response)
-    assert parsed == test_dict
+    # Response should be the dict itself, not a JSON string
+    assert isinstance(response, dict)
+    assert response == test_dict
 
 
 def test_validate_function_response_other_types():
@@ -641,14 +639,12 @@ def test_validate_function_response_exact_limit():
 
 
 def test_validate_function_response_complex_dict():
-    """Test with complex nested dictionary"""
+    """Test with complex nested dictionary - should be returned as-is"""
     complex_dict = {"nested": {"key": "value"}, "list": [1, 2, {"inner": "dict"}], "null": None, "bool": True}
     response = validate_function_response(complex_dict, return_char_limit=1000)
-    # Should be valid JSON
-    import json
-
-    parsed = json.loads(response)
-    assert parsed == complex_dict
+    # Should be the dict itself, not a JSON string
+    assert isinstance(response, dict)
+    assert response == complex_dict
 
 
 def test_validate_function_response_dict_truncation():