feat: Add fetch webpage tool [LET-4188] (#4395)

* Add fetch webpage tool * Use trafilatura for web extraction
2025-09-03 13:34:35 -07:00
parent 051a5cde6a
commit 129dd97902
5 changed files with 2767 additions and 2571 deletions
--- a/letta/constants.py
+++ b/letta/constants.py
@@ -129,7 +129,7 @@ MEMORY_TOOLS_LINE_NUMBER_PREFIX_REGEX = re.compile(
 )

 # Built in tools
-BUILTIN_TOOLS = ["run_code", "web_search"]
+BUILTIN_TOOLS = ["run_code", "web_search", "fetch_webpage"]

 # Built in tools
 FILES_TOOLS = ["open_files", "grep_files", "semantic_search_files"]
--- a/letta/functions/function_sets/builtin.py
+++ b/letta/functions/function_sets/builtin.py
@@ -45,3 +45,16 @@ async def web_search(tasks: List[SearchTask], limit: int = 1, return_raw: bool =
             corresponding to each search task.
    """
    raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.")
+
+
+async def fetch_webpage(url: str) -> str:
+    """
+    Fetch a webpage and convert it to markdown/text format using trafilatura with readability fallback.
+
+    Args:
+        url: The URL of the webpage to fetch and convert
+
+    Returns:
+        String containing the webpage content in markdown/text format
+    """
+    raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.")
--- a/letta/services/tool_executor/builtin_tool_executor.py
+++ b/letta/services/tool_executor/builtin_tool_executor.py
@@ -60,7 +60,7 @@ class LettaBuiltinToolExecutor(ToolExecutor):
        sandbox_config: Optional[SandboxConfig] = None,
        sandbox_env_vars: Optional[Dict[str, Any]] = None,
    ) -> ToolExecutionResult:
-        function_map = {"run_code": self.run_code, "web_search": self.web_search}
+        function_map = {"run_code": self.run_code, "web_search": self.web_search, "fetch_webpage": self.fetch_webpage}

        if function_name not in function_map:
            raise ValueError(f"Unknown function: {function_name}")
@@ -415,3 +415,48 @@ class LettaBuiltinToolExecutor(ToolExecutor):
            response["message"] = "No relevant passages found that directly answer the question."

        return response
+
+    async def fetch_webpage(self, agent_state: "AgentState", url: str) -> str:
+        """
+        Fetch a webpage and convert it to markdown/text format using trafilatura with readability fallback.
+
+        Args:
+            url: The URL of the webpage to fetch and convert
+
+        Returns:
+            String containing the webpage content in markdown/text format
+
+        Raises:
+            Exception: If the download or the conversion fails; the original error is chained as __cause__
+        """
+        import asyncio
+
+        import html2text
+        import requests
+        from readability import Document
+        from trafilatura import extract, fetch_url
+
+        try:
+            # blocking download + extraction run together in one worker thread via asyncio.to_thread
+            def trafilatura_pipeline():
+                downloaded = fetch_url(url)  # fetch_url doesn't accept timeout parameter
+                return extract(downloaded, output_format="markdown") if downloaded else None
+
+            md = await asyncio.to_thread(trafilatura_pipeline)
+            if md:
+                return md
+
+            # fallback when trafilatura yields nothing: plain GET + readability + html2text, again in one thread
+            def readability_pipeline():
+                response = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0 (compatible; LettaBot/1.0)"})
+                response.raise_for_status()
+
+                clean_html = Document(response.text).summary(html_partial=True)
+                return html2text.html2text(clean_html)
+
+            return await asyncio.to_thread(readability_pipeline)
+
+        except requests.exceptions.RequestException as e:
+            raise Exception(f"Error fetching webpage: {str(e)}") from e
+        except Exception as e:
+            raise Exception(f"Unexpected error: {str(e)}") from e
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,6 +68,8 @@ dependencies = [
    "markitdown[docx,pdf,pptx]>=0.1.2",
    "orjson>=3.11.1",
    "ruff[dev]>=0.12.10",
+    "trafilatura",
+    "readability-lxml",
 ]

 [project.scripts]
--- a/uv.lock
+++ b/uv.lock