feat: Add fetch webpage tool [LET-4188] (#4395)

* Add fetch webpage tool * Use trafilatura for web extraction
2025-09-03 13:34:35 -07:00
parent 051a5cde6a
commit 129dd97902
5 changed files with 2767 additions and 2571 deletions
--- a/letta/constants.py
+++ b/letta/constants.py
@@ -129,7 +129,7 @@ MEMORY_TOOLS_LINE_NUMBER_PREFIX_REGEX = re.compile(
 )
 # Built in tools
-BUILTIN_TOOLS = ["run_code", "web_search"]
+BUILTIN_TOOLS = ["run_code", "web_search", "fetch_webpage"]
 # Built in tools
 FILES_TOOLS = ["open_files", "grep_files", "semantic_search_files"]
--- a/letta/functions/function_sets/builtin.py
+++ b/letta/functions/function_sets/builtin.py
@@ -45,3 +45,16 @@ async def web_search(tasks: List[SearchTask], limit: int = 1, return_raw: bool =
             corresponding to each search task.
    """
    raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.")
 async def fetch_webpage(url: str) -> str:
    """
    Fetch a webpage and convert it to markdown/text format using Jina AI reader.
    Args:
        url: The URL of the webpage to fetch and convert
    Returns:
        String containing the webpage content in markdown/text format
    """
    raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.")
--- a/letta/services/tool_executor/builtin_tool_executor.py
+++ b/letta/services/tool_executor/builtin_tool_executor.py
@@ -60,7 +60,7 @@ class LettaBuiltinToolExecutor(ToolExecutor):
        sandbox_config: Optional[SandboxConfig] = None,
        sandbox_env_vars: Optional[Dict[str, Any]] = None,
    ) -> ToolExecutionResult:
-        function_map = {"run_code": self.run_code, "web_search": self.web_search}
+        function_map = {"run_code": self.run_code, "web_search": self.web_search, "fetch_webpage": self.fetch_webpage}
        if function_name not in function_map:
            raise ValueError(f"Unknown function: {function_name}")
@@ -415,3 +415,48 @@ class LettaBuiltinToolExecutor(ToolExecutor):
            response["message"] = "No relevant passages found that directly answer the question."
        return response
    async def fetch_webpage(self, agent_state: "AgentState", url: str) -> str:
        """
        Fetch a webpage and convert it to markdown/text format using trafilatura with readability fallback.
        Args:
            url: The URL of the webpage to fetch and convert
        Returns:
            String containing the webpage content in markdown/text format
        """
        import asyncio
        import html2text
        import requests
        from readability import Document
        from trafilatura import extract, fetch_url
        try:
            # single thread pool call for the entire trafilatura pipeline
            def trafilatura_pipeline():
                downloaded = fetch_url(url)  # fetch_url doesn't accept timeout parameter
                if downloaded:
                    md = extract(downloaded, output_format="markdown")
                    return md
            md = await asyncio.to_thread(trafilatura_pipeline)
            if md:
                return md
            # single thread pool call for the entire fallback pipeline
            def readability_pipeline():
                response = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0 (compatible; LettaBot/1.0)"})
                response.raise_for_status()
                doc = Document(response.text)
                clean_html = doc.summary(html_partial=True)
                return html2text.html2text(clean_html)
            return await asyncio.to_thread(readability_pipeline)
        except requests.exceptions.RequestException as e:
            raise Exception(f"Error fetching webpage: {str(e)}")
        except Exception as e:
            raise Exception(f"Unexpected error: {str(e)}")
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,6 +68,8 @@ dependencies = [
    "markitdown[docx,pdf,pptx]>=0.1.2",
    "orjson>=3.11.1",
    "ruff[dev]>=0.12.10",
    "trafilatura",
    "readability-lxml",
 ]
 [project.scripts]
--- a/uv.lock
+++ b/uv.lock