feat: Add fetch webpage tool [LET-4188] (#4395)

* Add fetch webpage tool

* Use trafilatura for web extraction
This commit is contained in:
Matthew Zhou
2025-09-03 13:34:35 -07:00
committed by GitHub
parent 051a5cde6a
commit 129dd97902
5 changed files with 2767 additions and 2571 deletions

View File

@@ -129,7 +129,7 @@ MEMORY_TOOLS_LINE_NUMBER_PREFIX_REGEX = re.compile(
) )
# Built in tools # Built in tools
BUILTIN_TOOLS = ["run_code", "web_search"] BUILTIN_TOOLS = ["run_code", "web_search", "fetch_webpage"]
# Built in tools # Built in tools
FILES_TOOLS = ["open_files", "grep_files", "semantic_search_files"] FILES_TOOLS = ["open_files", "grep_files", "semantic_search_files"]

View File

@@ -45,3 +45,16 @@ async def web_search(tasks: List[SearchTask], limit: int = 1, return_raw: bool =
corresponding to each search task. corresponding to each search task.
""" """
raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.") raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.")
async def fetch_webpage(url: str) -> str:
    """
    Fetch a webpage and convert it to markdown/text format using trafilatura with readability fallback.

    This function is only a declaration of the tool's signature and
    description; calling it directly always raises.

    Args:
        url: The URL of the webpage to fetch and convert

    Returns:
        String containing the webpage content in markdown/text format

    Raises:
        NotImplementedError: Always, when this stub itself is invoked.
    """
    raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.")

View File

@@ -60,7 +60,7 @@ class LettaBuiltinToolExecutor(ToolExecutor):
sandbox_config: Optional[SandboxConfig] = None, sandbox_config: Optional[SandboxConfig] = None,
sandbox_env_vars: Optional[Dict[str, Any]] = None, sandbox_env_vars: Optional[Dict[str, Any]] = None,
) -> ToolExecutionResult: ) -> ToolExecutionResult:
function_map = {"run_code": self.run_code, "web_search": self.web_search} function_map = {"run_code": self.run_code, "web_search": self.web_search, "fetch_webpage": self.fetch_webpage}
if function_name not in function_map: if function_name not in function_map:
raise ValueError(f"Unknown function: {function_name}") raise ValueError(f"Unknown function: {function_name}")
@@ -415,3 +415,48 @@ class LettaBuiltinToolExecutor(ToolExecutor):
response["message"] = "No relevant passages found that directly answer the question." response["message"] = "No relevant passages found that directly answer the question."
return response return response
async def fetch_webpage(self, agent_state: "AgentState", url: str) -> str:
    """
    Fetch a webpage and convert it to markdown/text format using trafilatura with readability fallback.

    The primary path downloads the page and extracts markdown with
    trafilatura. If that produces nothing, the page is fetched again with
    requests, reduced to its main content by readability and converted to
    text with html2text. Both pipelines are blocking, so each one runs in a
    worker thread through asyncio.to_thread.

    Args:
        agent_state: Not read by this method.
        url: The URL of the webpage to fetch and convert

    Returns:
        String containing the webpage content in markdown/text format

    Raises:
        Exception: "Error fetching webpage: ..." when a requests error is
            raised, "Unexpected error: ..." for any other failure. The
            original exception is attached as ``__cause__``.
    """
    import asyncio

    import html2text
    import requests
    from readability import Document
    from trafilatura import extract, fetch_url

    # NOTE(review): url comes from the tool caller and is fetched as-is; no
    # scheme or host restriction is applied here. Confirm whether requests to
    # internal addresses need to be blocked upstream.

    def trafilatura_pipeline():
        """Download and extract markdown; None when the download yields nothing."""
        downloaded = fetch_url(url)  # fetch_url doesn't accept timeout parameter
        if not downloaded:
            return None
        return extract(downloaded, output_format="markdown")

    def readability_pipeline():
        """Fetch with requests, keep the main content, convert it to text."""
        response = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0 (compatible; LettaBot/1.0)"})
        response.raise_for_status()
        doc = Document(response.text)
        clean_html = doc.summary(html_partial=True)
        return html2text.html2text(clean_html)

    try:
        # single thread pool call for the entire trafilatura pipeline
        md = await asyncio.to_thread(trafilatura_pipeline)
        if md:
            return md

        # single thread pool call for the entire fallback pipeline
        return await asyncio.to_thread(readability_pipeline)
    except requests.exceptions.RequestException as e:
        # Chain explicitly so the underlying HTTP error stays in the traceback.
        raise Exception(f"Error fetching webpage: {str(e)}") from e
    except Exception as e:
        raise Exception(f"Unexpected error: {str(e)}") from e

View File

@@ -68,6 +68,8 @@ dependencies = [
"markitdown[docx,pdf,pptx]>=0.1.2", "markitdown[docx,pdf,pptx]>=0.1.2",
"orjson>=3.11.1", "orjson>=3.11.1",
"ruff[dev]>=0.12.10", "ruff[dev]>=0.12.10",
"trafilatura",
"readability-lxml",
] ]
[project.scripts] [project.scripts]

5274
uv.lock generated

File diff suppressed because it is too large Load Diff