feat: Add fetch webpage tool [LET-4188] (#4395)
* Add fetch webpage tool * Use trafilatura for web extraction
This commit is contained in:
@@ -129,7 +129,7 @@ MEMORY_TOOLS_LINE_NUMBER_PREFIX_REGEX = re.compile(
|
||||
)
|
||||
|
||||
# Built in tools
|
||||
BUILTIN_TOOLS = ["run_code", "web_search"]
|
||||
BUILTIN_TOOLS = ["run_code", "web_search", "fetch_webpage"]
|
||||
|
||||
# Built in tools
|
||||
FILES_TOOLS = ["open_files", "grep_files", "semantic_search_files"]
|
||||
|
||||
@@ -45,3 +45,16 @@ async def web_search(tasks: List[SearchTask], limit: int = 1, return_raw: bool =
|
||||
corresponding to each search task.
|
||||
"""
|
||||
raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.")
|
||||
|
||||
|
||||
async def fetch_webpage(url: str) -> str:
|
||||
"""
|
||||
Fetch a webpage and convert it to markdown/text format using Jina AI reader.
|
||||
|
||||
Args:
|
||||
url: The URL of the webpage to fetch and convert
|
||||
|
||||
Returns:
|
||||
String containing the webpage content in markdown/text format
|
||||
"""
|
||||
raise NotImplementedError("This is only available on the latest agent architecture. Please contact the Letta team.")
|
||||
|
||||
@@ -60,7 +60,7 @@ class LettaBuiltinToolExecutor(ToolExecutor):
|
||||
sandbox_config: Optional[SandboxConfig] = None,
|
||||
sandbox_env_vars: Optional[Dict[str, Any]] = None,
|
||||
) -> ToolExecutionResult:
|
||||
function_map = {"run_code": self.run_code, "web_search": self.web_search}
|
||||
function_map = {"run_code": self.run_code, "web_search": self.web_search, "fetch_webpage": self.fetch_webpage}
|
||||
|
||||
if function_name not in function_map:
|
||||
raise ValueError(f"Unknown function: {function_name}")
|
||||
@@ -415,3 +415,48 @@ class LettaBuiltinToolExecutor(ToolExecutor):
|
||||
response["message"] = "No relevant passages found that directly answer the question."
|
||||
|
||||
return response
|
||||
|
||||
async def fetch_webpage(self, agent_state: "AgentState", url: str) -> str:
|
||||
"""
|
||||
Fetch a webpage and convert it to markdown/text format using trafilatura with readability fallback.
|
||||
|
||||
Args:
|
||||
url: The URL of the webpage to fetch and convert
|
||||
|
||||
Returns:
|
||||
String containing the webpage content in markdown/text format
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
import html2text
|
||||
import requests
|
||||
from readability import Document
|
||||
from trafilatura import extract, fetch_url
|
||||
|
||||
try:
|
||||
# single thread pool call for the entire trafilatura pipeline
|
||||
def trafilatura_pipeline():
|
||||
downloaded = fetch_url(url) # fetch_url doesn't accept timeout parameter
|
||||
if downloaded:
|
||||
md = extract(downloaded, output_format="markdown")
|
||||
return md
|
||||
|
||||
md = await asyncio.to_thread(trafilatura_pipeline)
|
||||
if md:
|
||||
return md
|
||||
|
||||
# single thread pool call for the entire fallback pipeline
|
||||
def readability_pipeline():
|
||||
response = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0 (compatible; LettaBot/1.0)"})
|
||||
response.raise_for_status()
|
||||
|
||||
doc = Document(response.text)
|
||||
clean_html = doc.summary(html_partial=True)
|
||||
return html2text.html2text(clean_html)
|
||||
|
||||
return await asyncio.to_thread(readability_pipeline)
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
raise Exception(f"Error fetching webpage: {str(e)}")
|
||||
except Exception as e:
|
||||
raise Exception(f"Unexpected error: {str(e)}")
|
||||
|
||||
@@ -68,6 +68,8 @@ dependencies = [
|
||||
"markitdown[docx,pdf,pptx]>=0.1.2",
|
||||
"orjson>=3.11.1",
|
||||
"ruff[dev]>=0.12.10",
|
||||
"trafilatura",
|
||||
"readability-lxml",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
Reference in New Issue
Block a user