Files
letta-server/letta/server/rest_api/routers/v1/git_http.py
Kian Jones f5c4ab50f4 chore: add ty + pre-commit hook and repeal even more ruff rules (#9504)
* auto fixes

* auto fix pt2 and transitive deps and undefined var checking locals()

* manual fixes (ignored or letta-code fixed)

* fix circular import

* remove all ignores, add FastAPI rules and Ruff rules

* add ty and precommit

* ruff stuff

* ty check fixes

* ty check fixes pt 2

* error on invalid
2026-02-24 10:55:11 -08:00

796 lines
28 KiB
Python

"""Git HTTP Smart Protocol endpoints via dulwich (proxied).
## Why a separate dulwich server?
Dulwich's `HTTPGitApplication` is a WSGI app and relies on the WSGI `write()`
callback pattern. Starlette's `WSGIMiddleware` does not fully support this
pattern, which causes failures when mounting dulwich directly into FastAPI.
To avoid the ASGI/WSGI impedance mismatch, we run dulwich's WSGI server on a
separate local port (default: 8284) and proxy `/v1/git/*` requests to it.
Example:
git clone http://localhost:8283/v1/git/{agent_id}/state.git
Routes (smart HTTP):
GET /v1/git/{agent_id}/state.git/info/refs?service=git-upload-pack
POST /v1/git/{agent_id}/state.git/git-upload-pack
GET /v1/git/{agent_id}/state.git/info/refs?service=git-receive-pack
POST /v1/git/{agent_id}/state.git/git-receive-pack
The dulwich server uses `GCSBackend` to materialize repositories from GCS
on-demand.
Post-push sync back to GCS/PostgreSQL is triggered from the proxy route after a
successful `git-receive-pack`.
"""
from __future__ import annotations
import asyncio
import contextvars
import os
import shutil
import tempfile
import threading
from typing import Dict, Iterable, Optional
import httpx
# dulwich is an optional dependency (extra = "git-state"). CI installs don't
# include it, so imports must be lazy/guarded.
try:
    from dulwich.repo import Repo
    from dulwich.server import Backend
    from dulwich.web import HTTPGitApplication, make_server

    _DULWICH_AVAILABLE = True
except ImportError:  # pragma: no cover
    # Stub out the names so module-level references stay valid when dulwich is
    # absent; all call sites must check _DULWICH_AVAILABLE before using them.
    Repo = None  # type: ignore[assignment]

    class Backend:  # type: ignore[no-redef]
        pass

    HTTPGitApplication = None  # type: ignore[assignment]
    make_server = None  # type: ignore[assignment]
    _DULWICH_AVAILABLE = False
from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse
from starlette.background import BackgroundTask
from letta.log import get_logger
from letta.server.rest_api.dependencies import HeaderParams, get_headers, get_letta_server
logger = get_logger(__name__)

# Strong references to fire-and-forget post-push sync tasks so the event loop
# does not garbage-collect them mid-run.
_background_tasks: set[asyncio.Task] = set()

# Hidden from the OpenAPI schema: these endpoints speak git smart HTTP, not JSON.
router = APIRouter(prefix="/git", tags=["git"], include_in_schema=False)

# Global storage for the server instance (set during app startup)
_server_instance = None

# org_id/agent_id -> temp working tree path (repo root, with .git inside)
_repo_cache: Dict[str, str] = {}
# Per-cache-key locks serializing materialization/refresh of a working tree.
_repo_locks: Dict[str, threading.Lock] = {}
def _dulwich_repo_path_marker_file(cache_key: str) -> str:
"""Path to a marker file that stores the dulwich temp repo path.
Dulwich runs in-process and mutates a repo materialized into a temp directory.
We then need to locate that same temp directory after the push to persist the
updated `.git/` contents back to object storage.
In production we may have multiple FastAPI workers; in-memory `_repo_cache`
is not shared across workers, so we store the repo_path in a small file under
/tmp as a best-effort handoff. (Longer-term, we'll likely move dulwich to its
own service/process and remove this.)
"""
safe = cache_key.replace("/", "__")
base = os.path.join(tempfile.gettempdir(), "letta-git-http")
os.makedirs(base, exist_ok=True)
return os.path.join(base, f"dulwich_repo_path__{safe}.txt")
# org_id for the currently-handled dulwich request (set by a WSGI wrapper).
# dulwich only passes repository *paths* to the Backend, so the org context
# must travel out-of-band via this contextvar.
_current_org_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar("letta_git_http_org_id", default=None)

# Dulwich server globals (populated by start_dulwich_server in this process)
_dulwich_server = None
_dulwich_thread: Optional[threading.Thread] = None
def set_server_instance(server) -> None:
    """Record the Letta server instance used by git operations.

    Invoked once during application startup so module-level helpers (the
    dulwich backend and post-push sync) can reach managers/storage through
    the running server.
    """
    global _server_instance
    _server_instance = server
def _get_dulwich_port() -> int:
return int(os.getenv("LETTA_GIT_HTTP_DULWICH_PORT", "8284"))
def start_dulwich_server(host: str = "127.0.0.1", port: Optional[int] = None) -> None:
    """Start a local dulwich HTTP server in a background thread.

    This is safe to call multiple times; only the first successful call will
    start a server in the current process.

    Args:
        host: Interface to bind. Loopback by default; requests should only
            arrive via our in-process proxy route.
        port: TCP port; defaults to LETTA_GIT_HTTP_DULWICH_PORT (8284).

    Raises:
        RuntimeError: if set_server_instance() was never called.
    """
    global _dulwich_server, _dulwich_thread
    if not _DULWICH_AVAILABLE:
        logger.info("dulwich not installed; git smart HTTP is disabled")
        return
    # Idempotence: a live server thread means a prior call already succeeded.
    if _dulwich_thread and _dulwich_thread.is_alive():
        return
    if port is None:
        port = _get_dulwich_port()
    # Ensure backend can access storage through the running server.
    if _server_instance is None:
        raise RuntimeError("Server instance not set (did you call set_server_instance?)")
    try:
        _dulwich_server = make_server(host, port, _git_wsgi_app)
    except OSError as e:
        # When running with multiple uvicorn workers, only one process can bind
        # to the configured port.
        logger.warning("Failed to bind dulwich git server on %s:%s: %s", host, port, e)
        return

    def _run():
        # Thread body: blocks in serve_forever() until shutdown() is called.
        logger.info("Starting dulwich git HTTP server on http://%s:%s", host, port)
        try:
            _dulwich_server.serve_forever()
        except Exception:
            logger.exception("Dulwich git HTTP server crashed")

    # Daemon thread: process exit does not wait on the serve loop.
    _dulwich_thread = threading.Thread(target=_run, name="dulwich-git-http", daemon=True)
    _dulwich_thread.start()
def stop_dulwich_server() -> None:
    """Best-effort shutdown of the local dulwich server, if one was started."""
    global _dulwich_server
    if _dulwich_server is None:
        return
    try:
        # wsgiref shutdown(): unblocks serve_forever() in the background thread.
        _dulwich_server.shutdown()
    except Exception:
        logger.exception("Failed to shutdown dulwich server")
def _require_current_org_id() -> str:
    """Return the org_id bound to the current dulwich request.

    The WSGI wrapper stashes the org_id in a contextvar before dispatching to
    dulwich; a missing value means the request did not pass through our proxy.

    Raises:
        RuntimeError: if no org_id was set for this request.
    """
    if org_id := _current_org_id.get():
        return org_id
    raise RuntimeError("Missing org_id for git HTTP request")
def _resolve_org_id_from_wsgi_environ(environ: dict) -> Optional[str]:
"""Resolve org_id for dulwich, preferring X-Organization-Id.
This is used by the dulwich WSGI wrapper. If X-Organization-Id is missing,
we fall back to resolving via the authenticated user_id header.
Note: dulwich is served on 127.0.0.1, so these headers should only be set by
our trusted in-pod proxy layer.
"""
org_id = environ.get("HTTP_X_ORGANIZATION_ID")
if org_id:
return org_id
user_id = environ.get("HTTP_USER_ID")
if not user_id:
return None
if _server_instance is None:
return None
try:
# We are in a dulwich WSGI thread; run async DB lookup in a fresh loop.
actor = asyncio.run(_server_instance.user_manager.get_actor_by_id_async(user_id))
resolved = actor.organization_id
except Exception:
logger.exception("Failed to resolve org_id from user_id for dulwich request (user_id=%s)", user_id)
return None
return resolved
class GCSBackend(Backend):
    """Dulwich backend that materializes repos from GCS."""

    def open_repository(self, path: str | bytes):
        """Open a repository by path.

        dulwich passes paths like:
            /{agent_id}/state.git
            /{agent_id}/state.git/info/refs
            /{agent_id}/state.git/git-upload-pack
            /{agent_id}/state.git/git-receive-pack
        We map those to an on-disk repo cached in a temp dir.

        Raises:
            RuntimeError: if dulwich is not installed, or no org_id is bound
                to the current request (set by the WSGI wrapper).
            ValueError: if the path is not of the form /{agent_id}/state.git[/...].
        """
        if not _DULWICH_AVAILABLE or Repo is None:
            raise RuntimeError("dulwich not installed")
        # dulwich may hand us bytes; normalize to str for parsing.
        if isinstance(path, (bytes, bytearray)):
            path = path.decode("utf-8", errors="surrogateescape")
        parts = path.strip("/").split("/")
        # Supported path form: /{agent_id}/state.git[/...]
        if "state.git" not in parts:
            raise ValueError(f"Invalid repository path (missing state.git): {path}")
        repo_idx = parts.index("state.git")
        if repo_idx != 1:
            raise ValueError(f"Invalid repository path (expected /{{agent_id}}/state.git): {path}")
        agent_id = parts[0]
        # org_id comes from the request contextvar, not the path.
        org_id = _require_current_org_id()
        cache_key = f"{org_id}/{agent_id}"
        logger.info("GCSBackend.open_repository: org=%s agent=%s", org_id, agent_id)
        lock = _repo_locks.setdefault(cache_key, threading.Lock())
        with lock:
            # Always refresh from GCS to avoid serving stale refs/objects when the
            # repo is mutated through non-git code paths (e.g. git-state APIs)
            # or when multiple app workers are running.
            old_repo_path = _repo_cache.pop(cache_key, None)
            if old_repo_path:
                # repo_path is <tmpdir>/repo; remove the whole <tmpdir>.
                shutil.rmtree(os.path.dirname(old_repo_path), ignore_errors=True)
                try:
                    os.unlink(_dulwich_repo_path_marker_file(cache_key))
                except FileNotFoundError:
                    pass
            repo_path = self._download_repo_sync(agent_id=agent_id, org_id=org_id)
            _repo_cache[cache_key] = repo_path
            # Persist repo_path for cross-worker post-push sync.
            try:
                with open(_dulwich_repo_path_marker_file(cache_key), "w") as f:
                    f.write(repo_path)
            except Exception:
                logger.exception("Failed to write repo_path marker for %s", cache_key)
            repo = Repo(repo_path)
            # Drop refs pointing at missing objects before dulwich advertises them.
            _prune_broken_refs(repo)
            return repo

    def _download_repo_sync(self, agent_id: str, org_id: str) -> str:
        """Synchronously download a repo from GCS.

        dulwich runs in a background thread (wsgiref server thread), so we should
        not assume we're on the main event loop.
        """
        if _server_instance is None:
            raise RuntimeError("Server instance not set (did you call set_server_instance?)")
        # This runs in a dulwich-managed WSGI thread, not an AnyIO worker thread.
        # Use a dedicated event loop to run the async download.
        return asyncio.run(self._download_repo(agent_id, org_id))

    async def _download_repo(self, agent_id: str, org_id: str) -> str:
        """Download repo from GCS into a temporary working tree.

        Returns:
            Path to the repo root (a fresh temp dir containing `.git/`).

        Raises:
            FileNotFoundError: if the repo is still absent after attempting to
                create an empty one.
        """
        storage = _server_instance.memory_repo_manager.git.storage
        storage_prefix = f"{org_id}/{agent_id}/repo.git"
        files = await storage.list_files(storage_prefix)
        if not files:
            # Create an empty repo on-demand so clients can `git clone` immediately.
            logger.info("Repository not found for agent %s; creating empty repo", agent_id)
            await _server_instance.memory_repo_manager.git.create_repo(
                agent_id=agent_id,
                org_id=org_id,
                initial_files={},
                author_name="Letta System",
                author_email="system@letta.ai",
            )
            files = await storage.list_files(storage_prefix)
            if not files:
                raise FileNotFoundError(f"Repository not found for agent {agent_id}")
        temp_dir = tempfile.mkdtemp(prefix="letta-git-http-")
        repo_path = os.path.join(temp_dir, "repo")
        git_dir = os.path.join(repo_path, ".git")
        os.makedirs(git_dir)
        # Ensure required git directories exist for fetch/push even if GCS doesn't
        # have any objects packed yet.
        for subdir in [
            "objects",
            os.path.join("objects", "pack"),
            os.path.join("objects", "info"),
            "refs",
            os.path.join("refs", "heads"),
            os.path.join("refs", "tags"),
            "info",
        ]:
            os.makedirs(os.path.join(git_dir, subdir), exist_ok=True)

        async def download_file(file_path: str):
            # Map the storage object path back to a path relative to .git/.
            if file_path.startswith(storage_prefix):
                rel_path = file_path[len(storage_prefix) + 1 :]
            else:
                rel_path = file_path.split("/")[-1]
            if not rel_path:
                return
            local_path = os.path.join(git_dir, rel_path)
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            content = await storage.download_bytes(file_path)
            with open(local_path, "wb") as f:
                f.write(content)

        # Fetch all repo files concurrently.
        await asyncio.gather(*[download_file(f) for f in files])
        logger.info("Downloaded %s files from GCS for agent %s", len(files), agent_id)
        return repo_path
def _prune_broken_refs(repo: Repo) -> int:
    """Drop refs that resolve to objects missing from the object store.

    A prior push can partially fail after updating refs but before all objects
    were persisted to backing storage. Pruning such refs keeps dulwich from
    advertising/resolving against corrupt refs, which can otherwise surface as
    `UnresolvedDeltas` during subsequent pushes.

    Returns:
        Number of refs removed (best-effort; 0 if refs could not be listed).
    """
    try:
        all_refs = list(repo.refs.keys())
    except Exception:
        logger.exception("Failed to enumerate refs for pruning")
        return 0

    pruned = 0
    for ref_name in all_refs:
        # HEAD is commonly symbolic; skip.
        if ref_name in {b"HEAD", "HEAD"}:
            continue
        try:
            target_sha = repo.refs[ref_name]
        except Exception:
            continue
        if not target_sha:
            continue
        try:
            sha_missing = target_sha not in repo.object_store
        except Exception:
            logger.exception("Failed while checking ref %r", ref_name)
            continue
        if not sha_missing:
            continue
        logger.warning("Pruning broken ref %r -> %r", ref_name, target_sha)
        try:
            repo.refs.remove_if_equals(ref_name, target_sha)
        except Exception:
            # Best-effort fallback
            try:
                del repo.refs[ref_name]
            except Exception:
                pass
        # Count the ref as pruned even if removal itself failed (best-effort).
        pruned += 1
    return pruned
async def _sync_after_push(actor_id: str, agent_id: str) -> None:
    """Sync repo back to GCS and PostgreSQL after a successful push.

    When using memfs service:
    - GCS sync is handled by memfs (skipped here)
    - We still sync blocks to Postgres
    When using local dulwich:
    - Upload repo to GCS
    - Sync blocks to Postgres

    Args:
        actor_id: ID of the authenticated actor that performed the push.
        agent_id: Agent whose state repo received the push.
    """
    from letta.settings import settings

    if _server_instance is None:
        logger.warning("Server instance not set; cannot sync after push")
        return
    try:
        actor = await _server_instance.user_manager.get_actor_by_id_async(actor_id)
    except Exception:
        logger.exception("Failed to resolve actor for post-push sync (actor_id=%s)", actor_id)
        return
    org_id = actor.organization_id
    using_memfs = bool(settings.memfs_service_url)
    # When using local dulwich, we need to upload to GCS
    if not using_memfs:
        cache_key = f"{org_id}/{agent_id}"
        repo_path = _repo_cache.get(cache_key)
        if not repo_path:
            # Cross-worker fallback: read marker file written by the dulwich process.
            try:
                with open(_dulwich_repo_path_marker_file(cache_key), "r") as f:
                    repo_path = f.read().strip() or None
            except FileNotFoundError:
                repo_path = None
        if not repo_path:
            logger.warning("No cached repo for %s after push", cache_key)
            return
        if not os.path.exists(repo_path):
            logger.warning("Repo path %s does not exist after push", repo_path)
            return
        logger.info("Syncing repo after push: org=%s agent=%s", org_id, agent_id)
        storage = _server_instance.memory_repo_manager.git.storage
        storage_prefix = f"{org_id}/{agent_id}/repo.git"
        git_dir = os.path.join(repo_path, ".git")
        # Mirror every file under .git/ into object storage, preserving the
        # relative layout; uploads run concurrently below.
        upload_tasks = []
        for root, _dirs, files in os.walk(git_dir):
            for filename in files:
                local_file = os.path.join(root, filename)
                rel_path = os.path.relpath(local_file, git_dir)
                storage_path = f"{storage_prefix}/{rel_path}"
                with open(local_file, "rb") as f:
                    content = f.read()
                upload_tasks.append(storage.upload_bytes(storage_path, content))
        await asyncio.gather(*upload_tasks)
        logger.info("Uploaded %s files to GCS", len(upload_tasks))
    else:
        logger.info("Using memfs service; GCS sync handled by memfs (agent=%s)", agent_id)
    # Sync blocks to Postgres (if using GitEnabledBlockManager).
    #
    # Keep the same pattern as API-driven edits: read from the source of truth
    # in object storage after persisting the pushed refs/objects, rather than
    # relying on a working tree checkout under repo_path/.
    from letta.services.block_manager_git import GitEnabledBlockManager

    if isinstance(_server_instance.block_manager, GitEnabledBlockManager):
        # Retry with backoff to handle race condition where GCS upload is still in progress
        # after git-receive-pack returns. The webhook fires immediately but commit objects
        # may not be fully uploaded yet.
        files = {}
        max_retries = 3
        for attempt in range(max_retries):
            try:
                files = await _server_instance.memory_repo_manager.git.get_files(
                    agent_id=agent_id,
                    org_id=org_id,
                    ref="HEAD",
                )
                logger.info("get_files returned %d files (attempt %d)", len(files), attempt + 1)
                break
            except Exception as e:
                if attempt < max_retries - 1:
                    wait_time = 2**attempt  # 1s, then 2s
                    logger.warning("Failed to read repo files (attempt %d/%d), retrying in %ds: %s", attempt + 1, max_retries, wait_time, e)
                    await asyncio.sleep(wait_time)
                else:
                    # All retries exhausted; proceed with files == {} (the md
                    # sync below becomes a no-op and we warn about it).
                    logger.exception("Failed to read repo files after %d retries (agent=%s)", max_retries, agent_id)
        expected_labels = set()
        from letta.services.memory_repo.block_markdown import parse_block_markdown

        md_file_paths = sorted([file_path for file_path in files if file_path.endswith(".md")])
        nested_md_file_paths = [file_path for file_path in md_file_paths if "/" in file_path[:-3]]
        logger.info(
            "Post-push sync file scan: agent=%s total_files=%d md_files=%d nested_md_files=%d sample_md_paths=%s",
            agent_id,
            len(files),
            len(md_file_paths),
            len(nested_md_file_paths),
            md_file_paths[:10],
        )
        synced = 0
        for file_path, content in files.items():
            if not file_path.endswith(".md"):
                continue
            # Block label == markdown path minus the ".md" suffix.
            label = file_path[:-3]
            expected_labels.add(label)
            # Parse frontmatter to extract metadata alongside value
            parsed = parse_block_markdown(content)
            try:
                await _server_instance.block_manager._sync_block_to_postgres(
                    agent_id=agent_id,
                    label=label,
                    value=parsed["value"],
                    actor=actor,
                    description=parsed.get("description"),
                    limit=parsed.get("limit"),
                    read_only=parsed.get("read_only"),
                    metadata=parsed.get("metadata"),
                )
                synced += 1
                logger.info("Synced block %s to PostgreSQL", label)
            except Exception:
                logger.exception(
                    "Failed to sync block %s to PostgreSQL (agent=%s) [path=%s nested=%s]",
                    label,
                    agent_id,
                    file_path,
                    "/" in label,
                )
        if synced == 0:
            logger.warning("No *.md files found in repo HEAD during post-push sync (agent=%s)", agent_id)
        else:
            # Detach blocks that were removed in git.
            #
            # We treat git as the source of truth for which blocks are attached to
            # this agent. If a *.md file disappears from HEAD, detach the
            # corresponding block from the agent in Postgres.
            try:
                existing_blocks = await _server_instance.agent_manager.list_agent_blocks_async(
                    agent_id=agent_id,
                    actor=actor,
                    before=None,
                    after=None,
                    limit=1000,
                    ascending=True,
                )
                existing_by_label = {b.label: b for b in existing_blocks}
                removed_labels = set(existing_by_label.keys()) - expected_labels
                for label in sorted(removed_labels):
                    block = existing_by_label.get(label)
                    if not block:
                        continue
                    await _server_instance.agent_manager.detach_block_async(
                        agent_id=agent_id,
                        block_id=block.id,
                        actor=actor,
                    )
                    logger.info("Detached block %s from agent (removed from git)", label)
            except Exception:
                logger.exception("Failed detaching removed blocks during post-push sync (agent=%s)", agent_id)
    # Cleanup local cache (only relevant when using local dulwich)
    if not using_memfs:
        cache_key = f"{org_id}/{agent_id}"
        _repo_cache.pop(cache_key, None)
        try:
            os.unlink(_dulwich_repo_path_marker_file(cache_key))
        except FileNotFoundError:
            pass
        # repo_path is <tmpdir>/repo; remove the whole <tmpdir>.
        shutil.rmtree(os.path.dirname(repo_path), ignore_errors=True)
def _parse_agent_id_from_repo_path(path: str) -> Optional[str]:
"""Extract agent_id from a git HTTP path.
Expected path form:
- {agent_id}/state.git/...
"""
parts = path.strip("/").split("/")
if len(parts) < 2:
return None
if parts[1] != "state.git":
return None
return parts[0]
def _filter_out_hop_by_hop_headers(headers: Iterable[tuple[str, str]]) -> Dict[str, str]:
# RFC 7230 hop-by-hop headers that should not be forwarded
hop_by_hop = {
"connection",
"keep-alive",
"proxy-authenticate",
"proxy-authorization",
"te",
"trailers",
"transfer-encoding",
"upgrade",
}
out: Dict[str, str] = {}
for k, v in headers:
lk = k.lower()
if lk in hop_by_hop:
continue
out[k] = v
return out
def _get_memfs_service_url() -> Optional[str]:
    """Return the external memfs service URL, or None when not configured."""
    # Imported lazily to avoid settings import cost/cycles at module load.
    from letta.settings import settings as _settings

    return _settings.memfs_service_url
@router.api_route("/{path:path}", methods=["GET", "POST", "PUT", "PATCH", "DELETE", "HEAD", "OPTIONS"])  # pragma: no cover
async def proxy_git_http(
    path: str,
    request: Request,
    server=Depends(get_letta_server),
    headers: HeaderParams = Depends(get_headers),
):
    """Proxy `/v1/git/*` requests to the git HTTP backend.

    If LETTA_MEMFS_SERVICE_URL is set, proxies to the external memfs service.
    Otherwise, proxies to the local dulwich WSGI server.

    Request and response bodies are streamed; on a successful
    `git-receive-pack` (push), a background task syncs the repo back to
    GCS/PostgreSQL.
    """
    memfs_url = _get_memfs_service_url()
    if memfs_url:
        # Proxy to external memfs service
        url = f"{memfs_url.rstrip('/')}/git/{path}"
        logger.info("proxy_git_http: using memfs service at %s", memfs_url)
    else:
        # Proxy to local dulwich server
        if not _DULWICH_AVAILABLE:
            return JSONResponse(
                status_code=501,
                content={
                    "detail": "git smart HTTP is disabled (dulwich not installed)",
                },
            )
        # Ensure server is running (best-effort). We also start it during lifespan.
        start_dulwich_server()
        port = _get_dulwich_port()
        url = f"http://127.0.0.1:{port}/{path}"
    req_headers = _filter_out_hop_by_hop_headers(request.headers.items())
    # Avoid sending FastAPI host/length; httpx will compute
    req_headers.pop("host", None)
    req_headers.pop("content-length", None)
    # Resolve org_id from the authenticated actor + agent and forward to dulwich.
    agent_id = _parse_agent_id_from_repo_path(path)
    if agent_id is not None:
        actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
        # Authorization check: ensure the actor can access this agent.
        await server.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor, include_relationships=[])
        # Ensure we set exactly one X-Organization-Id header (avoid duplicate casing).
        for k in list(req_headers.keys()):
            if k.lower() == "x-organization-id":
                req_headers.pop(k, None)
        # Use the authenticated actor's org; AgentState may not carry an organization field.
        req_headers["X-Organization-Id"] = actor.organization_id
    logger.info(
        "proxy_git_http: method=%s path=%s parsed_agent_id=%s actor_id=%s has_user_id_hdr=%s x_org_hdr=%s",
        request.method,
        path,
        agent_id,
        headers.actor_id,
        bool(request.headers.get("user_id")),
        req_headers.get("X-Organization-Id") or req_headers.get("x-organization-id"),
    )

    async def _body_iter():
        # Re-stream the incoming request body without buffering it in memory.
        async for chunk in request.stream():
            yield chunk

    # Client is closed in the response's background task, not here: the
    # response body is still being streamed from upstream when we return.
    client = httpx.AsyncClient(timeout=None)
    req = client.build_request(
        method=request.method,
        url=url,
        params=request.query_params,
        headers=req_headers,
        content=_body_iter() if request.method not in {"GET", "HEAD"} else None,
    )
    upstream = await client.send(req, stream=True)
    resp_headers = _filter_out_hop_by_hop_headers(upstream.headers.items())
    # If this was a push, trigger our sync.
    if request.method == "POST" and path.endswith("git-receive-pack") and upstream.status_code < 400:
        agent_id = _parse_agent_id_from_repo_path(path)
        if agent_id is not None:
            try:
                actor = await server.user_manager.get_actor_or_default_async(actor_id=headers.actor_id)
                # Authorization check: ensure the actor can access this agent.
                await server.agent_manager.get_agent_by_id_async(agent_id=agent_id, actor=actor, include_relationships=[])
                # Fire-and-forget; _background_tasks keeps a strong reference
                # so the task isn't garbage-collected before completion.
                task = asyncio.create_task(_sync_after_push(actor.id, agent_id))
                _background_tasks.add(task)
                task.add_done_callback(_background_tasks.discard)
            except Exception:
                logger.exception("Failed to trigger post-push sync (agent_id=%s)", agent_id)

    async def _aclose_upstream_and_client() -> None:
        # Runs after the response body finishes streaming to the caller.
        try:
            await upstream.aclose()
        finally:
            await client.aclose()

    return StreamingResponse(
        upstream.aiter_raw(),
        status_code=upstream.status_code,
        headers=resp_headers,
        media_type=upstream.headers.get("content-type"),
        background=BackgroundTask(_aclose_upstream_and_client),
    )
def _org_header_middleware(app):
    """WSGI wrapper to capture org_id from proxied requests.

    FastAPI proxies requests to the dulwich server and injects `X-Organization-Id`.
    Dulwich itself only passes repository *paths* into the Backend, so we capture
    the org_id from the WSGI environ and stash it in a contextvar.

    Important: WSGI apps can return iterables/generators, and the server may
    iterate the response body *after* this wrapper returns. We must therefore
    keep the contextvar set for the duration of iteration.

    Defensive fallback: if X-Organization-Id is missing, attempt to derive org_id
    from `user_id` (set by our auth proxy layer).
    """

    def _wrapped(environ, start_response):
        org_id = _resolve_org_id_from_wsgi_environ(environ)
        logger.info(
            "dulwich_wsgi: path=%s remote=%s has_x_org=%s has_user_id=%s resolved_org=%s",
            environ.get("PATH_INFO"),
            environ.get("REMOTE_ADDR"),
            bool(environ.get("HTTP_X_ORGANIZATION_ID")),
            bool(environ.get("HTTP_USER_ID")),
            org_id,
        )
        token = _current_org_id.set(org_id)
        try:
            app_iter = app(environ, start_response)
        except Exception:
            # App raised before producing a body: restore the contextvar now.
            _current_org_id.reset(token)
            raise

        def _iter():
            # Keep the contextvar set while the server drains the body, then
            # close the underlying iterable (per WSGI) before resetting.
            try:
                yield from app_iter
            finally:
                try:
                    if hasattr(app_iter, "close"):
                        app_iter.close()
                finally:
                    _current_org_id.reset(token)

        return _iter()

    return _wrapped
# dulwich WSGI app (optional)
_backend = GCSBackend()
# None when dulwich isn't installed; start_dulwich_server() re-checks
# _DULWICH_AVAILABLE before ever binding this app to a port.
_git_wsgi_app = _org_header_middleware(HTTPGitApplication(_backend)) if _DULWICH_AVAILABLE and HTTPGitApplication is not None else None