perf(memfs): delta upload — only push new git objects after commit (#9548)
perf(memfs): delta upload — only push new/modified git objects after commit. Instead of re-uploading the entire .git/ directory after every commit, snapshot file mtimes before the commit and upload only the files that are new or changed. A typical single-block update creates ~5 new objects (blob, trees, commit, ref), versus re-uploading all ~30. The full _upload_repo is retained for create_repo and other paths that need it. 🤖 Generated with [Letta Code](https://letta.com) Co-authored-by: Letta <noreply@letta.com>
This commit is contained in:
@@ -146,7 +146,7 @@ class GitOperations:
|
|||||||
shutil.rmtree(os.path.dirname(repo_path), ignore_errors=True)
|
shutil.rmtree(os.path.dirname(repo_path), ignore_errors=True)
|
||||||
|
|
||||||
async def _upload_repo(self, local_repo_path: str, agent_id: str, org_id: str) -> None:
|
async def _upload_repo(self, local_repo_path: str, agent_id: str, org_id: str) -> None:
|
||||||
"""Upload a local repo to storage."""
|
"""Upload a local repo to storage (full upload)."""
|
||||||
t_start = time.perf_counter()
|
t_start = time.perf_counter()
|
||||||
storage_prefix = self._repo_path(agent_id, org_id)
|
storage_prefix = self._repo_path(agent_id, org_id)
|
||||||
|
|
||||||
@@ -182,6 +182,55 @@ class GitOperations:
|
|||||||
f"upload_time={upload_time:.2f}ms"
|
f"upload_time={upload_time:.2f}ms"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _snapshot_git_files(git_dir: str) -> Dict[str, float]:
|
||||||
|
"""Snapshot mtime of all files under .git/ for delta detection."""
|
||||||
|
snapshot = {}
|
||||||
|
for root, _dirs, files in os.walk(git_dir):
|
||||||
|
for filename in files:
|
||||||
|
path = os.path.join(root, filename)
|
||||||
|
snapshot[path] = os.path.getmtime(path)
|
||||||
|
return snapshot
|
||||||
|
|
||||||
|
async def _upload_delta(
    self,
    local_repo_path: str,
    agent_id: str,
    org_id: str,
    before_snapshot: Dict[str, float],
) -> None:
    """Push to storage only the ``.git`` files that are new or have a changed mtime.

    Every file under ``<local_repo_path>/.git`` is compared against
    ``before_snapshot``. A file is selected when its path is absent from the
    snapshot, or when its current ``os.path.getmtime`` differs from the
    recorded value. Selected files are read fully into memory and then
    uploaded concurrently through ``self.storage.upload_bytes``.

    Files that appear in the snapshot but no longer exist locally are not
    touched in storage; this method only uploads.

    Args:
        local_repo_path: Working directory of the local repo (the parent of
            ``.git``).
        agent_id: Passed to ``self._repo_path`` to build the storage prefix.
        org_id: Passed to ``self._repo_path`` to build the storage prefix.
        before_snapshot: Mapping of local file path to mtime captured earlier.
            Keys must use the same path form that ``os.walk`` over the
            ``.git`` directory produces here, otherwise every file looks new.
    """
    # NOTE(review): change detection relies on mtime inequality alone; a file
    # rewritten within the filesystem's timestamp granularity would be
    # skipped. Confirm this is acceptable for mutable files such as refs.
    t_start = time.perf_counter()
    storage_prefix = self._repo_path(agent_id, org_id)
    git_dir = os.path.join(local_repo_path, ".git")

    upload_tasks = []
    total_bytes = 0

    for parent, _subdirs, names in os.walk(git_dir):
        for name in names:
            local_path = os.path.join(parent, name)
            recorded = before_snapshot.get(local_path)

            # Unchanged: present in the snapshot with an identical mtime.
            if recorded is not None and os.path.getmtime(local_path) == recorded:
                continue

            rel_path = os.path.relpath(local_path, git_dir)
            storage_path = f"{storage_prefix}/{rel_path}"
            with open(local_path, "rb") as fh:
                payload = fh.read()
            total_bytes += len(payload)
            upload_tasks.append((storage_path, payload))

    # All uploads are issued together and awaited as a group.
    t0 = time.perf_counter()
    await asyncio.gather(*(self.storage.upload_bytes(dest, data) for dest, data in upload_tasks))
    upload_time = (time.perf_counter() - t0) * 1000

    total_time = (time.perf_counter() - t_start) * 1000
    logger.info(
        f"[GIT_PERF] _upload_delta TOTAL {total_time:.2f}ms "
        f"files={len(upload_tasks)} bytes={total_bytes} "
        f"upload_time={upload_time:.2f}ms"
    )
|
||||||
|
|
||||||
async def _download_repo(self, agent_id: str, org_id: str) -> str:
|
async def _download_repo(self, agent_id: str, org_id: str) -> str:
|
||||||
"""Download a repo from storage to a temp directory.
|
"""Download a repo from storage to a temp directory.
|
||||||
|
|
||||||
@@ -396,6 +445,9 @@ class GitOperations:
|
|||||||
logger.info(f"[GIT_PERF] _commit_with_lock download phase took {download_time:.2f}ms")
|
logger.info(f"[GIT_PERF] _commit_with_lock download phase took {download_time:.2f}ms")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Snapshot git objects before commit for delta upload
|
||||||
|
git_dir = os.path.join(repo_path, ".git")
|
||||||
|
before_snapshot = self._snapshot_git_files(git_dir)
|
||||||
|
|
||||||
def _commit():
|
def _commit():
|
||||||
t_git_start = time.perf_counter()
|
t_git_start = time.perf_counter()
|
||||||
@@ -484,11 +536,11 @@ class GitOperations:
|
|||||||
git_thread_time = (time.perf_counter() - t0) * 1000
|
git_thread_time = (time.perf_counter() - t0) * 1000
|
||||||
logger.info(f"[GIT_PERF] _commit_with_lock git thread took {git_thread_time:.2f}ms")
|
logger.info(f"[GIT_PERF] _commit_with_lock git thread took {git_thread_time:.2f}ms")
|
||||||
|
|
||||||
# Upload the updated repo
|
# Upload only new/modified objects (delta)
|
||||||
t0 = time.perf_counter()
|
t0 = time.perf_counter()
|
||||||
await self._upload_repo(repo_path, agent_id, org_id)
|
await self._upload_delta(repo_path, agent_id, org_id, before_snapshot)
|
||||||
upload_time = (time.perf_counter() - t0) * 1000
|
upload_time = (time.perf_counter() - t0) * 1000
|
||||||
logger.info(f"[GIT_PERF] _commit_with_lock upload phase took {upload_time:.2f}ms")
|
logger.info(f"[GIT_PERF] _commit_with_lock upload phase (delta) took {upload_time:.2f}ms")
|
||||||
|
|
||||||
total_time = (time.perf_counter() - t_start) * 1000
|
total_time = (time.perf_counter() - t_start) * 1000
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
Reference in New Issue
Block a user