From 70d749e859a234453bac567547c93840bc35e975 Mon Sep 17 00:00:00 2001 From: cthomas Date: Thu, 5 Feb 2026 20:55:13 -0800 Subject: [PATCH] fix(core): add retry with backoff for post-push sync race condition (#9335) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When git push completes, the webhook fires immediately but GCS upload may still be in progress. This causes KeyError when trying to read commit objects that haven't been uploaded yet. Add retry with exponential backoff (1s, 2s, 4s) to handle this race. 🐾 Generated with [Letta Code](https://letta.com) Co-authored-by: Letta --- letta/server/rest_api/routers/v1/git_http.py | 30 ++++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/letta/server/rest_api/routers/v1/git_http.py b/letta/server/rest_api/routers/v1/git_http.py index 835621d3..559efeea 100644 --- a/letta/server/rest_api/routers/v1/git_http.py +++ b/letta/server/rest_api/routers/v1/git_http.py @@ -471,15 +471,27 @@ async def _sync_after_push(actor_id: str, agent_id: str) -> None: from letta.services.block_manager_git import GitEnabledBlockManager if isinstance(_server_instance.block_manager, GitEnabledBlockManager): - try: - files = await _server_instance.memory_repo_manager.git.get_files( - agent_id=agent_id, - org_id=org_id, - ref="HEAD", - ) - except Exception: - logger.exception("Failed to read repo files from storage for post-push block sync (agent=%s)", agent_id) - files = {} + # Retry with backoff to handle race condition where GCS upload is still in progress + # after git-receive-pack returns. The webhook fires immediately but commit objects + # may not be fully uploaded yet. + files = {} + max_retries = 3 + for attempt in range(max_retries): + try: + files = await _server_instance.memory_repo_manager.git.get_files( + agent_id=agent_id, + org_id=org_id, + ref="HEAD", + ) + logger.info("get_files returned %d files (attempt %d)", len(files), attempt + 1) + break + except Exception as e: + if attempt < max_retries - 1: + wait_time = 2**attempt # 1s, 2s, 4s + logger.warning("Failed to read repo files (attempt %d/%d), retrying in %ds: %s", attempt + 1, max_retries, wait_time, e) + await asyncio.sleep(wait_time) + else: + logger.exception("Failed to read repo files after %d retries (agent=%s)", max_retries, agent_id) expected_labels = set() synced = 0