diff --git a/letta/llm_api/google_vertex_client.py b/letta/llm_api/google_vertex_client.py
index ccbee0ca..18a1a1b1 100644
--- a/letta/llm_api/google_vertex_client.py
+++ b/letta/llm_api/google_vertex_client.py
@@ -204,8 +204,16 @@ class GoogleVertexClient(LLMClientBase):
             raise e
         # Direct yield - keeps response alive in generator's local scope throughout iteration
         # This is required because the SDK's connection lifecycle is tied to the response object
-        async for chunk in response:
-            yield chunk
+        try:
+            async for chunk in response:
+                yield chunk
+        except errors.ClientError as e:
+            if e.code == 499:
+                logger.info(f"{self._provider_prefix()} Stream cancelled by client (499): {e}")
+                return
+            raise self.handle_llm_error(e)
+        except errors.APIError as e:
+            raise self.handle_llm_error(e)
 
     @staticmethod
     def add_dummy_model_messages(messages: List[dict]) -> List[dict]:
@@ -801,6 +809,14 @@ class GoogleVertexClient(LLMClientBase):
     def handle_llm_error(self, e: Exception) -> Exception:
         # Handle Google GenAI specific errors
         if isinstance(e, errors.ClientError):
+            if e.code == 499:
+                logger.info(f"{self._provider_prefix()} Request cancelled by client (499): {e}")
+                return LLMConnectionError(
+                    message=f"Request to {self._provider_name()} was cancelled (client disconnected): {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"status_code": 499, "cause": "client_cancelled"},
+                )
+
             logger.warning(f"{self._provider_prefix()} Client error ({e.code}): {e}")
 
             # Handle specific error codes