fix(core): parse <actions> blocks anywhere in responses (#579)

Co-authored-by: Letta Code <noreply@letta.com>
2026-03-12 17:02:06 -07:00
parent 1d636d6fa9
commit 00a0433358
3 changed files with 117 additions and 26 deletions
--- a/src/core/bot.ts
+++ b/src/core/bot.ts
@@ -20,7 +20,13 @@ import { getAgentSkillExecutableDirs, isVoiceMemoConfigured } from '../skills/lo
 import { formatMessageEnvelope, formatGroupBatchEnvelope, type SessionContextOptions } from './formatter.js';
 import type { GroupBatcher } from './group-batcher.js';
 import { redactOutbound } from './redact.js';
-import { parseDirectives, stripActionsBlock, type Directive } from './directives.js';
+import {
+  hasIncompleteActionsTag,
+  hasUnclosedActionsBlock,
+  parseDirectives,
+  stripActionsBlock,
+  type Directive,
+} from './directives.js';
 import { resolveEmoji } from './emoji.js';
 import { SessionManager } from './session-manager.js';
 import { createDisplayPipeline, type DisplayEvent, type CompleteEvent, type ErrorEvent } from './display-pipeline.js';
@@ -1437,8 +1443,8 @@ export class LettaBot implements AgentSession {
              const canEdit = adapter.supportsEditing?.() ?? false;
              const trimmed = response.trim();
              const mayBeHidden = '<no-reply/>'.startsWith(trimmed)
-                || '<actions>'.startsWith(trimmed)
-                || (trimmed.startsWith('<actions') && !trimmed.includes('</actions>'));
+                || hasIncompleteActionsTag(response)
+                || hasUnclosedActionsBlock(response);
              const streamText = stripActionsBlock(response).trim();
              if (canEdit && !mayBeHidden && !suppressDelivery && !this.cancelledKeys.has(convKey)
                && streamText.length > 0 && Date.now() - lastUpdate > 1500 && Date.now() > rateLimitedUntil) {
--- a/src/core/directives.test.ts
+++ b/src/core/directives.test.ts
@@ -1,5 +1,10 @@
 import { describe, it, expect } from 'vitest';
-import { parseDirectives, stripActionsBlock } from './directives.js';
+import {
+  hasIncompleteActionsTag,
+  hasUnclosedActionsBlock,
+  parseDirectives,
+  stripActionsBlock,
+} from './directives.js';

 describe('parseDirectives', () => {
  it('returns text unchanged when no actions block present', () => {
@@ -113,11 +118,34 @@ describe('parseDirectives', () => {
    expect(result.directives).toEqual([]);
  });

-  it('ignores actions block NOT at start of response', () => {
+  it('parses actions block in middle of response', () => {
    const input = 'Some text first <actions><react emoji="eyes" /></actions>';
    const result = parseDirectives(input);
-    expect(result.cleanText).toBe(input);
-    expect(result.directives).toEqual([]);
+    expect(result.cleanText).toBe('Some text first');
+    expect(result.directives).toEqual([{ type: 'react', emoji: 'eyes' }]);
+  });
+
+  it('parses trailing actions block after visible text', () => {
+    const input = 'Message complete. <actions><react emoji="thumbsup" /></actions>';
+    const result = parseDirectives(input);
+    expect(result.cleanText).toBe('Message complete.');
+    expect(result.directives).toEqual([{ type: 'react', emoji: 'thumbsup' }]);
+  });
+
+  it('parses and executes directives across multiple actions blocks in source order', () => {
+    const input = [
+      'Start',
+      '<actions><react emoji="eyes" /></actions>',
+      'Middle',
+      '<actions><voice>Hello</voice></actions>',
+      'End',
+    ].join(' ');
+    const result = parseDirectives(input);
+    expect(result.cleanText).toBe('Start  Middle  End');
+    expect(result.directives).toEqual([
+      { type: 'react', emoji: 'eyes' },
+      { type: 'voice', text: 'Hello' },
+    ]);
  });

  it('handles leading whitespace before actions block', () => {
@@ -339,8 +367,37 @@ describe('stripActionsBlock', () => {
    expect(stripActionsBlock('<actions><react emoji="eyes" /></actions>')).toBe('');
  });

-  it('does not strip actions block in middle of text', () => {
+  it('strips actions block in middle of text', () => {
    const input = 'Before <actions><react emoji="eyes" /></actions> After';
-    expect(stripActionsBlock(input)).toBe(input);
+    expect(stripActionsBlock(input)).toBe('Before  After');
+  });
+
+  it('strips multiple actions blocks in one response', () => {
+    const input = 'A <actions><react emoji="eyes" /></actions> B <actions><voice>Hello</voice></actions> C';
+    expect(stripActionsBlock(input)).toBe('A  B  C');
+  });
+});
+
+describe('hasUnclosedActionsBlock', () => {
+  it('detects unmatched opening actions tag', () => {
+    expect(hasUnclosedActionsBlock('Before <actions><react emoji="eyes" />')).toBe(true);
+  });
+
+  it('returns false for complete actions block', () => {
+    expect(hasUnclosedActionsBlock('Before <actions><react emoji="eyes" /></actions> After')).toBe(false);
+  });
+});
+
+describe('hasIncompleteActionsTag', () => {
+  it('detects partial opening actions tag while streaming', () => {
+    expect(hasIncompleteActionsTag('Before <act')).toBe(true);
+  });
+
+  it('detects partial closing actions tag while streaming', () => {
+    expect(hasIncompleteActionsTag('Before </act')).toBe(true);
+  });
+
+  it('returns false when no partial actions tag is present', () => {
+    expect(hasIncompleteActionsTag('Before <code>ok</code>')).toBe(false);
  });
 });
--- a/src/core/directives.ts
+++ b/src/core/directives.ts
@@ -1,11 +1,11 @@
 /**
 * XML Directive Parser
 *
- * Parses an <actions> block at the start of agent text responses.
+ * Parses <actions> blocks from agent text responses.
 * Extends the existing <no-reply/> pattern to support richer actions
 * (reactions, file sends, etc.) without requiring tool calls.
 *
- * The <actions> block must appear at the start of the response:
+ * <actions> blocks can appear anywhere in the response:
 *
 *   <actions>
 *     <react emoji="thumbsup" />
@@ -53,10 +53,14 @@ export interface ParseResult {
 }

 /**
- * Match the <actions>...</actions> wrapper at the start of the response.
- * Captures the inner content of the block.
+ * Match complete <actions>...</actions> wrappers anywhere in the response.
+ * Captures the inner content of each block.
 */
-const ACTIONS_BLOCK_REGEX = /^\s*<actions>([\s\S]*?)<\/actions>/;
+const ACTIONS_BLOCK_REGEX_SOURCE = '<actions>([\\s\\S]*?)<\\/actions>';
+
+function createActionsBlockRegex(flags = 'g'): RegExp {
+  return new RegExp(ACTIONS_BLOCK_REGEX_SOURCE, flags);
+}

 /**
 * Match supported directive tags inside the actions block in source order.
@@ -156,28 +160,52 @@ function parseChildDirectives(block: string): Directive[] {
 /**
 * Parse XML directives from agent response text.
 *
- * Looks for an <actions>...</actions> block at the start of the response.
- * Returns the cleaned text (block stripped) and an array of parsed directives.
- * If no <actions> block is found, the text is returned unchanged.
+ * Looks for complete <actions>...</actions> blocks anywhere in the response.
+ * Returns the cleaned text (all complete blocks stripped) and parsed directives.
+ * If no complete block is found, the text is returned unchanged.
 */
 export function parseDirectives(text: string): ParseResult {
-  const match = text.match(ACTIONS_BLOCK_REGEX);
-
-  if (!match) {
+  const blockRegex = createActionsBlockRegex();
+  if (!blockRegex.test(text)) {
    return { cleanText: text, directives: [] };
  }

-  const actionsContent = match[1];
-  const cleanText = text.slice(match[0].length).trim();
-  const directives = parseChildDirectives(actionsContent);
+  const directives: Directive[] = [];
+  const cleanText = text.replace(createActionsBlockRegex(), (_, actionsContent: string) => {
+    directives.push(...parseChildDirectives(actionsContent));
+    return '';
+  }).trim();

  return { cleanText, directives };
 }

 /**
- * Strip a leading <actions>...</actions> block from text for streaming display.
- * Returns the text after the block, or the original text if no complete block found.
+ * Returns true when text contains an opening <actions> tag with no matching
+ * closing tag yet. Used during streaming to avoid flashing raw XML.
+ */
+export function hasUnclosedActionsBlock(text: string): boolean {
+  const lastOpen = text.lastIndexOf('<actions>');
+  if (lastOpen < 0) return false;
+  const lastClose = text.lastIndexOf('</actions>');
+  return lastOpen > lastClose;
+}
+
+/**
+ * Returns true when the tail of the text contains a partial actions tag
+ * (opening or closing) that has not streamed fully yet.
+ */
+export function hasIncompleteActionsTag(text: string): boolean {
+  const lastLt = text.lastIndexOf('<');
+  const lastGt = text.lastIndexOf('>');
+  if (lastLt < 0 || lastLt <= lastGt) return false;
+  const tail = text.slice(lastLt);
+  return '<actions>'.startsWith(tail) || '</actions>'.startsWith(tail);
+}
+
+/**
+ * Strip complete <actions>...</actions> blocks from text for streaming display.
+ * Returns the trimmed text with every complete block removed (just the trimmed input when none are found).
 */
 export function stripActionsBlock(text: string): string {
-  return text.replace(ACTIONS_BLOCK_REGEX, '').trim();
+  return text.replace(createActionsBlockRegex(), '').trim();
 }