Fix critical performance issues: thread pool exhaustion and tool tracking

Root Cause Analysis:
- delegate_task used run_in_executor with default ThreadPoolExecutor (8-12 threads)
- Each delegation blocked one thread for 2-8 minutes (full sub-agent conversation)
- After 6-8 parallel delegations, pool exhausted → all work hung
- Tool tracking used hasattr(block, 'type') but ToolUseBlock has no .type attribute

Changes:

1. mcp_tools.py: Replace thread pool with dedicated threads
   - Each delegate_task call creates a dedicated daemon thread with its own isolated event loop
   - Uses asyncio.Future + loop.call_soon_threadsafe for result communication
   - Added semaphore to limit concurrent delegations (4 max)
   - Eliminates pool exhaustion: delegations no longer occupy shared executor threads (concurrency is bounded only by the semaphore above)

2. llm_interface.py: Fix tool tracking
   - Added TextBlock/ToolUseBlock imports from claude_agent_sdk
   - Replaced hasattr(block, 'type') checks with isinstance() checks
   - Fixes tool_calls=0 bug (now correctly tracks tools used)

3. agent.py: Event loop isolation and thread safety
   - Added defensive sub_agent.llm._event_loop = None in spawn_sub_agent
   - Ensures sub-agents use asyncio.run() fallback with isolated loops
   - Generates unique agent IDs by appending a millisecond timestamp (plus a random suffix when no ID is supplied) to prevent caching race conditions in parallel delegations

Impact:
- Fixes 6-8 message hang pattern (no more 10-minute timeouts)
- Enables parallel sub-agent execution via delegate_task
- Tool tracking now reports accurate tool usage counts
- All sub-agents remain in Agent SDK mode (as required)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-03-03 20:48:43 -07:00
parent cc7e623d74
commit a8f3ed40a8
3 changed files with 2101 additions and 27 deletions

View File

@@ -1,5 +1,6 @@
"""AI Agent with Memory and LLM Integration.""" """AI Agent with Memory and LLM Integration."""
import random
import threading import threading
import time import time
from typing import Any, List, Optional, Callable from typing import Any, List, Optional, Callable
@@ -87,6 +88,10 @@ class Agent:
specialist_prompt=specialist_prompt, specialist_prompt=specialist_prompt,
) )
# DEFENSIVE: Ensure sub-agent never inherits main event loop
# Sub-agents run in dedicated threads with isolated loops
sub_agent.llm._event_loop = None
# Set agent_id for activity tracking # Set agent_id for activity tracking
sub_agent.agent_id = agent_id sub_agent.agent_id = agent_id
@@ -108,6 +113,13 @@ class Agent:
max_retries: int = 1, max_retries: int = 1,
) -> str: ) -> str:
"""Delegate a task to a specialist sub-agent with automatic retry on hang.""" """Delegate a task to a specialist sub-agent with automatic retry on hang."""
# Generate unique agent IDs to prevent caching race conditions in parallel delegations
if not agent_id:
agent_id = f"sub_{int(time.time()*1000)}_{random.randint(1000,9999)}"
else:
# Add timestamp to user-provided ID to ensure uniqueness
agent_id = f"{agent_id}_{int(time.time()*1000)}"
for attempt in range(max_retries + 1): for attempt in range(max_retries + 1):
if attempt > 0: if attempt > 0:
print(f"[Agent] Retrying {agent_id} (attempt {attempt+1}/{max_retries+1})") print(f"[Agent] Retrying {agent_id} (attempt {attempt+1}/{max_retries+1})")

View File

@@ -24,6 +24,7 @@ from typing import Any, Dict, List, Optional, Set
import requests import requests
from anthropic import Anthropic from anthropic import Anthropic
from claude_agent_sdk import TextBlock, ToolUseBlock
from usage_tracker import UsageTracker from usage_tracker import UsageTracker
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -607,12 +608,13 @@ class LLMInterface:
assistant_messages.append(message.content) assistant_messages.append(message.content)
elif isinstance(message.content, list): elif isinstance(message.content, list):
for block in message.content: for block in message.content:
if hasattr(block, 'type'): # Use isinstance() checks instead of hasattr(block, 'type')
if block.type == 'text' and hasattr(block, 'text'): # ToolUseBlock dataclass has no .type attribute
assistant_messages.append(block.text) if isinstance(block, TextBlock):
elif block.type == 'tool_use' and hasattr(block, 'name'): assistant_messages.append(block.text)
tool_names.append(block.name) elif isinstance(block, ToolUseBlock):
self._last_tool_names = tool_names.copy() tool_names.append(block.name)
self._last_tool_names = tool_names.copy()
if isinstance(message, ResultMessage): if isinstance(message, ResultMessage):
# DEBUG: Log what we captured during message processing # DEBUG: Log what we captured during message processing

File diff suppressed because it is too large Load Diff