Fix critical performance issues: thread pool exhaustion and tool tracking
Root Cause Analysis:
- delegate_task used run_in_executor with the default ThreadPoolExecutor (8-12 threads)
- Each delegation blocked one thread for 2-8 minutes (the full sub-agent conversation)
- After 6-8 parallel delegations the pool was exhausted → all work hung
- Tool tracking used hasattr(block, 'type'), but ToolUseBlock has no .type attribute

Changes:
1. mcp_tools.py: Replace thread pool with dedicated threads
   - Each delegate_task creates a dedicated daemon thread with an isolated event loop
   - Uses asyncio.Future + loop.call_soon_threadsafe for result communication
   - Added a semaphore to limit concurrent delegations (4 max)
   - Eliminates pool exhaustion: delegations no longer occupy the shared executor, so parallel delegations are bounded only by the semaphore rather than by pool size
2. llm_interface.py: Fix tool tracking
   - Added TextBlock/ToolUseBlock imports from claude_agent_sdk
   - Replaced hasattr(block, 'type') checks with isinstance() checks
   - Fixes the tool_calls=0 bug (tools used are now tracked correctly)
3. agent.py: Event loop isolation and thread safety
   - Added a defensive sub_agent.llm._event_loop = None in spawn_sub_agent
   - Ensures sub-agents use the asyncio.run() fallback with isolated loops
   - Generates unique agent IDs with timestamps to prevent caching race conditions

Impact:
- Fixes the 6-8 message hang pattern (no more 10-minute timeouts)
- Enables parallel sub-agent execution via delegate_task
- Tool tracking now reports accurate tool usage counts
- All sub-agents remain in Agent SDK mode (as required)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
12
agent.py
12
agent.py
@@ -1,5 +1,6 @@
|
|||||||
"""AI Agent with Memory and LLM Integration."""
|
"""AI Agent with Memory and LLM Integration."""
|
||||||
|
|
||||||
|
import random
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from typing import Any, List, Optional, Callable
|
from typing import Any, List, Optional, Callable
|
||||||
@@ -87,6 +88,10 @@ class Agent:
|
|||||||
specialist_prompt=specialist_prompt,
|
specialist_prompt=specialist_prompt,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# DEFENSIVE: Ensure sub-agent never inherits main event loop
|
||||||
|
# Sub-agents run in dedicated threads with isolated loops
|
||||||
|
sub_agent.llm._event_loop = None
|
||||||
|
|
||||||
# Set agent_id for activity tracking
|
# Set agent_id for activity tracking
|
||||||
sub_agent.agent_id = agent_id
|
sub_agent.agent_id = agent_id
|
||||||
|
|
||||||
@@ -108,6 +113,13 @@ class Agent:
|
|||||||
max_retries: int = 1,
|
max_retries: int = 1,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Delegate a task to a specialist sub-agent with automatic retry on hang."""
|
"""Delegate a task to a specialist sub-agent with automatic retry on hang."""
|
||||||
|
# Generate unique agent IDs to prevent caching race conditions in parallel delegations
|
||||||
|
if not agent_id:
|
||||||
|
agent_id = f"sub_{int(time.time()*1000)}_{random.randint(1000,9999)}"
|
||||||
|
else:
|
||||||
|
# Add timestamp to user-provided ID to ensure uniqueness
|
||||||
|
agent_id = f"{agent_id}_{int(time.time()*1000)}"
|
||||||
|
|
||||||
for attempt in range(max_retries + 1):
|
for attempt in range(max_retries + 1):
|
||||||
if attempt > 0:
|
if attempt > 0:
|
||||||
print(f"[Agent] Retrying {agent_id} (attempt {attempt+1}/{max_retries+1})")
|
print(f"[Agent] Retrying {agent_id} (attempt {attempt+1}/{max_retries+1})")
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from typing import Any, Dict, List, Optional, Set
|
|||||||
|
|
||||||
import requests
|
import requests
|
||||||
from anthropic import Anthropic
|
from anthropic import Anthropic
|
||||||
|
from claude_agent_sdk import TextBlock, ToolUseBlock
|
||||||
from usage_tracker import UsageTracker
|
from usage_tracker import UsageTracker
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -607,12 +608,13 @@ class LLMInterface:
|
|||||||
assistant_messages.append(message.content)
|
assistant_messages.append(message.content)
|
||||||
elif isinstance(message.content, list):
|
elif isinstance(message.content, list):
|
||||||
for block in message.content:
|
for block in message.content:
|
||||||
if hasattr(block, 'type'):
|
# Use isinstance() checks instead of hasattr(block, 'type')
|
||||||
if block.type == 'text' and hasattr(block, 'text'):
|
# ToolUseBlock dataclass has no .type attribute
|
||||||
assistant_messages.append(block.text)
|
if isinstance(block, TextBlock):
|
||||||
elif block.type == 'tool_use' and hasattr(block, 'name'):
|
assistant_messages.append(block.text)
|
||||||
tool_names.append(block.name)
|
elif isinstance(block, ToolUseBlock):
|
||||||
self._last_tool_names = tool_names.copy()
|
tool_names.append(block.name)
|
||||||
|
self._last_tool_names = tool_names.copy()
|
||||||
|
|
||||||
if isinstance(message, ResultMessage):
|
if isinstance(message, ResultMessage):
|
||||||
# DEBUG: Log what we captured during message processing
|
# DEBUG: Log what we captured during message processing
|
||||||
|
|||||||
2102
mcp_tools.py
2102
mcp_tools.py
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user