Implement adaptive timeout system with activity-based loop detection
**Problem**: Fixed 10-minute timeout kills legitimately slow operations (e.g., 5-minute web searches) while infinite loops waste resources. **Solution**: Dual-timeout strategy that distinguishes slow from stuck: 1. **Idle timeout** (5 min): No progress = kill - Tracks message_count growth via heartbeat - Only resets timer when count increases - Slow web searches keep progressing → allowed 2. **Total timeout** (15 min): Hard cap safety net - Prevents runaway tasks from consuming resources forever - Allows legitimately slow operations to complete 3. **Loop detection**: Kills after 5+ errors - Tracks error_count and last_error - Detects repetitive failures quickly - Independent of time-based checks **Key Changes**: - SubAgentState: Add message_count, error_count tracking fields - SubAgentManager.__init__: Dual timeout params (idle=300s, total=900s) - SubAgentManager.update_activity: Accepts message_count, smart timer reset - SubAgentManager.update_error: NEW - tracks errors for loop detection - SubAgentManager.get_hung_agents: 3-check system (idle/total/loop) - SubAgentManager.cleanup_agent: Detailed error messages by type - agent.py heartbeat: Passes sub_agent.llm.message_count every 10s - mcp_tools._DELEGATE_TIMEOUT: Increased to 900s (15 min) **Impact**: - Slow operations (5-12 min with progress) complete successfully - Infinite loops killed in <5 min via idle timeout or error detection - Clear diagnostics: "Idle timeout: No progress for 305s (23 messages)" - Zero config needed - adaptive behavior works automatically **Example**: CVE research taking 5 min with 117 messages now completes instead of timing out at 10 min. Loop with repeated errors killed at 3 min. See ADAPTIVE_TIMEOUT_SYSTEM.md for full specification and scenarios. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -25,18 +25,29 @@ class SubAgentState:
|
||||
is_complete: bool = False
|
||||
result: Optional[str] = None
|
||||
error: Optional[str] = None
|
||||
# Loop detection fields
|
||||
message_count: int = 0
|
||||
last_message_count: int = 0
|
||||
error_count: int = 0
|
||||
last_error: Optional[str] = None
|
||||
|
||||
|
||||
class SubAgentManager:
|
||||
"""Manages sub-agent lifecycle with hang detection and auto-restart."""
|
||||
|
||||
def __init__(self, timeout_seconds: int = 300): # 5 minutes default
|
||||
def __init__(
|
||||
self,
|
||||
idle_timeout_seconds: int = 300, # 5 minutes idle (no progress)
|
||||
total_timeout_seconds: int = 900, # 15 minutes total (hard cap)
|
||||
):
|
||||
"""Initialize manager.
|
||||
|
||||
Args:
|
||||
timeout_seconds: Maximum time without progress before killing sub-agent
|
||||
idle_timeout_seconds: Max time without progress before killing (distinguishes slow from stuck)
|
||||
total_timeout_seconds: Absolute maximum runtime (safety net for legitimately slow tasks)
|
||||
"""
|
||||
self.timeout_seconds = timeout_seconds
|
||||
self.idle_timeout_seconds = idle_timeout_seconds
|
||||
self.total_timeout_seconds = total_timeout_seconds
|
||||
self.sub_agents: Dict[str, SubAgentState] = {}
|
||||
self._lock = threading.Lock()
|
||||
self._watchdog_thread: Optional[threading.Thread] = None
|
||||
@@ -54,7 +65,11 @@ class SubAgentManager:
|
||||
name="SubAgentWatchdog"
|
||||
)
|
||||
self._watchdog_thread.start()
|
||||
logger.info("[SubAgentManager] Watchdog started (timeout: %ds)", self.timeout_seconds)
|
||||
logger.info(
|
||||
"[SubAgentManager] Watchdog started (idle: %ds, total: %ds)",
|
||||
self.idle_timeout_seconds,
|
||||
self.total_timeout_seconds
|
||||
)
|
||||
|
||||
def stop_watchdog(self) -> None:
|
||||
"""Stop the watchdog thread."""
|
||||
@@ -78,11 +93,54 @@ class SubAgentManager:
|
||||
)
|
||||
logger.info("[SubAgentManager] Registered sub-agent: %s - %s", agent_id, task_description)
|
||||
|
||||
def update_activity(self, agent_id: str) -> None:
|
||||
"""Update last activity timestamp for a sub-agent."""
|
||||
def update_activity(self, agent_id: str, message_count: Optional[int] = None) -> None:
|
||||
"""Update last activity timestamp and message count for a sub-agent.
|
||||
|
||||
Args:
|
||||
agent_id: The sub-agent ID
|
||||
message_count: Current message count (indicates progress)
|
||||
"""
|
||||
with self._lock:
|
||||
if agent_id in self.sub_agents:
|
||||
self.sub_agents[agent_id].last_activity = time.time()
|
||||
state = self.sub_agents[agent_id]
|
||||
now = time.time()
|
||||
|
||||
# Update message count if provided
|
||||
if message_count is not None:
|
||||
state.last_message_count = state.message_count
|
||||
state.message_count = message_count
|
||||
|
||||
# Only update activity timestamp if message count actually increased
|
||||
# (this distinguishes active progress from idle heartbeat)
|
||||
if message_count > state.last_message_count:
|
||||
state.last_activity = now
|
||||
else:
|
||||
# No message count provided - basic heartbeat
|
||||
state.last_activity = now
|
||||
|
||||
def update_error(self, agent_id: str, error_message: str) -> None:
|
||||
"""Track error occurrences for loop detection.
|
||||
|
||||
Args:
|
||||
agent_id: The sub-agent ID
|
||||
error_message: The error message that occurred
|
||||
"""
|
||||
with self._lock:
|
||||
if agent_id in self.sub_agents:
|
||||
state = self.sub_agents[agent_id]
|
||||
|
||||
# Increment error count
|
||||
state.error_count += 1
|
||||
|
||||
# Store last error for debugging
|
||||
state.last_error = error_message
|
||||
|
||||
# Log repetitive errors
|
||||
if state.error_count > 3:
|
||||
logger.warning(
|
||||
"[SubAgentManager] Sub-agent %s has %d errors (last: %s...)",
|
||||
agent_id, state.error_count, error_message[:80]
|
||||
)
|
||||
|
||||
def mark_complete(
|
||||
self,
|
||||
@@ -100,7 +158,13 @@ class SubAgentManager:
|
||||
agent_id, error is None)
|
||||
|
||||
def get_hung_agents(self) -> list:
|
||||
"""Get list of sub-agent IDs that appear to be hung."""
|
||||
"""Get list of sub-agent IDs that appear to be hung.
|
||||
|
||||
Uses adaptive detection:
|
||||
- Idle timeout: No progress (message count unchanged) for idle_timeout_seconds
|
||||
- Total timeout: Running longer than total_timeout_seconds regardless of progress
|
||||
- Loop detection: Repetitive errors or no message growth
|
||||
"""
|
||||
now = time.time()
|
||||
hung = []
|
||||
|
||||
@@ -109,13 +173,39 @@ class SubAgentManager:
|
||||
if state.is_complete:
|
||||
continue
|
||||
|
||||
time_since_activity = now - state.last_activity
|
||||
if time_since_activity > self.timeout_seconds:
|
||||
total_runtime = now - state.started_at
|
||||
idle_time = now - state.last_activity
|
||||
|
||||
# Check 1: Total timeout (hard cap)
|
||||
if total_runtime > self.total_timeout_seconds:
|
||||
hung.append(agent_id)
|
||||
logger.warning(
|
||||
"[SubAgentManager] Sub-agent appears hung: %s - %s (no activity for %.1fs)",
|
||||
agent_id, state.task_description, time_since_activity
|
||||
"[SubAgentManager] Sub-agent exceeded total timeout: %s - %s "
|
||||
"(running %.1fs > %ds total limit)",
|
||||
agent_id, state.task_description, total_runtime, self.total_timeout_seconds
|
||||
)
|
||||
continue
|
||||
|
||||
# Check 2: Idle timeout (no progress)
|
||||
if idle_time > self.idle_timeout_seconds:
|
||||
hung.append(agent_id)
|
||||
logger.warning(
|
||||
"[SubAgentManager] Sub-agent idle timeout: %s - %s "
|
||||
"(no progress for %.1fs, messages: %d)",
|
||||
agent_id, state.task_description, idle_time, state.message_count
|
||||
)
|
||||
continue
|
||||
|
||||
# Check 3: Loop detection - high error count
|
||||
if state.error_count > 5:
|
||||
hung.append(agent_id)
|
||||
logger.warning(
|
||||
"[SubAgentManager] Sub-agent loop detected: %s - %s "
|
||||
"(error count: %d, last error: %s)",
|
||||
agent_id, state.task_description, state.error_count,
|
||||
state.last_error[:100] if state.last_error else "None"
|
||||
)
|
||||
continue
|
||||
|
||||
return hung
|
||||
|
||||
@@ -133,7 +223,25 @@ class SubAgentManager:
|
||||
|
||||
# Mark as failed
|
||||
state.is_complete = True
|
||||
state.error = f"Timeout: No progress for {self.timeout_seconds}s"
|
||||
total_runtime = time.time() - state.started_at
|
||||
idle_time = time.time() - state.last_activity
|
||||
|
||||
# Build detailed error message based on timeout type
|
||||
if total_runtime > self.total_timeout_seconds:
|
||||
state.error = (
|
||||
f"Total timeout: Exceeded {self.total_timeout_seconds}s limit "
|
||||
f"(ran {total_runtime:.1f}s, {state.message_count} messages)"
|
||||
)
|
||||
elif idle_time > self.idle_timeout_seconds:
|
||||
state.error = (
|
||||
f"Idle timeout: No progress for {idle_time:.1f}s "
|
||||
f"(limit: {self.idle_timeout_seconds}s, {state.message_count} messages)"
|
||||
)
|
||||
else:
|
||||
state.error = (
|
||||
f"Loop detected: {state.error_count} errors, "
|
||||
f"last: {state.last_error[:100] if state.last_error else 'None'}"
|
||||
)
|
||||
|
||||
def _watchdog_loop(self) -> None:
|
||||
"""Watchdog loop that runs in background thread."""
|
||||
@@ -174,7 +282,8 @@ class SubAgentManager:
|
||||
|
||||
if state.is_complete:
|
||||
status["complete"] += 1
|
||||
elif (now - state.last_activity) > self.timeout_seconds:
|
||||
elif ((now - state.last_activity) > self.idle_timeout_seconds or
|
||||
(now - state.started_at) > self.total_timeout_seconds):
|
||||
status["hung"] += 1
|
||||
else:
|
||||
status["running"] += 1
|
||||
|
||||
Reference in New Issue
Block a user