**Problem**: Fixed 10-minute timeout kills legitimately slow operations (e.g., 5-minute web searches) while infinite loops waste resources.

**Solution**: Dual-timeout strategy that distinguishes slow from stuck:

1. **Idle timeout** (5 min): No progress = kill
   - Tracks message_count growth via heartbeat
   - Only resets timer when count increases
   - Slow web searches keep progressing → allowed
2. **Total timeout** (15 min): Hard cap safety net
   - Prevents runaway tasks from consuming resources forever
   - Allows legitimately slow operations to complete
3. **Loop detection**: Kills after more than 5 errors
   - Tracks error_count and last_error
   - Detects repetitive failures quickly
   - Independent of time-based checks

**Key Changes**:

- SubAgentState: Add message_count, error_count tracking fields
- SubAgentManager.__init__: Dual timeout params (idle=300s, total=900s)
- SubAgentManager.update_activity: Accepts message_count, smart timer reset
- SubAgentManager.update_error: NEW - tracks errors for loop detection
- SubAgentManager.get_hung_agents: 3-check system (idle/total/loop)
- SubAgentManager.cleanup_agent: Detailed error messages by type
- agent.py heartbeat: Passes sub_agent.llm.message_count every 10s
- mcp_tools._DELEGATE_TIMEOUT: Increased to 900s (15 min)

**Impact**:

- Slow operations (5-12 min with progress) complete successfully
- Infinite loops killed in <5 min via idle timeout or error detection
- Clear diagnostics: "Idle timeout: No progress for 305s (23 messages)"
- Zero config needed - adaptive behavior works automatically

**Example**: CVE research taking 5 min with 117 messages now completes instead of timing out at 10 min. Loop with repeated errors killed at 3 min.

See ADAPTIVE_TIMEOUT_SYSTEM.md for full specification and scenarios.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
307 lines
11 KiB
Python
307 lines
11 KiB
Python
"""Sub-Agent Manager - Monitors and manages sub-agent lifecycle.

Handles:
- Sub-agent spawning and tracking
- Progress monitoring and hang detection
- Automatic cleanup and restart on timeout
"""
|
|
|
|
import time
|
|
import threading
|
|
from typing import Dict, Optional, Any
|
|
from dataclasses import dataclass
|
|
import logging
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class SubAgentState:
    """Track state of a running sub-agent.

    Field order matters: the first four fields are required and may be
    passed positionally; every other field has a default.
    """

    # Identifier the sub-agent is tracked under.
    agent_id: str
    # Human-readable description of the delegated task (used in log output).
    task_description: str
    # time.time() timestamp taken when the sub-agent was registered.
    started_at: float
    # time.time() timestamp of the last heartbeat that counted as progress.
    last_activity: float

    # Completion bookkeeping.
    is_complete: bool = False
    result: Optional[str] = None
    error: Optional[str] = None

    # Loop detection fields
    # Most recently reported message count.
    message_count: int = 0
    # Message count as it was before the most recent report.
    last_message_count: int = 0
    # Number of errors reported for this sub-agent so far.
    error_count: int = 0
    # Text of the most recently reported error, if any.
    last_error: Optional[str] = None
|
|
|
|
|
|
class SubAgentManager:
    """Manages sub-agent lifecycle with hang detection.

    Sub-agents are registered, send heartbeats via ``update_activity`` and
    report errors via ``update_error``. A background watchdog thread
    periodically looks for agents that are hung and marks them as failed.
    An agent counts as hung when one of these holds (checked in this order):

    - total: it has been running longer than ``total_timeout_seconds``
    - idle:  no progress was recorded for ``idle_timeout_seconds``
    - loop:  it reported more than ``max_errors`` errors

    All access to ``sub_agents`` goes through ``self._lock``.
    """

    # update_error() starts logging warnings once an agent has more than
    # this many errors.
    _ERROR_WARN_THRESHOLD = 3

    def __init__(
        self,
        idle_timeout_seconds: int = 300,   # 5 minutes idle (no progress)
        total_timeout_seconds: int = 900,  # 15 minutes total (hard cap)
        max_errors: int = 5,
        check_interval_seconds: float = 30.0,
    ):
        """Initialize manager.

        Args:
            idle_timeout_seconds: Max time without progress before killing
                (distinguishes slow from stuck)
            total_timeout_seconds: Absolute maximum runtime (safety net for
                legitimately slow tasks)
            max_errors: An agent with more than this many reported errors is
                treated as stuck in a loop
            check_interval_seconds: Pause between two watchdog passes
        """
        self.idle_timeout_seconds = idle_timeout_seconds
        self.total_timeout_seconds = total_timeout_seconds
        self.max_errors = max_errors
        self.check_interval_seconds = check_interval_seconds
        self.sub_agents: Dict[str, SubAgentState] = {}
        self._lock = threading.Lock()
        self._watchdog_thread: Optional[threading.Thread] = None
        self._watchdog_running = False
        # Set by stop_watchdog() to wake the watchdog out of its wait.
        self._stop_event = threading.Event()

    def start_watchdog(self) -> None:
        """Start the watchdog thread that monitors for hung sub-agents.

        Does nothing if the watchdog is already running.
        """
        if self._watchdog_running:
            return

        # Each run gets its own stop event: a previous watchdog thread that
        # is still finishing keeps seeing *its* event set and exits, instead
        # of being revived by this restart.
        self._stop_event = threading.Event()
        self._watchdog_running = True
        self._watchdog_thread = threading.Thread(
            target=self._watchdog_loop,
            args=(self._stop_event,),
            daemon=True,
            name="SubAgentWatchdog",
        )
        self._watchdog_thread.start()
        logger.info(
            "[SubAgentManager] Watchdog started (idle: %ds, total: %ds)",
            self.idle_timeout_seconds,
            self.total_timeout_seconds,
        )

    def stop_watchdog(self) -> None:
        """Stop the watchdog thread, waiting up to 2 seconds for it to exit."""
        self._watchdog_running = False
        # Wake the thread immediately rather than letting it finish its wait.
        self._stop_event.set()
        if self._watchdog_thread:
            self._watchdog_thread.join(timeout=2)

    def register_sub_agent(
        self,
        agent_id: str,
        task_description: str,
    ) -> None:
        """Register a new sub-agent for monitoring.

        An existing entry with the same ``agent_id`` is replaced.
        """
        with self._lock:
            now = time.time()
            self.sub_agents[agent_id] = SubAgentState(
                agent_id=agent_id,
                task_description=task_description,
                started_at=now,
                last_activity=now,
            )
            logger.info(
                "[SubAgentManager] Registered sub-agent: %s - %s",
                agent_id, task_description,
            )

    def update_activity(self, agent_id: str, message_count: Optional[int] = None) -> None:
        """Update last activity timestamp and message count for a sub-agent.

        Unknown agent IDs are ignored.

        Args:
            agent_id: The sub-agent ID
            message_count: Current message count (indicates progress). When
                given, the idle timer is only reset if the count grew since
                the previous report; when omitted, the call is a plain
                heartbeat and always resets the idle timer.
        """
        with self._lock:
            state = self.sub_agents.get(agent_id)
            if state is None:
                return

            now = time.time()
            if message_count is None:
                # No message count provided - basic heartbeat
                state.last_activity = now
                return

            state.last_message_count = state.message_count
            state.message_count = message_count

            # Only a growing message count is progress; an unchanged count is
            # just an idle heartbeat and must not reset the idle timer.
            if message_count > state.last_message_count:
                state.last_activity = now

    def update_error(self, agent_id: str, error_message: str) -> None:
        """Track error occurrences for loop detection.

        Unknown agent IDs are ignored.

        Args:
            agent_id: The sub-agent ID
            error_message: The error message that occurred
        """
        with self._lock:
            state = self.sub_agents.get(agent_id)
            if state is None:
                return

            state.error_count += 1
            # Store last error for debugging
            state.last_error = error_message

            # Log repetitive errors
            if state.error_count > self._ERROR_WARN_THRESHOLD:
                logger.warning(
                    "[SubAgentManager] Sub-agent %s has %d errors (last: %s...)",
                    agent_id, state.error_count, error_message[:80],
                )

    def mark_complete(
        self,
        agent_id: str,
        result: Optional[str] = None,
        error: Optional[str] = None,
    ) -> None:
        """Mark a sub-agent as complete, storing its result and error.

        Unknown agent IDs are ignored.
        """
        with self._lock:
            state = self.sub_agents.get(agent_id)
            if state is None:
                return
            state.is_complete = True
            state.result = result
            state.error = error
            logger.info(
                "[SubAgentManager] Sub-agent completed: %s (success=%s)",
                agent_id, error is None,
            )

    def _hang_reason(self, state: "SubAgentState", now: float) -> Optional[str]:
        """Return "total", "idle" or "loop" if ``state`` looks hung, else None.

        Does not look at ``is_complete`` and does not take the lock; callers
        hold ``self._lock``.
        """
        if now - state.started_at > self.total_timeout_seconds:
            return "total"
        if now - state.last_activity > self.idle_timeout_seconds:
            return "idle"
        if state.error_count > self.max_errors:
            return "loop"
        return None

    def get_hung_agents(self) -> list:
        """Get list of sub-agent IDs that appear to be hung.

        Completed agents are skipped. Uses adaptive detection:
        - Idle timeout: No progress (message count unchanged) for idle_timeout_seconds
        - Total timeout: Running longer than total_timeout_seconds regardless of progress
        - Loop detection: More than max_errors reported errors

        A warning is logged for every hung agent found.
        """
        now = time.time()
        hung = []

        with self._lock:
            for agent_id, state in self.sub_agents.items():
                if state.is_complete:
                    continue

                reason = self._hang_reason(state, now)
                if reason is None:
                    continue
                hung.append(agent_id)

                if reason == "total":
                    logger.warning(
                        "[SubAgentManager] Sub-agent exceeded total timeout: %s - %s "
                        "(running %.1fs > %ds total limit)",
                        agent_id, state.task_description,
                        now - state.started_at, self.total_timeout_seconds,
                    )
                elif reason == "idle":
                    logger.warning(
                        "[SubAgentManager] Sub-agent idle timeout: %s - %s "
                        "(no progress for %.1fs, messages: %d)",
                        agent_id, state.task_description,
                        now - state.last_activity, state.message_count,
                    )
                else:
                    logger.warning(
                        "[SubAgentManager] Sub-agent loop detected: %s - %s "
                        "(error count: %d, last error: %s)",
                        agent_id, state.task_description, state.error_count,
                        state.last_error[:100] if state.last_error else "None",
                    )

        return hung

    def cleanup_agent(self, agent_id: str) -> None:
        """Clean up a hung sub-agent by marking it complete with an error.

        Unknown agent IDs are ignored. Agents that are already complete are
        left untouched: the agent may have finished between hang detection
        and this call, and its stored result/error must not be overwritten.
        """
        with self._lock:
            state = self.sub_agents.get(agent_id)
            if state is None or state.is_complete:
                return

            now = time.time()
            total_runtime = now - state.started_at
            idle_time = now - state.last_activity

            logger.error(
                "[SubAgentManager] Cleaning up hung sub-agent: %s - %s (hung for %.1fs)",
                agent_id,
                state.task_description,
                idle_time,
            )

            # Mark as failed
            state.is_complete = True

            # Build detailed error message based on timeout type
            reason = self._hang_reason(state, now)
            if reason == "total":
                state.error = (
                    f"Total timeout: Exceeded {self.total_timeout_seconds}s limit "
                    f"(ran {total_runtime:.1f}s, {state.message_count} messages)"
                )
            elif reason == "idle":
                state.error = (
                    f"Idle timeout: No progress for {idle_time:.1f}s "
                    f"(limit: {self.idle_timeout_seconds}s, {state.message_count} messages)"
                )
            else:
                # Neither time limit is exceeded, so report the error loop.
                state.error = (
                    f"Loop detected: {state.error_count} errors, "
                    f"last: {state.last_error[:100] if state.last_error else 'None'}"
                )

    def _watchdog_loop(self, stop_event: Optional[threading.Event] = None) -> None:
        """Watchdog loop that runs in background thread.

        Repeatedly cleans up hung agents, pausing ``check_interval_seconds``
        between passes, until the watchdog is stopped.

        Args:
            stop_event: Event that ends this loop when set; defaults to the
                manager's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        while self._watchdog_running and not stop_event.is_set():
            try:
                for agent_id in self.get_hung_agents():
                    self.cleanup_agent(agent_id)
            except Exception as e:
                # Keep the watchdog alive; log with traceback for diagnosis.
                logger.exception("[SubAgentManager] Watchdog error: %s", e)

            # Interruptible pause: returns early as soon as stop is requested.
            stop_event.wait(self.check_interval_seconds)

    def get_status(self) -> Dict[str, Any]:
        """Get current status of all sub-agents.

        Returns:
            A dict with the counters "total", "complete", "running" and
            "hung", plus "agents": one dict per sub-agent with the keys
            "id", "task", "runtime", "idle_time", "complete" and "has_error".
            "hung" uses the same criteria as ``get_hung_agents``.
        """
        now = time.time()
        status = {
            "total": 0,
            "complete": 0,
            "running": 0,
            "hung": 0,
            "agents": [],
        }

        with self._lock:
            status["total"] = len(self.sub_agents)
            for agent_id, state in self.sub_agents.items():
                if state.is_complete:
                    status["complete"] += 1
                elif self._hang_reason(state, now) is not None:
                    status["hung"] += 1
                else:
                    status["running"] += 1

                status["agents"].append({
                    "id": agent_id,
                    "task": state.task_description,
                    "runtime": now - state.started_at,
                    "idle_time": now - state.last_activity,
                    "complete": state.is_complete,
                    "has_error": state.error is not None,
                })

        return status

    def clear_completed(self) -> None:
        """Remove completed sub-agents from tracking."""
        with self._lock:
            # Collect IDs first: the dict must not change size while iterated.
            completed = [
                agent_id for agent_id, state in self.sub_agents.items()
                if state.is_complete
            ]
            for agent_id in completed:
                del self.sub_agents[agent_id]

            if completed:
                logger.info("[SubAgentManager] Cleared %d completed sub-agents", len(completed))
|