Files
ajarbot/sub_agent_manager.py

198 lines
6.5 KiB
Python
Raw Normal View History

"""Sub-Agent Manager - Monitors and manages sub-agent lifecycle.
Handles:
- Sub-agent spawning and tracking
- Progress monitoring and hang detection
- Automatic cleanup and restart on timeout
"""
import time
import threading
from typing import Dict, Optional, Any
from dataclasses import dataclass
import logging
logger = logging.getLogger(__name__)
@dataclass
class SubAgentState:
    """Track state of a running sub-agent.

    One record is kept per registered sub-agent. ``SubAgentManager`` creates
    it with both timestamps set to the same ``time.time()`` value and then
    mutates it in place as the sub-agent reports activity or finishes.
    """

    # Key under which the record is stored in SubAgentManager.sub_agents.
    agent_id: str
    # Human-readable description of the work; used in log messages and status.
    task_description: str
    # time.time() value captured at registration.
    started_at: float
    # time.time() value of the most recent reported activity.
    last_activity: float
    # True once the sub-agent finished or was marked failed after a timeout.
    is_complete: bool = False
    # Result supplied on completion, if any.
    result: Optional[str] = None
    # Error description; None means no error has been recorded.
    error: Optional[str] = None
class SubAgentManager:
    """Track sub-agents and flag the ones that stop reporting progress.

    Every registered sub-agent gets a ``SubAgentState`` record in
    ``self.sub_agents``. An optional background watchdog thread periodically
    looks for records whose last activity is older than ``timeout_seconds``
    and marks them as failed.

    This class only does bookkeeping: it does not itself terminate or restart
    the underlying sub-agent work.

    All access to ``self.sub_agents`` made by this class happens under
    ``self._lock``.
    """

    def __init__(
        self,
        timeout_seconds: int = 300,  # 5 minutes default
        check_interval_seconds: float = 30.0,
    ):
        """Initialize manager.

        Args:
            timeout_seconds: Maximum time without reported activity before a
                sub-agent is considered hung.
            check_interval_seconds: How long the watchdog waits between scans
                for hung sub-agents.
        """
        self.timeout_seconds = timeout_seconds
        self.check_interval_seconds = check_interval_seconds
        self.sub_agents: Dict[str, SubAgentState] = {}
        self._lock = threading.Lock()
        self._watchdog_thread: Optional[threading.Thread] = None
        self._watchdog_running = False
        # Replaced with a fresh Event on every start_watchdog() call.
        self._stop_event = threading.Event()

    def start_watchdog(self) -> None:
        """Start the watchdog thread that monitors for hung sub-agents.

        Does nothing if the watchdog is already marked as running.
        """
        if self._watchdog_running:
            return

        self._watchdog_running = True
        # Each thread gets its own stop event. If a previous thread outlived
        # stop_watchdog()'s join timeout, its (already set) event still makes
        # it exit instead of being revived by this restart.
        stop_event = threading.Event()
        self._stop_event = stop_event
        self._watchdog_thread = threading.Thread(
            target=self._watchdog_loop,
            args=(stop_event,),
            daemon=True,
            name="SubAgentWatchdog",
        )
        self._watchdog_thread.start()
        logger.info("[SubAgentManager] Watchdog started (timeout: %ds)", self.timeout_seconds)

    def stop_watchdog(self) -> None:
        """Stop the watchdog thread.

        Signals the thread to exit and waits up to 2 seconds for it to do so.
        """
        self._watchdog_running = False
        # Wakes the thread immediately if it is waiting between scans.
        self._stop_event.set()
        if self._watchdog_thread:
            self._watchdog_thread.join(timeout=2)

    def register_sub_agent(
        self,
        agent_id: str,
        task_description: str
    ) -> None:
        """Register a new sub-agent for monitoring.

        An existing record with the same ``agent_id`` is replaced.

        Args:
            agent_id: Key used to refer to the sub-agent in later calls.
            task_description: Description shown in logs and status output.
        """
        now = time.time()
        state = SubAgentState(
            agent_id=agent_id,
            task_description=task_description,
            started_at=now,
            last_activity=now,
        )
        with self._lock:
            self.sub_agents[agent_id] = state
        logger.info("[SubAgentManager] Registered sub-agent: %s - %s", agent_id, task_description)

    def update_activity(self, agent_id: str) -> None:
        """Update last activity timestamp for a sub-agent.

        Unknown ``agent_id`` values are ignored.
        """
        with self._lock:
            state = self.sub_agents.get(agent_id)
            if state is not None:
                state.last_activity = time.time()

    def mark_complete(
        self,
        agent_id: str,
        result: Optional[str] = None,
        error: Optional[str] = None
    ) -> None:
        """Mark a sub-agent as complete.

        Unknown ``agent_id`` values are ignored.

        Args:
            agent_id: Sub-agent to mark.
            result: Result to record on the state.
            error: Error to record; ``None`` is logged as success.
        """
        with self._lock:
            state = self.sub_agents.get(agent_id)
            if state is None:
                return
            state.is_complete = True
            state.result = result
            state.error = error
        logger.info("[SubAgentManager] Sub-agent completed: %s (success=%s)",
                    agent_id, error is None)

    def get_hung_agents(self) -> list:
        """Get list of sub-agent IDs that appear to be hung.

        A sub-agent is considered hung when it is not complete and its last
        activity is more than ``timeout_seconds`` in the past. A warning is
        logged for each one.

        Returns:
            List of agent ID strings, in registration order.
        """
        suspects = []
        with self._lock:
            now = time.time()
            for agent_id, state in self.sub_agents.items():
                if state.is_complete:
                    continue
                idle = now - state.last_activity
                if idle > self.timeout_seconds:
                    suspects.append((agent_id, state.task_description, idle))

        # Log after releasing the lock so logging never delays other callers.
        for agent_id, task, idle in suspects:
            logger.warning(
                "[SubAgentManager] Sub-agent appears hung: %s - %s (no activity for %.1fs)",
                agent_id, task, idle
            )
        return [agent_id for agent_id, _, _ in suspects]

    def cleanup_agent(self, agent_id: str) -> None:
        """Clean up a hung sub-agent by marking it as failed.

        The state is flagged complete with a timeout error. Unknown IDs are
        ignored, and so are sub-agents that are already complete: an agent may
        finish between being detected as hung and being cleaned up, and its
        recorded result/error must not be overwritten.

        The idle time is not re-checked here, so the call also works as a
        forced cleanup of a still-running record.
        """
        with self._lock:
            state = self.sub_agents.get(agent_id)
            if state is None or state.is_complete:
                return
            idle = time.time() - state.last_activity
            task = state.task_description
            # Mark as failed
            state.is_complete = True
            state.error = f"Timeout: No progress for {self.timeout_seconds}s"

        logger.error(
            "[SubAgentManager] Cleaning up hung sub-agent: %s - %s (hung for %.1fs)",
            agent_id,
            task,
            idle
        )

    def _watchdog_loop(self, stop_event: Optional[threading.Event] = None) -> None:
        """Watchdog loop that runs in background thread.

        Scans for hung sub-agents, cleans them up, then waits
        ``check_interval_seconds`` before the next scan. Exits when
        ``stop_event`` is set or ``_watchdog_running`` becomes false.

        Args:
            stop_event: Event that ends this loop; defaults to the manager's
                current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        while self._watchdog_running and not stop_event.is_set():
            try:
                for agent_id in self.get_hung_agents():
                    self.cleanup_agent(agent_id)
            except Exception as e:
                # Keep the watchdog alive, but preserve the traceback.
                logger.exception("[SubAgentManager] Watchdog error: %s", e)
            # Unlike time.sleep(), this returns as soon as a stop is requested.
            stop_event.wait(self.check_interval_seconds)

    def get_status(self) -> Dict[str, Any]:
        """Get current status of all sub-agents.

        Returns:
            Dict with ``total``, ``complete``, ``running`` and ``hung`` counts
            plus an ``agents`` list holding one dict per sub-agent with keys
            ``id``, ``task``, ``runtime``, ``idle_time``, ``complete`` and
            ``has_error``.
        """
        status = {
            "total": 0,
            "complete": 0,
            "running": 0,
            "hung": 0,
            "agents": []
        }
        with self._lock:
            now = time.time()
            # Counted under the lock so it always matches the other counters.
            status["total"] = len(self.sub_agents)
            for agent_id, state in self.sub_agents.items():
                idle = now - state.last_activity
                if state.is_complete:
                    status["complete"] += 1
                elif idle > self.timeout_seconds:
                    status["hung"] += 1
                else:
                    status["running"] += 1
                status["agents"].append({
                    "id": agent_id,
                    "task": state.task_description,
                    "runtime": now - state.started_at,
                    "idle_time": idle,
                    "complete": state.is_complete,
                    "has_error": state.error is not None
                })
        return status

    def clear_completed(self) -> None:
        """Remove completed sub-agents from tracking."""
        with self._lock:
            completed = [
                agent_id for agent_id, state in self.sub_agents.items()
                if state.is_complete
            ]
            for agent_id in completed:
                del self.sub_agents[agent_id]
        if completed:
            logger.info("[SubAgentManager] Cleared %d completed sub-agents", len(completed))