ajarbot/sub_agent_manager.py

"""Sub-Agent Manager - Monitors and manages sub-agent lifecycle.

Handles:
- Sub-agent spawning and tracking
- Progress monitoring and hang detection
- Automatic cleanup and restart on timeout
"""

import time
import threading
from typing import Dict, Optional, Any
from dataclasses import dataclass
import logging

logger = logging.getLogger(__name__)


@dataclass
class SubAgentState:
    """Track state of a running sub-agent."""
    agent_id: str
    task_description: str
    started_at: float
    last_activity: float
    is_complete: bool = False
    result: Optional[str] = None
    error: Optional[str] = None
    # Loop detection fields
    message_count: int = 0
    last_message_count: int = 0
    error_count: int = 0
    last_error: Optional[str] = None


class SubAgentManager:
    """Manages sub-agent lifecycle with hang detection and auto-restart."""

    def __init__(
        self,
        idle_timeout_seconds: int = 300,  # 5 minutes idle (no progress)
        total_timeout_seconds: int = 900,  # 15 minutes total (hard cap)
    ):
        """Initialize manager.

        Args:
            idle_timeout_seconds: Max time without progress before killing (distinguishes slow from stuck)
            total_timeout_seconds: Absolute maximum runtime (safety net for legitimately slow tasks)
        """
        self.idle_timeout_seconds = idle_timeout_seconds
        self.total_timeout_seconds = total_timeout_seconds
        self.sub_agents: Dict[str, SubAgentState] = {}
        self._lock = threading.Lock()
        self._watchdog_thread: Optional[threading.Thread] = None
        self._watchdog_running = False

    def start_watchdog(self) -> None:
        """Start the watchdog thread that monitors for hung sub-agents."""
        if self._watchdog_running:
            return

        self._watchdog_running = True
        self._watchdog_thread = threading.Thread(
            target=self._watchdog_loop,
            daemon=True,
            name="SubAgentWatchdog"
        )
        self._watchdog_thread.start()
        logger.info(
            "[SubAgentManager] Watchdog started (idle: %ds, total: %ds)",
            self.idle_timeout_seconds,
            self.total_timeout_seconds
        )

    def stop_watchdog(self) -> None:
        """Stop the watchdog thread."""
        self._watchdog_running = False
        if self._watchdog_thread:
            self._watchdog_thread.join(timeout=2)

    def register_sub_agent(
        self,
        agent_id: str,
        task_description: str
    ) -> None:
        """Register a new sub-agent for monitoring."""
        with self._lock:
            now = time.time()
            self.sub_agents[agent_id] = SubAgentState(
                agent_id=agent_id,
                task_description=task_description,
                started_at=now,
                last_activity=now
            )
            logger.info("[SubAgentManager] Registered sub-agent: %s - %s", agent_id, task_description)

    def update_activity(self, agent_id: str, message_count: Optional[int] = None) -> None:
        """Update last activity timestamp and message count for a sub-agent.

        Args:
            agent_id: The sub-agent ID
            message_count: Current message count (indicates progress)
        """
        with self._lock:
            if agent_id in self.sub_agents:
                state = self.sub_agents[agent_id]
                now = time.time()

                # Update message count if provided
                if message_count is not None:
                    state.last_message_count = state.message_count
                    state.message_count = message_count

                    # Only update activity timestamp if message count actually increased
                    # (this distinguishes active progress from idle heartbeat)
                    if message_count > state.last_message_count:
                        state.last_activity = now
                else:
                    # No message count provided - basic heartbeat
                    state.last_activity = now

    def update_error(self, agent_id: str, error_message: str) -> None:
        """Track error occurrences for loop detection.

        Args:
            agent_id: The sub-agent ID
            error_message: The error message that occurred
        """
        with self._lock:
            if agent_id in self.sub_agents:
                state = self.sub_agents[agent_id]

                # Increment error count
                state.error_count += 1

                # Store last error for debugging
                state.last_error = error_message

                # Log repetitive errors
                if state.error_count > 3:
                    logger.warning(
                        "[SubAgentManager] Sub-agent %s has %d errors (last: %s...)",
                        agent_id, state.error_count, error_message[:80]
                    )

    def mark_complete(
        self,
        agent_id: str,
        result: Optional[str] = None,
        error: Optional[str] = None
    ) -> None:
        """Mark a sub-agent as complete."""
        with self._lock:
            if agent_id in self.sub_agents:
                self.sub_agents[agent_id].is_complete = True
                self.sub_agents[agent_id].result = result
                self.sub_agents[agent_id].error = error
                logger.info("[SubAgentManager] Sub-agent completed: %s (success=%s)",
                           agent_id, error is None)

    def get_hung_agents(self) -> list:
        """Get list of sub-agent IDs that appear to be hung.

        Uses adaptive detection:
        - Idle timeout: No progress (message count unchanged) for idle_timeout_seconds
        - Total timeout: Running longer than total_timeout_seconds regardless of progress
        - Loop detection: Repetitive errors or no message growth
        """
        now = time.time()
        hung = []

        with self._lock:
            for agent_id, state in self.sub_agents.items():
                if state.is_complete:
                    continue

                total_runtime = now - state.started_at
                idle_time = now - state.last_activity

                # Check 1: Total timeout (hard cap)
                if total_runtime > self.total_timeout_seconds:
                    hung.append(agent_id)
                    logger.warning(
                        "[SubAgentManager] Sub-agent exceeded total timeout: %s - %s "
                        "(running %.1fs > %ds total limit)",
                        agent_id, state.task_description, total_runtime, self.total_timeout_seconds
                    )
                    continue

                # Check 2: Idle timeout (no progress)
                if idle_time > self.idle_timeout_seconds:
                    hung.append(agent_id)
                    logger.warning(
                        "[SubAgentManager] Sub-agent idle timeout: %s - %s "
                        "(no progress for %.1fs, messages: %d)",
                        agent_id, state.task_description, idle_time, state.message_count
                    )
                    continue

                # Check 3: Loop detection - high error count
                if state.error_count > 5:
                    hung.append(agent_id)
                    logger.warning(
                        "[SubAgentManager] Sub-agent loop detected: %s - %s "
                        "(error count: %d, last error: %s)",
                        agent_id, state.task_description, state.error_count,
                        state.last_error[:100] if state.last_error else "None"
                    )
                    continue

        return hung

    def cleanup_agent(self, agent_id: str) -> None:
        """Clean up a hung sub-agent."""
        with self._lock:
            if agent_id in self.sub_agents:
                state = self.sub_agents[agent_id]
                logger.error(
                    "[SubAgentManager] Cleaning up hung sub-agent: %s - %s (hung for %.1fs)",
                    agent_id,
                    state.task_description,
                    time.time() - state.last_activity
                )

                # Mark as failed
                state.is_complete = True
                total_runtime = time.time() - state.started_at
                idle_time = time.time() - state.last_activity

                # Build detailed error message based on timeout type
                if total_runtime > self.total_timeout_seconds:
                    state.error = (
                        f"Total timeout: Exceeded {self.total_timeout_seconds}s limit "
                        f"(ran {total_runtime:.1f}s, {state.message_count} messages)"
                    )
                elif idle_time > self.idle_timeout_seconds:
                    state.error = (
                        f"Idle timeout: No progress for {idle_time:.1f}s "
                        f"(limit: {self.idle_timeout_seconds}s, {state.message_count} messages)"
                    )
                else:
                    state.error = (
                        f"Loop detected: {state.error_count} errors, "
                        f"last: {state.last_error[:100] if state.last_error else 'None'}"
                    )

    def _watchdog_loop(self) -> None:
        """Watchdog loop that runs in background thread."""
        while self._watchdog_running:
            try:
                hung_agents = self.get_hung_agents()
                for agent_id in hung_agents:
                    self.cleanup_agent(agent_id)

                # Check every 30 seconds
                time.sleep(30)

            except Exception as e:
                logger.error("[SubAgentManager] Watchdog error: %s", e)
                time.sleep(30)

    def get_status(self) -> Dict[str, Any]:
        """Get current status of all sub-agents."""
        now = time.time()
        status = {
            "total": len(self.sub_agents),
            "complete": 0,
            "running": 0,
            "hung": 0,
            "agents": []
        }

        with self._lock:
            for agent_id, state in self.sub_agents.items():
                agent_status = {
                    "id": agent_id,
                    "task": state.task_description,
                    "runtime": now - state.started_at,
                    "idle_time": now - state.last_activity,
                    "complete": state.is_complete,
                    "has_error": state.error is not None
                }

                if state.is_complete:
                    status["complete"] += 1
                elif ((now - state.last_activity) > self.idle_timeout_seconds or
                      (now - state.started_at) > self.total_timeout_seconds):
                    status["hung"] += 1
                else:
                    status["running"] += 1

                status["agents"].append(agent_status)

        return status

    def clear_completed(self) -> None:
        """Remove completed sub-agents from tracking."""
        with self._lock:
            completed = [
                agent_id for agent_id, state in self.sub_agents.items()
                if state.is_complete
            ]
            for agent_id in completed:
                del self.sub_agents[agent_id]

            if completed:
                logger.info("[SubAgentManager] Cleared %d completed sub-agents", len(completed))