Implement self-healing system Phase 1: Error capture and logging

- Add SelfHealingSystem with error observation infrastructure
- Capture errors with full context: type, message, stack trace, intent, inputs
- Log to MEMORY.md with deduplication (max 3 attempts per error signature)
- Integrate error capture in agent, tools, runtime, and scheduler
- Non-invasive: preserves all existing error handling behavior
- Foundation for future diagnosis and auto-fixing capabilities

Phase 1 of 4-phase rollout - observation only, no auto-fixing yet.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-14 18:03:42 -07:00
parent 0eb5d2cab4
commit f018800d94
5 changed files with 178 additions and 2 deletions
--- a/adapters/runtime.py
+++ b/adapters/runtime.py
@@ -181,6 +181,17 @@ class AdapterRuntime:
        except Exception as e:
            print(f"[Runtime] Error processing message: {e}")
            traceback.print_exc()
+            if hasattr(self.agent, 'healing_system'):
+                self.agent.healing_system.capture_error(
+                    error=e,
+                    component="adapters/runtime.py:_process_message",
+                    intent=f"Processing message from {message.platform}",
+                    context={
+                        "platform": message.platform,
+                        "user": message.username,
+                        "message_preview": message.text[:100],
+                    },
+                )
            await self._send_error_reply(message)

    async def _send_error_reply(self, message: InboundMessage) -> None:
--- a/agent.py
+++ b/agent.py
@@ -7,6 +7,7 @@ from heartbeat import Heartbeat
 from hooks import HooksSystem
 from llm_interface import LLMInterface
 from memory_system import MemorySystem
+from self_healing import SelfHealingSystem
 from tools import TOOL_DEFINITIONS, execute_tool

 # Maximum number of recent messages to include in LLM context
@@ -31,6 +32,7 @@ class Agent:
        self.hooks = HooksSystem()
        self.conversation_history: List[dict] = []
        self._chat_lock = threading.Lock()
+        self.healing_system = SelfHealingSystem(self.memory, self)

        self.memory.sync()
        self.hooks.trigger("agent", "startup", {"workspace_dir": workspace_dir})
@@ -188,6 +190,16 @@ class Agent:
            except Exception as e:
                error_msg = f"LLM API error: {e}"
                print(f"[Agent] {error_msg}")
+                self.healing_system.capture_error(
+                    error=e,
+                    component="agent.py:_chat_inner",
+                    intent="Calling LLM API for chat response",
+                    context={
+                        "model": self.llm.model,
+                        "message_preview": user_message[:100],
+                        "iteration": iteration,
+                    },
+                )
                return f"Sorry, I encountered an error communicating with the AI model. Please try again."

            # Check stop reason
@@ -245,7 +257,7 @@ class Agent:
                # Execute tools and build tool result message
                tool_results = []
                for tool_use in tool_uses:
-                    result = execute_tool(tool_use.name, tool_use.input)
+                    result = execute_tool(tool_use.name, tool_use.input, healing_system=self.healing_system)
                    # Truncate large tool outputs to prevent token explosion
                    if len(result) > 5000:
                        result = result[:5000] + "\n... (output truncated)"
--- a/scheduled_tasks.py
+++ b/scheduled_tasks.py
@@ -345,6 +345,17 @@ class TaskScheduler:
            print(f"[Scheduler] Task failed: {task.name}")
            print(f"  Error: {e}")
            traceback.print_exc()
+            if self.agent and hasattr(self.agent, 'healing_system'):
+                self.agent.healing_system.capture_error(
+                    error=e,
+                    component="scheduled_tasks.py:_execute_task",
+                    intent=f"Executing scheduled task: {task.name}",
+                    context={
+                        "task_name": task.name,
+                        "schedule": task.schedule,
+                        "prompt": task.prompt[:100],
+                    },
+                )

    async def _send_to_platform(
        self, task: ScheduledTask, response: str
--- a/self_healing.py
+++ b/self_healing.py
@@ -0,0 +1,135 @@
+"""
+Self-Healing System - Phase 1: Error Capture and Logging.
+
+Captures all errors with full context and logs them to MEMORY.md.
+No auto-fixing in this phase - observation only.
+"""
+
+import hashlib
+import json
+import traceback
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class ErrorContext:
+    """Snapshot of one captured error, as assembled by SelfHealingSystem.capture_error."""
+
+    error_type: str           # Exception class name, i.e. type(error).__name__
+    message: str              # str(error)
+    stack_trace: str          # Formatted traceback text captured with the error
+    component: str            # Where it happened (e.g., "tools.py:read_file")
+    intent: str               # What was being attempted
+    context: Dict[str, Any]   # Caller-supplied extras (tool inputs, message preview, ...); {} if none
+    timestamp: str            # datetime.now().isoformat() at capture time (naive local time)
+
+
+class SelfHealingSystem:
+    """Phase 1 error observer: capture, deduplicate and log errors.
+
+    Every captured error is counted per signature and announced on stdout;
+    only the first MAX_LOGGED_ATTEMPTS occurrences of a signature are
+    written to MEMORY.md. Nothing here diagnoses or fixes errors.
+    """
+
+    MAX_LOGGED_ATTEMPTS = 3  # occurrences of one signature that get persisted
+
+    def __init__(self, memory_system: Any, agent: Any) -> None:
+        """Keep the collaborators and start with empty occurrence counters.
+
+        Args:
+            memory_system: Object exposing ``write_memory(text, daily=...)``.
+            agent: Owning agent; stored, but not used by this class.
+        """
+        self.memory = memory_system
+        self.agent = agent
+        self._error_counts: Dict[str, int] = {}  # signature -> occurrences
+
+    def capture_error(
+        self,
+        error: Exception,
+        component: str,
+        intent: str,
+        context: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """Record one occurrence of ``error`` without propagating failures.
+
+        Call sites invoke this from their own ``except`` blocks, so a failure
+        in here is printed rather than allowed to interrupt their handling.
+
+        Args:
+            error: The exception that occurred.
+            component: Where the error happened (e.g., "tools.py:read_file").
+            intent: What was being attempted when the error occurred.
+            context: Additional context such as tool inputs, user message, etc.
+        """
+        try:
+            # Format from the exception object rather than format_exc(), which
+            # returns "NoneType: None" when no exception is being handled.
+            trace = traceback.format_exception(type(error), error, error.__traceback__)
+            error_ctx = ErrorContext(
+                error_type=type(error).__name__,
+                message=str(error),
+                stack_trace="".join(trace),
+                component=component,
+                intent=intent,
+                context=context or {},
+                timestamp=datetime.now().isoformat(),
+            )
+            signature = self._generate_signature(error_ctx)
+            attempt = self._error_counts.get(signature, 0) + 1
+            self._error_counts[signature] = attempt
+
+            if attempt <= self.MAX_LOGGED_ATTEMPTS:
+                self._log_error(error_ctx, attempt)
+            print(
+                f"[SelfHealing] Error captured: {error_ctx.error_type} in "
+                f"{error_ctx.component} (attempt {attempt}/{self.MAX_LOGGED_ATTEMPTS})"
+            )
+        except Exception as capture_failure:
+            print(f"[SelfHealing] Failed to capture error: {capture_failure}")
+
+    def _generate_signature(self, error_ctx: ErrorContext) -> str:
+        """Return the 8-hex-character deduplication key for ``error_ctx``.
+
+        It is the start of the SHA-256 digest of "type:component:message";
+        intent, context and timestamp do not affect it.
+        """
+        raw = f"{error_ctx.error_type}:{error_ctx.component}:{error_ctx.message}"
+        return hashlib.sha256(raw.encode()).hexdigest()[:8]
+
+    def _log_error(self, error_ctx: ErrorContext, attempt: int) -> None:
+        """Append a markdown entry for ``error_ctx`` to MEMORY.md.
+
+        The entry goes through ``self.memory.write_memory(..., daily=False)``;
+        a failed write is reported on stdout instead of being raised.
+        """
+        try:
+            context_json = json.dumps(error_ctx.context, indent=2, default=str)
+        except (TypeError, ValueError):  # e.g. bad key types, circular refs
+            context_json = str(error_ctx.context)
+
+        try:
+            dt = datetime.fromisoformat(error_ctx.timestamp)
+            header_time = dt.strftime("%Y-%m-%d %H:%M:%S")
+        except ValueError:
+            header_time = error_ctx.timestamp
+
+        log_entry = (
+            f"## Error Log - {header_time}\n\n"
+            f"**Type**: {error_ctx.error_type}\n"
+            f"**Component**: {error_ctx.component}\n"
+            f"**Intent**: {error_ctx.intent}\n"
+            f"**Attempt**: {attempt}/{self.MAX_LOGGED_ATTEMPTS}\n"
+            f"**Message**: {error_ctx.message}\n\n"
+            f"**Context**:\n```json\n{context_json}\n```\n\n"
+            f"**Stack Trace**:\n```\n{error_ctx.stack_trace}\n```\n---"
+        )
+
+        try:
+            self.memory.write_memory(log_entry, daily=False)
+        except Exception as e:
+            print(f"[SelfHealing] Failed to write error log to MEMORY.md: {e}")
+            print(f"[SelfHealing] Error was: {error_ctx.error_type}: {error_ctx.message}")
--- a/tools.py
+++ b/tools.py
@@ -324,7 +324,7 @@ TOOL_DEFINITIONS = [
 ]


-def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> str:
+def execute_tool(tool_name: str, tool_input: Dict[str, Any], healing_system: Any = None) -> str:
    """Execute a tool and return the result as a string."""
    try:
        # File tools
@@ -407,6 +407,13 @@ def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> str:
        else:
            return f"Error: Unknown tool '{tool_name}'"
    except Exception as e:
+        if healing_system:
+            healing_system.capture_error(
+                error=e,
+                component=f"tools.py:{tool_name}",
+                intent=f"Executing {tool_name} tool",
+                context={"tool_name": tool_name, "input": tool_input},
+            )
        return f"Error executing {tool_name}: {str(e)}"