Implement self-healing system Phase 1: Error capture and logging

- Add SelfHealingSystem with error observation infrastructure
- Capture errors with full context: type, message, stack trace, intent, inputs
- Log to MEMORY.md with deduplication (max 3 attempts per error signature)
- Integrate error capture in agent, tools, runtime, and scheduler
- Non-invasive: preserves all existing error handling behavior
- Foundation for future diagnosis and auto-fixing capabilities

Phase 1 of 4-phase rollout - observation only, no auto-fixing yet.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-14 18:03:42 -07:00
parent 0eb5d2cab4
commit f018800d94
5 changed files with 178 additions and 2 deletions
--- a/agent.py
+++ b/agent.py
@@ -7,6 +7,7 @@ from heartbeat import Heartbeat
 from hooks import HooksSystem
 from llm_interface import LLMInterface
 from memory_system import MemorySystem
+from self_healing import SelfHealingSystem
 from tools import TOOL_DEFINITIONS, execute_tool

 # Maximum number of recent messages to include in LLM context
@@ -31,6 +32,7 @@ class Agent:
        self.hooks = HooksSystem()
        self.conversation_history: List[dict] = []
        self._chat_lock = threading.Lock()
+        self.healing_system = SelfHealingSystem(self.memory, self)

        self.memory.sync()
        self.hooks.trigger("agent", "startup", {"workspace_dir": workspace_dir})
@@ -188,6 +190,16 @@ class Agent:
            except Exception as e:
                error_msg = f"LLM API error: {e}"
                print(f"[Agent] {error_msg}")
+                self.healing_system.capture_error(
+                    error=e,
+                    component="agent.py:_chat_inner",
+                    intent="Calling LLM API for chat response",
+                    context={
+                        "model": self.llm.model,
+                        "message_preview": user_message[:100],
+                        "iteration": iteration,
+                    },
+                )
                return f"Sorry, I encountered an error communicating with the AI model. Please try again."

            # Check stop reason
@@ -245,7 +257,7 @@ class Agent:
                # Execute tools and build tool result message
                tool_results = []
                for tool_use in tool_uses:
-                    result = execute_tool(tool_use.name, tool_use.input)
+                    result = execute_tool(tool_use.name, tool_use.input, healing_system=self.healing_system)
                    # Truncate large tool outputs to prevent token explosion
                    if len(result) > 5000:
                        result = result[:5000] + "\n... (output truncated)"