From f018800d94b44c93d27b45c59dd06e06561a04fa Mon Sep 17 00:00:00 2001 From: Jordan Ramos Date: Sat, 14 Feb 2026 18:03:42 -0700 Subject: [PATCH] Implement self-healing system Phase 1: Error capture and logging - Add SelfHealingSystem with error observation infrastructure - Capture errors with full context: type, message, stack trace, intent, inputs - Log to MEMORY.md with deduplication (max 3 attempts per error signature) - Integrate error capture in agent, tools, runtime, and scheduler - Non-invasive: preserves all existing error handling behavior - Foundation for future diagnosis and auto-fixing capabilities Phase 1 of 4-phase rollout - observation only, no auto-fixing yet. Co-Authored-By: Claude Sonnet 4.5 --- adapters/runtime.py | 11 ++++ agent.py | 14 ++++- scheduled_tasks.py | 11 ++++ self_healing.py | 135 ++++++++++++++++++++++++++++++++++++++++++++ tools.py | 9 ++- 5 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 self_healing.py diff --git a/adapters/runtime.py b/adapters/runtime.py index 30cd998..91a212d 100644 --- a/adapters/runtime.py +++ b/adapters/runtime.py @@ -181,6 +181,17 @@ class AdapterRuntime: except Exception as e: print(f"[Runtime] Error processing message: {e}") traceback.print_exc() + if hasattr(self.agent, 'healing_system'): + self.agent.healing_system.capture_error( + error=e, + component="adapters/runtime.py:_process_message", + intent=f"Processing message from {message.platform}", + context={ + "platform": message.platform, + "user": message.username, + "message_preview": message.text[:100], + }, + ) await self._send_error_reply(message) async def _send_error_reply(self, message: InboundMessage) -> None: diff --git a/agent.py b/agent.py index 969f1d2..ac75550 100644 --- a/agent.py +++ b/agent.py @@ -7,6 +7,7 @@ from heartbeat import Heartbeat from hooks import HooksSystem from llm_interface import LLMInterface from memory_system import MemorySystem +from self_healing import SelfHealingSystem from tools import 
TOOL_DEFINITIONS, execute_tool # Maximum number of recent messages to include in LLM context @@ -31,6 +32,7 @@ class Agent: self.hooks = HooksSystem() self.conversation_history: List[dict] = [] self._chat_lock = threading.Lock() + self.healing_system = SelfHealingSystem(self.memory, self) self.memory.sync() self.hooks.trigger("agent", "startup", {"workspace_dir": workspace_dir}) @@ -188,6 +190,16 @@ class Agent: except Exception as e: error_msg = f"LLM API error: {e}" print(f"[Agent] {error_msg}") + self.healing_system.capture_error( + error=e, + component="agent.py:_chat_inner", + intent="Calling LLM API for chat response", + context={ + "model": self.llm.model, + "message_preview": user_message[:100], + "iteration": iteration, + }, + ) return f"Sorry, I encountered an error communicating with the AI model. Please try again." # Check stop reason @@ -245,7 +257,7 @@ class Agent: # Execute tools and build tool result message tool_results = [] for tool_use in tool_uses: - result = execute_tool(tool_use.name, tool_use.input) + result = execute_tool(tool_use.name, tool_use.input, healing_system=self.healing_system) # Truncate large tool outputs to prevent token explosion if len(result) > 5000: result = result[:5000] + "\n... 
"""
Self-Healing System - Phase 1: Error Capture and Logging.

Captures all errors with full context and logs them to MEMORY.md.
No auto-fixing in this phase - observation only.
"""

import hashlib
import json
import traceback
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, Optional


@dataclass
class ErrorContext:
    """Full context for a captured error."""

    error_type: str  # Exception class name
    message: str  # Error message (str() of the exception)
    stack_trace: str  # Formatted traceback of the captured exception
    component: str  # Where it happened (e.g., "tools.py:read_file")
    intent: str  # What was being attempted
    context: Dict[str, Any]  # Additional context (tool inputs, user message, etc.)
    timestamp: str  # ISO 8601 format


class SelfHealingSystem:
    """
    Phase 1: Error observation infrastructure.

    Captures errors with full context, deduplicates them via a short
    signature, and hands a markdown entry to the memory system. Attempt
    counts live in an in-memory dict on this instance, so they reset
    whenever a new SelfHealingSystem is created.
    """

    def __init__(
        self,
        memory_system: Any,
        agent: Any,
        max_logged_attempts: int = 3,
    ) -> None:
        """Create the observer.

        Args:
            memory_system: Object exposing ``write_memory(text, daily=...)``.
            agent: Owning agent; stored but not used in this phase.
            max_logged_attempts: How many occurrences of the same error
                signature are written to memory before logging is suppressed.
        """
        self.memory = memory_system
        self.agent = agent
        self._max_logged_attempts = max_logged_attempts
        self._error_counts: Dict[str, int] = {}

    def capture_error(
        self,
        error: Exception,
        component: str,
        intent: str,
        context: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Capture an error with full context and log it.

        This method is called from inside callers' ``except`` handlers, so it
        never raises: any failure while recording is reported on stdout and
        swallowed, leaving the caller's own error handling untouched.

        Args:
            error: The exception that occurred.
            component: Where the error happened (e.g., "tools.py:read_file").
            intent: What was being attempted when the error occurred.
            context: Additional context such as tool inputs, user message, etc.
        """
        try:
            self._record(error, component, intent, context)
        except Exception as capture_failure:
            # Deliberately avoid str(error) here: it may be what failed.
            print(
                f"[SelfHealing] Failed to capture error in {component}: "
                f"{type(capture_failure).__name__}"
            )

    def _record(
        self,
        error: Exception,
        component: str,
        intent: str,
        context: Optional[Dict[str, Any]],
    ) -> None:
        """Build the ErrorContext, update the attempt count, and log it."""
        error_ctx = ErrorContext(
            error_type=type(error).__name__,
            message=str(error),
            stack_trace=self._format_stack_trace(error),
            component=component,
            intent=intent,
            context=context or {},
            timestamp=datetime.now().isoformat(),
        )

        signature = self._generate_signature(error_ctx)

        # Track how many times this exact signature has been seen.
        attempt = self._error_counts.get(signature, 0) + 1
        self._error_counts[signature] = attempt

        limit = self._max_logged_attempts
        if attempt <= limit:
            self._log_error(error_ctx, attempt)
            print(
                f"[SelfHealing] Error captured: {error_ctx.error_type} "
                f"in {error_ctx.component} (attempt {attempt}/{limit})"
            )
        else:
            # Past the limit nothing is written to memory; say so instead of
            # printing a misleading "attempt 7/3".
            print(
                f"[SelfHealing] Error captured: {error_ctx.error_type} "
                f"in {error_ctx.component} "
                f"(attempt {attempt}, logging suppressed after {limit})"
            )

    @staticmethod
    def _format_stack_trace(error: BaseException) -> str:
        """Return the formatted traceback belonging to ``error`` itself.

        ``traceback.format_exc()`` reports whichever exception is currently
        being handled, which is wrong (or "NoneType: None") when the caller
        is outside an ``except`` block or handling a different exception.
        """
        return "".join(
            traceback.format_exception(type(error), error, error.__traceback__)
        )

    def _generate_signature(self, error_ctx: ErrorContext) -> str:
        """Generate a deduplication signature for an error.

        Uses the first 8 hex characters of the SHA-256 hash of the error
        type, component, and message joined with ':'.
        """
        raw = f"{error_ctx.error_type}:{error_ctx.component}:{error_ctx.message}"
        return hashlib.sha256(raw.encode()).hexdigest()[:8]

    def _log_error(self, error_ctx: ErrorContext, attempt: int) -> None:
        """Format an error as a markdown entry and pass it to the memory system.

        The entry is written with ``memory.write_memory(entry, daily=False)``
        (intended to target the persistent MEMORY.md rather than a daily
        log). If that call raises, the failure is printed and swallowed.
        """
        # Serialize context to JSON for readability; default=str stringifies
        # values json cannot encode natively.
        try:
            context_json = json.dumps(error_ctx.context, indent=2, default=str)
        except (TypeError, ValueError):
            context_json = str(error_ctx.context)

        # Format timestamp for the header; fall back to the raw string if it
        # is not valid ISO 8601.
        try:
            dt = datetime.fromisoformat(error_ctx.timestamp)
            header_time = dt.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            header_time = error_ctx.timestamp

        log_entry = (
            f"## Error Log - {header_time}\n"
            f"\n"
            f"**Type**: {error_ctx.error_type}\n"
            f"**Component**: {error_ctx.component}\n"
            f"**Intent**: {error_ctx.intent}\n"
            f"**Attempt**: {attempt}/{self._max_logged_attempts}\n"
            f"**Message**: {error_ctx.message}\n"
            f"\n"
            f"**Context**:\n"
            f"```json\n"
            f"{context_json}\n"
            f"```\n"
            f"\n"
            f"**Stack Trace**:\n"
            f"```\n"
            f"{error_ctx.stack_trace}\n"
            f"```\n"
            f"---"
        )

        try:
            self.memory.write_memory(log_entry, daily=False)
        except Exception as e:
            # Last resort: print to console if memory write fails
            print(f"[SelfHealing] Failed to write error log to MEMORY.md: {e}")
            print(f"[SelfHealing] Error was: {error_ctx.error_type}: {error_ctx.message}")