Implement self-healing system Phase 1: Error capture and logging

- Add SelfHealingSystem with error observation infrastructure
- Capture errors with full context: type, message, stack trace, intent, inputs
- Log to MEMORY.md with deduplication (at most 3 log entries per error signature)
- Integrate error capture in agent, tools, runtime, and scheduler
- Non-invasive: preserves all existing error handling behavior
- Foundation for future diagnosis and auto-fixing capabilities

Phase 1 of a 4-phase rollout - observation only, no auto-fixing yet.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-14 18:03:42 -07:00
parent 0eb5d2cab4
commit f018800d94
5 changed files with 178 additions and 2 deletions

View File

@@ -181,6 +181,17 @@ class AdapterRuntime:
except Exception as e: except Exception as e:
print(f"[Runtime] Error processing message: {e}") print(f"[Runtime] Error processing message: {e}")
traceback.print_exc() traceback.print_exc()
if hasattr(self.agent, 'healing_system'):
self.agent.healing_system.capture_error(
error=e,
component="adapters/runtime.py:_process_message",
intent=f"Processing message from {message.platform}",
context={
"platform": message.platform,
"user": message.username,
"message_preview": message.text[:100],
},
)
await self._send_error_reply(message) await self._send_error_reply(message)
async def _send_error_reply(self, message: InboundMessage) -> None: async def _send_error_reply(self, message: InboundMessage) -> None:

View File

@@ -7,6 +7,7 @@ from heartbeat import Heartbeat
from hooks import HooksSystem from hooks import HooksSystem
from llm_interface import LLMInterface from llm_interface import LLMInterface
from memory_system import MemorySystem from memory_system import MemorySystem
from self_healing import SelfHealingSystem
from tools import TOOL_DEFINITIONS, execute_tool from tools import TOOL_DEFINITIONS, execute_tool
# Maximum number of recent messages to include in LLM context # Maximum number of recent messages to include in LLM context
@@ -31,6 +32,7 @@ class Agent:
self.hooks = HooksSystem() self.hooks = HooksSystem()
self.conversation_history: List[dict] = [] self.conversation_history: List[dict] = []
self._chat_lock = threading.Lock() self._chat_lock = threading.Lock()
self.healing_system = SelfHealingSystem(self.memory, self)
self.memory.sync() self.memory.sync()
self.hooks.trigger("agent", "startup", {"workspace_dir": workspace_dir}) self.hooks.trigger("agent", "startup", {"workspace_dir": workspace_dir})
@@ -188,6 +190,16 @@ class Agent:
except Exception as e: except Exception as e:
error_msg = f"LLM API error: {e}" error_msg = f"LLM API error: {e}"
print(f"[Agent] {error_msg}") print(f"[Agent] {error_msg}")
self.healing_system.capture_error(
error=e,
component="agent.py:_chat_inner",
intent="Calling LLM API for chat response",
context={
"model": self.llm.model,
"message_preview": user_message[:100],
"iteration": iteration,
},
)
return f"Sorry, I encountered an error communicating with the AI model. Please try again." return f"Sorry, I encountered an error communicating with the AI model. Please try again."
# Check stop reason # Check stop reason
@@ -245,7 +257,7 @@ class Agent:
# Execute tools and build tool result message # Execute tools and build tool result message
tool_results = [] tool_results = []
for tool_use in tool_uses: for tool_use in tool_uses:
result = execute_tool(tool_use.name, tool_use.input) result = execute_tool(tool_use.name, tool_use.input, healing_system=self.healing_system)
# Truncate large tool outputs to prevent token explosion # Truncate large tool outputs to prevent token explosion
if len(result) > 5000: if len(result) > 5000:
result = result[:5000] + "\n... (output truncated)" result = result[:5000] + "\n... (output truncated)"

View File

@@ -345,6 +345,17 @@ class TaskScheduler:
print(f"[Scheduler] Task failed: {task.name}") print(f"[Scheduler] Task failed: {task.name}")
print(f" Error: {e}") print(f" Error: {e}")
traceback.print_exc() traceback.print_exc()
if self.agent and hasattr(self.agent, 'healing_system'):
self.agent.healing_system.capture_error(
error=e,
component="scheduled_tasks.py:_execute_task",
intent=f"Executing scheduled task: {task.name}",
context={
"task_name": task.name,
"schedule": task.schedule,
"prompt": task.prompt[:100],
},
)
async def _send_to_platform( async def _send_to_platform(
self, task: ScheduledTask, response: str self, task: ScheduledTask, response: str

135
self_healing.py Normal file
View File

@@ -0,0 +1,135 @@
"""
Self-Healing System - Phase 1: Error Capture and Logging.
Captures errors with full context and logs them to MEMORY.md (at most three entries per error signature).
No auto-fixing in this phase - observation only.
"""
import hashlib
import json
import traceback
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, Optional
@dataclass
class ErrorContext:
    """Snapshot of a single captured error and the circumstances around it.

    Attributes:
        error_type: Exception class name.
        message: Error message.
        stack_trace: Full traceback.
        component: Where it happened (e.g., "tools.py:read_file").
        intent: What was being attempted.
        context: Additional context (tool inputs, user message, etc.).
        timestamp: ISO 8601 format.
    """

    error_type: str
    message: str
    stack_trace: str
    component: str
    intent: str
    context: Dict[str, Any]
    timestamp: str
class SelfHealingSystem:
"""
Phase 1: Error observation infrastructure.
Captures errors with full context, deduplicates via error signatures,
and logs them to MEMORY.md for future analysis.
"""
def __init__(self, memory_system: Any, agent: Any) -> None:
self.memory = memory_system
self.agent = agent
self._error_counts: Dict[str, int] = {}
def capture_error(
self,
error: Exception,
component: str,
intent: str,
context: Optional[Dict[str, Any]] = None,
) -> None:
"""Capture an error with full context and log it.
Args:
error: The exception that occurred.
component: Where the error happened (e.g., "tools.py:read_file").
intent: What was being attempted when the error occurred.
context: Additional context such as tool inputs, user message, etc.
"""
error_ctx = ErrorContext(
error_type=type(error).__name__,
message=str(error),
stack_trace=traceback.format_exc(),
component=component,
intent=intent,
context=context or {},
timestamp=datetime.now().isoformat(),
)
signature = self._generate_signature(error_ctx)
# Track attempt count
self._error_counts[signature] = self._error_counts.get(signature, 0) + 1
attempt = self._error_counts[signature]
if attempt <= 3:
self._log_error(error_ctx, attempt)
print(
f"[SelfHealing] Error captured: {error_ctx.error_type} "
f"in {error_ctx.component} (attempt {attempt}/3)"
)
def _generate_signature(self, error_ctx: ErrorContext) -> str:
"""Generate a deduplication signature for an error.
Uses first 8 characters of SHA-256 hash of error type,
component, and message combined.
"""
raw = f"{error_ctx.error_type}:{error_ctx.component}:{error_ctx.message}"
return hashlib.sha256(raw.encode()).hexdigest()[:8]
def _log_error(self, error_ctx: ErrorContext, attempt: int) -> None:
"""Log an error to MEMORY.md via the memory system.
Formats the error as a markdown entry and appends it to
the persistent MEMORY.md file (daily=False).
"""
# Serialize context to JSON for readability
try:
context_json = json.dumps(error_ctx.context, indent=2, default=str)
except (TypeError, ValueError):
context_json = str(error_ctx.context)
# Format timestamp for the header
try:
dt = datetime.fromisoformat(error_ctx.timestamp)
header_time = dt.strftime("%Y-%m-%d %H:%M:%S")
except ValueError:
header_time = error_ctx.timestamp
log_entry = (
f"## Error Log - {header_time}\n"
f"\n"
f"**Type**: {error_ctx.error_type}\n"
f"**Component**: {error_ctx.component}\n"
f"**Intent**: {error_ctx.intent}\n"
f"**Attempt**: {attempt}/3\n"
f"**Message**: {error_ctx.message}\n"
f"\n"
f"**Context**:\n"
f"```json\n"
f"{context_json}\n"
f"```\n"
f"\n"
f"**Stack Trace**:\n"
f"```\n"
f"{error_ctx.stack_trace}\n"
f"```\n"
f"---"
)
try:
self.memory.write_memory(log_entry, daily=False)
except Exception as e:
# Last resort: print to console if memory write fails
print(f"[SelfHealing] Failed to write error log to MEMORY.md: {e}")
print(f"[SelfHealing] Error was: {error_ctx.error_type}: {error_ctx.message}")

View File

@@ -324,7 +324,7 @@ TOOL_DEFINITIONS = [
] ]
def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> str: def execute_tool(tool_name: str, tool_input: Dict[str, Any], healing_system: Any = None) -> str:
"""Execute a tool and return the result as a string.""" """Execute a tool and return the result as a string."""
try: try:
# File tools # File tools
@@ -407,6 +407,13 @@ def execute_tool(tool_name: str, tool_input: Dict[str, Any]) -> str:
else: else:
return f"Error: Unknown tool '{tool_name}'" return f"Error: Unknown tool '{tool_name}'"
except Exception as e: except Exception as e:
if healing_system:
healing_system.capture_error(
error=e,
component=f"tools.py:{tool_name}",
intent=f"Executing {tool_name} tool",
context={"tool_name": tool_name, "input": tool_input},
)
return f"Error executing {tool_name}: {str(e)}" return f"Error executing {tool_name}: {str(e)}"