self_healing.py

"""
Self-Healing System - Phase 1: Error Capture and Logging.

Captures all errors with full context and logs them to the memory system's
daily log and to a per-day JSONL export.
No auto-fixing in this phase - observation only.
"""

import hashlib
import json
import threading
import traceback
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional


@dataclass
class ErrorContext:
    """Snapshot of one captured error and the circumstances around it.

    Every field is required and stored exactly as given; the class adds
    nothing beyond what ``@dataclass`` generates.
    """

    # Name of the exception class.
    error_type: str
    # Text of the error message.
    message: str
    # Complete traceback text.
    stack_trace: str
    # Location of the failure, e.g. "tools.py:read_file".
    component: str
    # What was being attempted at the time.
    intent: str
    # Extra details (tool inputs, user message, etc.).
    context: Dict[str, Any]
    # Timestamp in ISO 8601 format.
    timestamp: str


class SelfHealingSystem:
    """
    Phase 1: Error observation infrastructure.

    Captures errors with full context, deduplicates via error signatures,
    and records each occurrence (up to ``MAX_LOGGED_ATTEMPTS`` per
    signature) in two places: the memory system's daily log and a per-day
    JSONL file. Observation only - nothing here attempts a fix.
    """

    # Occurrences of one signature that are written out. Later repeats are
    # still counted and echoed to the console, but no longer logged.
    MAX_LOGGED_ATTEMPTS = 3

    def __init__(self, memory_system: Any, agent: Any) -> None:
        """Store collaborators and start with an empty occurrence table.

        Args:
            memory_system: Object exposing ``write_memory(text, daily=...)``
                and, optionally, a ``workspace_dir`` attribute used as the
                root for the JSONL export.
            agent: Kept as ``self.agent``; not used by the code in this class.
        """
        self.memory = memory_system
        self.agent = agent
        # signature -> number of times that signature has been captured
        self._error_counts: Dict[str, int] = {}

    def capture_error(
        self,
        error: Exception,
        component: str,
        intent: str,
        context: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Capture an error with full context and log it.

        The occurrence is counted per signature; only the first
        ``MAX_LOGGED_ATTEMPTS`` occurrences are written to the logs. A
        console line is printed for every occurrence, including those past
        the cap.

        Args:
            error: The exception that occurred.
            component: Where the error happened (e.g., "tools.py:read_file").
            intent: What was being attempted when the error occurred.
            context: Additional context such as tool inputs, user message, etc.
        """
        error_ctx = ErrorContext(
            error_type=type(error).__name__,
            message=str(error),
            stack_trace=self._format_stack_trace(error),
            component=component,
            intent=intent,
            context=context or {},
            timestamp=datetime.now().isoformat(),
        )

        signature = self._generate_signature(error_ctx)

        # Track attempt count
        attempt = self._error_counts.get(signature, 0) + 1
        self._error_counts[signature] = attempt

        if attempt <= self.MAX_LOGGED_ATTEMPTS:
            self._log_error(error_ctx, attempt)

        print(
            f"[SelfHealing] Error captured: {error_ctx.error_type} "
            f"in {error_ctx.component} "
            f"(attempt {attempt}/{self.MAX_LOGGED_ATTEMPTS})"
        )

    @staticmethod
    def _format_stack_trace(error: BaseException) -> str:
        """Render the traceback carried by ``error`` itself.

        ``traceback.format_exc()`` formats whichever exception is currently
        being handled, which is not necessarily ``error``: outside an
        ``except`` block it yields ``NoneType: None``. Formatting from the
        exception object gives the same text when ``error`` is the active
        exception and the right text otherwise.
        """
        return "".join(
            traceback.format_exception(type(error), error, error.__traceback__)
        )

    def _generate_signature(self, error_ctx: ErrorContext) -> str:
        """Generate a deduplication signature for an error.

        Uses first 8 characters of SHA-256 hash of error type,
        component, and message combined.
        """
        raw = f"{error_ctx.error_type}:{error_ctx.component}:{error_ctx.message}"
        return hashlib.sha256(raw.encode()).hexdigest()[:8]

    def _log_error(self, error_ctx: ErrorContext, attempt: int) -> None:
        """Write one occurrence to the JSONL export and the memory system.

        The markdown entry goes through ``write_memory(..., daily=True)``.
        Neither destination is allowed to raise: failures are reported on
        the console so that logging an error never produces another one.
        """
        log_entry = self._format_log_entry(error_ctx, attempt)

        # RSO Phase 1: also export to JSONL for structured analysis
        self._export_jsonl(error_ctx, attempt)

        try:
            self.memory.write_memory(log_entry, daily=True)
        except Exception as e:
            # Last resort: print to console if memory write fails.
            # NOTE(review): the text still says MEMORY.md although the write
            # uses daily=True; kept as-is in case anything matches on it.
            print(f"[SelfHealing] Failed to write error log to MEMORY.md: {e}")
            print(f"[SelfHealing] Error was: {error_ctx.error_type}: {error_ctx.message}")

    def _format_log_entry(self, error_ctx: ErrorContext, attempt: int) -> str:
        """Build the markdown entry for one occurrence."""
        # Serialize context to JSON for readability
        try:
            context_json = json.dumps(error_ctx.context, indent=2, default=str)
        except (TypeError, ValueError):
            context_json = str(error_ctx.context)

        # Format timestamp for the header; fall back to the raw string if it
        # is not parseable as ISO 8601.
        try:
            dt = datetime.fromisoformat(error_ctx.timestamp)
            header_time = dt.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            header_time = error_ctx.timestamp

        return (
            f"## Error Log - {header_time}\n"
            f"\n"
            f"**Type**: {error_ctx.error_type}\n"
            f"**Component**: {error_ctx.component}\n"
            f"**Intent**: {error_ctx.intent}\n"
            f"**Attempt**: {attempt}/{self.MAX_LOGGED_ATTEMPTS}\n"
            f"**Message**: {error_ctx.message}\n"
            f"\n"
            f"**Context**:\n"
            f"```json\n"
            f"{context_json}\n"
            f"```\n"
            f"\n"
            f"**Stack Trace**:\n"
            f"```\n"
            f"{error_ctx.stack_trace}\n"
            f"```\n"
            f"---"
        )

    def _export_jsonl(self, error_ctx: ErrorContext, attempt: int) -> None:
        """Append one record to ``<workspace>/observation/errors/<date>.jsonl``.

        Directory setup and record building run in the calling thread; the
        file append itself is handed to a daemon thread.
        """
        try:
            workspace = Path(getattr(self.memory, 'workspace_dir', './memory_workspace'))
            errors_dir = workspace / "observation" / "errors"
            errors_dir.mkdir(parents=True, exist_ok=True)
            error_date = error_ctx.timestamp[:10]  # YYYY-MM-DD
            log_path = errors_dir / f"{error_date}.jsonl"

            # Round-trip through JSON so the record only holds JSON-native
            # values; anything that still fails is stored as its str().
            try:
                ctx_serializable = json.loads(json.dumps(error_ctx.context, default=str))
            except Exception:
                ctx_serializable = str(error_ctx.context)

            record = {
                "record_type": "error",
                "timestamp": error_ctx.timestamp,
                "error_type": error_ctx.error_type,
                "message": error_ctx.message[:500],
                "component": error_ctx.component,
                "intent": error_ctx.intent,
                "attempt": attempt,
                "context": ctx_serializable,
                "self_healed": False,  # Phase 1: observation only
            }

            threading.Thread(
                target=self._append_jsonl_line,
                args=(log_path, record),
                daemon=True,
            ).start()
        except Exception as jsonl_err:
            print(f"[SelfHealing] JSONL export setup failed: {jsonl_err}")

    @staticmethod
    def _append_jsonl_line(path: Path, record: dict) -> None:
        """Serialize ``record`` and append it as one line to ``path``."""
        try:
            line = json.dumps(record, default=str, ensure_ascii=False)
            with open(path, "a", encoding="utf-8") as fh:
                fh.write(line + "\n")
        except Exception as exc:
            print(f"[SelfHealing] JSONL write failed: {exc}")
Implement self-healing system Phase 1: Error capture and logging - Add SelfHealingSystem with error observation infrastructure - Capture errors with full context: type, message, stack trace, intent, inputs - Log to MEMORY.md with deduplication (max 3 attempts per error signature) - Integrate error capture in agent, tools, runtime, and scheduler - Non-invasive: preserves all existing error handling behavior - Foundation for future diagnosis and auto-fixing capabilities Phase 1 of 4-phase rollout - observation only, no auto-fixing yet. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> 2026-02-14 18:03:42 -07:00			`"""`
			`Self-Healing System - Phase 1: Error Capture and Logging.`

			`Captures all errors with full context and logs them to MEMORY.md.`
			`No auto-fixing in this phase - observation only.`
			`"""`

			`import hashlib`
			`import json`
feat: RSO observation system, child safety, Discord adapter, Telegram watchdog, email attachments Core agent improvements: - RSO (Relevance Scoring & Observation) system: interaction_logger, memory_scorer, signal_detector - Memory access logging (memory_access_log table) for relevance scoring; high-signal turn detection - Rich conversation storage for notable turns; compact_conversation truncates long user messages - Task-type classifier (query/action/analysis/creative) for observation tagging - Nested sub-agent visibility: deep delegations now register against the main agent's manager Child safety (Gabriel profile): - child_safety.py: filtering, audit logging, prompt constants for restricted sessions - .kiro/specs/child-safety-profile: requirements, design, tasks specs - GABRIEL_BOT_PROPOSAL.md: initial proposal doc - Reduced context window (10 msgs) and tutor-mode identity for restricted users Telegram adapter: - Polling watchdog: auto-restarts updater if polling drops unexpectedly - get_me() with exponential-backoff retry on NetworkError at startup - Correct stop() ordering: signal watchdog before cancelling tasks Email / Gmail: - send_email: supports file attachments (attachments list param) - get_email: surfaces attachment metadata in response Scheduled tasks / weather: - Remove OpenWeatherMap API calls from morning-weather task; use wttr.in exclusively - New scheduled tasks and scheduler state persistence Discord: - adapters/discord/__init__.py scaffold - discord-plugin: MCP plugin for Claude Code Discord integration (server.ts, skills, config) Infrastructure: - n8n workflow exports (garvis_webhook, content_pipeline variants) - memory_workspace: context, homelab-repo-updates, weekly observation summaries, error logs - UCS C240 migration plan doc - requirements.txt: new deps - .claude/settings.json, fix_hooks.py: hook/permission tuning 2026-04-23 07:54:01 -06:00			`import threading`
Implement self-healing system Phase 1: Error capture and logging - Add SelfHealingSystem with error observation infrastructure - Capture errors with full context: type, message, stack trace, intent, inputs - Log to MEMORY.md with deduplication (max 3 attempts per error signature) - Integrate error capture in agent, tools, runtime, and scheduler - Non-invasive: preserves all existing error handling behavior - Foundation for future diagnosis and auto-fixing capabilities Phase 1 of 4-phase rollout - observation only, no auto-fixing yet. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> 2026-02-14 18:03:42 -07:00			`import traceback`
			`from dataclasses import dataclass`
			`from datetime import datetime`
feat: RSO observation system, child safety, Discord adapter, Telegram watchdog, email attachments Core agent improvements: - RSO (Relevance Scoring & Observation) system: interaction_logger, memory_scorer, signal_detector - Memory access logging (memory_access_log table) for relevance scoring; high-signal turn detection - Rich conversation storage for notable turns; compact_conversation truncates long user messages - Task-type classifier (query/action/analysis/creative) for observation tagging - Nested sub-agent visibility: deep delegations now register against the main agent's manager Child safety (Gabriel profile): - child_safety.py: filtering, audit logging, prompt constants for restricted sessions - .kiro/specs/child-safety-profile: requirements, design, tasks specs - GABRIEL_BOT_PROPOSAL.md: initial proposal doc - Reduced context window (10 msgs) and tutor-mode identity for restricted users Telegram adapter: - Polling watchdog: auto-restarts updater if polling drops unexpectedly - get_me() with exponential-backoff retry on NetworkError at startup - Correct stop() ordering: signal watchdog before cancelling tasks Email / Gmail: - send_email: supports file attachments (attachments list param) - get_email: surfaces attachment metadata in response Scheduled tasks / weather: - Remove OpenWeatherMap API calls from morning-weather task; use wttr.in exclusively - New scheduled tasks and scheduler state persistence Discord: - adapters/discord/__init__.py scaffold - discord-plugin: MCP plugin for Claude Code Discord integration (server.ts, skills, config) Infrastructure: - n8n workflow exports (garvis_webhook, content_pipeline variants) - memory_workspace: context, homelab-repo-updates, weekly observation summaries, error logs - UCS C240 migration plan doc - requirements.txt: new deps - .claude/settings.json, fix_hooks.py: hook/permission tuning 2026-04-23 07:54:01 -06:00			`from pathlib import Path`
Implement self-healing system Phase 1: Error capture and logging - Add SelfHealingSystem with error observation infrastructure - Capture errors with full context: type, message, stack trace, intent, inputs - Log to MEMORY.md with deduplication (max 3 attempts per error signature) - Integrate error capture in agent, tools, runtime, and scheduler - Non-invasive: preserves all existing error handling behavior - Foundation for future diagnosis and auto-fixing capabilities Phase 1 of 4-phase rollout - observation only, no auto-fixing yet. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> 2026-02-14 18:03:42 -07:00			`from typing import Any, Dict, Optional`


			`@dataclass`
			`class ErrorContext:`
			`"""Full context for a captured error."""`

			`error_type: str # Exception class name`
			`message: str # Error message`
			`stack_trace: str # Full traceback`
			`component: str # Where it happened (e.g., "tools.py:read_file")`
			`intent: str # What was being attempted`
			`context: Dict[str, Any] # Additional context (tool inputs, user message, etc.)`
			`timestamp: str # ISO 8601 format`


			`class SelfHealingSystem:`
			`"""`
			`Phase 1: Error observation infrastructure.`

			`Captures errors with full context, deduplicates via error signatures,`
			`and logs them to MEMORY.md for future analysis.`
			`"""`

			`def __init__(self, memory_system: Any, agent: Any) -> None:`
			`self.memory = memory_system`
			`self.agent = agent`
			`self._error_counts: Dict[str, int] = {}`

			`def capture_error(`
			`self,`
			`error: Exception,`
			`component: str,`
			`intent: str,`
			`context: Optional[Dict[str, Any]] = None,`
			`) -> None:`
			`"""Capture an error with full context and log it.`

			`Args:`
			`error: The exception that occurred.`
			`component: Where the error happened (e.g., "tools.py:read_file").`
			`intent: What was being attempted when the error occurred.`
			`context: Additional context such as tool inputs, user message, etc.`
			`"""`
			`error_ctx = ErrorContext(`
			`error_type=type(error).__name__,`
			`message=str(error),`
			`stack_trace=traceback.format_exc(),`
			`component=component,`
			`intent=intent,`
			`context=context or {},`
			`timestamp=datetime.now().isoformat(),`
			`)`

			`signature = self._generate_signature(error_ctx)`

			`# Track attempt count`
			`self._error_counts[signature] = self._error_counts.get(signature, 0) + 1`
			`attempt = self._error_counts[signature]`

			`if attempt <= 3:`
			`self._log_error(error_ctx, attempt)`

			`print(`
			`f"[SelfHealing] Error captured: {error_ctx.error_type} "`
			`f"in {error_ctx.component} (attempt {attempt}/3)"`
			`)`

			`def _generate_signature(self, error_ctx: ErrorContext) -> str:`
			`"""Generate a deduplication signature for an error.`

			`Uses first 8 characters of SHA-256 hash of error type,`
			`component, and message combined.`
			`"""`
			`raw = f"{error_ctx.error_type}:{error_ctx.component}:{error_ctx.message}"`
			`return hashlib.sha256(raw.encode()).hexdigest()[:8]`

			`def _log_error(self, error_ctx: ErrorContext, attempt: int) -> None:`
			`"""Log an error to MEMORY.md via the memory system.`

			`Formats the error as a markdown entry and appends it to`
			`the persistent MEMORY.md file (daily=False).`
			`"""`
			`# Serialize context to JSON for readability`
			`try:`
			`context_json = json.dumps(error_ctx.context, indent=2, default=str)`
			`except (TypeError, ValueError):`
			`context_json = str(error_ctx.context)`

			`# Format timestamp for the header`
			`try:`
			`dt = datetime.fromisoformat(error_ctx.timestamp)`
			`header_time = dt.strftime("%Y-%m-%d %H:%M:%S")`
			`except ValueError:`
			`header_time = error_ctx.timestamp`

			`log_entry = (`
			`f"## Error Log - {header_time}\n"`
			`f"\n"`
			`f"Type: {error_ctx.error_type}\n"`
			`f"Component: {error_ctx.component}\n"`
			`f"Intent: {error_ctx.intent}\n"`
			`f"Attempt: {attempt}/3\n"`
			`f"Message: {error_ctx.message}\n"`
			`f"\n"`
			`f"Context:\n"`
			f"```json\n"
			`f"{context_json}\n"`
			f"```\n"
			`f"\n"`
			`f"Stack Trace:\n"`
			f"```\n"
			`f"{error_ctx.stack_trace}\n"`
			f"```\n"
			`f"---"`
			`)`

feat: RSO observation system, child safety, Discord adapter, Telegram watchdog, email attachments Core agent improvements: - RSO (Relevance Scoring & Observation) system: interaction_logger, memory_scorer, signal_detector - Memory access logging (memory_access_log table) for relevance scoring; high-signal turn detection - Rich conversation storage for notable turns; compact_conversation truncates long user messages - Task-type classifier (query/action/analysis/creative) for observation tagging - Nested sub-agent visibility: deep delegations now register against the main agent's manager Child safety (Gabriel profile): - child_safety.py: filtering, audit logging, prompt constants for restricted sessions - .kiro/specs/child-safety-profile: requirements, design, tasks specs - GABRIEL_BOT_PROPOSAL.md: initial proposal doc - Reduced context window (10 msgs) and tutor-mode identity for restricted users Telegram adapter: - Polling watchdog: auto-restarts updater if polling drops unexpectedly - get_me() with exponential-backoff retry on NetworkError at startup - Correct stop() ordering: signal watchdog before cancelling tasks Email / Gmail: - send_email: supports file attachments (attachments list param) - get_email: surfaces attachment metadata in response Scheduled tasks / weather: - Remove OpenWeatherMap API calls from morning-weather task; use wttr.in exclusively - New scheduled tasks and scheduler state persistence Discord: - adapters/discord/__init__.py scaffold - discord-plugin: MCP plugin for Claude Code Discord integration (server.ts, skills, config) Infrastructure: - n8n workflow exports (garvis_webhook, content_pipeline variants) - memory_workspace: context, homelab-repo-updates, weekly observation summaries, error logs - UCS C240 migration plan doc - requirements.txt: new deps - .claude/settings.json, fix_hooks.py: hook/permission tuning 2026-04-23 07:54:01 -06:00			`# RSO Phase 1: also export to JSONL for structured analysis`
			`try:`
			`_workspace = Path(getattr(self.memory, 'workspace_dir', './memory_workspace'))`
			`_errors_dir = _workspace / "observation" / "errors"`
			`_errors_dir.mkdir(parents=True, exist_ok=True)`
			`_error_date = error_ctx.timestamp[:10] # YYYY-MM-DD`
			`_error_log_path = _errors_dir / f"{_error_date}.jsonl"`

			`try:`
			`_ctx_serializable = json.loads(json.dumps(error_ctx.context, default=str))`
			`except Exception:`
			`_ctx_serializable = str(error_ctx.context)`

			`_jsonl_record = {`
			`"record_type": "error",`
			`"timestamp": error_ctx.timestamp,`
			`"error_type": error_ctx.error_type,`
			`"message": error_ctx.message[:500],`
			`"component": error_ctx.component,`
			`"intent": error_ctx.intent,`
			`"attempt": attempt,`
			`"context": _ctx_serializable,`
			`"self_healed": False, # Phase 1: observation only`
			`}`

			`def _write_jsonl(path: Path, record: dict) -> None:`
			`try:`
			`line = json.dumps(record, default=str, ensure_ascii=False)`
			`with open(path, "a", encoding="utf-8") as fh:`
			`fh.write(line + "\n")`
			`except Exception as exc:`
			`print(f"[SelfHealing] JSONL write failed: {exc}")`

			`threading.Thread(`
			`target=_write_jsonl,`
			`args=(_error_log_path, _jsonl_record),`
			`daemon=True,`
			`).start()`
			`except Exception as _jsonl_err:`
			`print(f"[SelfHealing] JSONL export setup failed: {_jsonl_err}")`

Implement self-healing system Phase 1: Error capture and logging - Add SelfHealingSystem with error observation infrastructure - Capture errors with full context: type, message, stack trace, intent, inputs - Log to MEMORY.md with deduplication (max 3 attempts per error signature) - Integrate error capture in agent, tools, runtime, and scheduler - Non-invasive: preserves all existing error handling behavior - Foundation for future diagnosis and auto-fixing capabilities Phase 1 of 4-phase rollout - observation only, no auto-fixing yet. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> 2026-02-14 18:03:42 -07:00			`try:`
Optimize memory: Redirect error logs to daily files Changed error logging from MEMORY.md (permanent) to daily logs (time-based). This prevents MEMORY.md from becoming polluted with error stack traces that degrade search quality. Impact: - Future errors go to memory/YYYY-MM-DD.md (naturally age out) - MEMORY.md stays focused on project context only - Search results no longer polluted by duplicate stack traces Change: self_healing.py line 131: daily=False → daily=True Note: SOUL.md, MEMORY.md, jordan.md also optimized (not in git - gitignored) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> 2026-02-24 14:04:12 -07:00			`self.memory.write_memory(log_entry, daily=True)`
Implement self-healing system Phase 1: Error capture and logging - Add SelfHealingSystem with error observation infrastructure - Capture errors with full context: type, message, stack trace, intent, inputs - Log to MEMORY.md with deduplication (max 3 attempts per error signature) - Integrate error capture in agent, tools, runtime, and scheduler - Non-invasive: preserves all existing error handling behavior - Foundation for future diagnosis and auto-fixing capabilities Phase 1 of 4-phase rollout - observation only, no auto-fixing yet. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> 2026-02-14 18:03:42 -07:00			`except Exception as e:`
			`# Last resort: print to console if memory write fails`
			`print(f"[SelfHealing] Failed to write error log to MEMORY.md: {e}")`
			`print(f"[SelfHealing] Error was: {error_ctx.error_type}: {error_ctx.message}")`