Files
ajarbot/self_healing.py

179 lines
6.3 KiB
Python
Raw Normal View History

"""
Self-Healing System - Phase 1: Error Capture and Logging.
Captures all errors with full context and logs them to MEMORY.md.
No auto-fixing in this phase - observation only.
"""
import hashlib
import json
feat: RSO observation system, child safety, Discord adapter, Telegram watchdog, email attachments Core agent improvements: - RSO (Relevance Scoring & Observation) system: interaction_logger, memory_scorer, signal_detector - Memory access logging (memory_access_log table) for relevance scoring; high-signal turn detection - Rich conversation storage for notable turns; compact_conversation truncates long user messages - Task-type classifier (query/action/analysis/creative) for observation tagging - Nested sub-agent visibility: deep delegations now register against the main agent's manager Child safety (Gabriel profile): - child_safety.py: filtering, audit logging, prompt constants for restricted sessions - .kiro/specs/child-safety-profile: requirements, design, tasks specs - GABRIEL_BOT_PROPOSAL.md: initial proposal doc - Reduced context window (10 msgs) and tutor-mode identity for restricted users Telegram adapter: - Polling watchdog: auto-restarts updater if polling drops unexpectedly - get_me() with exponential-backoff retry on NetworkError at startup - Correct stop() ordering: signal watchdog before cancelling tasks Email / Gmail: - send_email: supports file attachments (attachments list param) - get_email: surfaces attachment metadata in response Scheduled tasks / weather: - Remove OpenWeatherMap API calls from morning-weather task; use wttr.in exclusively - New scheduled tasks and scheduler state persistence Discord: - adapters/discord/__init__.py scaffold - discord-plugin: MCP plugin for Claude Code Discord integration (server.ts, skills, config) Infrastructure: - n8n workflow exports (garvis_webhook, content_pipeline variants) - memory_workspace: context, homelab-repo-updates, weekly observation summaries, error logs - UCS C240 migration plan doc - requirements.txt: new deps - .claude/settings.json, fix_hooks.py: hook/permission tuning
2026-04-23 07:54:01 -06:00
import threading
import traceback
from dataclasses import dataclass
from datetime import datetime
feat: RSO observation system, child safety, Discord adapter, Telegram watchdog, email attachments Core agent improvements: - RSO (Relevance Scoring & Observation) system: interaction_logger, memory_scorer, signal_detector - Memory access logging (memory_access_log table) for relevance scoring; high-signal turn detection - Rich conversation storage for notable turns; compact_conversation truncates long user messages - Task-type classifier (query/action/analysis/creative) for observation tagging - Nested sub-agent visibility: deep delegations now register against the main agent's manager Child safety (Gabriel profile): - child_safety.py: filtering, audit logging, prompt constants for restricted sessions - .kiro/specs/child-safety-profile: requirements, design, tasks specs - GABRIEL_BOT_PROPOSAL.md: initial proposal doc - Reduced context window (10 msgs) and tutor-mode identity for restricted users Telegram adapter: - Polling watchdog: auto-restarts updater if polling drops unexpectedly - get_me() with exponential-backoff retry on NetworkError at startup - Correct stop() ordering: signal watchdog before cancelling tasks Email / Gmail: - send_email: supports file attachments (attachments list param) - get_email: surfaces attachment metadata in response Scheduled tasks / weather: - Remove OpenWeatherMap API calls from morning-weather task; use wttr.in exclusively - New scheduled tasks and scheduler state persistence Discord: - adapters/discord/__init__.py scaffold - discord-plugin: MCP plugin for Claude Code Discord integration (server.ts, skills, config) Infrastructure: - n8n workflow exports (garvis_webhook, content_pipeline variants) - memory_workspace: context, homelab-repo-updates, weekly observation summaries, error logs - UCS C240 migration plan doc - requirements.txt: new deps - .claude/settings.json, fix_hooks.py: hook/permission tuning
2026-04-23 07:54:01 -06:00
from pathlib import Path
from typing import Any, Dict, Optional
@dataclass
class ErrorContext:
    """Snapshot of one captured error and the circumstances around it.

    Attributes:
        error_type: Name of the exception class.
        message: Text of the error message.
        stack_trace: Formatted traceback text.
        component: Location of the failure, e.g. "tools.py:read_file".
        intent: Description of the operation that was being attempted.
        context: Extra details (tool inputs, user message, etc.).
        timestamp: Capture time as an ISO 8601 string.
    """

    error_type: str
    message: str
    stack_trace: str
    component: str
    intent: str
    context: Dict[str, Any]
    timestamp: str
class SelfHealingSystem:
    """Phase 1 of self-healing: observe and record errors, never fix them.

    Each captured error is wrapped in an ``ErrorContext`` and fingerprinted
    with a short hash so repeats can be counted. The first
    ``MAX_LOGGED_ATTEMPTS`` occurrences of a fingerprint are written to the
    memory system as a markdown entry and appended to a per-day JSONL file;
    later repeats are only counted.
    """

    # Number of occurrences of one signature that get logged before further
    # repeats are counted silently.
    MAX_LOGGED_ATTEMPTS = 3

    def __init__(self, memory_system: Any, agent: Any) -> None:
        """Store collaborators and start with an empty occurrence table.

        Args:
            memory_system: Object whose ``write_memory(text, daily=...)`` is
                called for each logged error. An optional ``workspace_dir``
                attribute selects the root directory for JSONL exports.
            agent: Stored on the instance; not otherwise used by the code
                visible in this class.
        """
        self.memory = memory_system
        self.agent = agent
        # signature -> number of times that signature has been captured
        self._error_counts: Dict[str, int] = {}

    def capture_error(
        self,
        error: Exception,
        component: str,
        intent: str,
        context: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Capture an error with full context and log it.

        Args:
            error: The exception that occurred.
            component: Where the error happened (e.g., "tools.py:read_file").
            intent: What was being attempted when the error occurred.
            context: Additional context such as tool inputs, user message, etc.
        """
        error_ctx = ErrorContext(
            error_type=type(error).__name__,
            message=str(error),
            stack_trace=self._format_stack_trace(error),
            component=component,
            intent=intent,
            context=context or {},
            timestamp=datetime.now().isoformat(),
        )
        signature = self._generate_signature(error_ctx)

        # Count every occurrence, including the ones that are not logged.
        attempt = self._error_counts.get(signature, 0) + 1
        self._error_counts[signature] = attempt

        if attempt > self.MAX_LOGGED_ATTEMPTS:
            return

        self._log_error(error_ctx, attempt)
        print(
            f"[SelfHealing] Error captured: {error_ctx.error_type} "
            f"in {error_ctx.component} "
            f"(attempt {attempt}/{self.MAX_LOGGED_ATTEMPTS})"
        )

    @staticmethod
    def _format_stack_trace(error: BaseException) -> str:
        """Return the formatted traceback that belongs to *error* itself.

        The exception's own ``__traceback__`` is used instead of
        ``traceback.format_exc()``: the latter reads the exception currently
        being handled by the interpreter, so it produces "NoneType: None"
        when called outside an ``except`` block and the wrong trace when a
        different exception is being handled.
        """
        return "".join(
            traceback.format_exception(type(error), error, error.__traceback__)
        )

    def _generate_signature(self, error_ctx: "ErrorContext") -> str:
        """Generate a deduplication signature for an error.

        Returns the first 8 hex characters of the SHA-256 hash of the error
        type, component, and message joined with colons.
        """
        raw = f"{error_ctx.error_type}:{error_ctx.component}:{error_ctx.message}"
        return hashlib.sha256(raw.encode()).hexdigest()[:8]

    def _format_log_entry(self, error_ctx: "ErrorContext", attempt: int) -> str:
        """Render *error_ctx* as the markdown entry handed to the memory system."""
        # Pretty-print the context; fall back to str() if it cannot be dumped.
        try:
            context_json = json.dumps(error_ctx.context, indent=2, default=str)
        except (TypeError, ValueError):
            context_json = str(error_ctx.context)

        # Drop sub-second precision for the header; keep the raw string if the
        # timestamp is not valid ISO 8601.
        try:
            header_time = datetime.fromisoformat(error_ctx.timestamp).strftime(
                "%Y-%m-%d %H:%M:%S"
            )
        except ValueError:
            header_time = error_ctx.timestamp

        return (
            f"## Error Log - {header_time}\n"
            f"\n"
            f"**Type**: {error_ctx.error_type}\n"
            f"**Component**: {error_ctx.component}\n"
            f"**Intent**: {error_ctx.intent}\n"
            f"**Attempt**: {attempt}/{self.MAX_LOGGED_ATTEMPTS}\n"
            f"**Message**: {error_ctx.message}\n"
            f"\n"
            f"**Context**:\n"
            f"```json\n"
            f"{context_json}\n"
            f"```\n"
            f"\n"
            f"**Stack Trace**:\n"
            f"```\n"
            f"{error_ctx.stack_trace}\n"
            f"```\n"
            f"---"
        )

    @staticmethod
    def _append_jsonl_line(path: Path, record: dict) -> None:
        """Append *record* to *path* as one JSON line; report failures on stdout."""
        try:
            line = json.dumps(record, default=str, ensure_ascii=False)
            with open(path, "a", encoding="utf-8") as fh:
                fh.write(line + "\n")
        except Exception as exc:
            print(f"[SelfHealing] JSONL write failed: {exc}")

    def _export_jsonl(self, error_ctx: "ErrorContext", attempt: int) -> None:
        """Append a structured record to ``observation/errors/<date>.jsonl``.

        The directory lives under the memory system's ``workspace_dir``
        (default ``./memory_workspace``). The file write itself runs on a
        daemon thread. Any failure while preparing the export is printed and
        swallowed so that it cannot prevent the memory write that follows.
        """
        try:
            workspace = Path(
                getattr(self.memory, 'workspace_dir', './memory_workspace')
            )
            errors_dir = workspace / "observation" / "errors"
            errors_dir.mkdir(parents=True, exist_ok=True)
            # The first 10 characters of an ISO 8601 timestamp are YYYY-MM-DD.
            log_path = errors_dir / f"{error_ctx.timestamp[:10]}.jsonl"

            # Round-trip through JSON so non-serializable values become
            # strings; fall back to the repr of the whole mapping.
            try:
                ctx_serializable = json.loads(
                    json.dumps(error_ctx.context, default=str)
                )
            except Exception:
                ctx_serializable = str(error_ctx.context)

            record = {
                "record_type": "error",
                "timestamp": error_ctx.timestamp,
                "error_type": error_ctx.error_type,
                "message": error_ctx.message[:500],
                "component": error_ctx.component,
                "intent": error_ctx.intent,
                "attempt": attempt,
                "context": ctx_serializable,
                "self_healed": False,  # Phase 1: observation only
            }

            threading.Thread(
                target=self._append_jsonl_line,
                args=(log_path, record),
                daemon=True,
            ).start()
        except Exception as _jsonl_err:
            print(f"[SelfHealing] JSONL export setup failed: {_jsonl_err}")

    def _log_error(self, error_ctx: "ErrorContext", attempt: int) -> None:
        """Persist one error: JSONL export first, then the memory system.

        The markdown entry is passed to ``self.memory.write_memory`` with
        ``daily=True``. If that call raises, the error is printed to the
        console instead of propagating.

        NOTE(review): earlier documentation for this method described the
        target as the persistent MEMORY.md file (daily=False), and the
        fallback console message still names MEMORY.md. The call below is
        left as found; confirm which destination is intended.
        """
        log_entry = self._format_log_entry(error_ctx, attempt)

        # RSO Phase 1: also export to JSONL for structured analysis
        self._export_jsonl(error_ctx, attempt)

        try:
            self.memory.write_memory(log_entry, daily=True)
        except Exception as e:
            # Last resort: print to console if memory write fails
            print(f"[SelfHealing] Failed to write error log to MEMORY.md: {e}")
            print(f"[SelfHealing] Error was: {error_ctx.error_type}: {error_ctx.message}")