# ajarbot/observation/memory_scorer.py

"""
Memory Relevance Scorer — RSO Phase 2.

Scores every indexed memory file using the formula from the RSO spec:

    Score = (access_frequency × 3) + (influence_rate × 5)
            - (age_days × 0.1) - (staleness_risk × 2)

Tiers:
    core    (>8)  : High-value, actively referenced — keep at top of retrieval
    active  (3–8) : In-use memory — maintain as-is
    archive (0–3) : Low-signal, old, or redundant — candidate for archival
    stale   (<0)  : High staleness risk, never accessed — recommend archival

Access frequency is tracked via the memory_access_log table (added to
memory_index.db in Phase 2). On first run there is no history; scores will
be age + staleness only. Frequency builds from the next agent session onward.

Output: memory_workspace/observation/summaries/memory-scores-YYYY-MM-DD.json
"""

import json
import re
import sqlite3
import threading
import time
from datetime import date, datetime
from pathlib import Path
from typing import Any, Dict, List, Optional


# ---------------------------------------------------------------------------
# Staleness heuristic patterns
# ---------------------------------------------------------------------------

_RE_IP = re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b")
_RE_CREDENTIALS = re.compile(
    r"\b(password|passwd|credential|api[_\s\-]?key|token|secret)\b",
    re.IGNORECASE,
)
_RE_STATUS = re.compile(
    r"\b(running|stopped|active|inactive|enabled|disabled|up|down)\b",
    re.IGNORECASE,
)
_RE_VERSION = re.compile(r"v\d+\.\d+(?:\.\d+)?|\bversion\s+\d", re.IGNORECASE)
_RE_DATE = re.compile(r"(202[0-9])-(\d{2})-(\d{2})")
_RE_DAILY_NAME = re.compile(r"(\d{4})-(\d{2})-(\d{2})\.md$")


class MemoryRelevanceScorer:
    """Score all indexed memory files for the weekly reflection agent.

    The scorer reads the ``files`` table (and, when it exists, the
    ``memory_access_log`` table) from ``<workspace>/memory_index.db``, reads
    each indexed file relative to the workspace, and writes JSON reports to
    ``<workspace>/observation/summaries``.
    """

    def __init__(self, workspace_dir: str) -> None:
        """Bind the scorer to *workspace_dir*.

        Creates ``observation/summaries`` under the workspace if it does not
        exist yet, so later report writes cannot fail on a missing directory.
        """
        self._workspace = Path(workspace_dir)
        self._db_path = self._workspace / "memory_index.db"
        self._summaries_dir = (
            self._workspace / "observation" / "summaries"
        )
        self._summaries_dir.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def score_all(self, lookback_days: int = 30) -> Dict[str, Any]:
        """Score every indexed memory file. Returns full report dict.

        Cold-start mode: when the access log has no entries inside the
        lookback window, the full spec formula degrades everything to
        stale — useless output. In cold-start, a baseline of 5.0 is used so
        age and staleness can still differentiate files while access data
        accumulates.

        Full formula (once data exists):
            score = (access × 3) + (influence × 5) - (age × 0.1) - (staleness × 2)

        Cold-start formula:
            score = 5.0 - (age × 0.05) - (staleness × 2)

        Args:
            lookback_days: Size of the access-log window, in days, counted
                back from the current time.

        Returns:
            A dict with report metadata (``generated_at``, ``lookback_days``,
            ``cold_start``, ``files_scored``, ``note``), per-tier counts under
            ``summary``, the ``archive_recommendations`` subset, and every
            scored entry under ``entries`` (sorted by ascending score).

        Raises:
            sqlite3.OperationalError: If the ``files`` table cannot be
                queried (only the access-log queries are tolerated missing).
        """
        cutoff_ms = int((time.time() - lookback_days * 86400) * 1000)
        today = date.today()

        db = sqlite3.connect(str(self._db_path), check_same_thread=False)
        db.row_factory = sqlite3.Row
        try:
            files = db.execute(
                "SELECT path, mtime, size FROM files ORDER BY mtime ASC"
            ).fetchall()

            # One grouped query replaces a COUNT(*) per file: the log is
            # scanned once, and the per-file counts and the cold-start total
            # come from the same snapshot.
            access_counts = self._access_counts(db, cutoff_ms)

            # Determine cold-start: any accesses at all in the lookback window?
            total_accesses = sum(access_counts.values())
            cold_start = total_accesses == 0

            scored: List[Dict[str, Any]] = []
            for row in files:
                path = row["path"]
                mtime_ms = row["mtime"]

                content = self._read_file(path)
                access_count = access_counts.get(path, 0)
                age_days = self._age_days(path, mtime_ms, today)
                staleness_risk = self._staleness_risk(content, today)
                influence_rate = self._influence_proxy(access_count)

                if cold_start:
                    # Gentler age decay (0.05 instead of 0.1); baseline of 5
                    # so files don't all collapse to stale before we have data.
                    score = 5.0 - (age_days * 0.05) - (staleness_risk * 2)
                else:
                    score = (
                        (access_count * 3)
                        + (influence_rate * 5)
                        - (age_days * 0.1)
                        - (staleness_risk * 2)
                    )

                tier = _tier(score)
                scored.append(
                    {
                        "path": path,
                        "score": round(score, 2),
                        "tier": tier,
                        "age_days": round(age_days, 1),
                        "access_frequency": access_count,
                        "influence_rate": round(influence_rate, 2),
                        "staleness_risk": round(staleness_risk, 2),
                        "staleness_flags": self._staleness_flags(content),
                        "recommendation": _recommendation(tier, age_days),
                        "cold_start": cold_start,
                    }
                )

        finally:
            db.close()

        # Lowest scores first, so the weakest memories lead the report.
        scored.sort(key=lambda x: x["score"])

        tier_counts = {"core": 0, "active": 0, "archive": 0, "stale": 0}
        for e in scored:
            tier_counts[e["tier"]] = tier_counts.get(e["tier"], 0) + 1

        note: Optional[str] = None
        if cold_start:
            note = (
                "COLD START: no access history yet. Scores use age+staleness only "
                "(baseline 5.0, age penalty 0.05/day). Full formula activates once "
                "memory_access_log accumulates data from live sessions."
            )

        return {
            "generated_at": datetime.now().astimezone().isoformat(),
            "lookback_days": lookback_days,
            "cold_start": cold_start,
            "files_scored": len(scored),
            "note": note,
            "summary": {
                "core_memory": tier_counts["core"],
                "active_memory": tier_counts["active"],
                "archive_candidates": tier_counts["archive"],
                "stale_candidates": tier_counts["stale"],
            },
            # Only entries at least 30 days old are surfaced as archive
            # recommendations, even when their tier already says "archive".
            "archive_recommendations": [
                e for e in scored
                if e["recommendation"] == "archive" and e["age_days"] >= 30
            ],
            "entries": scored,
        }

    def write_report(self, lookback_days: int = 30) -> Path:
        """Generate and write JSON report; returns the output path.

        The file is named ``memory-scores-YYYY-MM-DD.json`` inside the
        summaries directory, and a one-line confirmation is printed.
        """
        report = self.score_all(lookback_days)
        # Take the date from the report itself so the file name can never
        # disagree with ``generated_at`` (e.g. when scoring crosses midnight).
        report_day = report["generated_at"][:10]
        out_path = self._summaries_dir / f"memory-scores-{report_day}.json"
        out_path.write_text(
            json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8"
        )
        print(
            f"[MemoryScorer] Report written -> {out_path.name} "
            f"({report['files_scored']} files, "
            f"{report['summary']['archive_candidates']} archive candidates, "
            f"{report['summary']['stale_candidates']} stale)"
        )
        return out_path

    def print_summary(self, lookback_days: int = 30) -> None:
        """Print a human-readable summary table to stdout.

        Runs a fresh ``score_all`` and prints tier counts, the cold-start
        note (if any), and up to ten archive recommendations.
        """
        report = self.score_all(lookback_days)
        s = report["summary"]
        sep = "-" * 60
        print(
            f"\n{sep}\n"
            f"Memory Relevance Report  ({report['generated_at'][:10]})\n"
            f"Lookback: {lookback_days}d  |  Files scored: {report['files_scored']}\n"
            f"{sep}\n"
            f"  Core     (>8)  : {s['core_memory']:3d}\n"
            f"  Active   (3-8) : {s['active_memory']:3d}\n"
            f"  Archive  (0-3) : {s['archive_candidates']:3d}\n"
            f"  Stale    (<0)  : {s['stale_candidates']:3d}\n"
            f"{sep}"
        )
        if report.get("note"):
            print(f"  NOTE: {report['note']}")

        archive = report["archive_recommendations"]
        if archive:
            print("\n  Archive candidates (age >=30d, score <3):")
            for e in archive[:10]:
                flags = ", ".join(e["staleness_flags"]) or "none"
                print(
                    f"    {e['path']:<40}  "
                    f"score={e['score']:>6.2f}  "
                    f"age={e['age_days']:>5.0f}d  "
                    f"flags=[{flags}]"
                )
            if len(archive) > 10:
                print(f"    ... and {len(archive) - 10} more")
        print()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _read_file(self, rel_path: str) -> str:
        """Return the UTF-8 text of *rel_path* under the workspace, or ``""``.

        Best-effort by design: a missing, unreadable, or undecodable file is
        scored as if it were empty rather than aborting the whole run.
        """
        try:
            return (self._workspace / rel_path).read_text(encoding="utf-8")
        except (OSError, ValueError):
            # OSError: missing file, permissions, path is a directory, ...
            # ValueError: UnicodeDecodeError (its subclass) or a path that
            # contains an embedded NUL character.
            return ""

    def _access_counts(
        self, db: sqlite3.Connection, cutoff_ms: int
    ) -> Dict[str, int]:
        """Map each logged path to its access count in the lookback window.

        Paths with no access at or after *cutoff_ms* are absent from the
        result. Returns an empty dict when the access-log table cannot be
        queried (e.g. it does not exist before the schema migration).
        """
        try:
            rows = db.execute(
                "SELECT path, COUNT(*) AS n FROM memory_access_log "
                "WHERE accessed_at >= ? GROUP BY path",
                (cutoff_ms,),
            ).fetchall()
        except sqlite3.OperationalError:
            return {}
        # Positional access works with and without sqlite3.Row as row factory.
        return {row[0]: row[1] for row in rows}

    def _total_access_count(
        self, db: sqlite3.Connection, cutoff_ms: int
    ) -> int:
        """Total accesses across all paths in the lookback window.

        Not used by ``score_all`` (which sums ``_access_counts``); retained
        for callers that only need the total. Expects ``db`` to use
        ``sqlite3.Row`` as its row factory.
        """
        try:
            row = db.execute(
                "SELECT COUNT(*) AS n FROM memory_access_log WHERE accessed_at >= ?",
                (cutoff_ms,),
            ).fetchone()
            return row["n"] if row else 0
        except sqlite3.OperationalError:
            return 0

    def _access_count(
        self, db: sqlite3.Connection, path: str, cutoff_ms: int
    ) -> int:
        """Accesses of a single *path* in the lookback window.

        Not used by ``score_all`` (which batches via ``_access_counts``);
        retained for single-path lookups. Expects ``db`` to use
        ``sqlite3.Row`` as its row factory.
        """
        try:
            row = db.execute(
                "SELECT COUNT(*) AS n FROM memory_access_log "
                "WHERE path = ? AND accessed_at >= ?",
                (path, cutoff_ms),
            ).fetchone()
            return row["n"] if row else 0
        except sqlite3.OperationalError:
            # Table doesn't exist yet on very first run before schema migration
            return 0

    def _age_days(
        self, path: str, mtime_ms: int, today: date
    ) -> float:
        """Age in days — prefer date extracted from filename for daily logs.

        Falls back to the indexed mtime (treated as epoch milliseconds) when
        the name carries no valid ``YYYY-MM-DD.md`` date. The result is never
        negative: a future-dated name or mtime would otherwise turn the age
        penalty into a score bonus.
        """
        m = _RE_DAILY_NAME.search(path)
        if m:
            try:
                file_date = date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
            except ValueError:
                # Digits matched the pattern but are not a real calendar date.
                pass
            else:
                return max(0.0, float((today - file_date).days))
        return max(0.0, (time.time() - mtime_ms / 1000) / 86400)

    def _staleness_risk(self, content: str, today: date) -> float:
        """0.0–3.0 staleness score from content heuristics.

        Adds 1.0 each for IP-like and credential-like text, 0.5 each for
        status words and version numbers, and 0.5 once if any mentioned date
        is more than 30 days before *today*. The sum is capped at 3.0.
        """
        score = 0.0
        if _RE_IP.search(content):
            score += 1.0
        if _RE_CREDENTIALS.search(content):
            score += 1.0
        if _RE_STATUS.search(content):
            score += 0.5
        if _RE_VERSION.search(content):
            score += 0.5
        # Past dates mentioned in content (more than 30 days ago)
        for m in _RE_DATE.finditer(content):
            try:
                mentioned = date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
                if (today - mentioned).days > 30:
                    score += 0.5
                    break  # Only penalise once per file
            except ValueError:
                # Pattern matched digits that are not a valid calendar date.
                pass
        return min(score, 3.0)

    def _staleness_flags(self, content: str) -> List[str]:
        """Names of the content heuristics that matched, in a fixed order.

        Covers the IP, credential, status and version patterns only; the
        old-date penalty applied by ``_staleness_risk`` has no flag here.
        """
        flags: List[str] = []
        if _RE_IP.search(content):
            flags.append("ip_addresses")
        if _RE_CREDENTIALS.search(content):
            flags.append("credentials")
        if _RE_STATUS.search(content):
            flags.append("status_references")
        if _RE_VERSION.search(content):
            flags.append("version_numbers")
        return flags

    @staticmethod
    def _influence_proxy(access_count: int) -> float:
        """Proxy for influence rate — no real data until access log fills.

        Step function of the access count: 5+ -> 0.8, 2-4 -> 0.5, 1 -> 0.3,
        anything else -> 0.0.
        """
        if access_count >= 5:
            return 0.8
        if access_count >= 2:
            return 0.5
        if access_count == 1:
            return 0.3
        return 0.0


# ---------------------------------------------------------------------------
# Pure functions
# ---------------------------------------------------------------------------

def _tier(score: float) -> str:
    if score > 8:
        return "core"
    if score >= 3:
        return "active"
    if score >= 0:
        return "archive"
    return "stale"


def _recommendation(tier: str, age_days: float) -> str:
    if tier in ("core", "active"):
        return "keep"
    if tier == "archive":
        return "archive" if age_days >= 60 else "monitor"
    # stale — archive rather than delete (Phase 3 safety rule)
    return "archive"


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import sys

    # Usage: python memory_scorer.py [workspace_dir]
    # Without an argument the scorer runs against ./memory_workspace.
    cli_args = sys.argv[1:]
    workspace = cli_args[0] if cli_args else "./memory_workspace"

    scorer = MemoryRelevanceScorer(workspace)
    # Console table first, then the JSON report on disk.
    scorer.print_summary()
    report_path = scorer.write_report()
    print(f"Full report: {report_path}")