feat: Add Gitea MCP integration and project cleanup

## New Features
- **Gitea MCP Tools** (zero API cost):
  - gitea_read_file: Read files from homelab repo
  - gitea_list_files: Browse directories
  - gitea_search_code: Search by filename
  - gitea_get_tree: Get directory tree
- **Gitea Client** (gitea_tools/client.py): REST API wrapper with OAuth (see the sketch after this list)
- **Proxmox SSH Scripts** (scripts/): Homelab data collection utilities
- **Obsidian MCP Support** (obsidian_mcp.py): Advanced vault operations
- **Voice Integration Plan** (JARVIS_VOICE_INTEGRATION_PLAN.md)
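
The Gitea client underpinning these tools is a thin REST wrapper. Here is a minimal sketch of what `gitea_read_file` does under the hood, assuming the standard Gitea `contents` endpoint and token auth; the actual `gitea_tools/client.py` may differ:

```python
import base64
import requests

def gitea_read_file(base_url: str, token: str, owner: str, repo: str, path: str) -> str:
    """Fetch one file from a Gitea repo via the REST API and decode it."""
    resp = requests.get(
        f"{base_url}/api/v1/repos/{owner}/{repo}/contents/{path}",
        headers={"Authorization": f"token {token}"},
        timeout=10,
    )
    resp.raise_for_status()
    # Gitea returns file bodies base64-encoded in the "content" field.
    return base64.b64decode(resp.json()["content"]).decode("utf-8")
```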

## Improvements
- **Increased timeout**: 5 min → 10 min for complex tasks (llm_interface.py; pattern sketched below)
- **Removed Direct API fallback**: Gitea tools are MCP-only (zero cost)
- **Updated .env.example**: Added Obsidian MCP configuration
- **Enhanced .gitignore**: Protect personal memory files (SOUL.md, MEMORY.md)
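
The timeout bump lands in `_run_async_from_thread` in the diff below. Condensed to its core, the thread-to-loop bridge looks like this (the scaffolding here is illustrative; only the 600-second value and the previous 300 come from the commit):

```python
import asyncio

async def slow_task() -> str:
    await asyncio.sleep(1)  # stand-in for a long Agent SDK call
    return "done"

async def main() -> None:
    loop = asyncio.get_running_loop()

    def worker() -> str:
        # Schedule the coroutine on the main loop and block this worker
        # thread until it finishes (or the 10-minute limit passes).
        future = asyncio.run_coroutine_threadsafe(slow_task(), loop)
        return future.result(timeout=600)  # previously 300 seconds

    print(await asyncio.to_thread(worker))

asyncio.run(main())
```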

## Cleanup
- Deleted 24 obsolete files (temp/test/experimental scripts, outdated docs)
- Untracked personal memory files (SOUL.md, MEMORY.md now in .gitignore)
- Removed: AGENT_SDK_IMPLEMENTATION.md, HYBRID_SEARCH_SUMMARY.md,
  IMPLEMENTATION_SUMMARY.md, MIGRATION.md, test_agent_sdk.py, etc.

## Configuration
- Added config/gitea_config.example.yaml (Gitea setup template; loading sketched below)
- Added config/obsidian_mcp.example.yaml (Obsidian MCP template)
- Updated scheduled_tasks.yaml with new task examples
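
A hypothetical loader for the new Gitea template (the key names `base_url`, `token`, `owner`, and `repo` are assumptions, not necessarily the template's actual fields):

```python
import yaml  # pip install pyyaml

with open("config/gitea_config.example.yaml") as fh:
    cfg = yaml.safe_load(fh)

base_url = cfg.get("base_url", "https://gitea.example.local")
repo = f"{cfg.get('owner', '')}/{cfg.get('repo', '')}"
```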

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Commit fe7c146dc6 (parent 0271dea551), 2026-02-18 20:31:32 -07:00
29 changed files with 5678 additions and 2287 deletions

llm_interface.py

@@ -1,42 +1,49 @@
"""LLM Interface - Claude API, GLM, and other models.
Supports three modes for Claude:
1. Agent SDK (v0.1.36+) - DEFAULT - Uses query() API with Pro subscription
Supports two modes for Claude:
1. Agent SDK (v0.1.36+) - DEFAULT - Uses query() API with Max subscription
- Set USE_AGENT_SDK=true (default)
- Model: claude-sonnet-4-5-20250929 (default for all operations)
- Optional: USE_OPUS_FOR_TOOLS=true (enables Opus for extremely intensive tasks only)
- MCP Tools: File/system tools (read_file, write_file, edit_file, list_directory, run_command)
- Traditional Tools: Google tools & weather (fall back to Direct API, requires ANTHROPIC_API_KEY)
- Flat-rate subscription cost (no per-token charges for MCP tools)
- All tools are MCP-based (no API key needed)
- Tools registered via mcp_tools.py MCP server
- Flat-rate subscription cost
2. Direct API (pay-per-token) - Set USE_DIRECT_API=true
- Model: claude-sonnet-4-5-20250929 (cost-effective, never uses Opus)
- Model: claude-sonnet-4-5-20250929
- Requires ANTHROPIC_API_KEY in .env
- Full tool support built-in (all tools via traditional API)
3. Legacy: Local Claude Code server - Set USE_CLAUDE_CODE_SERVER=true (deprecated)
- For backward compatibility only
- Uses traditional tool definitions from tools.py
"""
import asyncio
import atexit
import logging
import os
from typing import Any, Dict, List, Optional
import subprocess
import threading
from typing import Any, Dict, List, Optional, Set
import requests
from anthropic import Anthropic
from anthropic.types import Message, ContentBlock, TextBlock, ToolUseBlock, Usage
from usage_tracker import UsageTracker
logger = logging.getLogger(__name__)
# Ensure our debug messages are visible even if root logger is at WARNING.
# Only add a handler if none exist (prevent duplicate output).
if not logger.handlers:
_handler = logging.StreamHandler()
_handler.setFormatter(logging.Formatter(
"%(asctime)s [%(name)s] %(levelname)s: %(message)s",
datefmt="%H:%M:%S",
))
logger.addHandler(_handler)
logger.setLevel(logging.DEBUG)
# Try to import Agent SDK (optional dependency)
try:
from claude_agent_sdk import (
query,
UserMessage,
AssistantMessage,
SystemMessage,
ClaudeAgentOptions,
ResultMessage,
)
import anyio
AGENT_SDK_AVAILABLE = True
except ImportError:
AGENT_SDK_AVAILABLE = False
@@ -47,29 +54,61 @@ _API_KEY_ENV_VARS = {
"glm": "GLM_API_KEY",
}
# Mode selection (priority order: USE_DIRECT_API > USE_CLAUDE_CODE_SERVER > default to Agent SDK)
# Mode selection (priority: USE_DIRECT_API > default to Agent SDK)
_USE_DIRECT_API = os.getenv("USE_DIRECT_API", "false").lower() == "true"
_CLAUDE_CODE_SERVER_URL = os.getenv("CLAUDE_CODE_SERVER_URL", "http://localhost:8000")
_USE_CLAUDE_CODE_SERVER = os.getenv("USE_CLAUDE_CODE_SERVER", "false").lower() == "true"
# Agent SDK is the default if available and no other mode is explicitly enabled
_USE_AGENT_SDK = os.getenv("USE_AGENT_SDK", "true").lower() == "true"
# Default models by provider
_DEFAULT_MODELS = {
"claude": "claude-sonnet-4-5-20250929", # For Direct API (pay-per-token) - Sonnet is cost-effective
"claude_agent_sdk": "claude-sonnet-4-5-20250929", # For Agent SDK (flat-rate) - Sonnet for normal operations
"claude_agent_sdk_opus": "claude-opus-4-6", # For Agent SDK extremely intensive tasks only (flat-rate)
"claude": "claude-sonnet-4-5-20250929",
"claude_agent_sdk": "claude-sonnet-4-5-20250929",
"glm": "glm-4-plus",
}
# When to use Opus (only on Agent SDK flat-rate mode)
_USE_OPUS_FOR_TOOLS = os.getenv("USE_OPUS_FOR_TOOLS", "false").lower() == "true"
_GLM_BASE_URL = "https://open.bigmodel.cn/api/paas/v4/chat/completions"
# Track PIDs of claude.exe subprocesses we spawn (to avoid killing user's Claude Code session!)
_TRACKED_CLAUDE_PIDS: Set[int] = set()
_TRACKED_PIDS_LOCK = threading.Lock()
def _register_claude_subprocess(pid: int):
"""Register a claude.exe subprocess PID for cleanup on exit."""
with _TRACKED_PIDS_LOCK:
_TRACKED_CLAUDE_PIDS.add(pid)
logger.debug("[LLM] Registered claude.exe subprocess PID: %d", pid)
def _cleanup_tracked_claude_processes():
"""Kill only the claude.exe processes we spawned (not the user's Claude Code session!)"""
with _TRACKED_PIDS_LOCK:
if not _TRACKED_CLAUDE_PIDS:
return
logger.info("[LLM] Cleaning up %d tracked claude.exe subprocess(es)", len(_TRACKED_CLAUDE_PIDS))
for pid in _TRACKED_CLAUDE_PIDS:
try:
if os.name == 'nt': # Windows
subprocess.run(
['taskkill', '/F', '/PID', str(pid), '/T'],
capture_output=True,
timeout=2
)
else: # Linux/Mac
subprocess.run(['kill', '-9', str(pid)], capture_output=True, timeout=2)
logger.debug("[LLM] Killed claude.exe subprocess PID: %d", pid)
except Exception as e:
logger.debug("[LLM] Failed to kill PID %d: %s", pid, e)
_TRACKED_CLAUDE_PIDS.clear()
# Register cleanup on exit (only kills our tracked subprocesses, not all claude.exe!)
atexit.register(_cleanup_tracked_claude_processes)
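# Note: registration happens where the transport is spawned (see
# _agent_sdk_chat below) -- after transport.connect(), the child PID is
# handed to _register_claude_subprocess(), and this atexit hook reaps
# any subprocesses still alive at interpreter shutdown.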
class LLMInterface:
"""Simple LLM interface supporting Claude and GLM."""
"""LLM interface supporting Claude (Agent SDK or Direct API) and GLM."""
def __init__(
self,
@@ -82,26 +121,27 @@ class LLMInterface:
_API_KEY_ENV_VARS.get(provider, ""),
)
self.client: Optional[Anthropic] = None
# Model will be set after determining mode
# Determine mode (priority: direct API > legacy server > agent SDK)
# Reference to the main asyncio event loop, set by the runtime.
# Used by Agent SDK mode to schedule async work from worker threads
# via asyncio.run_coroutine_threadsafe().
self._event_loop: Optional[asyncio.AbstractEventLoop] = None
# Determine mode (priority: direct API > agent SDK)
if provider == "claude":
if _USE_DIRECT_API:
self.mode = "direct_api"
elif _USE_CLAUDE_CODE_SERVER:
self.mode = "legacy_server"
elif _USE_AGENT_SDK and AGENT_SDK_AVAILABLE:
self.mode = "agent_sdk"
else:
# Fallback to direct API if Agent SDK not available
self.mode = "direct_api"
if _USE_AGENT_SDK and not AGENT_SDK_AVAILABLE:
print("[LLM] Warning: Agent SDK not available, falling back to Direct API")
print("[LLM] Install with: pip install claude-agent-sdk")
else:
self.mode = "direct_api" # Non-Claude providers use direct API
self.mode = "direct_api"
# Usage tracking (disabled when using Agent SDK or legacy server)
# Usage tracking (only for Direct API pay-per-token mode)
self.tracker = UsageTracker() if (track_usage and self.mode == "direct_api") else None
# Set model based on mode
@@ -109,28 +149,125 @@ class LLMInterface:
if self.mode == "agent_sdk":
self.model = _DEFAULT_MODELS.get("claude_agent_sdk", "claude-sonnet-4-5-20250929")
else:
self.model = _DEFAULT_MODELS.get(provider, "claude-haiku-4-5-20251001")
self.model = _DEFAULT_MODELS.get(provider, "claude-sonnet-4-5-20250929")
else:
self.model = _DEFAULT_MODELS.get(provider, "")
# Initialize based on mode
if provider == "claude":
if self.mode == "agent_sdk":
print(f"[LLM] Using Claude Agent SDK (flat-rate subscription) with model: {self.model}")
# No initialization needed - query() is a standalone function
print(f"[LLM] Using Agent SDK (Max subscription) with model: {self.model}")
elif self.mode == "direct_api":
print(f"[LLM] Using Direct API (pay-per-token) with model: {self.model}")
self.client = Anthropic(api_key=self.api_key)
elif self.mode == "legacy_server":
print(f"[LLM] Using Claude Code server at {_CLAUDE_CODE_SERVER_URL} (Pro subscription) with model: {self.model}")
# Verify server is running
try:
response = requests.get(f"{_CLAUDE_CODE_SERVER_URL}/", timeout=2)
response.raise_for_status()
print(f"[LLM] Claude Code server is running: {response.json()}")
except Exception as e:
print(f"[LLM] Warning: Could not connect to Claude Code server: {e}")
print(f"[LLM] Note: Claude Code server mode is deprecated. Using Agent SDK instead.")
def set_event_loop(self, loop: asyncio.AbstractEventLoop) -> None:
"""Store a reference to the main asyncio event loop.
This allows Agent SDK async calls to be scheduled back onto the
main event loop from worker threads (created by asyncio.to_thread).
Must be called from the async context that owns the loop.
"""
self._event_loop = loop
logger.info(
"[LLM] Event loop stored: %s (running=%s)",
type(loop).__name__,
loop.is_running(),
)
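# Illustrative wiring from the runtime (constructor args assumed):
#   llm = LLMInterface(provider="claude")
#   llm.set_event_loop(asyncio.get_running_loop())
# Worker threads can then hand coroutines back to this loop through
# _run_async_from_thread() below.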
@staticmethod
def _clean_claude_env() -> dict:
"""Remove Claude Code session markers from the environment.
The Agent SDK's SubprocessCLITransport copies os.environ into the
child process. If the bot is launched from within a Claude Code
session (or any environment that sets CLAUDECODE), the child
``claude`` CLI detects the nesting and refuses to start with:
"Claude Code cannot be launched inside another Claude Code session."
This method temporarily removes the offending variables and returns
them so the caller can restore them afterwards.
"""
saved = {}
# Keys that signal an active Claude Code parent session.
# CLAUDE_CODE_ENTRYPOINT and CLAUDE_AGENT_SDK_VERSION are set by
# the SDK itself on the child process, so removing them from the
# parent is safe -- the SDK will set them again.
markers = [
"CLAUDECODE",
"CLAUDE_CODE_ENTRYPOINT",
"CLAUDE_AGENT_SDK_VERSION",
"CLAUDE_CODE_ENABLE_SDK_FILE_CHECKPOINTING",
]
for key in markers:
if key in os.environ:
saved[key] = os.environ.pop(key)
if saved:
logger.debug("[LLM] Cleaned Claude session env vars: %s", list(saved.keys()))
return saved
@staticmethod
def _restore_claude_env(saved: dict) -> None:
"""Restore previously removed Claude session env vars."""
os.environ.update(saved)
def _run_async_from_thread(self, coro) -> Any:
"""Run an async coroutine from a synchronous worker thread.
Uses asyncio.run_coroutine_threadsafe() to schedule the coroutine
on the main event loop (if available), which is the correct way to
bridge sync -> async when called from an asyncio.to_thread() worker
or from any background thread (e.g., the scheduler).
Falls back to asyncio.run() if no event loop reference is available
(e.g., direct script usage without the adapter runtime).
Args:
coro: An already-created coroutine object (not a coroutine function).
"""
current_thread = threading.current_thread().name
has_loop = self._event_loop is not None
loop_running = has_loop and self._event_loop.is_running()
if has_loop and loop_running:
logger.info(
"[LLM] _run_async_from_thread: using run_coroutine_threadsafe "
"(thread=%s, loop=%s)",
current_thread,
type(self._event_loop).__name__,
)
# Schedule on the main event loop and block this thread until done.
# This works because:
# 1. asyncio.to_thread() runs us in a thread pool while the main
# loop continues processing other tasks.
# 2. Scheduler threads are plain daemon threads, also not blocking
# the main loop.
# The coroutine executes on the main loop without deadlocking
# because the main loop is free to run while we block here.
future = asyncio.run_coroutine_threadsafe(coro, self._event_loop)
try:
# Block with 10-minute timeout to prevent hangs
# Complex tasks (repo analysis, multi-step operations) can take 5-8 minutes
logger.info("[LLM] Waiting for Agent SDK response (timeout: 600s)...")
result = future.result(timeout=600)
logger.info("[LLM] Agent SDK response received successfully")
return result
except TimeoutError:
logger.error("[LLM] ⚠️ Agent SDK call TIMED OUT after 600 seconds!")
future.cancel() # Cancel the coroutine
raise TimeoutError("Agent SDK call exceeded the 10-minute timeout - task may be too complex")
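# Caveat: future.result() raises concurrent.futures.TimeoutError,
# which is an alias of the builtin TimeoutError caught here only on
# Python 3.11+; on older interpreters this handler would not fire.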
else:
logger.info(
"[LLM] _run_async_from_thread: using asyncio.run() fallback "
"(thread=%s, has_loop=%s, loop_running=%s)",
current_thread,
has_loop,
loop_running,
)
# Fallback: no main loop available (standalone / test usage).
# Create a new event loop in this thread via asyncio.run().
return asyncio.run(coro)
def chat(
self,
@@ -140,44 +277,24 @@ class LLMInterface:
) -> str:
"""Send chat request and get response.
In Agent SDK mode, this uses query() which handles MCP tools automatically.
In Direct API mode, this is a simple messages.create() call without tools.
Raises:
Exception: If the API call fails or returns an unexpected response.
"""
if self.provider == "claude":
# Agent SDK mode (Pro subscription)
if self.mode == "agent_sdk":
try:
# Use anyio.run to create event loop for async SDK
response = anyio.run(
self._agent_sdk_chat,
messages,
system,
max_tokens
logger.info("[LLM] chat: dispatching via Agent SDK")
response = self._run_async_from_thread(
self._agent_sdk_chat(messages, system, max_tokens)
)
return response
except Exception as e:
logger.error("[LLM] Agent SDK error in chat(): %s", e, exc_info=True)
raise Exception(f"Agent SDK error: {e}")
# Legacy Claude Code server (Pro subscription)
elif self.mode == "legacy_server":
try:
payload = {
"messages": [{"role": m["role"], "content": m["content"]} for m in messages],
"system": system,
"max_tokens": max_tokens
}
response = requests.post(
f"{_CLAUDE_CODE_SERVER_URL}/v1/chat",
json=payload,
timeout=120
)
response.raise_for_status()
data = response.json()
return data.get("content", "")
except Exception as e:
raise Exception(f"Claude Code server error: {e}")
# Direct API (pay-per-token)
elif self.mode == "direct_api":
response = self.client.messages.create(
model=self.model,
@@ -186,7 +303,6 @@ class LLMInterface:
messages=messages,
)
# Track usage
if self.tracker and hasattr(response, "usage"):
self.tracker.track(
model=self.model,
@@ -222,177 +338,263 @@ class LLMInterface:
raise ValueError(f"Unsupported provider: {self.provider}")
def _build_agent_sdk_options(self) -> Optional['ClaudeAgentOptions']:
"""Build Agent SDK options with MCP servers and allowed tools.
Returns configured ClaudeAgentOptions, or None if mcp_tools is unavailable.
"""
try:
from mcp_tools import file_system_server
mcp_servers = {"file_system": file_system_server}
# All tools registered in the MCP server
allowed_tools = [
# File and system tools
"read_file",
"write_file",
"edit_file",
"list_directory",
"run_command",
# Web tool
"web_fetch",
# Zettelkasten tools
"fleeting_note",
"daily_note",
"literature_note",
"permanent_note",
"search_vault",
"search_by_tags",
# Google tools (Gmail, Calendar, Contacts)
"get_weather",
"send_email",
"read_emails",
"get_email",
"read_calendar",
"create_calendar_event",
"search_calendar",
"create_contact",
"list_contacts",
"get_contact",
# Gitea tools (private repo access)
"gitea_read_file",
"gitea_list_files",
"gitea_search_code",
"gitea_get_tree",
]
# Conditionally add Obsidian MCP server
try:
from obsidian_mcp import (
is_obsidian_enabled,
check_obsidian_health,
get_obsidian_server_config,
OBSIDIAN_TOOLS,
)
if is_obsidian_enabled() and check_obsidian_health():
obsidian_config = get_obsidian_server_config()
mcp_servers["obsidian"] = obsidian_config
allowed_tools.extend(OBSIDIAN_TOOLS)
print("[LLM] Obsidian MCP server registered (8 tools)")
elif is_obsidian_enabled():
print("[LLM] Obsidian MCP enabled but health check failed")
except ImportError:
pass
except Exception as e:
print(f"[LLM] Obsidian MCP unavailable: {e}")
def _stderr_callback(line: str) -> None:
"""Log Claude CLI stderr for debugging transport failures."""
logger.debug("[CLI stderr] %s", line)
return ClaudeAgentOptions(
mcp_servers=mcp_servers,
allowed_tools=allowed_tools,
permission_mode="bypassPermissions",
max_turns=30, # Prevent infinite tool loops (matches MAX_TOOL_ITERATIONS)
stderr=_stderr_callback,
)
except ImportError:
print("[LLM] Warning: mcp_tools not available, no MCP tools will be registered")
return None
async def _agent_sdk_chat(
self,
messages: List[Dict],
system: Optional[str],
max_tokens: int
) -> str:
"""Internal async method for Agent SDK chat (called via anyio bridge)."""
# Convert messages to SDK format
sdk_messages = []
for msg in messages:
if msg["role"] == "user":
sdk_messages.append(UserMessage(content=msg["content"]))
elif msg["role"] == "assistant":
sdk_messages.append(AssistantMessage(content=msg["content"]))
# Add system message if provided
if system:
sdk_messages.insert(0, SystemMessage(content=system))
# Configure MCP server for file/system tools
try:
from mcp_tools import file_system_server
options = ClaudeAgentOptions(
mcp_servers={"file_system": file_system_server},
# Allow all MCP tools (file/system + web + zettelkasten)
allowed_tools=[
"read_file",
"write_file",
"edit_file",
"list_directory",
"run_command",
"web_fetch",
"fleeting_note",
"daily_note",
"literature_note",
"permanent_note",
"search_vault",
"search_by_tags",
],
)
except ImportError:
# Fallback if mcp_tools not available
options = None
# Call the new query() API
# Note: Agent SDK handles max_tokens internally, don't pass it explicitly
response = await query(
messages=sdk_messages,
options=options,
# model parameter is handled by the SDK based on settings
)
# Extract text from response
if hasattr(response, "content"):
# Handle list of content blocks
if isinstance(response.content, list):
text_parts = []
for block in response.content:
if hasattr(block, "text"):
text_parts.append(block.text)
return "".join(text_parts)
# Handle single text content
elif isinstance(response.content, str):
return response.content
return str(response)
"""Agent SDK chat via custom transport flow.
Uses the SDK's transport and query layers directly instead of the
high-level ``query()`` helper. This works around a bug in
``claude_agent_sdk._internal.client.process_query`` where
``end_input()`` is called immediately after sending the user message
for string prompts. That premature stdin close kills the
bidirectional control channel that SDK MCP servers need to handle
``tools/list`` and ``tools/call`` requests from the CLI subprocess,
resulting in ``CLIConnectionError: ProcessTransport is not ready for
writing``.
Our fix: defer ``end_input()`` until after the first ``ResultMessage``
is received, matching the logic already present in
``Query.stream_input()`` for async-iterable prompts.
"""
import json as _json
# Lazy imports from SDK internals.
from claude_agent_sdk._internal.transport.subprocess_cli import (
SubprocessCLITransport,
)
from claude_agent_sdk._internal.query import Query
from claude_agent_sdk._internal.message_parser import parse_message
# Build the prompt from the system prompt and conversation history.
prompt = self._build_sdk_prompt(messages, system)
options = self._build_agent_sdk_options()
# Clean Claude session env vars so the child CLI process doesn't
# detect a "nested session" and refuse to start.
saved_env = self._clean_claude_env()
try:
# --- 1. Create and connect the subprocess transport. ---
transport = SubprocessCLITransport(prompt=prompt, options=options)
await transport.connect()
# Track the subprocess PID for cleanup on exit
if hasattr(transport, '_process') and transport._process:
_register_claude_subprocess(transport._process.pid)
# --- 2. Extract in-process SDK MCP server instances. ---
sdk_mcp_servers: Dict = {}
if options.mcp_servers and isinstance(options.mcp_servers, dict):
for name, config in options.mcp_servers.items():
if isinstance(config, dict) and config.get("type") == "sdk":
sdk_mcp_servers[name] = config["instance"]
# --- 3. Create the Query object (control-protocol handler). ---
query_obj = Query(
transport=transport,
is_streaming_mode=True,
sdk_mcp_servers=sdk_mcp_servers,
)
try:
# Start the background reader task.
await query_obj.start()
# Perform the initialize handshake with the CLI.
await query_obj.initialize()
# Send the user message over stdin.
user_msg = {
"type": "user",
"session_id": "",
"message": {"role": "user", "content": prompt},
"parent_tool_use_id": None,
}
await transport.write(_json.dumps(user_msg) + "\n")
# **KEY FIX**: Do NOT call end_input() yet. The CLI will
# send MCP control requests (tools/list, tools/call) over
# the bidirectional channel. Closing stdin now would
# prevent us from writing responses back. We wait for the
# first ResultMessage instead.
# --- 4. Consume messages until we get a ResultMessage. ---
result_text = ""
message_count = 0
async for data in query_obj.receive_messages():
message = parse_message(data)
message_count += 1
# Log all message types for debugging hangs
message_type = type(message).__name__
logger.debug(f"[LLM] Received message #{message_count}: {message_type}")
if isinstance(message, ResultMessage):
result_text = message.result or ""
logger.info(
"[LLM] Agent SDK result received after %d messages: cost=$%.4f, turns=%s",
message_count,
getattr(message, "total_cost_usd", 0),
getattr(message, "num_turns", "?"),
)
break
# Log non-result messages to detect loops
if message_count % 10 == 0:
logger.warning(f"[LLM] Still waiting for ResultMessage after {message_count} messages...")
# Now that we have the result, close stdin gracefully.
try:
await transport.end_input()
except Exception:
pass # Process may have already exited; that's fine.
return result_text
finally:
# Always clean up the query/transport.
try:
await query_obj.close()
except Exception:
# Suppress errors during cleanup (e.g. if process
# already exited and there are pending control
# request tasks that can't write back).
pass
finally:
# Always restore env vars, even on error.
self._restore_claude_env(saved_env)
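# Both chat() and chat_with_tools() funnel into this coroutine via
# _run_async_from_thread(), so the deferred end_input() workaround
# above covers every Agent SDK call path.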
async def _agent_sdk_chat_with_tools(
self,
messages: List[Dict],
tools: List[Dict[str, Any]],
system: Optional[str],
max_tokens: int
) -> Message:
"""Internal async method for Agent SDK chat with tools (called via anyio bridge).
NOTE: The new Claude Agent SDK (v0.1.36+) uses MCP servers for tools.
For backward compatibility with the existing tool system, we fall back
to the Direct API for tool calls. This means tool calls will consume API tokens
even when Agent SDK mode is enabled.
Uses Sonnet by default. Opus can be enabled via USE_OPUS_FOR_TOOLS=true for
extremely intensive tasks (only recommended for Agent SDK flat-rate mode).
"""
# Fallback to Direct API for tool calls (SDK tools use MCP servers)
from anthropic import Anthropic
if not self.api_key:
raise ValueError(
"ANTHROPIC_API_KEY required for tool calls in Agent SDK mode. "
"Set the API key in .env or migrate tools to MCP servers."
)
temp_client = Anthropic(api_key=self.api_key)
# Use Opus only if explicitly enabled (for intensive tasks on flat-rate)
# Otherwise default to Sonnet (cost-effective for normal tool operations)
if _USE_OPUS_FOR_TOOLS and self.mode == "agent_sdk":
model = _DEFAULT_MODELS.get("claude_agent_sdk_opus", "claude-opus-4-6")
else:
model = self.model # Use Sonnet (default)
response = temp_client.messages.create(
model=model,
max_tokens=max_tokens,
system=system or "",
messages=messages,
tools=tools,
)
return response
def _convert_sdk_response_to_message(self, sdk_response: Dict[str, Any]) -> Message:
"""Convert Agent SDK response to anthropic.types.Message format.
This ensures compatibility with agent.py's existing tool loop.
"""
# Extract content blocks
content_blocks = []
raw_content = sdk_response.get("content", [])
if isinstance(raw_content, str):
# Simple text response
content_blocks = [TextBlock(type="text", text=raw_content)]
elif isinstance(raw_content, list):
# List of content blocks
for block in raw_content:
if isinstance(block, dict):
if block.get("type") == "text":
content_blocks.append(TextBlock(
type="text",
text=block.get("text", "")
))
elif block.get("type") == "tool_use":
content_blocks.append(ToolUseBlock(
type="tool_use",
id=block.get("id", ""),
name=block.get("name", ""),
input=block.get("input", {})
))
# Extract usage information
usage_data = sdk_response.get("usage", {})
usage = Usage(
input_tokens=usage_data.get("input_tokens", 0),
output_tokens=usage_data.get("output_tokens", 0)
)
# Create Message object
# Note: We create a minimal Message-compatible object
# The Message class from anthropic.types is read-only, so we create a mock
# Capture self.model before defining inner class
model_name = sdk_response.get("model", self.model)
class MessageLike:
def __init__(self, content, stop_reason, usage, model):
self.content = content
self.stop_reason = stop_reason
self.usage = usage
self.id = sdk_response.get("id", "sdk_message")
self.model = model
self.role = "assistant"
self.type = "message"
return MessageLike(
content=content_blocks,
stop_reason=sdk_response.get("stop_reason", "end_turn"),
usage=usage,
model=model_name
)
def _build_sdk_prompt(
self,
messages: List[Dict],
system: Optional[str],
) -> str:
"""Build a prompt string for the Agent SDK query() from conversation history.
The SDK expects a single prompt string. We combine the system prompt
and conversation history into a coherent prompt.
"""
parts = []
if system:
parts.append(f"<system>\n{system}\n</system>\n")
# Include recent conversation history for context
for msg in messages:
content = msg.get("content", "")
role = msg["role"]
if isinstance(content, str):
if role == "user":
parts.append(f"User: {content}")
elif role == "assistant":
parts.append(f"Assistant: {content}")
elif isinstance(content, list):
# Structured content (tool_use/tool_result blocks from Direct API history)
text_parts = []
for block in content:
if isinstance(block, dict):
if block.get("type") == "text":
text_parts.append(block.get("text", ""))
elif block.get("type") == "tool_result":
text_parts.append(f"[Tool result]: {block.get('content', '')}")
elif block.get("type") == "tool_use":
text_parts.append(f"[Used tool: {block.get('name', 'unknown')}]")
elif hasattr(block, "type"):
if block.type == "text":
text_parts.append(block.text)
if text_parts:
if role == "user":
parts.append(f"User: {' '.join(text_parts)}")
elif role == "assistant":
parts.append(f"Assistant: {' '.join(text_parts)}")
return "\n\n".join(parts)
def chat_with_tools(
self,
@@ -401,70 +603,43 @@ class LLMInterface:
system: Optional[str] = None,
max_tokens: int = 16384,
use_cache: bool = False,
) -> Message:
"""Send chat request with tool support. Returns full Message object.
) -> Any:
"""Send chat request with tool support.
In Agent SDK mode: Uses query() with MCP tools. The SDK handles tool
execution automatically. Returns a string (final response after all
tool calls are resolved).
In Direct API mode: Returns an anthropic Message object with potential
tool_use blocks that agent.py processes in a manual loop.
Args:
use_cache: Enable prompt caching for Sonnet models (saves 90% on repeated context)
tools: Tool definitions (used by Direct API; ignored in Agent SDK mode
since tools are registered via MCP servers).
use_cache: Enable prompt caching for Sonnet (Direct API only).
"""
if self.provider != "claude":
raise ValueError("Tool use only supported for Claude provider")
# Agent SDK mode (Pro subscription)
if self.mode == "agent_sdk":
# Agent SDK handles tool calls automatically via MCP servers.
# We use the same query() path as chat(), since MCP tools are
# already registered. The SDK will invoke tools, collect results,
# and return the final text response.
try:
# Use anyio.run to create event loop for async SDK
response = anyio.run(
self._agent_sdk_chat_with_tools,
messages,
tools,
system,
max_tokens
logger.info("[LLM] chat_with_tools: dispatching via Agent SDK")
response = self._run_async_from_thread(
self._agent_sdk_chat(messages, system, max_tokens)
)
return response
except Exception as e:
logger.error("[LLM] Agent SDK error: %s", e, exc_info=True)
raise Exception(f"Agent SDK error: {e}")
# Legacy Claude Code server (Pro subscription)
elif self.mode == "legacy_server":
try:
payload = {
"messages": messages,
"tools": tools,
"system": system,
"max_tokens": max_tokens
}
response = requests.post(
f"{_CLAUDE_CODE_SERVER_URL}/v1/chat/tools",
json=payload,
timeout=120
)
response.raise_for_status()
# Convert response to Message-like object
data = response.json()
# Create a mock Message object with the response
class MockMessage:
def __init__(self, data):
self.content = data.get("content", [])
self.stop_reason = data.get("stop_reason", "end_turn")
self.usage = type('obj', (object,), {
'input_tokens': data.get("usage", {}).get("input_tokens", 0),
'output_tokens': data.get("usage", {}).get("output_tokens", 0)
})
return MockMessage(data)
except Exception as e:
raise Exception(f"Claude Code server error: {e}")
# Direct API (pay-per-token)
elif self.mode == "direct_api":
# Enable caching only for Sonnet models (not worth it for Haiku)
enable_caching = use_cache and "sonnet" in self.model.lower()
# Structure system prompt for optimal caching
if enable_caching and system:
# Convert string to list format with cache control
system_blocks = [
{
"type": "text",
@@ -483,7 +658,6 @@ class LLMInterface:
tools=tools,
)
# Track usage
if self.tracker and hasattr(response, "usage"):
self.tracker.track(
model=self.model,