Add API usage tracking and dynamic task reloading

Features:
- Usage tracking system (usage_tracker.py)
  - Tracks input/output tokens per API call
  - Calculates costs with support for cache pricing
  - Stores data in usage_data.json (gitignored)
  - Integrated into llm_interface.py

- Dynamic task scheduler reloading
  - Auto-detects YAML changes every 60s
  - No restart needed for new tasks
  - reload_tasks() method for manual refresh

- Example cost tracking scheduled task
  - Daily API usage report
  - Budget tracking ($5/month target)
  - Disabled by default in scheduled_tasks.yaml

Improvements:
- Fixed tool_use/tool_result pair splitting bug (CRITICAL)
- Added thread safety to agent.chat()
- Fixed N+1 query problem in hybrid search
- Optimized database batch queries
- Added conversation history pruning (50 messages max)

Updated .gitignore:
- Exclude user profiles (memory_workspace/users/*.md)
- Exclude usage data (usage_data.json)
- Exclude vector index (vectors.usearch)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-13 23:38:44 -07:00
parent ab3a5afd59
commit 8afff96bb5
16 changed files with 1096 additions and 244 deletions

121
agent.py
View File

@@ -1,5 +1,6 @@
"""AI Agent with Memory and LLM Integration."""
import threading
from typing import List, Optional
from heartbeat import Heartbeat
@@ -12,6 +13,8 @@ from tools import TOOL_DEFINITIONS, execute_tool
MAX_CONTEXT_MESSAGES = 3 # Reduced from 5 to save tokens
# Maximum characters of agent response to store in memory
MEMORY_RESPONSE_PREVIEW_LENGTH = 200
# Maximum conversation history entries before pruning
MAX_CONVERSATION_HISTORY = 50
class Agent:
@@ -27,6 +30,7 @@ class Agent:
self.llm = LLMInterface(provider)
self.hooks = HooksSystem()
self.conversation_history: List[dict] = []
self._chat_lock = threading.Lock()
self.memory.sync()
self.hooks.trigger("agent", "startup", {"workspace_dir": workspace_dir})
@@ -37,13 +41,88 @@ class Agent:
self.heartbeat.on_alert = self._on_heartbeat_alert
self.heartbeat.start()
def _get_context_messages(self, max_messages: int) -> List[dict]:
"""Get recent messages without breaking tool_use/tool_result pairs.
Ensures that:
1. A tool_result message always has its preceding tool_use message
2. A tool_use message always has its following tool_result message
3. The first message is never a tool_result without its tool_use
"""
if len(self.conversation_history) <= max_messages:
return list(self.conversation_history)
# Start with the most recent messages
start_idx = len(self.conversation_history) - max_messages
# Track original start_idx before adjustments for end-of-list check
original_start_idx = start_idx
# Check if we split a tool pair at the start
if start_idx > 0:
candidate = self.conversation_history[start_idx]
# If first message is a tool_result, include the tool_use before it
if candidate["role"] == "user" and isinstance(candidate.get("content"), list):
if any(isinstance(block, dict) and block.get("type") == "tool_result"
for block in candidate["content"]):
start_idx -= 1
# Build result slice using adjusted start
result = list(self.conversation_history[start_idx:])
# Check if we split a tool pair at the end
# Use original_start_idx + max_messages to find end of original slice
original_end_idx = original_start_idx + max_messages
if original_end_idx < len(self.conversation_history):
end_msg = self.conversation_history[original_end_idx - 1]
if end_msg["role"] == "assistant" and isinstance(end_msg.get("content"), list):
has_tool_use = any(
(hasattr(block, "type") and block.type == "tool_use") or
(isinstance(block, dict) and block.get("type") == "tool_use")
for block in end_msg["content"]
)
if has_tool_use:
# The tool_result at original_end_idx is already in result
# if start_idx was adjusted, so only add if it's not there
next_msg = self.conversation_history[original_end_idx]
if next_msg not in result:
result.append(next_msg)
return result
def _on_heartbeat_alert(self, message: str) -> None:
"""Handle heartbeat alerts."""
print(f"\nHeartbeat Alert:\n{message}\n")
def _prune_conversation_history(self) -> None:
    """Trim the conversation history so it cannot grow without bound.

    Once the history exceeds MAX_CONVERSATION_HISTORY entries, only the
    newest half of that limit is retained.  The cut point is nudged back
    by one message if it would otherwise orphan a tool_result from its
    tool_use.
    """
    history = self.conversation_history
    if len(history) <= MAX_CONVERSATION_HISTORY:
        return

    # Retain only the most recent half of the allowed window.
    cut = len(history) - MAX_CONVERSATION_HISTORY // 2

    # Walk the cut back one entry if it lands on a user message carrying
    # a tool_result block, so the preceding tool_use stays paired.
    if cut > 0:
        first = history[cut]
        content = first.get("content")
        if first["role"] == "user" and isinstance(content, list) and any(
            isinstance(block, dict) and block.get("type") == "tool_result"
            for block in content
        ):
            cut -= 1

    self.conversation_history = history[cut:]
def chat(self, user_message: str, username: str = "default") -> str:
"""Chat with context from memory and tool use."""
# Handle model switching commands
"""Chat with context from memory and tool use.
Thread-safe: uses a lock to prevent concurrent modification of
conversation history from multiple threads (e.g., scheduled tasks
and live messages).
"""
# Handle model switching commands (no lock needed, read-only on history)
if user_message.lower().startswith("/model "):
model_name = user_message[7:].strip()
self.llm.set_model(model_name)
@@ -66,9 +145,14 @@ class Agent:
f"Commands: /sonnet, /haiku, /status"
)
with self._chat_lock:
return self._chat_inner(user_message, username)
def _chat_inner(self, user_message: str, username: str) -> str:
"""Inner chat logic, called while holding _chat_lock."""
soul = self.memory.get_soul()
user_profile = self.memory.get_user(username)
relevant_memory = self.memory.search(user_message, max_results=2)
relevant_memory = self.memory.search_hybrid(user_message, max_results=2)
memory_lines = [f"- {mem['snippet']}" for mem in relevant_memory]
system = (
@@ -82,18 +166,29 @@ class Agent:
{"role": "user", "content": user_message}
)
# Prune history to prevent unbounded growth
self._prune_conversation_history()
# Tool execution loop
max_iterations = 5 # Reduced from 10 to save costs
# Enable caching for Sonnet to save 90% on repeated system prompts
use_caching = "sonnet" in self.llm.model.lower()
for iteration in range(max_iterations):
response = self.llm.chat_with_tools(
self.conversation_history[-MAX_CONTEXT_MESSAGES:],
tools=TOOL_DEFINITIONS,
system=system,
use_cache=use_caching,
)
# Get recent messages, ensuring we don't break tool_use/tool_result pairs
context_messages = self._get_context_messages(MAX_CONTEXT_MESSAGES)
try:
response = self.llm.chat_with_tools(
context_messages,
tools=TOOL_DEFINITIONS,
system=system,
use_cache=use_caching,
)
except Exception as e:
error_msg = f"LLM API error: {e}"
print(f"[Agent] {error_msg}")
return f"Sorry, I encountered an error communicating with the AI model. Please try again."
# Check stop reason
if response.stop_reason == "end_turn":
@@ -104,6 +199,11 @@ class Agent:
text_content.append(block.text)
final_response = "\n".join(text_content)
# Handle empty response
if not final_response.strip():
final_response = "(No response generated)"
self.conversation_history.append(
{"role": "assistant", "content": final_response}
)
@@ -146,6 +246,9 @@ class Agent:
tool_results = []
for tool_use in tool_uses:
result = execute_tool(tool_use.name, tool_use.input)
# Truncate large tool outputs to prevent token explosion
if len(result) > 5000:
result = result[:5000] + "\n... (output truncated)"
print(f"[Tool] {tool_use.name}: {result[:100]}...")
tool_results.append({
"type": "tool_result",