diff --git a/MEMORY_IMPLEMENTATION_GUIDE.md b/MEMORY_IMPLEMENTATION_GUIDE.md
deleted file mode 100644
index b0e8fd0..0000000
--- a/MEMORY_IMPLEMENTATION_GUIDE.md
+++ /dev/null
@@ -1,656 +0,0 @@
-# Quick Implementation Guide: Rolling Summary Memory
-
-## TL;DR
-
-**Problem:** Sending the full conversation history with every request wastes tokens and adds latency.
-
-**Solution:** Rolling summary approach - keep recent messages + LLM-generated summary of older messages.
-
-**Result:** ~73% token reduction for long conversations, zero dependencies, works with current stack.
-
----
-
-## Architecture
-
-```
-SQLite History (per user)
-    ↓
-Messages 1-10: Summarized → "User asked about weather, discussed outdoor plans"
-Messages 11-18: Sent raw  → Full context
-    ↓
-LLM receives: System prompt + Summary + Recent 8 messages
-    ↓
-Response generated
-```
-
----
-
-## Files to Create/Modify
-
-### 1. Create `meshai/memory.py`
-
-```python
-"""Lightweight rolling summary memory manager."""
-
-import time
-from dataclasses import dataclass
-from typing import Optional
-
-from openai import AsyncOpenAI
-
-
-@dataclass
-class ConversationSummary:
-    """Summary of conversation history."""
-
-    summary: str
-    last_updated: float
-    message_count: int
-
-
-class RollingSummaryMemory:
-    """Manages conversation summaries with recent message window.
-
-    Strategy:
-    - Keep last N message pairs (window_size) in full
-    - Summarize everything before the window
-    - Update summary when old messages accumulate
-
-    Example (window_size=4):
-        Messages 1-10: Summarized to "User discussed weather and plans"
-        Messages 11-18: Kept in full (last 4 pairs)
-        Context sent: [Summary] + [Messages 11-18]
-    """
-
-    def __init__(
-        self,
-        client: AsyncOpenAI,
-        model: str,
-        window_size: int = 4,
-        summarize_threshold: int = 8,
-    ):
-        """Initialize rolling summary memory.
-
-        Args:
-            client: AsyncOpenAI client for generating summaries
-            model: Model name to use for summarization
-            window_size: Number of recent message pairs to keep in full
-            summarize_threshold: Messages to accumulate before re-summarizing
-        """
-        self._client = client
-        self._model = model
-        self._window_size = window_size
-        self._summarize_threshold = summarize_threshold
-
-        # In-memory cache of summaries (loaded from DB on startup)
-        self._summaries: dict[str, ConversationSummary] = {}
-
-    async def get_context_messages(
-        self,
-        user_id: str,
-        full_history: list[dict],
-    ) -> tuple[Optional[str], list[dict]]:
-        """Get optimized context: summary + recent messages.
-
-        Args:
-            user_id: User identifier
-            full_history: Full message history from database
-
-        Returns:
-            Tuple of (summary_text, recent_messages)
-            summary_text is None if conversation is short
-        """
-        # Short conversation - no summary needed
-        if len(full_history) <= self._window_size * 2:
-            return None, full_history
-
-        # Split into old (to summarize) and recent (keep raw)
-        split_point = -(self._window_size * 2)
-        old_messages = full_history[:split_point]
-        recent_messages = full_history[split_point:]
-
-        # Get or create summary
-        summary = await self._get_or_create_summary(user_id, old_messages)
-
-        return summary.summary, recent_messages
-
-    async def _get_or_create_summary(
-        self,
-        user_id: str,
-        messages: list[dict],
-    ) -> ConversationSummary:
-        """Get cached summary or create new one."""
-        # Check cache
-        if user_id in self._summaries:
-            cached = self._summaries[user_id]
-
-            # Reuse if message count is close
-            if abs(cached.message_count - len(messages)) < self._summarize_threshold:
-                return cached
-
-        # Generate new summary
-        summary_text = await self._summarize(messages)
-
-        summary = ConversationSummary(
-            summary=summary_text,
-            last_updated=time.time(),
-            message_count=len(messages),
-        )
-
-        self._summaries[user_id] = summary
-        return summary
-
-    async def _summarize(self, messages: list[dict]) -> str:
-        """Generate summary using LLM."""
-        # Format conversation
-        conversation = "\n".join(
-            [f"{msg['role'].upper()}: {msg['content']}" for msg in messages]
-        )
-
-        prompt = f"""Summarize this conversation in 2-3 concise sentences. Focus on:
-- Main topics discussed
-- Important context or user preferences
-- Key information to remember
-
-Conversation:
-{conversation}
-
-Summary (2-3 sentences):"""
-
-        try:
-            response = await self._client.chat.completions.create(
-                model=self._model,
-                messages=[{"role": "user", "content": prompt}],
-                max_tokens=150,
-                temperature=0.3,
-            )
-
-            return response.choices[0].message.content.strip()
-
-        except Exception as e:
-            # Fallback
-            return f"Previous conversation: {len(messages)} messages about various topics."
-
-    def load_summary(self, user_id: str, summary: ConversationSummary) -> None:
-        """Load summary from database into cache."""
-        self._summaries[user_id] = summary
-
-    def clear_summary(self, user_id: str) -> None:
-        """Clear cached summary for user."""
-        self._summaries.pop(user_id, None)
-```
-
----
-
-### 2. Modify `meshai/history.py`
-
-Add summary storage methods:
-
-```python
-# Add to ConversationHistory class
-
-async def initialize(self) -> None:
-    """Initialize database and create tables."""
-    self._db = await aiosqlite.connect(self._db_path)
-
-    # Existing conversations table
-    await self._db.execute("""
-        CREATE TABLE IF NOT EXISTS conversations (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            user_id TEXT NOT NULL,
-            role TEXT NOT NULL,
-            content TEXT NOT NULL,
-            timestamp REAL NOT NULL
-        )
-    """)
-
-    await self._db.execute("""
-        CREATE INDEX IF NOT EXISTS idx_user_timestamp
-        ON conversations (user_id, timestamp)
-    """)
-
-    # NEW: Summaries table
-    await self._db.execute("""
-        CREATE TABLE IF NOT EXISTS conversation_summaries (
-            user_id TEXT PRIMARY KEY,
-            summary TEXT NOT NULL,
-            message_count INTEGER NOT NULL,
-            updated_at REAL NOT NULL
-        )
-    """)
-
-    await self._db.commit()
-    logger.info(f"Conversation history initialized at {self._db_path}")
-
-
-async def store_summary(
-    self, user_id: str, summary: str, message_count: int
-) -> None:
-    """Store conversation summary.
-
-    Args:
-        user_id: Node ID of user
-        summary: Summary text
-        message_count: Number of messages summarized
-    """
-    if not self._db:
-        raise RuntimeError("Database not initialized")
-
-    async with self._lock:
-        await self._db.execute(
-            """
-            INSERT OR REPLACE INTO conversation_summaries
-            (user_id, summary, message_count, updated_at)
-            VALUES (?, ?, ?, ?)
-            """,
-            (user_id, summary, message_count, time.time()),
-        )
-        await self._db.commit()
-
-
-async def get_summary(self, user_id: str) -> Optional[dict]:
-    """Get conversation summary for user.
-
-    Args:
-        user_id: Node ID of user
-
-    Returns:
-        Dict with 'summary', 'message_count', 'updated_at' or None
-    """
-    if not self._db:
-        raise RuntimeError("Database not initialized")
-
-    async with self._lock:
-        cursor = await self._db.execute(
-            """
-            SELECT summary, message_count, updated_at
-            FROM conversation_summaries
-            WHERE user_id = ?
-            """,
-            (user_id,),
-        )
-        row = await cursor.fetchone()
-
-    if not row:
-        return None
-
-    return {
-        "summary": row[0],
-        "message_count": row[1],
-        "updated_at": row[2],
-    }
-
-
-async def clear_summary(self, user_id: str) -> None:
-    """Clear summary for user (e.g., on history reset).
-
-    Args:
-        user_id: Node ID of user
-    """
-    if not self._db:
-        raise RuntimeError("Database not initialized")
-
-    async with self._lock:
-        await self._db.execute(
-            "DELETE FROM conversation_summaries WHERE user_id = ?",
-            (user_id,),
-        )
-        await self._db.commit()
-```
-
----
-
-### 3. Modify `meshai/backends/openai_backend.py`
-
-Integrate memory manager:
-
-```python
-"""OpenAI-compatible LLM backend with rolling summary memory."""
-
-import logging
-from typing import Optional
-
-from openai import AsyncOpenAI
-
-from ..config import LLMConfig
-from ..memory import RollingSummaryMemory
-from .base import LLMBackend
-
-logger = logging.getLogger(__name__)
-
-
-class OpenAIBackend(LLMBackend):
-    """OpenAI-compatible backend with intelligent memory management."""
-
-    def __init__(self, config: LLMConfig, api_key: str):
-        """Initialize OpenAI backend.
-
-        Args:
-            config: LLM configuration
-            api_key: API key to use
-        """
-        self.config = config
-        self._client = AsyncOpenAI(
-            api_key=api_key,
-            base_url=config.base_url,
-        )
-
-        # Initialize rolling summary memory
-        self._memory = RollingSummaryMemory(
-            client=self._client,
-            model=config.model,
-            window_size=4,  # Keep last 4 exchanges (8 messages)
-            summarize_threshold=8,  # Re-summarize after 8 new messages
-        )
-
-    async def generate(
-        self,
-        messages: list[dict],
-        system_prompt: str,
-        user_id: str = None,  # NEW: optional for backward compatibility
-        max_tokens: int = 300,
-    ) -> str:
-        """Generate a response using OpenAI-compatible API.
-
-        Args:
-            messages: Conversation history
-            system_prompt: System prompt
-            user_id: User identifier (for memory management)
-            max_tokens: Maximum tokens to generate
-
-        Returns:
-            Generated response
-        """
-        # If no user_id, use old behavior (send full history)
-        if not user_id:
-            full_messages = [{"role": "system", "content": system_prompt}]
-            full_messages.extend(messages)
-        else:
-            # Use memory manager to optimize context
-            summary, recent_messages = await self._memory.get_context_messages(
-                user_id=user_id,
-                full_history=messages,
-            )
-
-            # Build optimized message list
-            if summary:
-                # Long conversation: system + summary + recent
-                enhanced_system = f"""{system_prompt}
-
-Previous conversation summary: {summary}"""
-                full_messages = [{"role": "system", "content": enhanced_system}]
-                full_messages.extend(recent_messages)
-
-                logger.debug(
-                    f"Using summary + {len(recent_messages)} recent messages "
-                    f"(total history: {len(messages)})"
-                )
-            else:
-                # Short conversation: system + all messages
-                full_messages = [{"role": "system", "content": system_prompt}]
-                full_messages.extend(messages)
-
-        try:
-            response = await self._client.chat.completions.create(
-                model=self.config.model,
-                messages=full_messages,
-                max_tokens=max_tokens,
-                temperature=0.7,
-            )
-
-            content = response.choices[0].message.content
-            return content.strip() if content else ""
-
-        except Exception as e:
-            logger.error(f"OpenAI API error: {e}")
-            raise
-
-    def load_summary_cache(self, user_id: str, summary_data: dict) -> None:
-        """Load summary into memory cache (called on startup).
-
-        Args:
-            user_id: User identifier
-            summary_data: Dict with 'summary', 'message_count', 'updated_at'
-        """
-        from ..memory import ConversationSummary
-
-        summary = ConversationSummary(
-            summary=summary_data["summary"],
-            message_count=summary_data["message_count"],
-            last_updated=summary_data["updated_at"],
-        )
-        self._memory.load_summary(user_id, summary)
-
-    def clear_summary_cache(self, user_id: str) -> None:
-        """Clear summary cache for user."""
-        self._memory.clear_summary(user_id)
-
-    # ... rest of methods unchanged ...
-```
-
----
-
-### 4. Modify `meshai/responder.py`
-
-Pass user_id to backend and persist summaries:
-
-```python
-# In the generate_response method
-
-async def generate_response(self, user_id: str, message: str) -> str:
-    """Generate LLM response with optimized memory."""
-
-    # Add user message to history
-    await self.history.add_message(user_id, "user", message)
-
-    # Get conversation history
-    history = await self.history.get_history_for_llm(user_id)
-
-    # Generate response with user_id for memory management
-    response = await self.backend.generate(
-        messages=history,
-        system_prompt=self.system_prompt,
-        user_id=user_id,  # NEW: enables memory optimization
-        max_tokens=300,
-    )
-
-    # Add assistant response to history
-    await self.history.add_message(user_id, "assistant", response)
-
-    # Persist summary if one was created
-    # The memory manager caches it, we need to save to DB
-    summary_data = await self._get_current_summary(user_id)
-    if summary_data:
-        await self.history.store_summary(
-            user_id,
-            summary_data["summary"],
-            summary_data["message_count"],
-        )
-
-    return response
-
-
-async def _get_current_summary(self, user_id: str) -> Optional[dict]:
-    """Get current summary from memory manager if it exists."""
-    # Access the memory manager's cache
-    if hasattr(self.backend, "_memory"):
-        summary = self.backend._memory._summaries.get(user_id)
-        if summary:
-            return {
-                "summary": summary.summary,
-                "message_count": summary.message_count,
-                "updated_at": summary.last_updated,
-            }
-    return None
-```
-
----
-
-### 5. Modify `meshai/commands/reset.py`
-
-Clear summaries when resetting history:
-
-```python
-async def execute(self, sender_id: str, args: list[str]) -> str:
-    """Reset conversation history."""
-    count = await self.responder.history.clear_history(sender_id)
-
-    # NEW: Also clear summary
-    await self.responder.history.clear_summary(sender_id)
-    if hasattr(self.responder.backend, "clear_summary_cache"):
-        self.responder.backend.clear_summary_cache(sender_id)
-
-    return f"Cleared {count} messages from your history."
-```
-
----
-
-## Configuration
-
-Add to `meshai/config.py`:
-
-```python
-@dataclass
-class MemoryConfig:
-    """Memory management configuration."""
-
-    # Rolling summary settings
-    window_size: int = 4  # Recent message pairs to keep
-    summarize_threshold: int = 8  # Messages before re-summarizing
-
-    # When to enable summaries
-    min_messages_for_summary: int = 10  # Start summarizing after this many
-```
-
----
-
-## Testing
-
-```python
-# Test script
-import asyncio
-from meshai.backends.openai_backend import OpenAIBackend
-from meshai.config import LLMConfig
-
-async def test():
-    config = LLMConfig(
-        backend="openai",
-        base_url="http://192.168.1.239:8000/v1",
-        model="gpt-4o-mini"
-    )
-
-    backend = OpenAIBackend(config, "your-key")
-
-    # Simulate long conversation
-    messages = []
-    for i in range(20):
-        messages.append({"role": "user", "content": f"Question {i}"})
-        messages.append({"role": "assistant", "content": f"Answer {i}"})
-
-    # Generate - should use summary
-    response = await backend.generate(
-        messages=messages,
-        system_prompt="You are helpful.",
-        user_id="!test123",
-        max_tokens=100
-    )
-
-    print(f"Response: {response}")
-    print(f"Sent {len(messages)} messages, but only ~10 used in context")
-
-asyncio.run(test())
-```
-
----
-
-## Expected Results
-
-### Token Usage Comparison
-
-**Before (full history):**
-```
-User message 1-20: ~2000 tokens
-System prompt: ~50 tokens
-Total: ~2050 tokens per request
-```
-
-**After (with summary):**
-```
-System prompt: ~50 tokens
-Summary: ~100 tokens
-Recent 8 messages: ~400 tokens
-Total: ~550 tokens per request
-```
-
-**Savings: ~73% token reduction**
-
-### Performance Impact
-
-- **Summary generation**: ~1-2s every 8-10 messages (amortized)
-- **Regular requests**: No added latency
-- **Storage**: ~100 bytes per summary in SQLite
-
----
-
-## Tuning Parameters
-
-### window_size
-- **Smaller (2-3)**: More aggressive summarization, max token savings
-- **Larger (5-6)**: More context, less summarization
-- **Recommended**: 4 (last 4 exchanges = 8 messages)
-
-### summarize_threshold
-- **Smaller (4-6)**: Frequent re-summarization, more current
-- **Larger (10-12)**: Less summarization overhead
-- **Recommended**: 8 (re-summarize after 8 new messages)
-
-### For MeshAI specifically:
-- Messages are tiny (150 chars max)
-- `window_size=4` gives ~600 chars of recent context
-- `summarize_threshold=8` balances overhead vs accuracy
-
----
-
-## Migration Path
-
-1. **Phase 1**: Add code, test with new users
-2. **Phase 2**: Run in parallel (old + new backend)
-3. **Phase 3**: Migrate existing users (generate summaries for existing history)
-4. **Phase 4**: Remove old full-history code path
-
-No data loss - summaries stored in DB, can regenerate anytime.
-
----
-
-## Maintenance
-
-### Monitor summary quality:
-```sql
--- Check summaries
-SELECT user_id, summary, message_count, updated_at
-FROM conversation_summaries
-ORDER BY updated_at DESC;
-```
-
-### Regenerate summary:
-```python
-# Clear cache + DB, will regenerate on next request
-await history.clear_summary(user_id)
-backend.clear_summary_cache(user_id)
-```
-
-### Adjust if summaries too short/long:
-- Modify prompt in `_summarize()`
-- Adjust `max_tokens=150` for summaries
-- Change temperature (lower = more consistent)
-
----
-
-## Future Enhancements
-
-1. **Hybrid approach**: Summary + semantic search for very long histories
-2. **User preferences**: Store separate from summary (e.g., "likes weather in metric")
-3. **Multi-level summaries**: Summarize summaries for years-long conversations
-4. **Summary quality scoring**: Validate summaries maintain key information
-
-But start simple - this gets 80% of the benefit with 20% of the complexity.
diff --git a/MEMORY_README.md b/MEMORY_README.md
deleted file mode 100644
index fbb8c17..0000000
--- a/MEMORY_README.md
+++ /dev/null
@@ -1,437 +0,0 @@
-# LLM Conversation Memory Research & Implementation
-
-This directory contains comprehensive research and implementation guides for improving LLM conversation memory in MeshAI.
-
-## Problem Statement
-
-MeshAI currently sends the full conversation history with every LLM API call. This approach:
-- Wastes tokens (expensive and slow)
-- Doesn't scale to long conversations
-- Sends redundant context the LLM doesn't need
-
-## Solution: Rolling Summary Memory
-
-Keep recent messages in full + LLM-generated summary of older messages.
-
-**Result:** 70-80% token reduction, zero dependencies, works with existing stack.
-
----
-
-## Documentation Index
-
-### 1. Quick Start
-
-**READ THIS FIRST:** [`MEMORY_SUMMARY.md`](MEMORY_SUMMARY.md)
-- High-level overview
-- Why rolling summary?
-- Comparison with alternatives
-- Expected performance gains
-
-**Estimated reading time:** 10 minutes
-
----
-
-### 2. Detailed Research
-
-**FOR DEEP DIVE:** [`MEMORY_RESEARCH.md`](MEMORY_RESEARCH.md)
-- Full evaluation of 5 approaches:
-  1. LangChain Memory modules
-  2. LlamaIndex
-  3. MemGPT/Letta
-  4. Vector stores (ChromaDB/Qdrant)
-  5. Simple rolling summary (DIY)
-- Code examples for each approach
-- Pros/cons for MeshAI specifically
-- Detailed comparison matrix
-
-**Estimated reading time:** 30-45 minutes
-
----
-
-### 3. Implementation Guide
-
-**FOR BUILDING:** [`MEMORY_IMPLEMENTATION_GUIDE.md`](MEMORY_IMPLEMENTATION_GUIDE.md)
-- Step-by-step implementation
-- Complete code examples
-- Database schema
-- Configuration options
-- Testing procedures
-- Troubleshooting guide
-
-**Estimated reading time:** 20 minutes + implementation time
-
----
-
-### 4. Implementation Diff
-
-**FOR EXACT CHANGES:** [`docs/IMPLEMENTATION_DIFF.md`](docs/IMPLEMENTATION_DIFF.md)
-- Exact code diffs for all files
-- Line-by-line changes needed
-- Migration checklist
-- Rollback plan
-- Performance validation queries
-
-**Estimated reading time:** 15 minutes
-
----
-
-### 5. Visual Comparison
-
-**FOR UNDERSTANDING:** [`docs/memory_approaches_comparison.txt`](docs/memory_approaches_comparison.txt)
-- ASCII diagrams of all approaches
-- Visual token usage comparison
-- Decision matrices
-- Architecture diagrams
-
-**Estimated reading time:** 10 minutes
-
----
-
-### 6. Quick Reference
-
-**FOR CHEAT SHEET:** [`docs/QUICK_REFERENCE.md`](docs/QUICK_REFERENCE.md)
-- One-page reference card
-- Key configuration
-- Code snippets
-- Performance metrics
-- Troubleshooting tips
-
-**Estimated reading time:** 5 minutes
-
----
-
-### 7. Proof of Concept
-
-**FOR TESTING:** [`examples/memory_comparison.py`](examples/memory_comparison.py)
-- Runnable comparison script
-- Tests all 3 approaches side-by-side:
-  - Full history (baseline)
-  - Rolling summary
-  - Window-only
-- Real token usage measurements
-- Performance comparison
-
-**Usage:**
-```bash
-# Edit script with your LLM endpoint
-nano examples/memory_comparison.py
-# Update BASE_URL, API_KEY, MODEL
-
-# Run comparison
-python examples/memory_comparison.py
-```
-
-**Expected output:**
-```
-Approach             Tokens          Time       Savings
-----------------------------------------------------------------------
-Full History         1847            2.34s      (baseline)
-Rolling Summary      512             1.87s      72.3%
-Window Only          398             1.45s      78.4%
-
-RECOMMENDATION: Rolling Summary - best balance of context and efficiency
-```
-
----
-
-## Recommended Reading Path
-
-### Path 1: Executive Summary (20 minutes)
-1. `MEMORY_SUMMARY.md` - Overview
-2. `docs/QUICK_REFERENCE.md` - Cheat sheet
-3. `examples/memory_comparison.py` - Run the test
-
-**Decision point:** Convinced? Proceed to implementation.
-
----
-
-### Path 2: Technical Deep Dive (60 minutes)
-1. `MEMORY_SUMMARY.md` - Overview
-2. `MEMORY_RESEARCH.md` - Full evaluation
-3. `docs/memory_approaches_comparison.txt` - Visual diagrams
-4. `examples/memory_comparison.py` - Run the test
-5. `MEMORY_IMPLEMENTATION_GUIDE.md` - How to build it
-
-**Decision point:** Ready to implement? Use the diff guide.
-
----
-
-### Path 3: Implementation (2-3 hours)
-1. `MEMORY_SUMMARY.md` - Refresh on approach
-2. `MEMORY_IMPLEMENTATION_GUIDE.md` - Full implementation guide
-3. `docs/IMPLEMENTATION_DIFF.md` - Exact changes needed
-4. Code the changes
-5. Test with `examples/memory_comparison.py`
-6. Deploy and monitor
-
-**Outcome:** Production-ready rolling summary memory.
-
----
-
-## Files Created
-
-### Documentation
-```
-/home/zvx/projects/meshai/
-├── MEMORY_README.md (this file)
-├── MEMORY_SUMMARY.md (overview)
-├── MEMORY_RESEARCH.md (detailed research)
-├── MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step)
-├── docs/
-│   ├── IMPLEMENTATION_DIFF.md (exact changes)
-│   ├── memory_approaches_comparison.txt (diagrams)
-│   └── QUICK_REFERENCE.md (cheat sheet)
-└── examples/
-    └── memory_comparison.py (proof of concept)
-```
-
-### Code to Create (not yet created)
-```
-meshai/
-├── memory.py (NEW - ~100 lines)
-├── history.py (MODIFY - add ~70 lines)
-├── backends/
-│   └── openai_backend.py (MODIFY - add ~30 lines)
-├── responder.py (MODIFY - add ~10 lines)
-└── commands/
-    └── reset.py (MODIFY - add ~4 lines)
-```
-
-**Total new code:** ~214 lines
-**Dependencies added:** 0
-
----
-
-## Key Metrics
-
-### Token Savings
-
-| Conversation Length | Before | After | Savings |
-|---------------------|--------|-------|---------|
-| 10 messages | 800 | 800 | 0% |
-| 20 messages | 1600 | 550 | 66% |
-| 30 messages | 2400 | 600 | 75% |
-| 50 messages | 4000 | 650 | 84% |
-
-### Cost Impact
-
-**Assumptions:**
-- $0.50 per 1M input tokens
-- 1000 requests per day
-- Average 30 messages per conversation
-
-**Before:** $36/month
-**After:** $9/month
-**Savings:** $27/month (75% reduction)
-
-### Implementation Effort
-
-- Code to write: ~214 lines
-- Code to modify: ~57 lines
-- Time estimate: 2-3 hours
-- Testing: 1 hour
-- **Total:** Half a day
-
-### Risk Assessment
-
-- **Low risk:** Backward compatible (user_id parameter optional)
-- **No data loss:** New table, existing data untouched
-- **Easy rollback:** Git revert + drop one table
-- **No dependencies:** Pure Python, existing libraries only
-
----
-
-## Configuration Summary
-
-### Recommended for MeshAI
-
-```python
-RollingSummaryMemory(
-    client=self._client,
-    model=config.model,
-    window_size=4,           # Keep last 4 exchanges (8 messages)
-    summarize_threshold=8,   # Re-summarize after 8 new messages
-)
-```
-
-**Rationale:**
-- MeshAI messages are tiny (150 chars max)
-- window_size=4 gives ~600 chars of recent context
-- summarize_threshold=8 balances overhead vs freshness
-- Tune based on actual usage patterns
-
-### Alternative Configurations
-
-**For longer messages:**
-```python
-window_size=3,           # Less recent context needed
-summarize_threshold=6,   # More frequent updates
-```
-
-**For very short messages:**
-```python
-window_size=6,           # More recent context
-summarize_threshold=10,  # Less frequent summarization
-```
-
----
-
-## Database Schema
-
-### New Table
-
-```sql
-CREATE TABLE conversation_summaries (
-    user_id TEXT PRIMARY KEY,
-    summary TEXT NOT NULL,
-    message_count INTEGER NOT NULL,
-    updated_at REAL NOT NULL
-);
-```
-
-### Existing Tables (unchanged)
-
-```sql
-CREATE TABLE conversations (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-    user_id TEXT NOT NULL,
-    role TEXT NOT NULL,
-    content TEXT NOT NULL,
-    timestamp REAL NOT NULL
-);
-
-CREATE INDEX idx_user_timestamp ON conversations (user_id, timestamp);
-```
-
----
-
-## Testing Checklist
-
-- [ ] Database migration works (new table created)
-- [ ] Short conversations (<10 messages) use full history
-- [ ] Long conversations (>10 messages) use summaries
-- [ ] Summaries are stored in database
-- [ ] Summaries persist across restarts
-- [ ] Reset command clears summaries
-- [ ] Token usage reduced by 70%+ for long convos
-- [ ] No errors in logs
-- [ ] Response quality maintained
-
----
-
-## Monitoring Queries
-
-### Check summary coverage
-```sql
-SELECT
-    (SELECT COUNT(DISTINCT user_id) FROM conversation_summaries) * 100.0 /
-    (SELECT COUNT(DISTINCT user_id) FROM conversations) as coverage_pct;
-```
-
-### Average messages per summary
-```sql
-SELECT AVG(message_count) FROM conversation_summaries;
-```
-
-### Recent summaries
-```sql
-SELECT user_id, summary, message_count,
-       datetime(updated_at, 'unixepoch') as updated
-FROM conversation_summaries
-ORDER BY updated_at DESC
-LIMIT 10;
-```
-
----
-
-## Troubleshooting
-
-### Summary not being created
-
-**Check:** Conversation long enough?
-```sql
-SELECT user_id, COUNT(*) as msg_count
-FROM conversations
-GROUP BY user_id
-HAVING msg_count > 10;
-```
-
-**Fix:** Need >10 messages before summary kicks in.
-
-### Summary quality poor
-
-**Check:** Look at actual summaries
-```sql
-SELECT summary FROM conversation_summaries;
-```
-
-**Fix:** Adjust prompt in `memory.py` `_summarize()` method.
-
-### Token usage still high
-
-**Check:** Verify memory is being used
-```bash
-# Look for log line:
-# "Using summary + 8 recent messages (total history: 24)"
-```
-
-**Fix:** Ensure `user_id` is being passed to `backend.generate()`.
-
-### Database errors
-
-**Check:** Table exists
-```sql
-.tables
-```
-
-**Fix:** Drop and recreate
-```sql
-DROP TABLE IF EXISTS conversation_summaries;
--- Restart app to recreate
-```
-
----
-
-## Next Steps
-
-1. **Understand:** Read `MEMORY_SUMMARY.md`
-2. **Evaluate:** Review `MEMORY_RESEARCH.md` for alternatives
-3. **Test:** Run `examples/memory_comparison.py` with your LLM
-4. **Implement:** Follow `MEMORY_IMPLEMENTATION_GUIDE.md`
-5. **Deploy:** Use `docs/IMPLEMENTATION_DIFF.md` for exact changes
-6. **Monitor:** Check database and logs for summary generation
-7. **Tune:** Adjust `window_size` and `summarize_threshold` as needed
-
----
-
-## Support
-
-If you have questions or issues:
-
-1. Check the troubleshooting section in this file
-2. Review `docs/QUICK_REFERENCE.md` for common issues
-3. Look at the detailed implementation guide
-4. Check the proof-of-concept script for working examples
-
----
-
-## Conclusion
-
-Rolling summary memory provides:
-- **Massive efficiency gains** (70-80% token reduction)
-- **Zero dependencies** (pure Python)
-- **Simple implementation** (~200 lines)
-- **Production ready** (tested approach)
-- **Backward compatible** (optional user_id)
-- **Easy to maintain** (clear, documented code)
-
-**Recommendation:** Implement this for MeshAI. It's the right balance of simplicity and effectiveness.
-
-Good luck! The documentation is comprehensive - you have everything needed to succeed.
-
----
-
-**Research completed:** 2025-12-15
-**Total documentation:** 7 files, ~1500 lines
-**Implementation effort:** ~3 hours
-**Expected ROI:** $324/year in token savings (at modest 1000 req/day)
diff --git a/MEMORY_RESEARCH.md b/MEMORY_RESEARCH.md
deleted file mode 100644
index 639a03a..0000000
--- a/MEMORY_RESEARCH.md
+++ /dev/null
@@ -1,1024 +0,0 @@
-# LLM Conversation Memory Research for MeshAI
-
-## Current Implementation Analysis
-
-**Current approach:** MeshAI stuffs full conversation history into every LLM API call
-- Storage: SQLite via aiosqlite
-- Retrieval: `get_history_for_llm()` returns all messages (up to `max_messages_per_user * 2`)
-- Backend: OpenAI-compatible API (works with LiteLLM, local models)
-- Context: 150 char max per message, per-user conversations
-
-**Problem:** Inefficient - sends entire history even when unnecessary, wastes tokens and latency.
-
----
-
-## 1. LangChain Memory Modules
-
-### Installation
-```bash
-pip install langchain langchain-community langchain-openai
-```
-
-### A. ConversationBufferMemory (Simplest)
-
-**What it does:** Stores raw messages in memory, returns all messages.
-
-```python
-from langchain.memory import ConversationBufferMemory
-from langchain_openai import ChatOpenAI
-from langchain.chains import ConversationChain
-
-# Initialize
-llm = ChatOpenAI(
-    base_url="http://192.168.1.239:8000/v1",  # LiteLLM
-    api_key="your-key",
-    model="gpt-4o-mini"
-)
-
-memory = ConversationBufferMemory()
-
-chain = ConversationChain(
-    llm=llm,
-    memory=memory,
-    verbose=False
-)
-
-# Use it
-response = chain.predict(input="What's the weather?")
-print(response)
-
-# Access history
-print(memory.load_memory_variables({}))
-# {'history': "Human: What's the weather?\nAI: ..."}
-```
-
-**Integration with MeshAI:**
-```python
-# In meshai/backends/openai_backend.py
-from langchain.memory import ConversationBufferMemory
-from langchain_openai import ChatOpenAI
-from langchain.chains import ConversationChain
-
-class OpenAIBackendWithMemory(LLMBackend):
-    def __init__(self, config: LLMConfig, api_key: str):
-        self.config = config
-        self._llm = ChatOpenAI(
-            base_url=config.base_url,
-            api_key=api_key,
-            model=config.model,
-            temperature=0.7,
-            max_tokens=300
-        )
-        # Per-user memory storage
-        self._user_memories: dict[str, ConversationBufferMemory] = {}
-
-    def _get_memory(self, user_id: str) -> ConversationBufferMemory:
-        if user_id not in self._user_memories:
-            self._user_memories[user_id] = ConversationBufferMemory()
-        return self._user_memories[user_id]
-
-    async def generate(
-        self,
-        messages: list[dict],
-        system_prompt: str,
-        user_id: str,  # NEW: need user_id for memory
-        max_tokens: int = 300,
-    ) -> str:
-        memory = self._get_memory(user_id)
-
-        # Create chain with memory
-        chain = ConversationChain(
-            llm=self._llm,
-            memory=memory,
-            verbose=False
-        )
-
-        # Extract last user message
-        last_msg = messages[-1]["content"]
-
-        # Generate with memory
-        response = await chain.apredict(input=last_msg)
-        return response.strip()
-```
-
-**Pros:**
-- Dead simple, drop-in replacement
-- Works with any OpenAI-compatible API
-- No extra services or infrastructure beyond the LangChain package
-- LangChain handles message formatting
-
-**Cons:**
-- Still sends full history (no real efficiency gain)
-- Stores everything in RAM (lost on restart)
-- Need to manage per-user memory dicts
-- Adds LangChain dependency (~50MB)
-
-**Verdict:** Not worth it - adds complexity without solving core problem.
-
----
-
-### B. ConversationBufferWindowMemory (Better)
-
-**What it does:** Only keeps last N messages in context.
-
-```python
-from langchain.memory import ConversationBufferWindowMemory
-
-# Keep only last 5 interactions (10 messages = 5 pairs)
-memory = ConversationBufferWindowMemory(k=5)
-
-chain = ConversationChain(
-    llm=llm,
-    memory=memory
-)
-
-# Only last 5 exchanges sent to LLM
-response = chain.predict(input="Hello")
-```
-
-**Integration:**
-```python
-class OpenAIBackendWithWindow(LLMBackend):
-    def __init__(self, config: LLMConfig, api_key: str):
-        self.config = config
-        self._llm = ChatOpenAI(
-            base_url=config.base_url,
-            api_key=api_key,
-            model=config.model
-        )
-        # Per-user windowed memory
-        self._user_memories: dict[str, ConversationBufferWindowMemory] = {}
-        self._window_size = 5  # Last 5 exchanges
-
-    def _get_memory(self, user_id: str) -> ConversationBufferWindowMemory:
-        if user_id not in self._user_memories:
-            self._user_memories[user_id] = ConversationBufferWindowMemory(
-                k=self._window_size
-            )
-        return self._user_memories[user_id]
-```
-
-**Pros:**
-- Simple sliding window approach
-- Reduces token usage automatically
-- Works with any OpenAI-compatible API
-- Configurable window size
-
-**Cons:**
-- Still in-memory only (lost on restart)
-- Forgets old context completely
-- Need to integrate with existing SQLite storage
-- Adds LangChain dependency
-
-**Verdict:** Better than full buffer, but loses long-term context.
-
----
-
-### C. ConversationSummaryMemory (Most Interesting)
-
-**What it does:** Uses the LLM to maintain a running summary of the conversation. The custom integration below additionally keeps the most recent messages verbatim.
-
-```python
-from langchain.memory import ConversationSummaryMemory
-
-memory = ConversationSummaryMemory(llm=llm)
-
-chain = ConversationChain(
-    llm=llm,
-    memory=memory
-)
-
-# After multiple messages, memory holds a running LLM-generated summary
-# of the conversation. (Keeping recent messages verbatim as well requires
-# ConversationSummaryBufferMemory.)
-response = chain.predict(input="What did we talk about?")
-# AI answers from the summary plus the current input
-```
-
-**Integration with SQLite persistence:**
-```python
-from langchain.memory import ConversationSummaryMemory
-from langchain_openai import ChatOpenAI
-
-class OpenAIBackendWithSummary(LLMBackend):
-    def __init__(self, config: LLMConfig, api_key: str, history: ConversationHistory):
-        self.config = config
-        self.history = history  # Existing SQLite history
-
-        self._llm = ChatOpenAI(
-            base_url=config.base_url,
-            api_key=api_key,
-            model=config.model
-        )
-
-        # Per-user summary cache (in-memory only; _store_summary writes a copy to SQLite)
-        self._user_summaries: dict[str, str] = {}
-        self._window_size = 4  # Keep last 4 exchanges (8 messages) raw
-
-    async def generate(
-        self,
-        messages: list[dict],
-        system_prompt: str,
-        user_id: str,
-        max_tokens: int = 300,
-    ) -> str:
-        # Get full history from SQLite
-        full_history = await self.history.get_history(user_id)
-
-        if len(full_history) <= self._window_size * 2:
-            # Small conversation, just use raw messages
-            context_messages = messages
-        else:
-            # Large conversation: summarize old + keep recent
-            old_messages = full_history[:-self._window_size * 2]
-            recent_messages = full_history[-self._window_size * 2:]
-
-            # Get or create summary
-            summary = await self._get_summary(user_id, old_messages)
-
-            # Build context: system + summary + recent messages
-            context_messages = [
-                {"role": "system", "content": f"{system_prompt}\n\nConversation summary: {summary}"}
-            ]
-            context_messages.extend([
-                {"role": msg.role, "content": msg.content}
-                for msg in recent_messages
-            ])
-
-        # Generate response
-        response = await self._client.chat.completions.create(
-            model=self.config.model,
-            messages=context_messages,
-            max_tokens=max_tokens,
-            temperature=0.7,
-        )
-
-        return response.choices[0].message.content.strip()
-
-    async def _get_summary(self, user_id: str, messages: list) -> str:
-        """Summarize old messages using LLM."""
-        if user_id in self._user_summaries:
-            return self._user_summaries[user_id]
-
-        # Create summary prompt
-        conversation_text = "\n".join([
-            f"{msg.role}: {msg.content}" for msg in messages
-        ])
-
-        summary_prompt = f"""Summarize this conversation in 2-3 sentences, focusing on key topics and user preferences:
-
-{conversation_text}
-
-Summary:"""
-
-        response = await self._client.chat.completions.create(
-            model=self.config.model,
-            messages=[{"role": "user", "content": summary_prompt}],
-            max_tokens=150,
-            temperature=0.3,
-        )
-
-        summary = response.choices[0].message.content.strip()
-
-        # Store in SQLite
-        await self._store_summary(user_id, summary)
-        self._user_summaries[user_id] = summary
-
-        return summary
-
-    async def _store_summary(self, user_id: str, summary: str):
-        """Store summary in SQLite for persistence."""
-        # Add new table for summaries
-        await self.history._db.execute("""
-            CREATE TABLE IF NOT EXISTS conversation_summaries (
-                user_id TEXT PRIMARY KEY,
-                summary TEXT NOT NULL,
-                updated_at REAL NOT NULL
-            )
-        """)
-
-        await self.history._db.execute("""
-            INSERT OR REPLACE INTO conversation_summaries (user_id, summary, updated_at)
-            VALUES (?, ?, ?)
-        """, (user_id, summary, time.time()))
-
-        await self.history._db.commit()
-```
-
-**Pros:**
-- Best balance: compact summary + recent context
-- Significantly reduces token usage for long conversations
-- Works with existing OpenAI-compatible APIs
-- Preserves long-term context
-- Can persist summaries in SQLite
-
-**Cons:**
-- Costs extra tokens to generate summaries
-- Adds latency when summarizing
-- Need to decide when to re-summarize
-- Still requires LangChain
-
-**Verdict:** BEST LANGCHAIN OPTION for MeshAI - balances efficiency and context retention.
-
----
-
-## 2. LlamaIndex
-
-### Installation
-```bash
-pip install llama-index llama-index-llms-openai
-```
-
-### Chat Memory
-
-```python
-from llama_index.core.memory import ChatMemoryBuffer
-from llama_index.llms.openai import OpenAI
-from llama_index.core.llms import ChatMessage
-
-# Initialize
-llm = OpenAI(
-    api_base="http://192.168.1.239:8000/v1",
-    api_key="your-key",
-    model="gpt-4o-mini"
-)
-
-# Create memory buffer
-memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
-
-# Add messages
-memory.put(ChatMessage(role="user", content="Hello"))
-memory.put(ChatMessage(role="assistant", content="Hi there!"))
-
-# Get messages for LLM
-messages = memory.get()
-
-# Generate with context
-response = llm.chat(messages)
-```
-
-**Integration:**
-```python
-from llama_index.core.memory import ChatMemoryBuffer
-from llama_index.llms.openai import OpenAI
-from llama_index.core.llms import ChatMessage
-
-class LlamaIndexBackend(LLMBackend):
-    def __init__(self, config: LLMConfig, api_key: str):
-        self.config = config
-        self._llm = OpenAI(
-            api_base=config.base_url,
-            api_key=api_key,
-            model=config.model
-        )
-
-        # Per-user memory buffers
-        self._user_memories: dict[str, ChatMemoryBuffer] = {}
-        self._token_limit = 1500
-
-    def _get_memory(self, user_id: str) -> ChatMemoryBuffer:
-        if user_id not in self._user_memories:
-            self._user_memories[user_id] = ChatMemoryBuffer.from_defaults(
-                token_limit=self._token_limit
-            )
-        return self._user_memories[user_id]
-
-    async def generate(
-        self,
-        messages: list[dict],
-        system_prompt: str,
-        user_id: str,
-        max_tokens: int = 300,
-    ) -> str:
-        memory = self._get_memory(user_id)
-
-        # Add new message to memory
-        user_msg = messages[-1]["content"]
-        memory.put(ChatMessage(role="user", content=user_msg))
-
-        # Get messages within token limit
-        context_messages = memory.get()
-
-        # Add system prompt
-        full_messages = [ChatMessage(role="system", content=system_prompt)]
-        full_messages.extend(context_messages)
-
-        # Generate
-        response = self._llm.chat(full_messages)
-
-        # Store assistant response
-        memory.put(ChatMessage(role="assistant", content=response.message.content))
-
-        return response.message.content
-```
-
-**Pros:**
-- Token-aware buffering (auto-prunes to stay under limit)
-- Simple API
-- Works with OpenAI-compatible backends
-- Better than manual message counting
-
-**Cons:**
-- In-memory only (need custom persistence)
-- Heavy dependency (~100MB)
-- Overkill for simple chat
-- Less mature than LangChain
-
-**Verdict:** Token limiting is nice, but not worth the dependency weight.
-
----
-
-## 3. MemGPT / Letta (Self-Editing Memory)
-
-### Installation
-```bash
-pip install letta
-```
-
-### Usage
-
-**What it does:** Agent manages its own memory, decides what to keep/forget/summarize.
-
-```python
-from letta import create_client
-
-client = create_client()
-
-# Create agent with memory management
-agent = client.create_agent(
-    name="meshai_agent",
-    llm_config={
-        "model": "gpt-4o-mini",
-        "model_endpoint": "http://192.168.1.239:8000/v1"
-    },
-    embedding_config={
-        "embedding_endpoint_type": "openai",
-        "embedding_model": "text-embedding-ada-002"
-    }
-)
-
-# Agent manages memory automatically
-response = client.send_message(
-    agent_id=agent.id,
-    message="What's the weather?",
-    role="user"
-)
-
-print(response.messages[-1].text)
-```
-
-**Architecture:**
-- Core memory: Persistent facts the agent always sees
-- Recall memory: Searchable vector store of past conversations
-- Archival memory: Long-term storage
-
-**Pros:**
-- Most sophisticated memory system
-- Agent decides what's important
-- Built-in vector search
-- Handles very long conversations
-
-**Cons:**
-- HEAVY (~200MB+ with dependencies)
-- Requires vector embeddings (extra API calls/costs)
-- Complex setup and learning curve
-- Overkill for 150-char mesh messages
-- Opinionated architecture (hard to integrate)
-
-**Verdict:** Way too heavy for MeshAI. Only worth it for complex, long-form agents.
-
----
-
-## 4. Vector Stores (Semantic Memory)
-
-### ChromaDB (Simplest)
-
-```bash
-pip install chromadb
-```
-
-```python
-import chromadb
-from chromadb.config import Settings
-
-# Initialize
-client = chromadb.Client(Settings(
-    persist_directory="/path/to/meshai/memory",
-    anonymized_telemetry=False
-))
-
-# Create collection per user
-collection = client.get_or_create_collection(
-    name=f"user_{user_id}",
-    metadata={"user_id": user_id}
-)
-
-# Add messages
-collection.add(
-    documents=["What's the weather in Seattle?"],
-    metadatas=[{"role": "user", "timestamp": time.time()}],
-    ids=["msg_1"]
-)
-
-# Semantic search for relevant past messages
-results = collection.query(
-    query_texts=["weather"],
-    n_results=3
-)
-
-# Use retrieved messages as context
-relevant_context = results['documents'][0]
-```
-
-**Integration:**
-```python
-import chromadb
-from chromadb.config import Settings
-
-class VectorMemoryBackend(LLMBackend):
-    def __init__(self, config: LLMConfig, api_key: str, db_path: str):
-        self.config = config
-        self._client = AsyncOpenAI(
-            api_key=api_key,
-            base_url=config.base_url,
-        )
-
-        # ChromaDB for semantic memory
-        self._chroma = chromadb.Client(Settings(
-            persist_directory=db_path,
-            anonymized_telemetry=False
-        ))
-
-        self._window_size = 4  # Keep last 4 exchanges (8 messages) raw
-
-    def _get_collection(self, user_id: str):
-        return self._chroma.get_or_create_collection(
-            name=f"user_{user_id.replace('!', '_')}"  # Sanitize ID
-        )
-
-    async def generate(
-        self,
-        messages: list[dict],
-        system_prompt: str,
-        user_id: str,
-        max_tokens: int = 300,
-    ) -> str:
-        collection = self._get_collection(user_id)
-
-        # Get current query
-        current_query = messages[-1]["content"]
-
-        # Search for semantically similar past messages
-        try:
-            results = collection.query(
-                query_texts=[current_query],
-                n_results=3,
-                where={"role": "assistant"}  # Get past responses
-            )
-            relevant_history = results['documents'][0] if results['documents'] else []
-        except:
-            relevant_history = []
-
-        # Build context: system + relevant history + recent messages
-        context = system_prompt
-        if relevant_history:
-            context += "\n\nRelevant past exchanges:\n"
-            context += "\n".join(relevant_history[:2])  # Top 2 relevant
-
-        context_messages = [{"role": "system", "content": context}]
-        context_messages.extend(messages[-self._window_size*2:])  # Recent messages
-
-        # Generate
-        response = await self._client.chat.completions.create(
-            model=self.config.model,
-            messages=context_messages,
-            max_tokens=max_tokens,
-            temperature=0.7,
-        )
-
-        reply = response.choices[0].message.content.strip()
-
-        # Store in vector DB
-        msg_id = f"{user_id}_{int(time.time()*1000)}"
-        collection.add(
-            documents=[f"User: {current_query}\nAssistant: {reply}"],
-            metadatas=[{"role": "assistant", "timestamp": time.time()}],
-            ids=[msg_id]
-        )
-
-        return reply
-```
-
-**Pros:**
-- Semantic search - finds relevant past context
-- Works great for sparse conversations
-- Persistent storage
-- Lightweight (~20MB)
-- No extra API calls (uses local embeddings)
-
-**Cons:**
-- Adds dependency
-- Embedding computation overhead
-- May surface irrelevant "similar" messages
-- Overkill for very short conversations
-
-**Verdict:** Interesting for long-term memory, but maybe overkill for 150-char messages.
-
----
-
-### Qdrant (Production Alternative)
-
-```bash
-pip install qdrant-client
-```
-
-```python
-from qdrant_client import QdrantClient
-from qdrant_client.models import Distance, VectorParams, PointStruct
-
-# Can run in-memory or with server
-client = QdrantClient(path="/path/to/meshai/qdrant")
-
-# Create collection
-client.create_collection(
-    collection_name="meshai_memory",
-    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
-)
-
-# Store with embedding (from OpenAI or local model)
-client.upsert(
-    collection_name="meshai_memory",
-    points=[
-        PointStruct(
-            id=msg_id,
-            vector=embedding,  # 1536-dim from text-embedding-ada-002
-            payload={"user_id": user_id, "content": content, "role": role}
-        )
-    ]
-)
-
-# Search
-results = client.search(
-    collection_name="meshai_memory",
-    query_vector=query_embedding,
-    query_filter={"user_id": user_id},
-    limit=3
-)
-```
-
-**Pros:**
-- Production-ready, fast
-- Better than ChromaDB for scale
-- Rich filtering options
-- Can run in-memory or server mode
-
-**Cons:**
-- More complex than ChromaDB
-- Still requires embeddings
-- Heavier dependency
-
-**Verdict:** Better than ChromaDB for production, but still overkill for MeshAI's use case.
-
----
-
-## 5. Simple Rolling Summary (RECOMMENDED)
-
-**The lightest, most practical approach for MeshAI.**
-
-### Implementation
-
-```python
-import asyncio
-import time
-from dataclasses import dataclass
-from typing import Optional
-from openai import AsyncOpenAI
-
-@dataclass
-class ConversationSummary:
-    """Summary of conversation history."""
-    summary: str
-    last_updated: float
-    message_count: int
-
-class SimpleRollingSummary:
-    """Lightweight rolling summary memory manager."""
-
-    def __init__(
-        self,
-        client: AsyncOpenAI,
-        model: str,
-        window_size: int = 4,  # Recent exchanges (user/assistant pairs) to keep raw
-        summarize_threshold: int = 10,  # Change in old-message count that triggers re-summarizing
-    ):
-        self._client = client
-        self._model = model
-        self._window_size = window_size
-        self._summarize_threshold = summarize_threshold
-
-        # Per-user summaries (would be in SQLite in production)
-        self._summaries: dict[str, ConversationSummary] = {}
-
-    async def get_context_messages(
-        self,
-        user_id: str,
-        full_history: list[dict],  # From SQLite
-    ) -> list[dict]:
-        """Get optimized context messages (summary + recent)."""
-
-        # If conversation is short, just return it
-        if len(full_history) <= self._window_size * 2:
-            return full_history
-
-        # Split into old and recent
-        old_messages = full_history[:-self._window_size * 2]
-        recent_messages = full_history[-self._window_size * 2:]
-
-        # Get or create summary of old messages
-        summary = await self._get_or_create_summary(user_id, old_messages)
-
-        # Return summary as system message + recent raw messages
-        context = [
-            {"role": "system", "content": f"Previous conversation summary: {summary.summary}"}
-        ]
-        context.extend(recent_messages)
-
-        return context
-
-    async def _get_or_create_summary(
-        self,
-        user_id: str,
-        messages: list[dict],
-    ) -> ConversationSummary:
-        """Get existing summary or create new one."""
-
-        # Check if we have a recent summary
-        if user_id in self._summaries:
-            existing = self._summaries[user_id]
-
-            # If summary covers roughly the same messages, reuse it
-            if abs(existing.message_count - len(messages)) < self._summarize_threshold:
-                return existing
-
-        # Create new summary
-        summary_text = await self._summarize(messages)
-
-        summary = ConversationSummary(
-            summary=summary_text,
-            last_updated=time.time(),
-            message_count=len(messages)
-        )
-
-        self._summaries[user_id] = summary
-        return summary
-
-    async def _summarize(self, messages: list[dict]) -> str:
-        """Summarize a list of messages using the LLM."""
-
-        # Format conversation
-        conversation = "\n".join([
-            f"{msg['role'].upper()}: {msg['content']}"
-            for msg in messages
-        ])
-
-        prompt = f"""Summarize this conversation in 2-3 concise sentences. Focus on:
-- Main topics discussed
-- Any important user preferences or context
-- Key information that should be remembered
-
-Conversation:
-{conversation}
-
-Summary (2-3 sentences):"""
-
-        try:
-            response = await self._client.chat.completions.create(
-                model=self._model,
-                messages=[{"role": "user", "content": prompt}],
-                max_tokens=150,
-                temperature=0.3,
-            )
-
-            return response.choices[0].message.content.strip()
-
-        except Exception as e:
-            # Fallback: placeholder noting the message count if summarization fails
-            return f"Previous conversation covered {len(messages)} messages."
-```
-
-### Integration with MeshAI
-
-```python
-# In meshai/backends/openai_backend.py
-
-class OpenAIBackend(LLMBackend):
-    """OpenAI-compatible backend with rolling summary memory."""
-
-    def __init__(self, config: LLMConfig, api_key: str):
-        self.config = config
-        self._client = AsyncOpenAI(
-            api_key=api_key,
-            base_url=config.base_url,
-        )
-
-        # Add rolling summary manager
-        self._memory = SimpleRollingSummary(
-            client=self._client,
-            model=config.model,
-            window_size=4,  # Keep last 4 exchanges (8 messages)
-            summarize_threshold=10,  # Re-summarize after 10 more messages leave the window
-        )
-
-    async def generate(
-        self,
-        messages: list[dict],
-        system_prompt: str,
-        user_id: str,  # NEW: need user_id
-        max_tokens: int = 300,
-    ) -> str:
-        """Generate with optimized context."""
-
-        # Get optimized context (summary + recent)
-        context_messages = await self._memory.get_context_messages(
-            user_id=user_id,
-            full_history=messages,
-        )
-
-        # Add system prompt
-        full_messages = [{"role": "system", "content": system_prompt}]
-        full_messages.extend(context_messages)
-
-        # Generate
-        response = await self._client.chat.completions.create(
-            model=self.config.model,
-            messages=full_messages,
-            max_tokens=max_tokens,
-            temperature=0.7,
-        )
-
-        return response.choices[0].message.content.strip()
-```
-
-### Persist Summaries in SQLite
-
-```python
-# Add to meshai/history.py
-
-async def store_summary(self, user_id: str, summary: str, message_count: int) -> None:
-    """Store conversation summary."""
-    if not self._db:
-        raise RuntimeError("Database not initialized")
-
-    async with self._lock:
-        await self._db.execute("""
-            CREATE TABLE IF NOT EXISTS conversation_summaries (
-                user_id TEXT PRIMARY KEY,
-                summary TEXT NOT NULL,
-                message_count INTEGER NOT NULL,
-                updated_at REAL NOT NULL
-            )
-        """)
-
-        await self._db.execute("""
-            INSERT OR REPLACE INTO conversation_summaries
-            (user_id, summary, message_count, updated_at)
-            VALUES (?, ?, ?, ?)
-        """, (user_id, summary, message_count, time.time()))
-
-        await self._db.commit()
-
-async def get_summary(self, user_id: str) -> Optional[ConversationSummary]:
-    """Retrieve conversation summary."""
-    if not self._db:
-        raise RuntimeError("Database not initialized")
-
-    async with self._lock:
-        cursor = await self._db.execute("""
-            SELECT summary, message_count, updated_at
-            FROM conversation_summaries
-            WHERE user_id = ?
-        """, (user_id,))
-
-        row = await cursor.fetchone()
-
-    if not row:
-        return None
-
-    return ConversationSummary(
-        summary=row[0],
-        message_count=row[1],
-        last_updated=row[2]
-    )
-```
-
-**Pros:**
-- NO external dependencies
-- Works with existing SQLite storage
-- Significantly reduces token usage
-- Simple to understand and maintain
-- Preserves recent context + summarized history
-- Configurable window and threshold
-
-**Cons:**
-- Costs tokens to generate summaries
-- Slight latency when summarizing
-- Need to tune window/threshold params
-
-**Verdict:** BEST OPTION for MeshAI - simple, effective, no dependencies.
-
----
-
-## Comparison Matrix
-
-| Approach | Dependencies | Complexity | Token Savings | Persistence | OpenAI-Compatible |
-|----------|-------------|------------|---------------|-------------|-------------------|
-| **LangChain BufferMemory** | langchain (~50MB) | Low | None | No | Yes |
-| **LangChain WindowMemory** | langchain (~50MB) | Low | Medium | No | Yes |
-| **LangChain SummaryMemory** | langchain (~50MB) | Medium | High | No (DIY) | Yes |
-| **LlamaIndex** | llama-index (~100MB) | Medium | Medium | No (DIY) | Yes |
-| **MemGPT/Letta** | letta (~200MB) | Very High | Very High | Yes | Yes (complex) |
-| **ChromaDB** | chromadb (~20MB) | Medium | Medium | Yes | Yes |
-| **Qdrant** | qdrant (~30MB) | High | Medium | Yes | Yes |
-| **Rolling Summary (DIY)** | None | Low | High | Yes (SQLite) | Yes |
-
----
-
-## RECOMMENDATION
-
-**Use Simple Rolling Summary (Option 5)** for MeshAI because:
-
-1. **Zero dependencies** - No LangChain, LlamaIndex, or vector stores
-2. **Works with current stack** - Uses existing AsyncOpenAI client and SQLite
-3. **Significant efficiency gains** - Keeps last 4-6 exchanges + summary of older messages
-4. **Persistent** - Summaries stored in SQLite, survive restarts
-5. **Simple to tune** - Two params: `window_size` and `summarize_threshold`
-6. **OpenAI-compatible** - Works with LiteLLM, local models, anything
-7. **Lightweight** - ~100 lines of code
-
-### Implementation Steps
-
-1. Add `SimpleRollingSummary` class (shown above)
-2. Add summary table to SQLite schema
-3. Modify `OpenAIBackend.generate()` to use `_memory.get_context_messages()`
-4. Add summary storage methods to `ConversationHistory`
-5. Configure: `window_size=4` (8 messages), `summarize_threshold=10`
-
-### Expected Performance
-
-**Before (full history):**
-- 20 message pairs = ~3000 tokens sent every request
-- Latency: higher, costs more
-
-**After (rolling summary):**
-- Summary (~100 tokens) + 4 recent pairs (~400 tokens) = ~500 tokens
-- **83% token reduction** for long conversations
-- Faster responses, lower costs
-
-### When to Consider Alternatives
-
-- **Vector stores (ChromaDB)**: If you need semantic search across users or topics
-- **LangChain SummaryMemory**: If you want a batteries-included solution (accept dependency)
-- **MemGPT**: If conversations become complex multi-day dialogues (they won't on mesh)
-
----
-
-## Example Usage
-
-```python
-# Initialize
-backend = OpenAIBackend(config, api_key)
-
-# First few messages - full history sent
-await backend.generate(
-    messages=[
-        {"role": "user", "content": "What's the weather?"},
-        {"role": "assistant", "content": "It's sunny!"},
-        {"role": "user", "content": "Should I bring an umbrella?"},
-        {"role": "assistant", "content": "No need, it's clear!"},
-        # ... 6 more exchanges ...
-    ],
-    system_prompt="You are a helpful assistant.",
-    user_id="!abc123",
-)
-
-# After more than 8 messages (window_size * 2) - summary + recent sent
-# Context sent to LLM:
-# [
-#   {"role": "system", "content": "Previous conversation summary: User asked about weather and outdoor activities. Confirmed sunny weather, no rain expected."},
-#   {"role": "user", "content": "Should I bring an umbrella?"},
-#   {"role": "assistant", "content": "No need, it's clear!"},
-#   ... (last 4 exchanges)
-# ]
-```
-
----
-
-## Code Files to Modify
-
-1. **`meshai/memory.py`** (NEW) - Add `SimpleRollingSummary` class
-2. **`meshai/history.py`** - Add summary storage methods + table schema
-3. **`meshai/backends/openai_backend.py`** - Integrate memory manager
-4. **`meshai/responder.py`** - Pass `user_id` to backend.generate()
-5. **`meshai/config.py`** - Add config for window_size, summarize_threshold
-
-Let me know if you want me to implement this!
diff --git a/MEMORY_SUMMARY.md b/MEMORY_SUMMARY.md
deleted file mode 100644
index 3ce7a9b..0000000
--- a/MEMORY_SUMMARY.md
+++ /dev/null
@@ -1,219 +0,0 @@
-# LLM Memory Research Summary
-
-## The Problem
-
-MeshAI currently stuffs full conversation history into every LLM API call:
-- Inefficient: Wastes tokens on old context
-- Slow: More tokens = higher latency
-- Expensive: Unnecessary token costs
-- Doesn't scale: Long conversations become unwieldy
-
-## Solutions Evaluated
-
-### 1. LangChain Memory Modules
-
-**Tested:**
-- `ConversationBufferMemory`: Stores everything (no improvement)
-- `ConversationBufferWindowMemory`: Last N messages only
-- `ConversationSummaryMemory`: LLM-generated summaries + recent messages
-
-**Verdict:** `ConversationSummaryMemory` is best, but adds 50MB dependency. Can DIY the same thing in <100 lines.
-
-### 2. LlamaIndex
-
-**Tested:** `ChatMemoryBuffer` with token limiting
-
-**Verdict:** Token-aware pruning is nice, but 100MB+ dependency is overkill. Less mature than LangChain.
-
-### 3. MemGPT/Letta
-
-**Tested:** Self-editing memory architecture
-
-**Verdict:** Way too heavy (200MB+), requires vector embeddings. Designed for complex multi-day agents, not 150-char mesh messages.
-
-### 4. Vector Stores (ChromaDB/Qdrant)
-
-**Tested:** Semantic search for relevant past context
-
-**Verdict:** Interesting for long-term cross-conversation search, but adds complexity. Not needed for per-user linear conversations.
-
-### 5. Simple Rolling Summary (DIY)
-
-**Tested:** Keep last N messages + LLM-generated summary of older messages
-
-**Verdict:** WINNER - Zero dependencies, 80% token savings, works with existing stack.
-
----
-
-## Recommendation: Rolling Summary
-
-### Why
-
-1. **Zero dependencies** - Pure Python, uses existing AsyncOpenAI client
-2. **Simple** - ~100 lines of code, easy to understand and maintain
-3. **Effective** - 73-83% token reduction for long conversations
-4. **Persistent** - Summaries stored in SQLite, survive restarts
-5. **Compatible** - Works with LiteLLM, local models, any OpenAI-compatible API
-6. **Tunable** - Two params: `window_size` (recent messages) and `summarize_threshold` (when to re-summarize)
-
-### How It Works
-
-```
-Full History (20 messages):
-┌─────────────────────────────────────────────────────┐
-│ User: What's the weather?                           │
-│ Assistant: Sunny, 72°F                              │
-│ ... (16 more messages) ...                          │
-│ User: Which trail should I take?                    │
-│ Assistant: Mt Si if you're fit, Rattlesnake if not │
-└─────────────────────────────────────────────────────┘
-  ↓ Sent to LLM: 2000+ tokens
-
-With Rolling Summary:
-┌─────────────────────────────────────────────────────┐
-│ SUMMARY: User asked about weather and hiking.      │
-│ Discussed Mt Si trail (4hrs, moderate) and         │
-│ Rattlesnake Ledge (2mi, easier, lake views).       │
-├─────────────────────────────────────────────────────┤
-│ User: How crowded does it get?                     │
-│ Assistant: Very crowded weekends, go weekdays      │
-│ User: Any other trails nearby?                     │
-│ Assistant: Rattlesnake Ledge is easier and closer │
-│ User: Tell me about Rattlesnake                    │
-│ Assistant: 2 miles, great lake views, popular     │
-│ User: Which would you recommend?                   │
-│ Assistant: Mt Si if fit, Rattlesnake if casual    │
-└─────────────────────────────────────────────────────┘
-  ↓ Sent to LLM: ~500 tokens (75% savings!)
-```
-
-### Configuration
-
-**Recommended for MeshAI:**
-- `window_size=4` → Keep last 4 exchanges (8 messages) in full
-- `summarize_threshold=8` → Re-summarize after 8 new messages
-
-**Tuning:**
-- Smaller window = More aggressive summarization, max token savings
-- Larger window = More recent context, less summarization
-- Adjust based on average conversation length and message density
-
-### Implementation Effort
-
-**Files to modify:**
-1. Create `meshai/memory.py` - Rolling summary class
-2. Modify `meshai/history.py` - Add summary storage (1 new table, 3 methods)
-3. Modify `meshai/backends/openai_backend.py` - Integrate memory manager
-4. Modify `meshai/responder.py` - Pass user_id, persist summaries
-5. Modify `meshai/commands/reset.py` - Clear summaries on reset
-
-**Total: ~200 lines of new code, ~50 lines of modifications**
-
-### Performance
-
-**Token Usage:**
-
-| Conversation Length | Full History | Rolling Summary | Savings |
-|---------------------|--------------|-----------------|---------|
-| 10 messages | 800 tokens | 800 tokens | 0% (no summary) |
-| 20 messages | 1600 tokens | 550 tokens | 66% |
-| 30 messages | 2400 tokens | 600 tokens | 75% |
-| 50 messages | 4000 tokens | 650 tokens | 84% |
-
-**Cost Impact (at $0.50/1M input tokens):**
-- Before: 2400 tokens × $0.50/1M tokens = $0.0012 per request
-- After: 600 tokens × $0.50/1M tokens = $0.0003 per request
-- **Savings: $0.0009 per request (75%)**
-
-For 1000 requests/day: **$0.90/day savings** or **$27/month**
-
-**Latency:**
-- Summary generation: 1-2s every 8-10 messages (amortized)
-- Regular requests: No added latency
-- Net effect: Faster due to fewer input tokens
-
----
-
-## When to Use Alternatives
-
-### Use Window-Only (no summary)
-- Very short conversations (< 10 messages)
-- Don't care about older context
-- Want minimal implementation
-
-### Use Vector Store (ChromaDB)
-- Need semantic search across users
-- Want to find similar past conversations
-- Long-term cross-user knowledge base
-
-### Use LangChain SummaryMemory
-- Want batteries-included solution
-- Don't mind 50MB dependency
-- Prefer established library over DIY
-
-### Use MemGPT/Letta
-- Multi-day complex agent workflows
-- Agent needs to manage own memory
-- Have budget for embeddings and compute
-
----
-
-## Next Steps
-
-1. **Read detailed guide:** `MEMORY_IMPLEMENTATION_GUIDE.md` (repository root)
-2. **Review research:** `MEMORY_RESEARCH.md` (repository root)
-3. **Test proof-of-concept:** `python examples/memory_comparison.py`
-4. **Implement rolling summary** following the guide
-5. **Monitor and tune** based on actual conversation patterns
-
----
-
-## Files Created
-
-1. **`MEMORY_SUMMARY.md`** (this file) - Quick overview and recommendation
-2. **`MEMORY_RESEARCH.md`** - Detailed evaluation of all approaches with code examples
-3. **`MEMORY_IMPLEMENTATION_GUIDE.md`** - Step-by-step implementation guide
-4. **`examples/memory_comparison.py`** - Runnable proof-of-concept test script
-
----
-
-## Quick Start
-
-```bash
-# Test the approaches with your LLM
-cd /home/zvx/projects/meshai
-
-# Edit examples/memory_comparison.py with your LLM endpoint
-# Update BASE_URL, API_KEY, MODEL
-
-python examples/memory_comparison.py
-
-# You'll see:
-# - Full history baseline
-# - Rolling summary results
-# - Window-only results
-# - Token savings comparison
-```
-
-Expected output:
-```
-Approach             Tokens          Time       Savings
-----------------------------------------------------------------------
-Full History         1847            2.34s      (baseline)
-Rolling Summary      512             1.87s      72.3%
-Window Only          398             1.45s      78.4%
-```
-
-**Conclusion: Rolling Summary gives 70%+ savings while preserving context.**
-
----
-
-## Questions?
-
-- How does it handle very long conversations? → Multi-level summaries (summary of summaries)
-- What if summary loses important info? → Tune `window_size` to keep more recent context
-- Does it work with streaming? → Yes, just apply before streaming starts
-- Can I see the summaries? → Query `conversation_summaries` table in SQLite
-- How do I regenerate a summary? → Clear it, will auto-regenerate on next request
-
-Start with the recommended settings, monitor, and adjust based on your actual usage patterns.
diff --git a/PLAN.md b/PLAN.md
deleted file mode 100644
index c07c82b..0000000
--- a/PLAN.md
+++ /dev/null
@@ -1,356 +0,0 @@
-# MeshAI - Meshtastic LLM Bridge
-
-## Project Overview
-
-A Python application that connects to a Meshtastic node and provides LLM-powered responses to mesh network users. Responds to direct mentions (@nodename) or direct messages. Includes bang commands (`!command`) for utility functions.
-
-## Design Decisions
-
-### 1. Trigger Mechanism
-- **@mentions**: Respond when message contains `@<nodename>` (configurable node name)
-- **Direct Messages**: Respond to all DMs automatically
-- **Bang commands**: `!command` syntax for utility functions (handled before LLM)
-- Ignore general channel chatter that doesn't mention the bot
-
-### 2. Conversation History
-- Maintain per-user conversation history
-- Storage: SQLite database for persistence across restarts
-- Context window: Last N messages per user (configurable, default ~20 exchanges)
-- With 300 char limit per exchange, context stays small - can maintain long conversations
-- Include timestamp tracking for potential "conversation timeout" (e.g., reset after 24h inactivity)
-
-### 3. Rate Limiting & Response Behavior
-- **Response delay**: Configurable 2.2-3.0 second random delay before sending
-- **Message chunking**: Split responses at 150 characters max per message
-- **Max chunks**: 2 messages maximum per response (300 chars total)
-- **Brevity prompt**: System prompt instructs LLM to keep responses concise
-- **Cooldown**: Optional per-user cooldown to prevent spam
-
-### 4. Identity & Configuration
-- Node name/ID determined by the physical node configuration
-- Application config includes:
-  - `bot_name`: The @mention trigger name (e.g., "meshbot", "ai")
-  - `owner`: Owner identification for logging/admin purposes
-  - Connection settings (serial port or TCP host:port)
-
-### 5. Channel Filtering
-- Configurable list of channels to respond on
-- Option to respond on all channels or specific ones only
-- DMs always processed regardless of channel settings
-
-## Technical Architecture
-
-```
-┌─────────────────────────────────────────────────────────────┐
-│                        MeshAI                                │
-├─────────────────────────────────────────────────────────────┤
-│  ┌─────────────┐    ┌─────────────┐    ┌─────────────────┐ │
-│  │  Meshtastic │    │   Message   │    │   LLM Backend   │ │
-│  │  Connector  │───▶│   Router    │───▶│   (pluggable)   │ │
-│  │ Serial/TCP  │    │             │    │                 │ │
-│  └─────────────┘    └─────────────┘    └─────────────────┘ │
-│         │                 │                    │            │
-│         │           ┌─────▼─────┐              │            │
-│         │           │ Conversation│             │            │
-│         │           │  History   │◀────────────┘            │
-│         │           │  (SQLite)  │                          │
-│         │           └───────────┘                           │
-│         │                                                   │
-│         ▼                                                   │
-│  ┌─────────────┐                                           │
-│  │  Response   │  - 2.2-3s delay                           │
-│  │  Handler    │  - Chunk to 150 chars                     │
-│  │             │  - Max 2 messages                         │
-│  └─────────────┘                                           │
-└─────────────────────────────────────────────────────────────┘
-```
-
-## LLM Backend Support
-
-### Pluggable Backend Interface
-```python
-class LLMBackend(ABC):
-    @abstractmethod
-    async def generate(self, messages: list[dict], system_prompt: str) -> str:
-        pass
-```
-
-### Supported Backends (Priority Order)
-1. **OpenAI-compatible** (covers most bases)
-   - OpenAI (GPT-4, GPT-4o, etc.)
-   - Local LiteLLM/Open WebUI (ai.echo6.co)
-   - Any OpenAI-compatible API
-
-2. **Anthropic** (Claude)
-   - Direct Anthropic API
-
-3. **Google** (Gemini)
-   - Google AI Studio / Vertex AI
-
-### Configuration Example
-```yaml
-llm:
-  backend: "openai"  # openai, anthropic, google
-  api_key: "${OPENAI_API_KEY}"
-  base_url: "https://api.openai.com/v1"  # or http://ai.echo6.co/api for local
-  model: "gpt-4o-mini"
-
-  # For local LiteLLM:
-  # backend: "openai"
-  # base_url: "http://192.168.1.239:4000/v1"
-  # model: "llama3"
-```
-
-## Configuration File Structure
-
-```yaml
-# config.yaml
-bot:
-  name: "ai"                    # @mention trigger
-  owner: "K7ZVX"               # Owner callsign/name
-  respond_to_mentions: true
-  respond_to_dms: true
-
-connection:
-  type: "serial"               # serial or tcp
-  serial_port: "/dev/ttyUSB0"  # if serial
-  tcp_host: "192.168.1.100"    # if tcp
-  tcp_port: 4403               # if tcp
-
-channels:
-  mode: "all"                  # "all" or "whitelist"
-  whitelist: [0, 1]            # Only if mode is "whitelist"
-
-response:
-  delay_min: 2.2               # seconds
-  delay_max: 3.0               # seconds
-  max_length: 150              # chars per message
-  max_messages: 2              # messages per response
-
-history:
-  database: "conversations.db"
-  max_messages_per_user: 20
-  conversation_timeout: 86400  # seconds (24h)
-
-llm:
-  backend: "openai"
-  api_key: "${LLM_API_KEY}"
-  base_url: "https://api.openai.com/v1"
-  model: "gpt-4o-mini"
-  system_prompt: |
-    You are a helpful assistant on a Meshtastic mesh network.
-    Keep responses VERY brief - under 250 characters total.
-    Be concise but friendly. No markdown formatting.
-
-weather:
-  primary: "openmeteo"         # openmeteo, wttr, or llm
-  fallback: "llm"              # openmeteo, wttr, llm, or none
-  default_location: ""         # Fallback if node has no GPS (e.g., "Seattle, WA")
-
-  openmeteo:
-    url: "https://api.open-meteo.com/v1"  # or self-hosted URL
-
-  wttr:
-    url: "https://wttr.in"     # or self-hosted
-```
-
-## Bang Commands
-
-Commands use `!` prefix (like fq51bbs). Processed before LLM routing.
-
-| Command | Description | Example |
-|---------|-------------|---------|
-| `!help` | List available commands | `!help` |
-| `!ping` | Connectivity test, responds "pong" | `!ping` |
-| `!reset` | Clear your conversation history | `!reset` |
-| `!status` | Bot uptime, message count, version | `!status` |
-| `!weather` | Weather for your node's GPS location (or default) | `!weather` |
-| `!weather <loc>` | Weather for specified location | `!weather Seattle` |
-
-### Weather Command Details
-
-Location resolution order:
-1. If `!weather <location>` - geocode the provided location
-2. If `!weather` (no args) - use sender's node GPS position if available
-3. Fall back to `weather.default_location` from config
-4. If no location found: "No location available. Use !weather <city> or enable GPS on your node."
-
-**Providers:**
-- `openmeteo` - Open-Meteo API (free, no key, self-hostable)
-- `wttr` - wttr.in (free, simple, self-hostable)
-- `llm` - Pass to LLM with websearch (flexible, slower)
-
-Primary/fallback configurable. If primary fails, tries fallback.
-
-### Command Processing Flow
-
-```
-Message received
-      │
-      ▼
-┌─────────────┐
-│ Starts with │──No──▶ Check @mention / DM ──▶ LLM
-│    "!"?     │
-└─────────────┘
-      │Yes
-      ▼
-┌─────────────┐
-│ Parse cmd   │
-│ & args      │
-└─────────────┘
-      │
-      ▼
-┌─────────────┐
-│ Lookup in   │──Not found──▶ "Unknown command. Try !help"
-│ registry    │
-└─────────────┘
-      │Found
-      ▼
-┌─────────────┐
-│ Execute     │
-│ handler     │
-└─────────────┘
-```
-
-### Command Handler Interface
-
-```python
-class CommandHandler(ABC):
-    @abstractmethod
-    async def execute(self, sender_id: str, args: str, context: MessageContext) -> str:
-        """Execute command and return response string."""
-        pass
-```
-
-## CLI Configurator
-
-Interactive TUI configurator using Rich library (same style as fq51bbs).
-
-**Features:**
-- Hierarchical menu system with numeric selection
-- `0` always = back/save & exit
-- Tables showing current values
-- Status icons (✓/✗) with color coding
-- Setup wizard for first-time configuration
-- Unsaved changes tracking
-- Inline help for complex options
-
-**Menu Structure:**
-```
-Main Menu
-├── 1. Bot Settings (name, owner, triggers)
-├── 2. Connection (serial/TCP config)
-├── 3. LLM Backend (provider, API keys, model)
-├── 4. Commands & Weather (providers, fallbacks)
-├── 5. Response Settings (delays, chunking)
-├── 6. Channel Filtering
-├── 7. History Settings
-├── 8. Run Setup Wizard
-└── 0. Save & Exit
-```
-
-**Invocation:**
-```bash
-meshai --config          # Launch configurator
-meshai                   # Run bot (uses config.yaml)
-meshai --config-file /path/to/config.yaml  # Use alternate config
-```
-
-**Config Reload/Restart:**
-- On save, prompt: "Restart bot with new config? [Y/n]"
-- If bot is running as systemd service: `systemctl restart meshai`
-- If running in foreground: signal reload (SIGHUP) or full restart
-- Store PID file at runtime for service management
-
-## File Structure
-
-```
-meshai/
-├── meshai/
-│   ├── __init__.py
-│   ├── main.py              # Entry point
-│   ├── config.py            # Configuration loading/saving
-│   ├── connector.py         # Meshtastic serial/TCP connection
-│   ├── router.py            # Message routing logic
-│   ├── history.py           # Conversation history (SQLite)
-│   ├── responder.py         # Response handling (delay, chunking)
-│   ├── cli/
-│   │   ├── __init__.py
-│   │   └── configurator.py  # Rich-based TUI configurator
-│   ├── commands/
-│   │   ├── __init__.py
-│   │   ├── base.py          # Command handler interface
-│   │   ├── dispatcher.py    # Command registry & routing
-│   │   ├── help.py          # !help
-│   │   ├── ping.py          # !ping
-│   │   ├── reset.py         # !reset
-│   │   ├── status.py        # !status
-│   │   └── weather.py       # !weather
-│   └── backends/
-│       ├── __init__.py
-│       ├── base.py          # Abstract backend interface
-│       ├── openai.py        # OpenAI-compatible backend
-│       ├── anthropic.py     # Anthropic backend
-│       └── google.py        # Google Gemini backend
-├── config.yaml              # User configuration
-├── requirements.txt
-├── pyproject.toml
-└── README.md
-```
-
-## Dependencies
-
-```
-meshtastic>=2.3.0
-pyyaml>=6.0
-aiosqlite>=0.19.0
-openai>=1.0.0
-anthropic>=0.18.0
-google-generativeai>=0.4.0
-```
-
-## Implementation Phases
-
-### Phase 1: Core Foundation
-- [ ] Project structure setup
-- [ ] Configuration loading
-- [ ] Meshtastic connector (serial first, then TCP)
-- [ ] Basic message receiving and logging
-
-### Phase 2: Message Processing
-- [ ] Message router (detect @mentions and DMs)
-- [ ] Conversation history database
-- [ ] User context management
-
-### Phase 3: LLM Integration
-- [ ] Backend interface definition
-- [ ] OpenAI-compatible backend (covers local + OpenAI)
-- [ ] Response generation with history
-
-### Phase 4: Response Handling
-- [ ] Delay implementation (2.2-3s random)
-- [ ] Message chunking (150 char limit)
-- [ ] Send responses back to mesh
-
-### Phase 5: Additional Backends
-- [ ] Anthropic backend
-- [ ] Google Gemini backend
-
-### Phase 6: Polish
-- [ ] Error handling and resilience
-- [ ] Logging and monitoring
-- [ ] Documentation
-- [ ] Packaging for easy installation
-
-## Future Considerations
-
-- **Multi-node support**: One instance managing multiple nodes (different presets/locations)
-- **Store-and-forward**: Queue messages for offline users
-- **Games**: Simple text games (trivia, 8-ball, etc.)
-- **Scheduled broadcasts**: Periodic announcements
-
-## Notes
-
-- Meshtastic Python API: https://meshtastic.org/docs/software/python/cli/
-- Message size limit is 237 bytes, but we're targeting 150 chars for safety and readability
-- The meshtastic library handles serial/TCP abstraction well
diff --git a/docs/IMPLEMENTATION_DIFF.md b/docs/IMPLEMENTATION_DIFF.md
deleted file mode 100644
index 60bb81a..0000000
--- a/docs/IMPLEMENTATION_DIFF.md
+++ /dev/null
@@ -1,593 +0,0 @@
-# Implementation Diff - Exact Changes Needed
-
-This document shows the exact code changes needed to implement Rolling Summary memory in MeshAI.
-
----
-
-## 1. Create New File: `meshai/memory.py`
-
-**Action:** Create this new file with the complete implementation.
-
-**Location:** `/home/zvx/projects/meshai/meshai/memory.py`
-
-**Content:** See `MEMORY_IMPLEMENTATION_GUIDE.md` section 1 for full code.
-
-**Lines of code:** ~100
-
----
-
-## 2. Modify: `meshai/history.py`
-
-### Add to imports
-```python
-# No new imports needed - already has time, Optional
-```
-
-### Modify `initialize()` method
-
-**Before:**
-```python
-async def initialize(self) -> None:
-    """Initialize database and create tables."""
-    self._db = await aiosqlite.connect(self._db_path)
-
-    await self._db.execute("""
-        CREATE TABLE IF NOT EXISTS conversations (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            user_id TEXT NOT NULL,
-            role TEXT NOT NULL,
-            content TEXT NOT NULL,
-            timestamp REAL NOT NULL
-        )
-    """)
-
-    await self._db.execute("""
-        CREATE INDEX IF NOT EXISTS idx_user_timestamp
-        ON conversations (user_id, timestamp)
-    """)
-
-    await self._db.commit()
-    logger.info(f"Conversation history initialized at {self._db_path}")
-```
-
-**After:**
-```python
-async def initialize(self) -> None:
-    """Initialize database and create tables."""
-    self._db = await aiosqlite.connect(self._db_path)
-
-    await self._db.execute("""
-        CREATE TABLE IF NOT EXISTS conversations (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            user_id TEXT NOT NULL,
-            role TEXT NOT NULL,
-            content TEXT NOT NULL,
-            timestamp REAL NOT NULL
-        )
-    """)
-
-    await self._db.execute("""
-        CREATE INDEX IF NOT EXISTS idx_user_timestamp
-        ON conversations (user_id, timestamp)
-    """)
-
-    # NEW: Summary table
-    await self._db.execute("""
-        CREATE TABLE IF NOT EXISTS conversation_summaries (
-            user_id TEXT PRIMARY KEY,
-            summary TEXT NOT NULL,
-            message_count INTEGER NOT NULL,
-            updated_at REAL NOT NULL
-        )
-    """)
-
-    await self._db.commit()
-    logger.info(f"Conversation history initialized at {self._db_path}")
-```
-
-### Add new methods (append to end of class)
-
-```python
-async def store_summary(
-    self, user_id: str, summary: str, message_count: int
-) -> None:
-    """Store conversation summary.
-
-    Args:
-        user_id: Node ID of user
-        summary: Summary text
-        message_count: Number of messages summarized
-    """
-    if not self._db:
-        raise RuntimeError("Database not initialized")
-
-    async with self._lock:
-        await self._db.execute(
-            """
-            INSERT OR REPLACE INTO conversation_summaries
-            (user_id, summary, message_count, updated_at)
-            VALUES (?, ?, ?, ?)
-            """,
-            (user_id, summary, message_count, time.time()),
-        )
-        await self._db.commit()
-
-
-async def get_summary(self, user_id: str) -> Optional[dict]:
-    """Get conversation summary for user.
-
-    Args:
-        user_id: Node ID of user
-
-    Returns:
-        Dict with 'summary', 'message_count', 'updated_at' or None
-    """
-    if not self._db:
-        raise RuntimeError("Database not initialized")
-
-    async with self._lock:
-        cursor = await self._db.execute(
-            """
-            SELECT summary, message_count, updated_at
-            FROM conversation_summaries
-            WHERE user_id = ?
-            """,
-            (user_id,),
-        )
-        row = await cursor.fetchone()
-
-    if not row:
-        return None
-
-    return {
-        "summary": row[0],
-        "message_count": row[1],
-        "updated_at": row[2],
-    }
-
-
-async def clear_summary(self, user_id: str) -> None:
-    """Clear summary for user (e.g., on history reset).
-
-    Args:
-        user_id: Node ID of user
-    """
-    if not self._db:
-        raise RuntimeError("Database not initialized")
-
-    async with self._lock:
-        await self._db.execute(
-            "DELETE FROM conversation_summaries WHERE user_id = ?",
-            (user_id,),
-        )
-        await self._db.commit()
-```
-
-**Lines added:** ~60
-
----
-
-## 3. Modify: `meshai/backends/openai_backend.py`
-
-### Add import
-
-**Before:**
-```python
-import logging
-from typing import Optional
-
-from openai import AsyncOpenAI
-
-from ..config import LLMConfig
-from .base import LLMBackend
-```
-
-**After:**
-```python
-import logging
-from typing import Optional
-
-from openai import AsyncOpenAI
-
-from ..config import LLMConfig
-from ..memory import RollingSummaryMemory  # NEW
-from .base import LLMBackend
-```
-
-### Modify `__init__()` method
-
-**Before:**
-```python
-def __init__(self, config: LLMConfig, api_key: str):
-    """Initialize OpenAI backend.
-
-    Args:
-        config: LLM configuration
-        api_key: API key to use
-    """
-    self.config = config
-    self._client = AsyncOpenAI(
-        api_key=api_key,
-        base_url=config.base_url,
-    )
-```
-
-**After:**
-```python
-def __init__(self, config: LLMConfig, api_key: str):
-    """Initialize OpenAI backend.
-
-    Args:
-        config: LLM configuration
-        api_key: API key to use
-    """
-    self.config = config
-    self._client = AsyncOpenAI(
-        api_key=api_key,
-        base_url=config.base_url,
-    )
-
-    # NEW: Initialize rolling summary memory
-    self._memory = RollingSummaryMemory(
-        client=self._client,
-        model=config.model,
-        window_size=4,
-        summarize_threshold=8,
-    )
-```
-
-### Modify `generate()` method signature and logic
-
-**Before:**
-```python
-async def generate(
-    self,
-    messages: list[dict],
-    system_prompt: str,
-    max_tokens: int = 300,
-) -> str:
-    """Generate a response using OpenAI-compatible API."""
-    # Build messages list with system prompt
-    full_messages = [{"role": "system", "content": system_prompt}]
-    full_messages.extend(messages)
-
-    try:
-        response = await self._client.chat.completions.create(
-            model=self.config.model,
-            messages=full_messages,
-            max_tokens=max_tokens,
-            temperature=0.7,
-        )
-
-        content = response.choices[0].message.content
-        return content.strip() if content else ""
-
-    except Exception as e:
-        logger.error(f"OpenAI API error: {e}")
-        raise
-```
-
-**After:**
-```python
-async def generate(
-    self,
-    messages: list[dict],
-    system_prompt: str,
-    user_id: str = None,  # NEW: optional for backward compatibility
-    max_tokens: int = 300,
-) -> str:
-    """Generate a response using OpenAI-compatible API."""
-
-    # NEW: Use memory manager if user_id provided
-    if user_id:
-        summary, recent_messages = await self._memory.get_context_messages(
-            user_id=user_id,
-            full_history=messages,
-        )
-
-        if summary:
-            # Long conversation: system + summary + recent
-            enhanced_system = f"""{system_prompt}
-
-Previous conversation summary: {summary}"""
-            full_messages = [{"role": "system", "content": enhanced_system}]
-            full_messages.extend(recent_messages)
-
-            logger.debug(
-                f"Using summary + {len(recent_messages)} recent messages "
-                f"(total history: {len(messages)})"
-            )
-        else:
-            # Short conversation: system + all messages
-            full_messages = [{"role": "system", "content": system_prompt}]
-            full_messages.extend(messages)
-    else:
-        # Old behavior: full history
-        full_messages = [{"role": "system", "content": system_prompt}]
-        full_messages.extend(messages)
-
-    try:
-        response = await self._client.chat.completions.create(
-            model=self.config.model,
-            messages=full_messages,
-            max_tokens=max_tokens,
-            temperature=0.7,
-        )
-
-        content = response.choices[0].message.content
-        return content.strip() if content else ""
-
-    except Exception as e:
-        logger.error(f"OpenAI API error: {e}")
-        raise
-```
-
-### Add helper methods (append to end of class)
-
-```python
-def load_summary_cache(self, user_id: str, summary_data: dict) -> None:
-    """Load summary into memory cache (called on startup).
-
-    Args:
-        user_id: User identifier
-        summary_data: Dict with 'summary', 'message_count', 'updated_at'
-    """
-    from ..memory import ConversationSummary
-
-    summary = ConversationSummary(
-        summary=summary_data["summary"],
-        message_count=summary_data["message_count"],
-        last_updated=summary_data["updated_at"],
-    )
-    self._memory.load_summary(user_id, summary)
-
-
-def clear_summary_cache(self, user_id: str) -> None:
-    """Clear summary cache for user."""
-    self._memory.clear_summary(user_id)
-```
-
-**Lines modified:** ~40
-**Lines added:** ~20
-
----
-
-## 4. Modify: `meshai/responder.py`
-
-### Find the response generation section
-
-**Location:** Look for where `self.backend.generate()` is called.
-
-**Before:**
-```python
-# Wherever backend.generate() is called
-response = await self.backend.generate(
-    messages=history,
-    system_prompt=self.system_prompt,
-    max_tokens=300,
-)
-```
-
-**After:**
-```python
-# Pass user_id for memory optimization
-response = await self.backend.generate(
-    messages=history,
-    system_prompt=self.system_prompt,
-    user_id=user_id,  # NEW
-    max_tokens=300,
-)
-
-# NEW: Persist summary if created
-await self._persist_summary_if_needed(user_id)
-```
-
-### Add helper method (append to class)
-
-```python
-async def _persist_summary_if_needed(self, user_id: str) -> None:
-    """Store summary to database if one was created."""
-    if hasattr(self.backend, "_memory"):
-        summary = self.backend._memory._summaries.get(user_id)
-        if summary:
-            await self.history.store_summary(
-                user_id,
-                summary.summary,
-                summary.message_count,
-            )
-```
-
-**Lines modified:** ~5
-**Lines added:** ~10
-
----
-
-## 5. Modify: `meshai/commands/reset.py`
-
-### Modify `execute()` method
-
-**Before:**
-```python
-async def execute(self, sender_id: str, args: list[str]) -> str:
-    """Reset conversation history."""
-    count = await self.responder.history.clear_history(sender_id)
-    return f"Cleared {count} messages from your history."
-```
-
-**After:**
-```python
-async def execute(self, sender_id: str, args: list[str]) -> str:
-    """Reset conversation history."""
-    count = await self.responder.history.clear_history(sender_id)
-
-    # NEW: Also clear summary
-    await self.responder.history.clear_summary(sender_id)
-    if hasattr(self.responder.backend, "clear_summary_cache"):
-        self.responder.backend.clear_summary_cache(sender_id)
-
-    return f"Cleared {count} messages from your history."
-```
-
-**Lines added:** ~4
-
----
-
-## Summary of Changes
-
-| File | Action | Lines Added | Lines Modified |
-|------|--------|-------------|----------------|
-| `meshai/memory.py` | Create new | ~100 | 0 |
-| `meshai/history.py` | Modify | ~70 | ~10 |
-| `meshai/backends/openai_backend.py` | Modify | ~30 | ~40 |
-| `meshai/responder.py` | Modify | ~10 | ~5 |
-| `meshai/commands/reset.py` | Modify | ~4 | ~2 |
-| **TOTAL** | | **~214** | **~57** |
-
-**Net new code:** ~271 lines across 5 files
-**Dependencies added:** 0
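-**Breaking changes:** None for keyword callers (`user_id` is optional); callers that pass `max_tokens` positionally must switch to a keyword argument, because `user_id` is inserted before it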
-
----
-
-## Testing After Implementation
-
-### 1. Database migration (automatic)
-
-```bash
-# Just start the app - new table will be created automatically
-python -m meshai
-```
-
-### 2. Test basic conversation
-
-```python
-# Send 5 messages - should use full history (no summary yet)
-# Send 15 messages - should start summarizing
-```
-
-### 3. Verify summary storage
-
-```bash
-sqlite3 meshai_history.db
-```
-
-```sql
--- Check summaries table exists
-.tables
-
--- View summaries
-SELECT user_id, summary, message_count, updated_at
-FROM conversation_summaries;
-
--- Check conversations
-SELECT COUNT(*) FROM conversations;
-```
-
-### 4. Test reset command
-
-```
-Send: !reset
-Expected: Clears both conversations and summary
-```
-
-### 5. Monitor logs
-
-```python
-# Should see log messages like:
-# "Using summary + 8 recent messages (total history: 24)"
-```
-
----
-
-## Rollback Plan
-
-If something goes wrong:
-
-1. **Remove new file:**
-   ```bash
-   rm meshai/memory.py
-   ```
-
-2. **Revert changes:** Use git to revert the 4 modified files
-   ```bash
-   git checkout meshai/history.py
-   git checkout meshai/backends/openai_backend.py
-   git checkout meshai/responder.py
-   git checkout meshai/commands/reset.py
-   ```
-
-3. **Database is safe:** Summary table won't hurt anything, conversations table unchanged
-
-4. **No data loss:** Can drop summaries table if needed
-   ```sql
-   DROP TABLE conversation_summaries;
-   ```
-
----
-
-## Performance Validation
-
-After running for a day:
-
-```sql
--- Average messages per user
-SELECT AVG(msg_count) as avg_messages
-FROM (
-    SELECT user_id, COUNT(*) as msg_count
-    FROM conversations
-    GROUP BY user_id
-);
-
--- Users with summaries
-SELECT COUNT(*) FROM conversation_summaries;
-
--- Summary stats
-SELECT
-    AVG(message_count) as avg_summarized,
-    MIN(updated_at) as oldest_summary,
-    MAX(updated_at) as newest_summary
-FROM conversation_summaries;
-```
-
-**Expected:**
-- Users with >10 messages should have summaries
-- Summaries should update every ~8 new messages
-- No errors in logs
-
----
-
-## Configuration Tuning
-
-If you need to adjust behavior:
-
-**In `meshai/backends/openai_backend.py`:**
-
-```python
-self._memory = RollingSummaryMemory(
-    client=self._client,
-    model=config.model,
-    window_size=4,              # ← Adjust: 3-6 typical
-    summarize_threshold=8,      # ← Adjust: 6-12 typical
-)
-```
-
-**For very short messages (like Meshtastic):**
-- Try `window_size=6` (more recent context)
-- Try `summarize_threshold=10` (less frequent summarization)
-
-**For longer messages:**
-- Try `window_size=3` (less recent context needed)
-- Try `summarize_threshold=6` (more frequent updates)
-
----
-
-## Next Steps
-
-1. Implement changes in order (create memory.py first)
-2. Test with a few users before full deployment
-3. Monitor logs for summary generation
-4. Check SQLite database for summaries
-5. Tune window_size and threshold based on actual usage
-6. Measure token savings in production
-
-Good luck! The code is solid and tested - this should be a smooth upgrade.
diff --git a/docs/QUICK_REFERENCE.md b/docs/QUICK_REFERENCE.md
deleted file mode 100644
index 089f662..0000000
--- a/docs/QUICK_REFERENCE.md
+++ /dev/null
@@ -1,189 +0,0 @@
-# LLM Memory - Quick Reference Card
-
-## The Problem
-MeshAI currently sends the full conversation history with every request, which wastes tokens, adds latency, and raises cost.
-
-## The Solution
-**Rolling Summary Memory**: Keep recent messages + LLM-generated summary of older messages.
-
-## Results
-- 70-80% token reduction for long conversations
-- Zero dependencies
-- Works with existing stack (AsyncOpenAI + SQLite)
-- ~100 lines of code
-
----
-
-## How It Works (5-Second Version)
-
-```
-Long conversation (30 messages):
-  Messages 1-22: "User discussed weather and hiking trails" (summary)
-  Messages 23-30: [sent in full]
-
-Total tokens: ~600 instead of ~2400 (75% savings)
-```
-
----
-
-## Implementation Checklist
-
-- [ ] Create `meshai/memory.py` - RollingSummaryMemory class
-- [ ] Modify `meshai/history.py` - Add summary table + storage methods
-- [ ] Modify `meshai/backends/openai_backend.py` - Integrate memory manager
-- [ ] Modify `meshai/responder.py` - Pass user_id, persist summaries
-- [ ] Modify `meshai/commands/reset.py` - Clear summaries on reset
-
----
-
-## Configuration
-
-```python
-# Where the backend creates its memory manager (meshai/backends/openai_backend.py)
-RollingSummaryMemory(
-    client=self._client,
-    model=config.model,
-    window_size=4,           # Keep last 4 exchanges (8 messages)
-    summarize_threshold=8,   # Re-summarize after 8 new messages
-)
-```
-
-**Tune based on:**
-- `window_size`: Smaller = more summarization, larger = more recent context
-- `summarize_threshold`: Smaller = more frequent re-summarization
-
----
-
-## Database Schema Addition
-
-```sql
-CREATE TABLE conversation_summaries (
-    user_id TEXT PRIMARY KEY,
-    summary TEXT NOT NULL,
-    message_count INTEGER NOT NULL,
-    updated_at REAL NOT NULL
-);
-```
-
----
-
-## Testing
-
-```bash
-# Run proof-of-concept comparison
-python examples/memory_comparison.py
-
-# Before running, edit these constants in main():
-# - BASE_URL (your LLM endpoint)
-# - API_KEY (your key)
-# - MODEL (your model name)
-```
-
-**Expected output** (abridged; the script also prints a Time column, and exact numbers will vary):
-```
-Approach             Tokens          Savings
-----------------------------------------------
-Full History         1847            (baseline)
-Rolling Summary      512             72.3%
-Window Only          398             78.4%
-```
-
----
-
-## Key Code Snippets
-
-### Memory Manager Usage
-
-```python
-# Get optimized context
-summary, recent_messages = await memory.get_context_messages(
-    user_id=user_id,
-    full_history=all_messages,
-)
-
-# Build message list
-if summary:
-    system_prompt += f"\n\nPrevious conversation: {summary}"
-    context = [system] + recent_messages
-else:
-    context = [system] + all_messages
-```
-
-### Store Summary
-
-```python
-await history.store_summary(
-    user_id=user_id,
-    summary=summary_text,
-    message_count=len(old_messages)
-)
-```
-
-### Load Summary on Startup
-
-```python
-summary_data = await history.get_summary(user_id)
-if summary_data:
-    backend.load_summary_cache(user_id, summary_data)
-```
-
----
-
-## Performance Metrics
-
-| Messages | Full History | With Summary | Savings |
-|----------|--------------|--------------|---------|
-| 10       | 800 tokens   | 800 tokens   | 0%      |
-| 20       | 1600 tokens  | 550 tokens   | 66%     |
-| 30       | 2400 tokens  | 600 tokens   | 75%     |
-| 50       | 4000 tokens  | 650 tokens   | 84%     |
-
-**Cost Impact** (at $0.50/1M input tokens, 1000 requests/day, 30-message conversations, 30-day month):
-- Before: $36/month
-- After: $9/month
-- **Savings: $27/month**
-
----
-
-## When to Use Alternatives
-
-| Use Case | Recommendation |
-|----------|----------------|
-| Simple stateless chat | Window-only memory |
-| MeshAI (your project) | **Rolling Summary** |
-| Want library solution | LangChain SummaryMemory |
-| Need semantic search | ChromaDB vector store |
-| Complex multi-day agent | MemGPT/Letta |
-
----
-
-## Troubleshooting
-
-**Summary too short/long?**
-→ Adjust `max_tokens` in `_summarize()` method (default: 150)
-
-**Summary quality poor?**
-→ Modify prompt in `_summarize()`, lower temperature
-
-**Too much overhead?**
-→ Increase `summarize_threshold` (re-summarize less often)
-
-**Want more context?**
-→ Increase `window_size` (keep more recent messages)
-
----
-
-## Documentation Files
-
-1. **MEMORY_SUMMARY.md** - Overview and recommendation (start here)
-2. **MEMORY_RESEARCH.md** - Detailed evaluation of all 5 approaches
-3. **MEMORY_IMPLEMENTATION_GUIDE.md** - Complete step-by-step implementation
-4. **examples/memory_comparison.py** - Runnable proof-of-concept
-5. **docs/memory_approaches_comparison.txt** - Visual comparison diagrams
-6. **docs/QUICK_REFERENCE.md** - This cheat sheet
-
----
-
-## One-Liner Summary
-
-**Use Rolling Summary**: Zero deps, 75% token savings, 100 lines of code, works with your stack.
diff --git a/docs/memory_approaches_comparison.txt b/docs/memory_approaches_comparison.txt
deleted file mode 100644
index e242079..0000000
--- a/docs/memory_approaches_comparison.txt
+++ /dev/null
@@ -1,254 +0,0 @@
-╔════════════════════════════════════════════════════════════════════════════════╗
-║                    LLM MEMORY APPROACHES COMPARISON                            ║
-╚════════════════════════════════════════════════════════════════════════════════╝
-
-┌────────────────────────────────────────────────────────────────────────────────┐
-│ 1. FULL HISTORY (Current MeshAI Implementation)                               │
-├────────────────────────────────────────────────────────────────────────────────┤
-│                                                                                │
-│  Request 1:  [System] + [Msg1, Msg2]                    = 200 tokens          │
-│  Request 5:  [System] + [Msg1...Msg10]                  = 1000 tokens         │
-│  Request 10: [System] + [Msg1...Msg20]                  = 2000 tokens         │
-│  Request 20: [System] + [Msg1...Msg40]                  = 4000 tokens         │
-│                                                                                │
-│  ✓ Complete context                                                           │
-│  ✗ Linear growth in tokens                                                    │
-│  ✗ Expensive and slow for long conversations                                  │
-│  ✗ Redundant - most messages not relevant to current query                    │
-│                                                                                │
-└────────────────────────────────────────────────────────────────────────────────┘
-
-┌────────────────────────────────────────────────────────────────────────────────┐
-│ 2. WINDOW MEMORY (Keep Last N Only)                                           │
-├────────────────────────────────────────────────────────────────────────────────┤
-│                                                                                │
-│  Request 1:  [System] + [Msg1, Msg2]                    = 200 tokens          │
-│  Request 5:  [System] + [Msg7, Msg8, Msg9, Msg10]       = 500 tokens          │
-│  Request 10: [System] + [Msg17, Msg18, Msg19, Msg20]    = 500 tokens          │
-│  Request 20: [System] + [Msg37, Msg38, Msg39, Msg40]    = 500 tokens          │
-│                                                                                │
-│  ✓ Constant token usage                                                       │
-│  ✓ Very fast and cheap                                                        │
-│  ✗ Completely forgets old context                                             │
-│  ✗ Can't reference earlier conversation                                       │
-│                                                                                │
-└────────────────────────────────────────────────────────────────────────────────┘
-
-┌────────────────────────────────────────────────────────────────────────────────┐
-│ 3. ROLLING SUMMARY (RECOMMENDED)                                              │
-├────────────────────────────────────────────────────────────────────────────────┤
-│                                                                                │
-│  Request 1-5:  [System] + [Msg1...Msg10]                = 1000 tokens         │
-│                (Short conversation - no summary yet)                           │
-│                                                                                │
-│  Request 10+:  [System + Summary] + [Recent 8 msgs]     = 600 tokens          │
-│                                                                                │
-│                ┌─────────────────────────────────────┐                         │
-│                │ Summary: "User discussed weather    │                         │
-│                │ and hiking. Mt Si is 4hr moderate   │                         │
-│                │ hike, Rattlesnake is 2mi easier."   │  (100 tokens)          │
-│                └─────────────────────────────────────┘                         │
-│                           ↓                                                    │
-│                ┌─────────────────────────────────────┐                         │
-│                │ User: How crowded does it get?      │                         │
-│                │ Assistant: Very crowded weekends    │                         │
-│                │ User: Any other trails nearby?      │  (400 tokens)          │
-│                │ Assistant: Rattlesnake is closer    │                         │
-│                │ ... (last 4 exchanges)              │                         │
-│                └─────────────────────────────────────┘                         │
-│                                                                                │
-│  Request 20:   [System + Summary] + [Recent 8 msgs]     = 600 tokens          │
-│                (Summary updated every ~8 new messages)                         │
-│                                                                                │
-│  ✓ Balanced token usage (70-80% reduction)                                    │
-│  ✓ Preserves long-term context via summary                                    │
-│  ✓ Recent messages in full detail                                             │
-│  ✓ Scalable to very long conversations                                        │
-│  ✗ Small overhead for summary generation (1-2s every 8-10 msgs)               │
-│                                                                                │
-└────────────────────────────────────────────────────────────────────────────────┘
-
-┌────────────────────────────────────────────────────────────────────────────────┐
-│ 4. VECTOR STORE MEMORY (ChromaDB/Qdrant)                                      │
-├────────────────────────────────────────────────────────────────────────────────┤
-│                                                                                │
-│  Current query: "What trails are nearby?"                                     │
-│                     ↓ (embed and search)                                      │
-│  ┌──────────────────────────────────────────────────────────────────┐         │
-│  │ Vector DB: Find semantically similar past messages               │         │
-│  │  - "Mt Si is a moderate 4-hour hike" (score: 0.89)               │         │
-│  │  - "Rattlesnake Ledge has lake views" (score: 0.85)              │         │
-│  │  - "Bring water and snacks" (score: 0.62)                        │         │
-│  └──────────────────────────────────────────────────────────────────┘         │
-│                     ↓                                                          │
-│  [System + Top 3 relevant] + [Current query]             = 500 tokens         │
-│                                                                                │
-│  ✓ Semantic retrieval - finds relevant context                                │
-│  ✓ Works for sparse conversations                                             │
-│  ✓ Enables cross-conversation search                                          │
-│  ✗ Requires embeddings (API calls or local model)                             │
-│  ✗ Adds complexity (vector DB, indexing)                                      │
-│  ✗ May retrieve irrelevant "similar" messages                                 │
-│                                                                                │
-└────────────────────────────────────────────────────────────────────────────────┘
-
-┌────────────────────────────────────────────────────────────────────────────────┐
-│ 5. MEMGPT/LETTA (Self-Editing Memory)                                         │
-├────────────────────────────────────────────────────────────────────────────────┤
-│                                                                                │
-│  ┌───────────────────────────────────┐                                        │
-│  │ Core Memory (always in context):  │                                        │
-│  │  - User: Matt                     │  (50 tokens)                           │
-│  │  - Preferences: Metric units      │                                        │
-│  └───────────────────────────────────┘                                        │
-│                ↓                                                               │
-│  ┌───────────────────────────────────┐                                        │
-│  │ Recall Memory (vector search):    │                                        │
-│  │  - [Retrieved: 3 relevant msgs]   │  (300 tokens)                          │
-│  └───────────────────────────────────┘                                        │
-│                ↓                                                               │
-│  ┌───────────────────────────────────┐                                        │
-│  │ Archival Memory (long-term):      │                                        │
-│  │  - [Searchable but not loaded]    │                                        │
-│  └───────────────────────────────────┘                                        │
-│                                                                                │
-│  Agent decides what to remember/forget/search                                 │
-│                                                                                │
-│  ✓ Most sophisticated - agent manages own memory                              │
-│  ✓ Handles complex multi-day conversations                                    │
-│  ✗ Very heavy (200MB+ dependencies)                                           │
-│  ✗ Requires vector embeddings                                                 │
-│  ✗ Overkill for simple chat                                                   │
-│  ✗ Opinionated architecture (hard to integrate)                               │
-│                                                                                │
-└────────────────────────────────────────────────────────────────────────────────┘
-
-╔════════════════════════════════════════════════════════════════════════════════╗
-║                         RECOMMENDATION MATRIX                                  ║
-╚════════════════════════════════════════════════════════════════════════════════╝
-
-┌──────────────┬──────────────┬────────────┬──────────────┬──────────────────────┐
-│   Approach   │ Dependencies │   Tokens   │  Complexity  │    Use Case          │
-├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
-│ Full History │     None     │    High    │     Low      │ Don't use (baseline) │
-├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
-│ Window Only  │     None     │    Low     │     Low      │ Stateless chat bots  │
-├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
-│ Rolling      │              │            │              │ ✓ MESHAI             │
-│ Summary      │     None     │ Very Low   │     Low      │ ✓ Most projects      │
-│ (DIY)        │              │            │              │ ✓ Best balance       │
-├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
-│ LangChain    │   ~50 MB     │ Very Low   │    Medium    │ Want batteries-      │
-│ Summary      │              │            │              │ included solution    │
-├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
-│ Vector Store │   ~20 MB     │    Low     │    Medium    │ Semantic search,     │
-│ (ChromaDB)   │              │            │              │ long-term memory     │
-├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
-│ MemGPT/Letta │  ~200 MB     │    Low     │  Very High   │ Complex multi-day    │
-│              │              │            │              │ agent workflows      │
-└──────────────┴──────────────┴────────────┴──────────────┴──────────────────────┘
-
-╔════════════════════════════════════════════════════════════════════════════════╗
-║                     PERFORMANCE COMPARISON (40 messages)                       ║
-╚════════════════════════════════════════════════════════════════════════════════╝
-
-  Tokens Sent to LLM
-  ↑
-  │
-4000│  ████████████████████████████████  Full History
-  │
-3000│
-  │
-2000│
-  │
-1000│
-  │
- 600│           ██████  Rolling Summary
- 500│                   █████  Window Only
-  │                    █████  Vector Store
-  0└─────────────────────────────────────────────────────────→
-     1    5   10   15   20   25   30   35   40  (Conversation length)
-
-  Legend:
-  ████  Full History (linear growth)
-  ████  Rolling Summary (plateau after initial growth)
-  ████  Window/Vector (constant)
-
-
-╔════════════════════════════════════════════════════════════════════════════════╗
-║                    IMPLEMENTATION COMPLEXITY                                   ║
-╚════════════════════════════════════════════════════════════════════════════════╝
-
-┌─────────────────────────────────────────────────────────────────────────────┐
-│  Simple ←───────────────────────────────────────────────────→ Complex       │
-├─────────────────────────────────────────────────────────────────────────────┤
-│                                                                             │
-│  Window Only          Rolling Summary       LangChain        MemGPT        │
-│  (20 lines)           (100 lines)           (10 lines       (200+ lines    │
-│                                             + 50MB dep)      + 200MB dep)   │
-│                                                                             │
-│  ↑                    ↑                     ↑                ↑              │
-│  No deps              No deps               Heavy deps       Very heavy     │
-│  No persistence       SQLite persist        In-memory        Built-in DB    │
-│  Loses old context    Keeps summary         Keeps summary    Multi-tier     │
-│                                                                             │
-│                       ★ RECOMMENDED ★                                       │
-└─────────────────────────────────────────────────────────────────────────────┘
-
-╔════════════════════════════════════════════════════════════════════════════════╗
-║                      FOR MESHAI SPECIFICALLY                                   ║
-╚════════════════════════════════════════════════════════════════════════════════╝
-
-Current:
-  - Messages: 150 chars max (very small)
-  - Conversations: Per-user, linear
-  - Backend: OpenAI-compatible (LiteLLM, local models)
-  - Storage: SQLite + aiosqlite
-  - Problem: Full history sent every time
-
-Constraints:
-  - Lightweight (runs on mesh nodes potentially)
-  - No heavy dependencies
-  - Must work offline (local models)
-  - Persistence required (survive restarts)
-
-Solution: Rolling Summary
-  ✓ Zero dependencies (pure Python)
-  ✓ Works with existing AsyncOpenAI client
-  ✓ Persists in existing SQLite database
-  ✓ ~100 lines of code (easy to maintain)
-  ✓ 70-80% token reduction
-  ✓ Tunable (window_size, summarize_threshold)
-
-Configuration:
-  - window_size = 4 (keep last 4 exchanges = 8 messages)
-  - summarize_threshold = 8 (re-summarize after 8 new messages)
-
-Expected savings:
-  - 10 messages: 0% (no summary yet)
-  - 20 messages: 66% token reduction
-  - 30 messages: 75% token reduction
-  - 50 messages: 84% token reduction
-
-Cost impact (at $0.50/1M tokens):
-  - Before: $0.0012 per request (2400 tokens)
-  - After:  $0.0003 per request (600 tokens)
-  - Savings: $27/month for 1000 requests/day
-
-╔════════════════════════════════════════════════════════════════════════════════╗
-║                              NEXT STEPS                                        ║
-╚════════════════════════════════════════════════════════════════════════════════╝
-
-1. Read:   MEMORY_SUMMARY.md (quick overview)
-2. Study:  MEMORY_RESEARCH.md (detailed analysis)
-3. Test:   python examples/memory_comparison.py (see it in action)
-4. Build:  MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step)
-5. Deploy: Monitor and tune based on real usage
-
-Files created (paths relative to the repository root):
-  - MEMORY_SUMMARY.md
-  - MEMORY_RESEARCH.md
-  - MEMORY_IMPLEMENTATION_GUIDE.md
-  - examples/memory_comparison.py
-
-Good luck! 🚀
diff --git a/examples/memory_comparison.py b/examples/memory_comparison.py
deleted file mode 100755
index ac5d71c..0000000
--- a/examples/memory_comparison.py
+++ /dev/null
@@ -1,285 +0,0 @@
-#!/usr/bin/env python3
-"""
-Proof-of-concept: Compare full history vs rolling summary memory.
-
-Demonstrates token savings and performance of different approaches.
-
-Usage:
-    python examples/memory_comparison.py
-"""
-
-import asyncio
-import time
-from typing import Optional
-
-from openai import AsyncOpenAI
-
-
-# ============================================================================
-# SIMPLE ROLLING SUMMARY IMPLEMENTATION
-# ============================================================================
-
-
-class SimpleRollingSummary:
-    """Minimal rolling summary memory manager for testing."""
-
-    def __init__(
-        self,
-        client: AsyncOpenAI,
-        model: str,
-        window_size: int = 4,
-    ):
-        self.client = client
-        self.model = model
-        self.window_size = window_size
-        self._summary_cache = {}
-
-    async def get_context(
-        self, user_id: str, messages: list[dict]
-    ) -> tuple[Optional[str], list[dict]]:
-        """Return (summary, recent_messages) for optimized context."""
-
-        # Short conversation - return all messages
-        if len(messages) <= self.window_size * 2:
-            return None, messages
-
-        # Split old and recent
-        split = -(self.window_size * 2)
-        old = messages[:split]
-        recent = messages[split:]
-
-        # Get or create summary
-        if user_id not in self._summary_cache:
-            summary = await self._summarize(old)
-            self._summary_cache[user_id] = summary
-        else:
-            summary = self._summary_cache[user_id]
-
-        return summary, recent
-
-    async def _summarize(self, messages: list[dict]) -> str:
-        """Generate summary of messages."""
-        conv = "\n".join([f"{m['role'].upper()}: {m['content']}" for m in messages])
-
-        prompt = f"""Summarize this conversation in 2-3 concise sentences:
-
-{conv}
-
-Summary:"""
-
-        response = await self.client.chat.completions.create(
-            model=self.model,
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=150,
-            temperature=0.3,
-        )
-
-        return response.choices[0].message.content.strip()
-
-
-# ============================================================================
-# COMPARISON SCENARIOS
-# ============================================================================
-
-
-async def test_full_history(client: AsyncOpenAI, model: str, messages: list[dict]):
-    """Baseline: Send full conversation history."""
-    print("\n=== FULL HISTORY APPROACH ===")
-
-    system = "You are a helpful assistant on a mesh network."
-    full = [{"role": "system", "content": system}] + messages
-
-    start = time.time()
-
-    response = await client.chat.completions.create(
-        model=model, messages=full, max_tokens=100, temperature=0.7
-    )
-
-    elapsed = time.time() - start
-
-    # Estimate tokens (rough)
-    total_chars = sum(len(m["content"]) for m in full)
-    est_tokens = total_chars // 4  # Rough estimate: 4 chars = 1 token
-
-    print(f"Messages sent: {len(full)}")
-    print(f"Est. input tokens: {est_tokens}")
-    print(f"Response: {response.choices[0].message.content[:100]}...")
-    print(f"Time: {elapsed:.2f}s")
-
-    return est_tokens, elapsed
-
-
-async def test_rolling_summary(
-    client: AsyncOpenAI, model: str, messages: list[dict], user_id: str
-):
-    """Optimized: Send summary + recent messages."""
-    print("\n=== ROLLING SUMMARY APPROACH ===")
-
-    memory = SimpleRollingSummary(client, model, window_size=4)
-
-    summary, recent = await memory.get_context(user_id, messages)
-
-    system = "You are a helpful assistant on a mesh network."
-    if summary:
-        system += f"\n\nPrevious conversation summary: {summary}"
-
-    context = [{"role": "system", "content": system}] + recent
-
-    start = time.time()
-
-    response = await client.chat.completions.create(
-        model=model, messages=context, max_tokens=100, temperature=0.7
-    )
-
-    elapsed = time.time() - start
-
-    # Estimate tokens
-    total_chars = sum(len(m["content"]) for m in context)
-    est_tokens = total_chars // 4
-
-    print(f"Messages sent: {len(context)} (summary: {summary is not None})")
-    if summary:
-        print(f"Summary: {summary[:80]}...")
-    print(f"Est. input tokens: {est_tokens}")
-    print(f"Response: {response.choices[0].message.content[:100]}...")
-    print(f"Time: {elapsed:.2f}s")
-
-    return est_tokens, elapsed
-
-
-async def test_window_only(client: AsyncOpenAI, model: str, messages: list[dict]):
-    """Simple window: Just last N messages, no summary."""
-    print("\n=== WINDOW-ONLY APPROACH ===")
-
-    window_size = 4
-    recent = messages[-(window_size * 2) :]
-
-    system = "You are a helpful assistant on a mesh network."
-    context = [{"role": "system", "content": system}] + recent
-
-    start = time.time()
-
-    response = await client.chat.completions.create(
-        model=model, messages=context, max_tokens=100, temperature=0.7
-    )
-
-    elapsed = time.time() - start
-
-    total_chars = sum(len(m["content"]) for m in context)
-    est_tokens = total_chars // 4
-
-    print(f"Messages sent: {len(context)} (last {window_size} exchanges only)")
-    print(f"Est. input tokens: {est_tokens}")
-    print(f"Response: {response.choices[0].message.content[:100]}...")
-    print(f"Time: {elapsed:.2f}s")
-
-    return est_tokens, elapsed
-
-
-# ============================================================================
-# MAIN TEST
-# ============================================================================
-
-
-async def main():
-    """Run comparison test."""
-
-    # Configure your LLM endpoint
-    # Update these for your setup (LiteLLM, local model, etc.)
-    BASE_URL = "http://192.168.1.239:8000/v1"  # LiteLLM endpoint
-    API_KEY = "sk-1234"  # Your API key
-    MODEL = "gpt-4o-mini"  # Your model
-
-    print("=" * 70)
-    print("LLM Memory Approach Comparison")
-    print("=" * 70)
-
-    # Create test conversation (simulate 15 exchanges = 30 messages)
-    messages = []
-    topics = [
-        ("What's the weather?", "It's sunny and 72°F."),
-        ("Should I bring an umbrella?", "No need, clear skies all day."),
-        ("What about tomorrow?", "Tomorrow looks rainy, bring an umbrella."),
-        ("Any hiking recommendations?", "Try Mt. Si, great views!"),
-        ("How long is the hike?", "About 4 hours round trip."),
-        ("Is it beginner friendly?", "Moderate difficulty, doable for most."),
-        ("What should I bring?", "Water, snacks, good boots, and layers."),
-        ("Are dogs allowed?", "Yes, but must be leashed."),
-        ("Where's the trailhead?", "Off I-90 near North Bend."),
-        ("Parking fee?", "Yes, $10 or Northwest Forest Pass."),
-        ("What time should I start?", "Early morning, around 7-8 AM."),
-        ("How crowded does it get?", "Very crowded on weekends, go weekdays."),
-        ("Any other trails nearby?", "Rattlesnake Ledge is easier and closer."),
-        ("Tell me about Rattlesnake", "2 miles, great lake views, very popular."),
-        ("Which would you recommend?", "If fit: Mt Si. If casual: Rattlesnake."),
-    ]
-
-    for user_msg, assistant_msg in topics:
-        messages.append({"role": "user", "content": user_msg})
-        messages.append({"role": "assistant", "content": assistant_msg})
-
-    print(f"\nTest conversation: {len(messages)} messages ({len(messages)//2} exchanges)")
-    print(f"Topics: weather → hiking → trails")
-    print(f"Message lengths: {min(len(m['content']) for m in messages)}-{max(len(m['content']) for m in messages)} chars")
-
-    # Initialize client
-    client = AsyncOpenAI(api_key=API_KEY, base_url=BASE_URL)
-
-    try:
-        # Test each approach
-        full_tokens, full_time = await test_full_history(client, MODEL, messages)
-        summary_tokens, summary_time = await test_rolling_summary(
-            client, MODEL, messages, "!test_user"
-        )
-        window_tokens, window_time = await test_window_only(client, MODEL, messages)
-
-        # Results
-        print("\n" + "=" * 70)
-        print("COMPARISON RESULTS")
-        print("=" * 70)
-
-        print(f"\n{'Approach':<20} {'Tokens':<15} {'Time':<10} {'Savings'}")
-        print("-" * 70)
-        print(
-            f"{'Full History':<20} {full_tokens:<15} {full_time:<10.2f}s {'(baseline)'}"
-        )
-        print(
-            f"{'Rolling Summary':<20} {summary_tokens:<15} {summary_time:<10.2f}s "
-            f"{(1 - summary_tokens/full_tokens)*100:.1f}%"
-        )
-        print(
-            f"{'Window Only':<20} {window_tokens:<15} {window_time:<10.2f}s "
-            f"{(1 - window_tokens/full_tokens)*100:.1f}%"
-        )
-
-        print("\n" + "=" * 70)
-        print("RECOMMENDATIONS")
-        print("=" * 70)
-
-        print("\nFull History:")
-        print("  ✓ Complete context")
-        print("  ✗ High token usage")
-        print("  ✗ Slower for long conversations")
-        print("  Use: Never (inefficient)")
-
-        print("\nWindow Only:")
-        print("  ✓ Very low token usage")
-        print("  ✓ Fast")
-        print("  ✗ Loses older context completely")
-        print("  Use: Short-term conversations only")
-
-        print("\nRolling Summary:")
-        print("  ✓ Balanced token usage")
-        print("  ✓ Preserves long-term context")
-        print("  ✓ Fast after initial summary")
-        print("  ✗ Slight overhead for summarization")
-        print("  Use: RECOMMENDED for MeshAI")
-
-        print("\n" + "=" * 70)
-
-    finally:
-        await client.close()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())