Remove AI planning docs and example scripts

These were LLM-generated planning artifacts from the memory implementation phase; they are not user-facing documentation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-21 23:24:44 +02:00 · 2026-02-24 00:22:31 +00:00 · 2026-02-24 00:22:31 +00:00 · 9a628724ce
commit 9a628724ce
parent 8c2c4d2aef
9 changed files with 0 additions and 4013 deletions
--- a/MEMORY_IMPLEMENTATION_GUIDE.md
+++ b/MEMORY_IMPLEMENTATION_GUIDE.md
@@ -1,656 +0,0 @@
 # Quick Implementation Guide: Rolling Summary Memory
 ## TL;DR
 **Problem:** Sending full conversation history every request wastes tokens and latency.
 **Solution:** Rolling summary approach - keep recent messages + LLM-generated summary of older messages.
 **Result:** ~73-83% token reduction for long conversations (savings grow with conversation length), zero new dependencies, works with current stack.
 ---
 ## Architecture
 ```
 SQLite History (per user)
    ↓
 Messages 1-10: Summarized → "User asked about weather, discussed outdoor plans"
 Messages 11-18: Sent raw  → Full context
    ↓
 LLM receives: System prompt + Summary + Recent 8 messages
    ↓
 Response generated
 ```
 ---
 ## Files to Create/Modify
 ### 1. Create `meshai/memory.py`
 ```python
 """Lightweight rolling summary memory manager."""
 import time
 from dataclasses import dataclass
 from typing import Optional
 from openai import AsyncOpenAI
@dataclass
 class ConversationSummary:
    """Summary of conversation history."""
    summary: str
    last_updated: float
    message_count: int
 class RollingSummaryMemory:
    """Manages conversation summaries with recent message window.
    Strategy:
    - Keep last N message pairs (window_size) in full
    - Summarize everything before the window
    - Update summary when old messages accumulate
    Example (window_size=4):
        Messages 1-10: Summarized to "User discussed weather and plans"
        Messages 11-18: Kept in full (last 4 pairs)
        Context sent: [Summary] + [Messages 11-18]
    """
    def __init__(
        self,
        client: AsyncOpenAI,
        model: str,
        window_size: int = 4,
        summarize_threshold: int = 8,
    ):
        """Initialize rolling summary memory.
        Args:
            client: AsyncOpenAI client for generating summaries
            model: Model name to use for summarization
            window_size: Number of recent message pairs to keep in full
            summarize_threshold: Messages to accumulate before re-summarizing
        """
        self._client = client
        self._model = model
        self._window_size = window_size
        self._summarize_threshold = summarize_threshold
        # In-memory cache of summaries (loaded from DB on startup)
        self._summaries: dict[str, ConversationSummary] = {}
    async def get_context_messages(
        self,
        user_id: str,
        full_history: list[dict],
    ) -> tuple[Optional[str], list[dict]]:
        """Get optimized context: summary + recent messages.
        Args:
            user_id: User identifier
            full_history: Full message history from database
        Returns:
            Tuple of (summary_text, recent_messages)
            summary_text is None if conversation is short
        """
        # Short conversation - no summary needed
        if len(full_history) <= self._window_size * 2:
            return None, full_history
        # Split into old (to summarize) and recent (keep raw)
        split_point = -(self._window_size * 2)
        old_messages = full_history[:split_point]
        recent_messages = full_history[split_point:]
        # Get or create summary
        summary = await self._get_or_create_summary(user_id, old_messages)
        return summary.summary, recent_messages
    async def _get_or_create_summary(
        self,
        user_id: str,
        messages: list[dict],
    ) -> ConversationSummary:
        """Get cached summary or create new one."""
        # Check cache
        if user_id in self._summaries:
            cached = self._summaries[user_id]
            # Reuse if message count is close
            if abs(cached.message_count - len(messages)) < self._summarize_threshold:
                return cached
        # Generate new summary
        summary_text = await self._summarize(messages)
        summary = ConversationSummary(
            summary=summary_text,
            last_updated=time.time(),
            message_count=len(messages),
        )
        self._summaries[user_id] = summary
        return summary
    async def _summarize(self, messages: list[dict]) -> str:
        """Generate summary using LLM."""
        # Format conversation
        conversation = "\n".join(
            [f"{msg['role'].upper()}: {msg['content']}" for msg in messages]
        )
        prompt = f"""Summarize this conversation in 2-3 concise sentences. Focus on:
 - Main topics discussed
 - Important context or user preferences
 - Key information to remember
 Conversation:
 {conversation}
 Summary (2-3 sentences):"""
        try:
            response = await self._client.chat.completions.create(
                model=self._model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150,
                temperature=0.3,
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            # Fallback
            return f"Previous conversation: {len(messages)} messages about various topics."
    def load_summary(self, user_id: str, summary: ConversationSummary) -> None:
        """Load summary from database into cache."""
        self._summaries[user_id] = summary
    def clear_summary(self, user_id: str) -> None:
        """Clear cached summary for user."""
        self._summaries.pop(user_id, None)
 ```
 ---
 ### 2. Modify `meshai/history.py`
 Add summary storage methods:
 ```python
 # Add to ConversationHistory class
 async def initialize(self) -> None:
    """Initialize database and create tables."""
    self._db = await aiosqlite.connect(self._db_path)
    # Existing conversations table
    await self._db.execute("""
        CREATE TABLE IF NOT EXISTS conversations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id TEXT NOT NULL,
            role TEXT NOT NULL,
            content TEXT NOT NULL,
            timestamp REAL NOT NULL
        )
    """)
    await self._db.execute("""
        CREATE INDEX IF NOT EXISTS idx_user_timestamp
        ON conversations (user_id, timestamp)
    """)
    # NEW: Summaries table
    await self._db.execute("""
        CREATE TABLE IF NOT EXISTS conversation_summaries (
            user_id TEXT PRIMARY KEY,
            summary TEXT NOT NULL,
            message_count INTEGER NOT NULL,
            updated_at REAL NOT NULL
        )
    """)
    await self._db.commit()
    logger.info(f"Conversation history initialized at {self._db_path}")
 async def store_summary(
    self, user_id: str, summary: str, message_count: int
 ) -> None:
    """Store conversation summary.
    Args:
        user_id: Node ID of user
        summary: Summary text
        message_count: Number of messages summarized
    """
    if not self._db:
        raise RuntimeError("Database not initialized")
    async with self._lock:
        await self._db.execute(
            """
            INSERT OR REPLACE INTO conversation_summaries
            (user_id, summary, message_count, updated_at)
            VALUES (?, ?, ?, ?)
            """,
            (user_id, summary, message_count, time.time()),
        )
        await self._db.commit()
 async def get_summary(self, user_id: str) -> Optional[dict]:
    """Get conversation summary for user.
    Args:
        user_id: Node ID of user
    Returns:
        Dict with 'summary', 'message_count', 'updated_at' or None
    """
    if not self._db:
        raise RuntimeError("Database not initialized")
    async with self._lock:
        cursor = await self._db.execute(
            """
            SELECT summary, message_count, updated_at
            FROM conversation_summaries
            WHERE user_id = ?
            """,
            (user_id,),
        )
        row = await cursor.fetchone()
    if not row:
        return None
    return {
        "summary": row[0],
        "message_count": row[1],
        "updated_at": row[2],
    }
 async def clear_summary(self, user_id: str) -> None:
    """Clear summary for user (e.g., on history reset).
    Args:
        user_id: Node ID of user
    """
    if not self._db:
        raise RuntimeError("Database not initialized")
    async with self._lock:
        await self._db.execute(
            "DELETE FROM conversation_summaries WHERE user_id = ?",
            (user_id,),
        )
        await self._db.commit()
 ```
 ---
 ### 3. Modify `meshai/backends/openai_backend.py`
 Integrate memory manager:
 ```python
 """OpenAI-compatible LLM backend with rolling summary memory."""
 import logging
 from typing import Optional
 from openai import AsyncOpenAI
 from ..config import LLMConfig
 from ..memory import RollingSummaryMemory
 from .base import LLMBackend
 logger = logging.getLogger(__name__)
 class OpenAIBackend(LLMBackend):
    """OpenAI-compatible backend with intelligent memory management."""
    def __init__(self, config: LLMConfig, api_key: str):
        """Initialize OpenAI backend.
        Args:
            config: LLM configuration
            api_key: API key to use
        """
        self.config = config
        self._client = AsyncOpenAI(
            api_key=api_key,
            base_url=config.base_url,
        )
        # Initialize rolling summary memory
        self._memory = RollingSummaryMemory(
            client=self._client,
            model=config.model,
            window_size=4,  # Keep last 4 exchanges (8 messages)
            summarize_threshold=8,  # Re-summarize after 8 new messages
        )
    async def generate(
        self,
        messages: list[dict],
        system_prompt: str,
        user_id: str = None,  # NEW: optional for backward compatibility
        max_tokens: int = 300,
    ) -> str:
        """Generate a response using OpenAI-compatible API.
        Args:
            messages: Conversation history
            system_prompt: System prompt
            user_id: User identifier (for memory management)
            max_tokens: Maximum tokens to generate
        Returns:
            Generated response
        """
        # If no user_id, use old behavior (send full history)
        if not user_id:
            full_messages = [{"role": "system", "content": system_prompt}]
            full_messages.extend(messages)
        else:
            # Use memory manager to optimize context
            summary, recent_messages = await self._memory.get_context_messages(
                user_id=user_id,
                full_history=messages,
            )
            # Build optimized message list
            if summary:
                # Long conversation: system + summary + recent
                enhanced_system = f"""{system_prompt}
 Previous conversation summary: {summary}"""
                full_messages = [{"role": "system", "content": enhanced_system}]
                full_messages.extend(recent_messages)
                logger.debug(
                    f"Using summary + {len(recent_messages)} recent messages "
                    f"(total history: {len(messages)})"
                )
            else:
                # Short conversation: system + all messages
                full_messages = [{"role": "system", "content": system_prompt}]
                full_messages.extend(messages)
        try:
            response = await self._client.chat.completions.create(
                model=self.config.model,
                messages=full_messages,
                max_tokens=max_tokens,
                temperature=0.7,
            )
            content = response.choices[0].message.content
            return content.strip() if content else ""
        except Exception as e:
            logger.error(f"OpenAI API error: {e}")
            raise
    def load_summary_cache(self, user_id: str, summary_data: dict) -> None:
        """Load summary into memory cache (called on startup).
        Args:
            user_id: User identifier
            summary_data: Dict with 'summary', 'message_count', 'updated_at'
        """
        from ..memory import ConversationSummary
        summary = ConversationSummary(
            summary=summary_data["summary"],
            message_count=summary_data["message_count"],
            last_updated=summary_data["updated_at"],
        )
        self._memory.load_summary(user_id, summary)
    def clear_summary_cache(self, user_id: str) -> None:
        """Clear summary cache for user."""
        self._memory.clear_summary(user_id)
    # ... rest of methods unchanged ...
 ```
 ---
 ### 4. Modify `meshai/responder.py`
 Pass user_id to backend and persist summaries:
 ```python
 # In the generate_response method
 async def generate_response(self, user_id: str, message: str) -> str:
    """Generate LLM response with optimized memory."""
    # Add user message to history
    await self.history.add_message(user_id, "user", message)
    # Get conversation history
    history = await self.history.get_history_for_llm(user_id)
    # Generate response with user_id for memory management
    response = await self.backend.generate(
        messages=history,
        system_prompt=self.system_prompt,
        user_id=user_id,  # NEW: enables memory optimization
        max_tokens=300,
    )
    # Add assistant response to history
    await self.history.add_message(user_id, "assistant", response)
    # Persist summary if one was created
    # The memory manager caches it, we need to save to DB
    summary_data = await self._get_current_summary(user_id)
    if summary_data:
        await self.history.store_summary(
            user_id,
            summary_data["summary"],
            summary_data["message_count"],
        )
    return response
 async def _get_current_summary(self, user_id: str) -> Optional[dict]:
    """Get current summary from memory manager if it exists."""
    # Access the memory manager's cache
    if hasattr(self.backend, "_memory"):
        summary = self.backend._memory._summaries.get(user_id)
        if summary:
            return {
                "summary": summary.summary,
                "message_count": summary.message_count,
                "updated_at": summary.last_updated,
            }
    return None
 ```
 ---
 ### 5. Modify `meshai/commands/reset.py`
 Clear summaries when resetting history:
 ```python
 async def execute(self, sender_id: str, args: list[str]) -> str:
    """Reset conversation history."""
    count = await self.responder.history.clear_history(sender_id)
    # NEW: Also clear summary
    await self.responder.history.clear_summary(sender_id)
    if hasattr(self.responder.backend, "clear_summary_cache"):
        self.responder.backend.clear_summary_cache(sender_id)
    return f"Cleared {count} messages from your history."
 ```
 ---
 ## Configuration
 Add to `meshai/config.py`:
 ```python
@dataclass
 class MemoryConfig:
    """Memory management configuration."""
    # Rolling summary settings
    window_size: int = 4  # Recent message pairs to keep
    summarize_threshold: int = 8  # Messages before re-summarizing
    # When to enable summaries
    min_messages_for_summary: int = 10  # Start summarizing after this many
 ```
 ---
 ## Testing
 ```python
 # Test script
 import asyncio
 from meshai.backends.openai_backend import OpenAIBackend
 from meshai.config import LLMConfig
 async def test():
    config = LLMConfig(
        backend="openai",
        base_url="http://192.168.1.239:8000/v1",
        model="gpt-4o-mini"
    )
    backend = OpenAIBackend(config, "your-key")
    # Simulate long conversation
    messages = []
    for i in range(20):
        messages.append({"role": "user", "content": f"Question {i}"})
        messages.append({"role": "assistant", "content": f"Answer {i}"})
    # Generate - should use summary
    response = await backend.generate(
        messages=messages,
        system_prompt="You are helpful.",
        user_id="!test123",
        max_tokens=100
    )
    print(f"Response: {response}")
    print(f"Sent {len(messages)} messages, but only ~10 used in context")
 asyncio.run(test())
 ```
 ---
 ## Expected Results
 ### Token Usage Comparison
 **Before (full history):**
 ```
 User message 1-20: ~2000 tokens
 System prompt: ~50 tokens
 Total: ~2050 tokens per request
 ```
 **After (with summary):**
 ```
 System prompt: ~50 tokens
 Summary: ~100 tokens
 Recent 8 messages: ~400 tokens
 Total: ~550 tokens per request
 ```
 **Savings: ~73% token reduction**
 ### Performance Impact
 - **Summary generation**: ~1-2s every 8-10 messages (amortized)
 - **Regular requests**: No added latency
 - **Storage**: ~100 bytes per summary in SQLite
 ---
 ## Tuning Parameters
 ### window_size
 - **Smaller (2-3)**: More aggressive summarization, max token savings
 - **Larger (5-6)**: More context, less summarization
 - **Recommended**: 4 (last 4 exchanges = 8 messages)
 ### summarize_threshold
 - **Smaller (4-6)**: Frequent re-summarization, more current
 - **Larger (10-12)**: Less summarization overhead
 - **Recommended**: 8 (re-summarize after 8 new messages)
 ### For MeshAI specifically:
 - Messages are tiny (150 chars max)
 - `window_size=4` gives ~600 chars of recent context
 - `summarize_threshold=8` balances overhead vs accuracy
 ---
 ## Migration Path
 1. **Phase 1**: Add code, test with new users
 2. **Phase 2**: Run in parallel (old + new backend)
 3. **Phase 3**: Migrate existing users (generate summaries for existing history)
 4. **Phase 4**: Remove old full-history code path
 No data loss - summaries stored in DB, can regenerate anytime.
 ---
 ## Maintenance
 ### Monitor summary quality:
 ```sql
 -- Check summaries
 SELECT user_id, summary, message_count, updated_at
 FROM conversation_summaries
 ORDER BY updated_at DESC;
 ```
 ### Regenerate summary:
 ```python
 # Clear cache + DB, will regenerate on next request
 await history.clear_summary(user_id)
 backend.clear_summary_cache(user_id)
 ```
 ### Adjust if summaries too short/long:
 - Modify prompt in `_summarize()`
 - Adjust `max_tokens=150` for summaries
 - Change temperature (lower = more consistent)
 ---
 ## Future Enhancements
 1. **Hybrid approach**: Summary + semantic search for very long histories
 2. **User preferences**: Store separate from summary (e.g., "likes weather in metric")
 3. **Multi-level summaries**: Summarize summaries for years-long conversations
 4. **Summary quality scoring**: Validate summaries maintain key information
 But start simple - this gets 80% of the benefit with 20% of the complexity.
--- a/MEMORY_README.md
+++ b/MEMORY_README.md
@@ -1,437 +0,0 @@
 # LLM Conversation Memory Research & Implementation
 This directory contains comprehensive research and implementation guides for improving LLM conversation memory in MeshAI.
 ## Problem Statement
 MeshAI currently sends the full conversation history with every LLM API call. This approach:
 - Wastes tokens (expensive and slow)
 - Doesn't scale to long conversations
 - Sends redundant context the LLM doesn't need
 ## Solution: Rolling Summary Memory
 Keep recent messages in full + LLM-generated summary of older messages.
 **Result:** 70-80% token reduction, zero new dependencies, works with existing stack.
 ---
 ## Documentation Index
 ### 1. Quick Start
 **READ THIS FIRST:** [`MEMORY_SUMMARY.md`](MEMORY_SUMMARY.md)
 - High-level overview
 - Why rolling summary?
 - Comparison with alternatives
 - Expected performance gains
 **Estimated reading time:** 10 minutes
 ---
 ### 2. Detailed Research
 **FOR DEEP DIVE:** [`MEMORY_RESEARCH.md`](MEMORY_RESEARCH.md)
 - Full evaluation of 5 approaches:
  1. LangChain Memory modules
  2. LlamaIndex
  3. MemGPT/Letta
  4. Vector stores (ChromaDB/Qdrant)
  5. Simple rolling summary (DIY)
 - Code examples for each approach
 - Pros/cons for MeshAI specifically
 - Detailed comparison matrix
 **Estimated reading time:** 30-45 minutes
 ---
 ### 3. Implementation Guide
 **FOR BUILDING:** [`MEMORY_IMPLEMENTATION_GUIDE.md`](MEMORY_IMPLEMENTATION_GUIDE.md)
 - Step-by-step implementation
 - Complete code examples
 - Database schema
 - Configuration options
 - Testing procedures
 - Troubleshooting guide
 **Estimated reading time:** 20 minutes + implementation time
 ---
 ### 4. Implementation Diff
 **FOR EXACT CHANGES:** [`docs/IMPLEMENTATION_DIFF.md`](docs/IMPLEMENTATION_DIFF.md)
 - Exact code diffs for all files
 - Line-by-line changes needed
 - Migration checklist
 - Rollback plan
 - Performance validation queries
 **Estimated reading time:** 15 minutes
 ---
 ### 5. Visual Comparison
 **FOR UNDERSTANDING:** [`docs/memory_approaches_comparison.txt`](docs/memory_approaches_comparison.txt)
 - ASCII diagrams of all approaches
 - Visual token usage comparison
 - Decision matrices
 - Architecture diagrams
 **Estimated reading time:** 10 minutes
 ---
 ### 6. Quick Reference
 **FOR CHEAT SHEET:** [`docs/QUICK_REFERENCE.md`](docs/QUICK_REFERENCE.md)
 - One-page reference card
 - Key configuration
 - Code snippets
 - Performance metrics
 - Troubleshooting tips
 **Estimated reading time:** 5 minutes
 ---
 ### 7. Proof of Concept
 **FOR TESTING:** [`examples/memory_comparison.py`](examples/memory_comparison.py)
 - Runnable comparison script
 - Tests all 3 approaches side-by-side:
  - Full history (baseline)
  - Rolling summary
  - Window-only
 - Real token usage measurements
 - Performance comparison
 **Usage:**
 ```bash
 # Edit script with your LLM endpoint
 nano examples/memory_comparison.py
 # Update BASE_URL, API_KEY, MODEL
 # Run comparison
 python examples/memory_comparison.py
 ```
 **Expected output:**
 ```
 Approach             Tokens          Time       Savings
 ----------------------------------------------------------------------
 Full History         1847            2.34s      (baseline)
 Rolling Summary      512             1.87s      72.3%
 Window Only          398             1.45s      78.4%
 RECOMMENDATION: Rolling Summary - best balance of context and efficiency
 ```
 ---
 ## Recommended Reading Path
 ### Path 1: Executive Summary (20 minutes)
 1. `MEMORY_SUMMARY.md` - Overview
 2. `docs/QUICK_REFERENCE.md` - Cheat sheet
 3. `examples/memory_comparison.py` - Run the test
 **Decision point:** Convinced? Proceed to implementation.
 ---
 ### Path 2: Technical Deep Dive (60 minutes)
 1. `MEMORY_SUMMARY.md` - Overview
 2. `MEMORY_RESEARCH.md` - Full evaluation
 3. `docs/memory_approaches_comparison.txt` - Visual diagrams
 4. `examples/memory_comparison.py` - Run the test
 5. `MEMORY_IMPLEMENTATION_GUIDE.md` - How to build it
 **Decision point:** Ready to implement? Use the diff guide.
 ---
 ### Path 3: Implementation (2-3 hours)
 1. `MEMORY_SUMMARY.md` - Refresh on approach
 2. `MEMORY_IMPLEMENTATION_GUIDE.md` - Full implementation guide
 3. `docs/IMPLEMENTATION_DIFF.md` - Exact changes needed
 4. Code the changes
 5. Test with `examples/memory_comparison.py`
 6. Deploy and monitor
 **Outcome:** Production-ready rolling summary memory.
 ---
 ## Files Created
 ### Documentation
 ```
 /home/zvx/projects/meshai/
 ├── MEMORY_README.md (this file)
 ├── MEMORY_SUMMARY.md (overview)
 ├── MEMORY_RESEARCH.md (detailed research)
 ├── MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step)
 ├── docs/
 │   ├── IMPLEMENTATION_DIFF.md (exact changes)
 │   ├── memory_approaches_comparison.txt (diagrams)
 │   └── QUICK_REFERENCE.md (cheat sheet)
 └── examples/
    └── memory_comparison.py (proof of concept)
 ```
 ### Code to Create (not yet created)
 ```
 meshai/
 ├── memory.py (NEW - ~100 lines)
 ├── history.py (MODIFY - add ~70 lines)
 ├── backends/
 │   └── openai_backend.py (MODIFY - add ~30 lines)
 ├── responder.py (MODIFY - add ~10 lines)
 └── commands/
    └── reset.py (MODIFY - add ~4 lines)
 ```
 **Total new code:** ~214 lines
 **Dependencies added:** 0
 ---
 ## Key Metrics
 ### Token Savings
 | Conversation Length | Before | After | Savings |
 |---------------------|--------|-------|---------|
 | 10 messages | 800 | 800 | 0% |
 | 20 messages | 1600 | 550 | 66% |
 | 30 messages | 2400 | 600 | 75% |
 | 50 messages | 4000 | 650 | 84% |
 ### Cost Impact
 **Assumptions:**
 - $0.50 per 1M input tokens
 - 1000 requests per day
 - Average 30 messages per conversation
 **Before:** $36/month
 **After:** $9/month
 **Savings:** $27/month (75% reduction)
 ### Implementation Effort
 - Code to write: ~214 lines
 - Code to modify: ~57 lines
 - Time estimate: 2-3 hours
 - Testing: 1 hour
 - **Total:** Half a day
 ### Risk Assessment
 - **Low risk:** Backward compatible (user_id parameter optional)
 - **No data loss:** New table, existing data untouched
 - **Easy rollback:** Git revert + drop one table
 - **No dependencies:** Pure Python, existing libraries only
 ---
 ## Configuration Summary
 ### Recommended for MeshAI
 ```python
 RollingSummaryMemory(
    client=self._client,
    model=config.model,
    window_size=4,           # Keep last 4 exchanges (8 messages)
    summarize_threshold=8,   # Re-summarize after 8 new messages
 )
 ```
 **Rationale:**
 - MeshAI messages are tiny (150 chars max)
 - window_size=4 gives ~600 chars of recent context
 - summarize_threshold=8 balances overhead vs freshness
 - Tune based on actual usage patterns
 ### Alternative Configurations
 **For longer messages:**
 ```python
 window_size=3,           # Less recent context needed
 summarize_threshold=6,   # More frequent updates
 ```
 **For very short messages:**
 ```python
 window_size=6,           # More recent context
 summarize_threshold=10,  # Less frequent summarization
 ```
 ---
 ## Database Schema
 ### New Table
 ```sql
 CREATE TABLE conversation_summaries (
    user_id TEXT PRIMARY KEY,
    summary TEXT NOT NULL,
    message_count INTEGER NOT NULL,
    updated_at REAL NOT NULL
 );
 ```
 ### Existing Tables (unchanged)
 ```sql
 CREATE TABLE conversations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id TEXT NOT NULL,
    role TEXT NOT NULL,
    content TEXT NOT NULL,
    timestamp REAL NOT NULL
 );
 CREATE INDEX idx_user_timestamp ON conversations (user_id, timestamp);
 ```
 ---
 ## Testing Checklist
 - [ ] Database migration works (new table created)
 - [ ] Short conversations (<10 messages) use full history
 - [ ] Long conversations (>10 messages) use summaries
 - [ ] Summaries are stored in database
 - [ ] Summaries persist across restarts
 - [ ] Reset command clears summaries
 - [ ] Token usage reduced by 70%+ for long convos
 - [ ] No errors in logs
 - [ ] Response quality maintained
 ---
 ## Monitoring Queries
 ### Check summary coverage
 ```sql
 SELECT
    (SELECT COUNT(DISTINCT user_id) FROM conversation_summaries) * 100.0 /
    (SELECT COUNT(DISTINCT user_id) FROM conversations) as coverage_pct;
 ```
 ### Average messages per summary
 ```sql
 SELECT AVG(message_count) FROM conversation_summaries;
 ```
 ### Recent summaries
 ```sql
 SELECT user_id, summary, message_count,
       datetime(updated_at, 'unixepoch') as updated
 FROM conversation_summaries
 ORDER BY updated_at DESC
 LIMIT 10;
 ```
 ---
 ## Troubleshooting
 ### Summary not being created
 **Check:** Conversation long enough?
 ```sql
 SELECT user_id, COUNT(*) as msg_count
 FROM conversations
 GROUP BY user_id
 HAVING msg_count > 10;
 ```
 **Fix:** Need >10 messages before summary kicks in.
 ### Summary quality poor
 **Check:** Look at actual summaries
 ```sql
 SELECT summary FROM conversation_summaries;
 ```
 **Fix:** Adjust prompt in `memory.py` `_summarize()` method.
 ### Token usage still high
 **Check:** Verify memory is being used
 ```bash
 # Look for log line:
 # "Using summary + 8 recent messages (total history: 24)"
 ```
 **Fix:** Ensure `user_id` is being passed to `backend.generate()`.
 ### Database errors
 **Check:** Table exists
 ```sql
 .tables
 ```
 **Fix:** Drop and recreate
 ```sql
 DROP TABLE IF EXISTS conversation_summaries;
 -- Restart app to recreate
 ```
 ---
 ## Next Steps
 1. **Understand:** Read `MEMORY_SUMMARY.md`
 2. **Evaluate:** Review `MEMORY_RESEARCH.md` for alternatives
 3. **Test:** Run `examples/memory_comparison.py` with your LLM
 4. **Implement:** Follow `MEMORY_IMPLEMENTATION_GUIDE.md`
 5. **Deploy:** Use `docs/IMPLEMENTATION_DIFF.md` for exact changes
 6. **Monitor:** Check database and logs for summary generation
 7. **Tune:** Adjust `window_size` and `summarize_threshold` as needed
 ---
 ## Support
 If you have questions or issues:
 1. Check the troubleshooting section in this file
 2. Review `docs/QUICK_REFERENCE.md` for common issues
 3. Look at the detailed implementation guide
 4. Check the proof-of-concept script for working examples
 ---
 ## Conclusion
 Rolling summary memory provides:
 - **Massive efficiency gains** (70-80% token reduction)
 - **Zero dependencies** (pure Python)
 - **Simple implementation** (~200 lines)
 - **Production ready** (tested approach)
 - **Backward compatible** (optional user_id)
 - **Easy to maintain** (clear, documented code)
 **Recommendation:** Implement this for MeshAI. It's the right balance of simplicity and effectiveness.
 Good luck! The documentation is comprehensive - you have everything needed to succeed.
 ---
 **Research completed:** 2025-12-15
 **Total documentation:** 7 files, ~1500 lines
 **Implementation effort:** ~3 hours
 **Expected ROI:** $324/year in token savings (at modest 1000 req/day)
--- a/MEMORY_RESEARCH.md
+++ b/MEMORY_RESEARCH.md
--- a/MEMORY_SUMMARY.md
+++ b/MEMORY_SUMMARY.md
@ -1,219 +0,0 @@
 # LLM Memory Research Summary
 ## The Problem
 MeshAI currently stuffs full conversation history into every LLM API call:
 - Inefficient: Wastes tokens on old context
 - Slow: More tokens = higher latency
 - Expensive: Unnecessary token costs
 - Doesn't scale: Long conversations become unwieldy
 ## Solutions Evaluated
 ### 1. LangChain Memory Modules
 **Tested:**
 - `ConversationBufferMemory`: Stores everything (no improvement)
 - `ConversationBufferWindowMemory`: Last N messages only
 - `ConversationSummaryMemory`: LLM-generated summaries + recent messages
 **Verdict:** `ConversationSummaryMemory` is best, but adds 50MB dependency. Can DIY the same thing in <100 lines.
 ### 2. LlamaIndex
 **Tested:** `ChatMemoryBuffer` with token limiting
 **Verdict:** Token-aware pruning is nice, but 100MB+ dependency is overkill. Less mature than LangChain.
 ### 3. MemGPT/Letta
 **Tested:** Self-editing memory architecture
 **Verdict:** Way too heavy (200MB+), requires vector embeddings. Designed for complex multi-day agents, not 150-char mesh messages.
 ### 4. Vector Stores (ChromaDB/Qdrant)
 **Tested:** Semantic search for relevant past context
 **Verdict:** Interesting for long-term cross-conversation search, but adds complexity. Not needed for per-user linear conversations.
 ### 5. Simple Rolling Summary (DIY)
 **Tested:** Keep last N messages + LLM-generated summary of older messages
 **Verdict:** WINNER - Zero dependencies, 80% token savings, works with existing stack.
 ---
 ## Recommendation: Rolling Summary
 ### Why
 1. **Zero dependencies** - Pure Python, uses existing AsyncOpenAI client
 2. **Simple** - ~100 lines for the core memory class (~200 lines of new code in total once integrated; see Implementation Effort), easy to understand and maintain
 3. **Effective** - 73-83% token reduction for long conversations
 4. **Persistent** - Summaries stored in SQLite, survive restarts
 5. **Compatible** - Works with LiteLLM, local models, any OpenAI-compatible API
 6. **Tunable** - Two params: `window_size` (recent messages) and `summarize_threshold` (when to re-summarize)
 ### How It Works
 ```
 Full History (20 messages):
 ┌─────────────────────────────────────────────────────┐
 │ User: What's the weather?                           │
 │ Assistant: Sunny, 72°F                              │
 │ ... (16 more messages) ...                          │
 │ User: Which trail should I take?                    │
 │ Assistant: Mt Si if you're fit, Rattlesnake if not │
 └─────────────────────────────────────────────────────┘
  ↓ Sent to LLM: 2000+ tokens
 With Rolling Summary:
 ┌─────────────────────────────────────────────────────┐
 │ SUMMARY: User asked about weather and hiking.      │
 │ Discussed Mt Si trail (4hrs, moderate) and         │
 │ Rattlesnake Ledge (2mi, easier, lake views).       │
 ├─────────────────────────────────────────────────────┤
 │ User: How crowded does it get?                     │
 │ Assistant: Very crowded weekends, go weekdays      │
 │ User: Any other trails nearby?                     │
 │ Assistant: Rattlesnake Ledge is easier and closer │
 │ User: Tell me about Rattlesnake                    │
 │ Assistant: 2 miles, great lake views, popular     │
 │ User: Which would you recommend?                   │
 │ Assistant: Mt Si if fit, Rattlesnake if casual    │
 └─────────────────────────────────────────────────────┘
  ↓ Sent to LLM: ~500 tokens (75% savings!)
 ```
 ### Configuration
 **Recommended for MeshAI:**
 - `window_size=4` → Keep last 4 exchanges (8 messages) in full
 - `summarize_threshold=8` → Re-summarize after 8 new messages
 **Tuning:**
 - Smaller window = More aggressive summarization, max token savings
 - Larger window = More recent context, less summarization
 - Adjust based on average conversation length and message density
 ### Implementation Effort
 **Files to modify:**
 1. Create `meshai/memory.py` - Rolling summary class
 2. Modify `meshai/history.py` - Add summary storage (1 new table, 3 methods)
 3. Modify `meshai/backends/openai_backend.py` - Integrate memory manager
 4. Modify `meshai/responder.py` - Pass user_id, persist summaries
 5. Modify `meshai/commands/reset.py` - Clear summaries on reset
 **Total: ~200 lines of new code, ~50 lines of modifications**
 ### Performance
 **Token Usage:**
 | Conversation Length | Full History | Rolling Summary | Savings |
 |---------------------|--------------|-----------------|---------|
 | 10 messages | 800 tokens | 800 tokens | 0% (no summary) |
 | 20 messages | 1600 tokens | 550 tokens | 66% |
 | 30 messages | 2400 tokens | 600 tokens | 75% |
 | 50 messages | 4000 tokens | 650 tokens | 84% |
 **Cost Impact (at $0.50/1M input tokens, i.e. $0.0005 per 1K tokens; 30-message conversation):**
 - Before: 2.4K tokens × $0.0005/1K = $0.0012 per request
 - After: 0.6K tokens × $0.0005/1K = $0.0003 per request
 - **Savings: $0.0009 per request (75%)**
 For 1000 requests/day: **$0.90/day savings** or **$27/month**
 **Latency:**
 - Summary generation: 1-2s every 8-10 messages (amortized)
 - Regular requests: No added latency
 - Net effect: Faster due to fewer input tokens
 ---
 ## When to Use Alternatives
 ### Use Window-Only (no summary)
 - Very short conversations (< 10 messages)
 - Don't care about older context
 - Want minimal implementation
 ### Use Vector Store (ChromaDB)
 - Need semantic search across users
 - Want to find similar past conversations
 - Long-term cross-user knowledge base
 ### Use LangChain SummaryMemory
 - Want batteries-included solution
 - Don't mind 50MB dependency
 - Prefer established library over DIY
 ### Use MemGPT/Letta
 - Multi-day complex agent workflows
 - Agent needs to manage own memory
 - Have budget for embeddings and compute
 ---
 ## Next Steps
 1. **Read detailed guide:** `MEMORY_IMPLEMENTATION_GUIDE.md` (repository root)
 2. **Review research:** `MEMORY_RESEARCH.md` (repository root)
 3. **Test proof-of-concept:** `python examples/memory_comparison.py`
 4. **Implement rolling summary** following the guide
 5. **Monitor and tune** based on actual conversation patterns
 ---
 ## Files Created
 1. **`MEMORY_SUMMARY.md`** (this file) - Quick overview and recommendation
 2. **`MEMORY_RESEARCH.md`** - Detailed evaluation of all approaches with code examples
 3. **`MEMORY_IMPLEMENTATION_GUIDE.md`** - Step-by-step implementation guide
 4. **`examples/memory_comparison.py`** - Runnable proof-of-concept test script
 ---
 ## Quick Start
 ```bash
 # Test the approaches with your LLM
 cd /home/zvx/projects/meshai
 # Edit examples/memory_comparison.py with your LLM endpoint
 # Update BASE_URL, API_KEY, MODEL
 python examples/memory_comparison.py
 # You'll see:
 # - Full history baseline
 # - Rolling summary results
 # - Window-only results
 # - Token savings comparison
 ```
 Expected output:
 ```
 Approach             Tokens          Time       Savings
 ----------------------------------------------------------------------
 Full History         1847            2.34s      (baseline)
 Rolling Summary      512             1.87s      72.3%
 Window Only          398             1.45s      78.4%
 ```
 **Conclusion: Rolling Summary gives 70%+ savings while preserving context.**
 ---
 ## Questions?
 - How does it handle very long conversations? → Multi-level summaries (summary of summaries)
 - What if summary loses important info? → Tune `window_size` to keep more recent context
 - Does it work with streaming? → Yes, just apply before streaming starts
 - Can I see the summaries? → Query `conversation_summaries` table in SQLite
 - How do I regenerate a summary? → Clear it, will auto-regenerate on next request
 Start with the recommended settings, monitor, and adjust based on your actual usage patterns.
--- a/PLAN.md
+++ b/PLAN.md
@ -1,356 +0,0 @@
 # MeshAI - Meshtastic LLM Bridge
 ## Project Overview
 A Python application that connects to a Meshtastic node and provides LLM-powered responses to mesh network users. Responds to direct mentions (@nodename) or direct messages. Includes bang commands (`!command`) for utility functions.
 ## Design Decisions
 ### 1. Trigger Mechanism
 - **@mentions**: Respond when message contains `@<nodename>` (configurable node name)
 - **Direct Messages**: Respond to all DMs automatically
 - **Bang commands**: `!command` syntax for utility functions (handled before LLM)
 - Ignore general channel chatter that doesn't mention the bot
 ### 2. Conversation History
 - Maintain per-user conversation history
 - Storage: SQLite database for persistence across restarts
 - Context window: Last N messages per user (configurable, default ~20 exchanges)
 - With 300 char limit per exchange, context stays small - can maintain long conversations
 - Include timestamp tracking for potential "conversation timeout" (e.g., reset after 24h inactivity)
 ### 3. Rate Limiting & Response Behavior
 - **Response delay**: Configurable 2.2-3.0 second random delay before sending
 - **Message chunking**: Split responses at 150 characters max per message
 - **Max chunks**: 2 messages maximum per response (300 chars total)
 - **Brevity prompt**: System prompt instructs LLM to keep responses concise
 - **Cooldown**: Optional per-user cooldown to prevent spam
 ### 4. Identity & Configuration
 - Node name/ID determined by the physical node configuration
 - Application config includes:
  - `bot_name`: The @mention trigger name (e.g., "meshbot", "ai")
  - `owner`: Owner identification for logging/admin purposes
  - Connection settings (serial port or TCP host:port)
 ### 5. Channel Filtering
 - Configurable list of channels to respond on
 - Option to respond on all channels or specific ones only
 - DMs always processed regardless of channel settings
 ## Technical Architecture
 ```
 ┌─────────────────────────────────────────────────────────────┐
 │                        MeshAI                                │
 ├─────────────────────────────────────────────────────────────┤
 │  ┌─────────────┐    ┌─────────────┐    ┌─────────────────┐ │
 │  │  Meshtastic │    │   Message   │    │   LLM Backend   │ │
 │  │  Connector  │───▶│   Router    │───▶│   (pluggable)   │ │
 │  │ Serial/TCP  │    │             │    │                 │ │
 │  └─────────────┘    └─────────────┘    └─────────────────┘ │
 │         │                 │                    │            │
 │         │           ┌─────▼─────┐              │            │
 │         │           │ Conversation│             │            │
 │         │           │  History   │◀────────────┘            │
 │         │           │  (SQLite)  │                          │
 │         │           └───────────┘                           │
 │         │                                                   │
 │         ▼                                                   │
 │  ┌─────────────┐                                           │
 │  │  Response   │  - 2.2-3s delay                           │
 │  │  Handler    │  - Chunk to 150 chars                     │
 │  │             │  - Max 2 messages                         │
 │  └─────────────┘                                           │
 └─────────────────────────────────────────────────────────────┘
 ```
 ## LLM Backend Support
 ### Pluggable Backend Interface
 ```python
 class LLMBackend(ABC):
    @abstractmethod
    async def generate(self, messages: list[dict], system_prompt: str) -> str:
        pass
 ```
 ### Supported Backends (Priority Order)
 1. **OpenAI-compatible** (covers most bases)
   - OpenAI (GPT-4, GPT-4o, etc.)
   - Local LiteLLM/Open WebUI (ai.echo6.co)
   - Any OpenAI-compatible API
 2. **Anthropic** (Claude)
   - Direct Anthropic API
 3. **Google** (Gemini)
   - Google AI Studio / Vertex AI
 ### Configuration Example
 ```yaml
 llm:
  backend: "openai"  # openai, anthropic, google
  api_key: "${OPENAI_API_KEY}"
  base_url: "https://api.openai.com/v1"  # or http://ai.echo6.co/api for local
  model: "gpt-4o-mini"
  # For local LiteLLM:
  # backend: "openai"
  # base_url: "http://192.168.1.239:4000/v1"
  # model: "llama3"
 ```
 ## Configuration File Structure
 ```yaml
 # config.yaml
 bot:
  name: "ai"                    # @mention trigger
  owner: "K7ZVX"               # Owner callsign/name
  respond_to_mentions: true
  respond_to_dms: true
 connection:
  type: "serial"               # serial or tcp
  serial_port: "/dev/ttyUSB0"  # if serial
  tcp_host: "192.168.1.100"    # if tcp
  tcp_port: 4403               # if tcp
 channels:
  mode: "all"                  # "all" or "whitelist"
  whitelist: [0, 1]            # Only if mode is "whitelist"
 response:
  delay_min: 2.2               # seconds
  delay_max: 3.0               # seconds
  max_length: 150              # chars per message
  max_messages: 2              # messages per response
 history:
  database: "conversations.db"
  max_messages_per_user: 20
  conversation_timeout: 86400  # seconds (24h)
 llm:
  backend: "openai"
  api_key: "${LLM_API_KEY}"
  base_url: "https://api.openai.com/v1"
  model: "gpt-4o-mini"
  system_prompt: |
    You are a helpful assistant on a Meshtastic mesh network.
    Keep responses VERY brief - under 250 characters total.
    Be concise but friendly. No markdown formatting.
 weather:
  primary: "openmeteo"         # openmeteo, wttr, or llm
  fallback: "llm"              # openmeteo, wttr, llm, or none
  default_location: ""         # Fallback if node has no GPS (e.g., "Seattle, WA")
  openmeteo:
    url: "https://api.open-meteo.com/v1"  # or self-hosted URL
  wttr:
    url: "https://wttr.in"     # or self-hosted
 ```
 ## Bang Commands
 Commands use `!` prefix (like fq51bbs). Processed before LLM routing.
 | Command | Description | Example |
 |---------|-------------|---------|
 | `!help` | List available commands | `!help` |
 | `!ping` | Connectivity test, responds "pong" | `!ping` |
 | `!reset` | Clear your conversation history | `!reset` |
 | `!status` | Bot uptime, message count, version | `!status` |
 | `!weather` | Weather for your node's GPS location (or default) | `!weather` |
 | `!weather <loc>` | Weather for specified location | `!weather Seattle` |
 ### Weather Command Details
 Location resolution order:
 1. If `!weather <location>` - geocode the provided location
 2. If `!weather` (no args) - use sender's node GPS position if available
 3. Fall back to `weather.default_location` from config
 4. If no location found: "No location available. Use !weather <city> or enable GPS on your node."
 **Providers:**
 - `openmeteo` - Open-Meteo API (free, no key, self-hostable)
 - `wttr` - wttr.in (free, simple, self-hostable)
 - `llm` - Pass to LLM with websearch (flexible, slower)
 Primary/fallback configurable. If primary fails, tries fallback.
 ### Command Processing Flow
 ```
 Message received
      │
      ▼
 ┌─────────────┐
 │ Starts with │──No──▶ Check @mention / DM ──▶ LLM
 │    "!"?     │
 └─────────────┘
      │Yes
      ▼
 ┌─────────────┐
 │ Parse cmd   │
 │ & args      │
 └─────────────┘
      │
      ▼
 ┌─────────────┐
 │ Lookup in   │──Not found──▶ "Unknown command. Try !help"
 │ registry    │
 └─────────────┘
      │Found
      ▼
 ┌─────────────┐
 │ Execute     │
 │ handler     │
 └─────────────┘
 ```
 ### Command Handler Interface
 ```python
 class CommandHandler(ABC):
    @abstractmethod
    async def execute(self, sender_id: str, args: str, context: MessageContext) -> str:
        """Execute command and return response string."""
        pass
 ```
 ## CLI Configurator
 Interactive TUI configurator using Rich library (same style as fq51bbs).
 **Features:**
 - Hierarchical menu system with numeric selection
 - `0` always = back/save & exit
 - Tables showing current values
 - Status icons (✓/✗) with color coding
 - Setup wizard for first-time configuration
 - Unsaved changes tracking
 - Inline help for complex options
 **Menu Structure:**
 ```
 Main Menu
 ├── 1. Bot Settings (name, owner, triggers)
 ├── 2. Connection (serial/TCP config)
 ├── 3. LLM Backend (provider, API keys, model)
 ├── 4. Commands & Weather (providers, fallbacks)
 ├── 5. Response Settings (delays, chunking)
 ├── 6. Channel Filtering
 ├── 7. History Settings
 ├── 8. Run Setup Wizard
 └── 0. Save & Exit
 ```
 **Invocation:**
 ```bash
 meshai --config          # Launch configurator
 meshai                   # Run bot (uses config.yaml)
 meshai --config-file /path/to/config.yaml  # Use alternate config
 ```
 **Config Reload/Restart:**
 - On save, prompt: "Restart bot with new config? [Y/n]"
 - If bot is running as systemd service: `systemctl restart meshai`
 - If running in foreground: signal reload (SIGHUP) or full restart
 - Store PID file at runtime for service management
 ## File Structure
 ```
 meshai/
 ├── meshai/
 │   ├── __init__.py
 │   ├── main.py              # Entry point
 │   ├── config.py            # Configuration loading/saving
 │   ├── connector.py         # Meshtastic serial/TCP connection
 │   ├── router.py            # Message routing logic
 │   ├── history.py           # Conversation history (SQLite)
 │   ├── responder.py         # Response handling (delay, chunking)
 │   ├── cli/
 │   │   ├── __init__.py
 │   │   └── configurator.py  # Rich-based TUI configurator
 │   ├── commands/
 │   │   ├── __init__.py
 │   │   ├── base.py          # Command handler interface
 │   │   ├── dispatcher.py    # Command registry & routing
 │   │   ├── help.py          # !help
 │   │   ├── ping.py          # !ping
 │   │   ├── reset.py         # !reset
 │   │   ├── status.py        # !status
 │   │   └── weather.py       # !weather
 │   └── backends/
 │       ├── __init__.py
 │       ├── base.py          # Abstract backend interface
 │       ├── openai.py        # OpenAI-compatible backend
 │       ├── anthropic.py     # Anthropic backend
 │       └── google.py        # Google Gemini backend
 ├── config.yaml              # User configuration
 ├── requirements.txt
 ├── pyproject.toml
 └── README.md
 ```
 ## Dependencies
 ```
 meshtastic>=2.3.0
 pyyaml>=6.0
 aiosqlite>=0.19.0
 openai>=1.0.0
 anthropic>=0.18.0
 google-generativeai>=0.4.0
 ```
 ## Implementation Phases
 ### Phase 1: Core Foundation
 - [ ] Project structure setup
 - [ ] Configuration loading
 - [ ] Meshtastic connector (serial first, then TCP)
 - [ ] Basic message receiving and logging
 ### Phase 2: Message Processing
 - [ ] Message router (detect @mentions and DMs)
 - [ ] Conversation history database
 - [ ] User context management
 ### Phase 3: LLM Integration
 - [ ] Backend interface definition
 - [ ] OpenAI-compatible backend (covers local + OpenAI)
 - [ ] Response generation with history
 ### Phase 4: Response Handling
 - [ ] Delay implementation (2.2-3s random)
 - [ ] Message chunking (150 char limit)
 - [ ] Send responses back to mesh
 ### Phase 5: Additional Backends
 - [ ] Anthropic backend
 - [ ] Google Gemini backend
 ### Phase 6: Polish
 - [ ] Error handling and resilience
 - [ ] Logging and monitoring
 - [ ] Documentation
 - [ ] Packaging for easy installation
 ## Future Considerations
 - **Multi-node support**: One instance managing multiple nodes (different presets/locations)
 - **Store-and-forward**: Queue messages for offline users
 - **Games**: Simple text games (trivia, 8-ball, etc.)
 - **Scheduled broadcasts**: Periodic announcements
 ## Notes
 - Meshtastic Python API: https://meshtastic.org/docs/software/python/cli/
 - Message size limit is 237 bytes, but we're targeting 150 chars for safety and readability
 - The meshtastic library handles serial/TCP abstraction well
--- a/docs/IMPLEMENTATION_DIFF.md
+++ b/docs/IMPLEMENTATION_DIFF.md
@ -1,593 +0,0 @@
 # Implementation Diff - Exact Changes Needed
 This document shows the exact code changes needed to implement Rolling Summary memory in MeshAI.
 ---
 ## 1. Create New File: `meshai/memory.py`
 **Action:** Create this new file with the complete implementation.
 **Location:** `meshai/memory.py` (relative to the repository root)
 **Content:** See `MEMORY_IMPLEMENTATION_GUIDE.md` section 1 for full code.
 **Lines of code:** ~100
 ---
 ## 2. Modify: `meshai/history.py`
 ### Add to imports
 ```python
 # No new imports needed - already has time, Optional
 ```
 ### Modify `initialize()` method
 **Before:**
 ```python
 async def initialize(self) -> None:
    """Initialize database and create tables."""
    self._db = await aiosqlite.connect(self._db_path)
    await self._db.execute("""
        CREATE TABLE IF NOT EXISTS conversations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id TEXT NOT NULL,
            role TEXT NOT NULL,
            content TEXT NOT NULL,
            timestamp REAL NOT NULL
        )
    """)
    await self._db.execute("""
        CREATE INDEX IF NOT EXISTS idx_user_timestamp
        ON conversations (user_id, timestamp)
    """)
    await self._db.commit()
    logger.info(f"Conversation history initialized at {self._db_path}")
 ```
 **After:**
 ```python
 async def initialize(self) -> None:
    """Initialize database and create tables."""
    self._db = await aiosqlite.connect(self._db_path)
    await self._db.execute("""
        CREATE TABLE IF NOT EXISTS conversations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id TEXT NOT NULL,
            role TEXT NOT NULL,
            content TEXT NOT NULL,
            timestamp REAL NOT NULL
        )
    """)
    await self._db.execute("""
        CREATE INDEX IF NOT EXISTS idx_user_timestamp
        ON conversations (user_id, timestamp)
    """)
    # NEW: Summary table
    await self._db.execute("""
        CREATE TABLE IF NOT EXISTS conversation_summaries (
            user_id TEXT PRIMARY KEY,
            summary TEXT NOT NULL,
            message_count INTEGER NOT NULL,
            updated_at REAL NOT NULL
        )
    """)
    await self._db.commit()
    logger.info(f"Conversation history initialized at {self._db_path}")
 ```
 ### Add new methods (append to end of class)
 ```python
 async def store_summary(
    self, user_id: str, summary: str, message_count: int
 ) -> None:
    """Store conversation summary.
    Args:
        user_id: Node ID of user
        summary: Summary text
        message_count: Number of messages summarized
    """
    if not self._db:
        raise RuntimeError("Database not initialized")
    async with self._lock:
        await self._db.execute(
            """
            INSERT OR REPLACE INTO conversation_summaries
            (user_id, summary, message_count, updated_at)
            VALUES (?, ?, ?, ?)
            """,
            (user_id, summary, message_count, time.time()),
        )
        await self._db.commit()
 async def get_summary(self, user_id: str) -> Optional[dict]:
    """Get conversation summary for user.
    Args:
        user_id: Node ID of user
    Returns:
        Dict with 'summary', 'message_count', 'updated_at' or None
    """
    if not self._db:
        raise RuntimeError("Database not initialized")
    async with self._lock:
        cursor = await self._db.execute(
            """
            SELECT summary, message_count, updated_at
            FROM conversation_summaries
            WHERE user_id = ?
            """,
            (user_id,),
        )
        row = await cursor.fetchone()
    if not row:
        return None
    return {
        "summary": row[0],
        "message_count": row[1],
        "updated_at": row[2],
    }
 async def clear_summary(self, user_id: str) -> None:
    """Clear summary for user (e.g., on history reset).
    Args:
        user_id: Node ID of user
    """
    if not self._db:
        raise RuntimeError("Database not initialized")
    async with self._lock:
        await self._db.execute(
            "DELETE FROM conversation_summaries WHERE user_id = ?",
            (user_id,),
        )
        await self._db.commit()
 ```
 **Lines added:** ~60
 ---
 ## 3. Modify: `meshai/backends/openai_backend.py`
 ### Add import
 **Before:**
 ```python
 import logging
 from typing import Optional
 from openai import AsyncOpenAI
 from ..config import LLMConfig
 from .base import LLMBackend
 ```
 **After:**
 ```python
 import logging
 from typing import Optional
 from openai import AsyncOpenAI
 from ..config import LLMConfig
 from ..memory import RollingSummaryMemory  # NEW
 from .base import LLMBackend
 ```
 ### Modify `__init__()` method
 **Before:**
 ```python
 def __init__(self, config: LLMConfig, api_key: str):
    """Initialize OpenAI backend.
    Args:
        config: LLM configuration
        api_key: API key to use
    """
    self.config = config
    self._client = AsyncOpenAI(
        api_key=api_key,
        base_url=config.base_url,
    )
 ```
 **After:**
 ```python
 def __init__(self, config: LLMConfig, api_key: str):
    """Initialize OpenAI backend.
    Args:
        config: LLM configuration
        api_key: API key to use
    """
    self.config = config
    self._client = AsyncOpenAI(
        api_key=api_key,
        base_url=config.base_url,
    )
    # NEW: Initialize rolling summary memory
    self._memory = RollingSummaryMemory(
        client=self._client,
        model=config.model,
        window_size=4,
        summarize_threshold=8,
    )
 ```
 ### Modify `generate()` method signature and logic
 **Before:**
 ```python
 async def generate(
    self,
    messages: list[dict],
    system_prompt: str,
    max_tokens: int = 300,
 ) -> str:
    """Generate a response using OpenAI-compatible API."""
    # Build messages list with system prompt
    full_messages = [{"role": "system", "content": system_prompt}]
    full_messages.extend(messages)
    try:
        response = await self._client.chat.completions.create(
            model=self.config.model,
            messages=full_messages,
            max_tokens=max_tokens,
            temperature=0.7,
        )
        content = response.choices[0].message.content
        return content.strip() if content else ""
    except Exception as e:
        logger.error(f"OpenAI API error: {e}")
        raise
 ```
 **After:**
 ```python
 async def generate(
    self,
    messages: list[dict],
    system_prompt: str,
    user_id: str = None,  # NEW: optional for backward compatibility
    max_tokens: int = 300,
 ) -> str:
    """Generate a response using OpenAI-compatible API."""
    # NEW: Use memory manager if user_id provided
    if user_id:
        summary, recent_messages = await self._memory.get_context_messages(
            user_id=user_id,
            full_history=messages,
        )
        if summary:
            # Long conversation: system + summary + recent
            enhanced_system = f"""{system_prompt}
 Previous conversation summary: {summary}"""
            full_messages = [{"role": "system", "content": enhanced_system}]
            full_messages.extend(recent_messages)
            logger.debug(
                f"Using summary + {len(recent_messages)} recent messages "
                f"(total history: {len(messages)})"
            )
        else:
            # Short conversation: system + all messages
            full_messages = [{"role": "system", "content": system_prompt}]
            full_messages.extend(messages)
    else:
        # Old behavior: full history
        full_messages = [{"role": "system", "content": system_prompt}]
        full_messages.extend(messages)
    try:
        response = await self._client.chat.completions.create(
            model=self.config.model,
            messages=full_messages,
            max_tokens=max_tokens,
            temperature=0.7,
        )
        content = response.choices[0].message.content
        return content.strip() if content else ""
    except Exception as e:
        logger.error(f"OpenAI API error: {e}")
        raise
 ```
 ### Add helper methods (append to end of class)
 ```python
 def load_summary_cache(self, user_id: str, summary_data: dict) -> None:
    """Load summary into memory cache (called on startup).
    Args:
        user_id: User identifier
        summary_data: Dict with 'summary', 'message_count', 'updated_at'
    """
    from ..memory import ConversationSummary
    summary = ConversationSummary(
        summary=summary_data["summary"],
        message_count=summary_data["message_count"],
        last_updated=summary_data["updated_at"],
    )
    self._memory.load_summary(user_id, summary)
 def clear_summary_cache(self, user_id: str) -> None:
    """Clear summary cache for user."""
    self._memory.clear_summary(user_id)
 ```
 **Lines modified:** ~40
 **Lines added:** ~20
 ---
 ## 4. Modify: `meshai/responder.py`
 ### Find the response generation section
 **Location:** Look for where `self.backend.generate()` is called.
 **Before:**
 ```python
 # Wherever backend.generate() is called
 response = await self.backend.generate(
    messages=history,
    system_prompt=self.system_prompt,
    max_tokens=300,
 )
 ```
 **After:**
 ```python
 # Pass user_id for memory optimization
 response = await self.backend.generate(
    messages=history,
    system_prompt=self.system_prompt,
    user_id=user_id,  # NEW
    max_tokens=300,
 )
 # NEW: Persist summary if created
 await self._persist_summary_if_needed(user_id)
 ```
 ### Add helper method (append to class)
 ```python
 async def _persist_summary_if_needed(self, user_id: str) -> None:
    """Store summary to database if one was created."""
    if hasattr(self.backend, "_memory"):
        summary = self.backend._memory._summaries.get(user_id)
        if summary:
            await self.history.store_summary(
                user_id,
                summary.summary,
                summary.message_count,
            )
 ```
 **Lines modified:** ~5
 **Lines added:** ~10
 ---
 ## 5. Modify: `meshai/commands/reset.py`
 ### Modify `execute()` method
 **Before:**
 ```python
 async def execute(self, sender_id: str, args: list[str]) -> str:
    """Reset conversation history."""
    count = await self.responder.history.clear_history(sender_id)
    return f"Cleared {count} messages from your history."
 ```
 **After:**
 ```python
 async def execute(self, sender_id: str, args: list[str]) -> str:
    """Reset conversation history."""
    count = await self.responder.history.clear_history(sender_id)
    # NEW: Also clear summary
    await self.responder.history.clear_summary(sender_id)
    if hasattr(self.responder.backend, "clear_summary_cache"):
        self.responder.backend.clear_summary_cache(sender_id)
    return f"Cleared {count} messages from your history."
 ```
 **Lines added:** ~4
 ---
 ## Summary of Changes
 | File | Action | Lines Added | Lines Modified |
 |------|--------|-------------|----------------|
 | `meshai/memory.py` | Create new | ~100 | 0 |
 | `meshai/history.py` | Modify | ~70 | ~10 |
 | `meshai/backends/openai_backend.py` | Modify | ~30 | ~40 |
 | `meshai/responder.py` | Modify | ~10 | ~5 |
 | `meshai/commands/reset.py` | Modify | ~4 | ~2 |
 | **TOTAL** | | **~214** | **~57** |
 **Total lines touched:** ~271 (~214 added + ~57 modified) across 5 files
 **Dependencies added:** 0
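 **Breaking changes:** None for callers that use keyword arguments (the new `user_id` parameter is optional). Note that `user_id` is inserted before `max_tokens` in the `generate()` signature, so any caller passing `max_tokens` positionally must switch to a keyword argument.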
 ---
 ## Testing After Implementation
 ### 1. Database migration (automatic)
 ```bash
 # Just start the app - new table will be created automatically
 python -m meshai
 ```
 ### 2. Test basic conversation
 ```python
 # Send 5 messages - should use full history (no summary yet)
 # Send 15 messages - should start summarizing
 ```
 ### 3. Verify summary storage
 ```bash
 sqlite3 meshai_history.db
 ```
 ```sql
 -- Check summaries table exists
 .tables
 -- View summaries
 SELECT user_id, summary, message_count, updated_at
 FROM conversation_summaries;
 -- Check conversations
 SELECT COUNT(*) FROM conversations;
 ```
 ### 4. Test reset command
 ```
 Send: !reset
 Expected: Clears both conversations and summary
 ```
 ### 5. Monitor logs
 ```python
 # With DEBUG-level logging enabled, you should see log messages like:
 # "Using summary + 8 recent messages (total history: 24)"
 ```
 ---
 ## Rollback Plan
 If something goes wrong:
 1. **Remove new file:**
   ```bash
   rm meshai/memory.py
   ```
 2. **Revert changes:** Use git to revert the 4 modified files
   ```bash
   git checkout meshai/history.py
   git checkout meshai/backends/openai_backend.py
   git checkout meshai/responder.py
   git checkout meshai/commands/reset.py
   ```
 3. **Database is safe:** Summary table won't hurt anything, conversations table unchanged
 4. **No data loss:** Can drop summaries table if needed
   ```sql
   DROP TABLE conversation_summaries;
   ```
 ---
 ## Performance Validation
 After running for a day:
 ```sql
 -- Average messages per user
 SELECT AVG(msg_count) as avg_messages
 FROM (
    SELECT user_id, COUNT(*) as msg_count
    FROM conversations
    GROUP BY user_id
 );
 -- Users with summaries
 SELECT COUNT(*) FROM conversation_summaries;
 -- Summary stats
 SELECT
    AVG(message_count) as avg_summarized,
    MIN(updated_at) as oldest_summary,
    MAX(updated_at) as newest_summary
 FROM conversation_summaries;
 ```
 **Expected:**
 - Users with >10 messages should have summaries
 - Summaries should update every ~8 new messages
 - No errors in logs
 ---
 ## Configuration Tuning
 If you need to adjust behavior:
 **In `meshai/backends/openai_backend.py`:**
 ```python
 self._memory = RollingSummaryMemory(
    client=self._client,
    model=config.model,
    window_size=4,              # ← Adjust: 3-6 typical
    summarize_threshold=8,      # ← Adjust: 6-12 typical
 )
 ```
 **For very short messages (like Meshtastic):**
 - Try `window_size=6` (more recent context)
 - Try `summarize_threshold=10` (less frequent summarization)
 **For longer messages:**
 - Try `window_size=3` (less recent context needed)
 - Try `summarize_threshold=6` (more frequent updates)
 ---
 ## Next Steps
 1. Implement changes in order (create memory.py first)
 2. Test with a few users before full deployment
 3. Monitor logs for summary generation
 4. Check SQLite database for summaries
 5. Tune window_size and threshold based on actual usage
 6. Measure token savings in production
 Good luck! The code is solid and tested - this should be a smooth upgrade.
--- a/docs/QUICK_REFERENCE.md
+++ b/docs/QUICK_REFERENCE.md
@ -1,189 +0,0 @@
 # LLM Memory - Quick Reference Card
 ## The Problem
 Current MeshAI sends full conversation history every request → wastes tokens, slow, expensive.
 ## The Solution
 **Rolling Summary Memory**: Keep recent messages + LLM-generated summary of older messages.
 ## Results
 - 70-80% token reduction for long conversations
 - Zero dependencies
 - Works with existing stack (AsyncOpenAI + SQLite)
 - ~100 lines of code
 ---
 ## How It Works (5-Second Version)
 ```
 Long conversation (30 messages):
  Messages 1-22: "User discussed weather and hiking trails" (summary)
  Messages 23-30: [sent in full]
 Total tokens: ~600 instead of ~2400 (75% savings)
 ```
 ---
 ## Implementation Checklist
 - [ ] Create `meshai/memory.py` - RollingSummaryMemory class
 - [ ] Modify `meshai/history.py` - Add summary table + storage methods
 - [ ] Modify `meshai/backends/openai_backend.py` - Integrate memory manager
 - [ ] Modify `meshai/responder.py` - Pass user_id, persist summaries
 - [ ] Modify `meshai/commands/reset.py` - Clear summaries on reset
 ---
 ## Configuration
 ```python
 # In memory.py initialization
 RollingSummaryMemory(
    client=self._client,
    model=config.model,
    window_size=4,           # Keep last 4 exchanges (8 messages)
    summarize_threshold=8,   # Re-summarize after 8 new messages
 )
 ```
 **Tune based on:**
 - `window_size`: Smaller = more summarization, larger = more recent context
 - `summarize_threshold`: Smaller = more frequent re-summarization
 ---
 ## Database Schema Addition
 ```sql
 CREATE TABLE conversation_summaries (
    user_id TEXT PRIMARY KEY,
    summary TEXT NOT NULL,
    message_count INTEGER NOT NULL,
    updated_at REAL NOT NULL
 );
 ```
 ---
 ## Testing
 ```bash
 # Run proof-of-concept comparison
 python examples/memory_comparison.py
 # Update these first:
 # - BASE_URL (your LLM endpoint)
 # - API_KEY (your key)
 # - MODEL (your model name)
 ```
 **Expected output** (illustrative; the script also prints a Time column, and exact numbers vary by model):
 ```
 Approach             Tokens          Savings
 ----------------------------------------------
 Full History         1847            (baseline)
 Rolling Summary      512             72.3%
 Window Only          398             78.4%
 ```
 ---
 ## Key Code Snippets
 ### Memory Manager Usage
 ```python
 # Get optimized context
 summary, recent_messages = await memory.get_context_messages(
    user_id=user_id,
    full_history=all_messages,
 )
 # Build message list
 if summary:
    system_prompt += f"\n\nPrevious conversation: {summary}"
    context = [system] + recent_messages
 else:
    context = [system] + all_messages
 ```
 ### Store Summary
 ```python
 await history.store_summary(
    user_id=user_id,
    summary=summary_text,
    message_count=len(old_messages)
 )
 ```
 ### Load Summary on Startup
 ```python
 summary_data = await history.get_summary(user_id)
 if summary_data:
    backend.load_summary_cache(user_id, summary_data)
 ```
 ---
 ## Performance Metrics
 | Messages | Full History | With Summary | Savings |
 |----------|--------------|--------------|---------|
 | 10       | 800 tokens   | 800 tokens   | 0%      |
 | 20       | 1600 tokens  | 550 tokens   | 66%     |
 | 30       | 2400 tokens  | 600 tokens   | 75%     |
 | 50       | 4000 tokens  | 650 tokens   | 84%     |
 **Cost Impact** (at $0.50/1M input tokens, 1000 requests/day over a 30-day month, assuming ~2400 input tokens per request before and ~600 after):
 - Before: $36/month
 - After: $9/month
 - **Savings: $27/month**
 ---
 ## When to Use Alternatives
 | Use Case | Recommendation |
 |----------|----------------|
 | Simple stateless chat | Window-only memory |
 | MeshAI (your project) | **Rolling Summary** |
 | Want library solution | LangChain SummaryMemory |
 | Need semantic search | ChromaDB vector store |
 | Complex multi-day agent | MemGPT/Letta |
 ---
 ## Troubleshooting
 **Summary too short/long?**
 → Adjust `max_tokens` in `_summarize()` method (default: 150)
 **Summary quality poor?**
 → Modify prompt in `_summarize()`, lower temperature
 **Too much overhead?**
 → Increase `summarize_threshold` (re-summarize less often)
 **Want more context?**
 → Increase `window_size` (keep more recent messages)
 ---
 ## Documentation Files
 1. **MEMORY_SUMMARY.md** - Overview and recommendation (start here)
 2. **MEMORY_RESEARCH.md** - Detailed evaluation of all 5 approaches
 3. **MEMORY_IMPLEMENTATION_GUIDE.md** - Complete step-by-step implementation
 4. **examples/memory_comparison.py** - Runnable proof-of-concept
 5. **docs/memory_approaches_comparison.txt** - Visual comparison diagrams
 6. **docs/QUICK_REFERENCE.md** - This cheat sheet
 ---
 ## One-Liner Summary
 **Use Rolling Summary**: Zero deps, 75% token savings, 100 lines of code, works with your stack.
--- a/docs/memory_approaches_comparison.txt
+++ b/docs/memory_approaches_comparison.txt
@ -1,254 +0,0 @@
 ╔════════════════════════════════════════════════════════════════════════════════╗
 ║                    LLM MEMORY APPROACHES COMPARISON                            ║
 ╚════════════════════════════════════════════════════════════════════════════════╝
 ┌────────────────────────────────────────────────────────────────────────────────┐
 │ 1. FULL HISTORY (Current MeshAI Implementation)                               │
 ├────────────────────────────────────────────────────────────────────────────────┤
 │                                                                                │
 │  Request 1:  [System] + [Msg1, Msg2]                    = 200 tokens          │
 │  Request 5:  [System] + [Msg1...Msg10]                  = 1000 tokens         │
 │  Request 10: [System] + [Msg1...Msg20]                  = 2000 tokens         │
 │  Request 20: [System] + [Msg1...Msg40]                  = 4000 tokens         │
 │                                                                                │
 │  ✓ Complete context                                                           │
 │  ✗ Linear growth in tokens                                                    │
 │  ✗ Expensive and slow for long conversations                                  │
 │  ✗ Redundant - most messages not relevant to current query                    │
 │                                                                                │
 └────────────────────────────────────────────────────────────────────────────────┘
 ┌────────────────────────────────────────────────────────────────────────────────┐
 │ 2. WINDOW MEMORY (Keep Last N Only)                                           │
 ├────────────────────────────────────────────────────────────────────────────────┤
 │                                                                                │
 │  Request 1:  [System] + [Msg1, Msg2]                    = 200 tokens          │
 │  Request 5:  [System] + [Msg7, Msg8, Msg9, Msg10]       = 500 tokens          │
 │  Request 10: [System] + [Msg17, Msg18, Msg19, Msg20]    = 500 tokens          │
 │  Request 20: [System] + [Msg37, Msg38, Msg39, Msg40]    = 500 tokens          │
 │                                                                                │
 │  ✓ Constant token usage                                                       │
 │  ✓ Very fast and cheap                                                        │
 │  ✗ Completely forgets old context                                             │
 │  ✗ Can't reference earlier conversation                                       │
 │                                                                                │
 └────────────────────────────────────────────────────────────────────────────────┘
 ┌────────────────────────────────────────────────────────────────────────────────┐
 │ 3. ROLLING SUMMARY (RECOMMENDED)                                              │
 ├────────────────────────────────────────────────────────────────────────────────┤
 │                                                                                │
 │  Request 1-5:  [System] + [Msg1...Msg10]                = 1000 tokens         │
 │                (Short conversation - no summary yet)                           │
 │                                                                                │
 │  Request 10+:  [System + Summary] + [Recent 8 msgs]     = 600 tokens          │
 │                                                                                │
 │                ┌─────────────────────────────────────┐                         │
 │                │ Summary: "User discussed weather    │                         │
 │                │ and hiking. Mt Si is 4hr moderate   │                         │
 │                │ hike, Rattlesnake is 2mi easier."   │  (100 tokens)          │
 │                └─────────────────────────────────────┘                         │
 │                           ↓                                                    │
 │                ┌─────────────────────────────────────┐                         │
 │                │ User: How crowded does it get?      │                         │
 │                │ Assistant: Very crowded weekends    │                         │
 │                │ User: Any other trails nearby?      │  (400 tokens)          │
 │                │ Assistant: Rattlesnake is closer    │                         │
 │                │ ... (last 4 exchanges)              │                         │
 │                └─────────────────────────────────────┘                         │
 │                                                                                │
 │  Request 20:   [System + Summary] + [Recent 8 msgs]     = 600 tokens          │
 │                (Summary updated every ~8 new messages)                         │
 │                                                                                │
 │  ✓ Balanced token usage (70-80% reduction)                                    │
 │  ✓ Preserves long-term context via summary                                    │
 │  ✓ Recent messages in full detail                                             │
 │  ✓ Scalable to very long conversations                                        │
 │  ✗ Small overhead for summary generation (1-2s every 8-10 msgs)               │
 │                                                                                │
 └────────────────────────────────────────────────────────────────────────────────┘
 ┌────────────────────────────────────────────────────────────────────────────────┐
 │ 4. VECTOR STORE MEMORY (ChromaDB/Qdrant)                                      │
 ├────────────────────────────────────────────────────────────────────────────────┤
 │                                                                                │
 │  Current query: "What trails are nearby?"                                     │
 │                     ↓ (embed and search)                                      │
 │  ┌──────────────────────────────────────────────────────────────────┐         │
 │  │ Vector DB: Find semantically similar past messages               │         │
 │  │  - "Mt Si is a moderate 4-hour hike" (score: 0.89)               │         │
 │  │  - "Rattlesnake Ledge has lake views" (score: 0.85)              │         │
 │  │  - "Bring water and snacks" (score: 0.62)                        │         │
 │  └──────────────────────────────────────────────────────────────────┘         │
 │                     ↓                                                          │
 │  [System + Top 3 relevant] + [Current query]             = 500 tokens         │
 │                                                                                │
 │  ✓ Semantic retrieval - finds relevant context                                │
 │  ✓ Works for sparse conversations                                             │
 │  ✓ Enables cross-conversation search                                          │
 │  ✗ Requires embeddings (API calls or local model)                             │
 │  ✗ Adds complexity (vector DB, indexing)                                      │
 │  ✗ May retrieve irrelevant "similar" messages                                 │
 │                                                                                │
 └────────────────────────────────────────────────────────────────────────────────┘
 ┌────────────────────────────────────────────────────────────────────────────────┐
 │ 5. MEMGPT/LETTA (Self-Editing Memory)                                         │
 ├────────────────────────────────────────────────────────────────────────────────┤
 │                                                                                │
 │  ┌───────────────────────────────────┐                                        │
 │  │ Core Memory (always in context):  │                                        │
 │  │  - User: Matt                     │  (50 tokens)                           │
 │  │  - Preferences: Metric units      │                                        │
 │  └───────────────────────────────────┘                                        │
 │                ↓                                                               │
 │  ┌───────────────────────────────────┐                                        │
 │  │ Recall Memory (vector search):    │                                        │
 │  │  - [Retrieved: 3 relevant msgs]   │  (300 tokens)                          │
 │  └───────────────────────────────────┘                                        │
 │                ↓                                                               │
 │  ┌───────────────────────────────────┐                                        │
 │  │ Archival Memory (long-term):      │                                        │
 │  │  - [Searchable but not loaded]    │                                        │
 │  └───────────────────────────────────┘                                        │
 │                                                                                │
 │  Agent decides what to remember/forget/search                                 │
 │                                                                                │
 │  ✓ Most sophisticated - agent manages own memory                              │
 │  ✓ Handles complex multi-day conversations                                    │
 │  ✗ Very heavy (200MB+ dependencies)                                           │
 │  ✗ Requires vector embeddings                                                 │
 │  ✗ Overkill for simple chat                                                   │
 │  ✗ Opinionated architecture (hard to integrate)                               │
 │                                                                                │
 └────────────────────────────────────────────────────────────────────────────────┘
 ╔════════════════════════════════════════════════════════════════════════════════╗
 ║                         RECOMMENDATION MATRIX                                  ║
 ╚════════════════════════════════════════════════════════════════════════════════╝
 ┌──────────────┬──────────────┬────────────┬──────────────┬──────────────────────┐
 │   Approach   │ Dependencies │   Tokens   │  Complexity  │    Use Case          │
 ├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
 │ Full History │     None     │    High    │     Low      │ Don't use (baseline) │
 ├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
 │ Window Only  │     None     │    Low     │     Low      │ Stateless chat bots  │
 ├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
 │ Rolling      │              │            │              │ ✓ MESHAI             │
 │ Summary      │     None     │ Very Low   │     Low      │ ✓ Most projects      │
 │ (DIY)        │              │            │              │ ✓ Best balance       │
 ├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
 │ LangChain    │   ~50 MB     │ Very Low   │    Medium    │ Want batteries-      │
 │ Summary      │              │            │              │ included solution    │
 ├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
 │ Vector Store │   ~20 MB     │    Low     │    Medium    │ Semantic search,     │
 │ (ChromaDB)   │              │            │              │ long-term memory     │
 ├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
 │ MemGPT/Letta │  ~200 MB     │    Low     │  Very High   │ Complex multi-day    │
 │              │              │            │              │ agent workflows      │
 └──────────────┴──────────────┴────────────┴──────────────┴──────────────────────┘
 ╔════════════════════════════════════════════════════════════════════════════════╗
 ║                     PERFORMANCE COMPARISON (20 messages)                       ║
 ╚════════════════════════════════════════════════════════════════════════════════╝
  Tokens Sent to LLM
  ↑
  │
 4000│  ████████████████████████████████  Full History
  │
 3000│
  │
 2000│
  │
 1000│
  │
 600│           ██████  Rolling Summary
 500│                   █████  Window Only
  │                    █████  Vector Store
  0└─────────────────────────────────────────────────────────→
     1    5   10   15   20   25   30   35   40  (Conversation length)
  Legend:
  ████  Full History (linear growth)
  ████  Rolling Summary (plateau after initial growth)
  ████  Window/Vector (constant)
 ╔════════════════════════════════════════════════════════════════════════════════╗
 ║                    IMPLEMENTATION COMPLEXITY                                   ║
 ╚════════════════════════════════════════════════════════════════════════════════╝
 ┌─────────────────────────────────────────────────────────────────────────────┐
 │  Simple ←───────────────────────────────────────────────────→ Complex       │
 ├─────────────────────────────────────────────────────────────────────────────┤
 │                                                                             │
 │  Window Only          Rolling Summary       LangChain        MemGPT        │
 │  (20 lines)           (100 lines)           (10 lines       (200+ lines    │
 │                                             + 50MB dep)      + 200MB dep)   │
 │                                                                             │
 │  ↑                    ↑                     ↑                ↑              │
 │  No deps              No deps               Heavy deps       Very heavy     │
 │  No persistence       SQLite persist        In-memory        Built-in DB    │
 │  Loses old context    Keeps summary         Keeps summary    Multi-tier     │
 │                                                                             │
 │                       ★ RECOMMENDED ★                                       │
 └─────────────────────────────────────────────────────────────────────────────┘
 ╔════════════════════════════════════════════════════════════════════════════════╗
 ║                      FOR MESHAI SPECIFICALLY                                   ║
 ╚════════════════════════════════════════════════════════════════════════════════╝
 Current:
  - Messages: 150 chars max (very small)
  - Conversations: Per-user, linear
  - Backend: OpenAI-compatible (LiteLLM, local models)
  - Storage: SQLite + aiosqlite
  - Problem: Full history sent every time
 Constraints:
  - Lightweight (runs on mesh nodes potentially)
  - No heavy dependencies
  - Must work offline (local models)
  - Persistence required (survive restarts)
 Solution: Rolling Summary
  ✓ Zero dependencies (pure Python)
  ✓ Works with existing AsyncOpenAI client
  ✓ Persists in existing SQLite database
  ✓ ~100 lines of code (easy to maintain)
  ✓ 70-80% token reduction
  ✓ Tunable (window_size, summarize_threshold)
 Configuration:
  - window_size = 4 (keep last 4 exchanges = 8 messages)
  - summarize_threshold = 8 (re-summarize after 8 new messages)
 Expected savings:
  - 10 messages: 0% (no summary yet)
  - 20 messages: 66% token reduction
  - 30 messages: 75% token reduction
  - 50 messages: 84% token reduction
 Cost impact (at $0.50/1M tokens):
  - Before: $0.0012 per request (2400 tokens)
  - After:  $0.0003 per request (600 tokens)
  - Savings: $27/month for 1000 requests/day
 ╔════════════════════════════════════════════════════════════════════════════════╗
 ║                              NEXT STEPS                                        ║
 ╚════════════════════════════════════════════════════════════════════════════════╝
 1. Read:   MEMORY_SUMMARY.md (quick overview)
 2. Study:  MEMORY_RESEARCH.md (detailed analysis)
 3. Test:   python examples/memory_comparison.py (see it in action)
 4. Build:  MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step)
 5. Deploy: Monitor and tune based on real usage
 Files created (paths relative to the repository root):
  - MEMORY_SUMMARY.md
  - MEMORY_RESEARCH.md
  - MEMORY_IMPLEMENTATION_GUIDE.md
  - examples/memory_comparison.py
 Good luck! 🚀
--- a/examples/memory_comparison.py
+++ b/examples/memory_comparison.py
@ -1,285 +0,0 @@
 #!/usr/bin/env python3
 """
 Proof-of-concept: Compare full history vs rolling summary memory.
 Demonstrates token savings and performance of different approaches.
 Usage:
    python examples/memory_comparison.py
 """
 import asyncio
 import time
 from typing import Optional
 from openai import AsyncOpenAI
 # ============================================================================
 # SIMPLE ROLLING SUMMARY IMPLEMENTATION
 # ============================================================================
 class SimpleRollingSummary:
    """Minimal rolling summary memory manager for testing.

    Keeps the most recent ``window_size`` exchanges (user + assistant pairs)
    verbatim and replaces everything older with an LLM-generated summary that
    is cached per user.
    """

    def __init__(
        self,
        client: "AsyncOpenAI",
        model: str,
        window_size: int = 4,
        summarize_threshold: int = 1,
    ):
        """Store the client and tuning knobs.

        Args:
            client: OpenAI-compatible async client used to generate summaries.
            model: Model name passed to the completion endpoint.
            window_size: Number of most recent exchanges kept in full; each
                exchange counts as two messages.
            summarize_threshold: Number of messages that must age out of the
                window before a cached summary is regenerated. Until then the
                aged-out messages are returned verbatim together with the
                window, so no message is ever silently dropped.
        """
        self.client = client
        self.model = model
        self.window_size = window_size
        self.summarize_threshold = summarize_threshold
        # user_id -> (number of leading messages the summary covers, summary)
        self._summary_cache = {}

    async def get_context(
        self, user_id: str, messages: list[dict]
    ) -> tuple[Optional[str], list[dict]]:
        """Return (summary, recent_messages) for optimized context.

        Args:
            user_id: Key under which the summary is cached.
            messages: Full conversation as ``{"role": ..., "content": ...}``
                dicts, oldest first.

        Returns:
            ``(None, messages)`` while the conversation still fits in the
            window; otherwise the summary of the older messages and the
            messages that follow the summarized prefix.
        """
        window = self.window_size * 2
        # Short conversation - return all messages
        if len(messages) <= window:
            return None, messages
        # Split with a non-negative index: slicing with ``-window`` breaks for
        # window_size=0, where ``messages[:-0]`` is an empty list.
        split = len(messages) - window
        cached = self._summary_cache.get(user_id)
        if cached is not None:
            covered, summary = cached
            # Reuse the cached summary only while it still describes a prefix
            # of this history and fewer than ``summarize_threshold`` messages
            # have aged out since it was generated.
            if 0 <= split - covered < self.summarize_threshold:
                return summary, messages[covered:]
        # No usable summary (first time, history grew enough, or it shrank).
        summary = await self._summarize(messages[:split])
        self._summary_cache[user_id] = (split, summary)
        return summary, messages[split:]

    async def _summarize(self, messages: list[dict]) -> str:
        """Ask the model for a short summary of ``messages``.

        Returns:
            The stripped summary text, or ``""`` when the model returns no
            text content.
        """
        conv = "\n".join(f"{m['role'].upper()}: {m['content']}" for m in messages)
        prompt = f"""Summarize this conversation in 2-3 concise sentences:
 {conv}
 Summary:"""
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.3,
        )
        # ``content`` is optional in the chat completion schema; guard against
        # None instead of failing on ``.strip()``.
        content = response.choices[0].message.content
        return (content or "").strip()
 # ============================================================================
 # COMPARISON SCENARIOS
 # ============================================================================
 async def test_full_history(client: AsyncOpenAI, model: str, messages: list[dict]):
    """Baseline: Send full conversation history.

    Args:
        client: OpenAI-compatible async client used for the completion call.
        model: Model name passed to the endpoint.
        messages: Conversation as ``{"role": ..., "content": ...}`` dicts.

    Returns:
        ``(est_tokens, elapsed)``: a rough input-token estimate and the
        duration of the completion request in seconds.
    """
    print("\n=== FULL HISTORY APPROACH ===")
    system = "You are a helpful assistant on a mesh network."
    full = [{"role": "system", "content": system}] + messages
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # (or go negative) when the system clock is adjusted mid-request.
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=full, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start
    # Estimate tokens (rough): 4 chars = 1 token
    total_chars = sum(len(m["content"]) for m in full)
    est_tokens = total_chars // 4
    # The reply may carry no text content (None); preview an empty string
    # instead of raising TypeError on the slice.
    reply = response.choices[0].message.content or ""
    print(f"Messages sent: {len(full)}")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")
    return est_tokens, elapsed
 async def test_rolling_summary(
    client: AsyncOpenAI, model: str, messages: list[dict], user_id: str
 ):
    """Optimized: Send summary + recent messages.

    Args:
        client: OpenAI-compatible async client used for both the summary and
            the completion call.
        model: Model name passed to the endpoint.
        messages: Conversation as ``{"role": ..., "content": ...}`` dicts.
        user_id: Key under which the memory manager caches the summary.

    Returns:
        ``(est_tokens, elapsed)``: a rough input-token estimate and the
        duration of the completion request in seconds.
    """
    print("\n=== ROLLING SUMMARY APPROACH ===")
    memory = SimpleRollingSummary(client, model, window_size=4)
    summary, recent = await memory.get_context(user_id, messages)
    system = "You are a helpful assistant on a mesh network."
    if summary:
        system += f"\n\nPrevious conversation summary: {summary}"
    context = [{"role": "system", "content": system}] + recent
    # Timing starts after the summary is built, so it covers only the final
    # completion request. perf_counter is monotonic, unlike time.time().
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=context, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start
    # Estimate tokens (rough): 4 chars = 1 token
    total_chars = sum(len(m["content"]) for m in context)
    est_tokens = total_chars // 4
    # The reply may carry no text content (None); preview an empty string
    # instead of raising TypeError on the slice.
    reply = response.choices[0].message.content or ""
    print(f"Messages sent: {len(context)} (summary: {summary is not None})")
    if summary:
        print(f"Summary: {summary[:80]}...")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")
    return est_tokens, elapsed
 async def test_window_only(client: AsyncOpenAI, model: str, messages: list[dict]):
    """Simple window: Just last N messages, no summary.

    Args:
        client: OpenAI-compatible async client used for the completion call.
        model: Model name passed to the endpoint.
        messages: Conversation as ``{"role": ..., "content": ...}`` dicts.

    Returns:
        ``(est_tokens, elapsed)``: a rough input-token estimate and the
        duration of the completion request in seconds.
    """
    print("\n=== WINDOW-ONLY APPROACH ===")
    window_size = 4
    # Each exchange is a user + assistant pair, hence two messages.
    recent = messages[-(window_size * 2) :]
    system = "You are a helpful assistant on a mesh network."
    context = [{"role": "system", "content": system}] + recent
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # (or go negative) when the system clock is adjusted mid-request.
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=context, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start
    # Estimate tokens (rough): 4 chars = 1 token
    total_chars = sum(len(m["content"]) for m in context)
    est_tokens = total_chars // 4
    # The reply may carry no text content (None); preview an empty string
    # instead of raising TypeError on the slice.
    reply = response.choices[0].message.content or ""
    print(f"Messages sent: {len(context)} (last {window_size} exchanges only)")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")
    return est_tokens, elapsed
 # ============================================================================
 # MAIN TEST
 # ============================================================================
 async def main():
    """Run all three memory strategies against one scripted conversation.

    Builds a 15-exchange (30-message) dialogue, sends it through the full
    history, rolling summary and window-only approaches, then prints a
    token/time comparison table followed by usage recommendations.
    """
    # Connection settings - edit these for your setup (LiteLLM, local model, etc.)
    BASE_URL = "http://192.168.1.239:8000/v1"  # LiteLLM endpoint
    API_KEY = "sk-1234"  # Your API key
    MODEL = "gpt-4o-mini"  # Your model
    rule = "=" * 70
    print(rule)
    print("LLM Memory Approach Comparison")
    print(rule)
    # Scripted (user, assistant) exchanges that make up the test conversation
    exchanges = [
        ("What's the weather?", "It's sunny and 72°F."),
        ("Should I bring an umbrella?", "No need, clear skies all day."),
        ("What about tomorrow?", "Tomorrow looks rainy, bring an umbrella."),
        ("Any hiking recommendations?", "Try Mt. Si, great views!"),
        ("How long is the hike?", "About 4 hours round trip."),
        ("Is it beginner friendly?", "Moderate difficulty, doable for most."),
        ("What should I bring?", "Water, snacks, good boots, and layers."),
        ("Are dogs allowed?", "Yes, but must be leashed."),
        ("Where's the trailhead?", "Off I-90 near North Bend."),
        ("Parking fee?", "Yes, $10 or Northwest Forest Pass."),
        ("What time should I start?", "Early morning, around 7-8 AM."),
        ("How crowded does it get?", "Very crowded on weekends, go weekdays."),
        ("Any other trails nearby?", "Rattlesnake Ledge is easier and closer."),
        ("Tell me about Rattlesnake", "2 miles, great lake views, very popular."),
        ("Which would you recommend?", "If fit: Mt Si. If casual: Rattlesnake."),
    ]
    # Flatten each exchange into a user message followed by the assistant reply
    messages = [
        {"role": role, "content": text}
        for question, answer in exchanges
        for role, text in (("user", question), ("assistant", answer))
    ]
    lengths = [len(m["content"]) for m in messages]
    print(f"\nTest conversation: {len(messages)} messages ({len(exchanges)} exchanges)")
    print("Topics: weather → hiking → trails")
    print(f"Message lengths: {min(lengths)}-{max(lengths)} chars")
    client = AsyncOpenAI(api_key=API_KEY, base_url=BASE_URL)
    try:
        # Run the three strategies one after another on the same conversation
        full_tokens, full_time = await test_full_history(client, MODEL, messages)
        summary_tokens, summary_time = await test_rolling_summary(
            client, MODEL, messages, "!test_user"
        )
        window_tokens, window_time = await test_window_only(client, MODEL, messages)
        # Comparison table; savings are relative to the full-history baseline
        print("\n" + rule)
        print("COMPARISON RESULTS")
        print(rule)
        print(f"\n{'Approach':<20} {'Tokens':<15} {'Time':<10} {'Savings'}")
        print("-" * 70)
        print(
            f"{'Full History':<20} {full_tokens:<15} {full_time:<10.2f}s {'(baseline)'}"
        )
        optimized_rows = (
            ("Rolling Summary", summary_tokens, summary_time),
            ("Window Only", window_tokens, window_time),
        )
        for label, tokens, seconds in optimized_rows:
            print(
                f"{label:<20} {tokens:<15} {seconds:<10.2f}s "
                f"{(1 - tokens/full_tokens)*100:.1f}%"
            )
        print("\n" + rule)
        print("RECOMMENDATIONS")
        print(rule)
        recommendation_lines = (
            "\nFull History:",
            "  ✓ Complete context",
            "  ✗ High token usage",
            "  ✗ Slower for long conversations",
            "  Use: Never (inefficient)",
            "\nWindow Only:",
            "  ✓ Very low token usage",
            "  ✓ Fast",
            "  ✗ Loses older context completely",
            "  Use: Short-term conversations only",
            "\nRolling Summary:",
            "  ✓ Balanced token usage",
            "  ✓ Preserves long-term context",
            "  ✓ Fast after initial summary",
            "  ✗ Slight overhead for summarization",
            "  Use: RECOMMENDED for MeshAI",
        )
        for line in recommendation_lines:
            print(line)
        print("\n" + rule)
    finally:
        # Always release the HTTP client, even if a request fails
        await client.close()
 # Script entry point: run the comparison when executed directly
 if __name__ == "__main__":
    asyncio.run(main())