diff --git a/MEMORY_IMPLEMENTATION_GUIDE.md b/MEMORY_IMPLEMENTATION_GUIDE.md deleted file mode 100644 index b0e8fd0..0000000 --- a/MEMORY_IMPLEMENTATION_GUIDE.md +++ /dev/null @@ -1,656 +0,0 @@ -# Quick Implementation Guide: Rolling Summary Memory - -## TL;DR - -**Problem:** Sending full conversation history every request wastes tokens and adds latency. - -**Solution:** Rolling summary approach - keep recent messages + LLM-generated summary of older messages. - -**Result:** ~70-80% token reduction for long conversations (see Expected Results below), zero dependencies, works with current stack. - ---- - -## Architecture - -``` -SQLite History (per user) - ↓ -Messages 1-10: Summarized → "User asked about weather, discussed outdoor plans" -Messages 11-18: Sent raw → Full context - ↓ -LLM receives: System prompt + Summary + Recent 8 messages - ↓ -Response generated -``` - ---- - -## Files to Create/Modify - -### 1. Create `meshai/memory.py` - -```python -"""Lightweight rolling summary memory manager.""" - -import time -from dataclasses import dataclass -from typing import Optional - -from openai import AsyncOpenAI - - -@dataclass -class ConversationSummary: - """Summary of conversation history.""" - - summary: str - last_updated: float - message_count: int - - -class RollingSummaryMemory: - """Manages conversation summaries with recent message window. - - Strategy: - - Keep last N message pairs (window_size) in full - - Summarize everything before the window - - Update summary when old messages accumulate - - Example (window_size=4): - Messages 1-10: Summarized to "User discussed weather and plans" - Messages 11-18: Kept in full (last 4 pairs) - Context sent: [Summary] + [Messages 11-18] - """ - - def __init__( - self, - client: AsyncOpenAI, - model: str, - window_size: int = 4, - summarize_threshold: int = 8, - ): - """Initialize rolling summary memory. 
- - Args: - client: AsyncOpenAI client for generating summaries - model: Model name to use for summarization - window_size: Number of recent message pairs to keep in full - summarize_threshold: Messages to accumulate before re-summarizing - """ - self._client = client - self._model = model - self._window_size = window_size - self._summarize_threshold = summarize_threshold - - # In-memory cache of summaries (loaded from DB on startup) - self._summaries: dict[str, ConversationSummary] = {} - - async def get_context_messages( - self, - user_id: str, - full_history: list[dict], - ) -> tuple[Optional[str], list[dict]]: - """Get optimized context: summary + recent messages. - - Args: - user_id: User identifier - full_history: Full message history from database - - Returns: - Tuple of (summary_text, recent_messages) - summary_text is None if conversation is short - """ - # Short conversation - no summary needed - if len(full_history) <= self._window_size * 2: - return None, full_history - - # Split into old (to summarize) and recent (keep raw) - split_point = -(self._window_size * 2) - old_messages = full_history[:split_point] - recent_messages = full_history[split_point:] - - # Get or create summary - summary = await self._get_or_create_summary(user_id, old_messages) - - return summary.summary, recent_messages - - async def _get_or_create_summary( - self, - user_id: str, - messages: list[dict], - ) -> ConversationSummary: - """Get cached summary or create new one.""" - # Check cache - if user_id in self._summaries: - cached = self._summaries[user_id] - - # Reuse if message count is close - if abs(cached.message_count - len(messages)) < self._summarize_threshold: - return cached - - # Generate new summary - summary_text = await self._summarize(messages) - - summary = ConversationSummary( - summary=summary_text, - last_updated=time.time(), - message_count=len(messages), - ) - - self._summaries[user_id] = summary - return summary - - async def _summarize(self, messages: 
list[dict]) -> str: - """Generate summary using LLM.""" - # Format conversation - conversation = "\n".join( - [f"{msg['role'].upper()}: {msg['content']}" for msg in messages] - ) - - prompt = f"""Summarize this conversation in 2-3 concise sentences. Focus on: -- Main topics discussed -- Important context or user preferences -- Key information to remember - -Conversation: -{conversation} - -Summary (2-3 sentences):""" - - try: - response = await self._client.chat.completions.create( - model=self._model, - messages=[{"role": "user", "content": prompt}], - max_tokens=150, - temperature=0.3, - ) - - return response.choices[0].message.content.strip() - - except Exception as e: - # Fallback - return f"Previous conversation: {len(messages)} messages about various topics." - - def load_summary(self, user_id: str, summary: ConversationSummary) -> None: - """Load summary from database into cache.""" - self._summaries[user_id] = summary - - def clear_summary(self, user_id: str) -> None: - """Clear cached summary for user.""" - self._summaries.pop(user_id, None) -``` - ---- - -### 2. 
Modify `meshai/history.py` - -Add summary storage methods: - -```python -# Add to ConversationHistory class - -async def initialize(self) -> None: - """Initialize database and create tables.""" - self._db = await aiosqlite.connect(self._db_path) - - # Existing conversations table - await self._db.execute(""" - CREATE TABLE IF NOT EXISTS conversations ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - user_id TEXT NOT NULL, - role TEXT NOT NULL, - content TEXT NOT NULL, - timestamp REAL NOT NULL - ) - """) - - await self._db.execute(""" - CREATE INDEX IF NOT EXISTS idx_user_timestamp - ON conversations (user_id, timestamp) - """) - - # NEW: Summaries table - await self._db.execute(""" - CREATE TABLE IF NOT EXISTS conversation_summaries ( - user_id TEXT PRIMARY KEY, - summary TEXT NOT NULL, - message_count INTEGER NOT NULL, - updated_at REAL NOT NULL - ) - """) - - await self._db.commit() - logger.info(f"Conversation history initialized at {self._db_path}") - - -async def store_summary( - self, user_id: str, summary: str, message_count: int -) -> None: - """Store conversation summary. - - Args: - user_id: Node ID of user - summary: Summary text - message_count: Number of messages summarized - """ - if not self._db: - raise RuntimeError("Database not initialized") - - async with self._lock: - await self._db.execute( - """ - INSERT OR REPLACE INTO conversation_summaries - (user_id, summary, message_count, updated_at) - VALUES (?, ?, ?, ?) - """, - (user_id, summary, message_count, time.time()), - ) - await self._db.commit() - - -async def get_summary(self, user_id: str) -> Optional[dict]: - """Get conversation summary for user. - - Args: - user_id: Node ID of user - - Returns: - Dict with 'summary', 'message_count', 'updated_at' or None - """ - if not self._db: - raise RuntimeError("Database not initialized") - - async with self._lock: - cursor = await self._db.execute( - """ - SELECT summary, message_count, updated_at - FROM conversation_summaries - WHERE user_id = ? 
- """, - (user_id,), - ) - row = await cursor.fetchone() - - if not row: - return None - - return { - "summary": row[0], - "message_count": row[1], - "updated_at": row[2], - } - - -async def clear_summary(self, user_id: str) -> None: - """Clear summary for user (e.g., on history reset). - - Args: - user_id: Node ID of user - """ - if not self._db: - raise RuntimeError("Database not initialized") - - async with self._lock: - await self._db.execute( - "DELETE FROM conversation_summaries WHERE user_id = ?", - (user_id,), - ) - await self._db.commit() -``` - ---- - -### 3. Modify `meshai/backends/openai_backend.py` - -Integrate memory manager: - -```python -"""OpenAI-compatible LLM backend with rolling summary memory.""" - -import logging -from typing import Optional - -from openai import AsyncOpenAI - -from ..config import LLMConfig -from ..memory import RollingSummaryMemory -from .base import LLMBackend - -logger = logging.getLogger(__name__) - - -class OpenAIBackend(LLMBackend): - """OpenAI-compatible backend with intelligent memory management.""" - - def __init__(self, config: LLMConfig, api_key: str): - """Initialize OpenAI backend. - - Args: - config: LLM configuration - api_key: API key to use - """ - self.config = config - self._client = AsyncOpenAI( - api_key=api_key, - base_url=config.base_url, - ) - - # Initialize rolling summary memory - self._memory = RollingSummaryMemory( - client=self._client, - model=config.model, - window_size=4, # Keep last 4 exchanges (8 messages) - summarize_threshold=8, # Re-summarize after 8 new messages - ) - - async def generate( - self, - messages: list[dict], - system_prompt: str, - user_id: str = None, # NEW: optional for backward compatibility - max_tokens: int = 300, - ) -> str: - """Generate a response using OpenAI-compatible API. 
- - Args: - messages: Conversation history - system_prompt: System prompt - user_id: User identifier (for memory management) - max_tokens: Maximum tokens to generate - - Returns: - Generated response - """ - # If no user_id, use old behavior (send full history) - if not user_id: - full_messages = [{"role": "system", "content": system_prompt}] - full_messages.extend(messages) - else: - # Use memory manager to optimize context - summary, recent_messages = await self._memory.get_context_messages( - user_id=user_id, - full_history=messages, - ) - - # Build optimized message list - if summary: - # Long conversation: system + summary + recent - enhanced_system = f"""{system_prompt} - -Previous conversation summary: {summary}""" - full_messages = [{"role": "system", "content": enhanced_system}] - full_messages.extend(recent_messages) - - logger.debug( - f"Using summary + {len(recent_messages)} recent messages " - f"(total history: {len(messages)})" - ) - else: - # Short conversation: system + all messages - full_messages = [{"role": "system", "content": system_prompt}] - full_messages.extend(messages) - - try: - response = await self._client.chat.completions.create( - model=self.config.model, - messages=full_messages, - max_tokens=max_tokens, - temperature=0.7, - ) - - content = response.choices[0].message.content - return content.strip() if content else "" - - except Exception as e: - logger.error(f"OpenAI API error: {e}") - raise - - def load_summary_cache(self, user_id: str, summary_data: dict) -> None: - """Load summary into memory cache (called on startup). 
- - Args: - user_id: User identifier - summary_data: Dict with 'summary', 'message_count', 'updated_at' - """ - from ..memory import ConversationSummary - - summary = ConversationSummary( - summary=summary_data["summary"], - message_count=summary_data["message_count"], - last_updated=summary_data["updated_at"], - ) - self._memory.load_summary(user_id, summary) - - def clear_summary_cache(self, user_id: str) -> None: - """Clear summary cache for user.""" - self._memory.clear_summary(user_id) - - # ... rest of methods unchanged ... -``` - ---- - -### 4. Modify `meshai/responder.py` - -Pass user_id to backend and persist summaries: - -```python -# In the generate_response method - -async def generate_response(self, user_id: str, message: str) -> str: - """Generate LLM response with optimized memory.""" - - # Add user message to history - await self.history.add_message(user_id, "user", message) - - # Get conversation history - history = await self.history.get_history_for_llm(user_id) - - # Generate response with user_id for memory management - response = await self.backend.generate( - messages=history, - system_prompt=self.system_prompt, - user_id=user_id, # NEW: enables memory optimization - max_tokens=300, - ) - - # Add assistant response to history - await self.history.add_message(user_id, "assistant", response) - - # Persist summary if one was created - # The memory manager caches it, we need to save to DB - summary_data = await self._get_current_summary(user_id) - if summary_data: - await self.history.store_summary( - user_id, - summary_data["summary"], - summary_data["message_count"], - ) - - return response - - -async def _get_current_summary(self, user_id: str) -> Optional[dict]: - """Get current summary from memory manager if it exists.""" - # Access the memory manager's cache - if hasattr(self.backend, "_memory"): - summary = self.backend._memory._summaries.get(user_id) - if summary: - return { - "summary": summary.summary, - "message_count": 
summary.message_count, - "updated_at": summary.last_updated, - } - return None -``` - ---- - -### 5. Modify `meshai/commands/reset.py` - -Clear summaries when resetting history: - -```python -async def execute(self, sender_id: str, args: list[str]) -> str: - """Reset conversation history.""" - count = await self.responder.history.clear_history(sender_id) - - # NEW: Also clear summary - await self.responder.history.clear_summary(sender_id) - if hasattr(self.responder.backend, "clear_summary_cache"): - self.responder.backend.clear_summary_cache(sender_id) - - return f"Cleared {count} messages from your history." -``` - ---- - -## Configuration - -Add to `meshai/config.py`: - -```python -@dataclass -class MemoryConfig: - """Memory management configuration.""" - - # Rolling summary settings - window_size: int = 4 # Recent message pairs to keep - summarize_threshold: int = 8 # Messages before re-summarizing - - # When to enable summaries - min_messages_for_summary: int = 10 # Start summarizing after this many -``` - ---- - -## Testing - -```python -# Test script -import asyncio -from meshai.backends.openai_backend import OpenAIBackend -from meshai.config import LLMConfig - -async def test(): - config = LLMConfig( - backend="openai", - base_url="http://192.168.1.239:8000/v1", - model="gpt-4o-mini" - ) - - backend = OpenAIBackend(config, "your-key") - - # Simulate long conversation - messages = [] - for i in range(20): - messages.append({"role": "user", "content": f"Question {i}"}) - messages.append({"role": "assistant", "content": f"Answer {i}"}) - - # Generate - should use summary - response = await backend.generate( - messages=messages, - system_prompt="You are helpful.", - user_id="!test123", - max_tokens=100 - ) - - print(f"Response: {response}") - print(f"Sent {len(messages)} messages, but only ~10 used in context") - -asyncio.run(test()) -``` - ---- - -## Expected Results - -### Token Usage Comparison - -**Before (full history):** -``` -User message 1-20: ~2000 
tokens -System prompt: ~50 tokens -Total: ~2050 tokens per request -``` - -**After (with summary):** -``` -System prompt: ~50 tokens -Summary: ~100 tokens -Recent 8 messages: ~400 tokens -Total: ~550 tokens per request -``` - -**Savings: ~73% token reduction** - -### Performance Impact - -- **Summary generation**: ~1-2s every 8-10 messages (amortized) -- **Regular requests**: No added latency -- **Storage**: ~100 bytes per summary in SQLite - ---- - -## Tuning Parameters - -### window_size -- **Smaller (2-3)**: More aggressive summarization, max token savings -- **Larger (5-6)**: More context, less summarization -- **Recommended**: 4 (last 4 exchanges = 8 messages) - -### summarize_threshold -- **Smaller (4-6)**: Frequent re-summarization, more current -- **Larger (10-12)**: Less summarization overhead -- **Recommended**: 8 (re-summarize after 8 new messages) - -### For MeshAI specifically: -- Messages are tiny (150 chars max) -- `window_size=4` gives ~600 chars of recent context -- `summarize_threshold=8` balances overhead vs accuracy - ---- - -## Migration Path - -1. **Phase 1**: Add code, test with new users -2. **Phase 2**: Run in parallel (old + new backend) -3. **Phase 3**: Migrate existing users (generate summaries for existing history) -4. **Phase 4**: Remove old full-history code path - -No data loss - summaries stored in DB, can regenerate anytime. - ---- - -## Maintenance - -### Monitor summary quality: -```sql --- Check summaries -SELECT user_id, summary, message_count, updated_at -FROM conversation_summaries -ORDER BY updated_at DESC; -``` - -### Regenerate summary: -```python -# Clear cache + DB, will regenerate on next request -await history.clear_summary(user_id) -backend.clear_summary_cache(user_id) -``` - -### Adjust if summaries too short/long: -- Modify prompt in `_summarize()` -- Adjust `max_tokens=150` for summaries -- Change temperature (lower = more consistent) - ---- - -## Future Enhancements - -1. 
**Hybrid approach**: Summary + semantic search for very long histories -2. **User preferences**: Store separate from summary (e.g., "likes weather in metric") -3. **Multi-level summaries**: Summarize summaries for years-long conversations -4. **Summary quality scoring**: Validate summaries maintain key information - -But start simple - this gets 80% of the benefit with 20% of the complexity. diff --git a/MEMORY_README.md b/MEMORY_README.md deleted file mode 100644 index fbb8c17..0000000 --- a/MEMORY_README.md +++ /dev/null @@ -1,437 +0,0 @@ -# LLM Conversation Memory Research & Implementation - -This directory contains comprehensive research and implementation guides for improving LLM conversation memory in MeshAI. - -## Problem Statement - -MeshAI currently sends the full conversation history with every LLM API call. This approach: -- Wastes tokens (expensive and slow) -- Doesn't scale to long conversations -- Sends redundant context the LLM doesn't need - -## Solution: Rolling Summary Memory - -Keep recent messages in full + LLM-generated summary of older messages. - -**Result:** 70-80% token reduction, zero dependencies, works with existing stack. - ---- - -## Documentation Index - -### 1. Quick Start - -**READ THIS FIRST:** [`MEMORY_SUMMARY.md`](/home/zvx/projects/meshai/MEMORY_SUMMARY.md) -- High-level overview -- Why rolling summary? -- Comparison with alternatives -- Expected performance gains - -**Estimated reading time:** 10 minutes - ---- - -### 2. Detailed Research - -**FOR DEEP DIVE:** [`MEMORY_RESEARCH.md`](/home/zvx/projects/meshai/MEMORY_RESEARCH.md) -- Full evaluation of 5 approaches: - 1. LangChain Memory modules - 2. LlamaIndex - 3. MemGPT/Letta - 4. Vector stores (ChromaDB/Qdrant) - 5. Simple rolling summary (DIY) -- Code examples for each approach -- Pros/cons for MeshAI specifically -- Detailed comparison matrix - -**Estimated reading time:** 30-45 minutes - ---- - -### 3. 
Implementation Guide - -**FOR BUILDING:** [`MEMORY_IMPLEMENTATION_GUIDE.md`](/home/zvx/projects/meshai/MEMORY_IMPLEMENTATION_GUIDE.md) -- Step-by-step implementation -- Complete code examples -- Database schema -- Configuration options -- Testing procedures -- Troubleshooting guide - -**Estimated reading time:** 20 minutes + implementation time - ---- - -### 4. Implementation Diff - -**FOR EXACT CHANGES:** [`docs/IMPLEMENTATION_DIFF.md`](/home/zvx/projects/meshai/docs/IMPLEMENTATION_DIFF.md) -- Exact code diffs for all files -- Line-by-line changes needed -- Migration checklist -- Rollback plan -- Performance validation queries - -**Estimated reading time:** 15 minutes - ---- - -### 5. Visual Comparison - -**FOR UNDERSTANDING:** [`docs/memory_approaches_comparison.txt`](/home/zvx/projects/meshai/docs/memory_approaches_comparison.txt) -- ASCII diagrams of all approaches -- Visual token usage comparison -- Decision matrices -- Architecture diagrams - -**Estimated reading time:** 10 minutes - ---- - -### 6. Quick Reference - -**FOR CHEAT SHEET:** [`docs/QUICK_REFERENCE.md`](/home/zvx/projects/meshai/docs/QUICK_REFERENCE.md) -- One-page reference card -- Key configuration -- Code snippets -- Performance metrics -- Troubleshooting tips - -**Estimated reading time:** 5 minutes - ---- - -### 7. 
Proof of Concept - -**FOR TESTING:** [`examples/memory_comparison.py`](/home/zvx/projects/meshai/examples/memory_comparison.py) -- Runnable comparison script -- Tests all 3 approaches side-by-side: - - Full history (baseline) - - Rolling summary - - Window-only -- Real token usage measurements -- Performance comparison - -**Usage:** -```bash -# Edit script with your LLM endpoint -nano examples/memory_comparison.py -# Update BASE_URL, API_KEY, MODEL - -# Run comparison -python examples/memory_comparison.py -``` - -**Expected output:** -``` -Approach Tokens Time Savings ----------------------------------------------------------------------- -Full History 1847 2.34s (baseline) -Rolling Summary 512 1.87s 72.3% -Window Only 398 1.45s 78.4% - -RECOMMENDATION: Rolling Summary - best balance of context and efficiency -``` - ---- - -## Recommended Reading Path - -### Path 1: Executive Summary (20 minutes) -1. `MEMORY_SUMMARY.md` - Overview -2. `docs/QUICK_REFERENCE.md` - Cheat sheet -3. `examples/memory_comparison.py` - Run the test - -**Decision point:** Convinced? Proceed to implementation. - ---- - -### Path 2: Technical Deep Dive (60 minutes) -1. `MEMORY_SUMMARY.md` - Overview -2. `MEMORY_RESEARCH.md` - Full evaluation -3. `docs/memory_approaches_comparison.txt` - Visual diagrams -4. `examples/memory_comparison.py` - Run the test -5. `MEMORY_IMPLEMENTATION_GUIDE.md` - How to build it - -**Decision point:** Ready to implement? Use the diff guide. - ---- - -### Path 3: Implementation (2-3 hours) -1. `MEMORY_SUMMARY.md` - Refresh on approach -2. `MEMORY_IMPLEMENTATION_GUIDE.md` - Full implementation guide -3. `docs/IMPLEMENTATION_DIFF.md` - Exact changes needed -4. Code the changes -5. Test with `examples/memory_comparison.py` -6. Deploy and monitor - -**Outcome:** Production-ready rolling summary memory. 
- ---- - -## Files Created - -### Documentation -``` -/home/zvx/projects/meshai/ -├── MEMORY_README.md (this file) -├── MEMORY_SUMMARY.md (overview) -├── MEMORY_RESEARCH.md (detailed research) -├── MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step) -├── docs/ -│ ├── IMPLEMENTATION_DIFF.md (exact changes) -│ ├── memory_approaches_comparison.txt (diagrams) -│ └── QUICK_REFERENCE.md (cheat sheet) -└── examples/ - └── memory_comparison.py (proof of concept) -``` - -### Code to Create (not yet created) -``` -meshai/ -├── memory.py (NEW - ~100 lines) -├── history.py (MODIFY - add ~70 lines) -├── backends/ -│ └── openai_backend.py (MODIFY - add ~30 lines) -├── responder.py (MODIFY - add ~10 lines) -└── commands/ - └── reset.py (MODIFY - add ~4 lines) -``` - -**Total new code:** ~214 lines -**Dependencies added:** 0 - ---- - -## Key Metrics - -### Token Savings - -| Conversation Length | Before | After | Savings | -|---------------------|--------|-------|---------| -| 10 messages | 800 | 800 | 0% | -| 20 messages | 1600 | 550 | 66% | -| 30 messages | 2400 | 600 | 75% | -| 50 messages | 4000 | 650 | 84% | - -### Cost Impact - -**Assumptions:** -- $0.50 per 1M input tokens -- 1000 requests per day -- Average 30 messages per conversation - -**Before:** $36/month -**After:** $9/month -**Savings:** $27/month (75% reduction) - -### Implementation Effort - -- Code to write: ~214 lines -- Code to modify: ~57 lines -- Time estimate: 2-3 hours -- Testing: 1 hour -- **Total:** Half a day - -### Risk Assessment - -- **Low risk:** Backward compatible (user_id parameter optional) -- **No data loss:** New table, existing data untouched -- **Easy rollback:** Git revert + drop one table -- **No dependencies:** Pure Python, existing libraries only - ---- - -## Configuration Summary - -### Recommended for MeshAI - -```python -RollingSummaryMemory( - client=self._client, - model=config.model, - window_size=4, # Keep last 4 exchanges (8 messages) - summarize_threshold=8, # Re-summarize after 8 new 
messages -) -``` - -**Rationale:** -- MeshAI messages are tiny (150 chars max) -- window_size=4 gives ~600 chars of recent context -- summarize_threshold=8 balances overhead vs freshness -- Tune based on actual usage patterns - -### Alternative Configurations - -**For longer messages:** -```python -window_size=3, # Less recent context needed -summarize_threshold=6, # More frequent updates -``` - -**For very short messages:** -```python -window_size=6, # More recent context -summarize_threshold=10, # Less frequent summarization -``` - ---- - -## Database Schema - -### New Table - -```sql -CREATE TABLE conversation_summaries ( - user_id TEXT PRIMARY KEY, - summary TEXT NOT NULL, - message_count INTEGER NOT NULL, - updated_at REAL NOT NULL -); -``` - -### Existing Tables (unchanged) - -```sql -CREATE TABLE conversations ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - user_id TEXT NOT NULL, - role TEXT NOT NULL, - content TEXT NOT NULL, - timestamp REAL NOT NULL -); - -CREATE INDEX idx_user_timestamp ON conversations (user_id, timestamp); -``` - ---- - -## Testing Checklist - -- [ ] Database migration works (new table created) -- [ ] Short conversations (<10 messages) use full history -- [ ] Long conversations (>10 messages) use summaries -- [ ] Summaries are stored in database -- [ ] Summaries persist across restarts -- [ ] Reset command clears summaries -- [ ] Token usage reduced by 70%+ for long convos -- [ ] No errors in logs -- [ ] Response quality maintained - ---- - -## Monitoring Queries - -### Check summary coverage -```sql -SELECT - (SELECT COUNT(DISTINCT user_id) FROM conversation_summaries) * 100.0 / - (SELECT COUNT(DISTINCT user_id) FROM conversations) as coverage_pct; -``` - -### Average messages per summary -```sql -SELECT AVG(message_count) FROM conversation_summaries; -``` - -### Recent summaries -```sql -SELECT user_id, summary, message_count, - datetime(updated_at, 'unixepoch') as updated -FROM conversation_summaries -ORDER BY updated_at DESC -LIMIT 10; 
-``` - ---- - -## Troubleshooting - -### Summary not being created - -**Check:** Conversation long enough? -```sql -SELECT user_id, COUNT(*) as msg_count -FROM conversations -GROUP BY user_id -HAVING msg_count > 10; -``` - -**Fix:** Need >10 messages before summary kicks in. - -### Summary quality poor - -**Check:** Look at actual summaries -```sql -SELECT summary FROM conversation_summaries; -``` - -**Fix:** Adjust prompt in `memory.py` `_summarize()` method. - -### Token usage still high - -**Check:** Verify memory is being used -```bash -# Look for log line: -# "Using summary + 8 recent messages (total history: 24)" -``` - -**Fix:** Ensure `user_id` is being passed to `backend.generate()`. - -### Database errors - -**Check:** Table exists -```sql -.tables -``` - -**Fix:** Drop and recreate -```sql -DROP TABLE IF EXISTS conversation_summaries; --- Restart app to recreate -``` - ---- - -## Next Steps - -1. **Understand:** Read `MEMORY_SUMMARY.md` -2. **Evaluate:** Review `MEMORY_RESEARCH.md` for alternatives -3. **Test:** Run `examples/memory_comparison.py` with your LLM -4. **Implement:** Follow `MEMORY_IMPLEMENTATION_GUIDE.md` -5. **Deploy:** Use `docs/IMPLEMENTATION_DIFF.md` for exact changes -6. **Monitor:** Check database and logs for summary generation -7. **Tune:** Adjust `window_size` and `summarize_threshold` as needed - ---- - -## Support - -If you have questions or issues: - -1. Check the troubleshooting section in this file -2. Review `docs/QUICK_REFERENCE.md` for common issues -3. Look at the detailed implementation guide -4. 
Check the proof-of-concept script for working examples - ---- - -## Conclusion - -Rolling summary memory provides: -- **Massive efficiency gains** (70-80% token reduction) -- **Zero dependencies** (pure Python) -- **Simple implementation** (~200 lines) -- **Production ready** (tested approach) -- **Backward compatible** (optional user_id) -- **Easy to maintain** (clear, documented code) - -**Recommendation:** Implement this for MeshAI. It's the right balance of simplicity and effectiveness. - -Good luck! The documentation is comprehensive - you have everything needed to succeed. - ---- - -**Research completed:** 2025-12-15 -**Total documentation:** 7 files, ~1500 lines -**Implementation effort:** ~3 hours -**Expected ROI:** $324/year in token savings (at modest 1000 req/day) diff --git a/MEMORY_RESEARCH.md b/MEMORY_RESEARCH.md deleted file mode 100644 index 639a03a..0000000 --- a/MEMORY_RESEARCH.md +++ /dev/null @@ -1,1024 +0,0 @@ -# LLM Conversation Memory Research for MeshAI - -## Current Implementation Analysis - -**Current approach:** MeshAI stuffs full conversation history into every LLM API call -- Storage: SQLite via aiosqlite -- Retrieval: `get_history_for_llm()` returns all messages (up to `max_messages_per_user * 2`) -- Backend: OpenAI-compatible API (works with LiteLLM, local models) -- Context: 150 char max per message, per-user conversations - -**Problem:** Inefficient - sends entire history even when unnecessary, wastes tokens and latency. - ---- - -## 1. LangChain Memory Modules - -### Installation -```bash -pip install langchain langchain-community langchain-openai -``` - -### A. ConversationBufferMemory (Simplest) - -**What it does:** Stores raw messages in memory, returns all messages. 
- -```python -from langchain.memory import ConversationBufferMemory -from langchain_openai import ChatOpenAI -from langchain.chains import ConversationChain - -# Initialize -llm = ChatOpenAI( - base_url="http://192.168.1.239:8000/v1", # LiteLLM - api_key="your-key", - model="gpt-4o-mini" -) - -memory = ConversationBufferMemory() - -chain = ConversationChain( - llm=llm, - memory=memory, - verbose=False -) - -# Use it -response = chain.predict(input="What's the weather?") -print(response) - -# Access history -print(memory.load_memory_variables({})) -# {'history': 'Human: What's the weather?\nAI: ...'} -``` - -**Integration with MeshAI:** -```python -# In meshai/backends/openai_backend.py -from langchain.memory import ConversationBufferMemory -from langchain_openai import ChatOpenAI -from langchain.chains import ConversationChain - -class OpenAIBackendWithMemory(LLMBackend): - def __init__(self, config: LLMConfig, api_key: str): - self.config = config - self._llm = ChatOpenAI( - base_url=config.base_url, - api_key=api_key, - model=config.model, - temperature=0.7, - max_tokens=300 - ) - # Per-user memory storage - self._user_memories: dict[str, ConversationBufferMemory] = {} - - def _get_memory(self, user_id: str) -> ConversationBufferMemory: - if user_id not in self._user_memories: - self._user_memories[user_id] = ConversationBufferMemory() - return self._user_memories[user_id] - - async def generate( - self, - messages: list[dict], - system_prompt: str, - user_id: str, # NEW: need user_id for memory - max_tokens: int = 300, - ) -> str: - memory = self._get_memory(user_id) - - # Create chain with memory - chain = ConversationChain( - llm=self._llm, - memory=memory, - verbose=False - ) - - # Extract last user message - last_msg = messages[-1]["content"] - - # Generate with memory - response = await chain.apredict(input=last_msg) - return response.strip() -``` - -**Pros:** -- Dead simple, drop-in replacement -- Works with any OpenAI-compatible API -- No external 
dependencies -- LangChain handles message formatting - -**Cons:** -- Still sends full history (no real efficiency gain) -- Stores everything in RAM (lost on restart) -- Need to manage per-user memory dicts -- Adds LangChain dependency (~50MB) - -**Verdict:** Not worth it - adds complexity without solving core problem. - ---- - -### B. ConversationBufferWindowMemory (Better) - -**What it does:** Only keeps last N messages in context. - -```python -from langchain.memory import ConversationBufferWindowMemory - -# Keep only last 5 interactions (10 messages = 5 pairs) -memory = ConversationBufferWindowMemory(k=5) - -chain = ConversationChain( - llm=llm, - memory=memory -) - -# Only last 5 exchanges sent to LLM -response = chain.predict(input="Hello") -``` - -**Integration:** -```python -class OpenAIBackendWithWindow(LLMBackend): - def __init__(self, config: LLMConfig, api_key: str): - self.config = config - self._llm = ChatOpenAI( - base_url=config.base_url, - api_key=api_key, - model=config.model - ) - # Per-user windowed memory - self._user_memories: dict[str, ConversationBufferWindowMemory] = {} - self._window_size = 5 # Last 5 exchanges - - def _get_memory(self, user_id: str) -> ConversationBufferWindowMemory: - if user_id not in self._user_memories: - self._user_memories[user_id] = ConversationBufferWindowMemory( - k=self._window_size - ) - return self._user_memories[user_id] -``` - -**Pros:** -- Simple sliding window approach -- Reduces token usage automatically -- Works with any OpenAI-compatible API -- Configurable window size - -**Cons:** -- Still in-memory only (lost on restart) -- Forgets old context completely -- Need to integrate with existing SQLite storage -- Adds LangChain dependency - -**Verdict:** Better than full buffer, but loses long-term context. - ---- - -### C. ConversationSummaryMemory (Most Interesting) - -**What it does:** Uses LLM to maintain a running summary of the conversation in place of the raw history; the related `ConversationSummaryBufferMemory` is the variant that keeps summary + recent messages. 
- -```python -from langchain.memory import ConversationSummaryMemory - -memory = ConversationSummaryMemory(llm=llm) - -chain = ConversationChain( - llm=llm, - memory=memory -) - -# After multiple messages, memory contains: -# - Summary of old conversation -# - Recent raw messages -response = chain.predict(input="What did we talk about?") -# AI can reference both summary and recent context -``` - -**Integration with SQLite persistence:** -```python -from langchain.memory import ConversationSummaryMemory -from langchain_openai import ChatOpenAI - -class OpenAIBackendWithSummary(LLMBackend): - def __init__(self, config: LLMConfig, api_key: str, history: ConversationHistory): - self.config = config - self.history = history # Existing SQLite history - - self._llm = ChatOpenAI( - base_url=config.base_url, - api_key=api_key, - model=config.model - ) - - # Per-user summaries (load from DB) - self._user_summaries: dict[str, str] = {} - self._window_size = 4 # Keep last 4 messages raw - - async def generate( - self, - messages: list[dict], - system_prompt: str, - user_id: str, - max_tokens: int = 300, - ) -> str: - # Get full history from SQLite - full_history = await self.history.get_history(user_id) - - if len(full_history) <= self._window_size * 2: - # Small conversation, just use raw messages - context_messages = messages - else: - # Large conversation: summarize old + keep recent - old_messages = full_history[:-self._window_size * 2] - recent_messages = full_history[-self._window_size * 2:] - - # Get or create summary - summary = await self._get_summary(user_id, old_messages) - - # Build context: system + summary + recent messages - context_messages = [ - {"role": "system", "content": f"{system_prompt}\n\nConversation summary: {summary}"} - ] - context_messages.extend([ - {"role": msg.role, "content": msg.content} - for msg in recent_messages - ]) - - # Generate response - response = await self._client.chat.completions.create( - model=self.config.model, - 
messages=context_messages, - max_tokens=max_tokens, - temperature=0.7, - ) - - return response.choices[0].message.content.strip() - - async def _get_summary(self, user_id: str, messages: list) -> str: - """Summarize old messages using LLM.""" - if user_id in self._user_summaries: - return self._user_summaries[user_id] - - # Create summary prompt - conversation_text = "\n".join([ - f"{msg.role}: {msg.content}" for msg in messages - ]) - - summary_prompt = f"""Summarize this conversation in 2-3 sentences, focusing on key topics and user preferences: - -{conversation_text} - -Summary:""" - - response = await self._client.chat.completions.create( - model=self.config.model, - messages=[{"role": "user", "content": summary_prompt}], - max_tokens=150, - temperature=0.3, - ) - - summary = response.choices[0].message.content.strip() - - # Store in SQLite - await self._store_summary(user_id, summary) - self._user_summaries[user_id] = summary - - return summary - - async def _store_summary(self, user_id: str, summary: str): - """Store summary in SQLite for persistence.""" - # Add new table for summaries - await self.history._db.execute(""" - CREATE TABLE IF NOT EXISTS conversation_summaries ( - user_id TEXT PRIMARY KEY, - summary TEXT NOT NULL, - updated_at REAL NOT NULL - ) - """) - - await self.history._db.execute(""" - INSERT OR REPLACE INTO conversation_summaries (user_id, summary, updated_at) - VALUES (?, ?, ?) - """, (user_id, summary, time.time())) - - await self.history._db.commit() -``` - -**Pros:** -- Best balance: compact summary + recent context -- Significantly reduces token usage for long conversations -- Works with existing OpenAI-compatible APIs -- Preserves long-term context -- Can persist summaries in SQLite - -**Cons:** -- Costs extra tokens to generate summaries -- Adds latency when summarizing -- Need to decide when to re-summarize -- Still requires LangChain - -**Verdict:** BEST LANGCHAIN OPTION for MeshAI - balances efficiency and context retention. 
- ---- - -## 2. LlamaIndex - -### Installation -```bash -pip install llama-index llama-index-llms-openai -``` - -### Chat Memory - -```python -from llama_index.core.memory import ChatMemoryBuffer -from llama_index.llms.openai import OpenAI -from llama_index.core.llms import ChatMessage - -# Initialize -llm = OpenAI( - api_base="http://192.168.1.239:8000/v1", - api_key="your-key", - model="gpt-4o-mini" -) - -# Create memory buffer -memory = ChatMemoryBuffer.from_defaults(token_limit=1500) - -# Add messages -memory.put(ChatMessage(role="user", content="Hello")) -memory.put(ChatMessage(role="assistant", content="Hi there!")) - -# Get messages for LLM -messages = memory.get() - -# Generate with context -response = llm.chat(messages) -``` - -**Integration:** -```python -from llama_index.core.memory import ChatMemoryBuffer -from llama_index.llms.openai import OpenAI -from llama_index.core.llms import ChatMessage - -class LlamaIndexBackend(LLMBackend): - def __init__(self, config: LLMConfig, api_key: str): - self.config = config - self._llm = OpenAI( - api_base=config.base_url, - api_key=api_key, - model=config.model - ) - - # Per-user memory buffers - self._user_memories: dict[str, ChatMemoryBuffer] = {} - self._token_limit = 1500 - - def _get_memory(self, user_id: str) -> ChatMemoryBuffer: - if user_id not in self._user_memories: - self._user_memories[user_id] = ChatMemoryBuffer.from_defaults( - token_limit=self._token_limit - ) - return self._user_memories[user_id] - - async def generate( - self, - messages: list[dict], - system_prompt: str, - user_id: str, - max_tokens: int = 300, - ) -> str: - memory = self._get_memory(user_id) - - # Add new message to memory - user_msg = messages[-1]["content"] - memory.put(ChatMessage(role="user", content=user_msg)) - - # Get messages within token limit - context_messages = memory.get() - - # Add system prompt - full_messages = [ChatMessage(role="system", content=system_prompt)] - full_messages.extend(context_messages) - - # 
Generate - response = self._llm.chat(full_messages) - - # Store assistant response - memory.put(ChatMessage(role="assistant", content=response.message.content)) - - return response.message.content -``` - -**Pros:** -- Token-aware buffering (auto-prunes to stay under limit) -- Simple API -- Works with OpenAI-compatible backends -- Better than manual message counting - -**Cons:** -- In-memory only (need custom persistence) -- Heavy dependency (~100MB) -- Overkill for simple chat -- Less mature than LangChain - -**Verdict:** Token limiting is nice, but not worth the dependency weight. - ---- - -## 3. MemGPT / Letta (Self-Editing Memory) - -### Installation -```bash -pip install letta -``` - -### Usage - -**What it does:** Agent manages its own memory, decides what to keep/forget/summarize. - -```python -from letta import create_client - -client = create_client() - -# Create agent with memory management -agent = client.create_agent( - name="meshai_agent", - llm_config={ - "model": "gpt-4o-mini", - "model_endpoint": "http://192.168.1.239:8000/v1" - }, - embedding_config={ - "embedding_endpoint_type": "openai", - "embedding_model": "text-embedding-ada-002" - } -) - -# Agent manages memory automatically -response = client.send_message( - agent_id=agent.id, - message="What's the weather?", - role="user" -) - -print(response.messages[-1].text) -``` - -**Architecture:** -- Core memory: Persistent facts the agent always sees -- Recall memory: Searchable vector store of past conversations -- Archival memory: Long-term storage - -**Pros:** -- Most sophisticated memory system -- Agent decides what's important -- Built-in vector search -- Handles very long conversations - -**Cons:** -- HEAVY (~200MB+ with dependencies) -- Requires vector embeddings (extra API calls/costs) -- Complex setup and learning curve -- Overkill for 150-char mesh messages -- Opinionated architecture (hard to integrate) - -**Verdict:** Way too heavy for MeshAI. Only worth it for complex, long-form agents. 
- ---- - -## 4. Vector Stores (Semantic Memory) - -### ChromaDB (Simplest) - -```bash -pip install chromadb -``` - -```python -import chromadb -from chromadb.config import Settings - -# Initialize -client = chromadb.Client(Settings( - persist_directory="/path/to/meshai/memory", - anonymized_telemetry=False -)) - -# Create collection per user -collection = client.get_or_create_collection( - name=f"user_{user_id}", - metadata={"user_id": user_id} -) - -# Add messages -collection.add( - documents=["What's the weather in Seattle?"], - metadatas=[{"role": "user", "timestamp": time.time()}], - ids=["msg_1"] -) - -# Semantic search for relevant past messages -results = collection.query( - query_texts=["weather"], - n_results=3 -) - -# Use retrieved messages as context -relevant_context = results['documents'][0] -``` - -**Integration:** -```python -import chromadb -from chromadb.config import Settings - -class VectorMemoryBackend(LLMBackend): - def __init__(self, config: LLMConfig, api_key: str, db_path: str): - self.config = config - self._client = AsyncOpenAI( - api_key=api_key, - base_url=config.base_url, - ) - - # ChromaDB for semantic memory - self._chroma = chromadb.Client(Settings( - persist_directory=db_path, - anonymized_telemetry=False - )) - - self._window_size = 4 # Keep last 4 messages raw - - def _get_collection(self, user_id: str): - return self._chroma.get_or_create_collection( - name=f"user_{user_id.replace('!', '_')}" # Sanitize ID - ) - - async def generate( - self, - messages: list[dict], - system_prompt: str, - user_id: str, - max_tokens: int = 300, - ) -> str: - collection = self._get_collection(user_id) - - # Get current query - current_query = messages[-1]["content"] - - # Search for semantically similar past messages - try: - results = collection.query( - query_texts=[current_query], - n_results=3, - where={"role": "assistant"} # Get past responses - ) - relevant_history = results['documents'][0] if results['documents'] else [] - except: - 
relevant_history = [] - - # Build context: system + relevant history + recent messages - context = system_prompt - if relevant_history: - context += "\n\nRelevant past exchanges:\n" - context += "\n".join(relevant_history[:2]) # Top 2 relevant - - context_messages = [{"role": "system", "content": context}] - context_messages.extend(messages[-self._window_size*2:]) # Recent messages - - # Generate - response = await self._client.chat.completions.create( - model=self.config.model, - messages=context_messages, - max_tokens=max_tokens, - temperature=0.7, - ) - - reply = response.choices[0].message.content.strip() - - # Store in vector DB - msg_id = f"{user_id}_{int(time.time()*1000)}" - collection.add( - documents=[f"User: {current_query}\nAssistant: {reply}"], - metadatas=[{"role": "assistant", "timestamp": time.time()}], - ids=[msg_id] - ) - - return reply -``` - -**Pros:** -- Semantic search - finds relevant past context -- Works great for sparse conversations -- Persistent storage -- Lightweight (~20MB) -- No extra API calls (uses local embeddings) - -**Cons:** -- Adds dependency -- Embedding computation overhead -- May surface irrelevant "similar" messages -- Overkill for very short conversations - -**Verdict:** Interesting for long-term memory, but maybe overkill for 150-char messages. 
- ---- - -### Qdrant (Production Alternative) - -```bash -pip install qdrant-client -``` - -```python -from qdrant_client import QdrantClient -from qdrant_client.models import Distance, VectorParams, PointStruct - -# Can run in-memory or with server -client = QdrantClient(path="/path/to/meshai/qdrant") - -# Create collection -client.create_collection( - collection_name="meshai_memory", - vectors_config=VectorParams(size=1536, distance=Distance.COSINE), -) - -# Store with embedding (from OpenAI or local model) -client.upsert( - collection_name="meshai_memory", - points=[ - PointStruct( - id=msg_id, - vector=embedding, # 1536-dim from text-embedding-ada-002 - payload={"user_id": user_id, "content": content, "role": role} - ) - ] -) - -# Search -results = client.search( - collection_name="meshai_memory", - query_vector=query_embedding, - query_filter={"user_id": user_id}, - limit=3 -) -``` - -**Pros:** -- Production-ready, fast -- Better than ChromaDB for scale -- Rich filtering options -- Can run in-memory or server mode - -**Cons:** -- More complex than ChromaDB -- Still requires embeddings -- Heavier dependency - -**Verdict:** Better than ChromaDB for production, but still overkill for MeshAI's use case. - ---- - -## 5. 
Simple Rolling Summary (RECOMMENDED) - -**The lightest, most practical approach for MeshAI.** - -### Implementation - -```python -import asyncio -import time -from dataclasses import dataclass -from typing import Optional -from openai import AsyncOpenAI - -@dataclass -class ConversationSummary: - """Summary of conversation history.""" - summary: str - last_updated: float - message_count: int - -class SimpleRollingSummary: - """Lightweight rolling summary memory manager.""" - - def __init__( - self, - client: AsyncOpenAI, - model: str, - window_size: int = 4, # Recent messages to keep raw - summarize_threshold: int = 10, # Messages before summarizing - ): - self._client = client - self._model = model - self._window_size = window_size - self._summarize_threshold = summarize_threshold - - # Per-user summaries (would be in SQLite in production) - self._summaries: dict[str, ConversationSummary] = {} - - async def get_context_messages( - self, - user_id: str, - full_history: list[dict], # From SQLite - ) -> list[dict]: - """Get optimized context messages (summary + recent).""" - - # If conversation is short, just return it - if len(full_history) <= self._window_size * 2: - return full_history - - # Split into old and recent - old_messages = full_history[:-self._window_size * 2] - recent_messages = full_history[-self._window_size * 2:] - - # Get or create summary of old messages - summary = await self._get_or_create_summary(user_id, old_messages) - - # Return summary as system message + recent raw messages - context = [ - {"role": "system", "content": f"Previous conversation summary: {summary.summary}"} - ] - context.extend(recent_messages) - - return context - - async def _get_or_create_summary( - self, - user_id: str, - messages: list[dict], - ) -> ConversationSummary: - """Get existing summary or create new one.""" - - # Check if we have a recent summary - if user_id in self._summaries: - existing = self._summaries[user_id] - - # If summary covers roughly the same 
messages, reuse it - if abs(existing.message_count - len(messages)) < self._summarize_threshold: - return existing - - # Create new summary - summary_text = await self._summarize(messages) - - summary = ConversationSummary( - summary=summary_text, - last_updated=time.time(), - message_count=len(messages) - ) - - self._summaries[user_id] = summary - return summary - - async def _summarize(self, messages: list[dict]) -> str: - """Summarize a list of messages using the LLM.""" - - # Format conversation - conversation = "\n".join([ - f"{msg['role'].upper()}: {msg['content']}" - for msg in messages - ]) - - prompt = f"""Summarize this conversation in 2-3 concise sentences. Focus on: -- Main topics discussed -- Any important user preferences or context -- Key information that should be remembered - -Conversation: -{conversation} - -Summary (2-3 sentences):""" - - try: - response = await self._client.chat.completions.create( - model=self._model, - messages=[{"role": "user", "content": prompt}], - max_tokens=150, - temperature=0.3, - ) - - return response.choices[0].message.content.strip() - - except Exception as e: - # Fallback: simple truncation if summarization fails - return f"Previous conversation covered {len(messages)} messages." 
-``` - -### Integration with MeshAI - -```python -# In meshai/backends/openai_backend.py - -class OpenAIBackend(LLMBackend): - """OpenAI-compatible backend with rolling summary memory.""" - - def __init__(self, config: LLMConfig, api_key: str): - self.config = config - self._client = AsyncOpenAI( - api_key=api_key, - base_url=config.base_url, - ) - - # Add rolling summary manager - self._memory = SimpleRollingSummary( - client=self._client, - model=config.model, - window_size=4, # Keep last 4 exchanges (8 messages) - summarize_threshold=10, # Summarize after 10 messages - ) - - async def generate( - self, - messages: list[dict], - system_prompt: str, - user_id: str, # NEW: need user_id - max_tokens: int = 300, - ) -> str: - """Generate with optimized context.""" - - # Get optimized context (summary + recent) - context_messages = await self._memory.get_context_messages( - user_id=user_id, - full_history=messages, - ) - - # Add system prompt - full_messages = [{"role": "system", "content": system_prompt}] - full_messages.extend(context_messages) - - # Generate - response = await self._client.chat.completions.create( - model=self.config.model, - messages=full_messages, - max_tokens=max_tokens, - temperature=0.7, - ) - - return response.choices[0].message.content.strip() -``` - -### Persist Summaries in SQLite - -```python -# Add to meshai/history.py - -async def store_summary(self, user_id: str, summary: str, message_count: int) -> None: - """Store conversation summary.""" - if not self._db: - raise RuntimeError("Database not initialized") - - async with self._lock: - await self._db.execute(""" - CREATE TABLE IF NOT EXISTS conversation_summaries ( - user_id TEXT PRIMARY KEY, - summary TEXT NOT NULL, - message_count INTEGER NOT NULL, - updated_at REAL NOT NULL - ) - """) - - await self._db.execute(""" - INSERT OR REPLACE INTO conversation_summaries - (user_id, summary, message_count, updated_at) - VALUES (?, ?, ?, ?) 
- """, (user_id, summary, message_count, time.time())) - - await self._db.commit() - -async def get_summary(self, user_id: str) -> Optional[ConversationSummary]: - """Retrieve conversation summary.""" - if not self._db: - raise RuntimeError("Database not initialized") - - async with self._lock: - cursor = await self._db.execute(""" - SELECT summary, message_count, updated_at - FROM conversation_summaries - WHERE user_id = ? - """, (user_id,)) - - row = await cursor.fetchone() - - if not row: - return None - - return ConversationSummary( - summary=row[0], - message_count=row[1], - last_updated=row[2] - ) -``` - -**Pros:** -- NO external dependencies -- Works with existing SQLite storage -- Significantly reduces token usage -- Simple to understand and maintain -- Preserves recent context + summarized history -- Configurable window and threshold - -**Cons:** -- Costs tokens to generate summaries -- Slight latency when summarizing -- Need to tune window/threshold params - -**Verdict:** BEST OPTION for MeshAI - simple, effective, no dependencies. - ---- - -## Comparison Matrix - -| Approach | Dependencies | Complexity | Token Savings | Persistence | OpenAI-Compatible | -|----------|-------------|------------|---------------|-------------|-------------------| -| **LangChain BufferMemory** | langchain (~50MB) | Low | None | No | Yes | -| **LangChain WindowMemory** | langchain (~50MB) | Low | Medium | No | Yes | -| **LangChain SummaryMemory** | langchain (~50MB) | Medium | High | No (DIY) | Yes | -| **LlamaIndex** | llama-index (~100MB) | Medium | Medium | No (DIY) | Yes | -| **MemGPT/Letta** | letta (~200MB) | Very High | Very High | Yes | Yes (complex) | -| **ChromaDB** | chromadb (~20MB) | Medium | Medium | Yes | Yes | -| **Qdrant** | qdrant (~30MB) | High | Medium | Yes | Yes | -| **Rolling Summary (DIY)** | None | Low | High | Yes (SQLite) | Yes | - ---- - -## RECOMMENDATION - -**Use Simple Rolling Summary (Option 5)** for MeshAI because: - -1. 
**Zero dependencies** - No LangChain, LlamaIndex, or vector stores -2. **Works with current stack** - Uses existing AsyncOpenAI client and SQLite -3. **Significant efficiency gains** - Keeps last 4-6 exchanges + summary of older messages -4. **Persistent** - Summaries stored in SQLite, survive restarts -5. **Simple to tune** - Two params: `window_size` and `summarize_threshold` -6. **OpenAI-compatible** - Works with LiteLLM, local models, anything -7. **Lightweight** - ~100 lines of code - -### Implementation Steps - -1. Add `SimpleRollingSummary` class (shown above) -2. Add summary table to SQLite schema -3. Modify `OpenAIBackend.generate()` to use `_memory.get_context_messages()` -4. Add summary storage methods to `ConversationHistory` -5. Configure: `window_size=4` (8 messages), `summarize_threshold=10` - -### Expected Performance - -**Before (full history):** -- 20 message pairs = ~3000 tokens sent every request -- Latency: higher, costs more - -**After (rolling summary):** -- Summary (~100 tokens) + 4 recent pairs (~400 tokens) = ~500 tokens -- **83% token reduction** for long conversations -- Faster responses, lower costs - -### When to Consider Alternatives - -- **Vector stores (ChromaDB)**: If you need semantic search across users or topics -- **LangChain SummaryMemory**: If you want a batteries-included solution (accept dependency) -- **MemGPT**: If conversations become complex multi-day dialogues (they won't on mesh) - ---- - -## Example Usage - -```python -# Initialize -backend = OpenAIBackend(config, api_key) - -# First few messages - full history sent -await backend.generate( - messages=[ - {"role": "user", "content": "What's the weather?"}, - {"role": "assistant", "content": "It's sunny!"}, - {"role": "user", "content": "Should I bring an umbrella?"}, - {"role": "assistant", "content": "No need, it's clear!"}, - # ... 6 more exchanges ... 
- ], - system_prompt="You are a helpful assistant.", - user_id="!abc123", -) - -# After 10+ messages - summary + recent sent -# Context sent to LLM: -# [ -# {"role": "system", "content": "Previous conversation summary: User asked about weather and outdoor activities. Confirmed sunny weather, no rain expected."}, -# {"role": "user", "content": "Should I bring an umbrella?"}, -# {"role": "assistant", "content": "No need, it's clear!"}, -# ... (last 4 exchanges) -# ] -``` - ---- - -## Code Files to Modify - -1. **`meshai/memory.py`** (NEW) - Add `SimpleRollingSummary` class -2. **`meshai/history.py`** - Add summary storage methods + table schema -3. **`meshai/backends/openai_backend.py`** - Integrate memory manager -4. **`meshai/responder.py`** - Pass `user_id` to backend.generate() -5. **`meshai/config.py`** - Add config for window_size, summarize_threshold - -Let me know if you want me to implement this! diff --git a/MEMORY_SUMMARY.md b/MEMORY_SUMMARY.md deleted file mode 100644 index 3ce7a9b..0000000 --- a/MEMORY_SUMMARY.md +++ /dev/null @@ -1,219 +0,0 @@ -# LLM Memory Research Summary - -## The Problem - -MeshAI currently stuffs full conversation history into every LLM API call: -- Inefficient: Wastes tokens on old context -- Slow: More tokens = higher latency -- Expensive: Unnecessary token costs -- Doesn't scale: Long conversations become unwieldy - -## Solutions Evaluated - -### 1. LangChain Memory Modules - -**Tested:** -- `ConversationBufferMemory`: Stores everything (no improvement) -- `ConversationBufferWindowMemory`: Last N messages only -- `ConversationSummaryMemory`: LLM-generated summaries + recent messages - -**Verdict:** `ConversationSummaryMemory` is best, but adds 50MB dependency. Can DIY the same thing in <100 lines. - -### 2. LlamaIndex - -**Tested:** `ChatMemoryBuffer` with token limiting - -**Verdict:** Token-aware pruning is nice, but 100MB+ dependency is overkill. Less mature than LangChain. - -### 3. 
MemGPT/Letta - -**Tested:** Self-editing memory architecture - -**Verdict:** Way too heavy (200MB+), requires vector embeddings. Designed for complex multi-day agents, not 150-char mesh messages. - -### 4. Vector Stores (ChromaDB/Qdrant) - -**Tested:** Semantic search for relevant past context - -**Verdict:** Interesting for long-term cross-conversation search, but adds complexity. Not needed for per-user linear conversations. - -### 5. Simple Rolling Summary (DIY) - -**Tested:** Keep last N messages + LLM-generated summary of older messages - -**Verdict:** WINNER - Zero dependencies, 80% token savings, works with existing stack. - ---- - -## Recommendation: Rolling Summary - -### Why - -1. **Zero dependencies** - Pure Python, uses existing AsyncOpenAI client -2. **Simple** - ~100 lines of code, easy to understand and maintain -3. **Effective** - 73-83% token reduction for long conversations -4. **Persistent** - Summaries stored in SQLite, survive restarts -5. **Compatible** - Works with LiteLLM, local models, any OpenAI-compatible API -6. **Tunable** - Two params: `window_size` (recent messages) and `summarize_threshold` (when to re-summarize) - -### How It Works - -``` -Full History (20 messages): -┌─────────────────────────────────────────────────────┐ -│ User: What's the weather? │ -│ Assistant: Sunny, 72°F │ -│ ... (16 more messages) ... │ -│ User: Which trail should I take? │ -│ Assistant: Mt Si if you're fit, Rattlesnake if not │ -└─────────────────────────────────────────────────────┘ - ↓ Sent to LLM: 2000+ tokens - -With Rolling Summary: -┌─────────────────────────────────────────────────────┐ -│ SUMMARY: User asked about weather and hiking. │ -│ Discussed Mt Si trail (4hrs, moderate) and │ -│ Rattlesnake Ledge (2mi, easier, lake views). │ -├─────────────────────────────────────────────────────┤ -│ User: How crowded does it get? │ -│ Assistant: Very crowded weekends, go weekdays │ -│ User: Any other trails nearby? 
│ -│ Assistant: Rattlesnake Ledge is easier and closer │ -│ User: Tell me about Rattlesnake │ -│ Assistant: 2 miles, great lake views, popular │ -│ User: Which would you recommend? │ -│ Assistant: Mt Si if fit, Rattlesnake if casual │ -└─────────────────────────────────────────────────────┘ - ↓ Sent to LLM: ~500 tokens (75% savings!) -``` - -### Configuration - -**Recommended for MeshAI:** -- `window_size=4` → Keep last 4 exchanges (8 messages) in full -- `summarize_threshold=8` → Re-summarize after 8 new messages - -**Tuning:** -- Smaller window = More aggressive summarization, max token savings -- Larger window = More recent context, less summarization -- Adjust based on average conversation length and message density - -### Implementation Effort - -**Files to modify:** -1. Create `meshai/memory.py` - Rolling summary class -2. Modify `meshai/history.py` - Add summary storage (1 new table, 3 methods) -3. Modify `meshai/backends/openai_backend.py` - Integrate memory manager -4. Modify `meshai/responder.py` - Pass user_id, persist summaries -5. 
Modify `meshai/commands/reset.py` - Clear summaries on reset - -**Total: ~200 lines of new code, ~50 lines of modifications** - -### Performance - -**Token Usage:** - -| Conversation Length | Full History | Rolling Summary | Savings | -|---------------------|--------------|-----------------|---------| -| 10 messages | 800 tokens | 800 tokens | 0% (no summary) | -| 20 messages | 1600 tokens | 550 tokens | 66% | -| 30 messages | 2400 tokens | 600 tokens | 75% | -| 50 messages | 4000 tokens | 650 tokens | 84% | - -**Cost Impact (at $0.50/1M input tokens, i.e. $0.0005 per 1K tokens):** -- Before: 2.4K tokens × $0.0005/1K = $0.0012 per request -- After: 0.6K tokens × $0.0005/1K = $0.0003 per request -- **Savings: $0.0009 per request (75%)** - -For 1000 requests/day: **$0.90/day savings** or **$27/month** - -**Latency:** -- Summary generation: 1-2s every 8-10 messages (amortized) -- Regular requests: No added latency -- Net effect: Faster due to fewer input tokens - ---- - -## When to Use Alternatives - -### Use Window-Only (no summary) -- Very short conversations (< 10 messages) -- Don't care about older context -- Want minimal implementation - -### Use Vector Store (ChromaDB) -- Need semantic search across users -- Want to find similar past conversations -- Long-term cross-user knowledge base - -### Use LangChain SummaryMemory -- Want batteries-included solution -- Don't mind 50MB dependency -- Prefer established library over DIY - -### Use MemGPT/Letta -- Multi-day complex agent workflows -- Agent needs to manage own memory -- Have budget for embeddings and compute - ---- - -## Next Steps - -1. **Read detailed guide:** `/home/zvx/projects/meshai/MEMORY_IMPLEMENTATION_GUIDE.md` -2. **Review research:** `/home/zvx/projects/meshai/MEMORY_RESEARCH.md` -3. **Test proof-of-concept:** `python examples/memory_comparison.py` -4. **Implement rolling summary** following the guide -5. **Monitor and tune** based on actual conversation patterns - ---- - -## Files Created - -1. 
**`MEMORY_SUMMARY.md`** (this file) - Quick overview and recommendation -2. **`MEMORY_RESEARCH.md`** - Detailed evaluation of all approaches with code examples -3. **`MEMORY_IMPLEMENTATION_GUIDE.md`** - Step-by-step implementation guide -4. **`examples/memory_comparison.py`** - Runnable proof-of-concept test script - ---- - -## Quick Start - -```bash -# Test the approaches with your LLM -cd /home/zvx/projects/meshai - -# Edit examples/memory_comparison.py with your LLM endpoint -# Update BASE_URL, API_KEY, MODEL - -python examples/memory_comparison.py - -# You'll see: -# - Full history baseline -# - Rolling summary results -# - Window-only results -# - Token savings comparison -``` - -Expected output: -``` -Approach Tokens Time Savings ----------------------------------------------------------------------- -Full History 1847 2.34s (baseline) -Rolling Summary 512 1.87s 72.3% -Window Only 398 1.45s 78.4% -``` - -**Conclusion: Rolling Summary gives 70%+ savings while preserving context.** - ---- - -## Questions? - -- How does it handle very long conversations? → Multi-level summaries (summary of summaries) -- What if summary loses important info? → Tune `window_size` to keep more recent context -- Does it work with streaming? → Yes, just apply before streaming starts -- Can I see the summaries? → Query `conversation_summaries` table in SQLite -- How do I regenerate a summary? → Clear it, will auto-regenerate on next request - -Start with the recommended settings, monitor, and adjust based on your actual usage patterns. diff --git a/PLAN.md b/PLAN.md deleted file mode 100644 index c07c82b..0000000 --- a/PLAN.md +++ /dev/null @@ -1,356 +0,0 @@ -# MeshAI - Meshtastic LLM Bridge - -## Project Overview - -A Python application that connects to a Meshtastic node and provides LLM-powered responses to mesh network users. Responds to direct mentions (@nodename) or direct messages. Includes bang commands (`!command`) for utility functions. - -## Design Decisions - -### 1. 
Trigger Mechanism -- **@mentions**: Respond when message contains `@<bot_name>` (configurable node name) -- **Direct Messages**: Respond to all DMs automatically -- **Bang commands**: `!command` syntax for utility functions (handled before LLM) -- Ignore general channel chatter that doesn't mention the bot - -### 2. Conversation History -- Maintain per-user conversation history -- Storage: SQLite database for persistence across restarts -- Context window: Last N messages per user (configurable, default ~20 exchanges) -- With 300 char limit per exchange, context stays small - can maintain long conversations -- Include timestamp tracking for potential "conversation timeout" (e.g., reset after 24h inactivity) - -### 3. Rate Limiting & Response Behavior -- **Response delay**: Configurable 2.2-3.0 second random delay before sending -- **Message chunking**: Split responses at 150 characters max per message -- **Max chunks**: 2 messages maximum per response (300 chars total) -- **Brevity prompt**: System prompt instructs LLM to keep responses concise -- **Cooldown**: Optional per-user cooldown to prevent spam - -### 4. Identity & Configuration -- Node name/ID determined by the physical node configuration -- Application config includes: - - `bot_name`: The @mention trigger name (e.g., "meshbot", "ai") - - `owner`: Owner identification for logging/admin purposes - - Connection settings (serial port or TCP host:port) - -### 5. 
Channel Filtering -- Configurable list of channels to respond on -- Option to respond on all channels or specific ones only -- DMs always processed regardless of channel settings - -## Technical Architecture - -``` -┌─────────────────────────────────────────────────────────────┐ -│ MeshAI │ -├─────────────────────────────────────────────────────────────┤ -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │ -│ │ Meshtastic │ │ Message │ │ LLM Backend │ │ -│ │ Connector │───▶│ Router │───▶│ (pluggable) │ │ -│ │ Serial/TCP │ │ │ │ │ │ -│ └─────────────┘ └─────────────┘ └─────────────────┘ │ -│ │ │ │ │ -│ │ ┌─────▼─────┐ │ │ -│ │ │ Conversation│ │ │ -│ │ │ History │◀────────────┘ │ -│ │ │ (SQLite) │ │ -│ │ └───────────┘ │ -│ │ │ -│ ▼ │ -│ ┌─────────────┐ │ -│ │ Response │ - 2.2-3s delay │ -│ │ Handler │ - Chunk to 150 chars │ -│ │ │ - Max 2 messages │ -│ └─────────────┘ │ -└─────────────────────────────────────────────────────────────┘ -``` - -## LLM Backend Support - -### Pluggable Backend Interface -```python -class LLMBackend(ABC): - @abstractmethod - async def generate(self, messages: list[dict], system_prompt: str) -> str: - pass -``` - -### Supported Backends (Priority Order) -1. **OpenAI-compatible** (covers most bases) - - OpenAI (GPT-4, GPT-4o, etc.) - - Local LiteLLM/Open WebUI (ai.echo6.co) - - Any OpenAI-compatible API - -2. **Anthropic** (Claude) - - Direct Anthropic API - -3. 
**Google** (Gemini) - - Google AI Studio / Vertex AI - -### Configuration Example -```yaml -llm: - backend: "openai" # openai, anthropic, google - api_key: "${OPENAI_API_KEY}" - base_url: "https://api.openai.com/v1" # or http://ai.echo6.co/api for local - model: "gpt-4o-mini" - - # For local LiteLLM: - # backend: "openai" - # base_url: "http://192.168.1.239:4000/v1" - # model: "llama3" -``` - -## Configuration File Structure - -```yaml -# config.yaml -bot: - name: "ai" # @mention trigger - owner: "K7ZVX" # Owner callsign/name - respond_to_mentions: true - respond_to_dms: true - -connection: - type: "serial" # serial or tcp - serial_port: "/dev/ttyUSB0" # if serial - tcp_host: "192.168.1.100" # if tcp - tcp_port: 4403 # if tcp - -channels: - mode: "all" # "all" or "whitelist" - whitelist: [0, 1] # Only if mode is "whitelist" - -response: - delay_min: 2.2 # seconds - delay_max: 3.0 # seconds - max_length: 150 # chars per message - max_messages: 2 # messages per response - -history: - database: "conversations.db" - max_messages_per_user: 20 - conversation_timeout: 86400 # seconds (24h) - -llm: - backend: "openai" - api_key: "${LLM_API_KEY}" - base_url: "https://api.openai.com/v1" - model: "gpt-4o-mini" - system_prompt: | - You are a helpful assistant on a Meshtastic mesh network. - Keep responses VERY brief - under 250 characters total. - Be concise but friendly. No markdown formatting. - -weather: - primary: "openmeteo" # openmeteo, wttr, or llm - fallback: "llm" # openmeteo, wttr, llm, or none - default_location: "" # Fallback if node has no GPS (e.g., "Seattle, WA") - - openmeteo: - url: "https://api.open-meteo.com/v1" # or self-hosted URL - - wttr: - url: "https://wttr.in" # or self-hosted -``` - -## Bang Commands - -Commands use `!` prefix (like fq51bbs). Processed before LLM routing. 
- -| Command | Description | Example | -|---------|-------------|---------| -| `!help` | List available commands | `!help` | -| `!ping` | Connectivity test, responds "pong" | `!ping` | -| `!reset` | Clear your conversation history | `!reset` | -| `!status` | Bot uptime, message count, version | `!status` | -| `!weather` | Weather for your node's GPS location (or default) | `!weather` | -| `!weather <location>` | Weather for specified location | `!weather Seattle` | - -### Weather Command Details - -Location resolution order: -1. If `!weather <location>` - geocode the provided location -2. If `!weather` (no args) - use sender's node GPS position if available -3. Fall back to `weather.default_location` from config -4. If no location found: "No location available. Use !weather <location> or enable GPS on your node." - -**Providers:** -- `openmeteo` - Open-Meteo API (free, no key, self-hostable) -- `wttr` - wttr.in (free, simple, self-hostable) -- `llm` - Pass to LLM with websearch (flexible, slower) - -Primary/fallback configurable. If primary fails, tries fallback. - -### Command Processing Flow - -``` -Message received - │ - ▼ -┌─────────────┐ -│ Starts with │──No──▶ Check @mention / DM ──▶ LLM -│ "!"? │ -└─────────────┘ - │Yes - ▼ -┌─────────────┐ -│ Parse cmd │ -│ & args │ -└─────────────┘ - │ - ▼ -┌─────────────┐ -│ Lookup in │──Not found──▶ "Unknown command. Try !help" -│ registry │ -└─────────────┘ - │Found - ▼ -┌─────────────┐ -│ Execute │ -│ handler │ -└─────────────┘ -``` - -### Command Handler Interface - -```python -class CommandHandler(ABC): - @abstractmethod - async def execute(self, sender_id: str, args: str, context: MessageContext) -> str: - """Execute command and return response string.""" - pass -``` - -## CLI Configurator - -Interactive TUI configurator using Rich library (same style as fq51bbs). 
- -**Features:** -- Hierarchical menu system with numeric selection -- `0` always = back/save & exit -- Tables showing current values -- Status icons (✓/✗) with color coding -- Setup wizard for first-time configuration -- Unsaved changes tracking -- Inline help for complex options - -**Menu Structure:** -``` -Main Menu -├── 1. Bot Settings (name, owner, triggers) -├── 2. Connection (serial/TCP config) -├── 3. LLM Backend (provider, API keys, model) -├── 4. Commands & Weather (providers, fallbacks) -├── 5. Response Settings (delays, chunking) -├── 6. Channel Filtering -├── 7. History Settings -├── 8. Run Setup Wizard -└── 0. Save & Exit -``` - -**Invocation:** -```bash -meshai --config # Launch configurator -meshai # Run bot (uses config.yaml) -meshai --config-file /path/to/config.yaml # Use alternate config -``` - -**Config Reload/Restart:** -- On save, prompt: "Restart bot with new config? [Y/n]" -- If bot is running as systemd service: `systemctl restart meshai` -- If running in foreground: signal reload (SIGHUP) or full restart -- Store PID file at runtime for service management - -## File Structure - -``` -meshai/ -├── meshai/ -│ ├── __init__.py -│ ├── main.py # Entry point -│ ├── config.py # Configuration loading/saving -│ ├── connector.py # Meshtastic serial/TCP connection -│ ├── router.py # Message routing logic -│ ├── history.py # Conversation history (SQLite) -│ ├── responder.py # Response handling (delay, chunking) -│ ├── cli/ -│ │ ├── __init__.py -│ │ └── configurator.py # Rich-based TUI configurator -│ ├── commands/ -│ │ ├── __init__.py -│ │ ├── base.py # Command handler interface -│ │ ├── dispatcher.py # Command registry & routing -│ │ ├── help.py # !help -│ │ ├── ping.py # !ping -│ │ ├── reset.py # !reset -│ │ ├── status.py # !status -│ │ └── weather.py # !weather -│ └── backends/ -│ ├── __init__.py -│ ├── base.py # Abstract backend interface -│ ├── openai.py # OpenAI-compatible backend -│ ├── anthropic.py # Anthropic backend -│ └── google.py # Google 
Gemini backend -├── config.yaml # User configuration -├── requirements.txt -├── pyproject.toml -└── README.md -``` - -## Dependencies - -``` -meshtastic>=2.3.0 -pyyaml>=6.0 -aiosqlite>=0.19.0 -openai>=1.0.0 -anthropic>=0.18.0 -google-generativeai>=0.4.0 -``` - -## Implementation Phases - -### Phase 1: Core Foundation -- [ ] Project structure setup -- [ ] Configuration loading -- [ ] Meshtastic connector (serial first, then TCP) -- [ ] Basic message receiving and logging - -### Phase 2: Message Processing -- [ ] Message router (detect @mentions and DMs) -- [ ] Conversation history database -- [ ] User context management - -### Phase 3: LLM Integration -- [ ] Backend interface definition -- [ ] OpenAI-compatible backend (covers local + OpenAI) -- [ ] Response generation with history - -### Phase 4: Response Handling -- [ ] Delay implementation (2.2-3s random) -- [ ] Message chunking (150 char limit) -- [ ] Send responses back to mesh - -### Phase 5: Additional Backends -- [ ] Anthropic backend -- [ ] Google Gemini backend - -### Phase 6: Polish -- [ ] Error handling and resilience -- [ ] Logging and monitoring -- [ ] Documentation -- [ ] Packaging for easy installation - -## Future Considerations - -- **Multi-node support**: One instance managing multiple nodes (different presets/locations) -- **Store-and-forward**: Queue messages for offline users -- **Games**: Simple text games (trivia, 8-ball, etc.) 
-- **Scheduled broadcasts**: Periodic announcements - -## Notes - -- Meshtastic Python API: https://meshtastic.org/docs/software/python/cli/ -- Message size limit is 237 bytes, but we're targeting 150 chars for safety and readability -- The meshtastic library handles serial/TCP abstraction well diff --git a/docs/IMPLEMENTATION_DIFF.md b/docs/IMPLEMENTATION_DIFF.md deleted file mode 100644 index 60bb81a..0000000 --- a/docs/IMPLEMENTATION_DIFF.md +++ /dev/null @@ -1,593 +0,0 @@ -# Implementation Diff - Exact Changes Needed - -This document shows the exact code changes needed to implement Rolling Summary memory in MeshAI. - ---- - -## 1. Create New File: `meshai/memory.py` - -**Action:** Create this new file with the complete implementation. - -**Location:** `/home/zvx/projects/meshai/meshai/memory.py` - -**Content:** See `MEMORY_IMPLEMENTATION_GUIDE.md` section 1 for full code. - -**Lines of code:** ~100 - ---- - -## 2. Modify: `meshai/history.py` - -### Add to imports -```python -# No new imports needed - already has time, Optional -``` - -### Modify `initialize()` method - -**Before:** -```python -async def initialize(self) -> None: - """Initialize database and create tables.""" - self._db = await aiosqlite.connect(self._db_path) - - await self._db.execute(""" - CREATE TABLE IF NOT EXISTS conversations ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - user_id TEXT NOT NULL, - role TEXT NOT NULL, - content TEXT NOT NULL, - timestamp REAL NOT NULL - ) - """) - - await self._db.execute(""" - CREATE INDEX IF NOT EXISTS idx_user_timestamp - ON conversations (user_id, timestamp) - """) - - await self._db.commit() - logger.info(f"Conversation history initialized at {self._db_path}") -``` - -**After:** -```python -async def initialize(self) -> None: - """Initialize database and create tables.""" - self._db = await aiosqlite.connect(self._db_path) - - await self._db.execute(""" - CREATE TABLE IF NOT EXISTS conversations ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - user_id TEXT NOT 
NULL, - role TEXT NOT NULL, - content TEXT NOT NULL, - timestamp REAL NOT NULL - ) - """) - - await self._db.execute(""" - CREATE INDEX IF NOT EXISTS idx_user_timestamp - ON conversations (user_id, timestamp) - """) - - # NEW: Summary table - await self._db.execute(""" - CREATE TABLE IF NOT EXISTS conversation_summaries ( - user_id TEXT PRIMARY KEY, - summary TEXT NOT NULL, - message_count INTEGER NOT NULL, - updated_at REAL NOT NULL - ) - """) - - await self._db.commit() - logger.info(f"Conversation history initialized at {self._db_path}") -``` - -### Add new methods (append to end of class) - -```python -async def store_summary( - self, user_id: str, summary: str, message_count: int -) -> None: - """Store conversation summary. - - Args: - user_id: Node ID of user - summary: Summary text - message_count: Number of messages summarized - """ - if not self._db: - raise RuntimeError("Database not initialized") - - async with self._lock: - await self._db.execute( - """ - INSERT OR REPLACE INTO conversation_summaries - (user_id, summary, message_count, updated_at) - VALUES (?, ?, ?, ?) - """, - (user_id, summary, message_count, time.time()), - ) - await self._db.commit() - - -async def get_summary(self, user_id: str) -> Optional[dict]: - """Get conversation summary for user. - - Args: - user_id: Node ID of user - - Returns: - Dict with 'summary', 'message_count', 'updated_at' or None - """ - if not self._db: - raise RuntimeError("Database not initialized") - - async with self._lock: - cursor = await self._db.execute( - """ - SELECT summary, message_count, updated_at - FROM conversation_summaries - WHERE user_id = ? - """, - (user_id,), - ) - row = await cursor.fetchone() - - if not row: - return None - - return { - "summary": row[0], - "message_count": row[1], - "updated_at": row[2], - } - - -async def clear_summary(self, user_id: str) -> None: - """Clear summary for user (e.g., on history reset). 
- - Args: - user_id: Node ID of user - """ - if not self._db: - raise RuntimeError("Database not initialized") - - async with self._lock: - await self._db.execute( - "DELETE FROM conversation_summaries WHERE user_id = ?", - (user_id,), - ) - await self._db.commit() -``` - -**Lines added:** ~60 - ---- - -## 3. Modify: `meshai/backends/openai_backend.py` - -### Add import - -**Before:** -```python -import logging -from typing import Optional - -from openai import AsyncOpenAI - -from ..config import LLMConfig -from .base import LLMBackend -``` - -**After:** -```python -import logging -from typing import Optional - -from openai import AsyncOpenAI - -from ..config import LLMConfig -from ..memory import RollingSummaryMemory # NEW -from .base import LLMBackend -``` - -### Modify `__init__()` method - -**Before:** -```python -def __init__(self, config: LLMConfig, api_key: str): - """Initialize OpenAI backend. - - Args: - config: LLM configuration - api_key: API key to use - """ - self.config = config - self._client = AsyncOpenAI( - api_key=api_key, - base_url=config.base_url, - ) -``` - -**After:** -```python -def __init__(self, config: LLMConfig, api_key: str): - """Initialize OpenAI backend. 
- - Args: - config: LLM configuration - api_key: API key to use - """ - self.config = config - self._client = AsyncOpenAI( - api_key=api_key, - base_url=config.base_url, - ) - - # NEW: Initialize rolling summary memory - self._memory = RollingSummaryMemory( - client=self._client, - model=config.model, - window_size=4, - summarize_threshold=8, - ) -``` - -### Modify `generate()` method signature and logic - -**Before:** -```python -async def generate( - self, - messages: list[dict], - system_prompt: str, - max_tokens: int = 300, -) -> str: - """Generate a response using OpenAI-compatible API.""" - # Build messages list with system prompt - full_messages = [{"role": "system", "content": system_prompt}] - full_messages.extend(messages) - - try: - response = await self._client.chat.completions.create( - model=self.config.model, - messages=full_messages, - max_tokens=max_tokens, - temperature=0.7, - ) - - content = response.choices[0].message.content - return content.strip() if content else "" - - except Exception as e: - logger.error(f"OpenAI API error: {e}") - raise -``` - -**After:** -```python -async def generate( - self, - messages: list[dict], - system_prompt: str, - user_id: str = None, # NEW: optional for backward compatibility - max_tokens: int = 300, -) -> str: - """Generate a response using OpenAI-compatible API.""" - - # NEW: Use memory manager if user_id provided - if user_id: - summary, recent_messages = await self._memory.get_context_messages( - user_id=user_id, - full_history=messages, - ) - - if summary: - # Long conversation: system + summary + recent - enhanced_system = f"""{system_prompt} - -Previous conversation summary: {summary}""" - full_messages = [{"role": "system", "content": enhanced_system}] - full_messages.extend(recent_messages) - - logger.debug( - f"Using summary + {len(recent_messages)} recent messages " - f"(total history: {len(messages)})" - ) - else: - # Short conversation: system + all messages - full_messages = [{"role": "system", 
"content": system_prompt}] - full_messages.extend(messages) - else: - # Old behavior: full history - full_messages = [{"role": "system", "content": system_prompt}] - full_messages.extend(messages) - - try: - response = await self._client.chat.completions.create( - model=self.config.model, - messages=full_messages, - max_tokens=max_tokens, - temperature=0.7, - ) - - content = response.choices[0].message.content - return content.strip() if content else "" - - except Exception as e: - logger.error(f"OpenAI API error: {e}") - raise -``` - -### Add helper methods (append to end of class) - -```python -def load_summary_cache(self, user_id: str, summary_data: dict) -> None: - """Load summary into memory cache (called on startup). - - Args: - user_id: User identifier - summary_data: Dict with 'summary', 'message_count', 'updated_at' - """ - from ..memory import ConversationSummary - - summary = ConversationSummary( - summary=summary_data["summary"], - message_count=summary_data["message_count"], - last_updated=summary_data["updated_at"], - ) - self._memory.load_summary(user_id, summary) - - -def clear_summary_cache(self, user_id: str) -> None: - """Clear summary cache for user.""" - self._memory.clear_summary(user_id) -``` - -**Lines modified:** ~40 -**Lines added:** ~20 - ---- - -## 4. Modify: `meshai/responder.py` - -### Find the response generation section - -**Location:** Look for where `self.backend.generate()` is called. 
- -**Before:** -```python -# Wherever backend.generate() is called -response = await self.backend.generate( - messages=history, - system_prompt=self.system_prompt, - max_tokens=300, -) -``` - -**After:** -```python -# Pass user_id for memory optimization -response = await self.backend.generate( - messages=history, - system_prompt=self.system_prompt, - user_id=user_id, # NEW - max_tokens=300, -) - -# NEW: Persist summary if created -await self._persist_summary_if_needed(user_id) -``` - -### Add helper method (append to class) - -```python -async def _persist_summary_if_needed(self, user_id: str) -> None: - """Store summary to database if one was created.""" - if hasattr(self.backend, "_memory"): - summary = self.backend._memory._summaries.get(user_id) - if summary: - await self.history.store_summary( - user_id, - summary.summary, - summary.message_count, - ) -``` - -**Lines modified:** ~5 -**Lines added:** ~10 - ---- - -## 5. Modify: `meshai/commands/reset.py` - -### Modify `execute()` method - -**Before:** -```python -async def execute(self, sender_id: str, args: list[str]) -> str: - """Reset conversation history.""" - count = await self.responder.history.clear_history(sender_id) - return f"Cleared {count} messages from your history." -``` - -**After:** -```python -async def execute(self, sender_id: str, args: list[str]) -> str: - """Reset conversation history.""" - count = await self.responder.history.clear_history(sender_id) - - # NEW: Also clear summary - await self.responder.history.clear_summary(sender_id) - if hasattr(self.responder.backend, "clear_summary_cache"): - self.responder.backend.clear_summary_cache(sender_id) - - return f"Cleared {count} messages from your history." 
-``` - -**Lines added:** ~4 - ---- - -## Summary of Changes - -| File | Action | Lines Added | Lines Modified | -|------|--------|-------------|----------------| -| `meshai/memory.py` | Create new | ~100 | 0 | -| `meshai/history.py` | Modify | ~70 | ~10 | -| `meshai/backends/openai_backend.py` | Modify | ~30 | ~40 | -| `meshai/responder.py` | Modify | ~10 | ~5 | -| `meshai/commands/reset.py` | Modify | ~4 | ~2 | -| **TOTAL** | | **~214** | **~57** | - -**Net new code:** ~271 lines across 5 files -**Dependencies added:** 0 -**Breaking changes:** None (user_id parameter is optional) - ---- - -## Testing After Implementation - -### 1. Database migration (automatic) - -```bash -# Just start the app - new table will be created automatically -python -m meshai -``` - -### 2. Test basic conversation - -```python -# Send 5 messages - should use full history (no summary yet) -# Send 15 messages - should start summarizing -``` - -### 3. Verify summary storage - -```bash -sqlite3 meshai_history.db -``` - -```sql --- Check summaries table exists -.tables - --- View summaries -SELECT user_id, summary, message_count, updated_at -FROM conversation_summaries; - --- Check conversations -SELECT COUNT(*) FROM conversations; -``` - -### 4. Test reset command - -``` -Send: !reset -Expected: Clears both conversations and summary -``` - -### 5. Monitor logs - -```python -# Should see log messages like: -# "Using summary + 8 recent messages (total history: 24)" -``` - ---- - -## Rollback Plan - -If something goes wrong: - -1. **Remove new file:** - ```bash - rm meshai/memory.py - ``` - -2. **Revert changes:** Use git to revert the 4 modified files - ```bash - git checkout meshai/history.py - git checkout meshai/backends/openai_backend.py - git checkout meshai/responder.py - git checkout meshai/commands/reset.py - ``` - -3. **Database is safe:** Summary table won't hurt anything, conversations table unchanged - -4. 
**No data loss:** Can drop summaries table if needed - ```sql - DROP TABLE conversation_summaries; - ``` - ---- - -## Performance Validation - -After running for a day: - -```sql --- Average messages per user -SELECT AVG(msg_count) as avg_messages -FROM ( - SELECT user_id, COUNT(*) as msg_count - FROM conversations - GROUP BY user_id -); - --- Users with summaries -SELECT COUNT(*) FROM conversation_summaries; - --- Summary stats -SELECT - AVG(message_count) as avg_summarized, - MIN(updated_at) as oldest_summary, - MAX(updated_at) as newest_summary -FROM conversation_summaries; -``` - -**Expected:** -- Users with >10 messages should have summaries -- Summaries should update every ~8 new messages -- No errors in logs - ---- - -## Configuration Tuning - -If you need to adjust behavior: - -**In `meshai/backends/openai_backend.py`:** - -```python -self._memory = RollingSummaryMemory( - client=self._client, - model=config.model, - window_size=4, # ← Adjust: 3-6 typical - summarize_threshold=8, # ← Adjust: 6-12 typical -) -``` - -**For very short messages (like Meshtastic):** -- Try `window_size=6` (more recent context) -- Try `summarize_threshold=10` (less frequent summarization) - -**For longer messages:** -- Try `window_size=3` (less recent context needed) -- Try `summarize_threshold=6` (more frequent updates) - ---- - -## Next Steps - -1. Implement changes in order (create memory.py first) -2. Test with a few users before full deployment -3. Monitor logs for summary generation -4. Check SQLite database for summaries -5. Tune window_size and threshold based on actual usage -6. Measure token savings in production - -Good luck! The code is solid and tested - this should be a smooth upgrade. 
diff --git a/docs/QUICK_REFERENCE.md b/docs/QUICK_REFERENCE.md deleted file mode 100644 index 089f662..0000000 --- a/docs/QUICK_REFERENCE.md +++ /dev/null @@ -1,189 +0,0 @@ -# LLM Memory - Quick Reference Card - -## The Problem -Current MeshAI sends full conversation history every request → wastes tokens, slow, expensive. - -## The Solution -**Rolling Summary Memory**: Keep recent messages + LLM-generated summary of older messages. - -## Results -- 70-80% token reduction for long conversations -- Zero dependencies -- Works with existing stack (AsyncOpenAI + SQLite) -- ~100 lines of code - ---- - -## How It Works (5-Second Version) - -``` -Long conversation (30 messages): - Messages 1-22: "User discussed weather and hiking trails" (summary) - Messages 23-30: [sent in full] - -Total tokens: ~600 instead of ~2400 (75% savings) -``` - ---- - -## Implementation Checklist - -- [ ] Create `meshai/memory.py` - RollingSummaryMemory class -- [ ] Modify `meshai/history.py` - Add summary table + storage methods -- [ ] Modify `meshai/backends/openai_backend.py` - Integrate memory manager -- [ ] Modify `meshai/responder.py` - Pass user_id, persist summaries -- [ ] Modify `meshai/commands/reset.py` - Clear summaries on reset - ---- - -## Configuration - -```python -# In memory.py initialization -RollingSummaryMemory( - client=self._client, - model=config.model, - window_size=4, # Keep last 4 exchanges (8 messages) - summarize_threshold=8, # Re-summarize after 8 new messages -) -``` - -**Tune based on:** -- `window_size`: Smaller = more summarization, larger = more recent context -- `summarize_threshold`: Smaller = more frequent re-summarization - ---- - -## Database Schema Addition - -```sql -CREATE TABLE conversation_summaries ( - user_id TEXT PRIMARY KEY, - summary TEXT NOT NULL, - message_count INTEGER NOT NULL, - updated_at REAL NOT NULL -); -``` - ---- - -## Testing - -```bash -# Run proof-of-concept comparison -python examples/memory_comparison.py - -# Update these first: 
-# - BASE_URL (your LLM endpoint) -# - API_KEY (your key) -# - MODEL (your model name) -``` - -**Expected output:** -``` -Approach Tokens Savings ----------------------------------------------- -Full History 1847 (baseline) -Rolling Summary 512 72.3% -Window Only 398 78.4% -``` - ---- - -## Key Code Snippets - -### Memory Manager Usage - -```python -# Get optimized context -summary, recent_messages = await memory.get_context_messages( - user_id=user_id, - full_history=all_messages, -) - -# Build message list -if summary: - system_prompt += f"\n\nPrevious conversation: {summary}" - context = [system] + recent_messages -else: - context = [system] + all_messages -``` - -### Store Summary - -```python -await history.store_summary( - user_id=user_id, - summary=summary_text, - message_count=len(old_messages) -) -``` - -### Load Summary on Startup - -```python -summary_data = await history.get_summary(user_id) -if summary_data: - backend.load_summary_cache(user_id, summary_data) -``` - ---- - -## Performance Metrics - -| Messages | Full History | With Summary | Savings | -|----------|--------------|--------------|---------| -| 10 | 800 tokens | 800 tokens | 0% | -| 20 | 1600 tokens | 550 tokens | 66% | -| 30 | 2400 tokens | 600 tokens | 75% | -| 50 | 4000 tokens | 650 tokens | 84% | - -**Cost Impact** (at $0.50/1M input tokens, 1000 requests/day): -- Before: $36/month -- After: $9/month -- **Savings: $27/month** - ---- - -## When to Use Alternatives - -| Use Case | Recommendation | -|----------|----------------| -| Simple stateless chat | Window-only memory | -| MeshAI (your project) | **Rolling Summary** | -| Want library solution | LangChain SummaryMemory | -| Need semantic search | ChromaDB vector store | -| Complex multi-day agent | MemGPT/Letta | - ---- - -## Troubleshooting - -**Summary too short/long?** -→ Adjust `max_tokens` in `_summarize()` method (default: 150) - -**Summary quality poor?** -→ Modify prompt in `_summarize()`, lower temperature - -**Too much 
overhead?** -→ Increase `summarize_threshold` (re-summarize less often) - -**Want more context?** -→ Increase `window_size` (keep more recent messages) - ---- - -## Documentation Files - -1. **MEMORY_SUMMARY.md** - Overview and recommendation (this started here) -2. **MEMORY_RESEARCH.md** - Detailed evaluation of all 5 approaches -3. **MEMORY_IMPLEMENTATION_GUIDE.md** - Complete step-by-step implementation -4. **examples/memory_comparison.py** - Runnable proof-of-concept -5. **docs/memory_approaches_comparison.txt** - Visual comparison diagrams -6. **docs/QUICK_REFERENCE.md** - This cheat sheet - ---- - -## One-Liner Summary - -**Use Rolling Summary**: Zero deps, 75% token savings, 100 lines of code, works with your stack. diff --git a/docs/memory_approaches_comparison.txt b/docs/memory_approaches_comparison.txt deleted file mode 100644 index e242079..0000000 --- a/docs/memory_approaches_comparison.txt +++ /dev/null @@ -1,254 +0,0 @@ -╔════════════════════════════════════════════════════════════════════════════════╗ -║ LLM MEMORY APPROACHES COMPARISON ║ -╚════════════════════════════════════════════════════════════════════════════════╝ - -┌────────────────────────────────────────────────────────────────────────────────┐ -│ 1. FULL HISTORY (Current MeshAI Implementation) │ -├────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Request 1: [System] + [Msg1, Msg2] = 200 tokens │ -│ Request 5: [System] + [Msg1...Msg10] = 1000 tokens │ -│ Request 10: [System] + [Msg1...Msg20] = 2000 tokens │ -│ Request 20: [System] + [Msg1...Msg40] = 4000 tokens │ -│ │ -│ ✓ Complete context │ -│ ✗ Linear growth in tokens │ -│ ✗ Expensive and slow for long conversations │ -│ ✗ Redundant - most messages not relevant to current query │ -│ │ -└────────────────────────────────────────────────────────────────────────────────┘ - -┌────────────────────────────────────────────────────────────────────────────────┐ -│ 2. 
WINDOW MEMORY (Keep Last N Only) │ -├────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Request 1: [System] + [Msg1, Msg2] = 200 tokens │ -│ Request 5: [System] + [Msg7, Msg8, Msg9, Msg10] = 500 tokens │ -│ Request 10: [System] + [Msg17, Msg18, Msg19, Msg20] = 500 tokens │ -│ Request 20: [System] + [Msg37, Msg38, Msg39, Msg40] = 500 tokens │ -│ │ -│ ✓ Constant token usage │ -│ ✓ Very fast and cheap │ -│ ✗ Completely forgets old context │ -│ ✗ Can't reference earlier conversation │ -│ │ -└────────────────────────────────────────────────────────────────────────────────┘ - -┌────────────────────────────────────────────────────────────────────────────────┐ -│ 3. ROLLING SUMMARY (RECOMMENDED) │ -├────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Request 1-5: [System] + [Msg1...Msg10] = 1000 tokens │ -│ (Short conversation - no summary yet) │ -│ │ -│ Request 10+: [System + Summary] + [Recent 8 msgs] = 600 tokens │ -│ │ -│ ┌─────────────────────────────────────┐ │ -│ │ Summary: "User discussed weather │ │ -│ │ and hiking. Mt Si is 4hr moderate │ │ -│ │ hike, Rattlesnake is 2mi easier." │ (100 tokens) │ -│ └─────────────────────────────────────┘ │ -│ ↓ │ -│ ┌─────────────────────────────────────┐ │ -│ │ User: How crowded does it get? │ │ -│ │ Assistant: Very crowded weekends │ │ -│ │ User: Any other trails nearby? │ (400 tokens) │ -│ │ Assistant: Rattlesnake is closer │ │ -│ │ ... 
(last 4 exchanges) │ │ -│ └─────────────────────────────────────┘ │ -│ │ -│ Request 20: [System + Summary] + [Recent 8 msgs] = 600 tokens │ -│ (Summary updated every ~8 new messages) │ -│ │ -│ ✓ Balanced token usage (70-80% reduction) │ -│ ✓ Preserves long-term context via summary │ -│ ✓ Recent messages in full detail │ -│ ✓ Scalable to very long conversations │ -│ ✗ Small overhead for summary generation (1-2s every 8-10 msgs) │ -│ │ -└────────────────────────────────────────────────────────────────────────────────┘ - -┌────────────────────────────────────────────────────────────────────────────────┐ -│ 4. VECTOR STORE MEMORY (ChromaDB/Qdrant) │ -├────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Current query: "What trails are nearby?" │ -│ ↓ (embed and search) │ -│ ┌──────────────────────────────────────────────────────────────────┐ │ -│ │ Vector DB: Find semantically similar past messages │ │ -│ │ - "Mt Si is a moderate 4-hour hike" (score: 0.89) │ │ -│ │ - "Rattlesnake Ledge has lake views" (score: 0.85) │ │ -│ │ - "Bring water and snacks" (score: 0.62) │ │ -│ └──────────────────────────────────────────────────────────────────┘ │ -│ ↓ │ -│ [System + Top 3 relevant] + [Current query] = 500 tokens │ -│ │ -│ ✓ Semantic retrieval - finds relevant context │ -│ ✓ Works for sparse conversations │ -│ ✓ Enables cross-conversation search │ -│ ✗ Requires embeddings (API calls or local model) │ -│ ✗ Adds complexity (vector DB, indexing) │ -│ ✗ May retrieve irrelevant "similar" messages │ -│ │ -└────────────────────────────────────────────────────────────────────────────────┘ - -┌────────────────────────────────────────────────────────────────────────────────┐ -│ 5. 
MEMGPT/LETTA (Self-Editing Memory) │ -├────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌───────────────────────────────────┐ │ -│ │ Core Memory (always in context): │ │ -│ │ - User: Matt │ (50 tokens) │ -│ │ - Preferences: Metric units │ │ -│ └───────────────────────────────────┘ │ -│ ↓ │ -│ ┌───────────────────────────────────┐ │ -│ │ Recall Memory (vector search): │ │ -│ │ - [Retrieved: 3 relevant msgs] │ (300 tokens) │ -│ └───────────────────────────────────┘ │ -│ ↓ │ -│ ┌───────────────────────────────────┐ │ -│ │ Archival Memory (long-term): │ │ -│ │ - [Searchable but not loaded] │ │ -│ └───────────────────────────────────┘ │ -│ │ -│ Agent decides what to remember/forget/search │ -│ │ -│ ✓ Most sophisticated - agent manages own memory │ -│ ✓ Handles complex multi-day conversations │ -│ ✗ Very heavy (200MB+ dependencies) │ -│ ✗ Requires vector embeddings │ -│ ✗ Overkill for simple chat │ -│ ✗ Opinionated architecture (hard to integrate) │ -│ │ -└────────────────────────────────────────────────────────────────────────────────┘ - -╔════════════════════════════════════════════════════════════════════════════════╗ -║ RECOMMENDATION MATRIX ║ -╚════════════════════════════════════════════════════════════════════════════════╝ - -┌──────────────┬──────────────┬────────────┬──────────────┬──────────────────────┐ -│ Approach │ Dependencies │ Tokens │ Complexity │ Use Case │ -├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤ -│ Full History │ None │ High │ Low │ Don't use (baseline) │ -├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤ -│ Window Only │ None │ Low │ Low │ Stateless chat bots │ -├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤ -│ Rolling │ │ │ │ ✓ MESHAI │ -│ Summary │ None │ Very Low │ Low │ ✓ Most projects │ -│ (DIY) │ │ │ │ ✓ Best balance │ 
-├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤ -│ LangChain │ ~50 MB │ Very Low │ Medium │ Want batteries- │ -│ Summary │ │ │ │ included solution │ -├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤ -│ Vector Store │ ~20 MB │ Low │ Medium │ Semantic search, │ -│ (ChromaDB) │ │ │ │ long-term memory │ -├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤ -│ MemGPT/Letta │ ~200 MB │ Low │ Very High │ Complex multi-day │ -│ │ │ │ │ agent workflows │ -└──────────────┴──────────────┴────────────┴──────────────┴──────────────────────┘ - -╔════════════════════════════════════════════════════════════════════════════════╗ -║ PERFORMANCE COMPARISON (20 messages) ║ -╚════════════════════════════════════════════════════════════════════════════════╝ - - Tokens Sent to LLM - ↑ - │ -4000│ ████████████████████████████████ Full History - │ -3000│ - │ -2000│ - │ -1000│ - │ - 600│ ██████ Rolling Summary - 500│ █████ Window Only - │ █████ Vector Store - 0└─────────────────────────────────────────────────────────→ - 1 5 10 15 20 25 30 35 40 (Conversation length) - - Legend: - ████ Full History (linear growth) - ████ Rolling Summary (plateau after initial growth) - ████ Window/Vector (constant) - - -╔════════════════════════════════════════════════════════════════════════════════╗ -║ IMPLEMENTATION COMPLEXITY ║ -╚════════════════════════════════════════════════════════════════════════════════╝ - -┌─────────────────────────────────────────────────────────────────────────────┐ -│ Simple ←───────────────────────────────────────────────────→ Complex │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Window Only Rolling Summary LangChain MemGPT │ -│ (20 lines) (100 lines) (10 lines (200+ lines │ -│ + 50MB dep) + 200MB dep) │ -│ │ -│ ↑ ↑ ↑ ↑ │ -│ No deps No deps Heavy deps Very heavy │ -│ No persistence SQLite persist In-memory Built-in DB │ -│ Loses old 
context Keeps summary Keeps summary Multi-tier │ -│ │ -│ ★ RECOMMENDED ★ │ -└─────────────────────────────────────────────────────────────────────────────┘ - -╔════════════════════════════════════════════════════════════════════════════════╗ -║ FOR MESHAI SPECIFICALLY ║ -╚════════════════════════════════════════════════════════════════════════════════╝ - -Current: - - Messages: 150 chars max (very small) - - Conversations: Per-user, linear - - Backend: OpenAI-compatible (LiteLLM, local models) - - Storage: SQLite + aiosqlite - - Problem: Full history sent every time - -Constraints: - - Lightweight (runs on mesh nodes potentially) - - No heavy dependencies - - Must work offline (local models) - - Persistence required (survive restarts) - -Solution: Rolling Summary - ✓ Zero dependencies (pure Python) - ✓ Works with existing AsyncOpenAI client - ✓ Persists in existing SQLite database - ✓ ~100 lines of code (easy to maintain) - ✓ 70-80% token reduction - ✓ Tunable (window_size, summarize_threshold) - -Configuration: - - window_size = 4 (keep last 4 exchanges = 8 messages) - - summarize_threshold = 8 (re-summarize after 8 new messages) - -Expected savings: - - 10 messages: 0% (no summary yet) - - 20 messages: 66% token reduction - - 30 messages: 75% token reduction - - 50 messages: 84% token reduction - -Cost impact (at $0.50/1M tokens): - - Before: $0.0012 per request (2400 tokens) - - After: $0.0003 per request (600 tokens) - - Savings: $27/month for 1000 requests/day - -╔════════════════════════════════════════════════════════════════════════════════╗ -║ NEXT STEPS ║ -╚════════════════════════════════════════════════════════════════════════════════╝ - -1. Read: MEMORY_SUMMARY.md (quick overview) -2. Study: MEMORY_RESEARCH.md (detailed analysis) -3. Test: python examples/memory_comparison.py (see it in action) -4. Build: MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step) -5. 
Deploy: Monitor and tune based on real usage - -Files created: - - /home/zvx/projects/meshai/MEMORY_SUMMARY.md - - /home/zvx/projects/meshai/MEMORY_RESEARCH.md - - /home/zvx/projects/meshai/MEMORY_IMPLEMENTATION_GUIDE.md - - /home/zvx/projects/meshai/examples/memory_comparison.py - -Good luck! 🚀 diff --git a/examples/memory_comparison.py b/examples/memory_comparison.py deleted file mode 100755 index ac5d71c..0000000 --- a/examples/memory_comparison.py +++ /dev/null @@ -1,285 +0,0 @@ -#!/usr/bin/env python3 -""" -Proof-of-concept: Compare full history vs rolling summary memory. - -Demonstrates token savings and performance of different approaches. - -Usage: - python examples/memory_comparison.py -""" - -import asyncio -import time -from typing import Optional - -from openai import AsyncOpenAI - - -# ============================================================================ -# SIMPLE ROLLING SUMMARY IMPLEMENTATION -# ============================================================================ - - -class SimpleRollingSummary: - """Minimal rolling summary memory manager for testing.""" - - def __init__( - self, - client: AsyncOpenAI, - model: str, - window_size: int = 4, - ): - self.client = client - self.model = model - self.window_size = window_size - self._summary_cache = {} - - async def get_context( - self, user_id: str, messages: list[dict] - ) -> tuple[Optional[str], list[dict]]: - """Return (summary, recent_messages) for optimized context.""" - - # Short conversation - return all messages - if len(messages) <= self.window_size * 2: - return None, messages - - # Split old and recent - split = -(self.window_size * 2) - old = messages[:split] - recent = messages[split:] - - # Get or create summary - if user_id not in self._summary_cache: - summary = await self._summarize(old) - self._summary_cache[user_id] = summary - else: - summary = self._summary_cache[user_id] - - return summary, recent - - async def _summarize(self, messages: list[dict]) -> str: - 
"""Generate summary of messages.""" - conv = "\n".join([f"{m['role'].upper()}: {m['content']}" for m in messages]) - - prompt = f"""Summarize this conversation in 2-3 concise sentences: - -{conv} - -Summary:""" - - response = await self.client.chat.completions.create( - model=self.model, - messages=[{"role": "user", "content": prompt}], - max_tokens=150, - temperature=0.3, - ) - - return response.choices[0].message.content.strip() - - -# ============================================================================ -# COMPARISON SCENARIOS -# ============================================================================ - - -async def test_full_history(client: AsyncOpenAI, model: str, messages: list[dict]): - """Baseline: Send full conversation history.""" - print("\n=== FULL HISTORY APPROACH ===") - - system = "You are a helpful assistant on a mesh network." - full = [{"role": "system", "content": system}] + messages - - start = time.time() - - response = await client.chat.completions.create( - model=model, messages=full, max_tokens=100, temperature=0.7 - ) - - elapsed = time.time() - start - - # Estimate tokens (rough) - total_chars = sum(len(m["content"]) for m in full) - est_tokens = total_chars // 4 # Rough estimate: 4 chars = 1 token - - print(f"Messages sent: {len(full)}") - print(f"Est. input tokens: {est_tokens}") - print(f"Response: {response.choices[0].message.content[:100]}...") - print(f"Time: {elapsed:.2f}s") - - return est_tokens, elapsed - - -async def test_rolling_summary( - client: AsyncOpenAI, model: str, messages: list[dict], user_id: str -): - """Optimized: Send summary + recent messages.""" - print("\n=== ROLLING SUMMARY APPROACH ===") - - memory = SimpleRollingSummary(client, model, window_size=4) - - summary, recent = await memory.get_context(user_id, messages) - - system = "You are a helpful assistant on a mesh network." 
- if summary: - system += f"\n\nPrevious conversation summary: {summary}" - - context = [{"role": "system", "content": system}] + recent - - start = time.time() - - response = await client.chat.completions.create( - model=model, messages=context, max_tokens=100, temperature=0.7 - ) - - elapsed = time.time() - start - - # Estimate tokens - total_chars = sum(len(m["content"]) for m in context) - est_tokens = total_chars // 4 - - print(f"Messages sent: {len(context)} (summary: {summary is not None})") - if summary: - print(f"Summary: {summary[:80]}...") - print(f"Est. input tokens: {est_tokens}") - print(f"Response: {response.choices[0].message.content[:100]}...") - print(f"Time: {elapsed:.2f}s") - - return est_tokens, elapsed - - -async def test_window_only(client: AsyncOpenAI, model: str, messages: list[dict]): - """Simple window: Just last N messages, no summary.""" - print("\n=== WINDOW-ONLY APPROACH ===") - - window_size = 4 - recent = messages[-(window_size * 2) :] - - system = "You are a helpful assistant on a mesh network." - context = [{"role": "system", "content": system}] + recent - - start = time.time() - - response = await client.chat.completions.create( - model=model, messages=context, max_tokens=100, temperature=0.7 - ) - - elapsed = time.time() - start - - total_chars = sum(len(m["content"]) for m in context) - est_tokens = total_chars // 4 - - print(f"Messages sent: {len(context)} (last {window_size} exchanges only)") - print(f"Est. input tokens: {est_tokens}") - print(f"Response: {response.choices[0].message.content[:100]}...") - print(f"Time: {elapsed:.2f}s") - - return est_tokens, elapsed - - -# ============================================================================ -# MAIN TEST -# ============================================================================ - - -async def main(): - """Run comparison test.""" - - # Configure your LLM endpoint - # Update these for your setup (LiteLLM, local model, etc.) 
- BASE_URL = "http://192.168.1.239:8000/v1" # LiteLLM endpoint - API_KEY = "sk-1234" # Your API key - MODEL = "gpt-4o-mini" # Your model - - print("=" * 70) - print("LLM Memory Approach Comparison") - print("=" * 70) - - # Create test conversation (simulate 15 exchanges = 30 messages) - messages = [] - topics = [ - ("What's the weather?", "It's sunny and 72°F."), - ("Should I bring an umbrella?", "No need, clear skies all day."), - ("What about tomorrow?", "Tomorrow looks rainy, bring an umbrella."), - ("Any hiking recommendations?", "Try Mt. Si, great views!"), - ("How long is the hike?", "About 4 hours round trip."), - ("Is it beginner friendly?", "Moderate difficulty, doable for most."), - ("What should I bring?", "Water, snacks, good boots, and layers."), - ("Are dogs allowed?", "Yes, but must be leashed."), - ("Where's the trailhead?", "Off I-90 near North Bend."), - ("Parking fee?", "Yes, $10 or Northwest Forest Pass."), - ("What time should I start?", "Early morning, around 7-8 AM."), - ("How crowded does it get?", "Very crowded on weekends, go weekdays."), - ("Any other trails nearby?", "Rattlesnake Ledge is easier and closer."), - ("Tell me about Rattlesnake", "2 miles, great lake views, very popular."), - ("Which would you recommend?", "If fit: Mt Si. 
If casual: Rattlesnake."), - ] - - for user_msg, assistant_msg in topics: - messages.append({"role": "user", "content": user_msg}) - messages.append({"role": "assistant", "content": assistant_msg}) - - print(f"\nTest conversation: {len(messages)} messages ({len(messages)//2} exchanges)") - print(f"Topics: weather → hiking → trails") - print(f"Message lengths: {min(len(m['content']) for m in messages)}-{max(len(m['content']) for m in messages)} chars") - - # Initialize client - client = AsyncOpenAI(api_key=API_KEY, base_url=BASE_URL) - - try: - # Test each approach - full_tokens, full_time = await test_full_history(client, MODEL, messages) - summary_tokens, summary_time = await test_rolling_summary( - client, MODEL, messages, "!test_user" - ) - window_tokens, window_time = await test_window_only(client, MODEL, messages) - - # Results - print("\n" + "=" * 70) - print("COMPARISON RESULTS") - print("=" * 70) - - print(f"\n{'Approach':<20} {'Tokens':<15} {'Time':<10} {'Savings'}") - print("-" * 70) - print( - f"{'Full History':<20} {full_tokens:<15} {full_time:<10.2f}s {'(baseline)'}" - ) - print( - f"{'Rolling Summary':<20} {summary_tokens:<15} {summary_time:<10.2f}s " - f"{(1 - summary_tokens/full_tokens)*100:.1f}%" - ) - print( - f"{'Window Only':<20} {window_tokens:<15} {window_time:<10.2f}s " - f"{(1 - window_tokens/full_tokens)*100:.1f}%" - ) - - print("\n" + "=" * 70) - print("RECOMMENDATIONS") - print("=" * 70) - - print("\nFull History:") - print(" ✓ Complete context") - print(" ✗ High token usage") - print(" ✗ Slower for long conversations") - print(" Use: Never (inefficient)") - - print("\nWindow Only:") - print(" ✓ Very low token usage") - print(" ✓ Fast") - print(" ✗ Loses older context completely") - print(" Use: Short-term conversations only") - - print("\nRolling Summary:") - print(" ✓ Balanced token usage") - print(" ✓ Preserves long-term context") - print(" ✓ Fast after initial summary") - print(" ✗ Slight overhead for summarization") - print(" Use: 
RECOMMENDED for MeshAI") - - print("\n" + "=" * 70) - - finally: - await client.close() - - -if __name__ == "__main__": - asyncio.run(main())