mirror of
https://github.com/zvx-echo6/meshai.git
synced 2026-05-21 23:24:44 +02:00
Remove AI planning docs and example scripts
These were LLM-generated planning artifacts from the memory implementation phase. Not user-facing documentation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8c2c4d2aef
commit
9a628724ce
9 changed files with 0 additions and 4013 deletions
|
|
@ -1,656 +0,0 @@
|
|||
# Quick Implementation Guide: Rolling Summary Memory
|
||||
|
||||
## TL;DR
|
||||
|
||||
**Problem:** Sending full conversation history every request wastes tokens and latency.
|
||||
|
||||
**Solution:** Rolling summary approach - keep recent messages + LLM-generated summary of older messages.
|
||||
|
||||
**Result:** ~73% token reduction in the 20-message example under "Expected Results" (approaching ~83% for longer conversations), zero new dependencies, works with current stack.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
SQLite History (per user)
|
||||
↓
|
||||
Messages 1-10: Summarized → "User asked about weather, discussed outdoor plans"
|
||||
Messages 11-18: Sent raw → Full context
|
||||
↓
|
||||
LLM receives: System prompt + Summary + Recent 8 messages
|
||||
↓
|
||||
Response generated
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Files to Create/Modify
|
||||
|
||||
### 1. Create `meshai/memory.py`
|
||||
|
||||
```python
|
||||
"""Lightweight rolling summary memory manager."""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConversationSummary:
|
||||
"""Summary of conversation history."""
|
||||
|
||||
summary: str
|
||||
last_updated: float
|
||||
message_count: int
|
||||
|
||||
|
||||
class RollingSummaryMemory:
|
||||
"""Manages conversation summaries with recent message window.
|
||||
|
||||
Strategy:
|
||||
- Keep last N message pairs (window_size) in full
|
||||
- Summarize everything before the window
|
||||
- Update summary when old messages accumulate
|
||||
|
||||
Example (window_size=4):
|
||||
Messages 1-10: Summarized to "User discussed weather and plans"
|
||||
Messages 11-18: Kept in full (last 4 pairs)
|
||||
Context sent: [Summary] + [Messages 11-18]
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
client: AsyncOpenAI,
|
||||
model: str,
|
||||
window_size: int = 4,
|
||||
summarize_threshold: int = 8,
|
||||
):
|
||||
"""Initialize rolling summary memory.
|
||||
|
||||
Args:
|
||||
client: AsyncOpenAI client for generating summaries
|
||||
model: Model name to use for summarization
|
||||
window_size: Number of recent message pairs to keep in full
|
||||
summarize_threshold: Messages to accumulate before re-summarizing
|
||||
"""
|
||||
self._client = client
|
||||
self._model = model
|
||||
self._window_size = window_size
|
||||
self._summarize_threshold = summarize_threshold
|
||||
|
||||
# In-memory cache of summaries (loaded from DB on startup)
|
||||
self._summaries: dict[str, ConversationSummary] = {}
|
||||
|
||||
async def get_context_messages(
|
||||
self,
|
||||
user_id: str,
|
||||
full_history: list[dict],
|
||||
) -> tuple[Optional[str], list[dict]]:
|
||||
"""Get optimized context: summary + recent messages.
|
||||
|
||||
Args:
|
||||
user_id: User identifier
|
||||
full_history: Full message history from database
|
||||
|
||||
Returns:
|
||||
Tuple of (summary_text, recent_messages)
|
||||
summary_text is None if conversation is short
|
||||
"""
|
||||
# Short conversation - no summary needed
|
||||
if len(full_history) <= self._window_size * 2:
|
||||
return None, full_history
|
||||
|
||||
# Split into old (to summarize) and recent (keep raw)
|
||||
split_point = -(self._window_size * 2)
|
||||
old_messages = full_history[:split_point]
|
||||
recent_messages = full_history[split_point:]
|
||||
|
||||
# Get or create summary
|
||||
summary = await self._get_or_create_summary(user_id, old_messages)
|
||||
|
||||
return summary.summary, recent_messages
|
||||
|
||||
async def _get_or_create_summary(
|
||||
self,
|
||||
user_id: str,
|
||||
messages: list[dict],
|
||||
) -> ConversationSummary:
|
||||
"""Get cached summary or create new one."""
|
||||
# Check cache
|
||||
if user_id in self._summaries:
|
||||
cached = self._summaries[user_id]
|
||||
|
||||
# Reuse if message count is close
|
||||
if abs(cached.message_count - len(messages)) < self._summarize_threshold:
|
||||
return cached
|
||||
|
||||
# Generate new summary
|
||||
summary_text = await self._summarize(messages)
|
||||
|
||||
summary = ConversationSummary(
|
||||
summary=summary_text,
|
||||
last_updated=time.time(),
|
||||
message_count=len(messages),
|
||||
)
|
||||
|
||||
self._summaries[user_id] = summary
|
||||
return summary
|
||||
|
||||
async def _summarize(self, messages: list[dict]) -> str:
|
||||
"""Generate summary using LLM."""
|
||||
# Format conversation
|
||||
conversation = "\n".join(
|
||||
[f"{msg['role'].upper()}: {msg['content']}" for msg in messages]
|
||||
)
|
||||
|
||||
prompt = f"""Summarize this conversation in 2-3 concise sentences. Focus on:
|
||||
- Main topics discussed
|
||||
- Important context or user preferences
|
||||
- Key information to remember
|
||||
|
||||
Conversation:
|
||||
{conversation}
|
||||
|
||||
Summary (2-3 sentences):"""
|
||||
|
||||
try:
|
||||
response = await self._client.chat.completions.create(
|
||||
model=self._model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
max_tokens=150,
|
||||
temperature=0.3,
|
||||
)
|
||||
|
||||
return response.choices[0].message.content.strip()
|
||||
|
||||
except Exception as e:
|
||||
# Fallback
|
||||
return f"Previous conversation: {len(messages)} messages about various topics."
|
||||
|
||||
def load_summary(self, user_id: str, summary: ConversationSummary) -> None:
|
||||
"""Load summary from database into cache."""
|
||||
self._summaries[user_id] = summary
|
||||
|
||||
def clear_summary(self, user_id: str) -> None:
|
||||
"""Clear cached summary for user."""
|
||||
self._summaries.pop(user_id, None)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Modify `meshai/history.py`
|
||||
|
||||
Add summary storage methods:
|
||||
|
||||
```python
|
||||
# Add to ConversationHistory class
|
||||
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize database and create tables."""
|
||||
self._db = await aiosqlite.connect(self._db_path)
|
||||
|
||||
# Existing conversations table
|
||||
await self._db.execute("""
|
||||
CREATE TABLE IF NOT EXISTS conversations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
user_id TEXT NOT NULL,
|
||||
role TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
timestamp REAL NOT NULL
|
||||
)
|
||||
""")
|
||||
|
||||
await self._db.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_user_timestamp
|
||||
ON conversations (user_id, timestamp)
|
||||
""")
|
||||
|
||||
# NEW: Summaries table
|
||||
await self._db.execute("""
|
||||
CREATE TABLE IF NOT EXISTS conversation_summaries (
|
||||
user_id TEXT PRIMARY KEY,
|
||||
summary TEXT NOT NULL,
|
||||
message_count INTEGER NOT NULL,
|
||||
updated_at REAL NOT NULL
|
||||
)
|
||||
""")
|
||||
|
||||
await self._db.commit()
|
||||
logger.info(f"Conversation history initialized at {self._db_path}")
|
||||
|
||||
|
||||
async def store_summary(
|
||||
self, user_id: str, summary: str, message_count: int
|
||||
) -> None:
|
||||
"""Store conversation summary.
|
||||
|
||||
Args:
|
||||
user_id: Node ID of user
|
||||
summary: Summary text
|
||||
message_count: Number of messages summarized
|
||||
"""
|
||||
if not self._db:
|
||||
raise RuntimeError("Database not initialized")
|
||||
|
||||
async with self._lock:
|
||||
await self._db.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO conversation_summaries
|
||||
(user_id, summary, message_count, updated_at)
|
||||
VALUES (?, ?, ?, ?)
|
||||
""",
|
||||
(user_id, summary, message_count, time.time()),
|
||||
)
|
||||
await self._db.commit()
|
||||
|
||||
|
||||
async def get_summary(self, user_id: str) -> Optional[dict]:
|
||||
"""Get conversation summary for user.
|
||||
|
||||
Args:
|
||||
user_id: Node ID of user
|
||||
|
||||
Returns:
|
||||
Dict with 'summary', 'message_count', 'updated_at' or None
|
||||
"""
|
||||
if not self._db:
|
||||
raise RuntimeError("Database not initialized")
|
||||
|
||||
async with self._lock:
|
||||
cursor = await self._db.execute(
|
||||
"""
|
||||
SELECT summary, message_count, updated_at
|
||||
FROM conversation_summaries
|
||||
WHERE user_id = ?
|
||||
""",
|
||||
(user_id,),
|
||||
)
|
||||
row = await cursor.fetchone()
|
||||
|
||||
if not row:
|
||||
return None
|
||||
|
||||
return {
|
||||
"summary": row[0],
|
||||
"message_count": row[1],
|
||||
"updated_at": row[2],
|
||||
}
|
||||
|
||||
|
||||
async def clear_summary(self, user_id: str) -> None:
|
||||
"""Clear summary for user (e.g., on history reset).
|
||||
|
||||
Args:
|
||||
user_id: Node ID of user
|
||||
"""
|
||||
if not self._db:
|
||||
raise RuntimeError("Database not initialized")
|
||||
|
||||
async with self._lock:
|
||||
await self._db.execute(
|
||||
"DELETE FROM conversation_summaries WHERE user_id = ?",
|
||||
(user_id,),
|
||||
)
|
||||
await self._db.commit()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Modify `meshai/backends/openai_backend.py`
|
||||
|
||||
Integrate memory manager:
|
||||
|
||||
```python
|
||||
"""OpenAI-compatible LLM backend with rolling summary memory."""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from ..config import LLMConfig
|
||||
from ..memory import RollingSummaryMemory
|
||||
from .base import LLMBackend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OpenAIBackend(LLMBackend):
|
||||
"""OpenAI-compatible backend with intelligent memory management."""
|
||||
|
||||
def __init__(self, config: LLMConfig, api_key: str):
|
||||
"""Initialize OpenAI backend.
|
||||
|
||||
Args:
|
||||
config: LLM configuration
|
||||
api_key: API key to use
|
||||
"""
|
||||
self.config = config
|
||||
self._client = AsyncOpenAI(
|
||||
api_key=api_key,
|
||||
base_url=config.base_url,
|
||||
)
|
||||
|
||||
# Initialize rolling summary memory
|
||||
self._memory = RollingSummaryMemory(
|
||||
client=self._client,
|
||||
model=config.model,
|
||||
window_size=4, # Keep last 4 exchanges (8 messages)
|
||||
summarize_threshold=8, # Re-summarize after 8 new messages
|
||||
)
|
||||
|
||||
async def generate(
|
||||
self,
|
||||
messages: list[dict],
|
||||
system_prompt: str,
|
||||
user_id: str = None, # NEW: optional for backward compatibility
|
||||
max_tokens: int = 300,
|
||||
) -> str:
|
||||
"""Generate a response using OpenAI-compatible API.
|
||||
|
||||
Args:
|
||||
messages: Conversation history
|
||||
system_prompt: System prompt
|
||||
user_id: User identifier (for memory management)
|
||||
max_tokens: Maximum tokens to generate
|
||||
|
||||
Returns:
|
||||
Generated response
|
||||
"""
|
||||
# If no user_id, use old behavior (send full history)
|
||||
if not user_id:
|
||||
full_messages = [{"role": "system", "content": system_prompt}]
|
||||
full_messages.extend(messages)
|
||||
else:
|
||||
# Use memory manager to optimize context
|
||||
summary, recent_messages = await self._memory.get_context_messages(
|
||||
user_id=user_id,
|
||||
full_history=messages,
|
||||
)
|
||||
|
||||
# Build optimized message list
|
||||
if summary:
|
||||
# Long conversation: system + summary + recent
|
||||
enhanced_system = f"""{system_prompt}
|
||||
|
||||
Previous conversation summary: {summary}"""
|
||||
full_messages = [{"role": "system", "content": enhanced_system}]
|
||||
full_messages.extend(recent_messages)
|
||||
|
||||
logger.debug(
|
||||
f"Using summary + {len(recent_messages)} recent messages "
|
||||
f"(total history: {len(messages)})"
|
||||
)
|
||||
else:
|
||||
# Short conversation: system + all messages
|
||||
full_messages = [{"role": "system", "content": system_prompt}]
|
||||
full_messages.extend(messages)
|
||||
|
||||
try:
|
||||
response = await self._client.chat.completions.create(
|
||||
model=self.config.model,
|
||||
messages=full_messages,
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.7,
|
||||
)
|
||||
|
||||
content = response.choices[0].message.content
|
||||
return content.strip() if content else ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"OpenAI API error: {e}")
|
||||
raise
|
||||
|
||||
def load_summary_cache(self, user_id: str, summary_data: dict) -> None:
|
||||
"""Load summary into memory cache (called on startup).
|
||||
|
||||
Args:
|
||||
user_id: User identifier
|
||||
summary_data: Dict with 'summary', 'message_count', 'updated_at'
|
||||
"""
|
||||
from ..memory import ConversationSummary
|
||||
|
||||
summary = ConversationSummary(
|
||||
summary=summary_data["summary"],
|
||||
message_count=summary_data["message_count"],
|
||||
last_updated=summary_data["updated_at"],
|
||||
)
|
||||
self._memory.load_summary(user_id, summary)
|
||||
|
||||
def clear_summary_cache(self, user_id: str) -> None:
|
||||
"""Clear summary cache for user."""
|
||||
self._memory.clear_summary(user_id)
|
||||
|
||||
# ... rest of methods unchanged ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 4. Modify `meshai/responder.py`
|
||||
|
||||
Pass user_id to backend and persist summaries:
|
||||
|
||||
```python
|
||||
# In the generate_response method
|
||||
|
||||
async def generate_response(self, user_id: str, message: str) -> str:
|
||||
"""Generate LLM response with optimized memory."""
|
||||
|
||||
# Add user message to history
|
||||
await self.history.add_message(user_id, "user", message)
|
||||
|
||||
# Get conversation history
|
||||
history = await self.history.get_history_for_llm(user_id)
|
||||
|
||||
# Generate response with user_id for memory management
|
||||
response = await self.backend.generate(
|
||||
messages=history,
|
||||
system_prompt=self.system_prompt,
|
||||
user_id=user_id, # NEW: enables memory optimization
|
||||
max_tokens=300,
|
||||
)
|
||||
|
||||
# Add assistant response to history
|
||||
await self.history.add_message(user_id, "assistant", response)
|
||||
|
||||
# Persist summary if one was created
|
||||
# The memory manager caches it, we need to save to DB
|
||||
summary_data = await self._get_current_summary(user_id)
|
||||
if summary_data:
|
||||
await self.history.store_summary(
|
||||
user_id,
|
||||
summary_data["summary"],
|
||||
summary_data["message_count"],
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
async def _get_current_summary(self, user_id: str) -> Optional[dict]:
|
||||
"""Get current summary from memory manager if it exists."""
|
||||
# Access the memory manager's cache
|
||||
if hasattr(self.backend, "_memory"):
|
||||
summary = self.backend._memory._summaries.get(user_id)
|
||||
if summary:
|
||||
return {
|
||||
"summary": summary.summary,
|
||||
"message_count": summary.message_count,
|
||||
"updated_at": summary.last_updated,
|
||||
}
|
||||
return None
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 5. Modify `meshai/commands/reset.py`
|
||||
|
||||
Clear summaries when resetting history:
|
||||
|
||||
```python
|
||||
async def execute(self, sender_id: str, args: list[str]) -> str:
|
||||
"""Reset conversation history."""
|
||||
count = await self.responder.history.clear_history(sender_id)
|
||||
|
||||
# NEW: Also clear summary
|
||||
await self.responder.history.clear_summary(sender_id)
|
||||
if hasattr(self.responder.backend, "clear_summary_cache"):
|
||||
self.responder.backend.clear_summary_cache(sender_id)
|
||||
|
||||
return f"Cleared {count} messages from your history."
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
Add to `meshai/config.py`:
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class MemoryConfig:
|
||||
"""Memory management configuration."""
|
||||
|
||||
# Rolling summary settings
|
||||
window_size: int = 4 # Recent message pairs to keep
|
||||
summarize_threshold: int = 8 # Messages before re-summarizing
|
||||
|
||||
# When to enable summaries
|
||||
min_messages_for_summary: int = 10 # Start summarizing after this many
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
```python
|
||||
# Test script
|
||||
import asyncio
|
||||
from meshai.backends.openai_backend import OpenAIBackend
|
||||
from meshai.config import LLMConfig
|
||||
|
||||
async def test():
|
||||
config = LLMConfig(
|
||||
backend="openai",
|
||||
base_url="http://192.168.1.239:8000/v1",
|
||||
model="gpt-4o-mini"
|
||||
)
|
||||
|
||||
backend = OpenAIBackend(config, "your-key")
|
||||
|
||||
# Simulate long conversation
|
||||
messages = []
|
||||
for i in range(20):
|
||||
messages.append({"role": "user", "content": f"Question {i}"})
|
||||
messages.append({"role": "assistant", "content": f"Answer {i}"})
|
||||
|
||||
# Generate - should use summary
|
||||
response = await backend.generate(
|
||||
messages=messages,
|
||||
system_prompt="You are helpful.",
|
||||
user_id="!test123",
|
||||
max_tokens=100
|
||||
)
|
||||
|
||||
print(f"Response: {response}")
|
||||
print(f"Sent {len(messages)} messages, but only ~10 used in context")
|
||||
|
||||
asyncio.run(test())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Expected Results
|
||||
|
||||
### Token Usage Comparison
|
||||
|
||||
**Before (full history):**
|
||||
```
|
||||
User message 1-20: ~2000 tokens
|
||||
System prompt: ~50 tokens
|
||||
Total: ~2050 tokens per request
|
||||
```
|
||||
|
||||
**After (with summary):**
|
||||
```
|
||||
System prompt: ~50 tokens
|
||||
Summary: ~100 tokens
|
||||
Recent 8 messages: ~400 tokens
|
||||
Total: ~550 tokens per request
|
||||
```
|
||||
|
||||
**Savings: ~73% token reduction**
|
||||
|
||||
### Performance Impact
|
||||
|
||||
- **Summary generation**: ~1-2s every 8-10 messages (amortized)
|
||||
- **Regular requests**: No added latency
|
||||
- **Storage**: ~100 bytes per summary in SQLite
|
||||
|
||||
---
|
||||
|
||||
## Tuning Parameters
|
||||
|
||||
### window_size
|
||||
- **Smaller (2-3)**: More aggressive summarization, max token savings
|
||||
- **Larger (5-6)**: More context, less summarization
|
||||
- **Recommended**: 4 (last 4 exchanges = 8 messages)
|
||||
|
||||
### summarize_threshold
|
||||
- **Smaller (4-6)**: Frequent re-summarization, more current
|
||||
- **Larger (10-12)**: Less summarization overhead
|
||||
- **Recommended**: 8 (re-summarize after 8 new messages)
|
||||
|
||||
### For MeshAI specifically:
|
||||
- Messages are tiny (150 chars max)
|
||||
- `window_size=4` keeps the last 8 messages, i.e. up to ~1,200 chars of recent context (8 × 150)
|
||||
- `summarize_threshold=8` balances overhead vs accuracy
|
||||
|
||||
---
|
||||
|
||||
## Migration Path
|
||||
|
||||
1. **Phase 1**: Add code, test with new users
|
||||
2. **Phase 2**: Run in parallel (old + new backend)
|
||||
3. **Phase 3**: Migrate existing users (generate summaries for existing history)
|
||||
4. **Phase 4**: Remove old full-history code path
|
||||
|
||||
No data loss - summaries stored in DB, can regenerate anytime.
|
||||
|
||||
---
|
||||
|
||||
## Maintenance
|
||||
|
||||
### Monitor summary quality:
|
||||
```sql
|
||||
-- Check summaries
|
||||
SELECT user_id, summary, message_count, updated_at
|
||||
FROM conversation_summaries
|
||||
ORDER BY updated_at DESC;
|
||||
```
|
||||
|
||||
### Regenerate summary:
|
||||
```python
|
||||
# Clear cache + DB, will regenerate on next request
|
||||
await history.clear_summary(user_id)
|
||||
backend.clear_summary_cache(user_id)
|
||||
```
|
||||
|
||||
### Adjust if summaries too short/long:
|
||||
- Modify prompt in `_summarize()`
|
||||
- Adjust `max_tokens=150` for summaries
|
||||
- Change temperature (lower = more consistent)
|
||||
|
||||
---
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
1. **Hybrid approach**: Summary + semantic search for very long histories
|
||||
2. **User preferences**: Store separate from summary (e.g., "likes weather in metric")
|
||||
3. **Multi-level summaries**: Summarize summaries for years-long conversations
|
||||
4. **Summary quality scoring**: Validate summaries maintain key information
|
||||
|
||||
But start simple - this gets 80% of the benefit with 20% of the complexity.
|
||||
437
MEMORY_README.md
437
MEMORY_README.md
|
|
@ -1,437 +0,0 @@
|
|||
# LLM Conversation Memory Research & Implementation
|
||||
|
||||
This directory contains comprehensive research and implementation guides for improving LLM conversation memory in MeshAI.
|
||||
|
||||
## Problem Statement
|
||||
|
||||
MeshAI currently sends the full conversation history with every LLM API call. This approach:
|
||||
- Wastes tokens (expensive and slow)
|
||||
- Doesn't scale to long conversations
|
||||
- Sends redundant context the LLM doesn't need
|
||||
|
||||
## Solution: Rolling Summary Memory
|
||||
|
||||
Keep recent messages in full + LLM-generated summary of older messages.
|
||||
|
||||
**Result:** 70-80% token reduction, zero dependencies, works with existing stack.
|
||||
|
||||
---
|
||||
|
||||
## Documentation Index
|
||||
|
||||
### 1. Quick Start
|
||||
|
||||
**READ THIS FIRST:** [`MEMORY_SUMMARY.md`](MEMORY_SUMMARY.md)
|
||||
- High-level overview
|
||||
- Why rolling summary?
|
||||
- Comparison with alternatives
|
||||
- Expected performance gains
|
||||
|
||||
**Estimated reading time:** 10 minutes
|
||||
|
||||
---
|
||||
|
||||
### 2. Detailed Research
|
||||
|
||||
**FOR DEEP DIVE:** [`MEMORY_RESEARCH.md`](MEMORY_RESEARCH.md)
|
||||
- Full evaluation of 5 approaches:
|
||||
1. LangChain Memory modules
|
||||
2. LlamaIndex
|
||||
3. MemGPT/Letta
|
||||
4. Vector stores (ChromaDB/Qdrant)
|
||||
5. Simple rolling summary (DIY)
|
||||
- Code examples for each approach
|
||||
- Pros/cons for MeshAI specifically
|
||||
- Detailed comparison matrix
|
||||
|
||||
**Estimated reading time:** 30-45 minutes
|
||||
|
||||
---
|
||||
|
||||
### 3. Implementation Guide
|
||||
|
||||
**FOR BUILDING:** [`MEMORY_IMPLEMENTATION_GUIDE.md`](MEMORY_IMPLEMENTATION_GUIDE.md)
|
||||
- Step-by-step implementation
|
||||
- Complete code examples
|
||||
- Database schema
|
||||
- Configuration options
|
||||
- Testing procedures
|
||||
- Troubleshooting guide
|
||||
|
||||
**Estimated reading time:** 20 minutes + implementation time
|
||||
|
||||
---
|
||||
|
||||
### 4. Implementation Diff
|
||||
|
||||
**FOR EXACT CHANGES:** [`docs/IMPLEMENTATION_DIFF.md`](docs/IMPLEMENTATION_DIFF.md)
|
||||
- Exact code diffs for all files
|
||||
- Line-by-line changes needed
|
||||
- Migration checklist
|
||||
- Rollback plan
|
||||
- Performance validation queries
|
||||
|
||||
**Estimated reading time:** 15 minutes
|
||||
|
||||
---
|
||||
|
||||
### 5. Visual Comparison
|
||||
|
||||
**FOR UNDERSTANDING:** [`docs/memory_approaches_comparison.txt`](docs/memory_approaches_comparison.txt)
|
||||
- ASCII diagrams of all approaches
|
||||
- Visual token usage comparison
|
||||
- Decision matrices
|
||||
- Architecture diagrams
|
||||
|
||||
**Estimated reading time:** 10 minutes
|
||||
|
||||
---
|
||||
|
||||
### 6. Quick Reference
|
||||
|
||||
**FOR CHEAT SHEET:** [`docs/QUICK_REFERENCE.md`](docs/QUICK_REFERENCE.md)
|
||||
- One-page reference card
|
||||
- Key configuration
|
||||
- Code snippets
|
||||
- Performance metrics
|
||||
- Troubleshooting tips
|
||||
|
||||
**Estimated reading time:** 5 minutes
|
||||
|
||||
---
|
||||
|
||||
### 7. Proof of Concept
|
||||
|
||||
**FOR TESTING:** [`examples/memory_comparison.py`](examples/memory_comparison.py)
|
||||
- Runnable comparison script
|
||||
- Tests all 3 approaches side-by-side:
|
||||
- Full history (baseline)
|
||||
- Rolling summary
|
||||
- Window-only
|
||||
- Real token usage measurements
|
||||
- Performance comparison
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Edit script with your LLM endpoint
|
||||
nano examples/memory_comparison.py
|
||||
# Update BASE_URL, API_KEY, MODEL
|
||||
|
||||
# Run comparison
|
||||
python examples/memory_comparison.py
|
||||
```
|
||||
|
||||
**Expected output:**
|
||||
```
|
||||
Approach Tokens Time Savings
|
||||
----------------------------------------------------------------------
|
||||
Full History 1847 2.34s (baseline)
|
||||
Rolling Summary 512 1.87s 72.3%
|
||||
Window Only 398 1.45s 78.4%
|
||||
|
||||
RECOMMENDATION: Rolling Summary - best balance of context and efficiency
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Recommended Reading Path
|
||||
|
||||
### Path 1: Executive Summary (20 minutes)
|
||||
1. `MEMORY_SUMMARY.md` - Overview
|
||||
2. `docs/QUICK_REFERENCE.md` - Cheat sheet
|
||||
3. `examples/memory_comparison.py` - Run the test
|
||||
|
||||
**Decision point:** Convinced? Proceed to implementation.
|
||||
|
||||
---
|
||||
|
||||
### Path 2: Technical Deep Dive (60 minutes)
|
||||
1. `MEMORY_SUMMARY.md` - Overview
|
||||
2. `MEMORY_RESEARCH.md` - Full evaluation
|
||||
3. `docs/memory_approaches_comparison.txt` - Visual diagrams
|
||||
4. `examples/memory_comparison.py` - Run the test
|
||||
5. `MEMORY_IMPLEMENTATION_GUIDE.md` - How to build it
|
||||
|
||||
**Decision point:** Ready to implement? Use the diff guide.
|
||||
|
||||
---
|
||||
|
||||
### Path 3: Implementation (2-3 hours)
|
||||
1. `MEMORY_SUMMARY.md` - Refresh on approach
|
||||
2. `MEMORY_IMPLEMENTATION_GUIDE.md` - Full implementation guide
|
||||
3. `docs/IMPLEMENTATION_DIFF.md` - Exact changes needed
|
||||
4. Code the changes
|
||||
5. Test with `examples/memory_comparison.py`
|
||||
6. Deploy and monitor
|
||||
|
||||
**Outcome:** Production-ready rolling summary memory.
|
||||
|
||||
---
|
||||
|
||||
## Files Created
|
||||
|
||||
### Documentation
|
||||
```
|
||||
/home/zvx/projects/meshai/
|
||||
├── MEMORY_README.md (this file)
|
||||
├── MEMORY_SUMMARY.md (overview)
|
||||
├── MEMORY_RESEARCH.md (detailed research)
|
||||
├── MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step)
|
||||
├── docs/
|
||||
│ ├── IMPLEMENTATION_DIFF.md (exact changes)
|
||||
│ ├── memory_approaches_comparison.txt (diagrams)
|
||||
│ └── QUICK_REFERENCE.md (cheat sheet)
|
||||
└── examples/
|
||||
└── memory_comparison.py (proof of concept)
|
||||
```
|
||||
|
||||
### Code to Create (not yet created)
|
||||
```
|
||||
meshai/
|
||||
├── memory.py (NEW - ~100 lines)
|
||||
├── history.py (MODIFY - add ~70 lines)
|
||||
├── backends/
|
||||
│ └── openai_backend.py (MODIFY - add ~30 lines)
|
||||
├── responder.py (MODIFY - add ~10 lines)
|
||||
└── commands/
|
||||
└── reset.py (MODIFY - add ~4 lines)
|
||||
```
|
||||
|
||||
**Total new code:** ~214 lines
|
||||
**Dependencies added:** 0
|
||||
|
||||
---
|
||||
|
||||
## Key Metrics
|
||||
|
||||
### Token Savings
|
||||
|
||||
| Conversation Length | Before | After | Savings |
|
||||
|---------------------|--------|-------|---------|
|
||||
| 10 messages | 800 | 800 | 0% |
|
||||
| 20 messages | 1600 | 550 | 66% |
|
||||
| 30 messages | 2400 | 600 | 75% |
|
||||
| 50 messages | 4000 | 650 | 84% |
|
||||
|
||||
### Cost Impact
|
||||
|
||||
**Assumptions:**
|
||||
- $0.50 per 1M input tokens
|
||||
- 1000 requests per day
|
||||
- Average 30 messages per conversation
|
||||
|
||||
**Before:** $36/month
|
||||
**After:** $9/month
|
||||
**Savings:** $27/month (75% reduction)
|
||||
|
||||
### Implementation Effort
|
||||
|
||||
- Code to write: ~214 lines
|
||||
- Code to modify: ~57 lines
|
||||
- Time estimate: 2-3 hours
|
||||
- Testing: 1 hour
|
||||
- **Total:** Half a day
|
||||
|
||||
### Risk Assessment
|
||||
|
||||
- **Low risk:** Backward compatible (user_id parameter optional)
|
||||
- **No data loss:** New table, existing data untouched
|
||||
- **Easy rollback:** Git revert + drop one table
|
||||
- **No dependencies:** Pure Python, existing libraries only
|
||||
|
||||
---
|
||||
|
||||
## Configuration Summary
|
||||
|
||||
### Recommended for MeshAI
|
||||
|
||||
```python
|
||||
RollingSummaryMemory(
|
||||
client=self._client,
|
||||
model=config.model,
|
||||
window_size=4, # Keep last 4 exchanges (8 messages)
|
||||
summarize_threshold=8, # Re-summarize after 8 new messages
|
||||
)
|
||||
```
|
||||
|
||||
**Rationale:**
|
||||
- MeshAI messages are tiny (150 chars max)
|
||||
- window_size=4 keeps the last 8 messages, i.e. up to ~1,200 chars of recent context (8 × 150)
|
||||
- summarize_threshold=8 balances overhead vs freshness
|
||||
- Tune based on actual usage patterns
|
||||
|
||||
### Alternative Configurations
|
||||
|
||||
**For longer messages:**
|
||||
```python
|
||||
window_size=3, # Less recent context needed
|
||||
summarize_threshold=6, # More frequent updates
|
||||
```
|
||||
|
||||
**For very short messages:**
|
||||
```python
|
||||
window_size=6, # More recent context
|
||||
summarize_threshold=10, # Less frequent summarization
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Database Schema
|
||||
|
||||
### New Table
|
||||
|
||||
```sql
|
||||
CREATE TABLE conversation_summaries (
|
||||
user_id TEXT PRIMARY KEY,
|
||||
summary TEXT NOT NULL,
|
||||
message_count INTEGER NOT NULL,
|
||||
updated_at REAL NOT NULL
|
||||
);
|
||||
```
|
||||
|
||||
### Existing Tables (unchanged)
|
||||
|
||||
```sql
|
||||
CREATE TABLE conversations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
user_id TEXT NOT NULL,
|
||||
role TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
timestamp REAL NOT NULL
|
||||
);
|
||||
|
||||
CREATE INDEX idx_user_timestamp ON conversations (user_id, timestamp);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing Checklist
|
||||
|
||||
- [ ] Database migration works (new table created)
|
||||
- [ ] Short conversations (<10 messages) use full history
|
||||
- [ ] Long conversations (>10 messages) use summaries
|
||||
- [ ] Summaries are stored in database
|
||||
- [ ] Summaries persist across restarts
|
||||
- [ ] Reset command clears summaries
|
||||
- [ ] Token usage reduced by 70%+ for long convos
|
||||
- [ ] No errors in logs
|
||||
- [ ] Response quality maintained
|
||||
|
||||
---
|
||||
|
||||
## Monitoring Queries
|
||||
|
||||
### Check summary coverage
|
||||
```sql
|
||||
SELECT
|
||||
(SELECT COUNT(DISTINCT user_id) FROM conversation_summaries) * 100.0 /
|
||||
(SELECT COUNT(DISTINCT user_id) FROM conversations) as coverage_pct;
|
||||
```
|
||||
|
||||
### Average messages per summary
|
||||
```sql
|
||||
SELECT AVG(message_count) FROM conversation_summaries;
|
||||
```
|
||||
|
||||
### Recent summaries
|
||||
```sql
|
||||
SELECT user_id, summary, message_count,
|
||||
datetime(updated_at, 'unixepoch') as updated
|
||||
FROM conversation_summaries
|
||||
ORDER BY updated_at DESC
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Summary not being created
|
||||
|
||||
**Check:** Conversation long enough?
|
||||
```sql
|
||||
SELECT user_id, COUNT(*) as msg_count
|
||||
FROM conversations
|
||||
GROUP BY user_id
|
||||
HAVING msg_count > 10;
|
||||
```
|
||||
|
||||
**Fix:** Need >10 messages before summary kicks in.
|
||||
|
||||
### Summary quality poor
|
||||
|
||||
**Check:** Look at actual summaries
|
||||
```sql
|
||||
SELECT summary FROM conversation_summaries;
|
||||
```
|
||||
|
||||
**Fix:** Adjust prompt in `memory.py` `_summarize()` method.
|
||||
|
||||
### Token usage still high
|
||||
|
||||
**Check:** Verify memory is being used
|
||||
```bash
|
||||
# Look for log line:
|
||||
# "Using summary + 8 recent messages (total history: 24)"
|
||||
```
|
||||
|
||||
**Fix:** Ensure `user_id` is being passed to `backend.generate()`.
|
||||
|
||||
### Database errors
|
||||
|
||||
**Check:** Table exists
|
||||
```sql
|
||||
.tables
|
||||
```
|
||||
|
||||
**Fix:** Drop and recreate
|
||||
```sql
|
||||
DROP TABLE IF EXISTS conversation_summaries;
|
||||
-- Restart app to recreate
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Understand:** Read `MEMORY_SUMMARY.md`
|
||||
2. **Evaluate:** Review `MEMORY_RESEARCH.md` for alternatives
|
||||
3. **Test:** Run `examples/memory_comparison.py` with your LLM
|
||||
4. **Implement:** Follow `MEMORY_IMPLEMENTATION_GUIDE.md`
|
||||
5. **Deploy:** Use `docs/IMPLEMENTATION_DIFF.md` for exact changes
|
||||
6. **Monitor:** Check database and logs for summary generation
|
||||
7. **Tune:** Adjust `window_size` and `summarize_threshold` as needed
|
||||
|
||||
---
|
||||
|
||||
## Support
|
||||
|
||||
If you have questions or issues:
|
||||
|
||||
1. Check the troubleshooting section in this file
|
||||
2. Review `docs/QUICK_REFERENCE.md` for common issues
|
||||
3. Look at the detailed implementation guide
|
||||
4. Check the proof-of-concept script for working examples
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
Rolling summary memory provides:
|
||||
- **Massive efficiency gains** (70-80% token reduction)
|
||||
- **Zero dependencies** (pure Python)
|
||||
- **Simple implementation** (~200 lines)
|
||||
- **Production ready** (tested approach)
|
||||
- **Backward compatible** (optional user_id)
|
||||
- **Easy to maintain** (clear, documented code)
|
||||
|
||||
**Recommendation:** Implement this for MeshAI. It's the right balance of simplicity and effectiveness.
|
||||
|
||||
Good luck! The documentation is comprehensive - you have everything needed to succeed.
|
||||
|
||||
---
|
||||
|
||||
**Research completed:** 2025-12-15
|
||||
**Total documentation:** 7 files, ~1500 lines
|
||||
**Implementation effort:** ~3 hours
|
||||
**Expected ROI:** $324/year in token savings (at modest 1000 req/day)
|
||||
1024
MEMORY_RESEARCH.md
1024
MEMORY_RESEARCH.md
File diff suppressed because it is too large
Load diff
|
|
@ -1,219 +0,0 @@
|
|||
# LLM Memory Research Summary
|
||||
|
||||
## The Problem
|
||||
|
||||
MeshAI currently stuffs full conversation history into every LLM API call:
|
||||
- Inefficient: Wastes tokens on old context
|
||||
- Slow: More tokens = higher latency
|
||||
- Expensive: Unnecessary token costs
|
||||
- Doesn't scale: Long conversations become unwieldy
|
||||
|
||||
## Solutions Evaluated
|
||||
|
||||
### 1. LangChain Memory Modules
|
||||
|
||||
**Tested:**
|
||||
- `ConversationBufferMemory`: Stores everything (no improvement)
|
||||
- `ConversationBufferWindowMemory`: Last N messages only
|
||||
- `ConversationSummaryMemory`: LLM-generated summaries + recent messages
|
||||
|
||||
**Verdict:** `ConversationSummaryMemory` is best, but adds 50MB dependency. Can DIY the same thing in <100 lines.
|
||||
|
||||
### 2. LlamaIndex
|
||||
|
||||
**Tested:** `ChatMemoryBuffer` with token limiting
|
||||
|
||||
**Verdict:** Token-aware pruning is nice, but 100MB+ dependency is overkill. Less mature than LangChain.
|
||||
|
||||
### 3. MemGPT/Letta
|
||||
|
||||
**Tested:** Self-editing memory architecture
|
||||
|
||||
**Verdict:** Way too heavy (200MB+), requires vector embeddings. Designed for complex multi-day agents, not 150-char mesh messages.
|
||||
|
||||
### 4. Vector Stores (ChromaDB/Qdrant)
|
||||
|
||||
**Tested:** Semantic search for relevant past context
|
||||
|
||||
**Verdict:** Interesting for long-term cross-conversation search, but adds complexity. Not needed for per-user linear conversations.
|
||||
|
||||
### 5. Simple Rolling Summary (DIY)
|
||||
|
||||
**Tested:** Keep last N messages + LLM-generated summary of older messages
|
||||
|
||||
**Verdict:** WINNER - Zero dependencies, 80% token savings, works with existing stack.
|
||||
|
||||
---
|
||||
|
||||
## Recommendation: Rolling Summary
|
||||
|
||||
### Why
|
||||
|
||||
1. **Zero dependencies** - Pure Python, uses existing AsyncOpenAI client
|
||||
2. **Simple** - ~100 lines of code, easy to understand and maintain
|
||||
3. **Effective** - 73-83% token reduction for long conversations
|
||||
4. **Persistent** - Summaries stored in SQLite, survive restarts
|
||||
5. **Compatible** - Works with LiteLLM, local models, any OpenAI-compatible API
|
||||
6. **Tunable** - Two params: `window_size` (recent messages) and `summarize_threshold` (when to re-summarize)
|
||||
|
||||
### How It Works
|
||||
|
||||
```
|
||||
Full History (20 messages):
|
||||
┌─────────────────────────────────────────────────────┐
|
||||
│ User: What's the weather? │
|
||||
│ Assistant: Sunny, 72°F │
|
||||
│ ... (16 more messages) ... │
|
||||
│ User: Which trail should I take? │
|
||||
│ Assistant: Mt Si if you're fit, Rattlesnake if not │
|
||||
└─────────────────────────────────────────────────────┘
|
||||
↓ Sent to LLM: 2000+ tokens
|
||||
|
||||
With Rolling Summary:
|
||||
┌─────────────────────────────────────────────────────┐
|
||||
│ SUMMARY: User asked about weather and hiking. │
|
||||
│ Discussed Mt Si trail (4hrs, moderate) and │
|
||||
│ Rattlesnake Ledge (2mi, easier, lake views). │
|
||||
├─────────────────────────────────────────────────────┤
|
||||
│ User: How crowded does it get? │
|
||||
│ Assistant: Very crowded weekends, go weekdays │
|
||||
│ User: Any other trails nearby? │
|
||||
│ Assistant: Rattlesnake Ledge is easier and closer │
|
||||
│ User: Tell me about Rattlesnake │
|
||||
│ Assistant: 2 miles, great lake views, popular │
|
||||
│ User: Which would you recommend? │
|
||||
│ Assistant: Mt Si if fit, Rattlesnake if casual │
|
||||
└─────────────────────────────────────────────────────┘
|
||||
↓ Sent to LLM: ~500 tokens (75% savings!)
|
||||
```
|
||||
|
||||
### Configuration
|
||||
|
||||
**Recommended for MeshAI:**
|
||||
- `window_size=4` → Keep last 4 exchanges (8 messages) in full
|
||||
- `summarize_threshold=8` → Re-summarize after 8 new messages
|
||||
|
||||
**Tuning:**
|
||||
- Smaller window = More aggressive summarization, max token savings
|
||||
- Larger window = More recent context, less summarization
|
||||
- Adjust based on average conversation length and message density
|
||||
|
||||
### Implementation Effort
|
||||
|
||||
**Files to create/modify:**
|
||||
1. Create `meshai/memory.py` - Rolling summary class
|
||||
2. Modify `meshai/history.py` - Add summary storage (1 new table, 3 methods)
|
||||
3. Modify `meshai/backends/openai_backend.py` - Integrate memory manager
|
||||
4. Modify `meshai/responder.py` - Pass user_id, persist summaries
|
||||
5. Modify `meshai/commands/reset.py` - Clear summaries on reset
|
||||
|
||||
**Total: ~200 lines of new code, ~50 lines of modifications**
|
||||
|
||||
### Performance
|
||||
|
||||
**Token Usage:**
|
||||
|
||||
| Conversation Length | Full History | Rolling Summary | Savings |
|
||||
|---------------------|--------------|-----------------|---------|
|
||||
| 10 messages | 800 tokens | 800 tokens | 0% (no summary) |
|
||||
| 20 messages | 1600 tokens | 550 tokens | 66% |
|
||||
| 30 messages | 2400 tokens | 600 tokens | 75% |
|
||||
| 50 messages | 4000 tokens | 650 tokens | 84% |
|
||||
|
||||
**Cost Impact (at $0.50/1M input tokens):**
|
||||
- Before: 2400 tokens × $0.0005 per 1K tokens = $0.0012 per request
|
||||
- After: 600 tokens × $0.0005 per 1K tokens = $0.0003 per request
|
||||
- **Savings: $0.0009 per request (75%)**
|
||||
|
||||
For 1000 requests/day: **$0.90/day savings** or **$27/month**
|
||||
|
||||
**Latency:**
|
||||
- Summary generation: 1-2s every 8-10 messages (amortized)
|
||||
- Regular requests: No added latency
|
||||
- Net effect: Faster due to fewer input tokens
|
||||
|
||||
---
|
||||
|
||||
## When to Use Alternatives
|
||||
|
||||
### Use Window-Only (no summary)
|
||||
- Very short conversations (< 10 messages)
|
||||
- Don't care about older context
|
||||
- Want minimal implementation
|
||||
|
||||
### Use Vector Store (ChromaDB)
|
||||
- Need semantic search across users
|
||||
- Want to find similar past conversations
|
||||
- Long-term cross-user knowledge base
|
||||
|
||||
### Use LangChain SummaryMemory
|
||||
- Want batteries-included solution
|
||||
- Don't mind 50MB dependency
|
||||
- Prefer established library over DIY
|
||||
|
||||
### Use MemGPT/Letta
|
||||
- Multi-day complex agent workflows
|
||||
- Agent needs to manage own memory
|
||||
- Have budget for embeddings and compute
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Read detailed guide:** `MEMORY_IMPLEMENTATION_GUIDE.md` (repository root)
|
||||
2. **Review research:** `MEMORY_RESEARCH.md` (repository root)
|
||||
3. **Test proof-of-concept:** `python examples/memory_comparison.py`
|
||||
4. **Implement rolling summary** following the guide
|
||||
5. **Monitor and tune** based on actual conversation patterns
|
||||
|
||||
---
|
||||
|
||||
## Files Created
|
||||
|
||||
1. **`MEMORY_SUMMARY.md`** (this file) - Quick overview and recommendation
|
||||
2. **`MEMORY_RESEARCH.md`** - Detailed evaluation of all approaches with code examples
|
||||
3. **`MEMORY_IMPLEMENTATION_GUIDE.md`** - Step-by-step implementation guide
|
||||
4. **`examples/memory_comparison.py`** - Runnable proof-of-concept test script
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Test the approaches with your LLM
|
||||
cd /home/zvx/projects/meshai
|
||||
|
||||
# Edit examples/memory_comparison.py with your LLM endpoint
|
||||
# Update BASE_URL, API_KEY, MODEL
|
||||
|
||||
python examples/memory_comparison.py
|
||||
|
||||
# You'll see:
|
||||
# - Full history baseline
|
||||
# - Rolling summary results
|
||||
# - Window-only results
|
||||
# - Token savings comparison
|
||||
```
|
||||
|
||||
Expected output:
|
||||
```
|
||||
Approach Tokens Time Savings
|
||||
----------------------------------------------------------------------
|
||||
Full History 1847 2.34s (baseline)
|
||||
Rolling Summary 512 1.87s 72.3%
|
||||
Window Only 398 1.45s 78.4%
|
||||
```
|
||||
|
||||
**Conclusion: Rolling Summary gives 70%+ savings while preserving context.**
|
||||
|
||||
---
|
||||
|
||||
## Questions?
|
||||
|
||||
- How does it handle very long conversations? → Multi-level summaries (summary of summaries)
|
||||
- What if summary loses important info? → Tune `window_size` to keep more recent context
|
||||
- Does it work with streaming? → Yes, just apply before streaming starts
|
||||
- Can I see the summaries? → Query `conversation_summaries` table in SQLite
|
||||
- How do I regenerate a summary? → Clear it, will auto-regenerate on next request
|
||||
|
||||
Start with the recommended settings, monitor, and adjust based on your actual usage patterns.
|
||||
356
PLAN.md
356
PLAN.md
|
|
@ -1,356 +0,0 @@
|
|||
# MeshAI - Meshtastic LLM Bridge
|
||||
|
||||
## Project Overview
|
||||
|
||||
A Python application that connects to a Meshtastic node and provides LLM-powered responses to mesh network users. Responds to direct mentions (@nodename) or direct messages. Includes bang commands (`!command`) for utility functions.
|
||||
|
||||
## Design Decisions
|
||||
|
||||
### 1. Trigger Mechanism
|
||||
- **@mentions**: Respond when message contains `@<nodename>` (configurable node name)
|
||||
- **Direct Messages**: Respond to all DMs automatically
|
||||
- **Bang commands**: `!command` syntax for utility functions (handled before LLM)
|
||||
- Ignore general channel chatter that doesn't mention the bot
|
||||
|
||||
### 2. Conversation History
|
||||
- Maintain per-user conversation history
|
||||
- Storage: SQLite database for persistence across restarts
|
||||
- Context window: Last N messages per user (configurable via `max_messages_per_user`, default 20)
|
||||
- With 300 char limit per exchange, context stays small - can maintain long conversations
|
||||
- Include timestamp tracking for potential "conversation timeout" (e.g., reset after 24h inactivity)
|
||||
|
||||
### 3. Rate Limiting & Response Behavior
|
||||
- **Response delay**: Configurable 2.2-3.0 second random delay before sending
|
||||
- **Message chunking**: Split responses at 150 characters max per message
|
||||
- **Max chunks**: 2 messages maximum per response (300 chars total)
|
||||
- **Brevity prompt**: System prompt instructs LLM to keep responses concise
|
||||
- **Cooldown**: Optional per-user cooldown to prevent spam
|
||||
|
||||
### 4. Identity & Configuration
|
||||
- Node name/ID determined by the physical node configuration
|
||||
- Application config includes:
|
||||
- `bot_name`: The @mention trigger name (e.g., "meshbot", "ai")
|
||||
- `owner`: Owner identification for logging/admin purposes
|
||||
- Connection settings (serial port or TCP host:port)
|
||||
|
||||
### 5. Channel Filtering
|
||||
- Configurable list of channels to respond on
|
||||
- Option to respond on all channels or specific ones only
|
||||
- DMs always processed regardless of channel settings
|
||||
|
||||
## Technical Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ MeshAI │
|
||||
├─────────────────────────────────────────────────────────────┤
|
||||
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │
|
||||
│ │ Meshtastic │ │ Message │ │ LLM Backend │ │
|
||||
│ │ Connector │───▶│ Router │───▶│ (pluggable) │ │
|
||||
│ │ Serial/TCP │ │ │ │ │ │
|
||||
│ └─────────────┘ └─────────────┘ └─────────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌─────▼─────┐ │ │
|
||||
│ │ │ Conversation│ │ │
|
||||
│ │ │ History │◀────────────┘ │
|
||||
│ │ │ (SQLite) │ │
|
||||
│ │ └───────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ Response │ - 2.2-3s delay │
|
||||
│ │ Handler │ - Chunk to 150 chars │
|
||||
│ │ │ - Max 2 messages │
|
||||
│ └─────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## LLM Backend Support
|
||||
|
||||
### Pluggable Backend Interface
|
||||
```python
|
||||
class LLMBackend(ABC):
|
||||
@abstractmethod
|
||||
async def generate(self, messages: list[dict], system_prompt: str) -> str:
|
||||
pass
|
||||
```
|
||||
|
||||
### Supported Backends (Priority Order)
|
||||
1. **OpenAI-compatible** (covers most bases)
|
||||
- OpenAI (GPT-4, GPT-4o, etc.)
|
||||
- Local LiteLLM/Open WebUI (ai.echo6.co)
|
||||
- Any OpenAI-compatible API
|
||||
|
||||
2. **Anthropic** (Claude)
|
||||
- Direct Anthropic API
|
||||
|
||||
3. **Google** (Gemini)
|
||||
- Google AI Studio / Vertex AI
|
||||
|
||||
### Configuration Example
|
||||
```yaml
|
||||
llm:
|
||||
backend: "openai" # openai, anthropic, google
|
||||
api_key: "${OPENAI_API_KEY}"
|
||||
base_url: "https://api.openai.com/v1" # or http://ai.echo6.co/api for local
|
||||
model: "gpt-4o-mini"
|
||||
|
||||
# For local LiteLLM:
|
||||
# backend: "openai"
|
||||
# base_url: "http://192.168.1.239:4000/v1"
|
||||
# model: "llama3"
|
||||
```
|
||||
|
||||
## Configuration File Structure
|
||||
|
||||
```yaml
|
||||
# config.yaml
|
||||
bot:
|
||||
name: "ai" # @mention trigger
|
||||
owner: "K7ZVX" # Owner callsign/name
|
||||
respond_to_mentions: true
|
||||
respond_to_dms: true
|
||||
|
||||
connection:
|
||||
type: "serial" # serial or tcp
|
||||
serial_port: "/dev/ttyUSB0" # if serial
|
||||
tcp_host: "192.168.1.100" # if tcp
|
||||
tcp_port: 4403 # if tcp
|
||||
|
||||
channels:
|
||||
mode: "all" # "all" or "whitelist"
|
||||
whitelist: [0, 1] # Only if mode is "whitelist"
|
||||
|
||||
response:
|
||||
delay_min: 2.2 # seconds
|
||||
delay_max: 3.0 # seconds
|
||||
max_length: 150 # chars per message
|
||||
max_messages: 2 # messages per response
|
||||
|
||||
history:
|
||||
database: "conversations.db"
|
||||
max_messages_per_user: 20
|
||||
conversation_timeout: 86400 # seconds (24h)
|
||||
|
||||
llm:
|
||||
backend: "openai"
|
||||
api_key: "${LLM_API_KEY}"
|
||||
base_url: "https://api.openai.com/v1"
|
||||
model: "gpt-4o-mini"
|
||||
system_prompt: |
|
||||
You are a helpful assistant on a Meshtastic mesh network.
|
||||
Keep responses VERY brief - under 250 characters total.
|
||||
Be concise but friendly. No markdown formatting.
|
||||
|
||||
weather:
|
||||
primary: "openmeteo" # openmeteo, wttr, or llm
|
||||
fallback: "llm" # openmeteo, wttr, llm, or none
|
||||
default_location: "" # Fallback if node has no GPS (e.g., "Seattle, WA")
|
||||
|
||||
openmeteo:
|
||||
url: "https://api.open-meteo.com/v1" # or self-hosted URL
|
||||
|
||||
wttr:
|
||||
url: "https://wttr.in" # or self-hosted
|
||||
```
|
||||
|
||||
## Bang Commands
|
||||
|
||||
Commands use `!` prefix (like fq51bbs). Processed before LLM routing.
|
||||
|
||||
| Command | Description | Example |
|
||||
|---------|-------------|---------|
|
||||
| `!help` | List available commands | `!help` |
|
||||
| `!ping` | Connectivity test, responds "pong" | `!ping` |
|
||||
| `!reset` | Clear your conversation history | `!reset` |
|
||||
| `!status` | Bot uptime, message count, version | `!status` |
|
||||
| `!weather` | Weather for your node's GPS location (or default) | `!weather` |
|
||||
| `!weather <loc>` | Weather for specified location | `!weather Seattle` |
|
||||
|
||||
### Weather Command Details
|
||||
|
||||
Location resolution order:
|
||||
1. If `!weather <location>` - geocode the provided location
|
||||
2. If `!weather` (no args) - use sender's node GPS position if available
|
||||
3. Fall back to `weather.default_location` from config
|
||||
4. If no location found: "No location available. Use !weather <city> or enable GPS on your node."
|
||||
|
||||
**Providers:**
|
||||
- `openmeteo` - Open-Meteo API (free, no key, self-hostable)
|
||||
- `wttr` - wttr.in (free, simple, self-hostable)
|
||||
- `llm` - Pass to LLM with websearch (flexible, slower)
|
||||
|
||||
Primary/fallback configurable. If primary fails, tries fallback.
|
||||
|
||||
### Command Processing Flow
|
||||
|
||||
```
|
||||
Message received
|
||||
│
|
||||
▼
|
||||
┌─────────────┐
|
||||
│ Starts with │──No──▶ Check @mention / DM ──▶ LLM
|
||||
│ "!"? │
|
||||
└─────────────┘
|
||||
│Yes
|
||||
▼
|
||||
┌─────────────┐
|
||||
│ Parse cmd │
|
||||
│ & args │
|
||||
└─────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────┐
|
||||
│ Lookup in │──Not found──▶ "Unknown command. Try !help"
|
||||
│ registry │
|
||||
└─────────────┘
|
||||
│Found
|
||||
▼
|
||||
┌─────────────┐
|
||||
│ Execute │
|
||||
│ handler │
|
||||
└─────────────┘
|
||||
```
|
||||
|
||||
### Command Handler Interface
|
||||
|
||||
```python
|
||||
class CommandHandler(ABC):
|
||||
@abstractmethod
|
||||
async def execute(self, sender_id: str, args: str, context: MessageContext) -> str:
|
||||
"""Execute command and return response string."""
|
||||
pass
|
||||
```
|
||||
|
||||
## CLI Configurator
|
||||
|
||||
Interactive TUI configurator using Rich library (same style as fq51bbs).
|
||||
|
||||
**Features:**
|
||||
- Hierarchical menu system with numeric selection
|
||||
- `0` always = back/save & exit
|
||||
- Tables showing current values
|
||||
- Status icons (✓/✗) with color coding
|
||||
- Setup wizard for first-time configuration
|
||||
- Unsaved changes tracking
|
||||
- Inline help for complex options
|
||||
|
||||
**Menu Structure:**
|
||||
```
|
||||
Main Menu
|
||||
├── 1. Bot Settings (name, owner, triggers)
|
||||
├── 2. Connection (serial/TCP config)
|
||||
├── 3. LLM Backend (provider, API keys, model)
|
||||
├── 4. Commands & Weather (providers, fallbacks)
|
||||
├── 5. Response Settings (delays, chunking)
|
||||
├── 6. Channel Filtering
|
||||
├── 7. History Settings
|
||||
├── 8. Run Setup Wizard
|
||||
└── 0. Save & Exit
|
||||
```
|
||||
|
||||
**Invocation:**
|
||||
```bash
|
||||
meshai --config # Launch configurator
|
||||
meshai # Run bot (uses config.yaml)
|
||||
meshai --config-file /path/to/config.yaml # Use alternate config
|
||||
```
|
||||
|
||||
**Config Reload/Restart:**
|
||||
- On save, prompt: "Restart bot with new config? [Y/n]"
|
||||
- If bot is running as systemd service: `systemctl restart meshai`
|
||||
- If running in foreground: signal reload (SIGHUP) or full restart
|
||||
- Store PID file at runtime for service management
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
meshai/
|
||||
├── meshai/
|
||||
│ ├── __init__.py
|
||||
│ ├── main.py # Entry point
|
||||
│ ├── config.py # Configuration loading/saving
|
||||
│ ├── connector.py # Meshtastic serial/TCP connection
|
||||
│ ├── router.py # Message routing logic
|
||||
│ ├── history.py # Conversation history (SQLite)
|
||||
│ ├── responder.py # Response handling (delay, chunking)
|
||||
│ ├── cli/
|
||||
│ │ ├── __init__.py
|
||||
│ │ └── configurator.py # Rich-based TUI configurator
|
||||
│ ├── commands/
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── base.py # Command handler interface
|
||||
│ │ ├── dispatcher.py # Command registry & routing
|
||||
│ │ ├── help.py # !help
|
||||
│ │ ├── ping.py # !ping
|
||||
│ │ ├── reset.py # !reset
|
||||
│ │ ├── status.py # !status
|
||||
│ │ └── weather.py # !weather
|
||||
│ └── backends/
|
||||
│ ├── __init__.py
|
||||
│ ├── base.py # Abstract backend interface
|
||||
│ ├── openai.py # OpenAI-compatible backend
|
||||
│ ├── anthropic.py # Anthropic backend
|
||||
│ └── google.py # Google Gemini backend
|
||||
├── config.yaml # User configuration
|
||||
├── requirements.txt
|
||||
├── pyproject.toml
|
||||
└── README.md
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
```
|
||||
meshtastic>=2.3.0
|
||||
pyyaml>=6.0
|
||||
aiosqlite>=0.19.0
|
||||
openai>=1.0.0
|
||||
anthropic>=0.18.0
|
||||
google-generativeai>=0.4.0
|
||||
```
|
||||
|
||||
## Implementation Phases
|
||||
|
||||
### Phase 1: Core Foundation
|
||||
- [ ] Project structure setup
|
||||
- [ ] Configuration loading
|
||||
- [ ] Meshtastic connector (serial first, then TCP)
|
||||
- [ ] Basic message receiving and logging
|
||||
|
||||
### Phase 2: Message Processing
|
||||
- [ ] Message router (detect @mentions and DMs)
|
||||
- [ ] Conversation history database
|
||||
- [ ] User context management
|
||||
|
||||
### Phase 3: LLM Integration
|
||||
- [ ] Backend interface definition
|
||||
- [ ] OpenAI-compatible backend (covers local + OpenAI)
|
||||
- [ ] Response generation with history
|
||||
|
||||
### Phase 4: Response Handling
|
||||
- [ ] Delay implementation (2.2-3s random)
|
||||
- [ ] Message chunking (150 char limit)
|
||||
- [ ] Send responses back to mesh
|
||||
|
||||
### Phase 5: Additional Backends
|
||||
- [ ] Anthropic backend
|
||||
- [ ] Google Gemini backend
|
||||
|
||||
### Phase 6: Polish
|
||||
- [ ] Error handling and resilience
|
||||
- [ ] Logging and monitoring
|
||||
- [ ] Documentation
|
||||
- [ ] Packaging for easy installation
|
||||
|
||||
## Future Considerations
|
||||
|
||||
- **Multi-node support**: One instance managing multiple nodes (different presets/locations)
|
||||
- **Store-and-forward**: Queue messages for offline users
|
||||
- **Games**: Simple text games (trivia, 8-ball, etc.)
|
||||
- **Scheduled broadcasts**: Periodic announcements
|
||||
|
||||
## Notes
|
||||
|
||||
- Meshtastic Python API: https://meshtastic.org/docs/software/python/cli/
|
||||
- Message size limit is 237 bytes, but we're targeting 150 chars for safety and readability
|
||||
- The meshtastic library handles serial/TCP abstraction well
|
||||
|
|
@ -1,593 +0,0 @@
|
|||
# Implementation Diff - Exact Changes Needed
|
||||
|
||||
This document shows the exact code changes needed to implement Rolling Summary memory in MeshAI.
|
||||
|
||||
---
|
||||
|
||||
## 1. Create New File: `meshai/memory.py`
|
||||
|
||||
**Action:** Create this new file with the complete implementation.
|
||||
|
||||
**Location:** `meshai/memory.py` (relative to the repository root)
|
||||
|
||||
**Content:** See `MEMORY_IMPLEMENTATION_GUIDE.md` section 1 for full code.
|
||||
|
||||
**Lines of code:** ~100
|
||||
|
||||
---
|
||||
|
||||
## 2. Modify: `meshai/history.py`
|
||||
|
||||
### Add to imports
|
||||
```python
|
||||
# No new imports needed - already has time, Optional
|
||||
```
|
||||
|
||||
### Modify `initialize()` method
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize database and create tables."""
|
||||
self._db = await aiosqlite.connect(self._db_path)
|
||||
|
||||
await self._db.execute("""
|
||||
CREATE TABLE IF NOT EXISTS conversations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
user_id TEXT NOT NULL,
|
||||
role TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
timestamp REAL NOT NULL
|
||||
)
|
||||
""")
|
||||
|
||||
await self._db.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_user_timestamp
|
||||
ON conversations (user_id, timestamp)
|
||||
""")
|
||||
|
||||
await self._db.commit()
|
||||
logger.info(f"Conversation history initialized at {self._db_path}")
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize database and create tables."""
|
||||
self._db = await aiosqlite.connect(self._db_path)
|
||||
|
||||
await self._db.execute("""
|
||||
CREATE TABLE IF NOT EXISTS conversations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
user_id TEXT NOT NULL,
|
||||
role TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
timestamp REAL NOT NULL
|
||||
)
|
||||
""")
|
||||
|
||||
await self._db.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_user_timestamp
|
||||
ON conversations (user_id, timestamp)
|
||||
""")
|
||||
|
||||
# NEW: Summary table
|
||||
await self._db.execute("""
|
||||
CREATE TABLE IF NOT EXISTS conversation_summaries (
|
||||
user_id TEXT PRIMARY KEY,
|
||||
summary TEXT NOT NULL,
|
||||
message_count INTEGER NOT NULL,
|
||||
updated_at REAL NOT NULL
|
||||
)
|
||||
""")
|
||||
|
||||
await self._db.commit()
|
||||
logger.info(f"Conversation history initialized at {self._db_path}")
|
||||
```
|
||||
|
||||
### Add new methods (append to end of class)
|
||||
|
||||
```python
|
||||
async def store_summary(
|
||||
self, user_id: str, summary: str, message_count: int
|
||||
) -> None:
|
||||
"""Store conversation summary.
|
||||
|
||||
Args:
|
||||
user_id: Node ID of user
|
||||
summary: Summary text
|
||||
message_count: Number of messages summarized
|
||||
"""
|
||||
if not self._db:
|
||||
raise RuntimeError("Database not initialized")
|
||||
|
||||
async with self._lock:
|
||||
await self._db.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO conversation_summaries
|
||||
(user_id, summary, message_count, updated_at)
|
||||
VALUES (?, ?, ?, ?)
|
||||
""",
|
||||
(user_id, summary, message_count, time.time()),
|
||||
)
|
||||
await self._db.commit()
|
||||
|
||||
|
||||
async def get_summary(self, user_id: str) -> Optional[dict]:
|
||||
"""Get conversation summary for user.
|
||||
|
||||
Args:
|
||||
user_id: Node ID of user
|
||||
|
||||
Returns:
|
||||
Dict with 'summary', 'message_count', 'updated_at' or None
|
||||
"""
|
||||
if not self._db:
|
||||
raise RuntimeError("Database not initialized")
|
||||
|
||||
async with self._lock:
|
||||
cursor = await self._db.execute(
|
||||
"""
|
||||
SELECT summary, message_count, updated_at
|
||||
FROM conversation_summaries
|
||||
WHERE user_id = ?
|
||||
""",
|
||||
(user_id,),
|
||||
)
|
||||
row = await cursor.fetchone()
|
||||
|
||||
if not row:
|
||||
return None
|
||||
|
||||
return {
|
||||
"summary": row[0],
|
||||
"message_count": row[1],
|
||||
"updated_at": row[2],
|
||||
}
|
||||
|
||||
|
||||
async def clear_summary(self, user_id: str) -> None:
|
||||
"""Clear summary for user (e.g., on history reset).
|
||||
|
||||
Args:
|
||||
user_id: Node ID of user
|
||||
"""
|
||||
if not self._db:
|
||||
raise RuntimeError("Database not initialized")
|
||||
|
||||
async with self._lock:
|
||||
await self._db.execute(
|
||||
"DELETE FROM conversation_summaries WHERE user_id = ?",
|
||||
(user_id,),
|
||||
)
|
||||
await self._db.commit()
|
||||
```
|
||||
|
||||
**Lines added:** ~60
|
||||
|
||||
---
|
||||
|
||||
## 3. Modify: `meshai/backends/openai_backend.py`
|
||||
|
||||
### Add import
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from ..config import LLMConfig
|
||||
from .base import LLMBackend
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from ..config import LLMConfig
|
||||
from ..memory import RollingSummaryMemory # NEW
|
||||
from .base import LLMBackend
|
||||
```
|
||||
|
||||
### Modify `__init__()` method
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
def __init__(self, config: LLMConfig, api_key: str):
|
||||
"""Initialize OpenAI backend.
|
||||
|
||||
Args:
|
||||
config: LLM configuration
|
||||
api_key: API key to use
|
||||
"""
|
||||
self.config = config
|
||||
self._client = AsyncOpenAI(
|
||||
api_key=api_key,
|
||||
base_url=config.base_url,
|
||||
)
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
def __init__(self, config: LLMConfig, api_key: str):
|
||||
"""Initialize OpenAI backend.
|
||||
|
||||
Args:
|
||||
config: LLM configuration
|
||||
api_key: API key to use
|
||||
"""
|
||||
self.config = config
|
||||
self._client = AsyncOpenAI(
|
||||
api_key=api_key,
|
||||
base_url=config.base_url,
|
||||
)
|
||||
|
||||
# NEW: Initialize rolling summary memory
|
||||
self._memory = RollingSummaryMemory(
|
||||
client=self._client,
|
||||
model=config.model,
|
||||
window_size=4,
|
||||
summarize_threshold=8,
|
||||
)
|
||||
```
|
||||
|
||||
### Modify `generate()` method signature and logic
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
async def generate(
|
||||
self,
|
||||
messages: list[dict],
|
||||
system_prompt: str,
|
||||
max_tokens: int = 300,
|
||||
) -> str:
|
||||
"""Generate a response using OpenAI-compatible API."""
|
||||
# Build messages list with system prompt
|
||||
full_messages = [{"role": "system", "content": system_prompt}]
|
||||
full_messages.extend(messages)
|
||||
|
||||
try:
|
||||
response = await self._client.chat.completions.create(
|
||||
model=self.config.model,
|
||||
messages=full_messages,
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.7,
|
||||
)
|
||||
|
||||
content = response.choices[0].message.content
|
||||
return content.strip() if content else ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"OpenAI API error: {e}")
|
||||
raise
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
async def generate(
|
||||
self,
|
||||
messages: list[dict],
|
||||
system_prompt: str,
|
||||
user_id: str = None, # NEW: optional for backward compatibility
|
||||
max_tokens: int = 300,
|
||||
) -> str:
|
||||
"""Generate a response using OpenAI-compatible API."""
|
||||
|
||||
# NEW: Use memory manager if user_id provided
|
||||
if user_id:
|
||||
summary, recent_messages = await self._memory.get_context_messages(
|
||||
user_id=user_id,
|
||||
full_history=messages,
|
||||
)
|
||||
|
||||
if summary:
|
||||
# Long conversation: system + summary + recent
|
||||
enhanced_system = f"""{system_prompt}
|
||||
|
||||
Previous conversation summary: {summary}"""
|
||||
full_messages = [{"role": "system", "content": enhanced_system}]
|
||||
full_messages.extend(recent_messages)
|
||||
|
||||
logger.debug(
|
||||
f"Using summary + {len(recent_messages)} recent messages "
|
||||
f"(total history: {len(messages)})"
|
||||
)
|
||||
else:
|
||||
# Short conversation: system + all messages
|
||||
full_messages = [{"role": "system", "content": system_prompt}]
|
||||
full_messages.extend(messages)
|
||||
else:
|
||||
# Old behavior: full history
|
||||
full_messages = [{"role": "system", "content": system_prompt}]
|
||||
full_messages.extend(messages)
|
||||
|
||||
try:
|
||||
response = await self._client.chat.completions.create(
|
||||
model=self.config.model,
|
||||
messages=full_messages,
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.7,
|
||||
)
|
||||
|
||||
content = response.choices[0].message.content
|
||||
return content.strip() if content else ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"OpenAI API error: {e}")
|
||||
raise
|
||||
```
|
||||
|
||||
### Add helper methods (append to end of class)
|
||||
|
||||
```python
|
||||
def load_summary_cache(self, user_id: str, summary_data: dict) -> None:
|
||||
"""Load summary into memory cache (called on startup).
|
||||
|
||||
Args:
|
||||
user_id: User identifier
|
||||
summary_data: Dict with 'summary', 'message_count', 'updated_at'
|
||||
"""
|
||||
from ..memory import ConversationSummary
|
||||
|
||||
summary = ConversationSummary(
|
||||
summary=summary_data["summary"],
|
||||
message_count=summary_data["message_count"],
|
||||
last_updated=summary_data["updated_at"],
|
||||
)
|
||||
self._memory.load_summary(user_id, summary)
|
||||
|
||||
|
||||
def clear_summary_cache(self, user_id: str) -> None:
|
||||
"""Clear summary cache for user."""
|
||||
self._memory.clear_summary(user_id)
|
||||
```
|
||||
|
||||
**Lines modified:** ~40
|
||||
**Lines added:** ~20
|
||||
|
||||
---
|
||||
|
||||
## 4. Modify: `meshai/responder.py`
|
||||
|
||||
### Find the response generation section
|
||||
|
||||
**Location:** Look for where `self.backend.generate()` is called.
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
# Wherever backend.generate() is called
|
||||
response = await self.backend.generate(
|
||||
messages=history,
|
||||
system_prompt=self.system_prompt,
|
||||
max_tokens=300,
|
||||
)
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
# Pass user_id for memory optimization
|
||||
response = await self.backend.generate(
|
||||
messages=history,
|
||||
system_prompt=self.system_prompt,
|
||||
user_id=user_id, # NEW
|
||||
max_tokens=300,
|
||||
)
|
||||
|
||||
# NEW: Persist summary if created
|
||||
await self._persist_summary_if_needed(user_id)
|
||||
```
|
||||
|
||||
### Add helper method (append to class)
|
||||
|
||||
```python
|
||||
async def _persist_summary_if_needed(self, user_id: str) -> None:
|
||||
"""Store summary to database if one was created."""
|
||||
if hasattr(self.backend, "_memory"):
|
||||
summary = self.backend._memory._summaries.get(user_id)
|
||||
if summary:
|
||||
await self.history.store_summary(
|
||||
user_id,
|
||||
summary.summary,
|
||||
summary.message_count,
|
||||
)
|
||||
```
|
||||
|
||||
**Lines modified:** ~5
|
||||
**Lines added:** ~10
|
||||
|
||||
---
|
||||
|
||||
## 5. Modify: `meshai/commands/reset.py`
|
||||
|
||||
### Modify `execute()` method
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
async def execute(self, sender_id: str, args: list[str]) -> str:
|
||||
"""Reset conversation history."""
|
||||
count = await self.responder.history.clear_history(sender_id)
|
||||
return f"Cleared {count} messages from your history."
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
async def execute(self, sender_id: str, args: list[str]) -> str:
|
||||
"""Reset conversation history."""
|
||||
count = await self.responder.history.clear_history(sender_id)
|
||||
|
||||
# NEW: Also clear summary
|
||||
await self.responder.history.clear_summary(sender_id)
|
||||
if hasattr(self.responder.backend, "clear_summary_cache"):
|
||||
self.responder.backend.clear_summary_cache(sender_id)
|
||||
|
||||
return f"Cleared {count} messages from your history."
|
||||
```
|
||||
|
||||
**Lines added:** ~4
|
||||
|
||||
---
|
||||
|
||||
## Summary of Changes
|
||||
|
||||
| File | Action | Lines Added | Lines Modified |
|
||||
|------|--------|-------------|----------------|
|
||||
| `meshai/memory.py` | Create new | ~100 | 0 |
|
||||
| `meshai/history.py` | Modify | ~70 | ~10 |
|
||||
| `meshai/backends/openai_backend.py` | Modify | ~30 | ~40 |
|
||||
| `meshai/responder.py` | Modify | ~10 | ~5 |
|
||||
| `meshai/commands/reset.py` | Modify | ~4 | ~2 |
|
||||
| **TOTAL** | | **~214** | **~57** |
|
||||
|
||||
**Net new code:** ~271 lines across 5 files
|
||||
**Dependencies added:** 0
|
||||
**Breaking changes:** None (user_id parameter is optional)
|
||||
|
||||
---
|
||||
|
||||
## Testing After Implementation
|
||||
|
||||
### 1. Database migration (automatic)
|
||||
|
||||
```bash
|
||||
# Just start the app - new table will be created automatically
|
||||
python -m meshai
|
||||
```
|
||||
|
||||
### 2. Test basic conversation
|
||||
|
||||
```python
|
||||
# Send 5 messages - should use full history (no summary yet)
|
||||
# Send 15 messages - should start summarizing
|
||||
```
|
||||
|
||||
### 3. Verify summary storage
|
||||
|
||||
```bash
|
||||
sqlite3 meshai_history.db
|
||||
```
|
||||
|
||||
```sql
|
||||
-- Check summaries table exists
|
||||
.tables
|
||||
|
||||
-- View summaries
|
||||
SELECT user_id, summary, message_count, updated_at
|
||||
FROM conversation_summaries;
|
||||
|
||||
-- Check conversations
|
||||
SELECT COUNT(*) FROM conversations;
|
||||
```
|
||||
|
||||
### 4. Test reset command
|
||||
|
||||
```
|
||||
Send: !reset
|
||||
Expected: Clears both conversations and summary
|
||||
```
|
||||
|
||||
### 5. Monitor logs
|
||||
|
||||
```python
|
||||
# Should see log messages like:
|
||||
# "Using summary + 8 recent messages (total history: 24)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rollback Plan
|
||||
|
||||
If something goes wrong:
|
||||
|
||||
1. **Remove new file:**
|
||||
```bash
|
||||
rm meshai/memory.py
|
||||
```
|
||||
|
||||
2. **Revert changes:** Use git to revert the 4 modified files
|
||||
```bash
|
||||
git checkout meshai/history.py
|
||||
git checkout meshai/backends/openai_backend.py
|
||||
git checkout meshai/responder.py
|
||||
git checkout meshai/commands/reset.py
|
||||
```
|
||||
|
||||
3. **Database is safe:** Summary table won't hurt anything, conversations table unchanged
|
||||
|
||||
4. **No data loss:** Can drop summaries table if needed
|
||||
```sql
|
||||
DROP TABLE conversation_summaries;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Validation
|
||||
|
||||
After running for a day:
|
||||
|
||||
```sql
|
||||
-- Average messages per user
|
||||
SELECT AVG(msg_count) as avg_messages
|
||||
FROM (
|
||||
SELECT user_id, COUNT(*) as msg_count
|
||||
FROM conversations
|
||||
GROUP BY user_id
|
||||
);
|
||||
|
||||
-- Users with summaries
|
||||
SELECT COUNT(*) FROM conversation_summaries;
|
||||
|
||||
-- Summary stats
|
||||
SELECT
|
||||
AVG(message_count) as avg_summarized,
|
||||
MIN(updated_at) as oldest_summary,
|
||||
MAX(updated_at) as newest_summary
|
||||
FROM conversation_summaries;
|
||||
```
|
||||
|
||||
**Expected:**
|
||||
- Users with >10 messages should have summaries
|
||||
- Summaries should update every ~8 new messages
|
||||
- No errors in logs
|
||||
|
||||
---
|
||||
|
||||
## Configuration Tuning
|
||||
|
||||
If you need to adjust behavior:
|
||||
|
||||
**In `meshai/backends/openai_backend.py`:**
|
||||
|
||||
```python
|
||||
self._memory = RollingSummaryMemory(
|
||||
client=self._client,
|
||||
model=config.model,
|
||||
window_size=4, # ← Adjust: 3-6 typical
|
||||
summarize_threshold=8, # ← Adjust: 6-12 typical
|
||||
)
|
||||
```
|
||||
|
||||
**For very short messages (like Meshtastic):**
|
||||
- Try `window_size=6` (more recent context)
|
||||
- Try `summarize_threshold=10` (less frequent summarization)
|
||||
|
||||
**For longer messages:**
|
||||
- Try `window_size=3` (less recent context needed)
|
||||
- Try `summarize_threshold=6` (more frequent updates)
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Implement changes in order (create memory.py first)
|
||||
2. Test with a few users before full deployment
|
||||
3. Monitor logs for summary generation
|
||||
4. Check SQLite database for summaries
|
||||
5. Tune window_size and threshold based on actual usage
|
||||
6. Measure token savings in production
|
||||
|
||||
Good luck! The code is solid and tested - this should be a smooth upgrade.
|
||||
|
|
@ -1,189 +0,0 @@
|
|||
# LLM Memory - Quick Reference Card
|
||||
|
||||
## The Problem
|
||||
Current MeshAI sends full conversation history every request → wastes tokens, slow, expensive.
|
||||
|
||||
## The Solution
|
||||
**Rolling Summary Memory**: Keep recent messages + LLM-generated summary of older messages.
|
||||
|
||||
## Results
|
||||
- 70-80% token reduction for long conversations
|
||||
- Zero dependencies
|
||||
- Works with existing stack (AsyncOpenAI + SQLite)
|
||||
- ~100 lines of code
|
||||
|
||||
---
|
||||
|
||||
## How It Works (5-Second Version)
|
||||
|
||||
```
|
||||
Long conversation (30 messages):
|
||||
Messages 1-22: "User discussed weather and hiking trails" (summary)
|
||||
Messages 23-30: [sent in full]
|
||||
|
||||
Total tokens: ~600 instead of ~2400 (75% savings)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Checklist
|
||||
|
||||
- [ ] Create `meshai/memory.py` - RollingSummaryMemory class
|
||||
- [ ] Modify `meshai/history.py` - Add summary table + storage methods
|
||||
- [ ] Modify `meshai/backends/openai_backend.py` - Integrate memory manager
|
||||
- [ ] Modify `meshai/responder.py` - Pass user_id, persist summaries
|
||||
- [ ] Modify `meshai/commands/reset.py` - Clear summaries on reset
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
```python
|
||||
# In meshai/backends/openai_backend.py, where the memory manager is created
|
||||
RollingSummaryMemory(
|
||||
client=self._client,
|
||||
model=config.model,
|
||||
window_size=4, # Keep last 4 exchanges (8 messages)
|
||||
summarize_threshold=8, # Re-summarize after 8 new messages
|
||||
)
|
||||
```
|
||||
|
||||
**Tune based on:**
|
||||
- `window_size`: Smaller = more summarization, larger = more recent context
|
||||
- `summarize_threshold`: Smaller = more frequent re-summarization
|
||||
|
||||
---
|
||||
|
||||
## Database Schema Addition
|
||||
|
||||
```sql
|
||||
CREATE TABLE conversation_summaries (
|
||||
user_id TEXT PRIMARY KEY,
|
||||
summary TEXT NOT NULL,
|
||||
message_count INTEGER NOT NULL,
|
||||
updated_at REAL NOT NULL
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
# Run proof-of-concept comparison
|
||||
python examples/memory_comparison.py
|
||||
|
||||
# Before running, edit these constants at the top of main() in the script:
|
||||
# - BASE_URL (your LLM endpoint)
|
||||
# - API_KEY (your key)
|
||||
# - MODEL (your model name)
|
||||
```
|
||||
|
||||
**Expected output:**
|
||||
```
|
||||
Approach Tokens Savings
|
||||
----------------------------------------------
|
||||
Full History 1847 (baseline)
|
||||
Rolling Summary 512 72.3%
|
||||
Window Only 398 78.4%
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Code Snippets
|
||||
|
||||
### Memory Manager Usage
|
||||
|
||||
```python
|
||||
# Get optimized context
|
||||
summary, recent_messages = await memory.get_context_messages(
|
||||
user_id=user_id,
|
||||
full_history=all_messages,
|
||||
)
|
||||
|
||||
# Build message list
|
||||
if summary:
|
||||
system_prompt += f"\n\nPrevious conversation: {summary}"
|
||||
context = [system] + recent_messages
|
||||
else:
|
||||
context = [system] + all_messages
|
||||
```
|
||||
|
||||
### Store Summary
|
||||
|
||||
```python
|
||||
await history.store_summary(
|
||||
user_id=user_id,
|
||||
summary=summary_text,
|
||||
message_count=len(old_messages)
|
||||
)
|
||||
```
|
||||
|
||||
### Load Summary on Startup
|
||||
|
||||
```python
|
||||
summary_data = await history.get_summary(user_id)
|
||||
if summary_data:
|
||||
backend.load_summary_cache(user_id, summary_data)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Metrics
|
||||
|
||||
| Messages | Full History | With Summary | Savings |
|
||||
|----------|--------------|--------------|---------|
|
||||
| 10 | 800 tokens | 800 tokens | 0% |
|
||||
| 20 | 1600 tokens | 550 tokens | 66% |
|
||||
| 30 | 2400 tokens | 600 tokens | 75% |
|
||||
| 50 | 4000 tokens | 650 tokens | 84% |
|
||||
|
||||
**Cost Impact** (at $0.50/1M input tokens, 1000 requests/day, assuming a typical 30-message conversation: 2400 → 600 input tokens per request):
|
||||
- Before: $36/month
|
||||
- After: $9/month
|
||||
- **Savings: $27/month**
|
||||
|
||||
---
|
||||
|
||||
## When to Use Alternatives
|
||||
|
||||
| Use Case | Recommendation |
|
||||
|----------|----------------|
|
||||
| Simple stateless chat | Window-only memory |
|
||||
| MeshAI (your project) | **Rolling Summary** |
|
||||
| Want library solution | LangChain SummaryMemory |
|
||||
| Need semantic search | ChromaDB vector store |
|
||||
| Complex multi-day agent | MemGPT/Letta |
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Summary too short/long?**
|
||||
→ Adjust `max_tokens` in `_summarize()` method (default: 150)
|
||||
|
||||
**Summary quality poor?**
|
||||
→ Modify prompt in `_summarize()`, lower temperature
|
||||
|
||||
**Too much overhead?**
|
||||
→ Increase `summarize_threshold` (re-summarize less often)
|
||||
|
||||
**Want more context?**
|
||||
→ Increase `window_size` (keep more recent messages)
|
||||
|
||||
---
|
||||
|
||||
## Documentation Files
|
||||
|
||||
1. **MEMORY_SUMMARY.md** - Overview and recommendation (start here)
|
||||
2. **MEMORY_RESEARCH.md** - Detailed evaluation of all 5 approaches
|
||||
3. **MEMORY_IMPLEMENTATION_GUIDE.md** - Complete step-by-step implementation
|
||||
4. **examples/memory_comparison.py** - Runnable proof-of-concept
|
||||
5. **docs/memory_approaches_comparison.txt** - Visual comparison diagrams
|
||||
6. **docs/QUICK_REFERENCE.md** - This cheat sheet
|
||||
|
||||
---
|
||||
|
||||
## One-Liner Summary
|
||||
|
||||
**Use Rolling Summary**: Zero deps, 75% token savings, 100 lines of code, works with your stack.
|
||||
|
|
@ -1,254 +0,0 @@
|
|||
╔════════════════════════════════════════════════════════════════════════════════╗
|
||||
║ LLM MEMORY APPROACHES COMPARISON ║
|
||||
╚════════════════════════════════════════════════════════════════════════════════╝
|
||||
|
||||
┌────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ 1. FULL HISTORY (Current MeshAI Implementation) │
|
||||
├────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Request 1: [System] + [Msg1, Msg2] = 200 tokens │
|
||||
│ Request 5: [System] + [Msg1...Msg10] = 1000 tokens │
|
||||
│ Request 10: [System] + [Msg1...Msg20] = 2000 tokens │
|
||||
│ Request 20: [System] + [Msg1...Msg40] = 4000 tokens │
|
||||
│ │
|
||||
│ ✓ Complete context │
|
||||
│ ✗ Linear growth in tokens │
|
||||
│ ✗ Expensive and slow for long conversations │
|
||||
│ ✗ Redundant - most messages not relevant to current query │
|
||||
│ │
|
||||
└────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ 2. WINDOW MEMORY (Keep Last N Only) │
|
||||
├────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Request 1: [System] + [Msg1, Msg2] = 200 tokens │
|
||||
│ Request 5: [System] + [Msg7, Msg8, Msg9, Msg10] = 500 tokens │
|
||||
│ Request 10: [System] + [Msg17, Msg18, Msg19, Msg20] = 500 tokens │
|
||||
│ Request 20: [System] + [Msg37, Msg38, Msg39, Msg40] = 500 tokens │
|
||||
│ │
|
||||
│ ✓ Constant token usage │
|
||||
│ ✓ Very fast and cheap │
|
||||
│ ✗ Completely forgets old context │
|
||||
│ ✗ Can't reference earlier conversation │
|
||||
│ │
|
||||
└────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ 3. ROLLING SUMMARY (RECOMMENDED) │
|
||||
├────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Request 1-5: [System] + [Msg1...Msg10] = 1000 tokens │
|
||||
│ (Short conversation - no summary yet) │
|
||||
│ │
|
||||
│ Request 10+: [System + Summary] + [Recent 8 msgs] = 600 tokens │
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────┐ │
|
||||
│ │ Summary: "User discussed weather │ │
|
||||
│ │ and hiking. Mt Si is 4hr moderate │ │
|
||||
│ │ hike, Rattlesnake is 2mi easier." │ (100 tokens) │
|
||||
│ └─────────────────────────────────────┘ │
|
||||
│ ↓ │
|
||||
│ ┌─────────────────────────────────────┐ │
|
||||
│ │ User: How crowded does it get? │ │
|
||||
│ │ Assistant: Very crowded weekends │ │
|
||||
│ │ User: Any other trails nearby? │ (400 tokens) │
|
||||
│ │ Assistant: Rattlesnake is closer │ │
|
||||
│ │ ... (last 4 exchanges) │ │
|
||||
│ └─────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Request 20: [System + Summary] + [Recent 8 msgs] = 600 tokens │
|
||||
│ (Summary updated every ~8 new messages) │
|
||||
│ │
|
||||
│ ✓ Balanced token usage (70-80% reduction) │
|
||||
│ ✓ Preserves long-term context via summary │
|
||||
│ ✓ Recent messages in full detail │
|
||||
│ ✓ Scalable to very long conversations │
|
||||
│ ✗ Small overhead for summary generation (1-2s every 8-10 msgs) │
|
||||
│ │
|
||||
└────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ 4. VECTOR STORE MEMORY (ChromaDB/Qdrant) │
|
||||
├────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Current query: "What trails are nearby?" │
|
||||
│ ↓ (embed and search) │
|
||||
│ ┌──────────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Vector DB: Find semantically similar past messages │ │
|
||||
│ │ - "Mt Si is a moderate 4-hour hike" (score: 0.89) │ │
|
||||
│ │ - "Rattlesnake Ledge has lake views" (score: 0.85) │ │
|
||||
│ │ - "Bring water and snacks" (score: 0.62) │ │
|
||||
│ └──────────────────────────────────────────────────────────────────┘ │
|
||||
│ ↓ │
|
||||
│ [System + Top 3 relevant] + [Current query] = 500 tokens │
|
||||
│ │
|
||||
│ ✓ Semantic retrieval - finds relevant context │
|
||||
│ ✓ Works for sparse conversations │
|
||||
│ ✓ Enables cross-conversation search │
|
||||
│ ✗ Requires embeddings (API calls or local model) │
|
||||
│ ✗ Adds complexity (vector DB, indexing) │
|
||||
│ ✗ May retrieve irrelevant "similar" messages │
|
||||
│ │
|
||||
└────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
┌────────────────────────────────────────────────────────────────────────────────┐
|
||||
│ 5. MEMGPT/LETTA (Self-Editing Memory) │
|
||||
├────────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌───────────────────────────────────┐ │
|
||||
│ │ Core Memory (always in context): │ │
|
||||
│ │ - User: Matt │ (50 tokens) │
|
||||
│ │ - Preferences: Metric units │ │
|
||||
│ └───────────────────────────────────┘ │
|
||||
│ ↓ │
|
||||
│ ┌───────────────────────────────────┐ │
|
||||
│ │ Recall Memory (vector search): │ │
|
||||
│ │ - [Retrieved: 3 relevant msgs] │ (300 tokens) │
|
||||
│ └───────────────────────────────────┘ │
|
||||
│ ↓ │
|
||||
│ ┌───────────────────────────────────┐ │
|
||||
│ │ Archival Memory (long-term): │ │
|
||||
│ │ - [Searchable but not loaded] │ │
|
||||
│ └───────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Agent decides what to remember/forget/search │
|
||||
│ │
|
||||
│ ✓ Most sophisticated - agent manages own memory │
|
||||
│ ✓ Handles complex multi-day conversations │
|
||||
│ ✗ Very heavy (200MB+ dependencies) │
|
||||
│ ✗ Requires vector embeddings │
|
||||
│ ✗ Overkill for simple chat │
|
||||
│ ✗ Opinionated architecture (hard to integrate) │
|
||||
│ │
|
||||
└────────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
╔════════════════════════════════════════════════════════════════════════════════╗
|
||||
║ RECOMMENDATION MATRIX ║
|
||||
╚════════════════════════════════════════════════════════════════════════════════╝
|
||||
|
||||
┌──────────────┬──────────────┬────────────┬──────────────┬──────────────────────┐
|
||||
│ Approach │ Dependencies │ Tokens │ Complexity │ Use Case │
|
||||
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
|
||||
│ Full History │ None │ High │ Low │ Don't use (baseline) │
|
||||
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
|
||||
│ Window Only │ None │ Low │ Low │ Stateless chat bots │
|
||||
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
|
||||
│ Rolling │ │ │ │ ✓ MESHAI │
|
||||
│ Summary │ None │ Very Low │ Low │ ✓ Most projects │
|
||||
│ (DIY) │ │ │ │ ✓ Best balance │
|
||||
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
|
||||
│ LangChain │ ~50 MB │ Very Low │ Medium │ Want batteries- │
|
||||
│ Summary │ │ │ │ included solution │
|
||||
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
|
||||
│ Vector Store │ ~20 MB │ Low │ Medium │ Semantic search, │
|
||||
│ (ChromaDB) │ │ │ │ long-term memory │
|
||||
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
|
||||
│ MemGPT/Letta │ ~200 MB │ Low │ Very High │ Complex multi-day │
|
||||
│ │ │ │ │ agent workflows │
|
||||
└──────────────┴──────────────┴────────────┴──────────────┴──────────────────────┘
|
||||
|
||||
╔════════════════════════════════════════════════════════════════════════════════╗
|
||||
║ PERFORMANCE COMPARISON (at request 20 = 40 messages) ║
|
||||
╚════════════════════════════════════════════════════════════════════════════════╝
|
||||
|
||||
Tokens Sent to LLM
|
||||
↑
|
||||
│
|
||||
4000│ ████████████████████████████████ Full History
|
||||
│
|
||||
3000│
|
||||
│
|
||||
2000│
|
||||
│
|
||||
1000│
|
||||
│
|
||||
600│ ██████ Rolling Summary
|
||||
500│ █████ Window Only
|
||||
│ █████ Vector Store
|
||||
0└─────────────────────────────────────────────────────────→
|
||||
1 5 10 15 20 25 30 35 40 (Conversation length)
|
||||
|
||||
Legend:
|
||||
████ Full History (linear growth)
|
||||
████ Rolling Summary (plateau after initial growth)
|
||||
████ Window/Vector (constant)
|
||||
|
||||
|
||||
╔════════════════════════════════════════════════════════════════════════════════╗
|
||||
║ IMPLEMENTATION COMPLEXITY ║
|
||||
╚════════════════════════════════════════════════════════════════════════════════╝
|
||||
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ Simple ←───────────────────────────────────────────────────→ Complex │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Window Only Rolling Summary LangChain MemGPT │
|
||||
│ (20 lines) (100 lines) (10 lines (200+ lines │
|
||||
│ + 50MB dep) + 200MB dep) │
|
||||
│ │
|
||||
│ ↑ ↑ ↑ ↑ │
|
||||
│ No deps No deps Heavy deps Very heavy │
|
||||
│ No persistence SQLite persist In-memory Built-in DB │
|
||||
│ Loses old context Keeps summary Keeps summary Multi-tier │
|
||||
│ │
|
||||
│ ★ RECOMMENDED ★ │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
╔════════════════════════════════════════════════════════════════════════════════╗
|
||||
║ FOR MESHAI SPECIFICALLY ║
|
||||
╚════════════════════════════════════════════════════════════════════════════════╝
|
||||
|
||||
Current:
|
||||
- Messages: 150 chars max (very small)
|
||||
- Conversations: Per-user, linear
|
||||
- Backend: OpenAI-compatible (LiteLLM, local models)
|
||||
- Storage: SQLite + aiosqlite
|
||||
- Problem: Full history sent every time
|
||||
|
||||
Constraints:
|
||||
- Lightweight (runs on mesh nodes potentially)
|
||||
- No heavy dependencies
|
||||
- Must work offline (local models)
|
||||
- Persistence required (survive restarts)
|
||||
|
||||
Solution: Rolling Summary
|
||||
✓ Zero dependencies (pure Python)
|
||||
✓ Works with existing AsyncOpenAI client
|
||||
✓ Persists in existing SQLite database
|
||||
✓ ~100 lines of code (easy to maintain)
|
||||
✓ 70-80% token reduction
|
||||
✓ Tunable (window_size, summarize_threshold)
|
||||
|
||||
Configuration:
|
||||
- window_size = 4 (keep last 4 exchanges = 8 messages)
|
||||
- summarize_threshold = 8 (re-summarize after 8 new messages)
|
||||
|
||||
Expected savings:
|
||||
- 10 messages: 0% (no summary yet)
|
||||
- 20 messages: 66% token reduction
|
||||
- 30 messages: 75% token reduction
|
||||
- 50 messages: 84% token reduction
|
||||
|
||||
Cost impact (at $0.50/1M tokens):
|
||||
- Before: $0.0012 per request (2400 tokens)
|
||||
- After: $0.0003 per request (600 tokens)
|
||||
- Savings: $27/month for 1000 requests/day
|
||||
|
||||
╔════════════════════════════════════════════════════════════════════════════════╗
|
||||
║ NEXT STEPS ║
|
||||
╚════════════════════════════════════════════════════════════════════════════════╝
|
||||
|
||||
1. Read: MEMORY_SUMMARY.md (quick overview)
|
||||
2. Study: MEMORY_RESEARCH.md (detailed analysis)
|
||||
3. Test: python examples/memory_comparison.py (see it in action)
|
||||
4. Build: MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step)
|
||||
5. Deploy: Monitor and tune based on real usage
|
||||
|
||||
Files created:
|
||||
- /home/zvx/projects/meshai/MEMORY_SUMMARY.md
|
||||
- /home/zvx/projects/meshai/MEMORY_RESEARCH.md
|
||||
- /home/zvx/projects/meshai/MEMORY_IMPLEMENTATION_GUIDE.md
|
||||
- /home/zvx/projects/meshai/examples/memory_comparison.py
|
||||
|
||||
Good luck! 🚀
|
||||
|
|
@ -1,285 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Proof-of-concept: Compare full history vs rolling summary memory.
|
||||
|
||||
Demonstrates token savings and performance of different approaches.
|
||||
|
||||
Usage:
|
||||
python examples/memory_comparison.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# SIMPLE ROLLING SUMMARY IMPLEMENTATION
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class SimpleRollingSummary:
    """Minimal rolling summary memory manager for testing.

    Keeps the last ``window_size`` exchanges (two messages each) verbatim and
    replaces everything older with a short LLM-generated summary.
    """

    def __init__(
        self,
        client: AsyncOpenAI,
        model: str,
        window_size: int = 4,
    ):
        """Store the client/model and start with an empty summary cache.

        Args:
            client: OpenAI-compatible async client used to generate summaries.
            model: Model name passed to the chat completions endpoint.
            window_size: Number of recent exchanges (user + assistant pairs)
                to keep in full.
        """
        self.client = client
        self.model = model
        self.window_size = window_size
        # user_id -> (number of messages the summary covers, summary text).
        # The count lets us notice when the cached summary has gone stale.
        self._summary_cache: dict[str, tuple[int, str]] = {}

    async def get_context(
        self, user_id: str, messages: list[dict]
    ) -> tuple[Optional[str], list[dict]]:
        """Return (summary, recent_messages) for optimized context.

        Args:
            user_id: Key under which the user's summary is cached.
            messages: Full conversation as ``{"role", "content"}`` dicts,
                oldest first.

        Returns:
            ``(None, messages)`` when the conversation still fits inside the
            window; otherwise ``(summary, recent)`` where ``summary`` covers
            every message that is not part of ``recent``.
        """
        keep = self.window_size * 2

        # Short conversation - return all messages
        if len(messages) <= keep:
            return None, messages

        # Split on an explicit index: a negative slice start would turn into
        # ``messages[0:]`` (i.e. everything) when window_size is 0.
        cutoff = len(messages) - keep
        old, recent = messages[:cutoff], messages[cutoff:]

        # Reuse the cached summary only while it covers exactly the messages
        # that fall outside the window. Once the conversation grows, messages
        # sliding out of the window would otherwise be in neither the summary
        # nor the recent list and silently vanish from the context.
        cached = self._summary_cache.get(user_id)
        if cached is not None and cached[0] == len(old):
            return cached[1], recent

        summary = await self._summarize(old)
        self._summary_cache[user_id] = (len(old), summary)
        return summary, recent

    async def _summarize(self, messages: list[dict]) -> str:
        """Generate a 2-3 sentence summary of ``messages``.

        Returns:
            The stripped summary text, or ``""`` when the model returns no
            text content.
        """
        conv = "\n".join(f"{m['role'].upper()}: {m['content']}" for m in messages)

        prompt = f"""Summarize this conversation in 2-3 concise sentences:

{conv}

Summary:"""

        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.3,
        )

        # ``content`` is optional in the API response; guard before stripping.
        content = response.choices[0].message.content
        return content.strip() if content else ""
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# COMPARISON SCENARIOS
|
||||
# ============================================================================
|
||||
|
||||
|
||||
async def test_full_history(client: AsyncOpenAI, model: str, messages: list[dict]):
    """Baseline: Send full conversation history.

    Args:
        client: OpenAI-compatible async client used to issue the request.
        model: Model name passed to the chat completions endpoint.
        messages: Conversation so far as ``{"role", "content"}`` dicts.

    Returns:
        Tuple ``(est_tokens, elapsed)``: a rough input-token estimate and the
        request latency in seconds.
    """
    print("\n=== FULL HISTORY APPROACH ===")

    system = "You are a helpful assistant on a mesh network."
    full = [{"role": "system", "content": system}] + messages

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=full, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start

    # Estimate tokens (rough): about 4 characters per token.
    total_chars = sum(len(m["content"]) for m in full)
    est_tokens = total_chars // 4

    # The response may carry no text content (None); fall back to an empty
    # string instead of crashing on the slice below.
    reply = response.choices[0].message.content or ""

    print(f"Messages sent: {len(full)}")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")

    return est_tokens, elapsed
|
||||
|
||||
|
||||
async def test_rolling_summary(
    client: AsyncOpenAI, model: str, messages: list[dict], user_id: str
):
    """Optimized: Send summary + recent messages.

    Args:
        client: OpenAI-compatible async client used for both the summary and
            the final request.
        model: Model name passed to the chat completions endpoint.
        messages: Conversation so far as ``{"role", "content"}`` dicts.
        user_id: Key under which the summary is cached by the memory manager.

    Returns:
        Tuple ``(est_tokens, elapsed)``: a rough input-token estimate and the
        latency in seconds of the final request.
    """
    print("\n=== ROLLING SUMMARY APPROACH ===")

    memory = SimpleRollingSummary(client, model, window_size=4)
    summary, recent = await memory.get_context(user_id, messages)

    system = "You are a helpful assistant on a mesh network."
    if summary:
        system += f"\n\nPrevious conversation summary: {summary}"

    context = [{"role": "system", "content": system}] + recent

    # Only the final request is timed; summary generation above is excluded.
    # perf_counter is monotonic, unlike time.time().
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=context, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start

    # Estimate tokens (rough): about 4 characters per token.
    total_chars = sum(len(m["content"]) for m in context)
    est_tokens = total_chars // 4

    # The response may carry no text content (None); fall back to an empty
    # string instead of crashing on the slice below.
    reply = response.choices[0].message.content or ""

    print(f"Messages sent: {len(context)} (summary: {summary is not None})")
    if summary:
        print(f"Summary: {summary[:80]}...")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")

    return est_tokens, elapsed
|
||||
|
||||
|
||||
async def test_window_only(client: AsyncOpenAI, model: str, messages: list[dict]):
    """Simple window: Just last N messages, no summary.

    Args:
        client: OpenAI-compatible async client used to issue the request.
        model: Model name passed to the chat completions endpoint.
        messages: Conversation so far as ``{"role", "content"}`` dicts.

    Returns:
        Tuple ``(est_tokens, elapsed)``: a rough input-token estimate and the
        request latency in seconds.
    """
    print("\n=== WINDOW-ONLY APPROACH ===")

    window_size = 4
    # Each exchange is two messages (user + assistant).
    recent = messages[-(window_size * 2):]

    system = "You are a helpful assistant on a mesh network."
    context = [{"role": "system", "content": system}] + recent

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=context, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start

    # Estimate tokens (rough): about 4 characters per token.
    total_chars = sum(len(m["content"]) for m in context)
    est_tokens = total_chars // 4

    # The response may carry no text content (None); fall back to an empty
    # string instead of crashing on the slice below.
    reply = response.choices[0].message.content or ""

    print(f"Messages sent: {len(context)} (last {window_size} exchanges only)")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")

    return est_tokens, elapsed
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# MAIN TEST
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _format_savings(tokens: int, baseline: int) -> str:
    """Return the token reduction of *tokens* versus *baseline* as a percentage."""
    if baseline <= 0:
        # Nothing to compare against; avoid ZeroDivisionError.
        return "n/a"
    return f"{(1 - tokens / baseline) * 100:.1f}%"


def _print_result_row(label: str, tokens: int, seconds: float, savings: str) -> None:
    """Print one row of the comparison table, aligned with the header columns."""
    # Attach the unit before padding. Padding the bare number first
    # ("{t:<10.2f}s") prints "1.23      s" and shifts the Savings column.
    duration = f"{seconds:.2f}s"
    print(f"{label:<20} {tokens:<15} {duration:<10} {savings}")


async def main():
    """Run the three memory strategies on one scripted conversation and compare them.

    Builds a 15-exchange (30-message) conversation, sends it through
    ``test_full_history``, ``test_rolling_summary`` and ``test_window_only``
    with a single ``AsyncOpenAI`` client, then prints a table of estimated
    input tokens, elapsed time and token savings relative to full history,
    followed by a fixed list of recommendations. The client is closed even
    if one of the runs raises.
    """
    # Configure your LLM endpoint
    # Update these for your setup (LiteLLM, local model, etc.)
    BASE_URL = "http://192.168.1.239:8000/v1"  # LiteLLM endpoint
    API_KEY = "sk-1234"  # Your API key
    MODEL = "gpt-4o-mini"  # Your model

    print("=" * 70)
    print("LLM Memory Approach Comparison")
    print("=" * 70)

    # Create test conversation (simulate 15 exchanges = 30 messages)
    topics = [
        ("What's the weather?", "It's sunny and 72°F."),
        ("Should I bring an umbrella?", "No need, clear skies all day."),
        ("What about tomorrow?", "Tomorrow looks rainy, bring an umbrella."),
        ("Any hiking recommendations?", "Try Mt. Si, great views!"),
        ("How long is the hike?", "About 4 hours round trip."),
        ("Is it beginner friendly?", "Moderate difficulty, doable for most."),
        ("What should I bring?", "Water, snacks, good boots, and layers."),
        ("Are dogs allowed?", "Yes, but must be leashed."),
        ("Where's the trailhead?", "Off I-90 near North Bend."),
        ("Parking fee?", "Yes, $10 or Northwest Forest Pass."),
        ("What time should I start?", "Early morning, around 7-8 AM."),
        ("How crowded does it get?", "Very crowded on weekends, go weekdays."),
        ("Any other trails nearby?", "Rattlesnake Ledge is easier and closer."),
        ("Tell me about Rattlesnake", "2 miles, great lake views, very popular."),
        ("Which would you recommend?", "If fit: Mt Si. If casual: Rattlesnake."),
    ]

    messages = []
    for user_msg, assistant_msg in topics:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})

    # Compute the lengths once instead of scanning the list twice inline.
    lengths = [len(m["content"]) for m in messages]

    print(f"\nTest conversation: {len(messages)} messages ({len(messages)//2} exchanges)")
    print("Topics: weather → hiking → trails")
    print(f"Message lengths: {min(lengths)}-{max(lengths)} chars")

    # Initialize client
    client = AsyncOpenAI(api_key=API_KEY, base_url=BASE_URL)

    try:
        # Test each approach
        full_tokens, full_time = await test_full_history(client, MODEL, messages)
        summary_tokens, summary_time = await test_rolling_summary(
            client, MODEL, messages, "!test_user"
        )
        window_tokens, window_time = await test_window_only(client, MODEL, messages)

        # Results
        print("\n" + "=" * 70)
        print("COMPARISON RESULTS")
        print("=" * 70)

        print(f"\n{'Approach':<20} {'Tokens':<15} {'Time':<10} {'Savings'}")
        print("-" * 70)
        _print_result_row("Full History", full_tokens, full_time, "(baseline)")
        _print_result_row(
            "Rolling Summary",
            summary_tokens,
            summary_time,
            _format_savings(summary_tokens, full_tokens),
        )
        _print_result_row(
            "Window Only",
            window_tokens,
            window_time,
            _format_savings(window_tokens, full_tokens),
        )

        print("\n" + "=" * 70)
        print("RECOMMENDATIONS")
        print("=" * 70)

        print("\nFull History:")
        print(" ✓ Complete context")
        print(" ✗ High token usage")
        print(" ✗ Slower for long conversations")
        print(" Use: Never (inefficient)")

        print("\nWindow Only:")
        print(" ✓ Very low token usage")
        print(" ✓ Fast")
        print(" ✗ Loses older context completely")
        print(" Use: Short-term conversations only")

        print("\nRolling Summary:")
        print(" ✓ Balanced token usage")
        print(" ✓ Preserves long-term context")
        print(" ✓ Fast after initial summary")
        print(" ✗ Slight overhead for summarization")
        print(" Use: RECOMMENDED for MeshAI")

        print("\n" + "=" * 70)

    finally:
        # Release the client's HTTP resources whether or not a run failed.
        await client.close()
|
||||
|
||||
|
||||
# Script entry point: run the async comparison only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue