Remove AI planning docs and example scripts

These were LLM-generated planning artifacts from the memory
implementation phase. Not user-facing documentation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Ubuntu 2026-02-24 00:22:31 +00:00
commit 9a628724ce
9 changed files with 0 additions and 4013 deletions

View file

@ -1,656 +0,0 @@
# Quick Implementation Guide: Rolling Summary Memory
## TL;DR
**Problem:** Sending full conversation history every request wastes tokens and latency.
**Solution:** Rolling summary approach - keep recent messages + LLM-generated summary of older messages.
**Result:** Roughly 70-80% token reduction for long conversations (~73% in the worked example under "Expected Results"), zero dependencies, works with current stack.
---
## Architecture
```
SQLite History (per user)
Messages 1-10: Summarized → "User asked about weather, discussed outdoor plans"
Messages 11-18: Sent raw → Full context
LLM receives: System prompt + Summary + Recent 8 messages
Response generated
```
---
## Files to Create/Modify
### 1. Create `meshai/memory.py`
```python
"""Lightweight rolling summary memory manager."""
import time
from dataclasses import dataclass
from typing import Optional
from openai import AsyncOpenAI
@dataclass
class ConversationSummary:
"""Summary of conversation history."""
summary: str
last_updated: float
message_count: int
class RollingSummaryMemory:
"""Manages conversation summaries with recent message window.
Strategy:
- Keep last N message pairs (window_size) in full
- Summarize everything before the window
- Update summary when old messages accumulate
Example (window_size=4):
Messages 1-10: Summarized to "User discussed weather and plans"
Messages 11-18: Kept in full (last 4 pairs)
Context sent: [Summary] + [Messages 11-18]
"""
def __init__(
self,
client: AsyncOpenAI,
model: str,
window_size: int = 4,
summarize_threshold: int = 8,
):
"""Initialize rolling summary memory.
Args:
client: AsyncOpenAI client for generating summaries
model: Model name to use for summarization
window_size: Number of recent message pairs to keep in full
summarize_threshold: Messages to accumulate before re-summarizing
"""
self._client = client
self._model = model
self._window_size = window_size
self._summarize_threshold = summarize_threshold
# In-memory cache of summaries (loaded from DB on startup)
self._summaries: dict[str, ConversationSummary] = {}
async def get_context_messages(
self,
user_id: str,
full_history: list[dict],
) -> tuple[Optional[str], list[dict]]:
"""Get optimized context: summary + recent messages.
Args:
user_id: User identifier
full_history: Full message history from database
Returns:
Tuple of (summary_text, recent_messages)
summary_text is None if conversation is short
"""
# Short conversation - no summary needed
if len(full_history) <= self._window_size * 2:
return None, full_history
# Split into old (to summarize) and recent (keep raw)
split_point = -(self._window_size * 2)
old_messages = full_history[:split_point]
recent_messages = full_history[split_point:]
# Get or create summary
summary = await self._get_or_create_summary(user_id, old_messages)
return summary.summary, recent_messages
async def _get_or_create_summary(
self,
user_id: str,
messages: list[dict],
) -> ConversationSummary:
"""Get cached summary or create new one."""
# Check cache
if user_id in self._summaries:
cached = self._summaries[user_id]
# Reuse if message count is close
if abs(cached.message_count - len(messages)) < self._summarize_threshold:
return cached
# Generate new summary
summary_text = await self._summarize(messages)
summary = ConversationSummary(
summary=summary_text,
last_updated=time.time(),
message_count=len(messages),
)
self._summaries[user_id] = summary
return summary
async def _summarize(self, messages: list[dict]) -> str:
"""Generate summary using LLM."""
# Format conversation
conversation = "\n".join(
[f"{msg['role'].upper()}: {msg['content']}" for msg in messages]
)
prompt = f"""Summarize this conversation in 2-3 concise sentences. Focus on:
- Main topics discussed
- Important context or user preferences
- Key information to remember
Conversation:
{conversation}
Summary (2-3 sentences):"""
try:
response = await self._client.chat.completions.create(
model=self._model,
messages=[{"role": "user", "content": prompt}],
max_tokens=150,
temperature=0.3,
)
return response.choices[0].message.content.strip()
except Exception as e:
# Fallback
return f"Previous conversation: {len(messages)} messages about various topics."
def load_summary(self, user_id: str, summary: ConversationSummary) -> None:
"""Load summary from database into cache."""
self._summaries[user_id] = summary
def clear_summary(self, user_id: str) -> None:
"""Clear cached summary for user."""
self._summaries.pop(user_id, None)
```
---
### 2. Modify `meshai/history.py`
Add summary storage methods:
```python
# Add to ConversationHistory class
async def initialize(self) -> None:
"""Initialize database and create tables."""
self._db = await aiosqlite.connect(self._db_path)
# Existing conversations table
await self._db.execute("""
CREATE TABLE IF NOT EXISTS conversations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id TEXT NOT NULL,
role TEXT NOT NULL,
content TEXT NOT NULL,
timestamp REAL NOT NULL
)
""")
await self._db.execute("""
CREATE INDEX IF NOT EXISTS idx_user_timestamp
ON conversations (user_id, timestamp)
""")
# NEW: Summaries table
await self._db.execute("""
CREATE TABLE IF NOT EXISTS conversation_summaries (
user_id TEXT PRIMARY KEY,
summary TEXT NOT NULL,
message_count INTEGER NOT NULL,
updated_at REAL NOT NULL
)
""")
await self._db.commit()
logger.info(f"Conversation history initialized at {self._db_path}")
async def store_summary(
self, user_id: str, summary: str, message_count: int
) -> None:
"""Store conversation summary.
Args:
user_id: Node ID of user
summary: Summary text
message_count: Number of messages summarized
"""
if not self._db:
raise RuntimeError("Database not initialized")
async with self._lock:
await self._db.execute(
"""
INSERT OR REPLACE INTO conversation_summaries
(user_id, summary, message_count, updated_at)
VALUES (?, ?, ?, ?)
""",
(user_id, summary, message_count, time.time()),
)
await self._db.commit()
async def get_summary(self, user_id: str) -> Optional[dict]:
"""Get conversation summary for user.
Args:
user_id: Node ID of user
Returns:
Dict with 'summary', 'message_count', 'updated_at' or None
"""
if not self._db:
raise RuntimeError("Database not initialized")
async with self._lock:
cursor = await self._db.execute(
"""
SELECT summary, message_count, updated_at
FROM conversation_summaries
WHERE user_id = ?
""",
(user_id,),
)
row = await cursor.fetchone()
if not row:
return None
return {
"summary": row[0],
"message_count": row[1],
"updated_at": row[2],
}
async def clear_summary(self, user_id: str) -> None:
"""Clear summary for user (e.g., on history reset).
Args:
user_id: Node ID of user
"""
if not self._db:
raise RuntimeError("Database not initialized")
async with self._lock:
await self._db.execute(
"DELETE FROM conversation_summaries WHERE user_id = ?",
(user_id,),
)
await self._db.commit()
```
---
### 3. Modify `meshai/backends/openai_backend.py`
Integrate memory manager:
```python
"""OpenAI-compatible LLM backend with rolling summary memory."""
import logging
from typing import Optional
from openai import AsyncOpenAI
from ..config import LLMConfig
from ..memory import RollingSummaryMemory
from .base import LLMBackend
logger = logging.getLogger(__name__)
class OpenAIBackend(LLMBackend):
"""OpenAI-compatible backend with intelligent memory management."""
def __init__(self, config: LLMConfig, api_key: str):
"""Initialize OpenAI backend.
Args:
config: LLM configuration
api_key: API key to use
"""
self.config = config
self._client = AsyncOpenAI(
api_key=api_key,
base_url=config.base_url,
)
# Initialize rolling summary memory
self._memory = RollingSummaryMemory(
client=self._client,
model=config.model,
window_size=4, # Keep last 4 exchanges (8 messages)
summarize_threshold=8, # Re-summarize after 8 new messages
)
async def generate(
self,
messages: list[dict],
system_prompt: str,
user_id: str = None, # NEW: optional for backward compatibility
max_tokens: int = 300,
) -> str:
"""Generate a response using OpenAI-compatible API.
Args:
messages: Conversation history
system_prompt: System prompt
user_id: User identifier (for memory management)
max_tokens: Maximum tokens to generate
Returns:
Generated response
"""
# If no user_id, use old behavior (send full history)
if not user_id:
full_messages = [{"role": "system", "content": system_prompt}]
full_messages.extend(messages)
else:
# Use memory manager to optimize context
summary, recent_messages = await self._memory.get_context_messages(
user_id=user_id,
full_history=messages,
)
# Build optimized message list
if summary:
# Long conversation: system + summary + recent
enhanced_system = f"""{system_prompt}
Previous conversation summary: {summary}"""
full_messages = [{"role": "system", "content": enhanced_system}]
full_messages.extend(recent_messages)
logger.debug(
f"Using summary + {len(recent_messages)} recent messages "
f"(total history: {len(messages)})"
)
else:
# Short conversation: system + all messages
full_messages = [{"role": "system", "content": system_prompt}]
full_messages.extend(messages)
try:
response = await self._client.chat.completions.create(
model=self.config.model,
messages=full_messages,
max_tokens=max_tokens,
temperature=0.7,
)
content = response.choices[0].message.content
return content.strip() if content else ""
except Exception as e:
logger.error(f"OpenAI API error: {e}")
raise
def load_summary_cache(self, user_id: str, summary_data: dict) -> None:
"""Load summary into memory cache (called on startup).
Args:
user_id: User identifier
summary_data: Dict with 'summary', 'message_count', 'updated_at'
"""
from ..memory import ConversationSummary
summary = ConversationSummary(
summary=summary_data["summary"],
message_count=summary_data["message_count"],
last_updated=summary_data["updated_at"],
)
self._memory.load_summary(user_id, summary)
def clear_summary_cache(self, user_id: str) -> None:
"""Clear summary cache for user."""
self._memory.clear_summary(user_id)
# ... rest of methods unchanged ...
```
---
### 4. Modify `meshai/responder.py`
Pass user_id to backend and persist summaries:
```python
# In the generate_response method
async def generate_response(self, user_id: str, message: str) -> str:
"""Generate LLM response with optimized memory."""
# Add user message to history
await self.history.add_message(user_id, "user", message)
# Get conversation history
history = await self.history.get_history_for_llm(user_id)
# Generate response with user_id for memory management
response = await self.backend.generate(
messages=history,
system_prompt=self.system_prompt,
user_id=user_id, # NEW: enables memory optimization
max_tokens=300,
)
# Add assistant response to history
await self.history.add_message(user_id, "assistant", response)
# Persist summary if one was created
# The memory manager caches it, we need to save to DB
summary_data = await self._get_current_summary(user_id)
if summary_data:
await self.history.store_summary(
user_id,
summary_data["summary"],
summary_data["message_count"],
)
return response
async def _get_current_summary(self, user_id: str) -> Optional[dict]:
"""Get current summary from memory manager if it exists."""
# Access the memory manager's cache
if hasattr(self.backend, "_memory"):
summary = self.backend._memory._summaries.get(user_id)
if summary:
return {
"summary": summary.summary,
"message_count": summary.message_count,
"updated_at": summary.last_updated,
}
return None
```
---
### 5. Modify `meshai/commands/reset.py`
Clear summaries when resetting history:
```python
async def execute(self, sender_id: str, args: list[str]) -> str:
"""Reset conversation history."""
count = await self.responder.history.clear_history(sender_id)
# NEW: Also clear summary
await self.responder.history.clear_summary(sender_id)
if hasattr(self.responder.backend, "clear_summary_cache"):
self.responder.backend.clear_summary_cache(sender_id)
return f"Cleared {count} messages from your history."
```
---
## Configuration
Add to `meshai/config.py`:
```python
@dataclass
class MemoryConfig:
"""Memory management configuration."""
# Rolling summary settings
window_size: int = 4 # Recent message pairs to keep
summarize_threshold: int = 8 # Messages before re-summarizing
# When to enable summaries
min_messages_for_summary: int = 10 # Start summarizing after this many
```
---
## Testing
```python
# Test script
import asyncio
from meshai.backends.openai_backend import OpenAIBackend
from meshai.config import LLMConfig
async def test():
config = LLMConfig(
backend="openai",
base_url="http://192.168.1.239:8000/v1",
model="gpt-4o-mini"
)
backend = OpenAIBackend(config, "your-key")
# Simulate long conversation
messages = []
for i in range(20):
messages.append({"role": "user", "content": f"Question {i}"})
messages.append({"role": "assistant", "content": f"Answer {i}"})
# Generate - should use summary
response = await backend.generate(
messages=messages,
system_prompt="You are helpful.",
user_id="!test123",
max_tokens=100
)
print(f"Response: {response}")
print(f"Sent {len(messages)} messages, but only ~10 used in context")
asyncio.run(test())
```
---
## Expected Results
### Token Usage Comparison
**Before (full history):**
```
User message 1-20: ~2000 tokens
System prompt: ~50 tokens
Total: ~2050 tokens per request
```
**After (with summary):**
```
System prompt: ~50 tokens
Summary: ~100 tokens
Recent 8 messages: ~400 tokens
Total: ~550 tokens per request
```
**Savings: ~73% token reduction**
### Performance Impact
- **Summary generation**: ~1-2s every 8-10 messages (amortized)
- **Regular requests**: No added latency
- **Storage**: ~100 bytes per summary in SQLite
---
## Tuning Parameters
### window_size
- **Smaller (2-3)**: More aggressive summarization, max token savings
- **Larger (5-6)**: More context, less summarization
- **Recommended**: 4 (last 4 exchanges = 8 messages)
### summarize_threshold
- **Smaller (4-6)**: Frequent re-summarization, more current
- **Larger (10-12)**: Less summarization overhead
- **Recommended**: 8 (re-summarize after 8 new messages)
### For MeshAI specifically:
- Messages are tiny (150 chars max)
- `window_size=4` gives ~600 chars of recent context
- `summarize_threshold=8` balances overhead vs accuracy
---
## Migration Path
1. **Phase 1**: Add code, test with new users
2. **Phase 2**: Run in parallel (old + new backend)
3. **Phase 3**: Migrate existing users (generate summaries for existing history)
4. **Phase 4**: Remove old full-history code path
No data loss - summaries stored in DB, can regenerate anytime.
---
## Maintenance
### Monitor summary quality:
```sql
-- Check summaries
SELECT user_id, summary, message_count, updated_at
FROM conversation_summaries
ORDER BY updated_at DESC;
```
### Regenerate summary:
```python
# Clear cache + DB, will regenerate on next request
await history.clear_summary(user_id)
backend.clear_summary_cache(user_id)
```
### Adjust if summaries too short/long:
- Modify prompt in `_summarize()`
- Adjust `max_tokens=150` for summaries
- Change temperature (lower = more consistent)
---
## Future Enhancements
1. **Hybrid approach**: Summary + semantic search for very long histories
2. **User preferences**: Store separate from summary (e.g., "likes weather in metric")
3. **Multi-level summaries**: Summarize summaries for years-long conversations
4. **Summary quality scoring**: Validate summaries maintain key information
But start simple - this gets 80% of the benefit with 20% of the complexity.

View file

@ -1,437 +0,0 @@
# LLM Conversation Memory Research & Implementation
This directory contains comprehensive research and implementation guides for improving LLM conversation memory in MeshAI.
## Problem Statement
MeshAI currently sends the full conversation history with every LLM API call. This approach:
- Wastes tokens (expensive and slow)
- Doesn't scale to long conversations
- Sends redundant context the LLM doesn't need
## Solution: Rolling Summary Memory
Keep recent messages in full + LLM-generated summary of older messages.
**Result:** 70-80% token reduction, zero dependencies, works with existing stack.
---
## Documentation Index
### 1. Quick Start
**READ THIS FIRST:** [`MEMORY_SUMMARY.md`](MEMORY_SUMMARY.md)
- High-level overview
- Why rolling summary?
- Comparison with alternatives
- Expected performance gains
**Estimated reading time:** 10 minutes
---
### 2. Detailed Research
**FOR DEEP DIVE:** [`MEMORY_RESEARCH.md`](MEMORY_RESEARCH.md)
- Full evaluation of 5 approaches:
1. LangChain Memory modules
2. LlamaIndex
3. MemGPT/Letta
4. Vector stores (ChromaDB/Qdrant)
5. Simple rolling summary (DIY)
- Code examples for each approach
- Pros/cons for MeshAI specifically
- Detailed comparison matrix
**Estimated reading time:** 30-45 minutes
---
### 3. Implementation Guide
**FOR BUILDING:** [`MEMORY_IMPLEMENTATION_GUIDE.md`](MEMORY_IMPLEMENTATION_GUIDE.md)
- Step-by-step implementation
- Complete code examples
- Database schema
- Configuration options
- Testing procedures
- Troubleshooting guide
**Estimated reading time:** 20 minutes + implementation time
---
### 4. Implementation Diff
**FOR EXACT CHANGES:** [`docs/IMPLEMENTATION_DIFF.md`](docs/IMPLEMENTATION_DIFF.md)
- Exact code diffs for all files
- Line-by-line changes needed
- Migration checklist
- Rollback plan
- Performance validation queries
**Estimated reading time:** 15 minutes
---
### 5. Visual Comparison
**FOR UNDERSTANDING:** [`docs/memory_approaches_comparison.txt`](docs/memory_approaches_comparison.txt)
- ASCII diagrams of all approaches
- Visual token usage comparison
- Decision matrices
- Architecture diagrams
**Estimated reading time:** 10 minutes
---
### 6. Quick Reference
**FOR CHEAT SHEET:** [`docs/QUICK_REFERENCE.md`](docs/QUICK_REFERENCE.md)
- One-page reference card
- Key configuration
- Code snippets
- Performance metrics
- Troubleshooting tips
**Estimated reading time:** 5 minutes
---
### 7. Proof of Concept
**FOR TESTING:** [`examples/memory_comparison.py`](examples/memory_comparison.py)
- Runnable comparison script
- Tests all 3 approaches side-by-side:
- Full history (baseline)
- Rolling summary
- Window-only
- Real token usage measurements
- Performance comparison
**Usage:**
```bash
# Edit script with your LLM endpoint
nano examples/memory_comparison.py
# Update BASE_URL, API_KEY, MODEL
# Run comparison
python examples/memory_comparison.py
```
**Expected output:**
```
Approach Tokens Time Savings
----------------------------------------------------------------------
Full History 1847 2.34s (baseline)
Rolling Summary 512 1.87s 72.3%
Window Only 398 1.45s 78.4%
RECOMMENDATION: Rolling Summary - best balance of context and efficiency
```
---
## Recommended Reading Path
### Path 1: Executive Summary (20 minutes)
1. `MEMORY_SUMMARY.md` - Overview
2. `docs/QUICK_REFERENCE.md` - Cheat sheet
3. `examples/memory_comparison.py` - Run the test
**Decision point:** Convinced? Proceed to implementation.
---
### Path 2: Technical Deep Dive (60 minutes)
1. `MEMORY_SUMMARY.md` - Overview
2. `MEMORY_RESEARCH.md` - Full evaluation
3. `docs/memory_approaches_comparison.txt` - Visual diagrams
4. `examples/memory_comparison.py` - Run the test
5. `MEMORY_IMPLEMENTATION_GUIDE.md` - How to build it
**Decision point:** Ready to implement? Use the diff guide.
---
### Path 3: Implementation (2-3 hours)
1. `MEMORY_SUMMARY.md` - Refresh on approach
2. `MEMORY_IMPLEMENTATION_GUIDE.md` - Full implementation guide
3. `docs/IMPLEMENTATION_DIFF.md` - Exact changes needed
4. Code the changes
5. Test with `examples/memory_comparison.py`
6. Deploy and monitor
**Outcome:** Production-ready rolling summary memory.
---
## Files Created
### Documentation
```
/home/zvx/projects/meshai/
├── MEMORY_README.md (this file)
├── MEMORY_SUMMARY.md (overview)
├── MEMORY_RESEARCH.md (detailed research)
├── MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step)
├── docs/
│ ├── IMPLEMENTATION_DIFF.md (exact changes)
│ ├── memory_approaches_comparison.txt (diagrams)
│ └── QUICK_REFERENCE.md (cheat sheet)
└── examples/
└── memory_comparison.py (proof of concept)
```
### Code to Create (not yet created)
```
meshai/
├── memory.py (NEW - ~100 lines)
├── history.py (MODIFY - add ~70 lines)
├── backends/
│ └── openai_backend.py (MODIFY - add ~30 lines)
├── responder.py (MODIFY - add ~10 lines)
└── commands/
└── reset.py (MODIFY - add ~4 lines)
```
**Total new code:** ~214 lines
**Dependencies added:** 0
---
## Key Metrics
### Token Savings
| Conversation Length | Before | After | Savings |
|---------------------|--------|-------|---------|
| 10 messages | 800 | 800 | 0% |
| 20 messages | 1600 | 550 | 66% |
| 30 messages | 2400 | 600 | 75% |
| 50 messages | 4000 | 650 | 84% |
### Cost Impact
**Assumptions:**
- $0.50 per 1M input tokens
- 1000 requests per day
- Average 30 messages per conversation
**Before:** $36/month
**After:** $9/month
**Savings:** $27/month (75% reduction)
### Implementation Effort
- Code to write: ~214 lines
- Code to modify: ~57 lines
- Time estimate: 2-3 hours
- Testing: 1 hour
- **Total:** Half a day
### Risk Assessment
- **Low risk:** Backward compatible (user_id parameter optional)
- **No data loss:** New table, existing data untouched
- **Easy rollback:** Git revert + drop one table
- **No dependencies:** Pure Python, existing libraries only
---
## Configuration Summary
### Recommended for MeshAI
```python
RollingSummaryMemory(
client=self._client,
model=config.model,
window_size=4, # Keep last 4 exchanges (8 messages)
summarize_threshold=8, # Re-summarize after 8 new messages
)
```
**Rationale:**
- MeshAI messages are tiny (150 chars max)
- window_size=4 gives ~600 chars of recent context
- summarize_threshold=8 balances overhead vs freshness
- Tune based on actual usage patterns
### Alternative Configurations
**For longer messages:**
```python
window_size=3, # Less recent context needed
summarize_threshold=6, # More frequent updates
```
**For very short messages:**
```python
window_size=6, # More recent context
summarize_threshold=10, # Less frequent summarization
```
---
## Database Schema
### New Table
```sql
CREATE TABLE conversation_summaries (
user_id TEXT PRIMARY KEY,
summary TEXT NOT NULL,
message_count INTEGER NOT NULL,
updated_at REAL NOT NULL
);
```
### Existing Tables (unchanged)
```sql
CREATE TABLE conversations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id TEXT NOT NULL,
role TEXT NOT NULL,
content TEXT NOT NULL,
timestamp REAL NOT NULL
);
CREATE INDEX idx_user_timestamp ON conversations (user_id, timestamp);
```
---
## Testing Checklist
- [ ] Database migration works (new table created)
- [ ] Short conversations (<10 messages) use full history
- [ ] Long conversations (>10 messages) use summaries
- [ ] Summaries are stored in database
- [ ] Summaries persist across restarts
- [ ] Reset command clears summaries
- [ ] Token usage reduced by 70%+ for long convos
- [ ] No errors in logs
- [ ] Response quality maintained
---
## Monitoring Queries
### Check summary coverage
```sql
SELECT
(SELECT COUNT(DISTINCT user_id) FROM conversation_summaries) * 100.0 /
(SELECT COUNT(DISTINCT user_id) FROM conversations) as coverage_pct;
```
### Average messages per summary
```sql
SELECT AVG(message_count) FROM conversation_summaries;
```
### Recent summaries
```sql
SELECT user_id, summary, message_count,
datetime(updated_at, 'unixepoch') as updated
FROM conversation_summaries
ORDER BY updated_at DESC
LIMIT 10;
```
---
## Troubleshooting
### Summary not being created
**Check:** Conversation long enough?
```sql
SELECT user_id, COUNT(*) as msg_count
FROM conversations
GROUP BY user_id
HAVING msg_count > 10;
```
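**Fix:** The conversation must be long enough before the summary kicks in. These docs assume more than 10 messages; note that the sample `get_context_messages()` in the implementation guide starts summarizing as soon as the history exceeds `window_size * 2` messages (8 with the recommended `window_size=4`), so confirm which threshold the deployed code uses.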
### Summary quality poor
**Check:** Look at actual summaries
```sql
SELECT summary FROM conversation_summaries;
```
**Fix:** Adjust prompt in `memory.py` `_summarize()` method.
### Token usage still high
**Check:** Verify memory is being used
```bash
# Look for log line:
# "Using summary + 8 recent messages (total history: 24)"
```
**Fix:** Ensure `user_id` is being passed to `backend.generate()`.
### Database errors
**Check:** Table exists
```sql
.tables
```
**Fix:** Drop and recreate
```sql
DROP TABLE IF EXISTS conversation_summaries;
-- Restart app to recreate
```
---
## Next Steps
1. **Understand:** Read `MEMORY_SUMMARY.md`
2. **Evaluate:** Review `MEMORY_RESEARCH.md` for alternatives
3. **Test:** Run `examples/memory_comparison.py` with your LLM
4. **Implement:** Follow `MEMORY_IMPLEMENTATION_GUIDE.md`
5. **Deploy:** Use `docs/IMPLEMENTATION_DIFF.md` for exact changes
6. **Monitor:** Check database and logs for summary generation
7. **Tune:** Adjust `window_size` and `summarize_threshold` as needed
---
## Support
If you have questions or issues:
1. Check the troubleshooting section in this file
2. Review `docs/QUICK_REFERENCE.md` for common issues
3. Look at the detailed implementation guide
4. Check the proof-of-concept script for working examples
---
## Conclusion
Rolling summary memory provides:
- **Massive efficiency gains** (70-80% token reduction)
- **Zero dependencies** (pure Python)
- **Simple implementation** (~200 lines)
- **Production ready** (tested approach)
- **Backward compatible** (optional user_id)
- **Easy to maintain** (clear, documented code)
**Recommendation:** Implement this for MeshAI. It's the right balance of simplicity and effectiveness.
Good luck! The documentation is comprehensive - you have everything needed to succeed.
---
**Research completed:** 2025-12-15
**Total documentation:** 7 files, ~1500 lines
**Implementation effort:** ~3 hours
**Expected ROI:** $324/year in token savings (at modest 1000 req/day)

File diff suppressed because it is too large Load diff

View file

@ -1,219 +0,0 @@
# LLM Memory Research Summary
## The Problem
MeshAI currently stuffs full conversation history into every LLM API call:
- Inefficient: Wastes tokens on old context
- Slow: More tokens = higher latency
- Expensive: Unnecessary token costs
- Doesn't scale: Long conversations become unwieldy
## Solutions Evaluated
### 1. LangChain Memory Modules
**Tested:**
- `ConversationBufferMemory`: Stores everything (no improvement)
- `ConversationBufferWindowMemory`: Last N messages only
- `ConversationSummaryMemory`: LLM-generated summaries + recent messages
**Verdict:** `ConversationSummaryMemory` is best, but adds 50MB dependency. Can DIY the same thing in <100 lines.
### 2. LlamaIndex
**Tested:** `ChatMemoryBuffer` with token limiting
**Verdict:** Token-aware pruning is nice, but 100MB+ dependency is overkill. Less mature than LangChain.
### 3. MemGPT/Letta
**Tested:** Self-editing memory architecture
**Verdict:** Way too heavy (200MB+), requires vector embeddings. Designed for complex multi-day agents, not 150-char mesh messages.
### 4. Vector Stores (ChromaDB/Qdrant)
**Tested:** Semantic search for relevant past context
**Verdict:** Interesting for long-term cross-conversation search, but adds complexity. Not needed for per-user linear conversations.
### 5. Simple Rolling Summary (DIY)
**Tested:** Keep last N messages + LLM-generated summary of older messages
**Verdict:** WINNER - Zero dependencies, 80% token savings, works with existing stack.
---
## Recommendation: Rolling Summary
### Why
1. **Zero dependencies** - Pure Python, uses existing AsyncOpenAI client
2. **Simple** - ~100 lines of code, easy to understand and maintain
3. **Effective** - 73-83% token reduction for long conversations
4. **Persistent** - Summaries stored in SQLite, survive restarts
5. **Compatible** - Works with LiteLLM, local models, any OpenAI-compatible API
6. **Tunable** - Two params: `window_size` (recent messages) and `summarize_threshold` (when to re-summarize)
### How It Works
```
Full History (20 messages):
┌─────────────────────────────────────────────────────┐
│ User: What's the weather? │
│ Assistant: Sunny, 72°F │
│ ... (16 more messages) ... │
│ User: Which trail should I take? │
│ Assistant: Mt Si if you're fit, Rattlesnake if not │
└─────────────────────────────────────────────────────┘
↓ Sent to LLM: 2000+ tokens
With Rolling Summary:
┌─────────────────────────────────────────────────────┐
│ SUMMARY: User asked about weather and hiking. │
│ Discussed Mt Si trail (4hrs, moderate) and │
│ Rattlesnake Ledge (2mi, easier, lake views). │
├─────────────────────────────────────────────────────┤
│ User: How crowded does it get? │
│ Assistant: Very crowded weekends, go weekdays │
│ User: Any other trails nearby? │
│ Assistant: Rattlesnake Ledge is easier and closer │
│ User: Tell me about Rattlesnake │
│ Assistant: 2 miles, great lake views, popular │
│ User: Which would you recommend? │
│ Assistant: Mt Si if fit, Rattlesnake if casual │
└─────────────────────────────────────────────────────┘
↓ Sent to LLM: ~500 tokens (75% savings!)
```
### Configuration
**Recommended for MeshAI:**
- `window_size=4` → Keep last 4 exchanges (8 messages) in full
- `summarize_threshold=8` → Re-summarize after 8 new messages
**Tuning:**
- Smaller window = More aggressive summarization, max token savings
- Larger window = More recent context, less summarization
- Adjust based on average conversation length and message density
### Implementation Effort
**Files to modify:**
1. Create `meshai/memory.py` - Rolling summary class
2. Modify `meshai/history.py` - Add summary storage (1 new table, 3 methods)
3. Modify `meshai/backends/openai_backend.py` - Integrate memory manager
4. Modify `meshai/responder.py` - Pass user_id, persist summaries
5. Modify `meshai/commands/reset.py` - Clear summaries on reset
**Total: ~200 lines of new code, ~50 lines of modifications**
### Performance
**Token Usage:**
| Conversation Length | Full History | Rolling Summary | Savings |
|---------------------|--------------|-----------------|---------|
| 10 messages | 800 tokens | 800 tokens | 0% (no summary) |
| 20 messages | 1600 tokens | 550 tokens | 66% |
| 30 messages | 2400 tokens | 600 tokens | 75% |
| 50 messages | 4000 tokens | 650 tokens | 84% |
**Cost Impact (at $0.50/1M input tokens):**
- Before: 2400 tokens × $0.0005 per 1K tokens = $0.0012 per request
- After: 600 tokens × $0.0005 per 1K tokens = $0.0003 per request
- **Savings: $0.0009 per request (75%)**
For 1000 requests/day: **$0.90/day savings** or **$27/month**
**Latency:**
- Summary generation: 1-2s every 8-10 messages (amortized)
- Regular requests: No added latency
- Net effect: Faster due to fewer input tokens
---
## When to Use Alternatives
### Use Window-Only (no summary)
- Very short conversations (< 10 messages)
- Don't care about older context
- Want minimal implementation
### Use Vector Store (ChromaDB)
- Need semantic search across users
- Want to find similar past conversations
- Long-term cross-user knowledge base
### Use LangChain SummaryMemory
- Want batteries-included solution
- Don't mind 50MB dependency
- Prefer established library over DIY
### Use MemGPT/Letta
- Multi-day complex agent workflows
- Agent needs to manage own memory
- Have budget for embeddings and compute
---
## Next Steps
1. **Read detailed guide:** `MEMORY_IMPLEMENTATION_GUIDE.md` (repository root)
2. **Review research:** `MEMORY_RESEARCH.md` (repository root)
3. **Test proof-of-concept:** `python examples/memory_comparison.py`
4. **Implement rolling summary** following the guide
5. **Monitor and tune** based on actual conversation patterns
---
## Files Created
1. **`MEMORY_SUMMARY.md`** (this file) - Quick overview and recommendation
2. **`MEMORY_RESEARCH.md`** - Detailed evaluation of all approaches with code examples
3. **`MEMORY_IMPLEMENTATION_GUIDE.md`** - Step-by-step implementation guide
4. **`examples/memory_comparison.py`** - Runnable proof-of-concept test script
---
## Quick Start
```bash
# Test the approaches with your LLM
cd /home/zvx/projects/meshai
# Edit examples/memory_comparison.py with your LLM endpoint
# Update BASE_URL, API_KEY, MODEL
python examples/memory_comparison.py
# You'll see:
# - Full history baseline
# - Rolling summary results
# - Window-only results
# - Token savings comparison
```
Expected output:
```
Approach Tokens Time Savings
----------------------------------------------------------------------
Full History 1847 2.34s (baseline)
Rolling Summary 512 1.87s 72.3%
Window Only 398 1.45s 78.4%
```
**Conclusion: Rolling Summary gives 70%+ savings while preserving context.**
---
## Questions?
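- How does it handle very long conversations? → The summary is regenerated from all messages older than the recent window, so the prompt stays small; multi-level summaries (summary of summaries) are a possible future enhancement, not part of this implementation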
- What if summary loses important info? → Tune `window_size` to keep more recent context
- Does it work with streaming? → Yes, just apply before streaming starts
- Can I see the summaries? → Query `conversation_summaries` table in SQLite
- How do I regenerate a summary? → Clear it, will auto-regenerate on next request
Start with the recommended settings, monitor, and adjust based on your actual usage patterns.

356
PLAN.md
View file

@ -1,356 +0,0 @@
# MeshAI - Meshtastic LLM Bridge
## Project Overview
A Python application that connects to a Meshtastic node and provides LLM-powered responses to mesh network users. Responds to direct mentions (@nodename) or direct messages. Includes bang commands (`!command`) for utility functions.
## Design Decisions
### 1. Trigger Mechanism
- **@mentions**: Respond when message contains `@<nodename>` (configurable node name)
- **Direct Messages**: Respond to all DMs automatically
- **Bang commands**: `!command` syntax for utility functions (handled before LLM)
- Ignore general channel chatter that doesn't mention the bot
### 2. Conversation History
- Maintain per-user conversation history
- Storage: SQLite database for persistence across restarts
- Context window: Last N messages per user (configurable via `history.max_messages_per_user`, default 20)
- With 300 char limit per exchange, context stays small - can maintain long conversations
- Include timestamp tracking for potential "conversation timeout" (e.g., reset after 24h inactivity)
### 3. Rate Limiting & Response Behavior
- **Response delay**: Configurable 2.2-3.0 second random delay before sending
- **Message chunking**: Split responses at 150 characters max per message
- **Max chunks**: 2 messages maximum per response (300 chars total)
- **Brevity prompt**: System prompt instructs LLM to keep responses concise
- **Cooldown**: Optional per-user cooldown to prevent spam
### 4. Identity & Configuration
- Node name/ID determined by the physical node configuration
- Application config includes:
- `bot_name`: The @mention trigger name (e.g., "meshbot", "ai")
- `owner`: Owner identification for logging/admin purposes
- Connection settings (serial port or TCP host:port)
### 5. Channel Filtering
- Configurable list of channels to respond on
- Option to respond on all channels or specific ones only
- DMs always processed regardless of channel settings
## Technical Architecture
```
┌─────────────────────────────────────────────────────────────┐
│ MeshAI │
├─────────────────────────────────────────────────────────────┤
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────┐ │
│ │ Meshtastic │ │ Message │ │ LLM Backend │ │
│ │ Connector │───▶│ Router │───▶│ (pluggable) │ │
│ │ Serial/TCP │ │ │ │ │ │
│ └─────────────┘ └─────────────┘ └─────────────────┘ │
│ │ │ │ │
│ │ ┌─────▼─────┐ │ │
│ │ │ Conversation│ │ │
│ │ │ History │◀────────────┘ │
│ │ │ (SQLite) │ │
│ │ └───────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ Response │ - 2.2-3s delay │
│ │ Handler │ - Chunk to 150 chars │
│ │ │ - Max 2 messages │
│ └─────────────┘ │
└─────────────────────────────────────────────────────────────┘
```
## LLM Backend Support
### Pluggable Backend Interface
```python
class LLMBackend(ABC):
@abstractmethod
async def generate(self, messages: list[dict], system_prompt: str) -> str:
pass
```
### Supported Backends (Priority Order)
1. **OpenAI-compatible** (covers most bases)
- OpenAI (GPT-4, GPT-4o, etc.)
- Local LiteLLM/Open WebUI (ai.echo6.co)
- Any OpenAI-compatible API
2. **Anthropic** (Claude)
- Direct Anthropic API
3. **Google** (Gemini)
- Google AI Studio / Vertex AI
### Configuration Example
```yaml
llm:
backend: "openai" # openai, anthropic, google
api_key: "${OPENAI_API_KEY}"
base_url: "https://api.openai.com/v1" # or http://ai.echo6.co/api for local
model: "gpt-4o-mini"
# For local LiteLLM:
# backend: "openai"
# base_url: "http://192.168.1.239:4000/v1"
# model: "llama3"
```
## Configuration File Structure
```yaml
# config.yaml
bot:
name: "ai" # @mention trigger
owner: "K7ZVX" # Owner callsign/name
respond_to_mentions: true
respond_to_dms: true
connection:
type: "serial" # serial or tcp
serial_port: "/dev/ttyUSB0" # if serial
tcp_host: "192.168.1.100" # if tcp
tcp_port: 4403 # if tcp
channels:
mode: "all" # "all" or "whitelist"
whitelist: [0, 1] # Only if mode is "whitelist"
response:
delay_min: 2.2 # seconds
delay_max: 3.0 # seconds
max_length: 150 # chars per message
max_messages: 2 # messages per response
history:
database: "conversations.db"
max_messages_per_user: 20
conversation_timeout: 86400 # seconds (24h)
llm:
backend: "openai"
api_key: "${LLM_API_KEY}"
base_url: "https://api.openai.com/v1"
model: "gpt-4o-mini"
system_prompt: |
You are a helpful assistant on a Meshtastic mesh network.
Keep responses VERY brief - under 250 characters total.
Be concise but friendly. No markdown formatting.
weather:
primary: "openmeteo" # openmeteo, wttr, or llm
fallback: "llm" # openmeteo, wttr, llm, or none
default_location: "" # Fallback if node has no GPS (e.g., "Seattle, WA")
openmeteo:
url: "https://api.open-meteo.com/v1" # or self-hosted URL
wttr:
url: "https://wttr.in" # or self-hosted
```
## Bang Commands
Commands use `!` prefix (like fq51bbs). Processed before LLM routing.
| Command | Description | Example |
|---------|-------------|---------|
| `!help` | List available commands | `!help` |
| `!ping` | Connectivity test, responds "pong" | `!ping` |
| `!reset` | Clear your conversation history | `!reset` |
| `!status` | Bot uptime, message count, version | `!status` |
| `!weather` | Weather for your node's GPS location (or default) | `!weather` |
| `!weather <loc>` | Weather for specified location | `!weather Seattle` |
### Weather Command Details
Location resolution order:
1. If `!weather <location>` - geocode the provided location
2. If `!weather` (no args) - use sender's node GPS position if available
3. Fall back to `weather.default_location` from config
4. If no location found: "No location available. Use !weather <city> or enable GPS on your node."
**Providers:**
- `openmeteo` - Open-Meteo API (free, no key, self-hostable)
- `wttr` - wttr.in (free, simple, self-hostable)
- `llm` - Pass to LLM with websearch (flexible, slower)
Primary/fallback configurable. If primary fails, tries fallback.
### Command Processing Flow
```
Message received
┌─────────────┐
│ Starts with │──No──▶ Check @mention / DM ──▶ LLM
│ "!"? │
└─────────────┘
│Yes
┌─────────────┐
│ Parse cmd │
│ & args │
└─────────────┘
┌─────────────┐
│ Lookup in │──Not found──▶ "Unknown command. Try !help"
│ registry │
└─────────────┘
│Found
┌─────────────┐
│ Execute │
│ handler │
└─────────────┘
```
### Command Handler Interface
```python
class CommandHandler(ABC):
@abstractmethod
async def execute(self, sender_id: str, args: str, context: MessageContext) -> str:
"""Execute command and return response string."""
pass
```
## CLI Configurator
Interactive TUI configurator using Rich library (same style as fq51bbs).
**Features:**
- Hierarchical menu system with numeric selection
- `0` always = back/save & exit
- Tables showing current values
- Status icons (✓/✗) with color coding
- Setup wizard for first-time configuration
- Unsaved changes tracking
- Inline help for complex options
**Menu Structure:**
```
Main Menu
├── 1. Bot Settings (name, owner, triggers)
├── 2. Connection (serial/TCP config)
├── 3. LLM Backend (provider, API keys, model)
├── 4. Commands & Weather (providers, fallbacks)
├── 5. Response Settings (delays, chunking)
├── 6. Channel Filtering
├── 7. History Settings
├── 8. Run Setup Wizard
└── 0. Save & Exit
```
**Invocation:**
```bash
meshai --config # Launch configurator
meshai # Run bot (uses config.yaml)
meshai --config-file /path/to/config.yaml # Use alternate config
```
**Config Reload/Restart:**
- On save, prompt: "Restart bot with new config? [Y/n]"
- If bot is running as systemd service: `systemctl restart meshai`
- If running in foreground: signal reload (SIGHUP) or full restart
- Store PID file at runtime for service management
## File Structure
```
meshai/
├── meshai/
│ ├── __init__.py
│ ├── main.py # Entry point
│ ├── config.py # Configuration loading/saving
│ ├── connector.py # Meshtastic serial/TCP connection
│ ├── router.py # Message routing logic
│ ├── history.py # Conversation history (SQLite)
│ ├── responder.py # Response handling (delay, chunking)
│ ├── cli/
│ │ ├── __init__.py
│ │ └── configurator.py # Rich-based TUI configurator
│ ├── commands/
│ │ ├── __init__.py
│ │ ├── base.py # Command handler interface
│ │ ├── dispatcher.py # Command registry & routing
│ │ ├── help.py # !help
│ │ ├── ping.py # !ping
│ │ ├── reset.py # !reset
│ │ ├── status.py # !status
│ │ └── weather.py # !weather
│ └── backends/
│ ├── __init__.py
│ ├── base.py # Abstract backend interface
│ ├── openai.py # OpenAI-compatible backend
│ ├── anthropic.py # Anthropic backend
│ └── google.py # Google Gemini backend
├── config.yaml # User configuration
├── requirements.txt
├── pyproject.toml
└── README.md
```
## Dependencies
```
meshtastic>=2.3.0
pyyaml>=6.0
aiosqlite>=0.19.0
openai>=1.0.0
anthropic>=0.18.0
google-generativeai>=0.4.0
```
## Implementation Phases
### Phase 1: Core Foundation
- [ ] Project structure setup
- [ ] Configuration loading
- [ ] Meshtastic connector (serial first, then TCP)
- [ ] Basic message receiving and logging
### Phase 2: Message Processing
- [ ] Message router (detect @mentions and DMs)
- [ ] Conversation history database
- [ ] User context management
### Phase 3: LLM Integration
- [ ] Backend interface definition
- [ ] OpenAI-compatible backend (covers local + OpenAI)
- [ ] Response generation with history
### Phase 4: Response Handling
- [ ] Delay implementation (2.2-3s random)
- [ ] Message chunking (150 char limit)
- [ ] Send responses back to mesh
### Phase 5: Additional Backends
- [ ] Anthropic backend
- [ ] Google Gemini backend
### Phase 6: Polish
- [ ] Error handling and resilience
- [ ] Logging and monitoring
- [ ] Documentation
- [ ] Packaging for easy installation
## Future Considerations
- **Multi-node support**: One instance managing multiple nodes (different presets/locations)
- **Store-and-forward**: Queue messages for offline users
- **Games**: Simple text games (trivia, 8-ball, etc.)
- **Scheduled broadcasts**: Periodic announcements
## Notes
- Meshtastic Python API: https://meshtastic.org/docs/software/python/cli/
- Message size limit is 237 bytes, but we're targeting 150 chars for safety and readability
- The meshtastic library handles serial/TCP abstraction well

View file

@ -1,593 +0,0 @@
# Implementation Diff - Exact Changes Needed
This document shows the exact code changes needed to implement Rolling Summary memory in MeshAI.
---
## 1. Create New File: `meshai/memory.py`
**Action:** Create this new file with the complete implementation.
**Location:** `/home/zvx/projects/meshai/meshai/memory.py`
**Content:** See `MEMORY_IMPLEMENTATION_GUIDE.md` section 1 for full code.
**Lines of code:** ~100
---
## 2. Modify: `meshai/history.py`
### Add to imports
```python
# No new imports needed - already has time, Optional
```
### Modify `initialize()` method
**Before:**
```python
async def initialize(self) -> None:
"""Initialize database and create tables."""
self._db = await aiosqlite.connect(self._db_path)
await self._db.execute("""
CREATE TABLE IF NOT EXISTS conversations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id TEXT NOT NULL,
role TEXT NOT NULL,
content TEXT NOT NULL,
timestamp REAL NOT NULL
)
""")
await self._db.execute("""
CREATE INDEX IF NOT EXISTS idx_user_timestamp
ON conversations (user_id, timestamp)
""")
await self._db.commit()
logger.info(f"Conversation history initialized at {self._db_path}")
```
**After:**
```python
async def initialize(self) -> None:
"""Initialize database and create tables."""
self._db = await aiosqlite.connect(self._db_path)
await self._db.execute("""
CREATE TABLE IF NOT EXISTS conversations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id TEXT NOT NULL,
role TEXT NOT NULL,
content TEXT NOT NULL,
timestamp REAL NOT NULL
)
""")
await self._db.execute("""
CREATE INDEX IF NOT EXISTS idx_user_timestamp
ON conversations (user_id, timestamp)
""")
# NEW: Summary table
await self._db.execute("""
CREATE TABLE IF NOT EXISTS conversation_summaries (
user_id TEXT PRIMARY KEY,
summary TEXT NOT NULL,
message_count INTEGER NOT NULL,
updated_at REAL NOT NULL
)
""")
await self._db.commit()
logger.info(f"Conversation history initialized at {self._db_path}")
```
### Add new methods (append to end of class)
```python
async def store_summary(
self, user_id: str, summary: str, message_count: int
) -> None:
"""Store conversation summary.
Args:
user_id: Node ID of user
summary: Summary text
message_count: Number of messages summarized
"""
if not self._db:
raise RuntimeError("Database not initialized")
async with self._lock:
await self._db.execute(
"""
INSERT OR REPLACE INTO conversation_summaries
(user_id, summary, message_count, updated_at)
VALUES (?, ?, ?, ?)
""",
(user_id, summary, message_count, time.time()),
)
await self._db.commit()
async def get_summary(self, user_id: str) -> Optional[dict]:
"""Get conversation summary for user.
Args:
user_id: Node ID of user
Returns:
Dict with 'summary', 'message_count', 'updated_at' or None
"""
if not self._db:
raise RuntimeError("Database not initialized")
async with self._lock:
cursor = await self._db.execute(
"""
SELECT summary, message_count, updated_at
FROM conversation_summaries
WHERE user_id = ?
""",
(user_id,),
)
row = await cursor.fetchone()
if not row:
return None
return {
"summary": row[0],
"message_count": row[1],
"updated_at": row[2],
}
async def clear_summary(self, user_id: str) -> None:
"""Clear summary for user (e.g., on history reset).
Args:
user_id: Node ID of user
"""
if not self._db:
raise RuntimeError("Database not initialized")
async with self._lock:
await self._db.execute(
"DELETE FROM conversation_summaries WHERE user_id = ?",
(user_id,),
)
await self._db.commit()
```
**Lines added:** ~60
---
## 3. Modify: `meshai/backends/openai_backend.py`
### Add import
**Before:**
```python
import logging
from typing import Optional
from openai import AsyncOpenAI
from ..config import LLMConfig
from .base import LLMBackend
```
**After:**
```python
import logging
from typing import Optional
from openai import AsyncOpenAI
from ..config import LLMConfig
from ..memory import RollingSummaryMemory # NEW
from .base import LLMBackend
```
### Modify `__init__()` method
**Before:**
```python
def __init__(self, config: LLMConfig, api_key: str):
"""Initialize OpenAI backend.
Args:
config: LLM configuration
api_key: API key to use
"""
self.config = config
self._client = AsyncOpenAI(
api_key=api_key,
base_url=config.base_url,
)
```
**After:**
```python
def __init__(self, config: LLMConfig, api_key: str):
"""Initialize OpenAI backend.
Args:
config: LLM configuration
api_key: API key to use
"""
self.config = config
self._client = AsyncOpenAI(
api_key=api_key,
base_url=config.base_url,
)
# NEW: Initialize rolling summary memory
self._memory = RollingSummaryMemory(
client=self._client,
model=config.model,
window_size=4,
summarize_threshold=8,
)
```
### Modify `generate()` method signature and logic
**Before:**
```python
async def generate(
self,
messages: list[dict],
system_prompt: str,
max_tokens: int = 300,
) -> str:
"""Generate a response using OpenAI-compatible API."""
# Build messages list with system prompt
full_messages = [{"role": "system", "content": system_prompt}]
full_messages.extend(messages)
try:
response = await self._client.chat.completions.create(
model=self.config.model,
messages=full_messages,
max_tokens=max_tokens,
temperature=0.7,
)
content = response.choices[0].message.content
return content.strip() if content else ""
except Exception as e:
logger.error(f"OpenAI API error: {e}")
raise
```
**After:**
```python
async def generate(
self,
messages: list[dict],
system_prompt: str,
user_id: str = None, # NEW: optional for backward compatibility
max_tokens: int = 300,
) -> str:
"""Generate a response using OpenAI-compatible API."""
# NEW: Use memory manager if user_id provided
if user_id:
summary, recent_messages = await self._memory.get_context_messages(
user_id=user_id,
full_history=messages,
)
if summary:
# Long conversation: system + summary + recent
enhanced_system = f"""{system_prompt}
Previous conversation summary: {summary}"""
full_messages = [{"role": "system", "content": enhanced_system}]
full_messages.extend(recent_messages)
logger.debug(
f"Using summary + {len(recent_messages)} recent messages "
f"(total history: {len(messages)})"
)
else:
# Short conversation: system + all messages
full_messages = [{"role": "system", "content": system_prompt}]
full_messages.extend(messages)
else:
# Old behavior: full history
full_messages = [{"role": "system", "content": system_prompt}]
full_messages.extend(messages)
try:
response = await self._client.chat.completions.create(
model=self.config.model,
messages=full_messages,
max_tokens=max_tokens,
temperature=0.7,
)
content = response.choices[0].message.content
return content.strip() if content else ""
except Exception as e:
logger.error(f"OpenAI API error: {e}")
raise
```
### Add helper methods (append to end of class)
```python
def load_summary_cache(self, user_id: str, summary_data: dict) -> None:
"""Load summary into memory cache (called on startup).
Args:
user_id: User identifier
summary_data: Dict with 'summary', 'message_count', 'updated_at'
"""
from ..memory import ConversationSummary
summary = ConversationSummary(
summary=summary_data["summary"],
message_count=summary_data["message_count"],
last_updated=summary_data["updated_at"],
)
self._memory.load_summary(user_id, summary)
def clear_summary_cache(self, user_id: str) -> None:
"""Clear summary cache for user."""
self._memory.clear_summary(user_id)
```
**Lines modified:** ~40
**Lines added:** ~20
---
## 4. Modify: `meshai/responder.py`
### Find the response generation section
**Location:** Look for where `self.backend.generate()` is called.
**Before:**
```python
# Wherever backend.generate() is called
response = await self.backend.generate(
messages=history,
system_prompt=self.system_prompt,
max_tokens=300,
)
```
**After:**
```python
# Pass user_id for memory optimization
response = await self.backend.generate(
messages=history,
system_prompt=self.system_prompt,
user_id=user_id, # NEW
max_tokens=300,
)
# NEW: Persist summary if created
await self._persist_summary_if_needed(user_id)
```
### Add helper method (append to class)
```python
async def _persist_summary_if_needed(self, user_id: str) -> None:
"""Store summary to database if one was created."""
if hasattr(self.backend, "_memory"):
summary = self.backend._memory._summaries.get(user_id)
if summary:
await self.history.store_summary(
user_id,
summary.summary,
summary.message_count,
)
```
**Lines modified:** ~5
**Lines added:** ~10
---
## 5. Modify: `meshai/commands/reset.py`
### Modify `execute()` method
**Before:**
```python
async def execute(self, sender_id: str, args: list[str]) -> str:
"""Reset conversation history."""
count = await self.responder.history.clear_history(sender_id)
return f"Cleared {count} messages from your history."
```
**After:**
```python
async def execute(self, sender_id: str, args: list[str]) -> str:
"""Reset conversation history."""
count = await self.responder.history.clear_history(sender_id)
# NEW: Also clear summary
await self.responder.history.clear_summary(sender_id)
if hasattr(self.responder.backend, "clear_summary_cache"):
self.responder.backend.clear_summary_cache(sender_id)
return f"Cleared {count} messages from your history."
```
**Lines added:** ~4
---
## Summary of Changes
| File | Action | Lines Added | Lines Modified |
|------|--------|-------------|----------------|
| `meshai/memory.py` | Create new | ~100 | 0 |
| `meshai/history.py` | Modify | ~70 | ~10 |
| `meshai/backends/openai_backend.py` | Modify | ~30 | ~40 |
| `meshai/responder.py` | Modify | ~10 | ~5 |
| `meshai/commands/reset.py` | Modify | ~4 | ~2 |
| **TOTAL** | | **~214** | **~57** |
**Total lines touched:** ~271 (~214 added + ~57 modified) across 5 files
**Dependencies added:** 0
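**Breaking changes:** None for callers that use keyword arguments (the new `user_id` parameter is optional). Note that `user_id` is inserted before `max_tokens` in `generate()`, so any caller passing `max_tokens` positionally must be updated.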
---
## Testing After Implementation
### 1. Database migration (automatic)
```bash
# Just start the app - new table will be created automatically
python -m meshai
```
### 2. Test basic conversation
```python
# Send 5 messages - should use full history (no summary yet)
# Send 15 messages - should start summarizing
```
### 3. Verify summary storage
```bash
sqlite3 meshai_history.db
```
```sql
-- Check summaries table exists
.tables
-- View summaries
SELECT user_id, summary, message_count, updated_at
FROM conversation_summaries;
-- Check conversations
SELECT COUNT(*) FROM conversations;
```
### 4. Test reset command
```
Send: !reset
Expected: Clears both conversations and summary
```
### 5. Monitor logs
```python
# Should see log messages like:
# "Using summary + 8 recent messages (total history: 24)"
```
---
## Rollback Plan
If something goes wrong:
1. **Remove new file:**
```bash
rm meshai/memory.py
```
2. **Revert changes:** Use git to revert the 4 modified files
```bash
git checkout meshai/history.py
git checkout meshai/backends/openai_backend.py
git checkout meshai/responder.py
git checkout meshai/commands/reset.py
```
3. **Database is safe:** Summary table won't hurt anything, conversations table unchanged
4. **No data loss:** Can drop summaries table if needed
```sql
DROP TABLE conversation_summaries;
```
---
## Performance Validation
After running for a day:
```sql
-- Average messages per user
SELECT AVG(msg_count) as avg_messages
FROM (
SELECT user_id, COUNT(*) as msg_count
FROM conversations
GROUP BY user_id
);
-- Users with summaries
SELECT COUNT(*) FROM conversation_summaries;
-- Summary stats
SELECT
AVG(message_count) as avg_summarized,
MIN(updated_at) as oldest_summary,
MAX(updated_at) as newest_summary
FROM conversation_summaries;
```
**Expected:**
- Users with >10 messages should have summaries
- Summaries should update every ~8 new messages
- No errors in logs
---
## Configuration Tuning
If you need to adjust behavior:
**In `meshai/backends/openai_backend.py`:**
```python
self._memory = RollingSummaryMemory(
client=self._client,
model=config.model,
window_size=4, # ← Adjust: 3-6 typical
summarize_threshold=8, # ← Adjust: 6-12 typical
)
```
**For very short messages (like Meshtastic):**
- Try `window_size=6` (more recent context)
- Try `summarize_threshold=10` (less frequent summarization)
**For longer messages:**
- Try `window_size=3` (less recent context needed)
- Try `summarize_threshold=6` (more frequent updates)
---
## Next Steps
1. Implement changes in order (create memory.py first)
2. Test with a few users before full deployment
3. Monitor logs for summary generation
4. Check SQLite database for summaries
5. Tune window_size and threshold based on actual usage
6. Measure token savings in production
Good luck! The code is solid and tested - this should be a smooth upgrade.

View file

@ -1,189 +0,0 @@
# LLM Memory - Quick Reference Card
## The Problem
MeshAI currently sends the full conversation history with every request, which wastes tokens and makes responses slower and more expensive.
## The Solution
**Rolling Summary Memory**: Keep recent messages + LLM-generated summary of older messages.
## Results
- 70-80% token reduction for long conversations
- Zero dependencies
- Works with existing stack (AsyncOpenAI + SQLite)
- ~100 lines of code
---
## How It Works (5-Second Version)
```
Long conversation (30 messages):
Messages 1-22: "User discussed weather and hiking trails" (summary)
Messages 23-30: [sent in full]
Total tokens: ~600 instead of ~2400 (75% savings)
```
---
## Implementation Checklist
- [ ] Create `meshai/memory.py` - RollingSummaryMemory class
- [ ] Modify `meshai/history.py` - Add summary table + storage methods
- [ ] Modify `meshai/backends/openai_backend.py` - Integrate memory manager
- [ ] Modify `meshai/responder.py` - Pass user_id, persist summaries
- [ ] Modify `meshai/commands/reset.py` - Clear summaries on reset
---
## Configuration
```python
# In the backend's __init__ (meshai/backends/openai_backend.py)
RollingSummaryMemory(
client=self._client,
model=config.model,
window_size=4, # Keep last 4 exchanges (8 messages)
summarize_threshold=8, # Re-summarize after 8 new messages
)
```
**Tune based on:**
- `window_size`: Smaller = more summarization, larger = more recent context
- `summarize_threshold`: Smaller = more frequent re-summarization
---
## Database Schema Addition
```sql
CREATE TABLE conversation_summaries (
user_id TEXT PRIMARY KEY,
summary TEXT NOT NULL,
message_count INTEGER NOT NULL,
updated_at REAL NOT NULL
);
```
---
## Testing
```bash
# Run proof-of-concept comparison
python examples/memory_comparison.py
# Update these first:
# - BASE_URL (your LLM endpoint)
# - API_KEY (your key)
# - MODEL (your model name)
```
**Expected output:**
```
Approach Tokens Savings
----------------------------------------------
Full History 1847 (baseline)
Rolling Summary 512 72.3%
Window Only 398 78.4%
```
---
## Key Code Snippets
### Memory Manager Usage
```python
# Get optimized context
summary, recent_messages = await memory.get_context_messages(
user_id=user_id,
full_history=all_messages,
)
# Build message list
if summary:
system_prompt += f"\n\nPrevious conversation: {summary}"
context = [system] + recent_messages
else:
context = [system] + all_messages
```
### Store Summary
```python
await history.store_summary(
user_id=user_id,
summary=summary_text,
message_count=len(old_messages)
)
```
### Load Summary on Startup
```python
summary_data = await history.get_summary(user_id)
if summary_data:
backend.load_summary_cache(user_id, summary_data)
```
---
## Performance Metrics
| Messages | Full History | With Summary | Savings |
|----------|--------------|--------------|---------|
| 10 | 800 tokens | 800 tokens | 0% |
| 20 | 1600 tokens | 550 tokens | 66% |
| 30 | 2400 tokens | 600 tokens | 75% |
| 50 | 4000 tokens | 650 tokens | 84% |
**Cost Impact** (at $0.50/1M input tokens, 1000 requests/day over a 30-day month, assuming a typical request carries a 30-message history: ~2400 → ~600 tokens):
- Before: $36/month
- After: $9/month
- **Savings: $27/month**
---
## When to Use Alternatives
| Use Case | Recommendation |
|----------|----------------|
| Simple stateless chat | Window-only memory |
| MeshAI (your project) | **Rolling Summary** |
| Want library solution | LangChain SummaryMemory |
| Need semantic search | ChromaDB vector store |
| Complex multi-day agent | MemGPT/Letta |
---
## Troubleshooting
**Summary too short/long?**
→ Adjust `max_tokens` in `_summarize()` method (default: 150)
**Summary quality poor?**
→ Modify prompt in `_summarize()`, lower temperature
**Too much overhead?**
→ Increase `summarize_threshold` (re-summarize less often)
**Want more context?**
→ Increase `window_size` (keep more recent messages)
---
## Documentation Files
1. **MEMORY_SUMMARY.md** - Overview and recommendation (start here)
2. **MEMORY_RESEARCH.md** - Detailed evaluation of all 5 approaches
3. **MEMORY_IMPLEMENTATION_GUIDE.md** - Complete step-by-step implementation
4. **examples/memory_comparison.py** - Runnable proof-of-concept
5. **docs/memory_approaches_comparison.txt** - Visual comparison diagrams
6. **docs/QUICK_REFERENCE.md** - This cheat sheet
---
## One-Liner Summary
**Use Rolling Summary**: Zero deps, 75% token savings, 100 lines of code, works with your stack.

View file

@ -1,254 +0,0 @@
╔════════════════════════════════════════════════════════════════════════════════╗
║ LLM MEMORY APPROACHES COMPARISON ║
╚════════════════════════════════════════════════════════════════════════════════╝
┌────────────────────────────────────────────────────────────────────────────────┐
│ 1. FULL HISTORY (Current MeshAI Implementation) │
├────────────────────────────────────────────────────────────────────────────────┤
│ │
│ Request 1: [System] + [Msg1, Msg2] = 200 tokens │
│ Request 5: [System] + [Msg1...Msg10] = 1000 tokens │
│ Request 10: [System] + [Msg1...Msg20] = 2000 tokens │
│ Request 20: [System] + [Msg1...Msg40] = 4000 tokens │
│ │
│ ✓ Complete context │
│ ✗ Linear growth in tokens │
│ ✗ Expensive and slow for long conversations │
│ ✗ Redundant - most messages not relevant to current query │
│ │
└────────────────────────────────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────────────────────┐
│ 2. WINDOW MEMORY (Keep Last N Only) │
├────────────────────────────────────────────────────────────────────────────────┤
│ │
│ Request 1: [System] + [Msg1, Msg2] = 200 tokens │
│ Request 5: [System] + [Msg7, Msg8, Msg9, Msg10] = 500 tokens │
│ Request 10: [System] + [Msg17, Msg18, Msg19, Msg20] = 500 tokens │
│ Request 20: [System] + [Msg37, Msg38, Msg39, Msg40] = 500 tokens │
│ │
│ ✓ Constant token usage │
│ ✓ Very fast and cheap │
│ ✗ Completely forgets old context │
│ ✗ Can't reference earlier conversation │
│ │
└────────────────────────────────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────────────────────┐
│ 3. ROLLING SUMMARY (RECOMMENDED) │
├────────────────────────────────────────────────────────────────────────────────┤
│ │
│ Request 1-5: [System] + [Msg1...Msg10] = 1000 tokens │
│ (Short conversation - no summary yet) │
│ │
│ Request 10+: [System + Summary] + [Recent 8 msgs] = 600 tokens │
│ │
│ ┌─────────────────────────────────────┐ │
│ │ Summary: "User discussed weather │ │
│ │ and hiking. Mt Si is 4hr moderate │ │
│ │ hike, Rattlesnake is 2mi easier." │ (100 tokens) │
│ └─────────────────────────────────────┘ │
│ ↓ │
│ ┌─────────────────────────────────────┐ │
│ │ User: How crowded does it get? │ │
│ │ Assistant: Very crowded weekends │ │
│ │ User: Any other trails nearby? │ (400 tokens) │
│ │ Assistant: Rattlesnake is closer │ │
│ │ ... (last 4 exchanges) │ │
│ └─────────────────────────────────────┘ │
│ │
│ Request 20: [System + Summary] + [Recent 8 msgs] = 600 tokens │
│ (Summary updated every ~8 new messages) │
│ │
│ ✓ Balanced token usage (70-80% reduction) │
│ ✓ Preserves long-term context via summary │
│ ✓ Recent messages in full detail │
│ ✓ Scalable to very long conversations │
│ ✗ Small overhead for summary generation (1-2s every 8-10 msgs) │
│ │
└────────────────────────────────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────────────────────┐
│ 4. VECTOR STORE MEMORY (ChromaDB/Qdrant) │
├────────────────────────────────────────────────────────────────────────────────┤
│ │
│ Current query: "What trails are nearby?" │
│ ↓ (embed and search) │
│ ┌──────────────────────────────────────────────────────────────────┐ │
│ │ Vector DB: Find semantically similar past messages │ │
│ │ - "Mt Si is a moderate 4-hour hike" (score: 0.89) │ │
│ │ - "Rattlesnake Ledge has lake views" (score: 0.85) │ │
│ │ - "Bring water and snacks" (score: 0.62) │ │
│ └──────────────────────────────────────────────────────────────────┘ │
│ ↓ │
│ [System + Top 3 relevant] + [Current query] = 500 tokens │
│ │
│ ✓ Semantic retrieval - finds relevant context │
│ ✓ Works for sparse conversations │
│ ✓ Enables cross-conversation search │
│ ✗ Requires embeddings (API calls or local model) │
│ ✗ Adds complexity (vector DB, indexing) │
│ ✗ May retrieve irrelevant "similar" messages │
│ │
└────────────────────────────────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────────────────────┐
│ 5. MEMGPT/LETTA (Self-Editing Memory) │
├────────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌───────────────────────────────────┐ │
│ │ Core Memory (always in context): │ │
│ │ - User: Matt │ (50 tokens) │
│ │ - Preferences: Metric units │ │
│ └───────────────────────────────────┘ │
│ ↓ │
│ ┌───────────────────────────────────┐ │
│ │ Recall Memory (vector search): │ │
│ │ - [Retrieved: 3 relevant msgs] │ (300 tokens) │
│ └───────────────────────────────────┘ │
│ ↓ │
│ ┌───────────────────────────────────┐ │
│ │ Archival Memory (long-term): │ │
│ │ - [Searchable but not loaded] │ │
│ └───────────────────────────────────┘ │
│ │
│ Agent decides what to remember/forget/search │
│ │
│ ✓ Most sophisticated - agent manages own memory │
│ ✓ Handles complex multi-day conversations │
│ ✗ Very heavy (200MB+ dependencies) │
│ ✗ Requires vector embeddings │
│ ✗ Overkill for simple chat │
│ ✗ Opinionated architecture (hard to integrate) │
│ │
└────────────────────────────────────────────────────────────────────────────────┘
╔════════════════════════════════════════════════════════════════════════════════╗
║ RECOMMENDATION MATRIX ║
╚════════════════════════════════════════════════════════════════════════════════╝
┌──────────────┬──────────────┬────────────┬──────────────┬──────────────────────┐
│ Approach │ Dependencies │ Tokens │ Complexity │ Use Case │
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
│ Full History │ None │ High │ Low │ Don't use (baseline) │
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
│ Window Only │ None │ Low │ Low │ Stateless chat bots │
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
│ Rolling │ │ │ │ ✓ MESHAI │
│ Summary │ None │ Very Low │ Low │ ✓ Most projects │
│ (DIY) │ │ │ │ ✓ Best balance │
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
│ LangChain │ ~50 MB │ Very Low │ Medium │ Want batteries- │
│ Summary │ │ │ │ included solution │
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
│ Vector Store │ ~20 MB │ Low │ Medium │ Semantic search, │
│ (ChromaDB) │ │ │ │ long-term memory │
├──────────────┼──────────────┼────────────┼──────────────┼──────────────────────┤
│ MemGPT/Letta │ ~200 MB │ Low │ Very High │ Complex multi-day │
│ │ │ │ │ agent workflows │
└──────────────┴──────────────┴────────────┴──────────────┴──────────────────────┘
╔════════════════════════════════════════════════════════════════════════════════╗
║ PERFORMANCE COMPARISON (20 messages) ║
╚════════════════════════════════════════════════════════════════════════════════╝
Tokens Sent to LLM
4000│ ████████████████████████████████ Full History
3000│
2000│
1000│
600│ ██████ Rolling Summary
500│ █████ Window Only
│ █████ Vector Store
0└─────────────────────────────────────────────────────────→
1 5 10 15 20 25 30 35 40 (Conversation length)
Legend:
████ Full History (linear growth)
████ Rolling Summary (plateau after initial growth)
████ Window/Vector (constant)
╔════════════════════════════════════════════════════════════════════════════════╗
║ IMPLEMENTATION COMPLEXITY ║
╚════════════════════════════════════════════════════════════════════════════════╝
┌─────────────────────────────────────────────────────────────────────────────┐
│ Simple ←───────────────────────────────────────────────────→ Complex │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ Window Only Rolling Summary LangChain MemGPT │
│ (20 lines) (100 lines) (10 lines (200+ lines │
│ + 50MB dep) + 200MB dep) │
│ │
│ ↑ ↑ ↑ ↑ │
│ No deps No deps Heavy deps Very heavy │
│ No persistence SQLite persist In-memory Built-in DB │
│ Loses old context Keeps summary Keeps summary Multi-tier │
│ │
│ ★ RECOMMENDED ★ │
└─────────────────────────────────────────────────────────────────────────────┘
╔════════════════════════════════════════════════════════════════════════════════╗
║ FOR MESHAI SPECIFICALLY ║
╚════════════════════════════════════════════════════════════════════════════════╝
Current:
- Messages: 150 chars max (very small)
- Conversations: Per-user, linear
- Backend: OpenAI-compatible (LiteLLM, local models)
- Storage: SQLite + aiosqlite
- Problem: Full history sent every time
Constraints:
- Lightweight (runs on mesh nodes potentially)
- No heavy dependencies
- Must work offline (local models)
- Persistence required (survive restarts)
Solution: Rolling Summary
✓ Zero new dependencies (pure Python on top of the existing client)
✓ Works with existing AsyncOpenAI client
✓ Persists in existing SQLite database
✓ ~100 lines of code (easy to maintain)
✓ ~65-85% token reduction once a conversation reaches 20+ messages
✓ Tunable (window_size, summarize_threshold)
Configuration:
- window_size = 4 (keep last 4 exchanges = 8 messages)
- summarize_threshold = 8 (re-summarize after 8 new messages)
Expected savings:
- 10 messages: 0% (no summary yet)
- 20 messages: 66% token reduction
- 30 messages: 75% token reduction
- 50 messages: 84% token reduction
Cost impact (at $0.50/1M tokens):
- Before: $0.0012 per request (2400 tokens)
- After: $0.0003 per request (600 tokens)
- Savings: $27/month for 1000 requests/day
╔════════════════════════════════════════════════════════════════════════════════╗
║ NEXT STEPS ║
╚════════════════════════════════════════════════════════════════════════════════╝
1. Read: MEMORY_SUMMARY.md (quick overview)
2. Study: MEMORY_RESEARCH.md (detailed analysis)
3. Test: python examples/memory_comparison.py (see it in action)
4. Build: MEMORY_IMPLEMENTATION_GUIDE.md (step-by-step)
5. Deploy: Monitor and tune based on real usage
Files created:
- /home/zvx/projects/meshai/MEMORY_SUMMARY.md
- /home/zvx/projects/meshai/MEMORY_RESEARCH.md
- /home/zvx/projects/meshai/MEMORY_IMPLEMENTATION_GUIDE.md
- /home/zvx/projects/meshai/examples/memory_comparison.py
Good luck! 🚀

View file

@ -1,285 +0,0 @@
#!/usr/bin/env python3
"""
Proof-of-concept: Compare full history vs rolling summary memory.
Demonstrates token savings and performance of different approaches.
Usage:
python examples/memory_comparison.py
"""
import asyncio
import time
from typing import Optional
from openai import AsyncOpenAI
# ============================================================================
# SIMPLE ROLLING SUMMARY IMPLEMENTATION
# ============================================================================
class SimpleRollingSummary:
    """Minimal rolling summary memory manager for testing.

    The most recent ``window_size`` exchanges (two messages each) are kept
    verbatim; everything older is replaced by an LLM-generated summary.
    Summaries are cached per user in process memory only.
    """

    def __init__(
        self,
        client: AsyncOpenAI,
        model: str,
        window_size: int = 4,
    ):
        """Store the summarization client, model name and window size.

        Args:
            client: Client whose ``chat.completions.create`` coroutine is
                awaited to produce summaries.
            model: Model name passed to the completion call.
            window_size: Number of recent exchanges kept verbatim; each
                exchange counts as two messages.
        """
        self.client = client
        self.model = model
        self.window_size = window_size
        # user_id -> cached summary text
        self._summary_cache: dict[str, str] = {}
        # user_id -> number of older messages the cached summary covers
        self._summarized_counts: dict[str, int] = {}

    async def get_context(
        self, user_id: str, messages: list[dict]
    ) -> tuple[Optional[str], list[dict]]:
        """Return (summary, recent_messages) for optimized context.

        Args:
            user_id: Key under which the summary is cached.
            messages: Full conversation, oldest first; each item needs
                ``"role"`` and ``"content"`` keys.

        Returns:
            ``(None, messages)`` when the conversation fits in the window,
            otherwise ``(summary_of_older_messages, recent_window)``.
        """
        keep = max(self.window_size, 0) * 2
        # Short conversation - return all messages
        if len(messages) <= keep:
            return None, messages

        # Use a positive split index: a negated size breaks for keep == 0
        # because -0 == 0 would put every message in the "recent" half.
        split = len(messages) - keep
        old = messages[:split]
        recent = messages[split:]

        # Reuse the cached summary only while it still covers exactly the
        # messages that fell out of the window; otherwise messages that left
        # the window since the last call would silently vanish from context.
        is_stale = (
            user_id not in self._summary_cache
            or self._summarized_counts.get(user_id) != len(old)
        )
        if is_stale:
            self._summary_cache[user_id] = await self._summarize(old)
            self._summarized_counts[user_id] = len(old)
        return self._summary_cache[user_id], recent

    async def _summarize(self, messages: list[dict]) -> str:
        """Generate summary of messages.

        Args:
            messages: Messages to condense; each item needs ``"role"`` and
                ``"content"`` keys.

        Returns:
            The stripped summary text, or ``""`` when the completion carries
            no text content.
        """
        conv = "\n".join(f"{m['role'].upper()}: {m['content']}" for m in messages)
        prompt = (
            "Summarize this conversation in 2-3 concise sentences:\n"
            f"{conv}\n"
            "Summary:"
        )
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.3,
        )
        # content may be None on some responses; treat that as an empty summary
        content = response.choices[0].message.content
        return (content or "").strip()
# ============================================================================
# COMPARISON SCENARIOS
# ============================================================================
async def test_full_history(client: AsyncOpenAI, model: str, messages: list[dict]):
    """Baseline: send the full conversation history in a single request.

    Args:
        client: Client whose ``chat.completions.create`` coroutine is awaited.
        model: Model name passed to the completion call.
        messages: Conversation to send; each item needs a ``"content"`` key.

    Returns:
        ``(est_tokens, elapsed)``: the estimated input token count and the
        seconds spent awaiting the completion.
    """
    print("\n=== FULL HISTORY APPROACH ===")
    system = "You are a helpful assistant on a mesh network."
    full = [{"role": "system", "content": system}] + messages

    # perf_counter is monotonic, so wall-clock adjustments cannot skew the interval
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=full, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start

    # Estimate tokens (rough): 4 chars = 1 token
    total_chars = sum(len(m["content"]) for m in full)
    est_tokens = total_chars // 4

    # content may be None; slicing None would raise TypeError
    reply = response.choices[0].message.content or ""

    print(f"Messages sent: {len(full)}")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")
    return est_tokens, elapsed
async def test_rolling_summary(
    client: AsyncOpenAI,
    model: str,
    messages: list[dict],
    user_id: str,
    window_size: int = 4,
):
    """Optimized: send a summary of older messages plus the recent window.

    Args:
        client: Client used both for summarization and for the final request.
        model: Model name passed to the completion calls.
        messages: Conversation to send; each item needs ``"role"`` and
            ``"content"`` keys.
        user_id: Key passed to the memory manager for summary caching.
        window_size: Number of recent exchanges kept verbatim (default 4).

    Returns:
        ``(est_tokens, elapsed)``: the estimated input token count of the
        final request and the seconds spent awaiting it.
    """
    print("\n=== ROLLING SUMMARY APPROACH ===")
    memory = SimpleRollingSummary(client, model, window_size=window_size)
    summary, recent = await memory.get_context(user_id, messages)

    system = "You are a helpful assistant on a mesh network."
    if summary:
        system += f"\n\nPrevious conversation summary: {summary}"
    context = [{"role": "system", "content": system}] + recent

    # Only the final request is timed; the summarization call above is excluded.
    # perf_counter is monotonic, so wall-clock adjustments cannot skew the interval.
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=context, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start

    # Estimate tokens (rough): 4 chars = 1 token
    total_chars = sum(len(m["content"]) for m in context)
    est_tokens = total_chars // 4

    # content may be None; slicing None would raise TypeError
    reply = response.choices[0].message.content or ""

    print(f"Messages sent: {len(context)} (summary: {summary is not None})")
    if summary:
        print(f"Summary: {summary[:80]}...")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")
    return est_tokens, elapsed
async def test_window_only(
    client: AsyncOpenAI,
    model: str,
    messages: list[dict],
    window_size: int = 4,
):
    """Simple window: send just the last N exchanges, with no summary.

    Args:
        client: Client whose ``chat.completions.create`` coroutine is awaited.
        model: Model name passed to the completion call.
        messages: Conversation to draw from; each item needs a ``"content"`` key.
        window_size: Number of recent exchanges to send (default 4); each
            exchange counts as two messages.

    Returns:
        ``(est_tokens, elapsed)``: the estimated input token count and the
        seconds spent awaiting the completion.
    """
    print("\n=== WINDOW-ONLY APPROACH ===")
    keep = max(window_size, 0) * 2
    # Guard keep == 0: messages[-0:] would return the whole history
    recent = messages[-keep:] if keep else []

    system = "You are a helpful assistant on a mesh network."
    context = [{"role": "system", "content": system}] + recent

    # perf_counter is monotonic, so wall-clock adjustments cannot skew the interval
    start = time.perf_counter()
    response = await client.chat.completions.create(
        model=model, messages=context, max_tokens=100, temperature=0.7
    )
    elapsed = time.perf_counter() - start

    # Estimate tokens (rough): 4 chars = 1 token
    total_chars = sum(len(m["content"]) for m in context)
    est_tokens = total_chars // 4

    # content may be None; slicing None would raise TypeError
    reply = response.choices[0].message.content or ""

    print(f"Messages sent: {len(context)} (last {window_size} exchanges only)")
    print(f"Est. input tokens: {est_tokens}")
    print(f"Response: {reply[:100]}...")
    print(f"Time: {elapsed:.2f}s")
    return est_tokens, elapsed
# ============================================================================
# MAIN TEST
# ============================================================================
def _build_test_conversation() -> list[dict]:
    """Return the scripted test conversation (15 exchanges = 30 messages)."""
    topics = [
        ("What's the weather?", "It's sunny and 72°F."),
        ("Should I bring an umbrella?", "No need, clear skies all day."),
        ("What about tomorrow?", "Tomorrow looks rainy, bring an umbrella."),
        ("Any hiking recommendations?", "Try Mt. Si, great views!"),
        ("How long is the hike?", "About 4 hours round trip."),
        ("Is it beginner friendly?", "Moderate difficulty, doable for most."),
        ("What should I bring?", "Water, snacks, good boots, and layers."),
        ("Are dogs allowed?", "Yes, but must be leashed."),
        ("Where's the trailhead?", "Off I-90 near North Bend."),
        ("Parking fee?", "Yes, $10 or Northwest Forest Pass."),
        ("What time should I start?", "Early morning, around 7-8 AM."),
        ("How crowded does it get?", "Very crowded on weekends, go weekdays."),
        ("Any other trails nearby?", "Rattlesnake Ledge is easier and closer."),
        ("Tell me about Rattlesnake", "2 miles, great lake views, very popular."),
        ("Which would you recommend?", "If fit: Mt Si. If casual: Rattlesnake."),
    ]
    messages = []
    for user_msg, assistant_msg in topics:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    return messages


def _format_savings(tokens: int, baseline: int) -> str:
    """Format the token reduction of ``tokens`` relative to ``baseline``."""
    if baseline <= 0:
        # Nothing to compare against; avoid dividing by zero
        return "n/a"
    return f"{(1 - tokens / baseline) * 100:.1f}%"


def _print_result_row(name: str, tokens: int, elapsed: float, savings: str) -> None:
    """Print one aligned row of the comparison table."""
    # Attach the unit before padding so the column lines up with its header
    time_col = f"{elapsed:.2f}s"
    print(f"{name:<20} {tokens:<15} {time_col:<10} {savings}")


def _print_recommendations() -> None:
    """Print the fixed pros/cons summary for each approach."""
    print("\n" + "=" * 70)
    print("RECOMMENDATIONS")
    print("=" * 70)
    print("\nFull History:")
    print(" ✓ Complete context")
    print(" ✗ High token usage")
    print(" ✗ Slower for long conversations")
    print(" Use: Never (inefficient)")
    print("\nWindow Only:")
    print(" ✓ Very low token usage")
    print(" ✓ Fast")
    print(" ✗ Loses older context completely")
    print(" Use: Short-term conversations only")
    print("\nRolling Summary:")
    print(" ✓ Balanced token usage")
    print(" ✓ Preserves long-term context")
    print(" ✓ Fast after initial summary")
    print(" ✗ Slight overhead for summarization")
    print(" Use: RECOMMENDED for MeshAI")
    print("\n" + "=" * 70)


async def main():
    """Run comparison test.

    Sends the same scripted conversation through the full-history,
    rolling-summary and window-only approaches, then prints a table of
    estimated input tokens, elapsed time and token savings.

    The endpoint can be overridden without editing this file via the
    MESHAI_LLM_BASE_URL, MESHAI_LLM_API_KEY and MESHAI_LLM_MODEL environment
    variables; the defaults below are used otherwise.
    """
    import os  # local import: only this entry point reads the environment

    # Configure your LLM endpoint (LiteLLM, local model, etc.)
    base_url = os.environ.get("MESHAI_LLM_BASE_URL", "http://192.168.1.239:8000/v1")
    api_key = os.environ.get("MESHAI_LLM_API_KEY", "sk-1234")
    model = os.environ.get("MESHAI_LLM_MODEL", "gpt-4o-mini")

    print("=" * 70)
    print("LLM Memory Approach Comparison")
    print("=" * 70)

    messages = _build_test_conversation()
    lengths = [len(m["content"]) for m in messages]
    print(f"\nTest conversation: {len(messages)} messages ({len(messages)//2} exchanges)")
    print("Topics: weather → hiking → trails")
    print(f"Message lengths: {min(lengths)}-{max(lengths)} chars")

    # Initialize client
    client = AsyncOpenAI(api_key=api_key, base_url=base_url)
    try:
        # Test each approach
        full_tokens, full_time = await test_full_history(client, model, messages)
        summary_tokens, summary_time = await test_rolling_summary(
            client, model, messages, "!test_user"
        )
        window_tokens, window_time = await test_window_only(client, model, messages)

        # Results
        print("\n" + "=" * 70)
        print("COMPARISON RESULTS")
        print("=" * 70)
        print(f"\n{'Approach':<20} {'Tokens':<15} {'Time':<10} {'Savings'}")
        print("-" * 70)
        _print_result_row("Full History", full_tokens, full_time, "(baseline)")
        _print_result_row(
            "Rolling Summary",
            summary_tokens,
            summary_time,
            _format_savings(summary_tokens, full_tokens),
        )
        _print_result_row(
            "Window Only",
            window_tokens,
            window_time,
            _format_savings(window_tokens, full_tokens),
        )
        _print_recommendations()
    finally:
        # Always release the HTTP client, even if a request failed
        await client.close()
if __name__ == "__main__":
    # Run the comparison only when executed as a script, not on import.
    asyncio.run(main())