meshai/meshai/persistence/db.py
Matt Johnson (via Claude) c333a97344 feat(v0.6-2): dispatcher state persistence -- cold-start, cooldowns, dedup LRU to SQLite
Closes Rule-20 dispatcher gap from audit doc v0.6-phase1-audit.md finding #1.
Pre-this-commit the cold-start anchor, 4 drop counters, per-toggle cooldown
map, and dedup OrderedDict all lived in Dispatcher instance memory and were
lost on every container restart.

v5.sql adds three tables:
  - dispatcher_state (singleton id=1): cold_start_anchor + 4 drop counters
  - dispatcher_cooldowns ((toggle,category,region) keyed): last_fired_at
  - dispatcher_dedup ((source,event_id) keyed): seen_at

Dispatcher refactor:
  - __init__ calls _restore_from_db -- counters, cold-start anchor, cooldown
    map, and dedup LRU (most-recent 10k by seen_at) all rehydrated from the
    three new tables
  - write-through on every mutation: _persist_state for counter/anchor,
    _persist_cooldown for cooldown UPSERT + 2*cooldown_s prune,
    _persist_dedup for dedup INSERT OR REPLACE + 7-day cleanup
  - in-memory caches stay authoritative on the fast read path
  - cumulative-since-install counters (NOT since-boot); LLM will be able
    to answer "we have dropped 47 stale events this week" after commit #5
    (env_reporter) lands
  - graceful degrade: missing v5 tables / persistence outage falls back to
    fresh in-memory state without crashing the constructor

Tests:
  - tests/test_dispatcher_persistence.py (17 tests): state restore on init,
    counter+cooldown+dedup survival across simulated restart, cooldown rearm
    within 2x window, dedup LRU rebuild caps at 10k, 7-day cleanup on insert,
    INSERT OR REPLACE on duplicate source+event_id, v5 migration idempotent,
    synthetic storm (50 events) -> restart -> replay (5 incl 1 duplicate)
    with the duplicate dedup-rejected and counters NOT resetting
  - tests/conftest.py (new): autouse MESHAI_DB_PATH redirection to a per-test
    tmp file, so the dispatcher_* tables on production /data don't get
    polluted by tests that construct Dispatcher() without an explicit fixture
  - tests/test_notification_toggles.py: _dispatch helper wipes dedup/cooldown/
    state tables between calls (per-call independence preserved; pre-v0.6-2
    in-memory-only Dispatcher reset naturally per instance)

Test count: 680 -> 697 (+17 new, 0 regressions).

Refs audit doc v0.6-phase1-audit.md finding #1.
2026-06-05 16:35:40 +00:00

194 lines
7.2 KiB
Python

"""SQLite persistence connection management + migration runner.
Single-writer SQLite pattern with WAL journal mode for reader concurrency.
One connection per thread (threading.local) -- callers should not share
connections across threads.
Path resolution:
1. MESHAI_DB_PATH env var (explicit override)
2. DEFAULT_DB_PATH = /data/meshai.sqlite (prod container mount)
Special values: ":memory:", any path containing "mode=memory", or any path
starting with "file::memory:" route to a shared-cache in-memory SQLite
database for tests.
Migrations live in meshai/persistence/migrations/v*.sql. The runner
applies them in version order, recording the applied version in
schema_meta. Idempotent re-run is a no-op.
"""
from __future__ import annotations
import logging
import os
import sqlite3
import threading
from pathlib import Path
from typing import Iterable, Optional
# Module-level logger; every message emitted here is prefixed "persistence:".
logger = logging.getLogger(__name__)
# Fallback database location used when the environment override is unset
# or empty.
DEFAULT_DB_PATH = "/data/meshai.sqlite"
# Name of the environment variable consulted for a path override.
MESHAI_DB_PATH_ENV = "MESHAI_DB_PATH"
# NOTE(review): not referenced by the code visible in this module; presumably
# the highest migration version shipped -- confirm against the migrations dir.
SCHEMA_VERSION = 5
# Key/value table in which the migration runner records the applied version
# (row with key='version').
SCHEMA_META_TABLE = "schema_meta"
# Directory scanned for v<N>.sql / v<N>_<label>.sql migration files.
MIGRATIONS_DIR = Path(__file__).parent / "migrations"
# Per-thread connection cache. Each thread that calls get_db() gets its
# own sqlite3.Connection stored on this threading.local (attributes
# ``conn`` and ``path``). Tests can clear it via close_thread_connection()
# between cases.
_local = threading.local()
# Module-level lock guards first-time initialisation in get_db() so
# concurrent first-callers don't race on migration application.
_init_lock = threading.Lock()
# Cache of initialised database paths in this process so initialisation is
# idempotent without re-reading migration files on every call.
# NOTE(review): nothing visible in this module ever removes an entry.
_initialised: set[str] = set()
def MESHAI_DB_PATH() -> str:
    """Return the SQLite path currently in effect.

    The environment variable named by ``MESHAI_DB_PATH_ENV`` wins when it is
    set to a non-empty value; otherwise ``DEFAULT_DB_PATH`` is returned.
    """
    override = os.environ.get(MESHAI_DB_PATH_ENV)
    if override:
        return override
    return DEFAULT_DB_PATH
def _is_memory_path(path: str) -> bool:
    """Tell whether *path* designates an in-memory SQLite database.

    Three spellings are recognised: the literal ``":memory:"``, any string
    containing ``"mode=memory"``, and any string that begins with
    ``"file::memory:"``.
    """
    if path == ":memory:":
        return True
    if "mode=memory" in path:
        return True
    return path.startswith("file::memory:")
def _connect(path: str) -> sqlite3.Connection:
    """Open a SQLite connection configured with this project's defaults.

    In-memory paths (as judged by ``_is_memory_path``) are all routed to the
    same shared-cache URI so that several connections can see one database;
    the concrete *path* string is otherwise ignored for them. File-backed
    paths get their parent directory created on demand.

    Every connection is opened in autocommit mode (``isolation_level=None``),
    uses ``sqlite3.Row`` as row factory, and has foreign keys and a 30 s busy
    timeout enabled. File-backed connections additionally switch to WAL
    journalling.

    Args:
        path: Database location, or one of the in-memory spellings.

    Returns:
        The configured connection.

    Raises:
        sqlite3.Error: if opening or configuring the connection fails. The
            half-configured connection is closed before the error propagates
            so the handle is not leaked.
        OSError: if the parent directory cannot be created.
    """
    in_memory = _is_memory_path(path)
    if in_memory:
        # Shared cache lets multiple connections address the same in-memory
        # database. Tests that want isolation call
        # close_thread_connection() between cases.
        conn = sqlite3.connect(
            "file::memory:?cache=shared",
            uri=True,
            isolation_level=None,
            check_same_thread=False,
        )
    else:
        # Ensure the parent dir exists for file-backed DBs.
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(
            path,
            isolation_level=None,
            check_same_thread=False,
            timeout=30.0,
        )
    try:
        conn.row_factory = sqlite3.Row
        # Foreign keys are off by default in SQLite.
        conn.execute("PRAGMA foreign_keys = ON")
        if not in_memory:
            # WAL gives readers concurrency with the single writer; it does
            # not apply to in-memory databases.
            conn.execute("PRAGMA journal_mode = WAL")
        conn.execute("PRAGMA synchronous = NORMAL")
        conn.execute("PRAGMA busy_timeout = 30000")
    except Exception:
        # Do not leak the handle when a PRAGMA fails (e.g. database locked).
        conn.close()
        raise
    return conn
def get_db(path: Optional[str] = None) -> sqlite3.Connection:
    """Return the current thread's SQLite connection, opening it if needed.

    The first call for a given path initialises the database by running any
    pending migrations. Later calls from the same thread with the same path
    return the cached connection. Asking for a different path closes the
    cached connection and opens a new one.

    Args:
        path: Explicit database path. When omitted or empty, the path is
            resolved through ``MESHAI_DB_PATH()``.

    Returns:
        A migrated connection bound to the calling thread's cache.

    Raises:
        Exception: whatever ``_connect`` or ``_apply_migrations`` raises. In
            that case nothing is cached for the thread, so a later call
            retries from scratch instead of handing out a closed or
            unmigrated connection.
    """
    target = path or MESHAI_DB_PATH()
    cached = getattr(_local, "conn", None)
    if cached is not None and getattr(_local, "path", None) == target:
        return cached

    if cached is not None:
        # Forget the old connection *before* closing it, so a failure while
        # reopening cannot leave a closed handle in the cache.
        for attr in ("conn", "path"):
            if hasattr(_local, attr):
                delattr(_local, attr)
        try:
            cached.close()
        except Exception:
            # Closing is best-effort; record the failure instead of hiding it.
            logger.debug(
                "persistence: closing previous connection failed",
                exc_info=True,
            )

    conn = _connect(target)
    try:
        if _is_memory_path(target):
            # A shared-cache in-memory database disappears once its last
            # connection closes, so the per-path cache cannot be trusted.
            # Re-running the (idempotent) migration runner is a no-op when
            # the schema is still present.
            with _init_lock:
                _apply_migrations(conn)
                _initialised.add(target)
        elif target not in _initialised:
            with _init_lock:
                # Re-check under the lock: another thread may have finished
                # initialising this path while we waited.
                if target not in _initialised:
                    _apply_migrations(conn)
                    _initialised.add(target)
    except Exception:
        conn.close()
        raise

    # Cache only a connection whose database is known to be migrated.
    _local.conn = conn
    _local.path = target
    return conn
def close_thread_connection() -> None:
    """Close and forget the calling thread's cached connection.

    Intended for tests that need a clean slate between cases. Errors raised
    while closing are ignored. Per the original author's note, the
    shared-cache in-memory database is reset on the LAST close in the
    process.
    """
    cached = getattr(_local, "conn", None)
    if cached is not None:
        try:
            cached.close()
        except Exception:
            pass
    # Remove both cache slots so the next get_db() call reopens.
    for attr in ("conn", "path"):
        if hasattr(_local, attr):
            delattr(_local, attr)
def init_db(path: Optional[str] = None) -> sqlite3.Connection:
    """Initialise the database explicitly and return its connection.

    Delegates straight to ``get_db``; it exists so start-up code can state
    its intent. Calling it repeatedly is safe.
    """
    connection = get_db(path)
    return connection
def _read_migration_files() -> list[tuple[int, str, str]]:
    """Load the migration scripts found in ``MIGRATIONS_DIR``.

    Only regular files with a ``.sql`` suffix (case-insensitive) whose stem
    is ``v<N>`` or ``v<N>_<label>`` with an integer ``N`` are considered;
    everything else is skipped silently.

    Returns:
        ``[(version, filename, sql_text), ...]`` ordered by ascending numeric
        version (ties broken by filename). An empty list is returned when the
        directory does not exist.
    """
    if not MIGRATIONS_DIR.is_dir():
        return []
    found: list[tuple[int, str, str]] = []
    for entry in MIGRATIONS_DIR.iterdir():
        if not entry.is_file() or entry.suffix.lower() != ".sql":
            continue
        # Filename format: v<N>.sql or v<N>_<label>.sql
        stem = entry.stem
        if not stem.startswith("v"):
            continue
        version_text = stem[1:].split("_", 1)[0]
        try:
            version = int(version_text)
        except ValueError:
            continue
        found.append((version, entry.name, entry.read_text(encoding="utf-8")))
    # Sort numerically: a lexicographic path sort would put v10 before v2.
    found.sort(key=lambda item: (item[0], item[1]))
    return found
def _current_version(conn: sqlite3.Connection) -> int:
    """Return the migration version recorded in the schema-meta table.

    Args:
        conn: Connection to inspect. Any row factory is accepted because the
            value is read by position rather than by column name.

    Returns:
        The integer stored under ``key='version'``, or 0 when the meta table
        does not exist, has no version row, or holds a value that is not an
        integer.
    """
    # Does the schema_meta table exist?
    table_row = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (SCHEMA_META_TABLE,),
    ).fetchone()
    if table_row is None:
        return 0
    version_row = conn.execute(
        f"SELECT value FROM {SCHEMA_META_TABLE} WHERE key='version'",
    ).fetchone()
    if version_row is None:
        return 0
    try:
        # Index by position: a name lookup on a plain-tuple row would raise
        # TypeError and be misreported as "version 0".
        return int(version_row[0])
    except (TypeError, ValueError):
        return 0
def _apply_migrations(conn: sqlite3.Connection) -> None:
    """Apply every pending migration to *conn* in ascending version order.

    A migration is pending when its version is greater than the one reported
    by ``_current_version``. After each script runs, its version is written
    to the schema-meta table, so a re-run with nothing pending is a no-op.

    Args:
        conn: Connection to migrate.

    Raises:
        Exception: re-raised from the failing script or version update after
            being logged. Migrations that completed earlier stay applied.
    """
    migrations = _read_migration_files()
    if not migrations:
        logger.warning("persistence: no migration files found in %s", MIGRATIONS_DIR)
        return
    current = _current_version(conn)
    # Order explicitly by version so the recorded version only ever grows,
    # whatever order the files were discovered in.
    pending = sorted(
        (entry for entry in migrations if entry[0] > current),
        key=lambda entry: (entry[0], entry[1]),
    )
    if not pending:
        logger.debug("persistence: schema up to date at v%d", current)
        return
    for version, filename, sql in pending:
        logger.info("persistence: applying migration %s (v%d)", filename, version)
        # executescript() commits any transaction that is already pending
        # and then performs no further implicit transaction control, so
        # wrapping the call in an outer BEGIN/COMMIT does not work. A script
        # is atomic only if the migration file carries its own BEGIN/COMMIT.
        try:
            conn.executescript(sql)
            conn.execute(
                f"INSERT OR REPLACE INTO {SCHEMA_META_TABLE}(key, value) VALUES('version', ?)",
                (str(version),),
            )
        except Exception:
            logger.exception("persistence: migration %s failed", filename)
            # A script that opened its own transaction and failed part-way
            # leaves it dangling; discard it so the connection is usable.
            if conn.in_transaction:
                try:
                    conn.rollback()
                except sqlite3.Error:
                    logger.exception(
                        "persistence: rollback after failed migration %s also failed",
                        filename,
                    )
            raise
    logger.info("persistence: schema now at v%d", pending[-1][0])