feat: alert detection and dispatch system for real-time mesh alerts

- Add AlertEngine to detect infra down/recovery, battery critical, critical node offline
- Add alert_channel, critical_nodes, alert_cooldown_minutes config options
- Wire alerts to channel broadcast and DM subscribers
- Add TUI options for critical nodes, alert channel, cooldown

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
K7ZVX 2026-05-06 04:48:41 +00:00
commit c8400233bd
4 changed files with 1601 additions and 1283 deletions

191
meshai/alert_engine.py Normal file
View file

@ -0,0 +1,191 @@
"""Alert engine — detects mesh state changes and dispatches alerts."""
import logging
import time
from typing import Optional, TYPE_CHECKING
if TYPE_CHECKING:
from .mesh_health import MeshHealthEngine
from .mesh_reporter import MeshReporter
from .subscriptions import SubscriptionManager
logger = logging.getLogger(__name__)
class AlertEngine:
"""Detects mesh state changes and dispatches alerts."""
def __init__(
self,
health_engine: "MeshHealthEngine",
reporter: "MeshReporter",
subscription_manager: "SubscriptionManager",
critical_nodes: list[str] = None,
alert_cooldown_minutes: int = 30,
):
self._health = health_engine
self._reporter = reporter
self._subs = subscription_manager
self._critical_nodes = set(n.upper() for n in (critical_nodes or []))
self._cooldown_seconds = alert_cooldown_minutes * 60
# Previous state snapshot for change detection
self._prev_infra_online: dict[int, bool] = {} # node_num -> was_online
self._prev_battery: dict[int, float] = {} # node_num -> battery_percent
# Cooldown tracker: condition_key -> last_alert_time
self._cooldowns: dict[str, float] = {}
# Queued alerts for delivery
self._pending_alerts: list[dict] = []
def check(self) -> list[dict]:
"""Compare current health to previous state. Returns list of alert dicts.
Each alert dict: {
"type": "infra_offline" | "infra_recovery" | "battery_critical" | "critical_node_down",
"node_name": str,
"node_short": str,
"node_num": int,
"region": str,
"message": str,
"scope_type": "mesh" | "region" | "node",
"scope_value": str,
"is_critical": bool,
}
"""
health = self._health.mesh_health
if not health:
return []
now = time.time()
alerts = []
for node in health.nodes.values():
if not node.is_infrastructure:
continue
node_num = node.node_num
name = node.long_name or node.short_name or str(node_num)
short = node.short_name or str(node_num)
region = node.region or "Unknown"
is_critical = short.upper() in self._critical_nodes
# --- Infrastructure offline detection ---
was_online = self._prev_infra_online.get(node_num)
is_online = node.is_online
if was_online is not None: # Skip first run (no previous state)
if was_online and not is_online:
# Node went OFFLINE
alert_type = "critical_node_down" if is_critical else "infra_offline"
cooldown_key = f"offline_{node_num}"
if self._check_cooldown(cooldown_key, now):
emoji = "\U0001F6A8" if is_critical else "\u274C" # 🚨 or ❌
region_display = self._get_region_display(region)
alerts.append({
"type": alert_type,
"node_name": name,
"node_short": short,
"node_num": node_num,
"region": region,
"message": f"{emoji} {name} went offline in {region_display}.",
"scope_type": "region",
"scope_value": region,
"is_critical": is_critical,
})
self._cooldowns[cooldown_key] = now
elif not was_online and is_online:
# Node came BACK ONLINE
cooldown_key = f"recovery_{node_num}"
if self._check_cooldown(cooldown_key, now):
region_display = self._get_region_display(region)
alerts.append({
"type": "infra_recovery",
"node_name": name,
"node_short": short,
"node_num": node_num,
"region": region,
"message": f"\u2705 {name} is back online in {region_display}.", # ✅
"scope_type": "region",
"scope_value": region,
"is_critical": is_critical,
})
self._cooldowns[cooldown_key] = now
# --- Battery critical detection (infra only) ---
if node.battery_percent is not None and 0 < node.battery_percent <= 100:
prev_bat = self._prev_battery.get(node_num)
current_bat = node.battery_percent
if current_bat < 10 and (prev_bat is None or prev_bat >= 10):
# Battery just dropped below 10%
cooldown_key = f"battery_{node_num}"
if self._check_cooldown(cooldown_key, now):
region_display = self._get_region_display(region)
alerts.append({
"type": "battery_critical",
"node_name": name,
"node_short": short,
"node_num": node_num,
"region": region,
"message": f"\U0001F50B {name} battery critical at {current_bat:.0f}% in {region_display}.", # 🔋
"scope_type": "region",
"scope_value": region,
"is_critical": is_critical,
})
self._cooldowns[cooldown_key] = now
self._prev_battery[node_num] = current_bat
# Update state snapshot
self._prev_infra_online[node_num] = is_online
self._pending_alerts = alerts
return alerts
def _get_region_display(self, region: str) -> str:
"""Get display name for region."""
if not self._reporter:
return region
try:
context = self._reporter._region_context(region)
if context:
return context.split("(")[0].strip()
except Exception:
pass
return region
def _check_cooldown(self, key: str, now: float) -> bool:
"""Check if enough time has passed since last alert for this condition."""
last = self._cooldowns.get(key, 0)
return (now - last) >= self._cooldown_seconds
def get_pending_alerts(self) -> list[dict]:
"""Get alerts pending delivery."""
return self._pending_alerts
def clear_pending(self):
"""Clear pending alerts after delivery."""
self._pending_alerts = []
def get_subscribers_for_alert(self, alert: dict) -> list[dict]:
"""Find subscribers matching an alert's scope."""
if not self._subs:
return []
# Get all alert subscribers
# mesh-scope subscribers get everything
# region-scope subscribers get alerts for their region
# node-scope subscribers get alerts for their specific node
return self._subs.get_alert_subscribers(
scope_type=alert.get("scope_type"),
scope_value=alert.get("scope_value"),
)

File diff suppressed because it is too large Load diff

View file

@ -198,6 +198,11 @@ class MeshIntelligenceConfig:
packet_threshold: int = 500 # Non-text packets per 24h to flag packet_threshold: int = 500 # Non-text packets per 24h to flag
battery_warning_percent: int = 20 # Battery level for warnings battery_warning_percent: int = 20 # Battery level for warnings
# Alert settings
critical_nodes: list[str] = field(default_factory=list) # Short names of critical nodes (e.g., ["MHR", "HPR"])
alert_channel: int = -1 # Channel to broadcast alerts on. -1 = disabled, 0+ = channel index
alert_cooldown_minutes: int = 30 # Min minutes between repeated alerts for same condition
@dataclass @dataclass
class Config: class Config:

View file

@ -43,6 +43,7 @@ class MeshAI:
self.health_engine = None self.health_engine = None
self.mesh_reporter = None self.mesh_reporter = None
self.subscription_manager = None self.subscription_manager = None
self.alert_engine = None
self._last_sub_check: float = 0.0 self._last_sub_check: float = 0.0
self.router: Optional[MessageRouter] = None self.router: Optional[MessageRouter] = None
self.responder: Optional[Responder] = None self.responder: Optional[Responder] = None
@ -93,6 +94,12 @@ class MeshAI:
self.health_engine.compute(self.data_store) self.health_engine.compute(self.data_store)
self._last_health_compute = time.time() self._last_health_compute = time.time()
# Check for alertable conditions
if self.alert_engine:
alerts = self.alert_engine.check()
if alerts:
await self._dispatch_alerts(alerts)
# Check scheduled subscriptions (every 60 seconds) # Check scheduled subscriptions (every 60 seconds)
if self.subscription_manager and self.mesh_reporter: if self.subscription_manager and self.mesh_reporter:
if time.time() - self._last_sub_check >= 60: if time.time() - self._last_sub_check >= 60:
@ -264,6 +271,19 @@ class MeshAI:
else: else:
self.subscription_manager = None self.subscription_manager = None
# Alert engine (needs health engine, reporter, and subscription manager)
if self.health_engine and self.mesh_reporter and self.subscription_manager:
from .alert_engine import AlertEngine
mi = self.config.mesh_intelligence
self.alert_engine = AlertEngine(
health_engine=self.health_engine,
reporter=self.mesh_reporter,
subscription_manager=self.subscription_manager,
critical_nodes=getattr(mi, 'critical_nodes', []),
alert_cooldown_minutes=getattr(mi, 'alert_cooldown_minutes', 30),
)
logger.info(f"Alert engine initialized (critical nodes: {getattr(mi, 'critical_nodes', [])})")
# Knowledge base (optional - gracefully degrade if deps missing) # Knowledge base (optional - gracefully degrade if deps missing)
kb_cfg = self.config.knowledge kb_cfg = self.config.knowledge
if kb_cfg.enabled and kb_cfg.db_path: if kb_cfg.enabled and kb_cfg.db_path:
@ -420,6 +440,40 @@ class MeshAI:
if pid_file.exists(): if pid_file.exists():
pid_file.unlink() pid_file.unlink()
async def _dispatch_alerts(self, alerts: list[dict]) -> None:
"""Dispatch alerts to subscribers and alert channel."""
mi = self.config.mesh_intelligence
alert_channel = getattr(mi, 'alert_channel', -1)
for alert in alerts:
message = alert["message"]
logger.info(f"ALERT: {message}")
# Send to alert channel if configured
if alert_channel >= 0 and self.connector:
try:
self.connector.send_message(
text=message,
destination=None, # Broadcast
channel=alert_channel,
)
logger.info(f"Alert sent to channel {alert_channel}")
except Exception as e:
logger.error(f"Failed to send channel alert: {e}")
# Send DMs to matching subscribers
if self.alert_engine and self.subscription_manager:
subscribers = self.alert_engine.get_subscribers_for_alert(alert)
for sub in subscribers:
user_id = sub["user_id"]
try:
await self._send_sub_dm(user_id, message)
logger.info(f"Alert DM sent to {user_id}: {alert['type']}")
except Exception as e:
logger.error(f"Failed to send alert DM to {user_id}: {e}")
self.alert_engine.clear_pending()
async def _check_scheduled_subs(self) -> None: async def _check_scheduled_subs(self) -> None:
"""Check for and deliver due scheduled reports.""" """Check for and deliver due scheduled reports."""
from datetime import datetime from datetime import datetime