mirror of
https://github.com/zvx-echo6/meshai.git
synced 2026-05-22 07:34:47 +02:00
feat: alert detection and dispatch system for real-time mesh alerts
- Add AlertEngine to detect infra down/recovery, battery critical, critical node offline - Add alert_channel, critical_nodes, alert_cooldown_minutes config options - Wire alerts to channel broadcast and DM subscribers - Add TUI options for critical nodes, alert channel, cooldown Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
0da697855e
commit
c8400233bd
4 changed files with 1601 additions and 1283 deletions
191
meshai/alert_engine.py
Normal file
191
meshai/alert_engine.py
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
"""Alert engine — detects mesh state changes and dispatches alerts."""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .mesh_health import MeshHealthEngine
|
||||
from .mesh_reporter import MeshReporter
|
||||
from .subscriptions import SubscriptionManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AlertEngine:
|
||||
"""Detects mesh state changes and dispatches alerts."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
health_engine: "MeshHealthEngine",
|
||||
reporter: "MeshReporter",
|
||||
subscription_manager: "SubscriptionManager",
|
||||
critical_nodes: list[str] = None,
|
||||
alert_cooldown_minutes: int = 30,
|
||||
):
|
||||
self._health = health_engine
|
||||
self._reporter = reporter
|
||||
self._subs = subscription_manager
|
||||
self._critical_nodes = set(n.upper() for n in (critical_nodes or []))
|
||||
self._cooldown_seconds = alert_cooldown_minutes * 60
|
||||
|
||||
# Previous state snapshot for change detection
|
||||
self._prev_infra_online: dict[int, bool] = {} # node_num -> was_online
|
||||
self._prev_battery: dict[int, float] = {} # node_num -> battery_percent
|
||||
|
||||
# Cooldown tracker: condition_key -> last_alert_time
|
||||
self._cooldowns: dict[str, float] = {}
|
||||
|
||||
# Queued alerts for delivery
|
||||
self._pending_alerts: list[dict] = []
|
||||
|
||||
def check(self) -> list[dict]:
|
||||
"""Compare current health to previous state. Returns list of alert dicts.
|
||||
|
||||
Each alert dict: {
|
||||
"type": "infra_offline" | "infra_recovery" | "battery_critical" | "critical_node_down",
|
||||
"node_name": str,
|
||||
"node_short": str,
|
||||
"node_num": int,
|
||||
"region": str,
|
||||
"message": str,
|
||||
"scope_type": "mesh" | "region" | "node",
|
||||
"scope_value": str,
|
||||
"is_critical": bool,
|
||||
}
|
||||
"""
|
||||
health = self._health.mesh_health
|
||||
if not health:
|
||||
return []
|
||||
|
||||
now = time.time()
|
||||
alerts = []
|
||||
|
||||
for node in health.nodes.values():
|
||||
if not node.is_infrastructure:
|
||||
continue
|
||||
|
||||
node_num = node.node_num
|
||||
name = node.long_name or node.short_name or str(node_num)
|
||||
short = node.short_name or str(node_num)
|
||||
region = node.region or "Unknown"
|
||||
is_critical = short.upper() in self._critical_nodes
|
||||
|
||||
# --- Infrastructure offline detection ---
|
||||
was_online = self._prev_infra_online.get(node_num)
|
||||
is_online = node.is_online
|
||||
|
||||
if was_online is not None: # Skip first run (no previous state)
|
||||
if was_online and not is_online:
|
||||
# Node went OFFLINE
|
||||
alert_type = "critical_node_down" if is_critical else "infra_offline"
|
||||
cooldown_key = f"offline_{node_num}"
|
||||
|
||||
if self._check_cooldown(cooldown_key, now):
|
||||
emoji = "\U0001F6A8" if is_critical else "\u274C" # 🚨 or ❌
|
||||
region_display = self._get_region_display(region)
|
||||
|
||||
alerts.append({
|
||||
"type": alert_type,
|
||||
"node_name": name,
|
||||
"node_short": short,
|
||||
"node_num": node_num,
|
||||
"region": region,
|
||||
"message": f"{emoji} {name} went offline in {region_display}.",
|
||||
"scope_type": "region",
|
||||
"scope_value": region,
|
||||
"is_critical": is_critical,
|
||||
})
|
||||
self._cooldowns[cooldown_key] = now
|
||||
|
||||
elif not was_online and is_online:
|
||||
# Node came BACK ONLINE
|
||||
cooldown_key = f"recovery_{node_num}"
|
||||
|
||||
if self._check_cooldown(cooldown_key, now):
|
||||
region_display = self._get_region_display(region)
|
||||
|
||||
alerts.append({
|
||||
"type": "infra_recovery",
|
||||
"node_name": name,
|
||||
"node_short": short,
|
||||
"node_num": node_num,
|
||||
"region": region,
|
||||
"message": f"\u2705 {name} is back online in {region_display}.", # ✅
|
||||
"scope_type": "region",
|
||||
"scope_value": region,
|
||||
"is_critical": is_critical,
|
||||
})
|
||||
self._cooldowns[cooldown_key] = now
|
||||
|
||||
# --- Battery critical detection (infra only) ---
|
||||
if node.battery_percent is not None and 0 < node.battery_percent <= 100:
|
||||
prev_bat = self._prev_battery.get(node_num)
|
||||
current_bat = node.battery_percent
|
||||
|
||||
if current_bat < 10 and (prev_bat is None or prev_bat >= 10):
|
||||
# Battery just dropped below 10%
|
||||
cooldown_key = f"battery_{node_num}"
|
||||
|
||||
if self._check_cooldown(cooldown_key, now):
|
||||
region_display = self._get_region_display(region)
|
||||
|
||||
alerts.append({
|
||||
"type": "battery_critical",
|
||||
"node_name": name,
|
||||
"node_short": short,
|
||||
"node_num": node_num,
|
||||
"region": region,
|
||||
"message": f"\U0001F50B {name} battery critical at {current_bat:.0f}% in {region_display}.", # 🔋
|
||||
"scope_type": "region",
|
||||
"scope_value": region,
|
||||
"is_critical": is_critical,
|
||||
})
|
||||
self._cooldowns[cooldown_key] = now
|
||||
|
||||
self._prev_battery[node_num] = current_bat
|
||||
|
||||
# Update state snapshot
|
||||
self._prev_infra_online[node_num] = is_online
|
||||
|
||||
self._pending_alerts = alerts
|
||||
return alerts
|
||||
|
||||
def _get_region_display(self, region: str) -> str:
|
||||
"""Get display name for region."""
|
||||
if not self._reporter:
|
||||
return region
|
||||
try:
|
||||
context = self._reporter._region_context(region)
|
||||
if context:
|
||||
return context.split("(")[0].strip()
|
||||
except Exception:
|
||||
pass
|
||||
return region
|
||||
|
||||
def _check_cooldown(self, key: str, now: float) -> bool:
|
||||
"""Check if enough time has passed since last alert for this condition."""
|
||||
last = self._cooldowns.get(key, 0)
|
||||
return (now - last) >= self._cooldown_seconds
|
||||
|
||||
def get_pending_alerts(self) -> list[dict]:
|
||||
"""Get alerts pending delivery."""
|
||||
return self._pending_alerts
|
||||
|
||||
def clear_pending(self):
|
||||
"""Clear pending alerts after delivery."""
|
||||
self._pending_alerts = []
|
||||
|
||||
def get_subscribers_for_alert(self, alert: dict) -> list[dict]:
|
||||
"""Find subscribers matching an alert's scope."""
|
||||
if not self._subs:
|
||||
return []
|
||||
|
||||
# Get all alert subscribers
|
||||
# mesh-scope subscribers get everything
|
||||
# region-scope subscribers get alerts for their region
|
||||
# node-scope subscribers get alerts for their specific node
|
||||
return self._subs.get_alert_subscribers(
|
||||
scope_type=alert.get("scope_type"),
|
||||
scope_value=alert.get("scope_value"),
|
||||
)
|
||||
|
|
@ -1036,6 +1036,12 @@ class Configurator:
|
|||
table.add_row("4", "Offline Threshold (hours)", str(mi.offline_threshold_hours))
|
||||
table.add_row("5", "Packet Threshold (24h)", str(mi.packet_threshold))
|
||||
table.add_row("6", "Battery Warning (%)", str(mi.battery_warning_percent))
|
||||
crit_nodes = getattr(mi, 'critical_nodes', [])
|
||||
alert_ch = getattr(mi, 'alert_channel', -1)
|
||||
alert_cd = getattr(mi, 'alert_cooldown_minutes', 30)
|
||||
table.add_row("7", "Critical Nodes", ", ".join(crit_nodes) if crit_nodes else "[dim]none[/dim]")
|
||||
table.add_row("8", "Alert Channel", f"Channel {alert_ch}" if alert_ch >= 0 else "[dim]disabled[/dim]")
|
||||
table.add_row("9", "Alert Cooldown (min)", str(alert_cd))
|
||||
table.add_row("0", "Back", "")
|
||||
|
||||
console.print(table)
|
||||
|
|
@ -1070,6 +1076,68 @@ class Configurator:
|
|||
if value != mi.battery_warning_percent:
|
||||
mi.battery_warning_percent = value
|
||||
self.modified = True
|
||||
elif choice == 7:
|
||||
self._edit_critical_nodes()
|
||||
elif choice == 8:
|
||||
ch = Prompt.ask("Alert channel (-1 to disable)", default=str(getattr(mi, 'alert_channel', -1)))
|
||||
try:
|
||||
new_ch = int(ch)
|
||||
if new_ch != getattr(mi, 'alert_channel', -1):
|
||||
mi.alert_channel = new_ch
|
||||
self.modified = True
|
||||
except ValueError:
|
||||
pass
|
||||
elif choice == 9:
|
||||
mins = Prompt.ask("Alert cooldown (minutes)", default=str(getattr(mi, 'alert_cooldown_minutes', 30)))
|
||||
try:
|
||||
new_mins = int(mins)
|
||||
if new_mins != getattr(mi, 'alert_cooldown_minutes', 30):
|
||||
mi.alert_cooldown_minutes = new_mins
|
||||
self.modified = True
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
def _edit_critical_nodes(self) -> None:
|
||||
"""Edit critical node list."""
|
||||
mi = self.config.mesh_intelligence
|
||||
|
||||
while True:
|
||||
self._clear()
|
||||
console.print("[bold]Critical Nodes[/bold]")
|
||||
console.print("[dim]These nodes trigger priority alerts when they go offline.[/dim]")
|
||||
|
||||
crit_nodes = getattr(mi, 'critical_nodes', None)
|
||||
if crit_nodes is None:
|
||||
mi.critical_nodes = []
|
||||
crit_nodes = mi.critical_nodes
|
||||
|
||||
if crit_nodes:
|
||||
for i, name in enumerate(crit_nodes, 1):
|
||||
console.print(f" {i}. {name}")
|
||||
else:
|
||||
console.print(" [dim]No critical nodes configured[/dim]")
|
||||
|
||||
console.print("[cyan]A[/cyan] Add [cyan]R[/cyan] Remove [cyan]B[/cyan] Back")
|
||||
choice = Prompt.ask("Choice", default="b").strip().lower()
|
||||
|
||||
if choice == "b" or choice == "":
|
||||
break
|
||||
elif choice == "a":
|
||||
name = Prompt.ask("Node short name (e.g., MHR)")
|
||||
if name and name.strip():
|
||||
mi.critical_nodes.append(name.strip())
|
||||
self.modified = True
|
||||
elif choice == "r":
|
||||
if crit_nodes:
|
||||
idx = Prompt.ask("Number to remove")
|
||||
try:
|
||||
idx = int(idx)
|
||||
if 1 <= idx <= len(crit_nodes):
|
||||
removed = mi.critical_nodes.pop(idx - 1)
|
||||
console.print(f"Removed: {removed}")
|
||||
self.modified = True
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
|
||||
def _edit_regions(self) -> None:
|
||||
"""Edit region anchor points."""
|
||||
|
|
|
|||
|
|
@ -198,6 +198,11 @@ class MeshIntelligenceConfig:
|
|||
packet_threshold: int = 500 # Non-text packets per 24h to flag
|
||||
battery_warning_percent: int = 20 # Battery level for warnings
|
||||
|
||||
# Alert settings
|
||||
critical_nodes: list[str] = field(default_factory=list) # Short names of critical nodes (e.g., ["MHR", "HPR"])
|
||||
alert_channel: int = -1 # Channel to broadcast alerts on. -1 = disabled, 0+ = channel index
|
||||
alert_cooldown_minutes: int = 30 # Min minutes between repeated alerts for same condition
|
||||
|
||||
|
||||
@dataclass
|
||||
class Config:
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ class MeshAI:
|
|||
self.health_engine = None
|
||||
self.mesh_reporter = None
|
||||
self.subscription_manager = None
|
||||
self.alert_engine = None
|
||||
self._last_sub_check: float = 0.0
|
||||
self.router: Optional[MessageRouter] = None
|
||||
self.responder: Optional[Responder] = None
|
||||
|
|
@ -93,6 +94,12 @@ class MeshAI:
|
|||
self.health_engine.compute(self.data_store)
|
||||
self._last_health_compute = time.time()
|
||||
|
||||
# Check for alertable conditions
|
||||
if self.alert_engine:
|
||||
alerts = self.alert_engine.check()
|
||||
if alerts:
|
||||
await self._dispatch_alerts(alerts)
|
||||
|
||||
# Check scheduled subscriptions (every 60 seconds)
|
||||
if self.subscription_manager and self.mesh_reporter:
|
||||
if time.time() - self._last_sub_check >= 60:
|
||||
|
|
@ -264,6 +271,19 @@ class MeshAI:
|
|||
else:
|
||||
self.subscription_manager = None
|
||||
|
||||
# Alert engine (needs health engine, reporter, and subscription manager)
|
||||
if self.health_engine and self.mesh_reporter and self.subscription_manager:
|
||||
from .alert_engine import AlertEngine
|
||||
mi = self.config.mesh_intelligence
|
||||
self.alert_engine = AlertEngine(
|
||||
health_engine=self.health_engine,
|
||||
reporter=self.mesh_reporter,
|
||||
subscription_manager=self.subscription_manager,
|
||||
critical_nodes=getattr(mi, 'critical_nodes', []),
|
||||
alert_cooldown_minutes=getattr(mi, 'alert_cooldown_minutes', 30),
|
||||
)
|
||||
logger.info(f"Alert engine initialized (critical nodes: {getattr(mi, 'critical_nodes', [])})")
|
||||
|
||||
# Knowledge base (optional - gracefully degrade if deps missing)
|
||||
kb_cfg = self.config.knowledge
|
||||
if kb_cfg.enabled and kb_cfg.db_path:
|
||||
|
|
@ -420,6 +440,40 @@ class MeshAI:
|
|||
if pid_file.exists():
|
||||
pid_file.unlink()
|
||||
|
||||
async def _dispatch_alerts(self, alerts: list[dict]) -> None:
|
||||
"""Dispatch alerts to subscribers and alert channel."""
|
||||
mi = self.config.mesh_intelligence
|
||||
alert_channel = getattr(mi, 'alert_channel', -1)
|
||||
|
||||
for alert in alerts:
|
||||
message = alert["message"]
|
||||
logger.info(f"ALERT: {message}")
|
||||
|
||||
# Send to alert channel if configured
|
||||
if alert_channel >= 0 and self.connector:
|
||||
try:
|
||||
self.connector.send_message(
|
||||
text=message,
|
||||
destination=None, # Broadcast
|
||||
channel=alert_channel,
|
||||
)
|
||||
logger.info(f"Alert sent to channel {alert_channel}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send channel alert: {e}")
|
||||
|
||||
# Send DMs to matching subscribers
|
||||
if self.alert_engine and self.subscription_manager:
|
||||
subscribers = self.alert_engine.get_subscribers_for_alert(alert)
|
||||
for sub in subscribers:
|
||||
user_id = sub["user_id"]
|
||||
try:
|
||||
await self._send_sub_dm(user_id, message)
|
||||
logger.info(f"Alert DM sent to {user_id}: {alert['type']}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send alert DM to {user_id}: {e}")
|
||||
|
||||
self.alert_engine.clear_pending()
|
||||
|
||||
async def _check_scheduled_subs(self) -> None:
|
||||
"""Check for and deliver due scheduled reports."""
|
||||
from datetime import datetime
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue