mirror of
https://github.com/zvx-echo6/meshai.git
synced 2026-05-21 23:24:44 +02:00
fix(health): use real channel utilization from node telemetry
- Utilization pillar now reads firmware-reported channel_utilization instead of estimating from packet counts with hardcoded 200ms/pkt
- Uses highest infra node value (busiest node = bottleneck)
- Falls back to packet count estimate only when telemetry unavailable
- Updated thresholds: 20/25/35/45% matching real Meshtastic behavior
- Per-region utilization from region nodes, not mesh-wide
- API response includes util_method, util_max_percent, util_node_count

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
7de02fb924
commit
57a19aeec6
3 changed files with 1267 additions and 1164 deletions
|
|
@ -746,27 +746,32 @@ export default function Reference() {
|
|||
|
||||
<SubHeader>Utilization (25%)</SubHeader>
|
||||
<p>
|
||||
Estimates how much of the radio channel's airtime is being used. MeshAI can't measure airtime directly, so it estimates based on packet counts over the last 24 hours.
|
||||
</p>
|
||||
<p className="p-3 bg-slate-800 rounded font-mono text-sm">
|
||||
packets_per_hour = non_text_packets ÷ 24<br/>
|
||||
airtime_estimate = (packets_per_hour × 200ms) ÷ 3,600,000ms × 100%
|
||||
MeshAI reads the channel utilization that each router reports in its telemetry — this is the firmware's own measurement of how busy the radio channel is. MeshAI uses the <strong>highest</strong> value from any infrastructure node because the busiest router is the bottleneck for the whole mesh.
|
||||
</p>
|
||||
<p>
|
||||
The 200ms is an approximation for the MediumFast radio preset — each LoRa packet takes roughly 200ms of airtime. Text messages don't count toward utilization (chatting is the point of a mesh).
|
||||
<strong>How it works:</strong>
|
||||
</p>
|
||||
<ol className="list-decimal list-inside space-y-1 ml-4">
|
||||
<li>Collect <Mono>channel_utilization</Mono> from all infrastructure nodes that report it</li>
|
||||
<li>If no infra nodes have telemetry, try all nodes</li>
|
||||
<li>Use the <strong>maximum</strong> value for scoring (busiest node = bottleneck)</li>
|
||||
<li>If no nodes report utilization (older firmware), fall back to packet count estimate</li>
|
||||
</ol>
|
||||
<p className="mt-4">
|
||||
<strong>Fallback method</strong> (when telemetry unavailable): estimates from packet counts using 200ms/packet airtime. This is less accurate — it assumes MediumFast preset and sums packets across all nodes.
|
||||
</p>
|
||||
<RefTable
|
||||
headers={['Estimated Airtime', 'Score', 'What It Means']}
|
||||
headers={['Channel Utilization', 'Score', 'What It Means']}
|
||||
rows={[
|
||||
['Under 20%', '100', 'Channel is clear — this is the goal'],
|
||||
['20-25%', '75-100', 'Slight degradation, occasional collisions'],
|
||||
['25-35%', '50-75', 'Severe degradation — firmware throttling active'],
|
||||
['35-45%', '25-50', 'Mesh struggling badly — reliability dropping'],
|
||||
['Over 45%', '0-25', 'Mesh is effectively dead'],
|
||||
['Over 45%', '0-25', 'Mesh is effectively unusable'],
|
||||
]}
|
||||
/>
|
||||
<p>
|
||||
<strong>Special case:</strong> If MeshAI doesn't have packet data (no sources reporting packet counts), this pillar scores 100. You're not penalized for missing data.
|
||||
<strong>Special case:</strong> If no utilization data is available (no telemetry and no packet data), this pillar scores 100. You're not penalized for missing data.
|
||||
</p>
|
||||
|
||||
<SubHeader>Coverage (20%)</SubHeader>
|
||||
|
|
|
|||
|
|
@ -20,6 +20,9 @@ def _serialize_health_score(score) -> dict:
|
|||
"infra_online": score.infra_online,
|
||||
"infra_total": score.infra_total,
|
||||
"util_percent": round(score.util_percent, 1),
|
||||
"util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
|
||||
"util_method": getattr(score, 'util_method', 'unknown'),
|
||||
"util_node_count": getattr(score, 'util_node_count', 0),
|
||||
"flagged_nodes": score.flagged_nodes,
|
||||
"battery_warnings": score.battery_warnings,
|
||||
"solar_index": round(score.solar_index, 1),
|
||||
|
|
@ -76,6 +79,9 @@ async def get_health(request: Request):
|
|||
"infra_online": score.infra_online,
|
||||
"infra_total": score.infra_total,
|
||||
"util_percent": round(score.util_percent, 1),
|
||||
"util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
|
||||
"util_method": getattr(score, 'util_method', 'unknown'),
|
||||
"util_node_count": getattr(score, 'util_node_count', 0),
|
||||
"flagged_nodes": score.flagged_nodes,
|
||||
"battery_warnings": score.battery_warnings,
|
||||
"total_nodes": health.total_nodes,
|
||||
|
|
|
|||
|
|
@ -30,11 +30,12 @@ DEFAULT_OFFLINE_THRESHOLD_HOURS = 24
|
|||
DEFAULT_PACKET_THRESHOLD = 500 # Non-text packets per 24h
|
||||
DEFAULT_BATTERY_WARNING_PERCENT = 20
|
||||
|
||||
# Utilization thresholds (percentage)
|
||||
UTIL_HEALTHY = 20
|
||||
UTIL_CAUTION = 25
|
||||
UTIL_WARNING = 35
|
||||
UTIL_UNHEALTHY = 45
|
||||
# Utilization thresholds (percentage) - based on real Meshtastic behavior
|
||||
# Firmware starts throttling GPS at 25% (severe degradation); mesh struggles badly above 35%
|
||||
UTIL_HEALTHY = 20 # Under 20% = channel is clear
|
||||
UTIL_CAUTION = 25 # 20-25% = slight degradation, occasional collisions
|
||||
UTIL_WARNING = 35 # 25-35% = severe degradation, firmware throttling
|
||||
UTIL_UNHEALTHY = 45 # 35-45% = mesh struggling badly, reliability dropping
|
||||
|
||||
# Pillar weights (5-pillar system)
|
||||
WEIGHT_INFRASTRUCTURE = 0.30
|
||||
|
|
@ -58,6 +59,9 @@ class HealthScore:
|
|||
infra_online: int = 0
|
||||
infra_total: int = 0
|
||||
util_percent: float = 0.0
|
||||
util_max_percent: float = 0.0 # Highest node utilization (hotspot indicator)
|
||||
util_method: str = "none" # "telemetry", "packet_estimate", or "none"
|
||||
util_node_count: int = 0 # Nodes reporting utilization
|
||||
coverage_avg_gateways: float = 0.0
|
||||
coverage_single_gw_count: int = 0
|
||||
coverage_full_count: int = 0
|
||||
|
|
@ -486,10 +490,19 @@ class MeshHealthEngine:
|
|||
data_sources.append(f"{len(all_channels)} ch")
|
||||
data_str = ", ".join(data_sources) if data_sources else "nodes only"
|
||||
|
||||
# Log utilization method used
|
||||
util_method = mesh_score.util_method
|
||||
if util_method == "telemetry":
|
||||
util_info = f"util={mesh_score.util_percent:.1f}% (max={mesh_score.util_max_percent:.1f}%, {mesh_score.util_node_count} nodes reporting)"
|
||||
elif util_method == "packet_estimate":
|
||||
util_info = f"util={mesh_score.util_percent:.1f}% (packet estimate fallback)"
|
||||
else:
|
||||
util_info = "util=N/A (no data)"
|
||||
|
||||
logger.info(
|
||||
f"Mesh health computed: {mesh_health.total_nodes} nodes, "
|
||||
f"{mesh_health.total_regions} regions, score {mesh_score.composite:.0f}/100 "
|
||||
f"[{data_str}]"
|
||||
f"[{data_str}] [{util_info}]"
|
||||
)
|
||||
|
||||
return mesh_health
|
||||
|
|
@ -541,6 +554,31 @@ class MeshHealthEngine:
|
|||
all_nodes = list(nodes.values())
|
||||
return self._compute_node_group_score(all_nodes, has_packet_data)
|
||||
|
||||
def _compute_utilization_score(self, util_percent: float) -> float:
    """Map a channel utilization percentage onto a 0-100 health score.

    The score is piecewise linear between the module-level ``UTIL_*``
    thresholds (percentages in parentheses are the values this file
    assigns to them):

    - below ``UTIL_HEALTHY`` (20%): flat 100
    - ``UTIL_HEALTHY`` to ``UTIL_CAUTION`` (20-25%): 100 down to 75
    - ``UTIL_CAUTION`` to ``UTIL_WARNING`` (25-35%): 75 down to 50
    - ``UTIL_WARNING`` to ``UTIL_UNHEALTHY`` (35-45%): 50 down to 25
    - at or above ``UTIL_UNHEALTHY`` (45%+): 25 down to 0 across the next
      10 percentage points, clamped at 0 beyond that

    Args:
        util_percent: Channel utilization as a percentage.

    Returns:
        The utilization health score, from 0.0 (worst) to 100.0 (best).
    """
    if util_percent < UTIL_HEALTHY:
        return 100.0

    # (band start, band end, score at band start); every band sheds 25 points.
    bands = (
        (UTIL_HEALTHY, UTIL_CAUTION, 100.0),
        (UTIL_CAUTION, UTIL_WARNING, 75.0),
        (UTIL_WARNING, UTIL_UNHEALTHY, 50.0),
    )
    for band_start, band_end, top_score in bands:
        if util_percent < band_end:
            fraction = (util_percent - band_start) / (band_end - band_start)
            return top_score - fraction * 25

    # Past the last threshold there is no upper bound to interpolate towards,
    # so the final 25 points are spread over a fixed 10-point window.
    overshoot = (util_percent - UTIL_UNHEALTHY) / 10
    return max(0.0, 25.0 - overshoot * 25)
|
||||
|
||||
def _compute_node_group_score(
|
||||
self,
|
||||
node_list: list[UnifiedNode],
|
||||
|
|
@ -568,33 +606,84 @@ class MeshHealthEngine:
|
|||
else:
|
||||
infra_score = 100.0 # No infrastructure = not penalized
|
||||
|
||||
# Channel utilization (based on packet counts if available)
|
||||
# BUG 7 FIX: Use actual Meshtastic airtime calculation
|
||||
if has_packet_data:
|
||||
# Channel utilization - prefer real telemetry over packet estimate
|
||||
#
|
||||
# Priority 1: Use firmware-reported channel_utilization from nodes
|
||||
# This is the most accurate measure - the firmware calculates this
|
||||
# from actual radio activity over the last minute.
|
||||
#
|
||||
# Priority 2: Fall back to packet count estimate if no telemetry
|
||||
# This is a rough approximation using 200ms/packet (MediumFast preset).
|
||||
# It's less accurate because different presets have different airtime,
|
||||
# and it sums packets across all nodes regardless of channel.
|
||||
|
||||
util_percent = 0.0
|
||||
util_max_percent = 0.0
|
||||
util_score = 100.0
|
||||
util_method = "none"
|
||||
util_node_count = 0
|
||||
util_data_available = False
|
||||
|
||||
# Try to get real channel_utilization from infrastructure nodes
|
||||
# Use infrastructure nodes because they're the routers - they see the most traffic
|
||||
util_readings = []
|
||||
for n in infra_nodes:
|
||||
if n.channel_utilization is not None and n.channel_utilization >= 0:
|
||||
util_readings.append(n.channel_utilization)
|
||||
|
||||
# If no infra nodes have it, try all nodes
|
||||
if not util_readings:
|
||||
for n in node_list:
|
||||
if n.channel_utilization is not None and n.channel_utilization >= 0:
|
||||
util_readings.append(n.channel_utilization)
|
||||
|
||||
if util_readings:
|
||||
# Use the HIGHEST value - the busiest node is the bottleneck
|
||||
# If one router is at 45% utilization, the mesh has a problem
|
||||
# even if other nodes are at 10%
|
||||
util_max_percent = max(util_readings)
|
||||
util_percent = util_max_percent # Use max for scoring
|
||||
util_score = self._compute_utilization_score(util_percent)
|
||||
util_method = "telemetry"
|
||||
util_node_count = len(util_readings)
|
||||
util_data_available = True
|
||||
|
||||
# NOTE: no average is computed here - util_percent and util_max_percent
|
||||
# both hold the maximum reading, because the busiest node is the
|
||||
# bottleneck and therefore drives the score.
|
||||
|
||||
elif has_packet_data:
|
||||
# Fallback: Estimate from packet counts
|
||||
# This is a rough approximation - only use when telemetry unavailable
|
||||
#
|
||||
# WARNING: This method has known issues:
|
||||
# - Assumes 200ms airtime per packet (only correct for MediumFast)
|
||||
# - Sums packets across all nodes even on different channels
|
||||
# - Can't distinguish retries from new packets
|
||||
# Use real channel_utilization from telemetry when available.
|
||||
|
||||
total_non_text_packets = sum((n.packets_sent_24h - n.text_messages_24h) for n in node_list)
|
||||
# Average airtime per packet on MediumFast: ~200ms
|
||||
# Total available airtime per hour: 3,600,000ms
|
||||
# Utilization = (packets_per_hour * airtime_ms) / total_airtime_ms * 100
|
||||
packets_per_hour = total_non_text_packets / 24.0 # 24h window
|
||||
airtime_per_packet_ms = 200 # ~200ms on MediumFast preset
|
||||
util_percent = (packets_per_hour * airtime_per_packet_ms) / 3_600_000 * 100
|
||||
util_max_percent = util_percent # No per-node data available
|
||||
util_score = self._compute_utilization_score(util_percent)
|
||||
util_method = "packet_estimate"
|
||||
util_node_count = 0
|
||||
util_data_available = True
|
||||
|
||||
# Apply scoring thresholds with interpolation
|
||||
if util_percent < UTIL_HEALTHY: # <15%
|
||||
util_score = 100.0
|
||||
elif util_percent < UTIL_CAUTION: # 15-20%
|
||||
util_score = 100.0 - ((util_percent - UTIL_HEALTHY) / (UTIL_CAUTION - UTIL_HEALTHY)) * 25
|
||||
elif util_percent < UTIL_WARNING: # 20-25%
|
||||
util_score = 75.0 - ((util_percent - UTIL_CAUTION) / (UTIL_WARNING - UTIL_CAUTION)) * 25
|
||||
elif util_percent < UTIL_UNHEALTHY: # 25-35%
|
||||
util_score = 50.0 - ((util_percent - UTIL_WARNING) / (UTIL_UNHEALTHY - UTIL_WARNING)) * 25
|
||||
else: # 35%+
|
||||
util_score = max(0.0, 25.0 - ((util_percent - UTIL_UNHEALTHY) / 10) * 25)
|
||||
logger.debug(
|
||||
f"Utilization using packet estimate fallback: {util_percent:.1f}% "
|
||||
f"({total_non_text_packets} non-text packets/24h)"
|
||||
)
|
||||
else:
|
||||
# No packet data available - assume healthy utilization
|
||||
# This prevents penalizing the score when we simply don't have data
|
||||
# No utilization data available - don't penalize
|
||||
util_percent = 0.0
|
||||
util_max_percent = 0.0
|
||||
util_score = 100.0
|
||||
util_method = "none"
|
||||
util_node_count = 0
|
||||
util_data_available = False
|
||||
|
||||
# Node behavior (flagged nodes)
|
||||
flagged = [n for n in node_list if (n.packets_sent_24h - n.text_messages_24h) > self.packet_threshold]
|
||||
|
|
@ -674,13 +763,16 @@ class MeshHealthEngine:
|
|||
infra_online=infra_online,
|
||||
infra_total=infra_total,
|
||||
util_percent=util_percent,
|
||||
util_max_percent=util_max_percent,
|
||||
util_method=util_method,
|
||||
util_node_count=util_node_count,
|
||||
coverage_avg_gateways=coverage_avg_gw,
|
||||
coverage_single_gw_count=coverage_single,
|
||||
coverage_full_count=coverage_full,
|
||||
flagged_nodes=flagged_count,
|
||||
battery_warnings=battery_warnings,
|
||||
solar_index=solar_index,
|
||||
util_data_available=has_packet_data,
|
||||
util_data_available=util_data_available,
|
||||
coverage_data_available=coverage_available,
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue