fix(health): use real channel utilization from node telemetry

- Utilization pillar now reads firmware-reported channel_utilization
  instead of estimating from packet counts with hardcoded 200ms/pkt
- Uses highest infra node value (busiest node = bottleneck)
- Falls back to packet count estimate only when telemetry unavailable
- Updated thresholds: 20/25/35/45% matching real Meshtastic behavior
- Per-region utilization from region nodes, not mesh-wide
- API response includes util_method, util_max_percent, util_node_count

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
K7ZVX 2026-05-13 22:49:41 +00:00
commit 57a19aeec6
3 changed files with 1267 additions and 1164 deletions

View file

@ -746,27 +746,32 @@ export default function Reference() {
<SubHeader>Utilization (25%)</SubHeader>
<p>
Estimates how much of the radio channel's airtime is being used. MeshAI can't measure airtime directly, so it estimates based on packet counts over the last 24 hours.
</p>
<p className="p-3 bg-slate-800 rounded font-mono text-sm">
packets_per_hour = non_text_packets ÷ 24<br/>
airtime_estimate = (packets_per_hour × 200ms) ÷ 3,600,000ms × 100%
MeshAI reads the channel utilization that each router reports in its telemetry — this is the firmware's own measurement of how busy the radio channel is. MeshAI uses the <strong>highest</strong> value from any infrastructure node because the busiest router is the bottleneck for the whole mesh.
</p>
<p>
The 200ms is an approximation for the MediumFast radio preset — each LoRa packet takes roughly 200ms of airtime. Text messages don't count toward utilization (chatting is the point of a mesh).
<strong>How it works:</strong>
</p>
<ol className="list-decimal list-inside space-y-1 ml-4">
<li>Collect <Mono>channel_utilization</Mono> from all infrastructure nodes that report it</li>
<li>If no infra nodes have telemetry, try all nodes</li>
<li>Use the <strong>maximum</strong> value for scoring (busiest node = bottleneck)</li>
<li>If no nodes report utilization (older firmware), fall back to packet count estimate</li>
</ol>
<p className="mt-4">
<strong>Fallback method</strong> (when telemetry unavailable): estimates from packet counts using 200ms/packet airtime. This is less accurate — it assumes the MediumFast preset and sums packets across all nodes.
</p>
<RefTable
headers={['Estimated Airtime', 'Score', 'What It Means']}
headers={['Channel Utilization', 'Score', 'What It Means']}
rows={[
['Under 20%', '100', 'Channel is clear — this is the goal'],
['20-25%', '75-100', 'Slight degradation, occasional collisions'],
['25-35%', '50-75', 'Severe degradation — firmware throttling active'],
['35-45%', '25-50', 'Mesh struggling badly — reliability dropping'],
['Over 45%', '0-25', 'Mesh is effectively dead'],
['Over 45%', '0-25', 'Mesh is effectively unusable'],
]}
/>
<p>
<strong>Special case:</strong> If MeshAI doesn't have packet data (no sources reporting packet counts), this pillar scores 100. You're not penalized for missing data.
<strong>Special case:</strong> If no utilization data is available (no telemetry and no packet data), this pillar scores 100. You're not penalized for missing data.
</p>
<SubHeader>Coverage (20%)</SubHeader>

View file

@ -20,6 +20,9 @@ def _serialize_health_score(score) -> dict:
"infra_online": score.infra_online,
"infra_total": score.infra_total,
"util_percent": round(score.util_percent, 1),
"util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
"util_method": getattr(score, 'util_method', 'unknown'),
"util_node_count": getattr(score, 'util_node_count', 0),
"flagged_nodes": score.flagged_nodes,
"battery_warnings": score.battery_warnings,
"solar_index": round(score.solar_index, 1),
@ -76,6 +79,9 @@ async def get_health(request: Request):
"infra_online": score.infra_online,
"infra_total": score.infra_total,
"util_percent": round(score.util_percent, 1),
"util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
"util_method": getattr(score, 'util_method', 'unknown'),
"util_node_count": getattr(score, 'util_node_count', 0),
"flagged_nodes": score.flagged_nodes,
"battery_warnings": score.battery_warnings,
"total_nodes": health.total_nodes,

View file

@ -30,11 +30,12 @@ DEFAULT_OFFLINE_THRESHOLD_HOURS = 24
DEFAULT_PACKET_THRESHOLD = 500 # Non-text packets per 24h
DEFAULT_BATTERY_WARNING_PERCENT = 20
# Utilization thresholds (percentage)
UTIL_HEALTHY = 20
UTIL_CAUTION = 25
UTIL_WARNING = 35
UTIL_UNHEALTHY = 45
# Utilization thresholds (percentage) - based on real Meshtastic behavior
# Firmware starts throttling GPS at 25%, severe degradation above 35%
UTIL_HEALTHY = 20 # Under 20% = channel is clear
UTIL_CAUTION = 25 # 20-25% = slight degradation, occasional collisions
UTIL_WARNING = 35 # 25-35% = severe degradation, firmware throttling
UTIL_UNHEALTHY = 45 # 35-45% = mesh struggling badly, reliability dropping
# Pillar weights (5-pillar system)
WEIGHT_INFRASTRUCTURE = 0.30
@ -58,6 +59,9 @@ class HealthScore:
infra_online: int = 0
infra_total: int = 0
util_percent: float = 0.0
util_max_percent: float = 0.0 # Highest node utilization (hotspot indicator)
util_method: str = "none" # "telemetry", "packet_estimate", or "none"
util_node_count: int = 0 # Nodes reporting utilization
coverage_avg_gateways: float = 0.0
coverage_single_gw_count: int = 0
coverage_full_count: int = 0
@ -486,10 +490,19 @@ class MeshHealthEngine:
data_sources.append(f"{len(all_channels)} ch")
data_str = ", ".join(data_sources) if data_sources else "nodes only"
# Log utilization method used
util_method = mesh_score.util_method
if util_method == "telemetry":
util_info = f"util={mesh_score.util_percent:.1f}% (max={mesh_score.util_max_percent:.1f}%, {mesh_score.util_node_count} nodes reporting)"
elif util_method == "packet_estimate":
util_info = f"util={mesh_score.util_percent:.1f}% (packet estimate fallback)"
else:
util_info = "util=N/A (no data)"
logger.info(
f"Mesh health computed: {mesh_health.total_nodes} nodes, "
f"{mesh_health.total_regions} regions, score {mesh_score.composite:.0f}/100 "
f"[{data_str}]"
f"[{data_str}] [{util_info}]"
)
return mesh_health
@ -541,6 +554,31 @@ class MeshHealthEngine:
all_nodes = list(nodes.values())
return self._compute_node_group_score(all_nodes, has_packet_data)
def _compute_utilization_score(self, util_percent: float) -> float:
    """Map a channel-utilization percentage onto a 0-100 health score.

    The score is piecewise linear across the module-level UTIL_* thresholds
    (20 / 25 / 35 / 45 percent as defined in this file):

    - below UTIL_HEALTHY: flat 100
    - UTIL_HEALTHY..UTIL_CAUTION: falls linearly from 100 to 75
    - UTIL_CAUTION..UTIL_WARNING: falls linearly from 75 to 50
    - UTIL_WARNING..UTIL_UNHEALTHY: falls linearly from 50 to 25
    - UTIL_UNHEALTHY and above: falls from 25 to 0 over the next 10
      percentage points, then stays clamped at 0

    Args:
        util_percent: Channel utilization as a percentage.

    Returns:
        Health score between 0.0 and 100.0.
    """
    # A clear channel needs no interpolation at all.
    if util_percent < UTIL_HEALTHY:
        return 100.0

    # Each band: (lower bound, upper bound, score at the lower bound).
    # Every band sheds exactly 25 points across its width.
    bands = (
        (UTIL_HEALTHY, UTIL_CAUTION, 100.0),
        (UTIL_CAUTION, UTIL_WARNING, 75.0),
        (UTIL_WARNING, UTIL_UNHEALTHY, 50.0),
    )
    for lower, upper, score_at_lower in bands:
        if util_percent < upper:
            return score_at_lower - ((util_percent - lower) / (upper - lower)) * 25

    # Past the last threshold: final 25 points drain over a fixed 10-point
    # span, and the result is floored at zero.
    return max(0.0, 25.0 - ((util_percent - UTIL_UNHEALTHY) / 10) * 25)
def _compute_node_group_score(
self,
node_list: list[UnifiedNode],
@ -568,33 +606,84 @@ class MeshHealthEngine:
else:
infra_score = 100.0 # No infrastructure = not penalized
# Channel utilization (based on packet counts if available)
# BUG 7 FIX: Use actual Meshtastic airtime calculation
if has_packet_data:
# Channel utilization - prefer real telemetry over packet estimate
#
# Priority 1: Use firmware-reported channel_utilization from nodes
# This is the most accurate measure - the firmware calculates this
# from actual radio activity over the last minute.
#
# Priority 2: Fall back to packet count estimate if no telemetry
# This is a rough approximation using 200ms/packet (MediumFast preset).
# It's less accurate because different presets have different airtime,
# and it sums packets across all nodes regardless of channel.
util_percent = 0.0
util_max_percent = 0.0
util_score = 100.0
util_method = "none"
util_node_count = 0
util_data_available = False
# Try to get real channel_utilization from infrastructure nodes
# Use infrastructure nodes because they're the routers - they see the most traffic
util_readings = []
for n in infra_nodes:
if n.channel_utilization is not None and n.channel_utilization >= 0:
util_readings.append(n.channel_utilization)
# If no infra nodes have it, try all nodes
if not util_readings:
for n in node_list:
if n.channel_utilization is not None and n.channel_utilization >= 0:
util_readings.append(n.channel_utilization)
if util_readings:
# Use the HIGHEST value - the busiest node is the bottleneck
# If one router is at 45% utilization, the mesh has a problem
# even if other nodes are at 10%
util_max_percent = max(util_readings)
util_percent = util_max_percent # Use max for scoring
util_score = self._compute_utilization_score(util_percent)
util_method = "telemetry"
util_node_count = len(util_readings)
util_data_available = True
# NOTE: no average is computed here. Both util_percent and
# util_max_percent hold the maximum reading, because the busiest
# node is the bottleneck and therefore drives the score.
elif has_packet_data:
# Fallback: Estimate from packet counts
# This is a rough approximation - only use when telemetry unavailable
#
# WARNING: This method has known issues:
# - Assumes 200ms airtime per packet (only correct for MediumFast)
# - Sums packets across all nodes even on different channels
# - Can't distinguish retries from new packets
# Use real channel_utilization from telemetry when available.
total_non_text_packets = sum((n.packets_sent_24h - n.text_messages_24h) for n in node_list)
# Average airtime per packet on MediumFast: ~200ms
# Total available airtime per hour: 3,600,000ms
# Utilization = (packets_per_hour * airtime_ms) / total_airtime_ms * 100
packets_per_hour = total_non_text_packets / 24.0 # 24h window
airtime_per_packet_ms = 200 # ~200ms on MediumFast preset
util_percent = (packets_per_hour * airtime_per_packet_ms) / 3_600_000 * 100
util_max_percent = util_percent # No per-node data available
util_score = self._compute_utilization_score(util_percent)
util_method = "packet_estimate"
util_node_count = 0
util_data_available = True
# Apply scoring thresholds with interpolation
if util_percent < UTIL_HEALTHY: # <15%
util_score = 100.0
elif util_percent < UTIL_CAUTION: # 15-20%
util_score = 100.0 - ((util_percent - UTIL_HEALTHY) / (UTIL_CAUTION - UTIL_HEALTHY)) * 25
elif util_percent < UTIL_WARNING: # 20-25%
util_score = 75.0 - ((util_percent - UTIL_CAUTION) / (UTIL_WARNING - UTIL_CAUTION)) * 25
elif util_percent < UTIL_UNHEALTHY: # 25-35%
util_score = 50.0 - ((util_percent - UTIL_WARNING) / (UTIL_UNHEALTHY - UTIL_WARNING)) * 25
else: # 35%+
util_score = max(0.0, 25.0 - ((util_percent - UTIL_UNHEALTHY) / 10) * 25)
logger.debug(
f"Utilization using packet estimate fallback: {util_percent:.1f}% "
f"({total_non_text_packets} non-text packets/24h)"
)
else:
# No packet data available - assume healthy utilization
# This prevents penalizing the score when we simply don't have data
# No utilization data available - don't penalize
util_percent = 0.0
util_max_percent = 0.0
util_score = 100.0
util_method = "none"
util_node_count = 0
util_data_available = False
# Node behavior (flagged nodes)
flagged = [n for n in node_list if (n.packets_sent_24h - n.text_messages_24h) > self.packet_threshold]
@ -674,13 +763,16 @@ class MeshHealthEngine:
infra_online=infra_online,
infra_total=infra_total,
util_percent=util_percent,
util_max_percent=util_max_percent,
util_method=util_method,
util_node_count=util_node_count,
coverage_avg_gateways=coverage_avg_gw,
coverage_single_gw_count=coverage_single,
coverage_full_count=coverage_full,
flagged_nodes=flagged_count,
battery_warnings=battery_warnings,
solar_index=solar_index,
util_data_available=has_packet_data,
util_data_available=util_data_available,
coverage_data_available=coverage_available,
)