fix(health): use real channel utilization from node telemetry

- Utilization pillar now reads firmware-reported channel_utilization
  instead of estimating from packet counts with hardcoded 200ms/pkt
- Uses highest infra node value (busiest node = bottleneck)
- Falls back to packet count estimate only when telemetry unavailable
- Updated thresholds: 20/25/35/45% matching real Meshtastic behavior
- Per-region utilization from region nodes, not mesh-wide
- API response includes util_method, util_max_percent, util_node_count

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
K7ZVX 2026-05-13 22:49:41 +00:00
commit 57a19aeec6
3 changed files with 1267 additions and 1164 deletions

View file

@ -746,27 +746,32 @@ export default function Reference() {
<SubHeader>Utilization (25%)</SubHeader> <SubHeader>Utilization (25%)</SubHeader>
<p> <p>
Estimates how much of the radio channel's airtime is being used. MeshAI can't measure airtime directly, so it estimates based on packet counts over the last 24 hours. MeshAI reads the channel utilization that each router reports in its telemetry — this is the firmware's own measurement of how busy the radio channel is. MeshAI uses the <strong>highest</strong> value from any infrastructure node because the busiest router is the bottleneck for the whole mesh.
</p>
<p className="p-3 bg-slate-800 rounded font-mono text-sm">
packets_per_hour = non_text_packets ÷ 24<br/>
airtime_estimate = (packets_per_hour × 200ms) ÷ 3,600,000ms × 100%
</p> </p>
<p> <p>
The 200ms is an approximation for the MediumFast radio preset — each LoRa packet takes roughly 200ms of airtime. Text messages don't count toward utilization (chatting is the point of a mesh). <strong>How it works:</strong>
</p>
<ol className="list-decimal list-inside space-y-1 ml-4">
<li>Collect <Mono>channel_utilization</Mono> from all infrastructure nodes that report it</li>
<li>If no infra nodes have telemetry, try all nodes</li>
<li>Use the <strong>maximum</strong> value for scoring (busiest node = bottleneck)</li>
<li>If no nodes report utilization (older firmware), fall back to packet count estimate</li>
</ol>
<p className="mt-4">
<strong>Fallback method</strong> (when telemetry unavailable): estimates from packet counts using 200ms/packet airtime. This is less accurate — it assumes the MediumFast preset and sums packets across all nodes.
</p> </p>
<RefTable <RefTable
headers={['Estimated Airtime', 'Score', 'What It Means']} headers={['Channel Utilization', 'Score', 'What It Means']}
rows={[ rows={[
['Under 20%', '100', 'Channel is clear — this is the goal'], ['Under 20%', '100', 'Channel is clear — this is the goal'],
['20-25%', '75-100', 'Slight degradation, occasional collisions'], ['20-25%', '75-100', 'Slight degradation, occasional collisions'],
['25-35%', '50-75', 'Severe degradation — firmware throttling active'], ['25-35%', '50-75', 'Severe degradation — firmware throttling active'],
['35-45%', '25-50', 'Mesh struggling badly — reliability dropping'], ['35-45%', '25-50', 'Mesh struggling badly — reliability dropping'],
['Over 45%', '0-25', 'Mesh is effectively dead'], ['Over 45%', '0-25', 'Mesh is effectively unusable'],
]} ]}
/> />
<p> <p>
<strong>Special case:</strong> If MeshAI doesn't have packet data (no sources reporting packet counts), this pillar scores 100. You're not penalized for missing data. <strong>Special case:</strong> If no utilization data is available (no telemetry and no packet data), this pillar scores 100. You're not penalized for missing data.
</p> </p>
<SubHeader>Coverage (20%)</SubHeader> <SubHeader>Coverage (20%)</SubHeader>

View file

@ -20,6 +20,9 @@ def _serialize_health_score(score) -> dict:
"infra_online": score.infra_online, "infra_online": score.infra_online,
"infra_total": score.infra_total, "infra_total": score.infra_total,
"util_percent": round(score.util_percent, 1), "util_percent": round(score.util_percent, 1),
"util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
"util_method": getattr(score, 'util_method', 'unknown'),
"util_node_count": getattr(score, 'util_node_count', 0),
"flagged_nodes": score.flagged_nodes, "flagged_nodes": score.flagged_nodes,
"battery_warnings": score.battery_warnings, "battery_warnings": score.battery_warnings,
"solar_index": round(score.solar_index, 1), "solar_index": round(score.solar_index, 1),
@ -76,6 +79,9 @@ async def get_health(request: Request):
"infra_online": score.infra_online, "infra_online": score.infra_online,
"infra_total": score.infra_total, "infra_total": score.infra_total,
"util_percent": round(score.util_percent, 1), "util_percent": round(score.util_percent, 1),
"util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
"util_method": getattr(score, 'util_method', 'unknown'),
"util_node_count": getattr(score, 'util_node_count', 0),
"flagged_nodes": score.flagged_nodes, "flagged_nodes": score.flagged_nodes,
"battery_warnings": score.battery_warnings, "battery_warnings": score.battery_warnings,
"total_nodes": health.total_nodes, "total_nodes": health.total_nodes,

View file

@ -30,11 +30,12 @@ DEFAULT_OFFLINE_THRESHOLD_HOURS = 24
DEFAULT_PACKET_THRESHOLD = 500 # Non-text packets per 24h DEFAULT_PACKET_THRESHOLD = 500 # Non-text packets per 24h
DEFAULT_BATTERY_WARNING_PERCENT = 20 DEFAULT_BATTERY_WARNING_PERCENT = 20
# Utilization thresholds (percentage) # Utilization thresholds (percentage) - based on real Meshtastic behavior
UTIL_HEALTHY = 20 # Firmware starts throttling GPS at 25%, severe degradation above 35%
UTIL_CAUTION = 25 UTIL_HEALTHY = 20 # Under 20% = channel is clear
UTIL_WARNING = 35 UTIL_CAUTION = 25 # 20-25% = slight degradation, occasional collisions
UTIL_UNHEALTHY = 45 UTIL_WARNING = 35 # 25-35% = severe degradation, firmware throttling
UTIL_UNHEALTHY = 45 # 35-45% = mesh struggling badly, reliability dropping
# Pillar weights (5-pillar system) # Pillar weights (5-pillar system)
WEIGHT_INFRASTRUCTURE = 0.30 WEIGHT_INFRASTRUCTURE = 0.30
@ -58,6 +59,9 @@ class HealthScore:
infra_online: int = 0 infra_online: int = 0
infra_total: int = 0 infra_total: int = 0
util_percent: float = 0.0 util_percent: float = 0.0
util_max_percent: float = 0.0 # Highest node utilization (hotspot indicator)
util_method: str = "none" # "telemetry", "packet_estimate", or "none"
util_node_count: int = 0 # Nodes reporting utilization
coverage_avg_gateways: float = 0.0 coverage_avg_gateways: float = 0.0
coverage_single_gw_count: int = 0 coverage_single_gw_count: int = 0
coverage_full_count: int = 0 coverage_full_count: int = 0
@ -486,10 +490,19 @@ class MeshHealthEngine:
data_sources.append(f"{len(all_channels)} ch") data_sources.append(f"{len(all_channels)} ch")
data_str = ", ".join(data_sources) if data_sources else "nodes only" data_str = ", ".join(data_sources) if data_sources else "nodes only"
# Log utilization method used
util_method = mesh_score.util_method
if util_method == "telemetry":
util_info = f"util={mesh_score.util_percent:.1f}% (max={mesh_score.util_max_percent:.1f}%, {mesh_score.util_node_count} nodes reporting)"
elif util_method == "packet_estimate":
util_info = f"util={mesh_score.util_percent:.1f}% (packet estimate fallback)"
else:
util_info = "util=N/A (no data)"
logger.info( logger.info(
f"Mesh health computed: {mesh_health.total_nodes} nodes, " f"Mesh health computed: {mesh_health.total_nodes} nodes, "
f"{mesh_health.total_regions} regions, score {mesh_score.composite:.0f}/100 " f"{mesh_health.total_regions} regions, score {mesh_score.composite:.0f}/100 "
f"[{data_str}]" f"[{data_str}] [{util_info}]"
) )
return mesh_health return mesh_health
@ -541,6 +554,31 @@ class MeshHealthEngine:
all_nodes = list(nodes.values()) all_nodes = list(nodes.values())
return self._compute_node_group_score(all_nodes, has_packet_data) return self._compute_node_group_score(all_nodes, has_packet_data)
def _compute_utilization_score(self, util_percent: float) -> float:
    """Map a channel-utilization percentage onto a 0-100 health score.

    The score is piecewise linear over the module-level UTIL_* thresholds:

    - below UTIL_HEALTHY: flat 100 (clear channel)
    - UTIL_HEALTHY to UTIL_CAUTION: 100 down to 75 (slight degradation)
    - UTIL_CAUTION to UTIL_WARNING: 75 down to 50 (severe degradation,
      firmware throttling)
    - UTIL_WARNING to UTIL_UNHEALTHY: 50 down to 25 (mesh struggling badly)
    - UTIL_UNHEALTHY and above: 25 down to 0 over the next 10 percentage
      points, then clamped at 0 (mesh effectively unusable)

    Args:
        util_percent: Channel utilization as a percentage.

    Returns:
        The health score for the utilization pillar, never below 0.0.
    """
    # Anything under the healthy threshold gets full marks.
    if util_percent < UTIL_HEALTHY:
        return 100.0

    # Each entry: (band start, band end, score at the band start).
    # Every band sheds 25 score points linearly across its width.
    bands = (
        (UTIL_HEALTHY, UTIL_CAUTION, 100.0),
        (UTIL_CAUTION, UTIL_WARNING, 75.0),
        (UTIL_WARNING, UTIL_UNHEALTHY, 50.0),
    )
    for lower, upper, top_score in bands:
        if util_percent < upper:
            fraction = (util_percent - lower) / (upper - lower)
            return top_score - fraction * 25

    # Past the last threshold: lose the final 25 points over the next 10
    # percentage points, clamped so the score never goes negative.
    overshoot = (util_percent - UTIL_UNHEALTHY) / 10
    return max(0.0, 25.0 - overshoot * 25)
def _compute_node_group_score( def _compute_node_group_score(
self, self,
node_list: list[UnifiedNode], node_list: list[UnifiedNode],
@ -568,33 +606,84 @@ class MeshHealthEngine:
else: else:
infra_score = 100.0 # No infrastructure = not penalized infra_score = 100.0 # No infrastructure = not penalized
# Channel utilization (based on packet counts if available) # Channel utilization - prefer real telemetry over packet estimate
# BUG 7 FIX: Use actual Meshtastic airtime calculation #
if has_packet_data: # Priority 1: Use firmware-reported channel_utilization from nodes
# This is the most accurate measure - the firmware calculates this
# from actual radio activity over the last minute.
#
# Priority 2: Fall back to packet count estimate if no telemetry
# This is a rough approximation using 200ms/packet (MediumFast preset).
# It's less accurate because different presets have different airtime,
# and it sums packets across all nodes regardless of channel.
util_percent = 0.0
util_max_percent = 0.0
util_score = 100.0
util_method = "none"
util_node_count = 0
util_data_available = False
# Try to get real channel_utilization from infrastructure nodes
# Use infrastructure nodes because they're the routers - they see the most traffic
util_readings = []
for n in infra_nodes:
if n.channel_utilization is not None and n.channel_utilization >= 0:
util_readings.append(n.channel_utilization)
# If no infra nodes have it, try all nodes
if not util_readings:
for n in node_list:
if n.channel_utilization is not None and n.channel_utilization >= 0:
util_readings.append(n.channel_utilization)
if util_readings:
# Use the HIGHEST value - the busiest node is the bottleneck
# If one router is at 45% utilization, the mesh has a problem
# even if other nodes are at 10%
util_max_percent = max(util_readings)
util_percent = util_max_percent # Use max for scoring
util_score = self._compute_utilization_score(util_percent)
util_method = "telemetry"
util_node_count = len(util_readings)
util_data_available = True
# NOTE: no average is computed here. util_percent and util_max_percent
# both hold the maximum reading, and that maximum drives the score
# because the busiest node is the bottleneck.
elif has_packet_data:
# Fallback: Estimate from packet counts
# This is a rough approximation - only use when telemetry unavailable
#
# WARNING: This method has known issues:
# - Assumes 200ms airtime per packet (only correct for MediumFast)
# - Sums packets across all nodes even on different channels
# - Can't distinguish retries from new packets
# Use real channel_utilization from telemetry when available.
total_non_text_packets = sum((n.packets_sent_24h - n.text_messages_24h) for n in node_list) total_non_text_packets = sum((n.packets_sent_24h - n.text_messages_24h) for n in node_list)
# Average airtime per packet on MediumFast: ~200ms
# Total available airtime per hour: 3,600,000ms
# Utilization = (packets_per_hour * airtime_ms) / total_airtime_ms * 100
packets_per_hour = total_non_text_packets / 24.0 # 24h window packets_per_hour = total_non_text_packets / 24.0 # 24h window
airtime_per_packet_ms = 200 # ~200ms on MediumFast preset airtime_per_packet_ms = 200 # ~200ms on MediumFast preset
util_percent = (packets_per_hour * airtime_per_packet_ms) / 3_600_000 * 100 util_percent = (packets_per_hour * airtime_per_packet_ms) / 3_600_000 * 100
util_max_percent = util_percent # No per-node data available
util_score = self._compute_utilization_score(util_percent)
util_method = "packet_estimate"
util_node_count = 0
util_data_available = True
# Apply scoring thresholds with interpolation logger.debug(
if util_percent < UTIL_HEALTHY: # <15% f"Utilization using packet estimate fallback: {util_percent:.1f}% "
util_score = 100.0 f"({total_non_text_packets} non-text packets/24h)"
elif util_percent < UTIL_CAUTION: # 15-20% )
util_score = 100.0 - ((util_percent - UTIL_HEALTHY) / (UTIL_CAUTION - UTIL_HEALTHY)) * 25
elif util_percent < UTIL_WARNING: # 20-25%
util_score = 75.0 - ((util_percent - UTIL_CAUTION) / (UTIL_WARNING - UTIL_CAUTION)) * 25
elif util_percent < UTIL_UNHEALTHY: # 25-35%
util_score = 50.0 - ((util_percent - UTIL_WARNING) / (UTIL_UNHEALTHY - UTIL_WARNING)) * 25
else: # 35%+
util_score = max(0.0, 25.0 - ((util_percent - UTIL_UNHEALTHY) / 10) * 25)
else: else:
# No packet data available - assume healthy utilization # No utilization data available - don't penalize
# This prevents penalizing the score when we simply don't have data
util_percent = 0.0 util_percent = 0.0
util_max_percent = 0.0
util_score = 100.0 util_score = 100.0
util_method = "none"
util_node_count = 0
util_data_available = False
# Node behavior (flagged nodes) # Node behavior (flagged nodes)
flagged = [n for n in node_list if (n.packets_sent_24h - n.text_messages_24h) > self.packet_threshold] flagged = [n for n in node_list if (n.packets_sent_24h - n.text_messages_24h) > self.packet_threshold]
@ -674,13 +763,16 @@ class MeshHealthEngine:
infra_online=infra_online, infra_online=infra_online,
infra_total=infra_total, infra_total=infra_total,
util_percent=util_percent, util_percent=util_percent,
util_max_percent=util_max_percent,
util_method=util_method,
util_node_count=util_node_count,
coverage_avg_gateways=coverage_avg_gw, coverage_avg_gateways=coverage_avg_gw,
coverage_single_gw_count=coverage_single, coverage_single_gw_count=coverage_single,
coverage_full_count=coverage_full, coverage_full_count=coverage_full,
flagged_nodes=flagged_count, flagged_nodes=flagged_count,
battery_warnings=battery_warnings, battery_warnings=battery_warnings,
solar_index=solar_index, solar_index=solar_index,
util_data_available=has_packet_data, util_data_available=util_data_available,
coverage_data_available=coverage_available, coverage_data_available=coverage_available,
) )