fix(health): use real channel utilization from node telemetry

- Utilization pillar now reads firmware-reported channel_utilization instead of estimating from packet counts with hardcoded 200ms/pkt
- Uses highest infra node value (busiest node = bottleneck)
- Falls back to packet count estimate only when telemetry unavailable
- Updated thresholds: 20/25/35/45% matching real Meshtastic behavior
- Per-region utilization from region nodes, not mesh-wide
- API response includes util_method, util_max_percent, util_node_count

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-21 23:24:44 +02:00 · 2026-05-13 22:49:41 +00:00 · 2026-05-13 22:49:41 +00:00 · 57a19aeec6
commit 57a19aeec6
parent 7de02fb924
3 changed files with 1267 additions and 1164 deletions
--- a/dashboard-frontend/src/pages/Reference.tsx
+++ b/dashboard-frontend/src/pages/Reference.tsx
@ -746,27 +746,32 @@ export default function Reference() {
            <SubHeader>Utilization (25%)</SubHeader>
            <p>
-              Estimates how much of the radio channel's airtime is being used. MeshAI can't measure airtime directly, so it estimates based on packet counts over the last 24 hours.
+              MeshAI reads the channel utilization that each router reports in its telemetry — this is the firmware's own measurement of how busy the radio channel is. MeshAI uses the <strong>highest</strong> value from any infrastructure node because the busiest router is the bottleneck for the whole mesh.
            </p>
            <p className="p-3 bg-slate-800 rounded font-mono text-sm">
              packets_per_hour = non_text_packets ÷ 24<br/>
              airtime_estimate = (packets_per_hour × 200ms) ÷ 3,600,000ms × 100%
            </p>
            <p>
-              The 200ms is an approximation for the MediumFast radio preset — each LoRa packet takes roughly 200ms of airtime. Text messages don't count toward utilization (chatting is the point of a mesh).
+              <strong>How it works:</strong>
            </p>
            <ol className="list-decimal list-inside space-y-1 ml-4">
              <li>Collect <Mono>channel_utilization</Mono> from all infrastructure nodes that report it</li>
              <li>If no infra nodes have telemetry, try all nodes</li>
              <li>Use the <strong>maximum</strong> value for scoring (busiest node = bottleneck)</li>
              <li>If no nodes report utilization (older firmware), fall back to packet count estimate</li>
            </ol>
            <p className="mt-4">
              <strong>Fallback method</strong> (when telemetry unavailable): estimates from packet counts using 200ms/packet airtime. This is less accurate — it assumes MediumFast preset and sums packets across all nodes.
            </p>
            <RefTable
-              headers={['Estimated Airtime', 'Score', 'What It Means']}
+              headers={['Channel Utilization', 'Score', 'What It Means']}
              rows={[
                ['Under 20%', '100', 'Channel is clear — this is the goal'],
                ['20-25%', '75-100', 'Slight degradation, occasional collisions'],
                ['25-35%', '50-75', 'Severe degradation — firmware throttling active'],
                ['35-45%', '25-50', 'Mesh struggling badly — reliability dropping'],
-                ['Over 45%', '0-25', 'Mesh is effectively dead'],
+                ['Over 45%', '0-25', 'Mesh is effectively unusable'],
              ]}
            />
            <p>
-              <strong>Special case:</strong> If MeshAI doesn't have packet data (no sources reporting packet counts), this pillar scores 100. You're not penalized for missing data.
+              <strong>Special case:</strong> If no utilization data is available (no telemetry and no packet data), this pillar scores 100. You're not penalized for missing data.
            </p>
            <SubHeader>Coverage (20%)</SubHeader>
--- a/meshai/dashboard/api/mesh_routes.py
+++ b/meshai/dashboard/api/mesh_routes.py
@ -20,6 +20,9 @@ def _serialize_health_score(score) -> dict:
        "infra_online": score.infra_online,
        "infra_total": score.infra_total,
        "util_percent": round(score.util_percent, 1),
        "util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
        "util_method": getattr(score, 'util_method', 'unknown'),
        "util_node_count": getattr(score, 'util_node_count', 0),
        "flagged_nodes": score.flagged_nodes,
        "battery_warnings": score.battery_warnings,
        "solar_index": round(score.solar_index, 1),
@ -76,6 +79,9 @@ async def get_health(request: Request):
        "infra_online": score.infra_online,
        "infra_total": score.infra_total,
        "util_percent": round(score.util_percent, 1),
        "util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
        "util_method": getattr(score, 'util_method', 'unknown'),
        "util_node_count": getattr(score, 'util_node_count', 0),
        "flagged_nodes": score.flagged_nodes,
        "battery_warnings": score.battery_warnings,
        "total_nodes": health.total_nodes,
--- a/meshai/mesh_health.py
+++ b/meshai/mesh_health.py
@ -30,11 +30,12 @@ DEFAULT_OFFLINE_THRESHOLD_HOURS = 24
 DEFAULT_PACKET_THRESHOLD = 500  # Non-text packets per 24h
 DEFAULT_BATTERY_WARNING_PERCENT = 20
-# Utilization thresholds (percentage)
+# Utilization thresholds (percentage) - based on real Meshtastic behavior
-UTIL_HEALTHY = 20
+# Firmware starts throttling GPS at 25%, severe degradation above 35%
-UTIL_CAUTION = 25
+UTIL_HEALTHY = 20     # Under 20% = channel is clear
-UTIL_WARNING = 35
+UTIL_CAUTION = 25     # 20-25% = slight degradation, occasional collisions
-UTIL_UNHEALTHY = 45
+UTIL_WARNING = 35     # 25-35% = severe degradation, firmware throttling
 UTIL_UNHEALTHY = 45   # 35-45% = mesh struggling badly, reliability dropping
 # Pillar weights (5-pillar system)
 WEIGHT_INFRASTRUCTURE = 0.30
@ -58,6 +59,9 @@ class HealthScore:
    infra_online: int = 0
    infra_total: int = 0
    util_percent: float = 0.0
    util_max_percent: float = 0.0  # Highest node utilization (hotspot indicator)
    util_method: str = "none"  # "telemetry", "packet_estimate", or "none"
    util_node_count: int = 0  # Nodes reporting utilization
    coverage_avg_gateways: float = 0.0
    coverage_single_gw_count: int = 0
    coverage_full_count: int = 0
@ -486,10 +490,19 @@ class MeshHealthEngine:
            data_sources.append(f"{len(all_channels)} ch")
        data_str = ", ".join(data_sources) if data_sources else "nodes only"
        # Log utilization method used
        util_method = mesh_score.util_method
        if util_method == "telemetry":
            util_info = f"util={mesh_score.util_percent:.1f}% (max={mesh_score.util_max_percent:.1f}%, {mesh_score.util_node_count} nodes reporting)"
        elif util_method == "packet_estimate":
            util_info = f"util={mesh_score.util_percent:.1f}% (packet estimate fallback)"
        else:
            util_info = "util=N/A (no data)"
        logger.info(
            f"Mesh health computed: {mesh_health.total_nodes} nodes, "
            f"{mesh_health.total_regions} regions, score {mesh_score.composite:.0f}/100 "
-            f"[{data_str}]"
+            f"[{data_str}] [{util_info}]"
        )
        return mesh_health
@ -541,6 +554,31 @@ class MeshHealthEngine:
        all_nodes = list(nodes.values())
        return self._compute_node_group_score(all_nodes, has_packet_data)
    def _compute_utilization_score(self, util_percent: float) -> float:
        """Map a channel-utilization percentage onto a 0-100 health score.

        The mapping is piecewise linear and loses 25 points per band:

        - below UTIL_HEALTHY: flat 100
        - UTIL_HEALTHY to UTIL_CAUTION: 100 down to 75
        - UTIL_CAUTION to UTIL_WARNING: 75 down to 50
        - UTIL_WARNING to UTIL_UNHEALTHY: 50 down to 25
        - UTIL_UNHEALTHY and above: 25 down to 0 across the following
          10 percentage points, clamped at 0

        Args:
            util_percent: Channel utilization as a percentage.

        Returns:
            The health score for the utilization pillar.
        """
        if util_percent < UTIL_HEALTHY:
            return 100.0

        # Each entry: (band start, band end, score at the band start).
        bands = (
            (UTIL_HEALTHY, UTIL_CAUTION, 100.0),
            (UTIL_CAUTION, UTIL_WARNING, 75.0),
            (UTIL_WARNING, UTIL_UNHEALTHY, 50.0),
        )
        for lower, upper, start_score in bands:
            if util_percent < upper:
                # Linear drop of 25 points between the band's two edges.
                fraction = (util_percent - lower) / (upper - lower)
                return start_score - fraction * 25

        # Past the last threshold: the remaining 25 points fade out over the
        # next 10 percentage points and never go below zero.
        overshoot = (util_percent - UTIL_UNHEALTHY) / 10
        return max(0.0, 25.0 - overshoot * 25)
    def _compute_node_group_score(
        self,
        node_list: list[UnifiedNode],
@ -568,33 +606,84 @@ class MeshHealthEngine:
        else:
            infra_score = 100.0  # No infrastructure = not penalized
-        # Channel utilization (based on packet counts if available)
+        # Channel utilization - prefer real telemetry over packet estimate
-        # BUG 7 FIX: Use actual Meshtastic airtime calculation
+        #
-        if has_packet_data:
+        # Priority 1: Use firmware-reported channel_utilization from nodes
        # This is the most accurate measure - the firmware calculates this
        # from actual radio activity over the last minute.
        #
        # Priority 2: Fall back to packet count estimate if no telemetry
        # This is a rough approximation using 200ms/packet (MediumFast preset).
        # It's less accurate because different presets have different airtime,
        # and it sums packets across all nodes regardless of channel.
        util_percent = 0.0
        util_max_percent = 0.0
        util_score = 100.0
        util_method = "none"
        util_node_count = 0
        util_data_available = False
        # Try to get real channel_utilization from infrastructure nodes
        # Use infrastructure nodes because they're the routers - they see the most traffic
        util_readings = []
        for n in infra_nodes:
            if n.channel_utilization is not None and n.channel_utilization >= 0:
                util_readings.append(n.channel_utilization)
        # If no infra nodes have it, try all nodes
        if not util_readings:
            for n in node_list:
                if n.channel_utilization is not None and n.channel_utilization >= 0:
                    util_readings.append(n.channel_utilization)
        if util_readings:
            # Use the HIGHEST value - the busiest node is the bottleneck
            # If one router is at 45% utilization, the mesh has a problem
            # even if other nodes are at 10%
            util_max_percent = max(util_readings)
            util_percent = util_max_percent  # Use max for scoring
            util_score = self._compute_utilization_score(util_percent)
            util_method = "telemetry"
            util_node_count = len(util_readings)
            util_data_available = True
            # NOTE: util_percent and util_max_percent both hold the maximum
            # reading; no average is computed, because the busiest node is
            # the bottleneck and therefore drives the score.
        elif has_packet_data:
            # Fallback: Estimate from packet counts
            # This is a rough approximation - only use when telemetry unavailable
            #
            # WARNING: This method has known issues:
            # - Assumes 200ms airtime per packet (only correct for MediumFast)
            # - Sums packets across all nodes even on different channels
            # - Can't distinguish retries from new packets
            # Use real channel_utilization from telemetry when available.
            total_non_text_packets = sum((n.packets_sent_24h - n.text_messages_24h) for n in node_list)
            # Average airtime per packet on MediumFast: ~200ms
            # Total available airtime per hour: 3,600,000ms
            # Utilization = (packets_per_hour * airtime_ms) / total_airtime_ms * 100
            packets_per_hour = total_non_text_packets / 24.0  # 24h window
            airtime_per_packet_ms = 200  # ~200ms on MediumFast preset
            util_percent = (packets_per_hour * airtime_per_packet_ms) / 3_600_000 * 100
            util_max_percent = util_percent  # No per-node data available
            util_score = self._compute_utilization_score(util_percent)
            util_method = "packet_estimate"
            util_node_count = 0
            util_data_available = True
-            # Apply scoring thresholds with interpolation
+            logger.debug(
-            if util_percent < UTIL_HEALTHY:  # <15%
+                f"Utilization using packet estimate fallback: {util_percent:.1f}% "
-                util_score = 100.0
+                f"({total_non_text_packets} non-text packets/24h)"
-            elif util_percent < UTIL_CAUTION:  # 15-20%
+            )
                util_score = 100.0 - ((util_percent - UTIL_HEALTHY) / (UTIL_CAUTION - UTIL_HEALTHY)) * 25
            elif util_percent < UTIL_WARNING:  # 20-25%
                util_score = 75.0 - ((util_percent - UTIL_CAUTION) / (UTIL_WARNING - UTIL_CAUTION)) * 25
            elif util_percent < UTIL_UNHEALTHY:  # 25-35%
                util_score = 50.0 - ((util_percent - UTIL_WARNING) / (UTIL_UNHEALTHY - UTIL_WARNING)) * 25
            else:  # 35%+
                util_score = max(0.0, 25.0 - ((util_percent - UTIL_UNHEALTHY) / 10) * 25)
        else:
-            # No packet data available - assume healthy utilization
+            # No utilization data available - don't penalize
            # This prevents penalizing the score when we simply don't have data
            util_percent = 0.0
            util_max_percent = 0.0
            util_score = 100.0
            util_method = "none"
            util_node_count = 0
            util_data_available = False
        # Node behavior (flagged nodes)
        flagged = [n for n in node_list if (n.packets_sent_24h - n.text_messages_24h) > self.packet_threshold]
@ -674,13 +763,16 @@ class MeshHealthEngine:
            infra_online=infra_online,
            infra_total=infra_total,
            util_percent=util_percent,
            util_max_percent=util_max_percent,
            util_method=util_method,
            util_node_count=util_node_count,
            coverage_avg_gateways=coverage_avg_gw,
            coverage_single_gw_count=coverage_single,
            coverage_full_count=coverage_full,
            flagged_nodes=flagged_count,
            battery_warnings=battery_warnings,
            solar_index=solar_index,
-            util_data_available=has_packet_data,
+            util_data_available=util_data_available,
            coverage_data_available=coverage_available,
        )