fix(health): use real channel utilization from node telemetry

- Utilization pillar now reads firmware-reported channel_utilization instead of estimating from packet counts with hardcoded 200ms/pkt
- Uses highest infra node value (busiest node = bottleneck)
- Falls back to packet count estimate only when telemetry is unavailable
- Updated thresholds: 20/25/35/45%, matching real Meshtastic behavior
- Per-region utilization comes from region nodes, not mesh-wide
- API response includes util_method, util_max_percent, util_node_count

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-21 23:24:44 +02:00 · 2026-05-13 22:49:41 +00:00 · 2026-05-13 22:49:41 +00:00 · 57a19aeec6
commit 57a19aeec6
parent 7de02fb924
3 changed files with 1267 additions and 1164 deletions
--- a/dashboard-frontend/src/pages/Reference.tsx
+++ b/dashboard-frontend/src/pages/Reference.tsx
@ -746,27 +746,32 @@ export default function Reference() {

            <SubHeader>Utilization (25%)</SubHeader>
            <p>
-              Estimates how much of the radio channel's airtime is being used. MeshAI can't measure airtime directly, so it estimates based on packet counts over the last 24 hours.
-            </p>
-            <p className="p-3 bg-slate-800 rounded font-mono text-sm">
-              packets_per_hour = non_text_packets ÷ 24<br/>
-              airtime_estimate = (packets_per_hour × 200ms) ÷ 3,600,000ms × 100%
+              MeshAI reads the channel utilization that each router reports in its telemetry — this is the firmware's own measurement of how busy the radio channel is. MeshAI uses the <strong>highest</strong> value from any infrastructure node because the busiest router is the bottleneck for the whole mesh.
            </p>
            <p>
-              The 200ms is an approximation for the MediumFast radio preset — each LoRa packet takes roughly 200ms of airtime. Text messages don't count toward utilization (chatting is the point of a mesh).
+              <strong>How it works:</strong>
+            </p>
+            <ol className="list-decimal list-inside space-y-1 ml-4">
+              <li>Collect <Mono>channel_utilization</Mono> from all infrastructure nodes that report it</li>
+              <li>If no infra nodes have telemetry, try all nodes</li>
+              <li>Use the <strong>maximum</strong> value for scoring (busiest node = bottleneck)</li>
+              <li>If no nodes report utilization (older firmware), fall back to packet count estimate</li>
+            </ol>
+            <p className="mt-4">
+              <strong>Fallback method</strong> (when telemetry unavailable): estimates from packet counts using 200ms/packet airtime. This is less accurate — it assumes MediumFast preset and sums packets across all nodes.
            </p>
            <RefTable
-              headers={['Estimated Airtime', 'Score', 'What It Means']}
+              headers={['Channel Utilization', 'Score', 'What It Means']}
              rows={[
                ['Under 20%', '100', 'Channel is clear — this is the goal'],
                ['20-25%', '75-100', 'Slight degradation, occasional collisions'],
                ['25-35%', '50-75', 'Severe degradation — firmware throttling active'],
                ['35-45%', '25-50', 'Mesh struggling badly — reliability dropping'],
-                ['Over 45%', '0-25', 'Mesh is effectively dead'],
+                ['Over 45%', '0-25', 'Mesh is effectively unusable'],
              ]}
            />
            <p>
-              <strong>Special case:</strong> If MeshAI doesn't have packet data (no sources reporting packet counts), this pillar scores 100. You're not penalized for missing data.
+              <strong>Special case:</strong> If no utilization data is available (no telemetry and no packet data), this pillar scores 100. You're not penalized for missing data.
            </p>

            <SubHeader>Coverage (20%)</SubHeader>
--- a/meshai/dashboard/api/mesh_routes.py
+++ b/meshai/dashboard/api/mesh_routes.py
@ -20,6 +20,9 @@ def _serialize_health_score(score) -> dict:
        "infra_online": score.infra_online,
        "infra_total": score.infra_total,
        "util_percent": round(score.util_percent, 1),
+        "util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
+        "util_method": getattr(score, 'util_method', 'unknown'),
+        "util_node_count": getattr(score, 'util_node_count', 0),
        "flagged_nodes": score.flagged_nodes,
        "battery_warnings": score.battery_warnings,
        "solar_index": round(score.solar_index, 1),
@ -76,6 +79,9 @@ async def get_health(request: Request):
        "infra_online": score.infra_online,
        "infra_total": score.infra_total,
        "util_percent": round(score.util_percent, 1),
+        "util_max_percent": round(getattr(score, 'util_max_percent', score.util_percent), 1),
+        "util_method": getattr(score, 'util_method', 'unknown'),
+        "util_node_count": getattr(score, 'util_node_count', 0),
        "flagged_nodes": score.flagged_nodes,
        "battery_warnings": score.battery_warnings,
        "total_nodes": health.total_nodes,
--- a/meshai/mesh_health.py
+++ b/meshai/mesh_health.py
@ -30,11 +30,12 @@ DEFAULT_OFFLINE_THRESHOLD_HOURS = 24
 DEFAULT_PACKET_THRESHOLD = 500  # Non-text packets per 24h
 DEFAULT_BATTERY_WARNING_PERCENT = 20

-# Utilization thresholds (percentage)
-UTIL_HEALTHY = 20
-UTIL_CAUTION = 25
-UTIL_WARNING = 35
-UTIL_UNHEALTHY = 45
+# Utilization thresholds (percentage) - based on real Meshtastic behavior
+# Firmware starts throttling GPS at 25%, severe degradation above 35%
+UTIL_HEALTHY = 20     # Under 20% = channel is clear
+UTIL_CAUTION = 25     # 20-25% = slight degradation, occasional collisions
+UTIL_WARNING = 35     # 25-35% = severe degradation, firmware throttling
+UTIL_UNHEALTHY = 45   # 35-45% = mesh struggling badly, reliability dropping

 # Pillar weights (5-pillar system)
 WEIGHT_INFRASTRUCTURE = 0.30
@ -58,6 +59,9 @@ class HealthScore:
    infra_online: int = 0
    infra_total: int = 0
    util_percent: float = 0.0
+    util_max_percent: float = 0.0  # Highest node utilization (hotspot indicator)
+    util_method: str = "none"  # "telemetry", "packet_estimate", or "none"
+    util_node_count: int = 0  # Nodes reporting utilization
    coverage_avg_gateways: float = 0.0
    coverage_single_gw_count: int = 0
    coverage_full_count: int = 0
@ -486,10 +490,19 @@ class MeshHealthEngine:
            data_sources.append(f"{len(all_channels)} ch")
        data_str = ", ".join(data_sources) if data_sources else "nodes only"

+        # Log utilization method used
+        util_method = mesh_score.util_method
+        if util_method == "telemetry":
+            util_info = f"util={mesh_score.util_percent:.1f}% (max={mesh_score.util_max_percent:.1f}%, {mesh_score.util_node_count} nodes reporting)"
+        elif util_method == "packet_estimate":
+            util_info = f"util={mesh_score.util_percent:.1f}% (packet estimate fallback)"
+        else:
+            util_info = "util=N/A (no data)"
+
        logger.info(
            f"Mesh health computed: {mesh_health.total_nodes} nodes, "
            f"{mesh_health.total_regions} regions, score {mesh_score.composite:.0f}/100 "
-            f"[{data_str}]"
+            f"[{data_str}] [{util_info}]"
        )

        return mesh_health
@ -541,6 +554,31 @@ class MeshHealthEngine:
        all_nodes = list(nodes.values())
        return self._compute_node_group_score(all_nodes, has_packet_data)

+    def _compute_utilization_score(self, util_percent: float) -> float:
+        """Convert utilization percentage to health score using thresholds.
+
+        Thresholds based on real Meshtastic behavior:
+        - Under 20%: Clear channel (score 100)
+        - 20-25%: Slight degradation (score 75-100)
+        - 25-35%: Severe degradation, firmware throttling (score 50-75)
+        - 35-45%: Mesh struggling badly (score 25-50)
+        - Over 45%: Mesh effectively unusable (score 0-25, reaching 0 at 55%)
+        """
+        if util_percent < UTIL_HEALTHY:  # <20%
+            return 100.0
+        elif util_percent < UTIL_CAUTION:  # 20-25%
+            # Interpolate from 100 to 75
+            return 100.0 - ((util_percent - UTIL_HEALTHY) / (UTIL_CAUTION - UTIL_HEALTHY)) * 25
+        elif util_percent < UTIL_WARNING:  # 25-35%
+            # Interpolate from 75 to 50
+            return 75.0 - ((util_percent - UTIL_CAUTION) / (UTIL_WARNING - UTIL_CAUTION)) * 25
+        elif util_percent < UTIL_UNHEALTHY:  # 35-45%
+            # Interpolate from 50 to 25
+            return 50.0 - ((util_percent - UTIL_WARNING) / (UTIL_UNHEALTHY - UTIL_WARNING)) * 25
+        else:  # 45%+
+            # Interpolate from 25 to 0 over next 10%
+            return max(0.0, 25.0 - ((util_percent - UTIL_UNHEALTHY) / 10) * 25)
+
    def _compute_node_group_score(
        self,
        node_list: list[UnifiedNode],
@ -568,33 +606,84 @@ class MeshHealthEngine:
        else:
            infra_score = 100.0  # No infrastructure = not penalized

-        # Channel utilization (based on packet counts if available)
-        # BUG 7 FIX: Use actual Meshtastic airtime calculation
-        if has_packet_data:
+        # Channel utilization - prefer real telemetry over packet estimate
+        #
+        # Priority 1: Use firmware-reported channel_utilization from nodes
+        # This is the most accurate measure - the firmware calculates this
+        # from actual radio activity over the last minute.
+        #
+        # Priority 2: Fall back to packet count estimate if no telemetry
+        # This is a rough approximation using 200ms/packet (MediumFast preset).
+        # It's less accurate because different presets have different airtime,
+        # and it sums packets across all nodes regardless of channel.
+
+        util_percent = 0.0
+        util_max_percent = 0.0
+        util_score = 100.0
+        util_method = "none"
+        util_node_count = 0
+        util_data_available = False
+
+        # Try to get real channel_utilization from infrastructure nodes
+        # Use infrastructure nodes because they're the routers - they see the most traffic
+        util_readings = []
+        for n in infra_nodes:
+            if n.channel_utilization is not None and n.channel_utilization >= 0:
+                util_readings.append(n.channel_utilization)
+
+        # If no infra nodes have it, try all nodes
+        if not util_readings:
+            for n in node_list:
+                if n.channel_utilization is not None and n.channel_utilization >= 0:
+                    util_readings.append(n.channel_utilization)
+
+        if util_readings:
+            # Use the HIGHEST value - the busiest node is the bottleneck
+            # If one router is at 45% utilization, the mesh has a problem
+            # even if other nodes are at 10%
+            util_max_percent = max(util_readings)
+            util_percent = util_max_percent  # Use max for scoring
+            util_score = self._compute_utilization_score(util_percent)
+            util_method = "telemetry"
+            util_node_count = len(util_readings)
+            util_data_available = True
+
+            # NOTE: no average is computed here. Both util_percent and
+            # util_max_percent hold the maximum reading, because the busiest
+            # node is the bottleneck and therefore drives the score.
+
+        elif has_packet_data:
+            # Fallback: Estimate from packet counts
+            # This is a rough approximation - only use when telemetry unavailable
+            #
+            # WARNING: This method has known issues:
+            # - Assumes 200ms airtime per packet (only correct for MediumFast)
+            # - Sums packets across all nodes even on different channels
+            # - Can't distinguish retries from new packets
+            # Use real channel_utilization from telemetry when available.
+
            total_non_text_packets = sum((n.packets_sent_24h - n.text_messages_24h) for n in node_list)
-            # Average airtime per packet on MediumFast: ~200ms
-            # Total available airtime per hour: 3,600,000ms
-            # Utilization = (packets_per_hour * airtime_ms) / total_airtime_ms * 100
            packets_per_hour = total_non_text_packets / 24.0  # 24h window
            airtime_per_packet_ms = 200  # ~200ms on MediumFast preset
            util_percent = (packets_per_hour * airtime_per_packet_ms) / 3_600_000 * 100
+            util_max_percent = util_percent  # No per-node data available
+            util_score = self._compute_utilization_score(util_percent)
+            util_method = "packet_estimate"
+            util_node_count = 0
+            util_data_available = True

-            # Apply scoring thresholds with interpolation
-            if util_percent < UTIL_HEALTHY:  # <15%
-                util_score = 100.0
-            elif util_percent < UTIL_CAUTION:  # 15-20%
-                util_score = 100.0 - ((util_percent - UTIL_HEALTHY) / (UTIL_CAUTION - UTIL_HEALTHY)) * 25
-            elif util_percent < UTIL_WARNING:  # 20-25%
-                util_score = 75.0 - ((util_percent - UTIL_CAUTION) / (UTIL_WARNING - UTIL_CAUTION)) * 25
-            elif util_percent < UTIL_UNHEALTHY:  # 25-35%
-                util_score = 50.0 - ((util_percent - UTIL_WARNING) / (UTIL_UNHEALTHY - UTIL_WARNING)) * 25
-            else:  # 35%+
-                util_score = max(0.0, 25.0 - ((util_percent - UTIL_UNHEALTHY) / 10) * 25)
+            logger.debug(
+                f"Utilization using packet estimate fallback: {util_percent:.1f}% "
+                f"({total_non_text_packets} non-text packets/24h)"
+            )
        else:
-            # No packet data available - assume healthy utilization
-            # This prevents penalizing the score when we simply don't have data
+            # No utilization data available - don't penalize
            util_percent = 0.0
+            util_max_percent = 0.0
            util_score = 100.0
+            util_method = "none"
+            util_node_count = 0
+            util_data_available = False

        # Node behavior (flagged nodes)
        flagged = [n for n in node_list if (n.packets_sent_24h - n.text_messages_24h) > self.packet_threshold]
@ -674,13 +763,16 @@ class MeshHealthEngine:
            infra_online=infra_online,
            infra_total=infra_total,
            util_percent=util_percent,
+            util_max_percent=util_max_percent,
+            util_method=util_method,
+            util_node_count=util_node_count,
            coverage_avg_gateways=coverage_avg_gw,
            coverage_single_gw_count=coverage_single,
            coverage_full_count=coverage_full,
            flagged_nodes=flagged_count,
            battery_warnings=battery_warnings,
            solar_index=solar_index,
-            util_data_available=has_packet_data,
+            util_data_available=util_data_available,
            coverage_data_available=coverage_available,
        )