From e807750a72910ac5d819dc666a3eb6398b1a4052 Mon Sep 17 00:00:00 2001
From: malice <matt@echo6.co>
Date: Sun, 7 Jun 2026 21:38:04 -0600
Subject: [PATCH] v0.10.6: extract mile_marker from itd_511 comment field as
 _enriched.mile_marker (#94)

itd_511's free-text Comment field carries a milepost in roughly a third of
the live samples ('milepost 32.5', 'MP 80 to MP 81', etc.). meshai's roads
integration needs that as a structured field; wzdx and tomtom_incidents
already speak in structured mile-post / from-to so itd_511 is the only
adapter that needs the regex extraction layer.

Design (per Step-0 review):
- Shared module src/central/enrichment/mile_marker.py exporting
  extract(text) -> {value, source, confidence} | None. Pure regex, no I/O,
  re-usable by future per-state-DOT adapters (Wyoming, Montana, ...).
- itd_511 calls extract on the Comment in _build_event_record; result lands
  under the established _enriched namespace (NOT a new _enrichment one),
  keyed 'mile_marker'. Same convention the supervisor's geocoder uses, same
  merge semantics apply_enrichment already supports. Absent when no match
  (no null placeholder), so for subscribers the key's presence always
  means a mile marker was actually extracted.
- Confidence tiers: 'high' (single unambiguous MP/milepost/MM match),
  'medium' (multiple matches like 'MP 80 to MP 81' -- first wins), 'low'
  (bare 'mile N' only; consumers can ignore).

Tests:
- tests/test_enrichment_mile_marker.py: 36 cases, 13 of them built on real
  ITD comments I pulled from CENTRAL_TRAFFIC, including the critical
  red-herring classes the regex must reject (phone numbers, project key
  numbers, state-highway numbers, date/time numbers). Crafted samples
  cover M.P. / MM / milemarker / bare-mile patterns not in live ITD data
  but required by spec for future DOT adapters.
- tests/test_itd_511.py: 2 integration tests confirming the bundle is
  attached on a milepost-bearing Comment and absent otherwise.

Pure enrichment, no schema-breaking changes; meshai's renderer picks it up
additively.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/central/adapters/itd_511.py       |  60 ++++----
 src/central/enrichment/mile_marker.py |  78 +++++++++++
 tests/test_enrichment_mile_marker.py  | 192 ++++++++++++++++++++++++++
 tests/test_itd_511.py                 |  51 +++++++
 4 files changed, 354 insertions(+), 27 deletions(-)
 create mode 100644 src/central/enrichment/mile_marker.py
 create mode 100644 tests/test_enrichment_mile_marker.py

diff --git a/src/central/adapters/itd_511.py b/src/central/adapters/itd_511.py
index 641b8fc..b9efa66 100644
--- a/src/central/adapters/itd_511.py
+++ b/src/central/adapters/itd_511.py
@@ -57,6 +57,7 @@ from tenacity import (
 from central.adapter import SourceAdapter
 from central.config_models import AdapterConfig
 from central.config_store import ConfigStore
+from central.enrichment import mile_marker
 from central.models import Event, Geo
 
 logger = logging.getLogger(__name__)
@@ -296,6 +297,37 @@ class Itd511Adapter(SourceAdapter):
             lat, lon, rec.get("LatitudeSecondary"), rec.get("LongitudeSecondary"),
             rec.get("EncodedPolyline"),
         )
+        comment = _strip_or_none(rec.get("Comment"))
+        data: dict[str, Any] = {
+            "event_type_short": et,
+            "event_sub_type": _strip_or_none(rec.get("EventSubType")),
+            "roadway_name": _strip_or_none(rec.get("RoadwayName")),
+            "direction": _strip_or_none(rec.get("DirectionOfTravel")),
+            "description": _strip_or_none(rec.get("Description")),
+            "lanes_affected": _strip_or_none(rec.get("LanesAffected")),
+            "is_full_closure": bool(rec.get("IsFullClosure")),
+            "itd_severity": rec.get("Severity"),
+            "comment": comment,
+            "cause": _strip_or_none(rec.get("Cause")),
+            "organization": rec.get("Organization"),
+            "recurrence_text": _strip_or_none(rec.get("Recurrence")),
+            "recurrence_schedules": rec.get("RecurrenceSchedules") or [],
+            "restrictions": rec.get("Restrictions") or {},
+            "detour_polyline": rec.get("DetourPolyline") or None,
+            "detour_instructions": _strip_or_none(rec.get("DetourInstructions")),
+            "encoded_polyline": rec.get("EncodedPolyline"),
+            "id_internal": rec.get("ID"),
+            "source_id": rec.get("SourceId"),
+            "reported_epoch": rec.get("Reported"),
+            "last_updated_epoch": rec.get("LastUpdated"),
+            "start_epoch": rec.get("StartDate"),
+            "planned_end_epoch": rec.get("PlannedEndDate"),
+            "latitude": lat,
+            "longitude": lon,
+        }
+        mm = mile_marker.extract(comment)
+        if mm is not None:
+            data.setdefault("_enriched", {})["mile_marker"] = mm
         return Event(
             id=f"idaho_511:event:{source_id}",
             adapter=self.name,
@@ -310,33 +342,7 @@ class Itd511Adapter(SourceAdapter):
                 centroid=centroid, geometry=geom,
                 regions=["US-ID"], primary_region="US-ID",
             ),
-            data={
-                "event_type_short": et,
-                "event_sub_type": _strip_or_none(rec.get("EventSubType")),
-                "roadway_name": _strip_or_none(rec.get("RoadwayName")),
-                "direction": _strip_or_none(rec.get("DirectionOfTravel")),
-                "description": _strip_or_none(rec.get("Description")),
-                "lanes_affected": _strip_or_none(rec.get("LanesAffected")),
-                "is_full_closure": bool(rec.get("IsFullClosure")),
-                "itd_severity": rec.get("Severity"),
-                "comment": _strip_or_none(rec.get("Comment")),
-                "cause": _strip_or_none(rec.get("Cause")),
-                "organization": rec.get("Organization"),
-                "recurrence_text": _strip_or_none(rec.get("Recurrence")),
-                "recurrence_schedules": rec.get("RecurrenceSchedules") or [],
-                "restrictions": rec.get("Restrictions") or {},
-                "detour_polyline": rec.get("DetourPolyline") or None,
-                "detour_instructions": _strip_or_none(rec.get("DetourInstructions")),
-                "encoded_polyline": rec.get("EncodedPolyline"),
-                "id_internal": rec.get("ID"),
-                "source_id": rec.get("SourceId"),
-                "reported_epoch": rec.get("Reported"),
-                "last_updated_epoch": rec.get("LastUpdated"),
-                "start_epoch": rec.get("StartDate"),
-                "planned_end_epoch": rec.get("PlannedEndDate"),
-                "latitude": lat,
-                "longitude": lon,
-            },
+            data=data,
         )
 
     def _build_advisory_record(self, rec: dict[str, Any]) -> Event | None:
diff --git a/src/central/enrichment/mile_marker.py b/src/central/enrichment/mile_marker.py
new file mode 100644
index 0000000..ce89711
--- /dev/null
+++ b/src/central/enrichment/mile_marker.py
@@ -0,0 +1,78 @@
+"""Mile-marker extraction from free-text comment fields.
+
+Used by DOT adapters (itd_511 today; future per-state DOTs) to pull a
+mile-marker value out of upstream freeform comments. Returns ``None`` when
+no match -- the caller is expected to omit the field entirely rather than
+write a null placeholder, so for subscribers the key's presence always
+means a mile marker was actually extracted.
+
+Confidence tiers (v0.10.6 spec):
+
+- ``high``:   exactly one unambiguous keyword+number match (``milepost`` /
+              ``MP`` / ``MM`` / ``mile marker`` etc.)
+- ``medium``: two or more unambiguous matches in the same comment
+              (e.g. a range like ``MP 80 to MP 81``); first match wins
+- ``low``:    no unambiguous match but a bare ``mile N`` token is present;
+              consumers may choose to ignore low-confidence extractions
+
+Shared module by design -- regex is universal, not Idaho-specific, so
+future per-state-DOT adapters (Wyoming, Montana, etc.) call
+``from central.enrichment.mile_marker import extract``.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+# Unambiguous keyword forms. Each branch is a keyword family; the trailing
+# ``\s+ (\d+ (?:\.\d+)?)`` captures the value. The leading ``\b`` stops the
+# short ``mp`` / ``mm`` forms from matching the tail of a longer word (so
+# "amp 5" / "stamp 5" never match); the closing ``\b`` bounds the number.
+_KEYWORD_PATTERN = re.compile(
+    r"""
+    \b
+    (?:
+        m \.? p \.?            |   # MP, M.P., MP., M.P
+        m \.? m \.?            |   # MM, M.M., MM., M.M
+        mile [\s-]* post       |   # milepost, mile post, mile-post
+        mile [\s-]* marker         # milemarker, mile marker, mile-marker
+    )
+    \s+
+    ( \d+ (?: \. \d+ )? )
+    \b
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+
+# Low-confidence fallback: bare ``mile N``. The leading ``\b`` stops matches
+# that would start mid-word (e.g. "smile 5"); requiring ``\s+`` straight
+# after ``mile`` is what excludes ``milepost`` / ``milemarker`` / ``miles``.
+_BARE_MILE_PATTERN = re.compile(
+    r"\bmile\s+(\d+(?:\.\d+)?)\b",
+    re.IGNORECASE,
+)
+
+_SOURCE = "comment_regex"  # written to the "source" key of every result
+
+
+def extract(text: str | None) -> dict[str, Any] | None:
+    """Return ``{value, source, confidence}`` if a mile marker is found, else ``None``.
+
+    Pure function, no I/O. Never raises on malformed input: anything that is
+    not a non-empty ``str`` (``None``, ``bytes``, numbers, ...) yields ``None``.
+    """
+    if not isinstance(text, str) or not text:
+        return None
+
+    # Several keyword hits (e.g. a range ``MP 80 to MP 81``): first wins, "medium".
+    keyword_hits = _KEYWORD_PATTERN.findall(text)
+    if keyword_hits:
+        tier = "high" if len(keyword_hits) == 1 else "medium"
+        return {"value": float(keyword_hits[0]), "source": _SOURCE, "confidence": tier}
+
+    bare_hits = _BARE_MILE_PATTERN.findall(text)
+    if bare_hits:
+        return {"value": float(bare_hits[0]), "source": _SOURCE, "confidence": "low"}
+
+    return None
diff --git a/tests/test_enrichment_mile_marker.py b/tests/test_enrichment_mile_marker.py
new file mode 100644
index 0000000..3c18e12
--- /dev/null
+++ b/tests/test_enrichment_mile_marker.py
@@ -0,0 +1,192 @@
+"""Tests for v0.10.6 mile-marker regex extraction.
+
+Coverage strategy:
+- Real ITD samples drive the parametrize for high/medium/no-match tiers --
+  these are the comments that actually appear on CENTRAL_TRAFFIC events
+  and include the red-herring classes (phone numbers, project key numbers,
+  state-highway numbers, date/time numbers) that the regex must reject.
+- Crafted samples cover the M.P./MM/milemarker/bare-mile patterns the spec
+  requires for future per-state-DOT adapters even though they're not in
+  live ITD data today.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from central.enrichment.mile_marker import extract
+
+
+# --- High-confidence: single unambiguous match (real ITD comments) -----------
+
+
+@pytest.mark.parametrize("text, expected_value", [
+    ("Emergency vehicles blocking the right lane and right shoulder, "
+     "eastbound I-84 near milepost 32.5. Keep left.", 32.5),
+    ("Crash on westbound I-84 at milepost 54.  One right lane blocked.", 54.0),
+    ("Crash westbound I-84 milepost 42 blocking the right two lanes. "
+     "Expect delays, use caution and keep left.", 42.0),
+    ("A crash is blocking all lanes on Highway 21, near milepost 10, "
+     "before Lucky Peak State Park.", 10.0),
+    ("All directions of travel blocked SH 21 milepost 15 due to a crash.", 15.0),
+])
+def test_high_confidence_real_samples(text, expected_value):
+    result = extract(text)
+    assert result is not None
+    assert result["value"] == expected_value
+    assert result["source"] == "comment_regex"
+    assert result["confidence"] == "high"
+
+
+def test_red_herring_road_numbers_in_real_sample():
+    """US-20 / E 200 St must NOT shadow the actual milepost 320."""
+    text = ("A crash is blocking the rightmost lane of US-20 at milepost 320, "
+            "near E 200 St. Keep Left.")
+    result = extract(text)
+    assert result is not None
+    assert result["value"] == 320.0
+    assert result["confidence"] == "high"
+
+
+# --- Medium-confidence: range / multi-match (real ITD comments) --------------
+
+
+@pytest.mark.parametrize("text, expected_first_value", [
+    ("6/6 - 6/8 Southbound Left Lane Closure (slow) from MP 80 to MP 81.", 80.0),
+    ("Northbound Left Lane Closure from MP 72.6 to MP 76.25 from "
+     "7:00 PM to 6:00 AM.", 72.6),
+])
+def test_medium_confidence_real_samples(text, expected_first_value):
+    result = extract(text)
+    assert result is not None
+    assert result["value"] == expected_first_value
+    assert result["confidence"] == "medium"
+
+
+# --- No-match: real ITD comments that must NOT yield a value -----------------
+
+
+@pytest.mark.parametrize("text", [
+    "40th St will be closed for work on water lines.",
+    "Bridge Repair",
+    "Bridge Maintenance. ITD- Phil Etchart, 208-490-4593, "
+    "Jason Fisher, 208-420-8328.",
+    "ITD Project Key Number 21832 McCammon IC to Old US-91",
+    "Sunday, June 28, 2026, from approximately 9:45 AM to 10:30 AM. "
+    "Traffic restrictions will be lifted as the motorcade passes.",
+])
+def test_no_match_real_samples(text):
+    assert extract(text) is None
+
+
+# --- Crafted high-confidence patterns (spec, not in live ITD data) -----------
+
+
+@pytest.mark.parametrize("text, expected_value", [
+    ("Eastbound MP 32", 32.0),
+    ("Closure at M.P. 32", 32.0),
+    ("Crash near M.P 32 today", 32.0),
+    ("Slow at MM 32", 32.0),
+    ("Slow at M.M. 32", 32.0),
+    ("milepost 32", 32.0),
+    ("mile post 32", 32.0),
+    ("mile-post 32", 32.0),
+    ("milemarker 32", 32.0),
+    ("mile marker 32", 32.0),
+    ("mile-marker 32", 32.0),
+    ("milepost 32.5", 32.5),
+])
+def test_crafted_high_confidence_patterns(text, expected_value):
+    result = extract(text)
+    assert result is not None
+    assert result["value"] == expected_value
+    assert result["confidence"] == "high"
+
+
+# --- Crafted medium-confidence: multiple unambiguous matches -----------------
+
+
+def test_crafted_medium_confidence_multiple_mp():
+    result = extract("Closure from MP 5 to MP 9.")
+    assert result is not None
+    assert result["value"] == 5.0
+    assert result["confidence"] == "medium"
+
+
+def test_crafted_medium_mixed_keywords():
+    """Mixed unambiguous keyword forms both count -> medium, first wins."""
+    result = extract("milepost 5 and mile marker 10 affected.")
+    assert result is not None
+    assert result["value"] == 5.0
+    assert result["confidence"] == "medium"
+
+
+# --- Crafted low-confidence: bare 'mile N' (spec, not in live data) ----------
+
+
+def test_crafted_low_confidence_bare_mile():
+    """Bare 'mile N' without MP/milepost context -- extract at 'low'."""
+    result = extract("Crash near mile 14")
+    assert result is not None
+    assert result["value"] == 14.0
+    assert result["confidence"] == "low"
+
+
+def test_crafted_low_confidence_bare_mile_with_decimal():
+    result = extract("Slowdown near mile 14.5 today.")
+    assert result is not None
+    assert result["value"] == 14.5
+    assert result["confidence"] == "low"
+
+
+# --- Crafted: tier precedence ------------------------------------------------
+
+
+def test_high_keyword_beats_bare_mile_in_same_text():
+    """If a high-conf keyword matches, bare 'mile N' is not consulted."""
+    result = extract("Crash near milepost 22, also affecting mile 14 detour.")
+    assert result is not None
+    assert result["value"] == 22.0
+    assert result["confidence"] == "high"
+
+
+# --- Edge cases --------------------------------------------------------------
+
+
+def test_empty_string():
+    assert extract("") is None
+
+
+def test_none_input():
+    assert extract(None) is None
+
+
+def test_numbers_without_keyword_never_match():
+    """Standalone numbers without an MP/mile keyword must not match."""
+    assert extract("Highway 21, US-20, 208-555-1234, exit 84.") is None
+
+
+def test_case_insensitive():
+    """Keywords must match regardless of capitalization."""
+    result = extract("CRASH at MILEPOST 50.")
+    assert result is not None
+    assert result["value"] == 50.0
+    assert result["confidence"] == "high"
+
+
+def test_substring_keywords_do_not_match():
+    """'amp', 'stamp', 'miles', 'milestone' must not match either regex."""
+    assert extract("The amp 50 was loud.") is None
+    assert extract("Stamp 50 on the document.") is None
+    assert extract("Miles 50 traveled.") is None
+    assert extract("Milestone 50 reached.") is None
+
+
+def test_result_dict_shape():
+    """Result has exactly {value: float, source: 'comment_regex', confidence: str}."""
+    result = extract("milepost 32.5")
+    assert result is not None
+    assert set(result.keys()) == {"value", "source", "confidence"}
+    assert isinstance(result["value"], float)
+    assert result["source"] == "comment_regex"
+    assert result["confidence"] in {"high", "medium", "low"}
diff --git a/tests/test_itd_511.py b/tests/test_itd_511.py
index 50f8fc3..4857d9c 100644
--- a/tests/test_itd_511.py
+++ b/tests/test_itd_511.py
@@ -396,3 +396,54 @@ def test_tenacity_decorator_has_explicit_no_log_hooks():
     assert retrying.after is after_nothing
     assert retrying.before is before_nothing
     assert retrying.reraise is True
+
+
+# --- v0.10.6: mile_marker enrichment on incident events ---------------------
+
+
+def _rec_with_comment(comment: str | None) -> dict:
+    """Minimal /get/event record with a settable Comment field."""
+    return {
+        "SourceId": "test-mm-1",
+        "EventType": "accidentsAndIncidents",
+        "Comment": comment,
+        "Latitude": 43.6,
+        "Longitude": -116.2,
+        "Severity": "Minor",
+    }
+
+
+def test_build_event_attaches_mile_marker_when_comment_has_milepost(adapter):
+    """Comment with a milepost keyword -> _enriched.mile_marker populated.
+
+    v0.10.6: the adapter calls central.enrichment.mile_marker.extract on
+    the Comment field; the result lands under the existing _enriched
+    namespace (same convention the supervisor's geocoder uses), keyed by
+    'mile_marker'. Asserts the bundle is present and matches the
+    {value, source, confidence} contract.
+    """
+    rec = _rec_with_comment(
+        "Crash on westbound I-84 at milepost 54.  One right lane blocked."
+    )
+    e = adapter._build_event_record(rec)
+    assert e is not None
+    bundle = e.data.get("_enriched", {}).get("mile_marker")
+    assert bundle is not None, "expected _enriched.mile_marker on milepost-bearing comment"
+    assert bundle["value"] == 54.0
+    assert bundle["source"] == "comment_regex"
+    assert bundle["confidence"] == "high"
+
+
+def test_build_event_omits_mile_marker_when_comment_has_none(adapter):
+    """No MP/mile keyword -> _enriched.mile_marker ABSENT (no null placeholder).
+
+    For subscribers the key's presence therefore always means a mile marker
+    was actually extracted. Also covers the missing-Comment path.
+    """
+    no_match = adapter._build_event_record(_rec_with_comment("Bridge Repair"))
+    assert no_match is not None
+    assert "mile_marker" not in no_match.data.get("_enriched", {})
+
+    missing = adapter._build_event_record(_rec_with_comment(None))
+    assert missing is not None
+    assert "mile_marker" not in missing.data.get("_enriched", {})