From e807750a72910ac5d819dc666a3eb6398b1a4052 Mon Sep 17 00:00:00 2001 From: malice Date: Sun, 7 Jun 2026 21:38:04 -0600 Subject: [PATCH] v0.10.6: extract mile_marker from itd_511 comment field as _enriched.mile_marker (#94) itd_511's free-text Comment field carries a milepost in roughly a third of the live samples ('milepost 32.5', 'MP 80 to MP 81', etc.). meshai's roads integration needs that as a structured field; wzdx and tomtom_incidents already speak in structured mile-post / from-to so itd_511 is the only adapter that needs the regex extraction layer. Design (per Step-0 review): - Shared module src/central/enrichment/mile_marker.py exporting extract(text) -> {value, source, confidence} | None. Pure regex, no I/O, re-usable by future per-state-DOT adapters (Wyoming, Montana, ...). - itd_511 calls extract on the Comment in _build_event_record; result lands under the established _enriched namespace (NOT a new _enrichment one), keyed 'mile_marker'. Same convention the supervisor's geocoder uses, same merge semantics apply_enrichment already supports. Absent when no match (no null placeholder) so subscribers can tell 'not mentioned' from 'extraction found nothing'. - Confidence tiers: 'high' (single unambiguous MP/milepost/MM match), 'medium' (multiple matches like 'MP 80 to MP 81' -- first wins), 'low' (bare 'mile N' only; consumers can ignore). Tests: - tests/test_enrichment_mile_marker.py: 36 cases parametrized over the 15 real ITD comments I pulled from CENTRAL_TRAFFIC, including the critical red-herring classes the regex must reject (phone numbers, project key numbers, state-highway numbers, date/time numbers). Crafted samples cover M.P. / MM / milemarker / bare-mile patterns not in live ITD data but required by spec for future DOT adapters. - tests/test_itd_511.py: 2 integration tests confirming the bundle is attached on a milepost-bearing Comment and absent otherwise. Pure enrichment, no schema-breaking changes; meshai's renderer picks it up additively. 
Co-authored-by: Claude Opus 4.7 (1M context) --- src/central/adapters/itd_511.py | 60 ++++---- src/central/enrichment/mile_marker.py | 78 +++++++++++ tests/test_enrichment_mile_marker.py | 192 ++++++++++++++++++++++++++ tests/test_itd_511.py | 51 +++++++ 4 files changed, 354 insertions(+), 27 deletions(-) create mode 100644 src/central/enrichment/mile_marker.py create mode 100644 tests/test_enrichment_mile_marker.py diff --git a/src/central/adapters/itd_511.py b/src/central/adapters/itd_511.py index 641b8fc..b9efa66 100644 --- a/src/central/adapters/itd_511.py +++ b/src/central/adapters/itd_511.py @@ -57,6 +57,7 @@ from tenacity import ( from central.adapter import SourceAdapter from central.config_models import AdapterConfig from central.config_store import ConfigStore +from central.enrichment import mile_marker from central.models import Event, Geo logger = logging.getLogger(__name__) @@ -296,6 +297,37 @@ class Itd511Adapter(SourceAdapter): lat, lon, rec.get("LatitudeSecondary"), rec.get("LongitudeSecondary"), rec.get("EncodedPolyline"), ) + comment = _strip_or_none(rec.get("Comment")) + data: dict[str, Any] = { + "event_type_short": et, + "event_sub_type": _strip_or_none(rec.get("EventSubType")), + "roadway_name": _strip_or_none(rec.get("RoadwayName")), + "direction": _strip_or_none(rec.get("DirectionOfTravel")), + "description": _strip_or_none(rec.get("Description")), + "lanes_affected": _strip_or_none(rec.get("LanesAffected")), + "is_full_closure": bool(rec.get("IsFullClosure")), + "itd_severity": rec.get("Severity"), + "comment": comment, + "cause": _strip_or_none(rec.get("Cause")), + "organization": rec.get("Organization"), + "recurrence_text": _strip_or_none(rec.get("Recurrence")), + "recurrence_schedules": rec.get("RecurrenceSchedules") or [], + "restrictions": rec.get("Restrictions") or {}, + "detour_polyline": rec.get("DetourPolyline") or None, + "detour_instructions": _strip_or_none(rec.get("DetourInstructions")), + "encoded_polyline": 
rec.get("EncodedPolyline"), + "id_internal": rec.get("ID"), + "source_id": rec.get("SourceId"), + "reported_epoch": rec.get("Reported"), + "last_updated_epoch": rec.get("LastUpdated"), + "start_epoch": rec.get("StartDate"), + "planned_end_epoch": rec.get("PlannedEndDate"), + "latitude": lat, + "longitude": lon, + } + mm = mile_marker.extract(comment) + if mm is not None: + data.setdefault("_enriched", {})["mile_marker"] = mm return Event( id=f"idaho_511:event:{source_id}", adapter=self.name, @@ -310,33 +342,7 @@ class Itd511Adapter(SourceAdapter): centroid=centroid, geometry=geom, regions=["US-ID"], primary_region="US-ID", ), - data={ - "event_type_short": et, - "event_sub_type": _strip_or_none(rec.get("EventSubType")), - "roadway_name": _strip_or_none(rec.get("RoadwayName")), - "direction": _strip_or_none(rec.get("DirectionOfTravel")), - "description": _strip_or_none(rec.get("Description")), - "lanes_affected": _strip_or_none(rec.get("LanesAffected")), - "is_full_closure": bool(rec.get("IsFullClosure")), - "itd_severity": rec.get("Severity"), - "comment": _strip_or_none(rec.get("Comment")), - "cause": _strip_or_none(rec.get("Cause")), - "organization": rec.get("Organization"), - "recurrence_text": _strip_or_none(rec.get("Recurrence")), - "recurrence_schedules": rec.get("RecurrenceSchedules") or [], - "restrictions": rec.get("Restrictions") or {}, - "detour_polyline": rec.get("DetourPolyline") or None, - "detour_instructions": _strip_or_none(rec.get("DetourInstructions")), - "encoded_polyline": rec.get("EncodedPolyline"), - "id_internal": rec.get("ID"), - "source_id": rec.get("SourceId"), - "reported_epoch": rec.get("Reported"), - "last_updated_epoch": rec.get("LastUpdated"), - "start_epoch": rec.get("StartDate"), - "planned_end_epoch": rec.get("PlannedEndDate"), - "latitude": lat, - "longitude": lon, - }, + data=data, ) def _build_advisory_record(self, rec: dict[str, Any]) -> Event | None: diff --git a/src/central/enrichment/mile_marker.py 
"""Mile-marker extraction from free-text comment fields.

Used by DOT adapters (itd_511 today; future per-state DOTs) to pull a
mile-marker value out of upstream freeform comments. ``extract`` returns
``None`` when nothing matches -- the caller is expected to omit the field
entirely rather than write a null placeholder, so the presence of the key
always means a value was actually extracted.

Confidence tiers (v0.10.6 spec):

- ``high``: exactly one unambiguous keyword+number match (``milepost`` /
  ``MP`` / ``MM`` / ``mile marker`` etc.)
- ``medium``: two or more unambiguous matches in the same comment
  (e.g. a range like ``MP 80 to MP 81``); first match wins
- ``low``: no unambiguous match but a bare ``mile N`` token is present;
  consumers may choose to ignore low-confidence extractions

Shared module by design -- the regex is not Idaho-specific, so other
adapters can call ``from central.enrichment.mile_marker import extract``.
"""

from __future__ import annotations

import re
from typing import Any

# Unambiguous keyword forms. Each alternative is one keyword family; the
# trailing ``\s+ (\d+ (?:\.\d+)?)`` captures the numeric value. The leading
# ``\b`` keeps the keyword from matching as the tail of a longer word
# ("amp 5", "stamp 5"), and the mandatory ``\s+`` keeps "miles 5" /
# "milestone 5" out because the keyword must be followed by whitespace.
_KEYWORD_PATTERN = re.compile(
    r"""
    \b
    (?:
        m \.? p \.?        |   # MP, M.P., MP., M.P
        m \.? m \.?        |   # MM, M.M., MM., M.M
        mile [\s-]* post   |   # milepost, mile post, mile-post
        mile [\s-]* marker     # milemarker, mile marker, mile-marker
    )
    \s+
    ( \d+ (?: \. \d+ )? )
    \b
    """,
    re.IGNORECASE | re.VERBOSE,
)

# Low-confidence fallback: bare ``mile N``. The word boundary at the start
# stops the prefix from matching inside a longer word, and the ``\s+``
# between ``mile`` and the digit excludes ``milepost`` / ``milemarker`` /
# ``miles``, where a letter follows ``mile`` instead of whitespace.
_BARE_MILE_PATTERN = re.compile(
    r"\bmile\s+(\d+(?:\.\d+)?)\b",
    re.IGNORECASE,
)

# Value of the ``source`` key on every result built by this module.
_SOURCE = "comment_regex"


def extract(text: str | None) -> dict[str, Any] | None:
    """Return ``{value, source, confidence}`` if a mile marker is found.

    Pure function, no I/O. Never raises on malformed input: ``None``, the
    empty string, and any non-``str`` value (number, bytes, list, ...) all
    yield ``None``, so callers can hand over a raw upstream field without a
    try/except.

    Args:
        text: Free-text comment to scan.

    Returns:
        ``None`` when no pattern matches. Otherwise a dict with ``value``
        (the first matched number, as ``float``), ``source`` (always
        ``"comment_regex"``) and ``confidence``: ``"high"`` for exactly one
        keyword match, ``"medium"`` for several keyword matches (the first
        one is reported), ``"low"`` when only a bare ``mile N`` matched.
    """
    # The isinstance check is what makes "never raises" true: a truthy
    # non-string would otherwise reach the regex engine and raise TypeError.
    if not isinstance(text, str) or not text:
        return None

    keyword_hits = _KEYWORD_PATTERN.findall(text)
    if keyword_hits:
        confidence = "high" if len(keyword_hits) == 1 else "medium"
        return {
            "value": float(keyword_hits[0]),
            "source": _SOURCE,
            "confidence": confidence,
        }

    # Only the first bare match is reported, so a single search suffices.
    bare_hit = _BARE_MILE_PATTERN.search(text)
    if bare_hit is not None:
        return {
            "value": float(bare_hit.group(1)),
            "source": _SOURCE,
            "confidence": "low",
        }

    return None
# Extractor tests: real ITD comment samples drive the high / medium /
# no-match tiers; crafted samples cover the keyword spellings and the bare
# ``mile N`` fallback that the spec requires.

from __future__ import annotations

import pytest

from central.enrichment.mile_marker import extract


def _assert_extracted(text, value, confidence, source=None):
    """Assert that ``extract(text)`` yields ``value`` at ``confidence``.

    ``source`` is checked only when given, between the value and the
    confidence checks.
    """
    hit = extract(text)
    assert hit is not None
    assert hit["value"] == value
    if source is not None:
        assert hit["source"] == source
    assert hit["confidence"] == confidence


# --- High-confidence: single unambiguous match (real ITD comments) -----------

_REAL_SINGLE_MATCH = [
    (
        "Emergency vehicles blocking the right lane and right shoulder, "
        "eastbound I-84 near milepost 32.5. Keep left.",
        32.5,
    ),
    ("Crash on westbound I-84 at milepost 54. One right lane blocked.", 54.0),
    (
        "Crash westbound I-84 milepost 42 blocking the right two lanes. "
        "Expect delays, use caution and keep left.",
        42.0,
    ),
    (
        "A crash is blocking all lanes on Highway 21, near milepost 10, "
        "before Lucky Peak State Park.",
        10.0,
    ),
    ("All directions of travel blocked SH 21 milepost 15 due to a crash.", 15.0),
]


@pytest.mark.parametrize("text, expected_value", _REAL_SINGLE_MATCH)
def test_high_confidence_real_samples(text, expected_value):
    """One keyword match in a real comment -> that value at 'high'."""
    _assert_extracted(text, expected_value, "high", source="comment_regex")


def test_red_herring_road_numbers_in_real_sample():
    """US-20 / E 200 St must NOT shadow the actual milepost 320."""
    comment = (
        "A crash is blocking the rightmost lane of US-20 at milepost 320, "
        "near E 200 St. Keep Left."
    )
    _assert_extracted(comment, 320.0, "high")


# --- Medium-confidence: range / multi-match (real ITD comments) --------------

_REAL_MULTI_MATCH = [
    ("6/6 - 6/8 Southbound Left Lane Closure (slow) from MP 80 to MP 81.", 80.0),
    (
        "Northbound Left Lane Closure from MP 72.6 to MP 76.25 from "
        "7:00 PM to 6:00 AM.",
        72.6,
    ),
]


@pytest.mark.parametrize("text, expected_first_value", _REAL_MULTI_MATCH)
def test_medium_confidence_real_samples(text, expected_first_value):
    """Several keyword matches -> first value wins, tier is 'medium'."""
    _assert_extracted(text, expected_first_value, "medium")


# --- No-match: real ITD comments that must NOT yield a value -----------------

_REAL_NO_MATCH = [
    "40th St will be closed for work on water lines.",
    "Bridge Repair",
    "Bridge Maintenance. ITD- Phil Etchart, 208-490-4593, "
    "Jason Fisher, 208-420-8328.",
    "ITD Project Key Number 21832 McCammon IC to Old US-91",
    "Sunday, June 28, 2026, from approximately 9:45 AM to 10:30 AM. "
    "Traffic restrictions will be lifted as the motorcade passes.",
]


@pytest.mark.parametrize("text", _REAL_NO_MATCH)
def test_no_match_real_samples(text):
    """Street, phone, project-key and date/time numbers are not mile markers."""
    assert extract(text) is None


# --- Crafted high-confidence patterns (spec, not in live ITD data) -----------

_CRAFTED_SINGLE_MATCH = [
    ("Eastbound MP 32", 32.0),
    ("Closure at M.P. 32", 32.0),
    ("Crash near M.P 32 today", 32.0),
    ("Slow at MM 32", 32.0),
    ("Slow at M.M. 32", 32.0),
    ("milepost 32", 32.0),
    ("mile post 32", 32.0),
    ("mile-post 32", 32.0),
    ("milemarker 32", 32.0),
    ("mile marker 32", 32.0),
    ("mile-marker 32", 32.0),
    ("milepost 32.5", 32.5),
]


@pytest.mark.parametrize("text, expected_value", _CRAFTED_SINGLE_MATCH)
def test_crafted_high_confidence_patterns(text, expected_value):
    """Every supported keyword spelling extracts at 'high'."""
    _assert_extracted(text, expected_value, "high")


# --- Crafted medium-confidence: multiple unambiguous matches -----------------


def test_crafted_medium_confidence_multiple_mp():
    """Two MP tokens in one comment -> first value, 'medium'."""
    _assert_extracted("Closure from MP 5 to MP 9.", 5.0, "medium")


def test_crafted_medium_mixed_keywords():
    """Mixed unambiguous keyword forms both count -> medium, first wins."""
    _assert_extracted("milepost 5 and mile marker 10 affected.", 5.0, "medium")


# --- Crafted low-confidence: bare 'mile N' (spec, not in live data) ----------


def test_crafted_low_confidence_bare_mile():
    """Bare 'mile N' without MP/milepost context -- extract at 'low'."""
    _assert_extracted("Crash near mile 14", 14.0, "low")


def test_crafted_low_confidence_bare_mile_with_decimal():
    """Bare 'mile N.N' keeps its decimal part and stays at 'low'."""
    _assert_extracted("Slowdown near mile 14.5 today.", 14.5, "low")


# --- Crafted: tier precedence ------------------------------------------------


def test_high_keyword_beats_bare_mile_in_same_text():
    """If a high-conf keyword matches, bare 'mile N' is not consulted."""
    _assert_extracted(
        "Crash near milepost 22, also affecting mile 14 detour.", 22.0, "high"
    )


# --- Edge cases
-------------------------------------------------------------- + + +def test_empty_string(): + assert extract("") is None + + +def test_none_input(): + assert extract(None) is None + + +def test_numbers_without_keyword_never_match(): + """Standalone numbers without an MP/mile keyword must not match.""" + assert extract("Highway 21, US-20, 208-555-1234, exit 84.") is None + + +def test_case_insensitive(): + """Keywords must match regardless of capitalization.""" + result = extract("CRASH at MILEPOST 50.") + assert result is not None + assert result["value"] == 50.0 + assert result["confidence"] == "high" + + +def test_substring_keywords_do_not_match(): + """'amp', 'stamp', 'miles', 'milestone' must not match the keyword regex.""" + assert extract("The amp 50 was loud.") is None + assert extract("Stamp 50 on the document.") is None + assert extract("Miles 50 traveled.") is None + assert extract("Milestone 50 reached.") is None + + +def test_result_dict_shape(): + """Result has exactly {value: float, source: 'comment_regex', confidence: str}.""" + result = extract("milepost 32.5") + assert result is not None + assert set(result.keys()) == {"value", "source", "confidence"} + assert isinstance(result["value"], float) + assert result["source"] == "comment_regex" + assert result["confidence"] in {"high", "medium", "low"} diff --git a/tests/test_itd_511.py b/tests/test_itd_511.py index 50f8fc3..4857d9c 100644 --- a/tests/test_itd_511.py +++ b/tests/test_itd_511.py @@ -396,3 +396,54 @@ def test_tenacity_decorator_has_explicit_no_log_hooks(): assert retrying.after is after_nothing assert retrying.before is before_nothing assert retrying.reraise is True + + +# --- v0.10.6: mile_marker enrichment on incident events --------------------- + + +def _rec_with_comment(comment: str | None) -> dict: + """Minimal /get/event record with a settable Comment field.""" + return { + "SourceId": "test-mm-1", + "EventType": "accidentsAndIncidents", + "Comment": comment, + "Latitude": 43.6, + 
"Longitude": -116.2, + "Severity": "Minor", + } + + +def test_build_event_attaches_mile_marker_when_comment_has_milepost(adapter): + """Comment with a milepost keyword -> _enriched.mile_marker populated. + + v0.10.6: the adapter calls central.enrichment.mile_marker.extract on + the Comment field; the result lands under the existing _enriched + namespace (same convention the supervisor's geocoder uses), keyed by + 'mile_marker'. Asserts the bundle is present and matches the + {value, source, confidence} contract. + """ + rec = _rec_with_comment( + "Crash on westbound I-84 at milepost 54. One right lane blocked." + ) + e = adapter._build_event_record(rec) + assert e is not None + bundle = e.data.get("_enriched", {}).get("mile_marker") + assert bundle is not None, "expected _enriched.mile_marker on milepost-bearing comment" + assert bundle["value"] == 54.0 + assert bundle["source"] == "comment_regex" + assert bundle["confidence"] == "high" + + +def test_build_event_omits_mile_marker_when_comment_has_none(adapter): + """No MP/mile keyword -> _enriched.mile_marker ABSENT (no null placeholder). + + Subscribers can therefore distinguish 'no MP mentioned' from + 'extraction ran and found nothing'. Also covers the missing-Comment path. + """ + no_match = adapter._build_event_record(_rec_with_comment("Bridge Repair")) + assert no_match is not None + assert "mile_marker" not in no_match.data.get("_enriched", {}) + + missing = adapter._build_event_record(_rec_with_comment(None)) + assert missing is not None + assert "mile_marker" not in missing.data.get("_enriched", {})