v0.10.6: extract mile_marker from itd_511 comment field as _enriched.mile_marker (#94)

itd_511's free-text Comment field carries a milepost in roughly a third of
the live samples ('milepost 32.5', 'MP 80 to MP 81', etc.). meshai's roads
integration needs that as a structured field; wzdx and tomtom_incidents
already speak in structured mile-post / from-to so itd_511 is the only
adapter that needs the regex extraction layer.

Design (per Step-0 review):
- Shared module src/central/enrichment/mile_marker.py exporting
  extract(text) -> {value, source, confidence} | None. Pure regex, no I/O,
  re-usable by future per-state-DOT adapters (Wyoming, Montana, ...).
- itd_511 calls extract on the Comment in _build_event_record; result lands
  under the established _enriched namespace (NOT a new _enrichment one),
  keyed 'mile_marker'. Same convention the supervisor's geocoder uses, same
  merge semantics apply_enrichment already supports. Absent when no match
  (no null placeholder) so subscribers can tell 'not mentioned' from
  'extraction found nothing'.
- Confidence tiers: 'high' (single unambiguous MP/milepost/MM match),
  'medium' (multiple matches like 'MP 80 to MP 81' -- first wins), 'low'
  (bare 'mile N' only; consumers can ignore).

Tests:
- tests/test_enrichment_mile_marker.py: 36 cases parametrized over the 15
  real ITD comments I pulled from CENTRAL_TRAFFIC, including the critical
  red-herring classes the regex must reject (phone numbers, project key
  numbers, state-highway numbers, date/time numbers). Crafted samples
  cover M.P. / MM / milemarker / bare-mile patterns not in live ITD data
  but required by spec for future DOT adapters.
- tests/test_itd_511.py: 2 integration tests confirming the bundle is
  attached on a milepost-bearing Comment and absent otherwise.

Pure enrichment, no schema-breaking changes; meshai's renderer picks it up
additively.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
malice 2026-06-07 21:38:04 -06:00 committed by GitHub
commit e807750a72
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 354 additions and 27 deletions

View file

@ -0,0 +1,192 @@
"""Tests for v0.10.6 mile-marker regex extraction.

Coverage strategy:
- Real ITD samples drive the parametrize for high/medium/no-match tiers --
  these are the comments that actually appear on CENTRAL_TRAFFIC events
  and include the red-herring classes (phone numbers, project key numbers,
  state-highway numbers, date/time numbers) that the regex must reject.
- Crafted samples cover the M.P./MM/milemarker/bare-mile patterns the spec
  requires for future per-state-DOT adapters even though they're not in
  live ITD data today.
"""
from __future__ import annotations
import pytest
from central.enrichment.mile_marker import extract
# --- High-confidence: single unambiguous match (real ITD comments) -----------
@pytest.mark.parametrize("text, expected_value", [
    (
        "Emergency vehicles blocking the right lane and right shoulder, "
        "eastbound I-84 near milepost 32.5. Keep left.",
        32.5,
    ),
    (
        "Crash on westbound I-84 at milepost 54. One right lane blocked.",
        54.0,
    ),
    (
        "Crash westbound I-84 milepost 42 blocking the right two lanes. "
        "Expect delays, use caution and keep left.",
        42.0,
    ),
    (
        "A crash is blocking all lanes on Highway 21, near milepost 10, "
        "before Lucky Peak State Park.",
        10.0,
    ),
    (
        "All directions of travel blocked SH 21 milepost 15 due to a crash.",
        15.0,
    ),
])
def test_high_confidence_real_samples(text, expected_value):
    """A comment with a single milepost mention yields a 'high' bundle.

    Checks the extracted value, the 'comment_regex' source tag and the
    confidence tier for every parametrized comment.
    """
    bundle = extract(text)
    assert bundle is not None
    observed = (bundle["value"], bundle["source"], bundle["confidence"])
    assert observed == (expected_value, "comment_regex", "high")
def test_red_herring_road_numbers_in_real_sample():
    """Route/street numbers (US-20, E 200 St) must not displace milepost 320."""
    comment = (
        "A crash is blocking the rightmost lane of US-20 at milepost 320, "
        "near E 200 St. Keep Left."
    )
    bundle = extract(comment)
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (320.0, "high")
# --- Medium-confidence: range / multi-match (real ITD comments) --------------
@pytest.mark.parametrize("text, expected_first_value", [
    (
        "6/6 - 6/8 Southbound Left Lane Closure (slow) from MP 80 to MP 81.",
        80.0,
    ),
    (
        "Northbound Left Lane Closure from MP 72.6 to MP 76.25 from "
        "7:00 PM to 6:00 AM.",
        72.6,
    ),
])
def test_medium_confidence_real_samples(text, expected_first_value):
    """An 'MP a to MP b' range reports the first value at 'medium' confidence."""
    bundle = extract(text)
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (
        expected_first_value,
        "medium",
    )
# --- No-match: real ITD comments that must NOT yield a value -----------------
@pytest.mark.parametrize("text", [
    "40th St will be closed for work on water lines.",
    "Bridge Repair",
    (
        "Bridge Maintenance. ITD- Phil Etchart, 208-490-4593, "
        "Jason Fisher, 208-420-8328."
    ),
    "ITD Project Key Number 21832 McCammon IC to Old US-91",
    (
        "Sunday, June 28, 2026, from approximately 9:45 AM to 10:30 AM. "
        "Traffic restrictions will be lifted as the motorcade passes."
    ),
])
def test_no_match_real_samples(text):
    """Comments without a mile keyword (street, phone, key, date numbers) give None."""
    outcome = extract(text)
    assert outcome is None
# --- Crafted high-confidence patterns (spec, not in live ITD data) -----------
@pytest.mark.parametrize("text, expected_value", [
    # Abbreviated forms.
    ("Eastbound MP 32", 32.0),
    ("Closure at M.P. 32", 32.0),
    ("Crash near M.P 32 today", 32.0),
    ("Slow at MM 32", 32.0),
    ("Slow at M.M. 32", 32.0),
    # Spelled-out forms, with and without a separator.
    ("milepost 32", 32.0),
    ("mile post 32", 32.0),
    ("mile-post 32", 32.0),
    ("milemarker 32", 32.0),
    ("mile marker 32", 32.0),
    ("mile-marker 32", 32.0),
    # Decimal value.
    ("milepost 32.5", 32.5),
])
def test_crafted_high_confidence_patterns(text, expected_value):
    """Every supported keyword spelling extracts its number at 'high'."""
    bundle = extract(text)
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (expected_value, "high")
# --- Crafted medium-confidence: multiple unambiguous matches -----------------
def test_crafted_medium_confidence_multiple_mp():
    """Two 'MP n' mentions -> first value reported, confidence 'medium'."""
    bundle = extract("Closure from MP 5 to MP 9.")
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (5.0, "medium")
def test_crafted_medium_mixed_keywords():
    """Different keyword spellings in one text still count as multiple matches."""
    bundle = extract("milepost 5 and mile marker 10 affected.")
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (5.0, "medium")
# --- Crafted low-confidence: bare 'mile N' (spec, not in live data) ----------
def test_crafted_low_confidence_bare_mile():
    """A bare 'mile N' with no MP/milepost keyword is extracted at 'low'."""
    bundle = extract("Crash near mile 14")
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (14.0, "low")
def test_crafted_low_confidence_bare_mile_with_decimal():
    """A bare 'mile N.N' keeps its fractional part and is tagged 'low'."""
    bundle = extract("Slowdown near mile 14.5 today.")
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (14.5, "low")
# --- Crafted: tier precedence ------------------------------------------------
def test_high_keyword_beats_bare_mile_in_same_text():
    """When 'milepost N' and bare 'mile N' coexist, the milepost wins at 'high'."""
    comment = "Crash near milepost 22, also affecting mile 14 detour."
    bundle = extract(comment)
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (22.0, "high")
# --- Edge cases --------------------------------------------------------------
def test_empty_string():
    """An empty comment yields no bundle."""
    outcome = extract("")
    assert outcome is None
def test_none_input():
    """Passing None instead of text yields no bundle."""
    outcome = extract(None)
    assert outcome is None
def test_numbers_without_keyword_never_match():
    """Numbers with no MP/mile keyword (routes, phone, exit) are ignored."""
    comment = "Highway 21, US-20, 208-555-1234, exit 84."
    assert extract(comment) is None
def test_case_insensitive():
    """An upper-case keyword is recognised the same as a lower-case one."""
    bundle = extract("CRASH at MILEPOST 50.")
    assert bundle is not None
    assert (bundle["value"], bundle["confidence"]) == (50.0, "high")
def test_substring_keywords_do_not_match():
    """Words merely containing a keyword ('amp', 'stamp', 'miles', 'milestone') are rejected."""
    samples = (
        "The amp 50 was loud.",
        "Stamp 50 on the document.",
        "Miles 50 traveled.",
        "Milestone 50 reached.",
    )
    # Gather every sample that unexpectedly produced a bundle so a failure
    # lists all of them rather than only the first.
    offenders = [text for text in samples if extract(text) is not None]
    assert not offenders
def test_result_dict_shape():
    """The bundle has exactly the keys value/source/confidence with the expected types."""
    bundle = extract("milepost 32.5")
    assert bundle is not None
    assert set(bundle) == {"value", "source", "confidence"}
    assert isinstance(bundle["value"], float)
    assert bundle["source"] == "comment_regex"
    allowed_tiers = {"high", "medium", "low"}
    assert bundle["confidence"] in allowed_tiers

View file

@ -396,3 +396,54 @@ def test_tenacity_decorator_has_explicit_no_log_hooks():
assert retrying.after is after_nothing
assert retrying.before is before_nothing
assert retrying.reraise is True
# --- v0.10.6: mile_marker enrichment on incident events ---------------------
def _rec_with_comment(comment: str | None) -> dict:
    """Build a minimal /get/event record with a settable Comment field.

    Args:
        comment: Value stored under the "Comment" key; ``None`` is passed
            through unchanged so callers can exercise the missing-comment
            path.

    Returns:
        A fresh dict on every call, so tests may mutate it freely. All
        fields other than "Comment" are fixed placeholder values.
    """
    return {
        "SourceId": "test-mm-1",
        "EventType": "accidentsAndIncidents",
        "Comment": comment,
        "Latitude": 43.6,
        "Longitude": -116.2,
        "Severity": "Minor",
    }
def test_build_event_attaches_mile_marker_when_comment_has_milepost(adapter):
    """A milepost-bearing Comment produces ``_enriched.mile_marker`` on the event.

    Builds an event from a record whose Comment mentions 'milepost 54' and
    checks that the event data carries a 'mile_marker' entry under
    '_enriched' with the expected value, source and confidence.
    """
    record = _rec_with_comment(
        "Crash on westbound I-84 at milepost 54. One right lane blocked."
    )
    event = adapter._build_event_record(record)
    assert event is not None

    enriched = event.data.get("_enriched", {})
    bundle = enriched.get("mile_marker")
    assert bundle is not None, "expected _enriched.mile_marker on milepost-bearing comment"
    observed = (bundle["value"], bundle["source"], bundle["confidence"])
    assert observed == (54.0, "comment_regex", "high")
def test_build_event_omits_mile_marker_when_comment_has_none(adapter):
    """No mile keyword, or no Comment at all -> no 'mile_marker' key under ``_enriched``.

    The key must be absent rather than present with a null placeholder.
    Covers both a Comment without a milepost and a Comment of None.
    """
    without_keyword = adapter._build_event_record(_rec_with_comment("Bridge Repair"))
    assert without_keyword is not None
    assert "mile_marker" not in without_keyword.data.get("_enriched", {})

    without_comment = adapter._build_event_record(_rec_with_comment(None))
    assert without_comment is not None
    assert "mile_marker" not in without_comment.data.get("_enriched", {})