From 1ef19508a1358bfb28b499895ecac8c1254cd856 Mon Sep 17 00:00:00 2001
From: Matt Johnson
Date: Tue, 19 May 2026 03:53:10 +0000
Subject: [PATCH] fix(2-C): wire dedup into poll loop, add conditional fetch

Bug fixes:
1. Wire is_published/mark_published/bump_last_seen into poll() loop
   - Skip already-published items, bump TTL to prevent sweep
   - Mark published after yield to track new items

2. Add conditional fetch support (If-Modified-Since, If-None-Match)
   - Store Last-Modified/ETag from responses
   - Send conditional headers on subsequent requests
   - Handle 304 Not Modified gracefully (return empty list)

3. Document state parsing rationale in docstring
   - Description has structured State: field vs unreliable title prefixes

Tests added:
- test_dedup_in_poll_loop: verify second poll yields 0 for same items
- test_conditional_304_yields_zero: verify 304 returns empty list
- test_conditional_headers_sent_after_first_poll: verify headers sent

Co-Authored-By: Claude Opus 4.5
---
 src/central/adapters/inciweb.py |  31 +++++++
 tests/test_inciweb.py           | 149 ++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+)

diff --git a/src/central/adapters/inciweb.py b/src/central/adapters/inciweb.py
index 8fc3854..2ae0634 100644
--- a/src/central/adapters/inciweb.py
+++ b/src/central/adapters/inciweb.py
@@ -101,6 +101,12 @@ def parse_state_from_description(description: str) -> str | None:
     Format: "State: Minnesota" or "State: New Mexico"
 
     Returns 2-letter state code or None if not found.
+
+    Design note: State is parsed from the description rather than the title
+    because InciWeb titles use unit code prefixes (e.g., "MNMNS Stewart Trail",
+    "CACNP Santa Rosa Island Fire") which are not reliable state indicators.
+    The description has a structured "State: " field that reliably
+    identifies the state for all incidents.
     """
     pattern = r"State:\s*([A-Za-z\s]+?)(?:\n|---|$)"
     match = re.search(pattern, description)
@@ -176,6 +182,10 @@ class InciWebAdapter(SourceAdapter):
         self._session: aiohttp.ClientSession | None = None
         self._db: sqlite3.Connection | None = None
 
+        # Conditional fetch state
+        self._last_modified: str | None = None
+        self._etag: str | None = None
+
         # Parse region from settings
         region_dict = config.settings.get("region")
         if region_dict:
@@ -300,10 +310,25 @@ class InciWebAdapter(SourceAdapter):
         if not self._session:
             raise RuntimeError("Session not initialized")
 
+        # Build request headers with conditional fetch support
         headers = {"User-Agent": "Central/0.4"}
+        if self._last_modified:
+            headers["If-Modified-Since"] = self._last_modified
+        if self._etag:
+            headers["If-None-Match"] = self._etag
 
         async with self._session.get(INCIWEB_RSS_URL, headers=headers) as resp:
+            # Handle 304 Not Modified
+            if resp.status == 304:
+                logger.info("InciWeb not modified")
+                return []
+
             resp.raise_for_status()
+
+            # Capture conditional fetch headers for next request
+            self._last_modified = resp.headers.get("Last-Modified")
+            self._etag = resp.headers.get("ETag")
+
             content = await resp.text()
 
         # Parse RSS XML
@@ -367,6 +392,11 @@ class InciWebAdapter(SourceAdapter):
             if not guid:
                 continue
 
+            # Dedup: skip if already published
+            if self.is_published(guid):
+                self.bump_last_seen(guid)
+                continue
+
             description_html = item.get("description", "")
 
             # Parse coordinates from description
@@ -435,6 +465,7 @@ class InciWebAdapter(SourceAdapter):
             )
 
             yield event
+            self.mark_published(guid)
             events_yielded += 1
 
         # Periodic cleanup of old entries
diff --git a/tests/test_inciweb.py b/tests/test_inciweb.py
index 4d255ed..f423553 100644
--- a/tests/test_inciweb.py
+++ b/tests/test_inciweb.py
@@ -448,3 +448,152 @@ class TestInciWebAdapter:
 
         assert adapter.region.north == 50.0
         assert adapter.region.south == 35.0
+
+    @pytest.mark.asyncio
+    async def test_dedup_in_poll_loop(
+        self, mock_config_no_region: AdapterConfig, mock_config_store: MagicMock, cursor_db_path: Path
+    ):
+        """Dedup integration: second poll with same items yields zero events."""
+        from central.adapters.inciweb import InciWebAdapter
+
+        adapter = InciWebAdapter(mock_config_no_region, mock_config_store, cursor_db_path)
+        await adapter.startup()
+
+        # Single-item RSS for clarity
+        single_item_rss = """<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <title>InciWeb</title>
+    <item>
+      <title>Test Fire</title>
+      <link>http://inciweb.wildfire.gov/test</link>
+      <description>State: California</description>
+      <pubDate>Mon, 18 May 2026 09:00:00 EDT</pubDate>
+      <guid>DEDUP-TEST-001</guid>
+    </item>
+  </channel>
+</rss>"""
+
+        def make_mock_response():
+            mock_response = AsyncMock()
+            mock_response.status = 200
+            mock_response.raise_for_status = MagicMock()
+            mock_response.text = AsyncMock(return_value=single_item_rss)
+            mock_response.headers = {"Last-Modified": None, "ETag": None}
+            return mock_response
+
+        # First poll: should yield 1 event
+        with patch.object(
+            adapter._session, "get",
+            return_value=AsyncMock(
+                __aenter__=AsyncMock(return_value=make_mock_response()),
+                __aexit__=AsyncMock()
+            )
+        ):
+            events_first = [e async for e in adapter.poll()]
+
+        assert len(events_first) == 1
+        assert events_first[0].data["guid"] == "DEDUP-TEST-001"
+
+        # Verify mark_published was called
+        assert adapter.is_published("DEDUP-TEST-001") is True
+
+        # Second poll: same item should be skipped (dedup)
+        with patch.object(
+            adapter._session, "get",
+            return_value=AsyncMock(
+                __aenter__=AsyncMock(return_value=make_mock_response()),
+                __aexit__=AsyncMock()
+            )
+        ):
+            events_second = [e async for e in adapter.poll()]
+
+        assert len(events_second) == 0  # Dedup prevents re-yield
+
+        await adapter.shutdown()
+
+    @pytest.mark.asyncio
+    async def test_conditional_304_yields_zero(
+        self, mock_config_no_region: AdapterConfig, mock_config_store: MagicMock, cursor_db_path: Path
+    ):
+        """HTTP 304 Not Modified returns empty list and yields zero events."""
+        from central.adapters.inciweb import InciWebAdapter
+
+        adapter = InciWebAdapter(mock_config_no_region, mock_config_store, cursor_db_path)
+        await adapter.startup()
+
+        # Mock 304 response
+        mock_response = AsyncMock()
+        mock_response.status = 304
+        mock_response.raise_for_status = MagicMock()
+
+        with patch.object(
+            adapter._session, "get",
+            return_value=AsyncMock(
+                __aenter__=AsyncMock(return_value=mock_response),
+                __aexit__=AsyncMock()
+            )
+        ):
+            events = [e async for e in adapter.poll()]
+
+        assert len(events) == 0
+
+        await adapter.shutdown()
+
+    @pytest.mark.asyncio
+    async def test_conditional_headers_sent_after_first_poll(
+        self, mock_config_no_region: AdapterConfig, mock_config_store: MagicMock, cursor_db_path: Path
+    ):
+        """Conditional fetch headers sent on second poll after first captures them."""
+        from central.adapters.inciweb import InciWebAdapter
+
+        adapter = InciWebAdapter(mock_config_no_region, mock_config_store, cursor_db_path)
+        await adapter.startup()
+
+        # First response with Last-Modified and ETag
+        first_response = AsyncMock()
+        first_response.status = 200
+        first_response.raise_for_status = MagicMock()
+        first_response.text = AsyncMock(return_value="""<?xml version="1.0"?>
+<rss><channel><title>Test</title></channel></rss>""")
+        first_response.headers = {
+            "Last-Modified": "Tue, 19 May 2026 03:00:00 GMT",
+            "ETag": "\"abc123\"",
+        }
+
+        # Track headers sent on second request
+        captured_headers = {}
+
+        def capture_get(*args, **kwargs):
+            captured_headers.update(kwargs.get("headers", {}))
+            second_response = AsyncMock()
+            second_response.status = 304
+            second_response.raise_for_status = MagicMock()
+            return AsyncMock(
+                __aenter__=AsyncMock(return_value=second_response),
+                __aexit__=AsyncMock()
+            )
+
+        # First poll
+        with patch.object(
+            adapter._session, "get",
+            return_value=AsyncMock(
+                __aenter__=AsyncMock(return_value=first_response),
+                __aexit__=AsyncMock()
+            )
+        ):
+            [e async for e in adapter.poll()]
+
+        # Verify adapter captured the headers
+        assert adapter._last_modified == "Tue, 19 May 2026 03:00:00 GMT"
+        assert adapter._etag == "\"abc123\""
+
+        # Second poll with header capture
+        with patch.object(adapter._session, "get", side_effect=capture_get):
+            [e async for e in adapter.poll()]
+
+        # Verify conditional headers were sent
+        assert captured_headers.get("If-Modified-Since") == "Tue, 19 May 2026 03:00:00 GMT"
+        assert captured_headers.get("If-None-Match") == "\"abc123\""
+
+        await adapter.shutdown()