mirror of
https://github.com/zvx-echo6/central.git
synced 2026-06-10 11:54:37 +02:00
fix(2-C): wire dedup into poll loop, add conditional fetch
Bug fixes:
1. Wire is_published/mark_published/bump_last_seen into poll() loop
   - Skip already-published items, bump TTL to prevent sweep
   - Mark published after yield to track new items
2. Add conditional fetch support (If-Modified-Since, If-None-Match)
   - Store Last-Modified/ETag from responses
   - Send conditional headers on subsequent requests
   - Handle 304 Not Modified gracefully (return empty list)
3. Document state parsing rationale in docstring
   - Description has structured State: field vs unreliable title prefixes

Tests added:
- test_dedup_in_poll_loop: verify second poll yields 0 for same items
- test_conditional_304_yields_zero: verify 304 returns empty list
- test_conditional_headers_sent_after_first_poll: verify headers sent

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
8751264f8c
commit
1ef19508a1
2 changed files with 180 additions and 0 deletions
|
|
@@ -101,6 +101,12 @@ def parse_state_from_description(description: str) -> str | None:
|
|||
|
||||
Format: "State: Minnesota" or "State: New Mexico"
|
||||
Returns 2-letter state code or None if not found.
|
||||
|
||||
Design note: State is parsed from the description rather than the title
|
||||
because InciWeb titles use unit code prefixes (e.g., "MNMNS Stewart Trail",
|
||||
"CACNP Santa Rosa Island Fire") which are not reliable state indicators.
|
||||
The description has a structured "State: <name>" field that reliably
|
||||
identifies the state for all incidents.
|
||||
"""
|
||||
pattern = r"State:\s*([A-Za-z\s]+?)(?:\n|---|$)"
|
||||
match = re.search(pattern, description)
|
||||
|
|
@@ -176,6 +182,10 @@ class InciWebAdapter(SourceAdapter):
|
|||
self._session: aiohttp.ClientSession | None = None
|
||||
self._db: sqlite3.Connection | None = None
|
||||
|
||||
# Conditional fetch state
|
||||
self._last_modified: str | None = None
|
||||
self._etag: str | None = None
|
||||
|
||||
# Parse region from settings
|
||||
region_dict = config.settings.get("region")
|
||||
if region_dict:
|
||||
|
|
@@ -300,10 +310,25 @@ class InciWebAdapter(SourceAdapter):
|
|||
if not self._session:
|
||||
raise RuntimeError("Session not initialized")
|
||||
|
||||
# Build request headers with conditional fetch support
|
||||
headers = {"User-Agent": "Central/0.4"}
|
||||
if self._last_modified:
|
||||
headers["If-Modified-Since"] = self._last_modified
|
||||
if self._etag:
|
||||
headers["If-None-Match"] = self._etag
|
||||
|
||||
async with self._session.get(INCIWEB_RSS_URL, headers=headers) as resp:
|
||||
# Handle 304 Not Modified
|
||||
if resp.status == 304:
|
||||
logger.info("InciWeb not modified")
|
||||
return []
|
||||
|
||||
resp.raise_for_status()
|
||||
|
||||
# Capture conditional fetch headers for next request
|
||||
self._last_modified = resp.headers.get("Last-Modified")
|
||||
self._etag = resp.headers.get("ETag")
|
||||
|
||||
content = await resp.text()
|
||||
|
||||
# Parse RSS XML
|
||||
|
|
@@ -367,6 +392,11 @@ class InciWebAdapter(SourceAdapter):
|
|||
if not guid:
|
||||
continue
|
||||
|
||||
# Dedup: skip if already published
|
||||
if self.is_published(guid):
|
||||
self.bump_last_seen(guid)
|
||||
continue
|
||||
|
||||
description_html = item.get("description", "")
|
||||
|
||||
# Parse coordinates from description
|
||||
|
|
@@ -435,6 +465,7 @@ class InciWebAdapter(SourceAdapter):
|
|||
)
|
||||
|
||||
yield event
|
||||
self.mark_published(guid)
|
||||
events_yielded += 1
|
||||
|
||||
# Periodic cleanup of old entries
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue