Add wiki link rewriting to local Kiwix

Rewrites OSM wikipedia/wikidata/wikivoyage/appropedia extratag values to local Kiwix URLs (wiki.echo6.co) when the article exists in a loaded ZIM, falling back silently to public URLs otherwise. - New lib/wiki_rewrite.py: URL classification, Kiwix OPDS catalog discovery (xml.etree.ElementTree), HEAD-based availability check, positive-only SQLite cache, disabled discovery stubs - place_detail.py: _enrich_wiki_links() at both Nominatim and Overpass enrichment sites, before cache_put - Profile flags: has_wiki_rewriting (home/regional: true, minimal: false), has_wiki_discovery (all: false, stubs for future activation) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-04-23 06:34:22 +00:00 · 2026-04-23 06:34:22 +00:00 · 829bc87b7b
commit 829bc87b7b
parent 9c5b0520f9
5 changed files with 385 additions and 0 deletions
--- a/lib/place_detail.py
+++ b/lib/place_detail.py
@ -272,6 +272,56 @@ def _apply_google_data(result, google_data, gaps):
    result['extratags'] = extratags


+
+
+# ── Wiki link rewriting ─────────────────────────────────────────────────
+
+# Extratag keys that may contain wiki references. _enrich_wiki_links()
+# walks these in order and passes each key, with its value, to
+# rewrite_wiki_link().
+_WIKI_TAGS = ('wikipedia', 'wikidata', 'wikivoyage', 'appropedia')
+
+
+def _enrich_wiki_links(result):
+    """
+    Rewrite wiki-related extratags to local Kiwix URLs where available.
+    Falls back to public URLs. Only runs when has_wiki_rewriting is enabled.
+    Returns the (possibly enriched) result dict.
+    """
+    try:
+        from .deployment_config import get_deployment_config
+        deploy_config = get_deployment_config()
+        features = deploy_config.get('features', {})
+        if not features.get('has_wiki_rewriting', False):
+            return result
+    except Exception:
+        return result
+
+    try:
+        from .wiki_rewrite import rewrite_wiki_link
+    except ImportError:
+        logger.debug("wiki_rewrite module not available")
+        return result
+
+    extratags = result.get('extratags', {})
+    if not extratags:
+        return result
+
+    rewrites = {}
+    for tag in _WIKI_TAGS:
+        value = extratags.get(tag)
+        if not value:
+            continue
+        url, status = rewrite_wiki_link(tag, value)
+        if status != 'original':
+            extratags[tag] = url
+            rewrites[tag] = status
+
+    if rewrites:
+        result['extratags'] = extratags
+        result.setdefault('sources', {})['wiki_rewrites'] = rewrites
+        logger.debug(f"Wiki rewrites for {result.get('osm_type')}/{result.get('osm_id')}: {rewrites}")
+
+    return result
+
 # ── Nominatim parsing ───────────────────────────────────────────────────

 # Nominatim address array uses rank_address to indicate what each entry is.
@ -560,6 +610,7 @@ def get_place_detail(osm_type, osm_id):
    if nominatim_result:
        nominatim_result = _enrich_with_overture(nominatim_result, osm_type, osm_id)
        nominatim_result = _enrich_with_google(nominatim_result, osm_type, osm_id)
+        nominatim_result = _enrich_wiki_links(nominatim_result)
        cache_put(osm_type, osm_id, nominatim_result, 'nominatim_local')
        return nominatim_result, 200

@ -592,6 +643,7 @@ def get_place_detail(osm_type, osm_id):
    if overpass_result:
        overpass_result = _enrich_with_overture(overpass_result, osm_type, osm_id)
        overpass_result = _enrich_with_google(overpass_result, osm_type, osm_id)
+        overpass_result = _enrich_wiki_links(overpass_result)
        cache_put(osm_type, osm_id, overpass_result, 'overpass')
        return overpass_result, 200

--- a/lib/wiki_rewrite.py
+++ b/lib/wiki_rewrite.py
@ -0,0 +1,324 @@
+"""
+Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia
+links to local Kiwix URLs where the article exists in a loaded ZIM.
+
+Falls back silently to public URLs when article is unavailable locally.
+Caches positive results only in place_cache.db.
+
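+The Kiwix catalog is parsed from the OPDS Atom feed lazily, on the first
+lookup after import (not at import time), and is re-fetched once the cached
+copy is more than an hour old, so newly loaded ZIMs are picked up without a
+restart.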
+
+Operations note:
+  - After loading a new ZIM, either restart RECON (forces fresh catalog
+    fetch) or wait up to 1 hour for automatic refresh.
+  - To invalidate the wiki cache (e.g. after ZIM update):
+      sqlite3 /opt/recon/data/place_cache.db "DELETE FROM wiki_cache;"
+"""
+import os
+import re
+import sqlite3
+import time
+import xml.etree.ElementTree as ET
+from urllib.parse import unquote, quote
+
+import requests as http_requests
+
+from .utils import setup_logging
+
+logger = setup_logging('recon.wiki_rewrite')
+
+# ── Configuration ───────────────────────────────────────────────────────
+
+# Base used for server-side requests (catalog fetch and article HEAD checks).
+KIWIX_BASE = "http://localhost:8430"
+# Base used when building the Kiwix URLs that are returned to callers.
+KIWIX_PUBLIC_BASE = "https://wiki.echo6.co"
+# OPDS catalog endpoint, fetched by _discover_zims().
+KIWIX_CATALOG_URL = f"{KIWIX_BASE}/catalog/v2/entries"
+# Timeout for the per-article HEAD request in check_kiwix_has_article().
+HEAD_TIMEOUT = 1.5  # seconds
+# Maximum age of the ZIM map before _ensure_zim_map() refreshes it.
+CATALOG_REFRESH_INTERVAL = 3600  # 1 hour
+
+# OPDS Atom namespace, used to qualify element names when walking the feed
+_ATOM_NS = "http://www.w3.org/2005/Atom"
+
+# ── ZIM catalog map ─────────────────────────────────────────────────────
+
+# source_type → content_path, e.g. 'wikipedia' → 'wikipedia_en_all_maxi_2026-02'.
+# Replaced wholesale by _discover_zims(); empty until the first successful fetch.
+_zim_map = {}
+_zim_map_ts = 0.0    # time.time() of the last successful refresh
+
+# Prefix-to-source-type mapping. _discover_zims() uses the first entry whose
+# prefix matches the catalog book name. No prefix listed here is a prefix of
+# another, so the order is not currently significant; keep longer prefixes
+# first if overlapping ones are ever added.
+_ZIM_PREFIX_MAP = [
+    ('wikipedia_en_all', 'wikipedia'),
+    ('appropedia_en_all', 'appropedia'),
+    ('wikivoyage_en', 'wikivoyage'),
+    ('wikidata_en', 'wikidata'),
+]
+
+
+def _discover_zims():
+    """Parse Kiwix OPDS Atom catalog to map source types to content paths."""
+    global _zim_map, _zim_map_ts
+
+    try:
+        resp = http_requests.get(KIWIX_CATALOG_URL, timeout=5)
+        if resp.status_code != 200:
+            logger.warning(f"Kiwix catalog returned HTTP {resp.status_code}")
+            return
+
+        root = ET.fromstring(resp.content)
+        new_map = {}
+
+        for entry in root.findall(f"{{{_ATOM_NS}}}entry"):
+            name_el = entry.find(f"{{{_ATOM_NS}}}name")
+            if name_el is None:
+                continue
+            book_name = name_el.text or ""
+
+            # <link type="text/html" href="/content/..."/>
+            content_path = None
+            for link in entry.findall(f"{{{_ATOM_NS}}}link"):
+                if link.get("type") == "text/html":
+                    href = link.get("href", "")
+                    if href.startswith("/content/"):
+                        content_path = href[len("/content/"):]
+                    break
+
+            if not content_path:
+                continue
+
+            # Match book name against known prefixes
+            for prefix, source_type in _ZIM_PREFIX_MAP:
+                if book_name.startswith(prefix):
+                    new_map[source_type] = content_path
+                    break
+
+        _zim_map = new_map
+        _zim_map_ts = time.time()
+        logger.info(f"ZIM catalog refreshed: {new_map}")
+
+    except Exception as e:
+        logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}")
+
+
+def _ensure_zim_map():
+    """Lazy-load and refresh ZIM map if stale."""
+    if not _zim_map or (time.time() - _zim_map_ts) > CATALOG_REFRESH_INTERVAL:
+        _discover_zims()
+
+
+# ── Database (wiki_cache in place_cache.db) ─────────────────────────────
+
+_db_conn = None
+
+
+def _get_db():
+    """Return a module-level SQLite connection to place_cache.db (lazy init)."""
+    global _db_conn
+    if _db_conn is not None:
+        return _db_conn
+
+    db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
+    os.makedirs(db_dir, exist_ok=True)
+    db_path = os.path.join(db_dir, 'place_cache.db')
+
+    _db_conn = sqlite3.connect(db_path, check_same_thread=False)
+    _db_conn.execute("PRAGMA journal_mode=WAL")
+    _db_conn.execute("PRAGMA synchronous=NORMAL")
+    _db_conn.execute("""
+        CREATE TABLE IF NOT EXISTS wiki_cache (
+            source_type TEXT NOT NULL,
+            article_id  TEXT NOT NULL,
+            kiwix_url   TEXT NOT NULL,
+            cached_at   INTEGER NOT NULL,
+            PRIMARY KEY (source_type, article_id)
+        )
+    """)
+    _db_conn.commit()
+    logger.info(f"Wiki cache table ready in {db_path}")
+    return _db_conn
+
+
+# ── URL classification ──────────────────────────────────────────────────
+
+# Patterns for OSM wikipedia/wikidata tag values.
+# All of them are applied with .match(), i.e. anchored at the start of the value.
+
+# Tag form. The "en:" prefix is optional and group 1 is whatever follows it,
+# so a value with a different prefix (or none) is captured whole.
+_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$')  # "en:Title" or just "Title"
+# URL form, English Wikipedia host only; group 1 is the rest of the URL.
+_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)')
+# Bare Wikidata item id, e.g. "Q42" (whole value must match).
+_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$')
+# Wikidata URL; group 1 is the item id, anything after it is ignored.
+_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)')
+# URL form, English Wikivoyage host only; group 1 is the rest of the URL.
+_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)')
+# Appropedia URL with or without the "wiki/" path segment.
+_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)')
+
+
+def _normalize_article_id(article_id):
+    """Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores."""
+    return article_id.replace(' ', '_')
+
+
+def classify_wiki_link(tag_name, value):
+    """
+    Classify an OSM extratag value into (source_type, article_id) or None.
+
+    tag_name: the extratags key ('wikipedia', 'wikidata', etc.)
+    value: the raw tag value from OSM
+
+    Article IDs are normalized to MediaWiki convention (spaces → underscores).
+    """
+    if not value or not isinstance(value, str):
+        return None
+
+    value = value.strip()
+
+    if tag_name == 'wikidata':
+        m = _WIKIDATA_TAG_RE.match(value)
+        if m:
+            return ('wikidata', m.group(1))
+        m = _WIKIDATA_URL_RE.match(value)
+        if m:
+            return ('wikidata', m.group(1))
+        return None
+
+    if tag_name == 'wikipedia':
+        # URL form: https://en.wikipedia.org/wiki/Title
+        m = _WIKI_URL_RE.match(value)
+        if m:
+            return ('wikipedia', _normalize_article_id(unquote(m.group(1))))
+        # Tag form: "en:Title" or "Title"
+        m = _WIKI_TAG_RE.match(value)
+        if m:
+            return ('wikipedia', _normalize_article_id(m.group(1)))
+        return None
+
+    if tag_name == 'wikivoyage':
+        m = _WIKIVOYAGE_URL_RE.match(value)
+        if m:
+            return ('wikivoyage', _normalize_article_id(unquote(m.group(1))))
+        # Plain tag: "en:Title" or "Title"
+        m = _WIKI_TAG_RE.match(value)
+        if m:
+            return ('wikivoyage', _normalize_article_id(m.group(1)))
+        return None
+
+    if tag_name == 'appropedia':
+        m = _APPROPEDIA_URL_RE.match(value)
+        if m:
+            return ('appropedia', _normalize_article_id(unquote(m.group(1))))
+        return ('appropedia', _normalize_article_id(value))
+
+    return None
+
+
+# ── URL builders ────────────────────────────────────────────────────────
+
+def build_kiwix_url(source_type, article_id):
+    """Build a public Kiwix URL. Returns None if source_type not in ZIM map."""
+    _ensure_zim_map()
+    content_path = _zim_map.get(source_type)
+    if not content_path:
+        return None
+    return f"{KIWIX_PUBLIC_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}"
+
+
+_PUBLIC_URL_TEMPLATES = {
+    'wikipedia':  "https://en.wikipedia.org/wiki/{id}",
+    'wikidata':   "https://www.wikidata.org/wiki/{id}",
+    'wikivoyage': "https://en.wikivoyage.org/wiki/{id}",
+    'appropedia': "https://www.appropedia.org/wiki/{id}",
+}
+
+
+def build_public_url(source_type, article_id):
+    """Build the canonical public URL for a wiki article."""
+    tmpl = _PUBLIC_URL_TEMPLATES.get(source_type)
+    if not tmpl:
+        return None
+    return tmpl.format(id=quote(article_id, safe='/:@!$&\'()*+,;='))
+
+
+# ── Kiwix availability check ───────────────────────────────────────────
+
+def check_kiwix_has_article(source_type, article_id):
+    """
+    Check if an article exists in local Kiwix.
+
+    Returns (bool, url):
+      - (True, kiwix_public_url) if article exists locally
+      - (False, None) if not found or Kiwix unavailable
+
+    Only positive results are cached.
+    """
+    # Check cache first
+    db = _get_db()
+    row = db.execute(
+        "SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?",
+        (source_type, article_id)
+    ).fetchone()
+    if row:
+        return (True, row[0])
+
+    # Build local HEAD URL
+    _ensure_zim_map()
+    content_path = _zim_map.get(source_type)
+    if not content_path:
+        return (False, None)
+
+    head_url = f"{KIWIX_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}"
+
+    try:
+        resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True)
+        if resp.status_code == 200:
+            kiwix_url = build_kiwix_url(source_type, article_id)
+            # Cache positive result
+            now = int(time.time())
+            db.execute("""
+                INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at)
+                VALUES (?, ?, ?, ?)
+            """, (source_type, article_id, kiwix_url, now))
+            db.commit()
+            return (True, kiwix_url)
+        else:
+            return (False, None)
+    except Exception as e:
+        logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}")
+        return (False, None)
+
+
+# ── Primary entry point ────────────────────────────────────────────────
+
+def rewrite_wiki_link(tag_name, value):
+    """
+    Rewrite an OSM wiki tag value to a local Kiwix URL if available.
+
+    Returns a (url, status) tuple:
+      - (kiwix_url, 'local') when check_kiwix_has_article() finds the article
+      - (public_url, 'public') when it does not and build_public_url()
+        returns a URL
+      - (value, 'original'), i.e. the input unchanged, when the value cannot
+        be classified or no public URL can be built
+    """
+    classified = classify_wiki_link(tag_name, value)
+    if not classified:
+        return (value, 'original')
+
+    source_type, article_id = classified
+
+    # Try local Kiwix
+    found, kiwix_url = check_kiwix_has_article(source_type, article_id)
+    if found and kiwix_url:
+        return (kiwix_url, 'local')
+
+    # Fall back to public URL
+    public_url = build_public_url(source_type, article_id)
+    if public_url:
+        return (public_url, 'public')
+
+    return (value, 'original')
+
+
+# ── Discovery stubs (disabled, for future activation) ───────────────────
+
+def discover_wikivoyage_article(name, category, lat, lon):
+    """
+    Discover a related Wikivoyage article for a place.
+
+    Placeholder: the arguments are accepted but unused and the function
+    always returns None. Intended to be gated by the has_wiki_discovery
+    feature flag; nothing in this function checks that flag.
+    """
+    return None
+
+
+def discover_appropedia_article(name, category):
+    """
+    Discover a related Appropedia article for a place.
+
+    Placeholder: the arguments are accepted but unused and the function
+    always returns None. Intended to be gated by the has_wiki_discovery
+    feature flag; nothing in this function checks that flag.
+    """
+    return None