recon/lib/wiki_rewrite.py

"""
Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia
links to local Kiwix URLs where the article exists in a loaded ZIM.

Falls back silently to public URLs when article is unavailable locally.
Caches positive results only in place_cache.db.

Kiwix catalog is parsed from the OPDS Atom feed lazily on first use and
refreshed hourly to pick up newly loaded ZIMs without a restart.

Operations note:
  - After loading a new ZIM, either restart RECON (forces fresh catalog
    fetch) or wait up to 1 hour for automatic refresh.
  - To invalidate the wiki cache (e.g. after ZIM update):
      sqlite3 /opt/recon/data/place_cache.db "DELETE FROM wiki_cache;"
"""
import os
import re
import sqlite3
import time
import xml.etree.ElementTree as ET
from urllib.parse import unquote, quote

import requests as http_requests

from .utils import setup_logging

logger = setup_logging('recon.wiki_rewrite')

# ── Configuration ───────────────────────────────────────────────────────

# Base URL used for this module's own requests to Kiwix (catalog fetch and
# per-article HEAD checks).
KIWIX_BASE = "http://localhost:8430"
# Base URL embedded in the rewritten links that are returned to callers.
KIWIX_PUBLIC_BASE = "https://wiki.echo6.co"
# OPDS catalog endpoint that is parsed to find the loaded ZIMs.
KIWIX_CATALOG_URL = f"{KIWIX_BASE}/catalog/v2/entries"
HEAD_TIMEOUT = 1.5  # seconds; timeout of each per-article HEAD request
CATALOG_REFRESH_INTERVAL = 3600  # seconds (1 hour) before the ZIM map counts as stale

# XML namespace used to qualify element names when parsing the OPDS Atom feed
_ATOM_NS = "http://www.w3.org/2005/Atom"

# ── ZIM catalog map ─────────────────────────────────────────────────────

# Both values are module-level state rebound by _discover_zims() after a
# successful catalog fetch; they are left untouched when the fetch fails.
_zim_map = {}        # source_type → content_path  e.g. 'wikipedia' → 'wikipedia_en_all_maxi_2026-02'
_zim_map_ts = 0.0    # time.time() of the last successful refresh (0.0 = never)

# Book-name prefix → source type. Matching stops at the first prefix that
# fits, so if overlapping prefixes are ever added the longer one must be
# listed first. The prefixes below do not overlap.
_ZIM_PREFIX_MAP = [
    ('wikipedia_en_all', 'wikipedia'),
    ('appropedia_en_all', 'appropedia'),
    ('wikivoyage_en', 'wikivoyage'),
    ('wikidata_en', 'wikidata'),
]


def _discover_zims():
    """Refresh the module-level ZIM map from the Kiwix OPDS Atom catalog.

    Fetches ``KIWIX_CATALOG_URL`` and, for every catalog entry whose book
    name starts with a prefix listed in ``_ZIM_PREFIX_MAP``, records the
    entry's ``/content/...`` path under the matching source type.

    On success ``_zim_map`` is replaced and ``_zim_map_ts`` is set to the
    current time. On a non-200 response or on any exception a warning is
    logged and both globals are left as they were.
    """
    global _zim_map, _zim_map_ts

    # Namespace-qualified element names used while walking the feed.
    entry_tag = f"{{{_ATOM_NS}}}entry"
    name_tag = f"{{{_ATOM_NS}}}name"
    link_tag = f"{{{_ATOM_NS}}}link"
    content_prefix = "/content/"

    try:
        response = http_requests.get(KIWIX_CATALOG_URL, timeout=5)
        if response.status_code != 200:
            logger.warning(f"Kiwix catalog returned HTTP {response.status_code}")
            return

        feed = ET.fromstring(response.content)
        discovered = {}

        for entry in feed.findall(entry_tag):
            name_node = entry.find(name_tag)
            if name_node is None:
                continue
            book_name = name_node.text or ""

            # Only the first <link type="text/html"> of an entry is considered;
            # it must point below /content/ to be usable.
            html_link = next(
                (ln for ln in entry.findall(link_tag) if ln.get("type") == "text/html"),
                None,
            )
            if html_link is None:
                continue
            href = html_link.get("href", "")
            if not href.startswith(content_prefix):
                continue
            content_path = href[len(content_prefix):]
            if not content_path:
                continue

            # First matching prefix decides the source type; later entries of
            # the same source type overwrite earlier ones.
            matched_type = next(
                (kind for prefix, kind in _ZIM_PREFIX_MAP if book_name.startswith(prefix)),
                None,
            )
            if matched_type is not None:
                discovered[matched_type] = content_path

        _zim_map = discovered
        _zim_map_ts = time.time()
        logger.info(f"ZIM catalog refreshed: {discovered}")

    except Exception as e:
        logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}")


# Minimum delay between catalog fetch attempts while the map is stale, so a
# Kiwix outage does not add a blocking fetch to every single lookup.
_CATALOG_RETRY_INTERVAL = 60  # seconds
_zim_map_attempt_ts = 0.0     # time.time() of the most recent fetch attempt


def _ensure_zim_map():
    """Lazy-load the ZIM map and refresh it once it is stale.

    A fetch is attempted when the last successful refresh (``_zim_map_ts``)
    is older than ``CATALOG_REFRESH_INTERVAL``; this also covers the very
    first call, because the timestamp starts at 0.0.

    Staleness is decided by the timestamp alone, not by whether the map is
    empty: a catalog that legitimately contains none of the known ZIMs
    yields an empty map, and re-fetching it on every lookup would be wasted
    work. Because ``_discover_zims`` does not advance ``_zim_map_ts`` when
    it fails, attempts are additionally spaced at least
    ``_CATALOG_RETRY_INTERVAL`` seconds apart.
    """
    global _zim_map_attempt_ts

    now = time.time()
    if (now - _zim_map_ts) <= CATALOG_REFRESH_INTERVAL:
        return  # map is fresh
    if (now - _zim_map_attempt_ts) < _CATALOG_RETRY_INTERVAL:
        return  # a recent attempt failed; back off before retrying

    _zim_map_attempt_ts = now
    _discover_zims()


# ── Database (wiki_cache in place_cache.db) ─────────────────────────────

_db_conn = None


def _get_db():
    """Return the module-level SQLite connection to place_cache.db (lazy init).

    On first use the ``data`` directory next to this module's parent package
    directory is created if needed, the database is opened with
    ``check_same_thread=False``, WAL journaling and ``synchronous=NORMAL``
    are enabled, and the ``wiki_cache`` table is created if missing.

    The connection is stored in ``_db_conn`` only after that setup has
    succeeded. If setup fails the connection is closed and the error is
    re-raised, so the next call starts over instead of reusing a connection
    whose table may not exist.

    Raises:
        OSError: if the data directory cannot be created.
        sqlite3.Error: if the database cannot be opened or initialised.
    """
    global _db_conn
    if _db_conn is not None:
        return _db_conn

    db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
    os.makedirs(db_dir, exist_ok=True)
    db_path = os.path.join(db_dir, 'place_cache.db')

    conn = sqlite3.connect(db_path, check_same_thread=False)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS wiki_cache (
                source_type TEXT NOT NULL,
                article_id  TEXT NOT NULL,
                kiwix_url   TEXT NOT NULL,
                cached_at   INTEGER NOT NULL,
                PRIMARY KEY (source_type, article_id)
            )
        """)
        conn.commit()
    except sqlite3.Error:
        # Do not publish (or leak) a connection that was never fully set up.
        conn.close()
        raise

    _db_conn = conn
    logger.info(f"Wiki cache table ready in {db_path}")
    return _db_conn


# ── URL classification ──────────────────────────────────────────────────

# Patterns for OSM wikipedia/wikidata tag values
_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$')  # "en:Title" or just "Title"
_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)')
_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$')
_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)')
_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)')
_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)')
# Any http(s) URL. Used to stop URLs for hosts not covered above from being
# mistaken for a bare article title by the catch-all _WIKI_TAG_RE.
_ANY_URL_RE = re.compile(r'^https?://', re.IGNORECASE)


def _normalize_article_id(article_id):
    """Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores."""
    return article_id.replace(' ', '_')


def classify_wiki_link(tag_name, value):
    """
    Classify an OSM extratag value into (source_type, article_id) or None.

    tag_name: the extratags key ('wikipedia', 'wikidata', 'wikivoyage',
              'appropedia'); any other key yields None.
    value: the raw tag value from OSM; surrounding whitespace is ignored.

    Accepted forms:
      - wikidata:   "Q123" or a wikidata.org/wiki/Q123 URL
      - wikipedia:  an en.wikipedia.org/wiki/... URL, "en:Title" or "Title"
      - wikivoyage: an en.wikivoyage.org/wiki/... URL, "en:Title" or "Title"
      - appropedia: an appropedia.org URL or a bare title

    URL-derived article IDs are percent-decoded. Article IDs are normalized
    to MediaWiki convention (spaces → underscores).

    Returns None for empty or non-string values, for unknown tag names, and
    for http(s) URLs that point at a host other than the ones listed above
    (for example another language edition). Treating such a URL as a title
    would produce a broken link, so it is left for the caller to pass
    through unchanged.
    """
    if not value or not isinstance(value, str):
        return None

    value = value.strip()

    if tag_name == 'wikidata':
        m = _WIKIDATA_TAG_RE.match(value)
        if m:
            return ('wikidata', m.group(1))
        m = _WIKIDATA_URL_RE.match(value)
        if m:
            return ('wikidata', m.group(1))
        return None

    if tag_name == 'wikipedia':
        # URL form: https://en.wikipedia.org/wiki/Title
        m = _WIKI_URL_RE.match(value)
        if m:
            return ('wikipedia', _normalize_article_id(unquote(m.group(1))))
        # Some other URL: not something we can map to an article title.
        if _ANY_URL_RE.match(value):
            return None
        # Tag form: "en:Title" or "Title"
        m = _WIKI_TAG_RE.match(value)
        if m:
            return ('wikipedia', _normalize_article_id(m.group(1)))
        return None

    if tag_name == 'wikivoyage':
        m = _WIKIVOYAGE_URL_RE.match(value)
        if m:
            return ('wikivoyage', _normalize_article_id(unquote(m.group(1))))
        if _ANY_URL_RE.match(value):
            return None
        # Plain tag: "en:Title" or "Title"
        m = _WIKI_TAG_RE.match(value)
        if m:
            return ('wikivoyage', _normalize_article_id(m.group(1)))
        return None

    if tag_name == 'appropedia':
        m = _APPROPEDIA_URL_RE.match(value)
        if m:
            return ('appropedia', _normalize_article_id(unquote(m.group(1))))
        if _ANY_URL_RE.match(value):
            return None
        # Anything else is taken verbatim as the article title.
        return ('appropedia', _normalize_article_id(value))

    return None


# ── URL builders ────────────────────────────────────────────────────────

def build_kiwix_url(source_type, article_id):
    """Build the public-facing Kiwix URL for an article.

    The URL is ``KIWIX_PUBLIC_BASE`` + ``/content/<content_path>/<article>``
    where ``content_path`` comes from the ZIM map (refreshed first if stale)
    and the article ID is percent-encoded.

    Returns None if source_type is not in the ZIM map. This function does not
    check that the article itself exists.
    """
    _ensure_zim_map()
    content_path = _zim_map.get(source_type)
    if not content_path:
        return None
    # Characters left unescaped in the article path. Held in a variable
    # because a backslash escape inside an f-string replacement field is a
    # SyntaxError before Python 3.12.
    safe_chars = "/:@!$&'()*+,;="
    encoded_id = quote(article_id, safe=safe_chars)
    return f"{KIWIX_PUBLIC_BASE}/content/{content_path}/{encoded_id}"


# Canonical public article URL per source type; "{id}" receives the
# percent-encoded article ID.
_PUBLIC_URL_TEMPLATES = {
    'wikipedia': "https://en.wikipedia.org/wiki/{id}",
    'wikidata': "https://www.wikidata.org/wiki/{id}",
    'wikivoyage': "https://en.wikivoyage.org/wiki/{id}",
    'appropedia': "https://www.appropedia.org/wiki/{id}",
}


def build_public_url(source_type, article_id):
    """Return the canonical public URL of a wiki article.

    The article ID is percent-encoded (path punctuation such as ``/``, ``:``
    and parentheses is kept as-is) and substituted into the template for
    source_type. Returns None when source_type has no template.
    """
    if source_type not in _PUBLIC_URL_TEMPLATES:
        return None
    template = _PUBLIC_URL_TEMPLATES[source_type]
    encoded_id = quote(article_id, safe="/:@!$&'()*+,;=")
    return template.format(id=encoded_id)


# ── Kiwix availability check ───────────────────────────────────────────

def check_kiwix_has_article(source_type, article_id):
    """
    Check if an article exists in local Kiwix.

    The wiki_cache table is consulted first; on a miss a HEAD request is sent
    to the local Kiwix server (following redirects) and an HTTP 200 counts as
    "exists".

    Returns (bool, url):
      - (True, kiwix_public_url) if article exists locally
      - (False, None) if not found, Kiwix unavailable, or no ZIM is loaded
        for source_type

    Only positive results are cached. The cache is best-effort: a failure to
    read or write it is logged and never changes the result of the live
    check.
    """
    # Check cache first. A cache failure is treated as a miss so the live
    # check below still runs.
    db = None
    try:
        db = _get_db()
        row = db.execute(
            "SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?",
            (source_type, article_id)
        ).fetchone()
    except (sqlite3.Error, OSError) as e:
        logger.warning(f"Wiki cache lookup failed for {source_type}/{article_id}: {e}")
        row = None
    if row:
        return (True, row[0])

    # Build local HEAD URL
    _ensure_zim_map()
    content_path = _zim_map.get(source_type)
    if not content_path:
        return (False, None)

    # Held in a variable because a backslash escape inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    safe_chars = "/:@!$&'()*+,;="
    head_url = f"{KIWIX_BASE}/content/{content_path}/{quote(article_id, safe=safe_chars)}"

    # Only the network call is guarded here, so the log message below really
    # does mean the HEAD request failed.
    try:
        resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True)
    except Exception as e:
        logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}")
        return (False, None)

    if resp.status_code != 200:
        return (False, None)

    kiwix_url = build_kiwix_url(source_type, article_id)
    if not kiwix_url:
        # build_kiwix_url() may refresh the ZIM map; if the source type is
        # gone by now there is no URL to hand out (or to store in the
        # NOT NULL kiwix_url column).
        return (False, None)

    # Cache positive result
    if db is not None:
        try:
            db.execute("""
                INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at)
                VALUES (?, ?, ?, ?)
            """, (source_type, article_id, kiwix_url, int(time.time())))
            db.commit()
        except sqlite3.Error as e:
            logger.warning(f"Wiki cache write failed for {source_type}/{article_id}: {e}")

    return (True, kiwix_url)


# ── Primary entry point ────────────────────────────────────────────────

def rewrite_wiki_link(tag_name, value):
    """
    Rewrite an OSM wiki tag value to a local Kiwix URL if available.

    Returns a (url, origin) tuple where origin is one of:
      - 'local':    url is the Kiwix URL of an article found locally
      - 'public':   url is the canonical public URL (article not available
                    locally)
      - 'original': the value could not be classified, or no public URL
                    could be built; url is the unmodified input value
    """
    link = classify_wiki_link(tag_name, value)
    if link:
        source_type, article_id = link

        # Prefer the locally served copy.
        is_local, local_url = check_kiwix_has_article(source_type, article_id)
        if is_local and local_url:
            return (local_url, 'local')

        # Otherwise point at the public site.
        fallback_url = build_public_url(source_type, article_id)
        if fallback_url:
            return (fallback_url, 'public')

    # Unrecognized value, or nothing better to offer: hand it back untouched.
    return (value, 'original')


# ── Discovery stubs (disabled, for future activation) ───────────────────

def discover_wikivoyage_article(name, category, lat, lon):
    """
    Placeholder for Wikivoyage article discovery (has_wiki_discovery).

    Not implemented yet: all arguments are ignored and the result is
    always None.
    """
    return None


def discover_appropedia_article(name, category):
    """
    Placeholder for Appropedia article discovery (has_wiki_discovery).

    Not implemented yet: both arguments are ignored and the result is
    always None.
    """
    return None
Add wiki link rewriting to local Kiwix Rewrites OSM wikipedia/wikidata/wikivoyage/appropedia extratag values to local Kiwix URLs (wiki.echo6.co) when the article exists in a loaded ZIM, falling back silently to public URLs otherwise. - New lib/wiki_rewrite.py: URL classification, Kiwix OPDS catalog discovery (xml.etree.ElementTree), HEAD-based availability check, positive-only SQLite cache, disabled discovery stubs - place_detail.py: _enrich_wiki_links() at both Nominatim and Overpass enrichment sites, before cache_put - Profile flags: has_wiki_rewriting (home/regional: true, minimal: false), has_wiki_discovery (all: false, stubs for future activation) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-04-23 06:34:22 +00:00			`"""`
			`Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia`
			`links to local Kiwix URLs where the article exists in a loaded ZIM.`

			`Falls back silently to public URLs when article is unavailable locally.`
			`Caches positive results only in place_cache.db.`

			`Kiwix catalog is parsed from the OPDS Atom feed at startup and refreshed`
			`hourly to pick up newly loaded ZIMs without a restart.`

			`Operations note:`
			`- After loading a new ZIM, either restart RECON (forces fresh catalog`
			`fetch) or wait up to 1 hour for automatic refresh.`
			`- To invalidate the wiki cache (e.g. after ZIM update):`
			`sqlite3 /opt/recon/data/place_cache.db "DELETE FROM wiki_cache;"`
			`"""`
			`import os`
			`import re`
			`import sqlite3`
			`import time`
			`import xml.etree.ElementTree as ET`
			`from urllib.parse import unquote, quote`

			`import requests as http_requests`

			`from .utils import setup_logging`

			`logger = setup_logging('recon.wiki_rewrite')`

			`# ── Configuration ───────────────────────────────────────────────────────`

			`KIWIX_BASE = "http://localhost:8430"`
			`KIWIX_PUBLIC_BASE = "https://wiki.echo6.co"`
			`KIWIX_CATALOG_URL = f"{KIWIX_BASE}/catalog/v2/entries"`
			`HEAD_TIMEOUT = 1.5 # seconds`
			`CATALOG_REFRESH_INTERVAL = 3600 # 1 hour`

			`# OPDS Atom namespace`
			`_ATOM_NS = "http://www.w3.org/2005/Atom"`

			`# ── ZIM catalog map ─────────────────────────────────────────────────────`

			`_zim_map = {} # source_type → content_path e.g. 'wikipedia' → 'wikipedia_en_all_maxi_2026-02'`
			`_zim_map_ts = 0.0 # last refresh timestamp`

			`# Prefix-to-source-type mapping (order matters: longest prefix first)`
			`_ZIM_PREFIX_MAP = [`
			`('wikipedia_en_all', 'wikipedia'),`
			`('appropedia_en_all', 'appropedia'),`
			`('wikivoyage_en', 'wikivoyage'),`
			`('wikidata_en', 'wikidata'),`
			`]`


			`def _discover_zims():`
			`"""Parse Kiwix OPDS Atom catalog to map source types to content paths."""`
			`global _zim_map, _zim_map_ts`

			`try:`
			`resp = http_requests.get(KIWIX_CATALOG_URL, timeout=5)`
			`if resp.status_code != 200:`
			`logger.warning(f"Kiwix catalog returned HTTP {resp.status_code}")`
			`return`

			`root = ET.fromstring(resp.content)`
			`new_map = {}`

			`for entry in root.findall(f"{{{_ATOM_NS}}}entry"):`
			`name_el = entry.find(f"{{{_ATOM_NS}}}name")`
			`if name_el is None:`
			`continue`
			`book_name = name_el.text or ""`

			`# <link type="text/html" href="/content/..."/>`
			`content_path = None`
			`for link in entry.findall(f"{{{_ATOM_NS}}}link"):`
			`if link.get("type") == "text/html":`
			`href = link.get("href", "")`
			`if href.startswith("/content/"):`
			`content_path = href[len("/content/"):]`
			`break`

			`if not content_path:`
			`continue`

			`# Match book name against known prefixes`
			`for prefix, source_type in _ZIM_PREFIX_MAP:`
			`if book_name.startswith(prefix):`
			`new_map[source_type] = content_path`
			`break`

			`_zim_map = new_map`
			`_zim_map_ts = time.time()`
			`logger.info(f"ZIM catalog refreshed: {new_map}")`

			`except Exception as e:`
			`logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}")`


			`def _ensure_zim_map():`
			`"""Lazy-load and refresh ZIM map if stale."""`
			`if not _zim_map or (time.time() - _zim_map_ts) > CATALOG_REFRESH_INTERVAL:`
			`_discover_zims()`


			`# ── Database (wiki_cache in place_cache.db) ─────────────────────────────`

			`_db_conn = None`


			`def _get_db():`
			`"""Return a module-level SQLite connection to place_cache.db (lazy init)."""`
			`global _db_conn`
			`if _db_conn is not None:`
			`return _db_conn`

			`db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')`
			`os.makedirs(db_dir, exist_ok=True)`
			`db_path = os.path.join(db_dir, 'place_cache.db')`

			`_db_conn = sqlite3.connect(db_path, check_same_thread=False)`
			`_db_conn.execute("PRAGMA journal_mode=WAL")`
			`_db_conn.execute("PRAGMA synchronous=NORMAL")`
			`_db_conn.execute("""`
			`CREATE TABLE IF NOT EXISTS wiki_cache (`
			`source_type TEXT NOT NULL,`
			`article_id TEXT NOT NULL,`
			`kiwix_url TEXT NOT NULL,`
			`cached_at INTEGER NOT NULL,`
			`PRIMARY KEY (source_type, article_id)`
			`)`
			`""")`
			`_db_conn.commit()`
			`logger.info(f"Wiki cache table ready in {db_path}")`
			`return _db_conn`


			`# ── URL classification ──────────────────────────────────────────────────`

			`# Patterns for OSM wikipedia/wikidata tag values`
			`_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$') # "en:Title" or just "Title"`
			`_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)')`
			`_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$')`
			`_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)')`
			`_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)')`
			`_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)')`


			`def _normalize_article_id(article_id):`
			`"""Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores."""`
			`return article_id.replace(' ', '_')`


			`def classify_wiki_link(tag_name, value):`
			`"""`
			`Classify an OSM extratag value into (source_type, article_id) or None.`

			`tag_name: the extratags key ('wikipedia', 'wikidata', etc.)`
			`value: the raw tag value from OSM`

			`Article IDs are normalized to MediaWiki convention (spaces → underscores).`
			`"""`
			`if not value or not isinstance(value, str):`
			`return None`

			`value = value.strip()`

			`if tag_name == 'wikidata':`
			`m = _WIKIDATA_TAG_RE.match(value)`
			`if m:`
			`return ('wikidata', m.group(1))`
			`m = _WIKIDATA_URL_RE.match(value)`
			`if m:`
			`return ('wikidata', m.group(1))`
			`return None`

			`if tag_name == 'wikipedia':`
			`# URL form: https://en.wikipedia.org/wiki/Title`
			`m = _WIKI_URL_RE.match(value)`
			`if m:`
			`return ('wikipedia', _normalize_article_id(unquote(m.group(1))))`
			`# Tag form: "en:Title" or "Title"`
			`m = _WIKI_TAG_RE.match(value)`
			`if m:`
			`return ('wikipedia', _normalize_article_id(m.group(1)))`
			`return None`

			`if tag_name == 'wikivoyage':`
			`m = _WIKIVOYAGE_URL_RE.match(value)`
			`if m:`
			`return ('wikivoyage', _normalize_article_id(unquote(m.group(1))))`
			`# Plain tag: "en:Title" or "Title"`
			`m = _WIKI_TAG_RE.match(value)`
			`if m:`
			`return ('wikivoyage', _normalize_article_id(m.group(1)))`
			`return None`

			`if tag_name == 'appropedia':`
			`m = _APPROPEDIA_URL_RE.match(value)`
			`if m:`
			`return ('appropedia', _normalize_article_id(unquote(m.group(1))))`
			`return ('appropedia', _normalize_article_id(value))`

			`return None`


			`# ── URL builders ────────────────────────────────────────────────────────`

			`def build_kiwix_url(source_type, article_id):`
			`"""Build a public Kiwix URL. Returns None if source_type not in ZIM map."""`
			`_ensure_zim_map()`
			`content_path = _zim_map.get(source_type)`
			`if not content_path:`
			`return None`
			`return f"{KIWIX_PUBLIC_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}"`


			`_PUBLIC_URL_TEMPLATES = {`
			`'wikipedia': "https://en.wikipedia.org/wiki/{id}",`
			`'wikidata': "https://www.wikidata.org/wiki/{id}",`
			`'wikivoyage': "https://en.wikivoyage.org/wiki/{id}",`
			`'appropedia': "https://www.appropedia.org/wiki/{id}",`
			`}`


			`def build_public_url(source_type, article_id):`
			`"""Build the canonical public URL for a wiki article."""`
			`tmpl = _PUBLIC_URL_TEMPLATES.get(source_type)`
			`if not tmpl:`
			`return None`
			`return tmpl.format(id=quote(article_id, safe='/:@!$&\'()*+,;='))`


			`# ── Kiwix availability check ───────────────────────────────────────────`

			`def check_kiwix_has_article(source_type, article_id):`
			`"""`
			`Check if an article exists in local Kiwix.`

			`Returns (bool, url):`
			`- (True, kiwix_public_url) if article exists locally`
			`- (False, None) if not found or Kiwix unavailable`

			`Only positive results are cached.`
			`"""`
			`# Check cache first`
			`db = _get_db()`
			`row = db.execute(`
			`"SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?",`
			`(source_type, article_id)`
			`).fetchone()`
			`if row:`
			`return (True, row[0])`

			`# Build local HEAD URL`
			`_ensure_zim_map()`
			`content_path = _zim_map.get(source_type)`
			`if not content_path:`
			`return (False, None)`

			`head_url = f"{KIWIX_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}"`

			`try:`
			`resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True)`
			`if resp.status_code == 200:`
			`kiwix_url = build_kiwix_url(source_type, article_id)`
			`# Cache positive result`
			`now = int(time.time())`
			`db.execute("""`
			`INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at)`
			`VALUES (?, ?, ?, ?)`
			`""", (source_type, article_id, kiwix_url, now))`
			`db.commit()`
			`return (True, kiwix_url)`
			`else:`
			`return (False, None)`
			`except Exception as e:`
			`logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}")`
			`return (False, None)`


			`# ── Primary entry point ────────────────────────────────────────────────`

			`def rewrite_wiki_link(tag_name, value):`
			`"""`
			`Rewrite an OSM wiki tag value to a local Kiwix URL if available.`

			`Returns (url, 'local'\|'public') or (None, None) if unrecognized.`
			`"""`
			`classified = classify_wiki_link(tag_name, value)`
			`if not classified:`
			`return (value, 'original')`

			`source_type, article_id = classified`

			`# Try local Kiwix`
			`found, kiwix_url = check_kiwix_has_article(source_type, article_id)`
			`if found and kiwix_url:`
			`return (kiwix_url, 'local')`

			`# Fall back to public URL`
			`public_url = build_public_url(source_type, article_id)`
			`if public_url:`
			`return (public_url, 'public')`

			`return (value, 'original')`


			`# ── Discovery stubs (disabled, for future activation) ───────────────────`

			`def discover_wikivoyage_article(name, category, lat, lon):`
			`"""`
			`Discover a related Wikivoyage article for a place.`
			`Enabled by has_wiki_discovery. Currently returns None.`
			`"""`
			`return None`


			`def discover_appropedia_article(name, category):`
			`"""`
			`Discover a related Appropedia article for a place.`
			`Enabled by has_wiki_discovery. Currently returns None.`
			`"""`
			`return None`