# ── Wiki link rewriting ─────────────────────────────────────────────────

# Extratag keys that may contain wiki references
_WIKI_TAGS = ('wikipedia', 'wikidata', 'wikivoyage', 'appropedia')


def _enrich_wiki_links(result):
    """
    Rewrite wiki-related extratags to the URLs chosen by rewrite_wiki_link.

    Runs only when the deployment config enables ``has_wiki_rewriting``.
    The step is best-effort: a missing module, an unreadable config or a
    failure while rewriting one tag never raises to the caller; the
    affected value is simply left as it was.

    Args:
        result: place-detail dict. Its 'extratags' mapping is read and
            updated in place; 'sources' is created if needed.

    Returns:
        The same ``result`` object. For every tag whose rewrite status is
        not 'original', ``result['extratags'][tag]`` holds the new URL and
        ``result['sources']['wiki_rewrites'][tag]`` holds the status.
    """
    try:
        from .deployment_config import get_deployment_config
        deploy_config = get_deployment_config()
        features = deploy_config.get('features', {})
        if not features.get('has_wiki_rewriting', False):
            return result
    except ImportError:
        # No deployment config module available: treat the feature as off.
        return result
    except Exception as e:
        logger.debug(f"Deployment config unavailable, skipping wiki rewrite: {e}")
        return result

    try:
        from .wiki_rewrite import rewrite_wiki_link
    except ImportError:
        logger.debug("wiki_rewrite module not available")
        return result

    extratags = result.get('extratags', {})
    if not extratags:
        return result

    rewrites = {}
    for tag in _WIKI_TAGS:
        value = extratags.get(tag)
        if not value:
            continue
        try:
            url, status = rewrite_wiki_link(tag, value)
        except Exception as e:
            # One bad tag (or a cache/Kiwix failure) must not fail the
            # whole place-detail response; keep the original value.
            logger.debug(f"Wiki rewrite failed for {tag}={value!r}: {e}")
            continue
        if status != 'original':
            extratags[tag] = url
            rewrites[tag] = status

    if rewrites:
        result['extratags'] = extratags
        result.setdefault('sources', {})['wiki_rewrites'] = rewrites
        logger.debug(f"Wiki rewrites for {result.get('osm_type')}/{result.get('osm_id')}: {rewrites}")

    return result
@@ -560,6 +610,7 @@ def get_place_detail(osm_type, osm_id): if nominatim_result: nominatim_result = _enrich_with_overture(nominatim_result, osm_type, osm_id) nominatim_result = _enrich_with_google(nominatim_result, osm_type, osm_id) + nominatim_result = _enrich_wiki_links(nominatim_result) cache_put(osm_type, osm_id, nominatim_result, 'nominatim_local') return nominatim_result, 200 @@ -592,6 +643,7 @@ def get_place_detail(osm_type, osm_id): if overpass_result: overpass_result = _enrich_with_overture(overpass_result, osm_type, osm_id) overpass_result = _enrich_with_google(overpass_result, osm_type, osm_id) + overpass_result = _enrich_wiki_links(overpass_result) cache_put(osm_type, osm_id, overpass_result, 'overpass') return overpass_result, 200 diff --git a/lib/wiki_rewrite.py b/lib/wiki_rewrite.py new file mode 100644 index 0000000..d884635 --- /dev/null +++ b/lib/wiki_rewrite.py @@ -0,0 +1,324 @@ +""" +Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia +links to local Kiwix URLs where the article exists in a loaded ZIM. + +Falls back silently to public URLs when article is unavailable locally. +Caches positive results only in place_cache.db. + +Kiwix catalog is parsed from the OPDS Atom feed at startup and refreshed +hourly to pick up newly loaded ZIMs without a restart. + +Operations note: + - After loading a new ZIM, either restart RECON (forces fresh catalog + fetch) or wait up to 1 hour for automatic refresh. + - To invalidate the wiki cache (e.g. 
# ── ZIM catalog map ─────────────────────────────────────────────────────

_zim_map = {}               # source_type → content_path  e.g. 'wikipedia' → 'wikipedia_en_all_maxi_2026-02'
_zim_map_ts = 0.0           # time.time() of the last successful catalog refresh
_zim_map_attempt_ts = 0.0   # time.time() of the last refresh attempt, successful or not

# Minimum delay between catalog fetch attempts. Without it, an unreachable
# Kiwix (or a catalog with no matching books) makes every lookup re-fetch
# the catalog and wait for the request timeout.
_CATALOG_RETRY_INTERVAL = 60  # seconds

# Prefix-to-source-type mapping, checked in order; the first matching prefix wins
_ZIM_PREFIX_MAP = [
    ('wikipedia_en_all', 'wikipedia'),
    ('appropedia_en_all', 'appropedia'),
    ('wikivoyage_en', 'wikivoyage'),
    ('wikidata_en', 'wikidata'),
]


def _discover_zims():
    """
    Parse the Kiwix OPDS Atom catalog to map source types to content paths.

    On success replaces ``_zim_map`` and stamps ``_zim_map_ts``. On any
    failure (non-200 response, network error, unparsable XML) the previous
    map is kept and a warning is logged. Every call, successful or not,
    stamps ``_zim_map_attempt_ts`` so callers can throttle retries.
    """
    global _zim_map, _zim_map_ts, _zim_map_attempt_ts

    # Record the attempt up front so a failing Kiwix is not re-queried on every call.
    _zim_map_attempt_ts = time.time()

    try:
        resp = http_requests.get(KIWIX_CATALOG_URL, timeout=5)
        if resp.status_code != 200:
            logger.warning(f"Kiwix catalog returned HTTP {resp.status_code}")
            return

        root = ET.fromstring(resp.content)
        new_map = {}

        for entry in root.findall(f"{{{_ATOM_NS}}}entry"):
            name_el = entry.find(f"{{{_ATOM_NS}}}name")
            if name_el is None:
                continue
            book_name = name_el.text or ""

            # The entry's text/html link carries the book location as
            # /content/<content_path>; keep only the <content_path> part.
            content_path = None
            for link in entry.findall(f"{{{_ATOM_NS}}}link"):
                if link.get("type") == "text/html":
                    href = link.get("href", "")
                    if href.startswith("/content/"):
                        content_path = href[len("/content/"):]
                        break

            if not content_path:
                continue

            # Match book name against known prefixes
            for prefix, source_type in _ZIM_PREFIX_MAP:
                if book_name.startswith(prefix):
                    new_map[source_type] = content_path
                    break

        _zim_map = new_map
        _zim_map_ts = time.time()
        logger.info(f"ZIM catalog refreshed: {new_map}")

    except Exception as e:
        logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}")


def _ensure_zim_map():
    """
    Lazy-load the ZIM map and refresh it when stale.

    A refresh is attempted when the map is empty or older than
    CATALOG_REFRESH_INTERVAL, but never more often than once per
    _CATALOG_RETRY_INTERVAL, so repeated failures stay cheap.
    """
    now = time.time()
    if _zim_map and (now - _zim_map_ts) <= CATALOG_REFRESH_INTERVAL:
        return
    if (now - _zim_map_attempt_ts) < _CATALOG_RETRY_INTERVAL:
        return
    _discover_zims()


# ── Database (wiki_cache in place_cache.db) ─────────────────────────────

_db_conn = None


def _get_db():
    """
    Return a module-level SQLite connection to place_cache.db (lazy init).

    The database lives in the ``data`` directory next to this package's
    parent directory; the directory and the ``wiki_cache`` table are
    created on first use.

    Raises:
        OSError: the data directory cannot be created.
        sqlite3.Error: the database cannot be opened or initialised. The
            half-initialised connection is closed and not cached, so the
            next call retries from scratch.
    """
    global _db_conn
    if _db_conn is not None:
        return _db_conn

    db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
    os.makedirs(db_dir, exist_ok=True)
    db_path = os.path.join(db_dir, 'place_cache.db')

    conn = sqlite3.connect(db_path, check_same_thread=False)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS wiki_cache (
                source_type TEXT NOT NULL,
                article_id TEXT NOT NULL,
                kiwix_url TEXT NOT NULL,
                cached_at INTEGER NOT NULL,
                PRIMARY KEY (source_type, article_id)
            )
        """)
        conn.commit()
    except sqlite3.Error:
        # Do not publish a connection whose schema setup failed.
        conn.close()
        raise

    _db_conn = conn
    logger.info(f"Wiki cache table ready in {db_path}")
    return _db_conn
# ── URL classification ──────────────────────────────────────────────────

# Patterns for OSM wikipedia/wikidata tag values
_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$')  # "en:Title" or just "Title"
_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)')
_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$')
_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)')
_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)')
_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)')


def _normalize_article_id(article_id):
    """Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores."""
    return article_id.replace(' ', '_')


def _looks_like_url(value):
    """Return True when value starts with an http:// or https:// scheme."""
    return value.lower().startswith(('http://', 'https://'))


def classify_wiki_link(tag_name, value):
    """
    Classify an OSM extratag value into (source_type, article_id) or None.

    Args:
        tag_name: the extratags key ('wikipedia', 'wikidata', 'wikivoyage'
            or 'appropedia'); any other key yields None.
        value: the raw tag value from OSM, either a bare title
            ("en:Title" / "Title" / "Q123") or a full URL on the matching
            English/public site.

    Returns:
        ``(source_type, article_id)`` with the article ID normalized to
        MediaWiki convention (spaces → underscores), or None when the value
        is empty, not a string, or a URL that does not belong to the
        expected site. Unrecognized URLs are rejected rather than treated
        as titles, because embedding a whole URL as an article ID produces
        a broken link.
    """
    if not value or not isinstance(value, str):
        return None

    value = value.strip()
    if not value:
        # Whitespace-only tag: nothing to classify.
        return None

    if tag_name == 'wikidata':
        m = _WIKIDATA_TAG_RE.match(value)
        if m:
            return ('wikidata', m.group(1))
        m = _WIKIDATA_URL_RE.match(value)
        if m:
            return ('wikidata', m.group(1))
        return None

    if tag_name == 'wikipedia':
        # URL form: https://en.wikipedia.org/wiki/Title
        m = _WIKI_URL_RE.match(value)
        if m:
            return ('wikipedia', _normalize_article_id(unquote(m.group(1))))
        if _looks_like_url(value):
            # Some other site or language edition: leave the tag untouched.
            return None
        # Tag form: "en:Title" or "Title"
        m = _WIKI_TAG_RE.match(value)
        if m:
            return ('wikipedia', _normalize_article_id(m.group(1)))
        return None

    if tag_name == 'wikivoyage':
        m = _WIKIVOYAGE_URL_RE.match(value)
        if m:
            return ('wikivoyage', _normalize_article_id(unquote(m.group(1))))
        if _looks_like_url(value):
            return None
        # Plain tag: "en:Title" or "Title"
        m = _WIKI_TAG_RE.match(value)
        if m:
            return ('wikivoyage', _normalize_article_id(m.group(1)))
        return None

    if tag_name == 'appropedia':
        m = _APPROPEDIA_URL_RE.match(value)
        if m:
            return ('appropedia', _normalize_article_id(unquote(m.group(1))))
        if _looks_like_url(value):
            return None
        return ('appropedia', _normalize_article_id(value))

    return None
# ── URL builders ────────────────────────────────────────────────────────

# Characters left unescaped when an article ID is placed in a URL path.
# Kept as a constant (rather than inline in the f-strings) because a
# backslash-escaped quote inside an f-string replacement field is a
# SyntaxError on Python versions before 3.12.
_ARTICLE_SAFE_CHARS = "/:@!$&'()*+,;="


def _quote_article_id(article_id):
    """Percent-encode an article ID for use as a URL path segment."""
    return quote(article_id, safe=_ARTICLE_SAFE_CHARS)


def build_kiwix_url(source_type, article_id):
    """
    Build the public-facing Kiwix URL for an article.

    Returns:
        ``{KIWIX_PUBLIC_BASE}/content/<content_path>/<article>`` or None
        when no loaded ZIM is mapped to ``source_type``.
    """
    _ensure_zim_map()
    content_path = _zim_map.get(source_type)
    if not content_path:
        return None
    return f"{KIWIX_PUBLIC_BASE}/content/{content_path}/{_quote_article_id(article_id)}"


_PUBLIC_URL_TEMPLATES = {
    'wikipedia': "https://en.wikipedia.org/wiki/{id}",
    'wikidata': "https://www.wikidata.org/wiki/{id}",
    'wikivoyage': "https://en.wikivoyage.org/wiki/{id}",
    'appropedia': "https://www.appropedia.org/wiki/{id}",
}


def build_public_url(source_type, article_id):
    """
    Build the canonical public URL for a wiki article.

    Returns:
        The URL string, or None when ``source_type`` has no template.
    """
    tmpl = _PUBLIC_URL_TEMPLATES.get(source_type)
    if not tmpl:
        return None
    return tmpl.format(id=_quote_article_id(article_id))


# ── Kiwix availability check ───────────────────────────────────────────

def check_kiwix_has_article(source_type, article_id):
    """
    Check if an article exists in local Kiwix.

    The wiki_cache table is consulted first; on a miss the article is
    probed with an HTTP HEAD against the local Kiwix server. Only positive
    results are cached. Cache problems are logged and otherwise ignored so
    that a broken cache degrades to "always probe" instead of raising.

    Returns:
        ``(True, kiwix_public_url)`` if the article exists locally,
        ``(False, None)`` if it is not found, no ZIM is mapped to
        ``source_type``, or Kiwix is unavailable.
    """
    # Check cache first
    db = None
    try:
        db = _get_db()
        row = db.execute(
            "SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?",
            (source_type, article_id)
        ).fetchone()
        if row:
            return (True, row[0])
    except (sqlite3.Error, OSError) as e:
        logger.debug(f"Wiki cache read failed for {source_type}/{article_id}: {e}")

    # Build local HEAD URL
    _ensure_zim_map()
    content_path = _zim_map.get(source_type)
    if not content_path:
        return (False, None)

    article_path = f"/content/{content_path}/{_quote_article_id(article_id)}"
    head_url = f"{KIWIX_BASE}{article_path}"

    try:
        resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True)
    except Exception as e:
        logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}")
        return (False, None)

    if resp.status_code != 200:
        return (False, None)

    # Derive the public URL from the same content path that was just
    # probed, so a catalog refresh in between cannot yield a mismatch.
    kiwix_url = f"{KIWIX_PUBLIC_BASE}{article_path}"

    # Cache positive result
    if db is not None:
        try:
            db.execute("""
                INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at)
                VALUES (?, ?, ?, ?)
            """, (source_type, article_id, kiwix_url, int(time.time())))
            db.commit()
        except sqlite3.Error as e:
            logger.debug(f"Wiki cache write failed for {source_type}/{article_id}: {e}")

    return (True, kiwix_url)
+ """, (source_type, article_id, kiwix_url, now)) + db.commit() + return (True, kiwix_url) + else: + return (False, None) + except Exception as e: + logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}") + return (False, None) + + +# ── Primary entry point ──────────────────────────────────────────────── + +def rewrite_wiki_link(tag_name, value): + """ + Rewrite an OSM wiki tag value to a local Kiwix URL if available. + + Returns (url, 'local'|'public') or (None, None) if unrecognized. + """ + classified = classify_wiki_link(tag_name, value) + if not classified: + return (value, 'original') + + source_type, article_id = classified + + # Try local Kiwix + found, kiwix_url = check_kiwix_has_article(source_type, article_id) + if found and kiwix_url: + return (kiwix_url, 'local') + + # Fall back to public URL + public_url = build_public_url(source_type, article_id) + if public_url: + return (public_url, 'public') + + return (value, 'original') + + +# ── Discovery stubs (disabled, for future activation) ─────────────────── + +def discover_wikivoyage_article(name, category, lat, lon): + """ + Discover a related Wikivoyage article for a place. + Enabled by has_wiki_discovery. Currently returns None. + """ + return None + + +def discover_appropedia_article(name, category): + """ + Discover a related Appropedia article for a place. + Enabled by has_wiki_discovery. Currently returns None. + """ + return None